author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 12:02:58 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 12:02:58 +0000
commit    698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree      173a775858bd501c378080a10dca74132f05bc50 /library/stdarch
parent    Initial commit. (diff)
Adding upstream version 1.64.0+dfsg1. (tag: upstream/1.64.0+dfsg1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'library/stdarch')
-rw-r--r-- library/stdarch/.cirrus.yml | 17
-rw-r--r-- library/stdarch/.github/workflows/main.yml | 225
-rw-r--r-- library/stdarch/CONTRIBUTING.md | 93
-rw-r--r-- library/stdarch/Cargo.toml | 22
-rw-r--r-- library/stdarch/LICENSE-APACHE | 201
-rw-r--r-- library/stdarch/LICENSE-MIT | 25
-rw-r--r-- library/stdarch/README.md | 18
-rw-r--r-- library/stdarch/ci/android-install-ndk.sh | 38
-rw-r--r-- library/stdarch/ci/android-install-sdk.sh | 60
-rw-r--r-- library/stdarch/ci/android-sysimage.sh | 56
-rw-r--r-- library/stdarch/ci/docker/aarch64-linux-android/Dockerfile | 47
-rw-r--r-- library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile | 18
-rw-r--r-- library/stdarch/ci/docker/arm-linux-androideabi/Dockerfile | 47
-rw-r--r-- library/stdarch/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile | 13
-rw-r--r-- library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile | 17
-rw-r--r-- library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile | 7
-rw-r--r-- library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile | 7
-rw-r--r-- library/stdarch/ci/docker/mips-unknown-linux-gnu/Dockerfile | 13
-rw-r--r-- library/stdarch/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile | 10
-rw-r--r-- library/stdarch/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile | 10
-rw-r--r-- library/stdarch/ci/docker/mipsel-unknown-linux-musl/Dockerfile | 25
-rw-r--r-- library/stdarch/ci/docker/nvptx64-nvidia-cuda/Dockerfile | 5
-rw-r--r-- library/stdarch/ci/docker/powerpc-unknown-linux-gnu/Dockerfile | 11
-rw-r--r-- library/stdarch/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile | 11
-rw-r--r-- library/stdarch/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile | 12
-rw-r--r-- library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile | 10
-rw-r--r-- library/stdarch/ci/docker/s390x-unknown-linux-gnu/Dockerfile | 13
-rw-r--r-- library/stdarch/ci/docker/wasm32-wasi/Dockerfile | 17
-rw-r--r-- library/stdarch/ci/docker/x86_64-linux-android/Dockerfile | 29
-rw-r--r-- library/stdarch/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile | 14
-rw-r--r-- library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile | 7
-rwxr-xr-x library/stdarch/ci/dox.sh | 55
-rw-r--r-- library/stdarch/ci/gba.json | 34
-rwxr-xr-x library/stdarch/ci/run-docker.sh | 47
-rwxr-xr-x library/stdarch/ci/run.sh | 150
-rw-r--r-- library/stdarch/ci/runtest-android.rs | 45
-rwxr-xr-x library/stdarch/ci/style.sh | 22
-rw-r--r-- library/stdarch/crates/assert-instr-macro/Cargo.toml | 14
-rw-r--r-- library/stdarch/crates/assert-instr-macro/build.rs | 13
-rw-r--r-- library/stdarch/crates/assert-instr-macro/src/lib.rs | 251
-rw-r--r-- library/stdarch/crates/core_arch/Cargo.toml | 26
-rw-r--r-- library/stdarch/crates/core_arch/LICENSE-APACHE | 201
-rw-r--r-- library/stdarch/crates/core_arch/LICENSE-MIT | 25
-rw-r--r-- library/stdarch/crates/core_arch/MISSING.md | 116
-rw-r--r-- library/stdarch/crates/core_arch/README.md | 60
-rw-r--r-- library/stdarch/crates/core_arch/avx512bw.md | 764
-rw-r--r-- library/stdarch/crates/core_arch/avx512f.md | 2633
-rw-r--r-- library/stdarch/crates/core_arch/build.rs | 3
-rw-r--r-- library/stdarch/crates/core_arch/rustfmt.toml | 3
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/armclang.rs | 23
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/crc.rs | 45
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/mod.rs | 41
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs | 25758
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs | 5440
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/prefetch.rs | 73
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/test_support.rs | 184
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/tme.rs | 179
-rw-r--r-- library/stdarch/crates/core_arch/src/aarch64/v8.rs | 104
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/armclang.rs | 35
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/dsp.rs | 384
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/ex.rs | 125
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/mod.rs | 113
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/neon.rs | 1369
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/sat.rs | 8
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/simd32.rs | 728
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/v6.rs | 49
-rw-r--r-- library/stdarch/crates/core_arch/src/arm/v7.rs | 88
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/barrier/common.rs | 14
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs | 38
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/barrier/mod.rs | 154
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/barrier/not_mclass.rs | 43
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/barrier/v8.rs | 23
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/crc.rs | 121
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/crypto.rs | 374
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/hints.rs | 95
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/mod.rs | 120
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs | 40888
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/neon/load_tests.rs | 206
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs | 12347
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs | 93
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/neon/store_tests.rs | 389
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs | 1042
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/registers/aarch32.rs | 9
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/registers/mod.rs | 121
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/registers/v6m.rs | 39
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/registers/v7m.rs | 17
-rw-r--r-- library/stdarch/crates/core_arch/src/arm_shared/test_support.rs | 836
-rw-r--r-- library/stdarch/crates/core_arch/src/core_arch_docs.md | 344
-rw-r--r-- library/stdarch/crates/core_arch/src/lib.rs | 74
-rw-r--r-- library/stdarch/crates/core_arch/src/macros.rs | 190
-rw-r--r-- library/stdarch/crates/core_arch/src/mips/mod.rs | 18
-rw-r--r-- library/stdarch/crates/core_arch/src/mips/msa.rs | 17894
-rw-r--r-- library/stdarch/crates/core_arch/src/mips/msa/macros.rs | 31
-rw-r--r-- library/stdarch/crates/core_arch/src/mod.rs | 305
-rw-r--r-- library/stdarch/crates/core_arch/src/nvptx/mod.rs | 213
-rw-r--r-- library/stdarch/crates/core_arch/src/powerpc/altivec.rs | 2778
-rw-r--r-- library/stdarch/crates/core_arch/src/powerpc/mod.rs | 19
-rw-r--r-- library/stdarch/crates/core_arch/src/powerpc/vsx.rs | 118
-rw-r--r-- library/stdarch/crates/core_arch/src/powerpc64/mod.rs | 8
-rw-r--r-- library/stdarch/crates/core_arch/src/riscv64/mod.rs | 49
-rw-r--r-- library/stdarch/crates/core_arch/src/riscv_shared/mod.rs | 771
-rw-r--r-- library/stdarch/crates/core_arch/src/simd.rs | 1105
-rw-r--r-- library/stdarch/crates/core_arch/src/simd_llvm.rs | 81
-rw-r--r-- library/stdarch/crates/core_arch/src/v64.rs | 85
-rw-r--r-- library/stdarch/crates/core_arch/src/wasm32/atomic.rs | 93
-rw-r--r-- library/stdarch/crates/core_arch/src/wasm32/memory.rs | 58
-rw-r--r-- library/stdarch/crates/core_arch/src/wasm32/mod.rs | 26
-rw-r--r-- library/stdarch/crates/core_arch/src/wasm32/simd128.rs | 6136
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/abm.rs | 62
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/adx.rs | 158
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/aes.rs | 171
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx.rs | 4862
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx2.rs | 5908
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512bf16.rs | 1573
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs | 760
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512bw.rs | 18785
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512cd.rs | 1170
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512f.rs | 55830
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512gfni.rs | 1492
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512ifma.rs | 196
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512vaes.rs | 332
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs | 916
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs | 4121
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512vnni.rs | 939
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512vpclmulqdq.rs | 258
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs | 541
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/bmi1.rs | 178
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/bmi2.rs | 133
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/bswap.rs | 28
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/bt.rs | 135
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/cpuid.rs | 197
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/eflags.rs | 85
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/f16c.rs | 112
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/fma.rs | 795
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/fxsr.rs | 112
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/macros.rs | 104
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/mod.rs | 860
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs | 70
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/rdrand.rs | 75
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/rdtsc.rs | 77
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/rtm.rs | 162
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/sha.rs | 221
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/sse.rs | 3276
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/sse2.rs | 4886
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/sse3.rs | 260
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/sse41.rs | 1887
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/sse42.rs | 802
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/sse4a.rs | 164
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/ssse3.rs | 537
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/tbm.rs | 460
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/test.rs | 144
-rw-r--r-- library/stdarch/crates/core_arch/src/x86/xsave.rs | 282
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/abm.rs | 62
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/adx.rs | 148
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/avx.rs | 48
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/avx2.rs | 47
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/avx512f.rs | 12346
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/bmi.rs | 183
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/bmi2.rs | 139
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/bswap.rs | 29
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/bt.rs | 135
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs | 73
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/fxsr.rs | 112
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/macros.rs | 36
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/mod.rs | 55
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/rdrand.rs | 44
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/sse.rs | 148
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/sse2.rs | 209
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/sse41.rs | 62
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/sse42.rs | 37
-rw-r--r-- library/stdarch/crates/core_arch/src/x86_64/xsave.rs | 227
-rw-r--r-- library/stdarch/crates/core_arch/tests/cpu-detection.rs | 63
-rw-r--r-- library/stdarch/crates/intrinsic-test/Cargo.toml | 17
-rw-r--r-- library/stdarch/crates/intrinsic-test/README.md | 24
-rw-r--r-- library/stdarch/crates/intrinsic-test/missing_aarch64.txt | 110
-rw-r--r-- library/stdarch/crates/intrinsic-test/missing_arm.txt | 334
-rw-r--r-- library/stdarch/crates/intrinsic-test/src/acle_csv_parser.rs | 319
-rw-r--r-- library/stdarch/crates/intrinsic-test/src/argument.rs | 139
-rw-r--r-- library/stdarch/crates/intrinsic-test/src/intrinsic.rs | 125
-rw-r--r-- library/stdarch/crates/intrinsic-test/src/main.rs | 479
-rw-r--r-- library/stdarch/crates/intrinsic-test/src/types.rs | 436
-rw-r--r-- library/stdarch/crates/intrinsic-test/src/values.rs | 126
-rw-r--r-- library/stdarch/crates/simd-test-macro/Cargo.toml | 13
-rw-r--r-- library/stdarch/crates/simd-test-macro/src/lib.rs | 151
-rw-r--r-- library/stdarch/crates/std_detect/Cargo.toml | 45
-rw-r--r-- library/stdarch/crates/std_detect/LICENSE-APACHE | 201
-rw-r--r-- library/stdarch/crates/std_detect/LICENSE-MIT | 25
-rw-r--r-- library/stdarch/crates/std_detect/README.md | 73
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs | 152
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/arm.rs | 28
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/mips.rs | 12
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/mips64.rs | 12
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/mod.rs | 56
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs | 16
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs | 16
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/riscv.rs | 206
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/arch/x86.rs | 197
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/bit.rs | 9
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/cache.rs | 194
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/macros.rs | 153
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/mod.rs | 104
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/aarch64.rs | 104
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs | 21
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/freebsd/arm.rs | 21
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/freebsd/auxvec.rs | 102
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs | 22
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/freebsd/powerpc.rs | 21
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs | 290
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs | 79
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs | 366
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/linux/cpuinfo.rs | 331
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs | 25
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs | 64
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs | 36
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/linux/riscv.rs | 73
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/other.rs | 8
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/windows/aarch64.rs | 59
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/os/x86.rs | 273
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv | bin 0 -> 160 bytes
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv | bin 0 -> 304 bytes
-rw-r--r-- library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv | bin 0 -> 160 bytes
-rw-r--r-- library/stdarch/crates/std_detect/src/lib.rs | 34
-rw-r--r-- library/stdarch/crates/std_detect/tests/cpu-detection.rs | 164
-rw-r--r-- library/stdarch/crates/std_detect/tests/macro_trailing_commas.rs | 51
-rw-r--r-- library/stdarch/crates/std_detect/tests/x86-specific.rs | 158
-rw-r--r-- library/stdarch/crates/stdarch-gen/Cargo.toml | 9
-rw-r--r-- library/stdarch/crates/stdarch-gen/README.md | 11
-rw-r--r-- library/stdarch/crates/stdarch-gen/neon.spec | 7560
-rw-r--r-- library/stdarch/crates/stdarch-gen/src/main.rs | 3391
-rw-r--r-- library/stdarch/crates/stdarch-test/Cargo.toml | 24
-rw-r--r-- library/stdarch/crates/stdarch-test/src/disassembly.rs | 193
-rw-r--r-- library/stdarch/crates/stdarch-test/src/lib.rs | 202
-rw-r--r-- library/stdarch/crates/stdarch-test/src/wasm.rs | 55
-rw-r--r-- library/stdarch/crates/stdarch-verify/Cargo.toml | 19
-rw-r--r-- library/stdarch/crates/stdarch-verify/arm-intrinsics.html | 93399
-rw-r--r-- library/stdarch/crates/stdarch-verify/build.rs | 28
-rw-r--r-- library/stdarch/crates/stdarch-verify/mips-msa.h | 707
-rw-r--r-- library/stdarch/crates/stdarch-verify/src/lib.rs | 525
-rw-r--r-- library/stdarch/crates/stdarch-verify/tests/arm.rs | 988
-rw-r--r-- library/stdarch/crates/stdarch-verify/tests/mips.rs | 366
-rw-r--r-- library/stdarch/crates/stdarch-verify/tests/x86-intel.rs | 841
-rw-r--r-- library/stdarch/crates/stdarch-verify/x86-intel.xml | 148137
-rw-r--r-- library/stdarch/examples/Cargo.toml | 30
-rw-r--r-- library/stdarch/examples/connect5.rs | 1272
-rw-r--r-- library/stdarch/examples/hex.rs | 401
-rw-r--r-- library/stdarch/examples/wasm.rs | 45
-rw-r--r-- library/stdarch/rustfmt.toml | 0
-rw-r--r-- library/stdarch/triagebot.toml | 1
-rw-r--r-- library/stdarch/vendor.yml | 2
249 files changed, 520270 insertions, 0 deletions
diff --git a/library/stdarch/.cirrus.yml b/library/stdarch/.cirrus.yml
new file mode 100644
index 000000000..7b75ba39d
--- /dev/null
+++ b/library/stdarch/.cirrus.yml
@@ -0,0 +1,17 @@
+env:
+ # Temporary fix for https://github.com/rust-lang/rustup/issues/2774.
+ RUSTUP_IO_THREADS: "1"
+
+task:
+ name: x86_64-unknown-freebsd
+ freebsd_instance:
+ image: freebsd-12-2-release-amd64
+ setup_script:
+ - pkg install -y curl
+ - curl https://sh.rustup.rs -sSf --output rustup.sh
+ - sh rustup.sh --default-toolchain nightly -y
+ - . $HOME/.cargo/env
+ - rustup default nightly
+ test_script:
+ - . $HOME/.cargo/env
+ - cargo build --all
diff --git a/library/stdarch/.github/workflows/main.yml b/library/stdarch/.github/workflows/main.yml
new file mode 100644
index 000000000..fd8713ff8
--- /dev/null
+++ b/library/stdarch/.github/workflows/main.yml
@@ -0,0 +1,225 @@
+name: CI
+on:
+ push:
+ branches:
+ - auto
+ - try
+ pull_request:
+ branches:
+ - master
+
+jobs:
+ style:
+ name: Check Style
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@master
+ - name: Install Rust
+ run: rustup update nightly && rustup default nightly
+ - run: ci/style.sh
+
+ docs:
+ name: Build Documentation
+ needs: [style]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@master
+ - name: Install Rust
+ run: rustup update nightly && rustup default nightly
+ - run: ci/dox.sh
+ env:
+ CI: 1
+ - name: Publish documentation
+ run: |
+ cd target/doc
+ git init
+ git add .
+ git -c user.name='ci' -c user.email='ci' commit -m init
+ git push -f -q https://git:${{ secrets.github_token }}@github.com/${{ github.repository }} HEAD:gh-pages
+ if: github.event_name == 'push' && github.event.ref == 'refs/heads/master'
+
+ verify:
+ name: Automatic intrinsic verification
+ needs: [style]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@master
+ - name: Install Rust
+ run: rustup update nightly && rustup default nightly
+ - run: cargo test --manifest-path crates/stdarch-verify/Cargo.toml
+
+ env_override:
+ name: Env Override
+ needs: [style]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@master
+ - name: Install Rust
+ run: rustup update nightly && rustup default nightly
+ - run: RUST_STD_DETECT_UNSTABLE=avx cargo test --features=std_detect_env_override --manifest-path crates/std_detect/Cargo.toml env_override_no_avx
+
+ test:
+ needs: [style]
+ name: Test
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ target:
+ # Dockers that are run through docker on linux
+ - i686-unknown-linux-gnu
+ - x86_64-unknown-linux-gnu
+ - x86_64-unknown-linux-gnu-emulated
+ - arm-unknown-linux-gnueabihf
+ - armv7-unknown-linux-gnueabihf
+ - aarch64-unknown-linux-gnu
+ - riscv64gc-unknown-linux-gnu
+ - powerpc64le-unknown-linux-gnu
+ - mips-unknown-linux-gnu
+ - mips64-unknown-linux-gnuabi64
+ - mips64el-unknown-linux-gnuabi64
+ - s390x-unknown-linux-gnu
+ - wasm32-wasi
+ - i586-unknown-linux-gnu
+ - x86_64-linux-android
+ - arm-linux-androideabi
+ - mipsel-unknown-linux-musl
+ - aarch64-linux-android
+ - nvptx64-nvidia-cuda
+ - thumbv6m-none-eabi
+ - thumbv7m-none-eabi
+ - thumbv7em-none-eabi
+ - thumbv7em-none-eabihf
+
+ # macOS targets
+ #- x86_64-apple-darwin
+ - aarch64-apple-darwin
+ # FIXME: gh-actions build environment doesn't have linker support
+ # - i686-apple-darwin
+
+ # Windows targets
+ - x86_64-pc-windows-msvc
+ - i686-pc-windows-msvc
+ # FIXME: Disassembly not implemented for the following targets:
+ # - x86_64-pc-windows-gnu:
+ # - i686-pc-windows-gnu:
+ # - aarch64-pc-windows-msvc:
+
+ include:
+ - target: i686-unknown-linux-gnu
+ os: ubuntu-latest
+ - target: x86_64-unknown-linux-gnu
+ os: ubuntu-latest
+ - target: x86_64-unknown-linux-gnu-emulated
+ os: ubuntu-latest
+ test_everything: true
+ rustflags: --cfg stdarch_intel_sde
+ - target: arm-unknown-linux-gnueabihf
+ os: ubuntu-latest
+ - target: armv7-unknown-linux-gnueabihf
+ os: ubuntu-latest
+ - target: mips-unknown-linux-gnu
+ os: ubuntu-latest
+ norun: true
+ - target: mips64-unknown-linux-gnuabi64
+ os: ubuntu-latest
+ norun: true
+ - target: mips64el-unknown-linux-gnuabi64
+ os: ubuntu-latest
+ norun: true
+ - target: powerpc64le-unknown-linux-gnu
+ os: ubuntu-latest
+ disable_assert_instr: true
+ - target: s390x-unknown-linux-gnu
+ os: ubuntu-latest
+ - target: wasm32-wasi
+ os: ubuntu-latest
+ - target: aarch64-apple-darwin
+ os: macos-latest
+ norun: true
+ - target: aarch64-unknown-linux-gnu
+ os: ubuntu-latest
+ - target: x86_64-apple-darwin
+ os: macos-11
+ - target: x86_64-pc-windows-msvc
+ os: windows-latest
+ - target: i686-pc-windows-msvc
+ os: windows-latest
+ - target: i586-unknown-linux-gnu
+ os: ubuntu-latest
+ - target: x86_64-linux-android
+ os: ubuntu-latest
+ disable_assert_instr: 1
+ - target: arm-linux-androideabi
+ os: ubuntu-latest
+ disable_assert_instr: 1
+ - target: mipsel-unknown-linux-musl
+ os: ubuntu-latest
+ norun: 1
+ - target: aarch64-linux-android
+ os: ubuntu-latest
+ disable_assert_instr: 1
+ - target: nvptx64-nvidia-cuda
+ os: ubuntu-latest
+ - target: thumbv6m-none-eabi
+ os: ubuntu-latest
+ - target: thumbv7m-none-eabi
+ os: ubuntu-latest
+ - target: thumbv7em-none-eabi
+ os: ubuntu-latest
+ - target: thumbv7em-none-eabihf
+ os: ubuntu-latest
+ - target: riscv64gc-unknown-linux-gnu
+ os: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@master
+ with:
+ submodules: recursive
+ - name: Install Rust (rustup)
+ run: |
+ rustup update nightly --no-self-update
+ rustup default nightly
+ if: matrix.os != 'macos-latest'
+ - name: Install Rust (macos)
+ run: |
+ curl https://sh.rustup.rs | sh -s -- -y --default-toolchain nightly
+ echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+ rustup update nightly --no-self-update
+ rustup default nightly
+ if: matrix.os == 'macos-latest'
+ - run: |
+ rustup default nightly
+ rustup target add ${{ matrix.target }}
+ if: "!endsWith(matrix.target, 'emulated')"
+ - name: Setup (aarch64-apple-darwin)
+ run: |
+ echo "SDKROOT=$(xcrun -sdk macosx11.0 --show-sdk-path)" >> $GITHUB_ENV
+ echo "MACOS_DEPLOYMENT_TARGET=$(xcrun -sdk macosx11.0 --show-sdk-platform-version)" >> $GITHUB_ENV
+ if: matrix.target == 'aarch64-apple-darwin'
+ - run: cargo generate-lockfile
+
+ # Configure some env vars based on matrix configuration
+ - run: echo "NORUN=1" >> $GITHUB_ENV
+ if: matrix.norun != '' || startsWith(matrix.target, 'thumb') || matrix.target == 'nvptx64-nvidia-cuda'
+ - run: echo "STDARCH_TEST_EVERYTHING=1" >> $GITHUB_ENV
+ if: matrix.test_everything != ''
+ - run: echo "RUSTFLAGS=${{ matrix.rustflags }}" >> $GITHUB_ENV
+ if: matrix.rustflags != ''
+ - run: echo "STDARCH_DISABLE_ASSERT_INSTR=1" >> $GITHUB_ENV
+ if: matrix.disable_assert_instr != ''
+ - run: echo "NOSTD=1" >> $GITHUB_ENV
+ if: startsWith(matrix.target, 'thumb') || matrix.target == 'nvptx64-nvidia-cuda'
+
+ # Windows & OSX go straight to `run.sh` ...
+ - run: ./ci/run.sh
+ shell: bash
+ if: matrix.os != 'ubuntu-latest' || startsWith(matrix.target, 'thumb')
+ env:
+ TARGET: ${{ matrix.target }}
+
+ # ... while Linux goes to `run-docker.sh`
+ - run: ./ci/run-docker.sh ${{ matrix.target }}
+ shell: bash
+ if: "matrix.os == 'ubuntu-latest' && !startsWith(matrix.target, 'thumb')"
+ env:
+ TARGET: ${{ matrix.target }}
diff --git a/library/stdarch/CONTRIBUTING.md b/library/stdarch/CONTRIBUTING.md
new file mode 100644
index 000000000..ebccd73ea
--- /dev/null
+++ b/library/stdarch/CONTRIBUTING.md
@@ -0,0 +1,93 @@
+# Contributing to stdarch
+
+The `stdarch` crate is more than willing to accept contributions! First you'll
+probably want to check out the repository and make sure that tests pass for you:
+
+```
+$ git clone https://github.com/rust-lang/stdarch
+$ cd stdarch
+$ TARGET="<your-target-arch>" ci/run.sh
+```
+
+Here `<your-target-arch>` is the target triple as used by `rustup`, e.g. `x86_64-unknown-linux-gnu` (without any preceding `nightly-` or similar).
+Also remember that this repository requires the nightly channel of Rust!
+The above tests require nightly Rust to be the default on your system; to set that, use `rustup default nightly` (and `rustup default stable` to revert).
+
+If any of the above steps don't work, [please let us know][new]!
+
+Next up, you can [find an issue][issues] to help out on; we've selected a few
+with the [`help wanted`][help] and [`impl-period`][impl] tags which could
+particularly use some help. You may be most interested in [#40][vendor],
+implementing all vendor intrinsics on x86. That issue's got some good pointers
+about where to get started!
+
+If you've got general questions feel free to [join us on gitter][gitter] and ask
+around! Feel free to ping either @BurntSushi or @alexcrichton with questions.
+
+[gitter]: https://gitter.im/rust-impl-period/WG-libs-simd
+
+# How to write examples for stdarch intrinsics
+
+There are a few features that must be enabled for the given intrinsic to work
+properly and the example must only be run by `cargo test --doc` when the feature
+is supported by the CPU. As a result, the default `fn main` that is generated by
+`rustdoc` will not work (in most cases). Consider using the following as a guide
+to ensure your example works as expected.
+
+```rust
+/// # // We need cfg_target_feature to ensure the example is only
+/// # // run by `cargo test --doc` when the CPU supports the feature
+/// # #![feature(cfg_target_feature)]
+/// # // We need target_feature for the intrinsic to work
+/// # #![feature(target_feature)]
+/// #
+/// # // rustdoc by default uses `extern crate stdarch`, but we need the
+/// # // `#[macro_use]`
+/// # #[macro_use] extern crate stdarch;
+/// #
+/// # // The real main function
+/// # fn main() {
+/// # // Only run this if `<target feature>` is supported
+/// # if cfg_feature_enabled!("<target feature>") {
+/// # // Create a `worker` function that will only be run if the target feature
+/// # // is supported and ensure that `target_feature` is enabled for your worker
+/// # // function
+/// # #[target_feature(enable = "<target feature>")]
+/// # unsafe fn worker() {
+///
+/// // Write your example here. Feature specific intrinsics will work here! Go wild!
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+```
+
+If some of the above syntax does not look familiar, the [Documentation as tests] section
+of the [Rust Book] describes the `rustdoc` syntax quite well. As always, feel free
+to [join us on gitter][gitter] and ask us if you hit any snags, and thank you for helping
+to improve the documentation of `stdarch`!
+
+# Alternative Testing Instructions
+
+It is generally recommended that you use `ci/run.sh` to run the tests.
+However, this might not work for you, e.g. if you are on Windows.
+
+In that case you can fall back to running `cargo +nightly test` and `cargo +nightly test --release -p core_arch` for testing the code generation.
+Note that these require the nightly toolchain to be installed and for `rustc` to know about your target triple and its CPU.
+In particular you need to set the `TARGET` environment variable as you would for `ci/run.sh`.
+In addition you need to set `RUSTCFLAGS` (note the `C`) to indicate target features, e.g. `RUSTCFLAGS="-C target-feature=+avx2"`.
+You can also set `-C target-cpu=native` if you're "just" developing against your current CPU.
+
+Be warned that when you use these alternative instructions, [things may go less smoothly than they would with `ci/run.sh`][ci-run-good]; for example, instruction generation tests may fail because the disassembler names instructions differently, e.g. it may report `vaesenc` instead of `aesenc` even though the two behave the same.
+Also, these instructions run fewer tests than `ci/run.sh` normally would, so don't be surprised if errors show up for tests not covered here when you eventually open a pull request.
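+
+For example, a session on an AVX2-capable `x86_64` Linux machine might look like the following (a sketch only; adjust the target triple and feature list for your own machine):
+
+```sh
+# Tell the test harness which triple is being tested.
+export TARGET=x86_64-unknown-linux-gnu
+# Enable the target features under test (note the `C` in the variable name).
+export RUSTCFLAGS="-C target-feature=+avx2"
+# Run the tests, then check code generation in release mode.
+cargo +nightly test
+cargo +nightly test --release -p core_arch
+```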
+
+
+[new]: https://github.com/rust-lang/stdarch/issues/new
+[issues]: https://github.com/rust-lang/stdarch/issues
+[help]: https://github.com/rust-lang/stdarch/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22
+[impl]: https://github.com/rust-lang/stdarch/issues?q=is%3Aissue+is%3Aopen+label%3Aimpl-period
+[vendor]: https://github.com/rust-lang/stdarch/issues/40
+[Documentation as tests]: https://doc.rust-lang.org/book/first-edition/documentation.html#documentation-as-tests
+[Rust Book]: https://doc.rust-lang.org/book/first-edition
+[ci-run-good]: https://github.com/rust-lang/stdarch/issues/931#issuecomment-711412126
diff --git a/library/stdarch/Cargo.toml b/library/stdarch/Cargo.toml
new file mode 100644
index 000000000..6efd6b189
--- /dev/null
+++ b/library/stdarch/Cargo.toml
@@ -0,0 +1,22 @@
+[workspace]
+members = [
+ "crates/stdarch-verify",
+ "crates/core_arch",
+ "crates/std_detect",
+ "crates/stdarch-gen",
+ "crates/intrinsic-test",
+ "examples/"
+]
+exclude = [
+ "crates/wasm-assert-instr-tests"
+]
+
+[profile.release]
+debug = true
+opt-level = 3
+incremental = true
+
+[profile.bench]
+debug = 1
+opt-level = 3
+incremental = true
diff --git a/library/stdarch/LICENSE-APACHE b/library/stdarch/LICENSE-APACHE
new file mode 100644
index 000000000..16fe87b06
--- /dev/null
+++ b/library/stdarch/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/library/stdarch/LICENSE-MIT b/library/stdarch/LICENSE-MIT
new file mode 100644
index 000000000..52d82415d
--- /dev/null
+++ b/library/stdarch/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2017 The Rust Project Developers
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/library/stdarch/README.md b/library/stdarch/README.md
new file mode 100644
index 000000000..70ec256e6
--- /dev/null
+++ b/library/stdarch/README.md
@@ -0,0 +1,18 @@
+stdarch - Rust's standard library SIMD components
+=======
+
+[![Actions Status](https://github.com/rust-lang/stdarch/workflows/CI/badge.svg)](https://github.com/rust-lang/stdarch/actions)
+
+
+# Crates
+
+This repository contains two main crates:
+
+* [`core_arch`](crates/core_arch/README.md) implements `core::arch` - Rust's
+ core library architecture-specific intrinsics, and
+
+* [`std_detect`](crates/std_detect/README.md) implements `std::detect` - Rust's
+ standard library run-time CPU feature detection.
+
+The `std::simd` component now lives in the
+[`packed_simd_2`](https://github.com/rust-lang/packed_simd) crate.
diff --git a/library/stdarch/ci/android-install-ndk.sh b/library/stdarch/ci/android-install-ndk.sh
new file mode 100644
index 000000000..944a8389a
--- /dev/null
+++ b/library/stdarch/ci/android-install-ndk.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env sh
+# Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+set -ex
+
+curl --retry 5 -O \
+ https://dl.google.com/android/repository/android-ndk-r15b-linux-x86_64.zip
+unzip -q android-ndk-r15b-linux-x86_64.zip
+
+case "${1}" in
+ aarch64)
+ arch=arm64
+ ;;
+
+ i686)
+ arch=x86
+ ;;
+
+ *)
+ arch="${1}"
+ ;;
+esac;
+
+android-ndk-r15b/build/tools/make_standalone_toolchain.py \
+ --unified-headers \
+ --install-dir "/android/ndk-${1}" \
+ --arch "${arch}" \
+ --api 24
+
+rm -rf ./android-ndk-r15b-linux-x86_64.zip ./android-ndk-r15b
diff --git a/library/stdarch/ci/android-install-sdk.sh b/library/stdarch/ci/android-install-sdk.sh
new file mode 100644
index 000000000..1beeb312a
--- /dev/null
+++ b/library/stdarch/ci/android-install-sdk.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env sh
+# Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+set -ex
+
+# Prep the SDK and emulator
+#
+# Note that the update process requires that we accept a bunch of licenses, and
+# we can't just pipe `yes` into it for some reason, so we take the same strategy
+# located in https://github.com/appunite/docker by just wrapping it in a script
+# which apparently magically accepts the licenses.
+
+mkdir sdk
+curl --retry 5 https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O
+unzip -d sdk sdk-tools-linux-3859397.zip
+
+case "$1" in
+ arm | armv7)
+ abi=armeabi-v7a
+ ;;
+
+ aarch64)
+ abi=arm64-v8a
+ ;;
+
+ i686)
+ abi=x86
+ ;;
+
+ x86_64)
+ abi=x86_64
+ ;;
+
+ *)
+ echo "invalid arch: $1"
+ exit 1
+ ;;
+esac;
+
+# --no_https avoids
+# javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found
+yes | ./sdk/tools/bin/sdkmanager --licenses --no_https
+yes | ./sdk/tools/bin/sdkmanager --no_https \
+ "emulator" \
+ "platform-tools" \
+ "platforms;android-24" \
+ "system-images;android-24;default;$abi"
+
+echo "no" |
+ ./sdk/tools/bin/avdmanager create avd \
+ --name "${1}" \
+ --package "system-images;android-24;default;$abi"
diff --git a/library/stdarch/ci/android-sysimage.sh b/library/stdarch/ci/android-sysimage.sh
new file mode 100644
index 000000000..31a6762cb
--- /dev/null
+++ b/library/stdarch/ci/android-sysimage.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+set -ex
+
+URL=https://dl.google.com/android/repository/sys-img/android
+
+main() {
+ local arch="${1}"
+ local name="${2}"
+ local dest=/system
+ local td
+ td="$(mktemp -d)"
+
+ apt-get install --no-install-recommends e2tools
+
+ pushd "$td"
+ curl --retry 5 -O "${URL}/${name}"
+ unzip -q "${name}"
+
+ local system
+ system=$(find . -name system.img)
+ mkdir -p $dest/{bin,lib,lib64}
+
+ # Extract android linker and libraries to /system
+ # This allows android executables to be run directly (or with qemu)
+ if [ "${arch}" = "x86_64" ] || [ "${arch}" = "arm64" ]; then
+ e2cp -p "${system}:/bin/linker64" "${dest}/bin/"
+ e2cp -p "${system}:/lib64/libdl.so" "${dest}/lib64/"
+ e2cp -p "${system}:/lib64/libc.so" "${dest}/lib64/"
+ e2cp -p "${system}:/lib64/libm.so" "${dest}/lib64/"
+ else
+ e2cp -p "${system}:/bin/linker" "${dest}/bin/"
+ e2cp -p "${system}:/lib/libdl.so" "${dest}/lib/"
+ e2cp -p "${system}:/lib/libc.so" "${dest}/lib/"
+ e2cp -p "${system}:/lib/libm.so" "${dest}/lib/"
+ fi
+
+ # clean up
+ apt-get purge --auto-remove -y e2tools
+
+ popd
+
+ rm -rf "${td}"
+}
+
+main "${@}"
diff --git a/library/stdarch/ci/docker/aarch64-linux-android/Dockerfile b/library/stdarch/ci/docker/aarch64-linux-android/Dockerfile
new file mode 100644
index 000000000..27bde89c5
--- /dev/null
+++ b/library/stdarch/ci/docker/aarch64-linux-android/Dockerfile
@@ -0,0 +1,47 @@
+FROM ubuntu:16.04
+
+RUN dpkg --add-architecture i386 && \
+ apt-get update && \
+ apt-get install -y --no-install-recommends \
+ file \
+ make \
+ curl \
+ ca-certificates \
+ python \
+ unzip \
+ expect \
+ openjdk-9-jre \
+ libstdc++6:i386 \
+ libpulse0 \
+ gcc \
+ libc6-dev
+
+WORKDIR /android/
+COPY android* /android/
+
+ENV ANDROID_ARCH=aarch64
+ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools
+
+RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
+RUN sh /android/android-install-sdk.sh $ANDROID_ARCH
+RUN mv /root/.android /tmp
+RUN chmod 777 -R /tmp/.android
+RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*
+
+ENV PATH=$PATH:/rust/bin \
+ CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=aarch64-linux-android-gcc \
+ CARGO_TARGET_AARCH64_LINUX_ANDROID_RUNNER=/tmp/runtest \
+ OBJDUMP=aarch64-linux-android-objdump \
+ HOME=/tmp
+
+ADD runtest-android.rs /tmp/runtest.rs
+ENTRYPOINT [ \
+ "bash", \
+ "-c", \
+ # set SHELL so android can detect a 64-bit system, see
+ # http://stackoverflow.com/a/41789144
+ "SHELL=/bin/dash /android/sdk/emulator/emulator @aarch64 -no-window & \
+ rustc /tmp/runtest.rs -o /tmp/runtest && \
+ exec \"$@\"", \
+ "--" \
+]
diff --git a/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..49464dacf
--- /dev/null
+++ b/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,18 @@
+FROM ubuntu:21.10
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc \
+ g++ \
+ ca-certificates \
+ libc6-dev \
+ gcc-aarch64-linux-gnu \
+ g++-aarch64-linux-gnu \
+ libc6-dev-arm64-cross \
+ qemu-user \
+ make \
+ file \
+ clang-13 \
+ lld
+
+ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \
+ CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64 -L /usr/aarch64-linux-gnu" \
+ OBJDUMP=aarch64-linux-gnu-objdump
diff --git a/library/stdarch/ci/docker/arm-linux-androideabi/Dockerfile b/library/stdarch/ci/docker/arm-linux-androideabi/Dockerfile
new file mode 100644
index 000000000..995a9e30e
--- /dev/null
+++ b/library/stdarch/ci/docker/arm-linux-androideabi/Dockerfile
@@ -0,0 +1,47 @@
+FROM ubuntu:16.04
+
+RUN dpkg --add-architecture i386 && \
+ apt-get update && \
+ apt-get install -y --no-install-recommends \
+ file \
+ make \
+ curl \
+ ca-certificates \
+ python \
+ unzip \
+ expect \
+ openjdk-9-jre \
+ libstdc++6:i386 \
+ libpulse0 \
+ gcc \
+ libc6-dev
+
+WORKDIR /android/
+COPY android* /android/
+
+ENV ANDROID_ARCH=arm
+ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools
+
+RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
+RUN sh /android/android-install-sdk.sh $ANDROID_ARCH
+RUN mv /root/.android /tmp
+RUN chmod 777 -R /tmp/.android
+RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*
+
+ENV PATH=$PATH:/rust/bin \
+ CARGO_TARGET_ARM_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \
+ CARGO_TARGET_ARM_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \
+ OBJDUMP=arm-linux-androideabi-objdump \
+ HOME=/tmp
+
+ADD runtest-android.rs /tmp/runtest.rs
+ENTRYPOINT [ \
+ "bash", \
+ "-c", \
+ # set SHELL so android can detect a 64-bit system, see
+ # http://stackoverflow.com/a/41789144
+ "SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \
+ rustc /tmp/runtest.rs -o /tmp/runtest && \
+ exec \"$@\"", \
+ "--" \
+]
diff --git a/library/stdarch/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile b/library/stdarch/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile
new file mode 100644
index 000000000..757b79e7e
--- /dev/null
+++ b/library/stdarch/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile
@@ -0,0 +1,13 @@
+FROM ubuntu:18.04
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc \
+ ca-certificates \
+ libc6-dev \
+ gcc-arm-linux-gnueabihf \
+ libc6-dev-armhf-cross \
+ qemu-user \
+ make \
+ file
+ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \
+ CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \
+ OBJDUMP=arm-linux-gnueabihf-objdump
diff --git a/library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile b/library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
new file mode 100644
index 000000000..74181a4cb
--- /dev/null
+++ b/library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
@@ -0,0 +1,17 @@
+FROM ubuntu:21.10
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc \
+ g++ \
+ ca-certificates \
+ libc6-dev \
+ gcc-arm-linux-gnueabihf \
+ g++-arm-linux-gnueabihf \
+ libc6-dev-armhf-cross \
+ qemu-user \
+ make \
+ file \
+ clang-13 \
+ lld
+ENV CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \
+ CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \
+ OBJDUMP=arm-linux-gnueabihf-objdump
diff --git a/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..0e4d1c6eb
--- /dev/null
+++ b/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:20.04
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc-multilib \
+ libc6-dev \
+ file \
+ make \
+ ca-certificates
diff --git a/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..0e4d1c6eb
--- /dev/null
+++ b/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:20.04
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc-multilib \
+ libc6-dev \
+ file \
+ make \
+ ca-certificates
diff --git a/library/stdarch/ci/docker/mips-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/mips-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..3bd471e87
--- /dev/null
+++ b/library/stdarch/ci/docker/mips-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,13 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc libc6-dev qemu-user ca-certificates \
+ gcc-mips-linux-gnu libc6-dev-mips-cross \
+ qemu-system-mips \
+ qemu-user \
+ make \
+ file
+
+ENV CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_LINKER=mips-linux-gnu-gcc \
+ CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_RUNNER="qemu-mips -L /usr/mips-linux-gnu" \
+ OBJDUMP=mips-linux-gnu-objdump
\ No newline at end of file
diff --git a/library/stdarch/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile b/library/stdarch/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile
new file mode 100644
index 000000000..f26f1f38e
--- /dev/null
+++ b/library/stdarch/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile
@@ -0,0 +1,10 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc libc6-dev qemu-user ca-certificates \
+ gcc-mips64-linux-gnuabi64 libc6-dev-mips64-cross \
+ qemu-system-mips64 qemu-user
+
+ENV CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_LINKER=mips64-linux-gnuabi64-gcc \
+ CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64 -L /usr/mips64-linux-gnuabi64" \
+ OBJDUMP=mips64-linux-gnuabi64-objdump
\ No newline at end of file
diff --git a/library/stdarch/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile b/library/stdarch/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile
new file mode 100644
index 000000000..7d9f0bd99
--- /dev/null
+++ b/library/stdarch/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile
@@ -0,0 +1,10 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc libc6-dev qemu-user ca-certificates \
+ gcc-mips64el-linux-gnuabi64 libc6-dev-mips64el-cross \
+ qemu-system-mips64el
+
+ENV CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_LINKER=mips64el-linux-gnuabi64-gcc \
+ CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64el -L /usr/mips64el-linux-gnuabi64" \
+ OBJDUMP=mips64el-linux-gnuabi64-objdump
\ No newline at end of file
diff --git a/library/stdarch/ci/docker/mipsel-unknown-linux-musl/Dockerfile b/library/stdarch/ci/docker/mipsel-unknown-linux-musl/Dockerfile
new file mode 100644
index 000000000..5d19d7c93
--- /dev/null
+++ b/library/stdarch/ci/docker/mipsel-unknown-linux-musl/Dockerfile
@@ -0,0 +1,25 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ ca-certificates \
+ gcc \
+ libc6-dev \
+ make \
+ qemu-user \
+ qemu-system-mips \
+ bzip2 \
+ curl \
+ file
+
+RUN mkdir /toolchain
+
+# Note that this originally came from:
+# https://downloads.openwrt.org/snapshots/trunk/malta/generic/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2
+RUN curl -L https://ci-mirrors.rust-lang.org/libc/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 | \
+ tar xjf - -C /toolchain --strip-components=2
+
+ENV PATH=$PATH:/rust/bin:/toolchain/bin \
+ CC_mipsel_unknown_linux_musl=mipsel-openwrt-linux-gcc \
+ CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_LINKER=mipsel-openwrt-linux-gcc \
+ CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_RUNNER="qemu-mipsel -L /toolchain"
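All of the cross-compilation images above follow the same contract: `CARGO_TARGET_<TRIPLE>_LINKER` points cargo at the cross gcc for linking, and `CARGO_TARGET_<TRIPLE>_RUNNER` wraps every produced test binary in a qemu user-mode emulator. As a rough sketch of what that means for this image (the binary name is illustrative, not from this commit):

```sh
# Hypothetical expansion: cargo links test binaries with the OpenWrt cross gcc
# and then launches each one through qemu, with -L pointing qemu at the
# toolchain directory for the target's dynamic loader and libraries.
qemu-mipsel -L /toolchain \
    target/mipsel-unknown-linux-musl/debug/deps/core_arch-0123abcd
```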
diff --git a/library/stdarch/ci/docker/nvptx64-nvidia-cuda/Dockerfile b/library/stdarch/ci/docker/nvptx64-nvidia-cuda/Dockerfile
new file mode 100644
index 000000000..864d72e62
--- /dev/null
+++ b/library/stdarch/ci/docker/nvptx64-nvidia-cuda/Dockerfile
@@ -0,0 +1,5 @@
+FROM ubuntu:18.04
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc \
+ libc6-dev \
+ ca-certificates
diff --git a/library/stdarch/ci/docker/powerpc-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/powerpc-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..877468e5d
--- /dev/null
+++ b/library/stdarch/ci/docker/powerpc-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,11 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc libc6-dev qemu-user ca-certificates \
+ gcc-powerpc-linux-gnu libc6-dev-powerpc-cross \
+ qemu-system-ppc make file
+
+ENV CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_LINKER=powerpc-linux-gnu-gcc \
+ CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc -cpu Vger -L /usr/powerpc-linux-gnu" \
+ CC=powerpc-linux-gnu-gcc \
+ OBJDUMP=powerpc-linux-gnu-objdump
diff --git a/library/stdarch/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..c030e6818
--- /dev/null
+++ b/library/stdarch/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,11 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc libc6-dev qemu-user ca-certificates \
+ gcc-powerpc64-linux-gnu libc6-dev-ppc64-cross \
+ qemu-system-ppc file make
+
+ENV CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_LINKER=powerpc64-linux-gnu-gcc \
+ CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64 -cpu power9 -L /usr/powerpc64-linux-gnu" \
+ CC=powerpc64-linux-gnu-gcc \
+ OBJDUMP=powerpc64-linux-gnu-objdump
diff --git a/library/stdarch/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..df611b47c
--- /dev/null
+++ b/library/stdarch/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,12 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc libc6-dev qemu-user ca-certificates \
+ gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross \
+ qemu-system-ppc file make
+
+# Work around qemu triggering a sigill on vec_subs if the cpu target is not defined.
+ENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER=powerpc64le-linux-gnu-gcc \
+ CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64le -cpu power9 -L /usr/powerpc64le-linux-gnu" \
+ CC=powerpc64le-linux-gnu-gcc \
+ OBJDUMP=powerpc64le-linux-gnu-objdump
diff --git a/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..1618db22f
--- /dev/null
+++ b/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,10 @@
+FROM ubuntu:21.10
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc libc6-dev qemu-user ca-certificates \
+ gcc-riscv64-linux-gnu libc6-dev-riscv64-cross
+
+ENV CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER=riscv64-linux-gnu-gcc \
+ CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER="qemu-riscv64 -L /usr/riscv64-linux-gnu" \
+ OBJDUMP=riscv64-linux-gnu-objdump
diff --git a/library/stdarch/ci/docker/s390x-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/s390x-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..657969234
--- /dev/null
+++ b/library/stdarch/ci/docker/s390x-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,13 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ curl ca-certificates \
+ gcc libc6-dev \
+ gcc-s390x-linux-gnu libc6-dev-s390x-cross \
+ qemu-user \
+ make \
+ file
+
+ENV CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_LINKER=s390x-linux-gnu-gcc \
+ CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER="qemu-s390x -L /usr/s390x-linux-gnu" \
+ OBJDUMP=s390x-linux-gnu-objdump
\ No newline at end of file
diff --git a/library/stdarch/ci/docker/wasm32-wasi/Dockerfile b/library/stdarch/ci/docker/wasm32-wasi/Dockerfile
new file mode 100644
index 000000000..3e250f8b5
--- /dev/null
+++ b/library/stdarch/ci/docker/wasm32-wasi/Dockerfile
@@ -0,0 +1,17 @@
+FROM ubuntu:20.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && apt-get install -y --no-install-recommends \
+ ca-certificates \
+ curl \
+ xz-utils \
+ clang
+
+RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/v0.29.0/wasmtime-v0.29.0-x86_64-linux.tar.xz | tar xJf -
+ENV PATH=$PATH:/wasmtime-v0.29.0-x86_64-linux
+
+ENV CARGO_TARGET_WASM32_WASI_RUNNER="wasmtime \
+ --enable-simd \
+ --enable-threads \
+ --mapdir .::/checkout/target/wasm32-wasi/release/deps \
+ --"
diff --git a/library/stdarch/ci/docker/x86_64-linux-android/Dockerfile b/library/stdarch/ci/docker/x86_64-linux-android/Dockerfile
new file mode 100644
index 000000000..c2830b15f
--- /dev/null
+++ b/library/stdarch/ci/docker/x86_64-linux-android/Dockerfile
@@ -0,0 +1,29 @@
+FROM ubuntu:16.04
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ ca-certificates \
+ curl \
+ gcc \
+ libc-dev \
+ python \
+ unzip \
+ file \
+ make
+
+WORKDIR /android/
+ENV ANDROID_ARCH=x86_64
+COPY android-install-ndk.sh /android/
+RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
+
+# We do not run x86_64-linux-android tests on an Android emulator.
+# See ci/android-sysimage.sh for information about how tests are run.
+COPY android-sysimage.sh /android/
+RUN bash /android/android-sysimage.sh x86_64 x86_64-24_r07.zip
+
+ENV PATH=$PATH:/rust/bin:/android/ndk-$ANDROID_ARCH/bin \
+ CARGO_TARGET_X86_64_LINUX_ANDROID_LINKER=x86_64-linux-android-gcc \
+ CC_x86_64_linux_android=x86_64-linux-android-gcc \
+ CXX_x86_64_linux_android=x86_64-linux-android-g++ \
+ OBJDUMP=x86_64-linux-android-objdump \
+ HOME=/tmp
diff --git a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile
new file mode 100644
index 000000000..b7fc93052
--- /dev/null
+++ b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:20.04
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc \
+ libc6-dev \
+ file \
+ make \
+ ca-certificates \
+ wget \
+ bzip2
+
+RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.35.0-2019-03-11-lin.tar.bz2
+RUN tar -xjf sde-external-8.35.0-2019-03-11-lin.tar.bz2
+ENV SKIP_TESTS="avx512bf16"
+ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.35.0-2019-03-11-lin/sde64 -rtm_mode full --"
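The runner variable above routes every x86_64 test binary through Intel's Software Development Emulator, which is how instruction sets the host CPU may lack (e.g. AVX-512) get exercised. One test run expands to roughly the following (binary name illustrative):

```sh
# Hypothetical expansion of the configured runner for one test executable:
/sde-external-8.35.0-2019-03-11-lin/sde64 -rtm_mode full -- \
    target/x86_64-unknown-linux-gnu/release/deps/core_arch-0123abcd
```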
diff --git a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
new file mode 100644
index 000000000..dc4c4e598
--- /dev/null
+++ b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:20.04
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc \
+ libc6-dev \
+ file \
+ make \
+ ca-certificates
diff --git a/library/stdarch/ci/dox.sh b/library/stdarch/ci/dox.sh
new file mode 100755
index 000000000..e70a32b2d
--- /dev/null
+++ b/library/stdarch/ci/dox.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+# Builds documentation for all target triples that we publish docs for. The
+# list of (arch, target) pairs to document is maintained in the `dox`
+# invocations at the bottom of this script.
+
+set -ex
+
+rm -rf target/doc
+mkdir -p target/doc
+
+dox() {
+ local arch=$1
+ local target=$2
+
+ echo "documenting ${arch}"
+
+ if [ "$CI" != "" ]; then
+ rustup target add "${target}" || true
+ fi
+
+ rm -rf "target/doc/${arch}"
+ mkdir "target/doc/${arch}"
+
+ export RUSTFLAGS="--cfg core_arch_docs"
+ export RUSTDOCFLAGS="--cfg core_arch_docs"
+
+ cargo build --verbose --target "${target}" --manifest-path crates/core_arch/Cargo.toml
+ cargo build --verbose --target "${target}" --manifest-path crates/std_detect/Cargo.toml
+
+ rustdoc --verbose --target "${target}" \
+ -o "target/doc/${arch}" crates/core_arch/src/lib.rs \
+ --edition=2018 \
+ --crate-name core_arch \
+ --library-path "target/${target}/debug/deps" \
+ --cfg core_arch_docs
+ rustdoc --verbose --target "${target}" \
+ -o "target/doc/${arch}" crates/std_detect/src/lib.rs \
+ --edition=2018 \
+ --crate-name std_detect \
+ --library-path "target/${target}/debug/deps" \
+ --extern cfg_if="$(ls target/"${target}"/debug/deps/libcfg_if-*.rlib)" \
+ --extern libc="$(ls target/"${target}"/debug/deps/liblibc-*.rlib)" \
+ --cfg core_arch_docs
+}
+
+dox i686 i686-unknown-linux-gnu
+dox x86_64 x86_64-unknown-linux-gnu
+dox arm armv7-unknown-linux-gnueabihf
+dox aarch64 aarch64-unknown-linux-gnu
+dox powerpc powerpc-unknown-linux-gnu
+dox powerpc64le powerpc64le-unknown-linux-gnu
+dox mips mips-unknown-linux-gnu
+dox mips64 mips64-unknown-linux-gnuabi64
+dox wasm32 wasm32-unknown-unknown
diff --git a/library/stdarch/ci/gba.json b/library/stdarch/ci/gba.json
new file mode 100644
index 000000000..5aece43af
--- /dev/null
+++ b/library/stdarch/ci/gba.json
@@ -0,0 +1,34 @@
+{
+ "abi-blacklist": [
+ "stdcall",
+ "fastcall",
+ "vectorcall",
+ "thiscall",
+ "win64",
+ "sysv64"
+ ],
+ "arch": "arm",
+ "atomic-cas": false,
+ "cpu": "arm7tdmi",
+ "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
+ "emit-debug-gdb-scripts": false,
+ "env": "agb",
+ "executables": true,
+ "features": "+soft-float,+strict-align",
+ "linker": "arm-none-eabi-ld",
+ "linker-flavor": "ld",
+ "linker-is-gnu": true,
+ "llvm-target": "thumbv4-none-agb",
+ "os": "none",
+ "panic-strategy": "abort",
+ "pre-link-args": {
+ "ld": [
+ "-Tlinker.ld"
+ ]
+ },
+ "relocation-model": "static",
+ "target-c-int-width": "32",
+ "target-endian": "little",
+ "target-pointer-width": "32",
+ "vendor": "nintendo"
+}
\ No newline at end of file
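This JSON file is a custom target specification; rustc and cargo accept a path to such a file anywhere a target triple is expected. A minimal sketch of consuming it on nightly (an illustrative invocation, not necessarily the exact command this CI uses):

```sh
# Hypothetical: build `core` and the crate against the custom GBA spec.
cargo build -Z build-std=core --target ci/gba.json --release
```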
diff --git a/library/stdarch/ci/run-docker.sh b/library/stdarch/ci/run-docker.sh
new file mode 100755
index 000000000..32209d96c
--- /dev/null
+++ b/library/stdarch/ci/run-docker.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env sh
+
+# Small script to run tests for a target (or all targets) inside all the
+# respective docker images.
+
+set -ex
+
+run() {
+ target=$(echo "${1}" | sed 's/-emulated//')
+ echo "Building docker container for TARGET=${1}"
+ docker build -t stdarch -f "ci/docker/${1}/Dockerfile" ci/
+ mkdir -p target c_programs rust_programs
+ echo "Running docker"
+ # shellcheck disable=SC2016
+ docker run \
+ --rm \
+ --user "$(id -u)":"$(id -g)" \
+ --env CARGO_HOME=/cargo \
+ --env CARGO_TARGET_DIR=/checkout/target \
+ --env TARGET="${target}" \
+ --env STDARCH_TEST_EVERYTHING \
+ --env STDARCH_ASSERT_INSTR_IGNORE \
+ --env STDARCH_DISABLE_ASSERT_INSTR \
+ --env NOSTD \
+ --env NORUN \
+ --env RUSTFLAGS \
+ --env STDARCH_TEST_NORUN \
+ --volume "${HOME}/.cargo":/cargo \
+ --volume "$(rustc --print sysroot)":/rust:ro \
+ --volume "$(pwd)":/checkout:ro \
+ --volume "$(pwd)"/target:/checkout/target \
+ --volume "$(pwd)"/c_programs:/checkout/c_programs \
+ --volume "$(pwd)"/rust_programs:/checkout/rust_programs \
+ --init \
+ --workdir /checkout \
+ --privileged \
+ stdarch \
+ sh -c "HOME=/tmp PATH=\$PATH:/rust/bin exec ci/run.sh ${1}"
+}
+
+if [ -z "$1" ]; then
+ for d in ci/docker/*; do
+ run "${d}"
+ done
+else
+ run "${1}"
+fi
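In practice the script is driven from the repository root, either for one target or, with no argument, for every directory under `ci/docker/`. Illustrative invocations (not output captured from this commit):

```sh
# Build the container for one target and run its test suite inside it:
sh ci/run-docker.sh x86_64-unknown-linux-gnu

# Or iterate over every ci/docker/* target:
sh ci/run-docker.sh
```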
diff --git a/library/stdarch/ci/run.sh b/library/stdarch/ci/run.sh
new file mode 100755
index 000000000..54145a0e7
--- /dev/null
+++ b/library/stdarch/ci/run.sh
@@ -0,0 +1,150 @@
+#!/usr/bin/env sh
+
+set -ex
+
+: "${TARGET?The TARGET environment variable must be set.}"
+
+# Tests are all super fast anyway, and they fault often enough on Travis that
+# the debuggability gained from running only one thread is worth it.
+#export RUST_BACKTRACE=full
+#export RUST_TEST_NOCAPTURE=1
+#export RUST_TEST_THREADS=1
+
+export RUSTFLAGS="${RUSTFLAGS} -D warnings -Z merge-functions=disabled "
+
+export STDARCH_DISABLE_DEDUP_GUARD=1
+
+case ${TARGET} in
+ # On Windows the linker performs identical COMDAT folding (ICF) by default
+ # in release mode which removes identical COMDAT sections. This interferes
+ # with our instruction assertions just like LLVM's MergeFunctions pass so
+ # we disable it.
+ *-pc-windows-msvc)
+ export RUSTFLAGS="${RUSTFLAGS} -Clink-args=/OPT:NOICF"
+ ;;
+ # On 32-bit use a static relocation model which avoids some extra
+ # instructions when dealing with static data, notably allowing some
+ # instruction assertion checks to pass below the 20 instruction limit. If
+ # this is the default, dynamic, then too many instructions are generated
+ # when we assert the instruction for a function and it causes tests to fail.
+ #
+ # It's not clear why `-Z plt=yes` is required here. Probably a bug in LLVM.
+ # If you can remove it and CI passes, please feel free to do so!
+ i686-* | i586-*)
+ export RUSTFLAGS="${RUSTFLAGS} -C relocation-model=static -Z plt=yes"
+ ;;
+ # Unoptimized builds use fast-isel, which breaks with MSA.
+ mips-* | mipsel-*)
+ export RUSTFLAGS="${RUSTFLAGS} -C llvm-args=-fast-isel=false"
+ ;;
+ # Some of our test dependencies use the deprecated `gcc` crate, which is
+ # missing a fix from https://github.com/alexcrichton/cc-rs/pull/627. Apply
+ # the workaround manually here.
+ armv7-*eabihf | thumbv7-*eabihf)
+ export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+neon"
+ export TARGET_CFLAGS="-mfpu=vfpv3-d16"
+ ;;
+ # Some of our test dependencies use the deprecated `gcc` crate, which
+ # doesn't detect RISC-V compilers automatically, so do it manually here.
+ riscv64*)
+ export TARGET_CC="riscv64-linux-gnu-gcc"
+ ;;
+esac
+
+echo "RUSTFLAGS=${RUSTFLAGS}"
+echo "FEATURES=${FEATURES}"
+echo "OBJDUMP=${OBJDUMP}"
+echo "STDARCH_DISABLE_ASSERT_INSTR=${STDARCH_DISABLE_ASSERT_INSTR}"
+echo "STDARCH_TEST_EVERYTHING=${STDARCH_TEST_EVERYTHING}"
+
+cargo_test() {
+ cmd="cargo"
+ subcmd="test"
+ if [ "$NORUN" = "1" ]; then
+ export subcmd="build"
+ fi
+ cmd="$cmd ${subcmd} --target=$TARGET $1"
+ cmd="$cmd -- $2"
+
+ # wasm targets can't catch panics, so if a test fails make sure the test
+ # harness isn't trying to capture output; otherwise we won't get any useful
+ # output.
+ case ${TARGET} in
+ wasm32*)
+ cmd="$cmd --nocapture"
+ ;;
+ esac
+
+ if [ "$SKIP_TESTS" != "" ]; then
+ cmd="$cmd --skip "$SKIP_TESTS
+ fi
+ $cmd
+}
+
+CORE_ARCH="--manifest-path=crates/core_arch/Cargo.toml"
+STD_DETECT="--manifest-path=crates/std_detect/Cargo.toml"
+STDARCH_EXAMPLES="--manifest-path=examples/Cargo.toml"
+INTRINSIC_TEST="--manifest-path=crates/intrinsic-test/Cargo.toml"
+
+cargo_test "${CORE_ARCH} --release"
+
+if [ "$NOSTD" != "1" ]; then
+ cargo_test "${STD_DETECT}"
+ cargo_test "${STD_DETECT} --release"
+
+ cargo_test "${STD_DETECT} --no-default-features"
+ cargo_test "${STD_DETECT} --no-default-features --features=std_detect_file_io"
+ cargo_test "${STD_DETECT} --no-default-features --features=std_detect_dlsym_getauxval"
+ cargo_test "${STD_DETECT} --no-default-features --features=std_detect_dlsym_getauxval,std_detect_file_io"
+
+ cargo_test "${STDARCH_EXAMPLES}"
+ cargo_test "${STDARCH_EXAMPLES} --release"
+fi
+
+# Test targets compiled with extra features.
+case ${TARGET} in
+ x86*)
+ export STDARCH_DISABLE_ASSERT_INSTR=1
+ export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
+ cargo_test "--release"
+ ;;
+ # FIXME: don't build anymore
+ #mips-*gnu* | mipsel-*gnu*)
+ # export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+msa,+fp64,+mips32r5"
+ # cargo_test "--release"
+ # ;;
+ mips64*)
+ export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+msa"
+ cargo_test "--release"
+ ;;
+ powerpc64*)
+ # We don't build the ppc 32-bit targets with these - these targets
+ # are mostly unsupported for now.
+ OLD_RUSTFLAGS="${RUSTFLAGS}"
+ export RUSTFLAGS="${OLD_RUSTFLAGS} -C target-feature=+altivec"
+ cargo_test "--release"
+
+ export RUSTFLAGS="${OLD_RUSTFLAGS} -C target-feature=+vsx"
+ cargo_test "--release"
+ ;;
+ *)
+ ;;
+
+esac
+
+if [ "${TARGET}" = "aarch64-unknown-linux-gnu" ]; then
+ export CPPFLAGS="-fuse-ld=lld -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/9/aarch64-linux-gnu/"
+ RUST_LOG=warn cargo run ${INTRINSIC_TEST} --release --bin intrinsic-test -- crates/intrinsic-test/acle/tools/intrinsic_db/advsimd.csv --runner "${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}" --cppcompiler "clang++-13" --skip crates/intrinsic-test/missing_aarch64.txt
+elif [ "${TARGET}" = "armv7-unknown-linux-gnueabihf" ]; then
+ export CPPFLAGS="-fuse-ld=lld -I/usr/arm-linux-gnueabihf/include/ -I/usr/arm-linux-gnueabihf/include/c++/9/arm-linux-gnueabihf/"
+ RUST_LOG=warn cargo run ${INTRINSIC_TEST} --release --bin intrinsic-test -- crates/intrinsic-test/acle/tools/intrinsic_db/advsimd.csv --runner "${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}" --cppcompiler "clang++-13" --skip crates/intrinsic-test/missing_arm.txt --a32
+fi
+
+if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ]; then
+ # Test examples
+ (
+ cd examples
+ cargo test --target "$TARGET"
+ echo test | cargo run --release hex
+ )
+fi
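To make the string-splicing in `cargo_test` concrete: with `SKIP_TESTS=avx512bf16` in the environment (as the emulated x86_64 image sets) and `TARGET=x86_64-unknown-linux-gnu`, a call like `cargo_test "--release"` composes and executes roughly this command (a sketch of the expansion, not captured output):

```sh
# subcmd stays `test` unless NORUN=1; the skip filter is appended last.
cargo test --target=x86_64-unknown-linux-gnu --release -- --skip avx512bf16
```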
diff --git a/library/stdarch/ci/runtest-android.rs b/library/stdarch/ci/runtest-android.rs
new file mode 100644
index 000000000..ed1cd80c8
--- /dev/null
+++ b/library/stdarch/ci/runtest-android.rs
@@ -0,0 +1,45 @@
+use std::env;
+use std::process::Command;
+use std::path::{Path, PathBuf};
+
+fn main() {
+ let args = env::args_os()
+ .skip(1)
+ .filter(|arg| arg != "--quiet")
+ .collect::<Vec<_>>();
+ assert_eq!(args.len(), 1);
+ let test = PathBuf::from(&args[0]);
+ let dst = Path::new("/data/local/tmp").join(test.file_name().unwrap());
+
+ let status = Command::new("adb")
+ .arg("wait-for-device")
+ .status()
+ .expect("failed to run: adb wait-for-device");
+ assert!(status.success());
+
+ let status = Command::new("adb")
+ .arg("push")
+ .arg(&test)
+ .arg(&dst)
+ .status()
+ .expect("failed to run: adb pushr");
+ assert!(status.success());
+
+ let output = Command::new("adb")
+ .arg("shell")
+ .arg(&dst)
+ .output()
+ .expect("failed to run: adb shell");
+
+ println!("status: {}\nstdout ---\n{}\nstderr ---\n{}",
+ output.status,
+ String::from_utf8_lossy(&output.stdout),
+ String::from_utf8_lossy(&output.stderr));
+ // Check the shell's exit status only after printing, so the diagnostics
+ // above survive a failing run.
+ assert!(output.status.success());
+
+ let stdout = String::from_utf8_lossy(&output.stdout);
+ // Require at least one "test result" line; `all` alone would succeed
+ // vacuously on empty output.
+ let mut lines = stdout.lines().filter(|l| l.starts_with("test result")).peekable();
+ if lines.peek().is_none()
+ || !lines.all(|l| l.contains("test result: ok") && l.contains("0 failed"))
+ {
+ panic!("failed to find successful test run");
+ }
+}
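This helper is the sort of binary one registers as a cargo target runner, so that `cargo test` pushes each test executable to a device over `adb` and then scrapes the harness output. A hypothetical wiring (not part of this commit) would look like:

```sh
# Hypothetical: compile the helper, register it as the runner, and test.
rustc ci/runtest-android.rs -o /tmp/runtest-android
export CARGO_TARGET_AARCH64_LINUX_ANDROID_RUNNER=/tmp/runtest-android
cargo test --target aarch64-linux-android
```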
diff --git a/library/stdarch/ci/style.sh b/library/stdarch/ci/style.sh
new file mode 100755
index 000000000..8f81883f3
--- /dev/null
+++ b/library/stdarch/ci/style.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env sh
+
+set -ex
+
+if rustup component add rustfmt-preview ; then
+ command -v rustfmt
+ rustfmt -V
+ cargo fmt --all -- --check
+fi
+
+# if rustup component add clippy-preview ; then
+# cargo clippy -V
+# cargo clippy --all -- -D clippy::pedantic
+# fi
+
+if shellcheck --version ; then
+ shellcheck -e SC2103 ci/*.sh
+else
+ echo "shellcheck not found"
+ exit 1
+fi
+
diff --git a/library/stdarch/crates/assert-instr-macro/Cargo.toml b/library/stdarch/crates/assert-instr-macro/Cargo.toml
new file mode 100644
index 000000000..3d9b32067
--- /dev/null
+++ b/library/stdarch/crates/assert-instr-macro/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "assert-instr-macro"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+edition = "2018"
+
+[lib]
+proc-macro = true
+test = false
+
+[dependencies]
+proc-macro2 = "1.0"
+quote = "1.0"
+syn = { version = "1.0", features = ["full"] }
diff --git a/library/stdarch/crates/assert-instr-macro/build.rs b/library/stdarch/crates/assert-instr-macro/build.rs
new file mode 100644
index 000000000..01ae79660
--- /dev/null
+++ b/library/stdarch/crates/assert-instr-macro/build.rs
@@ -0,0 +1,13 @@
+use std::env;
+
+fn main() {
+ println!("cargo:rerun-if-changed=build.rs");
+ let opt_level = env::var("OPT_LEVEL")
+ .ok()
+ .and_then(|s| s.parse().ok())
+ .unwrap_or(0);
+ let profile = env::var("PROFILE").unwrap_or_default();
+ if profile == "release" || opt_level >= 2 {
+ println!("cargo:rustc-cfg=optimized");
+ }
+}
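The only effect of the `cargo:rustc-cfg=optimized` line is to expose a custom `cfg` flag to the crate being compiled; the macro below consults it to skip emitting disassembly shims in unoptimized builds. A minimal sketch of gating on such a flag (hypothetical function names):

```rust
// Gate code on the custom cfg emitted by a build script like the one above.
#[cfg(optimized)]
fn only_in_optimized_builds() {}

fn main() {
    // `cfg!(optimized)` is true only when build.rs printed
    // `cargo:rustc-cfg=optimized` during compilation.
    println!("optimized: {}", cfg!(optimized));
}
```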
diff --git a/library/stdarch/crates/assert-instr-macro/src/lib.rs b/library/stdarch/crates/assert-instr-macro/src/lib.rs
new file mode 100644
index 000000000..9fa411df3
--- /dev/null
+++ b/library/stdarch/crates/assert-instr-macro/src/lib.rs
@@ -0,0 +1,251 @@
+//! Implementation of the `#[assert_instr]` macro
+//!
+//! This macro is used when testing the `stdarch` crate and is used to generate
+//! test cases to assert that functions do indeed contain the instructions that
+//! we're expecting them to contain.
+//!
+//! The procedural macro here is relatively simple, it simply appends a
+//! `#[test]` function to the original token stream which asserts that the
+//! function itself contains the relevant instruction.
+#![deny(rust_2018_idioms)]
+
+#[macro_use]
+extern crate quote;
+
+use proc_macro2::TokenStream;
+use quote::ToTokens;
+
+#[proc_macro_attribute]
+pub fn assert_instr(
+ attr: proc_macro::TokenStream,
+ item: proc_macro::TokenStream,
+) -> proc_macro::TokenStream {
+ let invoc = match syn::parse::<Invoc>(attr) {
+ Ok(s) => s,
+ Err(e) => return e.to_compile_error().into(),
+ };
+ let item = match syn::parse::<syn::Item>(item) {
+ Ok(s) => s,
+ Err(e) => return e.to_compile_error().into(),
+ };
+ let func = match item {
+ syn::Item::Fn(ref f) => f,
+ _ => panic!("must be attached to a function"),
+ };
+
+ let instr = &invoc.instr;
+ let name = &func.sig.ident;
+
+ // Disable assert_instr for x86 targets compiled with AVX enabled, which
+ // causes LLVM to generate different instructions than the ones we are
+ // testing for.
+ let disable_assert_instr = std::env::var("STDARCH_DISABLE_ASSERT_INSTR").is_ok();
+
+ // Disable dedup guard. Only works if the LLVM MergeFunctions pass is disabled, e.g.
+ // with `-Z merge-functions=disabled` in RUSTFLAGS.
+ let disable_dedup_guard = std::env::var("STDARCH_DISABLE_DEDUP_GUARD").is_ok();
+
+ // If instruction tests are disabled avoid emitting this shim at all, just
+ // return the original item without our attribute.
+ if !cfg!(optimized) || disable_assert_instr {
+ return (quote! { #item }).into();
+ }
+
+ let instr_str = instr
+ .replace('.', "_")
+ .replace('/', "_")
+ .replace(':', "_")
+ .replace(char::is_whitespace, "");
+ let assert_name = syn::Ident::new(&format!("assert_{}_{}", name, instr_str), name.span());
+ // These names have to be unique enough for us to find them in the disassembly later on:
+ let shim_name = syn::Ident::new(
+ &format!("stdarch_test_shim_{}_{}", name, instr_str),
+ name.span(),
+ );
+ let shim_name_ptr = syn::Ident::new(
+ &format!("stdarch_test_shim_{}_{}_ptr", name, instr_str).to_ascii_uppercase(),
+ name.span(),
+ );
+ let mut inputs = Vec::new();
+ let mut input_vals = Vec::new();
+ let mut const_vals = Vec::new();
+ let ret = &func.sig.output;
+ for arg in func.sig.inputs.iter() {
+ let capture = match *arg {
+ syn::FnArg::Typed(ref c) => c,
+ ref v => panic!(
+ "arguments must not have patterns: `{:?}`",
+ v.clone().into_token_stream()
+ ),
+ };
+ let ident = match *capture.pat {
+ syn::Pat::Ident(ref i) => &i.ident,
+ _ => panic!("must have bare arguments"),
+ };
+ if let Some(&(_, ref tokens)) = invoc.args.iter().find(|a| *ident == a.0) {
+ input_vals.push(quote! { #tokens });
+ } else {
+ inputs.push(capture);
+ input_vals.push(quote! { #ident });
+ }
+ }
+ for arg in func.sig.generics.params.iter() {
+ let c = match *arg {
+ syn::GenericParam::Const(ref c) => c,
+ ref v => panic!(
+ "only const generics are allowed: `{:?}`",
+ v.clone().into_token_stream()
+ ),
+ };
+ if let Some(&(_, ref tokens)) = invoc.args.iter().find(|a| c.ident == a.0) {
+ const_vals.push(quote! { #tokens });
+ } else {
+ panic!("const generics must have a value for tests");
+ }
+ }
+
+ let attrs = func
+ .attrs
+ .iter()
+ .filter(|attr| {
+ attr.path
+ .segments
+ .first()
+ .expect("attr.path.segments.first() failed")
+ .ident
+ .to_string()
+ .starts_with("target")
+ })
+ .collect::<Vec<_>>();
+ let attrs = Append(&attrs);
+
+ // Use an ABI on Windows that passes SIMD values in registers, like what
+ // happens on Unix (I think?) by default.
+ let abi = if cfg!(windows) {
+ let target = std::env::var("TARGET").unwrap();
+ if target.contains("x86_64") {
+ syn::LitStr::new("sysv64", proc_macro2::Span::call_site())
+ } else {
+ syn::LitStr::new("vectorcall", proc_macro2::Span::call_site())
+ }
+ } else {
+ syn::LitStr::new("C", proc_macro2::Span::call_site())
+ };
+ let shim_name_str = format!("{}{}", shim_name, assert_name);
+ let to_test = if disable_dedup_guard {
+ quote! {
+ #attrs
+ #[no_mangle]
+ #[inline(never)]
+ pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret {
+ #name::<#(#const_vals),*>(#(#input_vals),*)
+ }
+ }
+ } else {
+ quote! {
+
+ const #shim_name_ptr : *const u8 = #shim_name_str.as_ptr();
+
+ #attrs
+ #[no_mangle]
+ #[inline(never)]
+ pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret {
+ // The compiler in optimized mode by default runs a pass called
+ // "mergefunc" where it'll merge functions that look identical.
+ // Turns out some intrinsics produce identical code and they're
+ // folded together, meaning that one just jumps to another. This
+ // messes up our inspection of the disassembly of this function and
+ // we're not a huge fan of that.
+ //
+ // To thwart this pass and prevent functions from being merged we
+ // generate some code that's hopefully very tight in terms of
+ // codegen but is otherwise unique to prevent code from being
+ // folded.
+ ::stdarch_test::_DONT_DEDUP = #shim_name_ptr;
+ #name::<#(#const_vals),*>(#(#input_vals),*)
+ }
+ }
+ };
+
+ let tokens: TokenStream = quote! {
+ #[test]
+ #[allow(non_snake_case)]
+ fn #assert_name() {
+ #to_test
+
+ ::stdarch_test::assert(#shim_name as usize,
+ stringify!(#shim_name),
+ #instr);
+ }
+ };
+
+ let tokens: TokenStream = quote! {
+ #item
+ #tokens
+ };
+ tokens.into()
+}
+
+struct Invoc {
+ instr: String,
+ args: Vec<(syn::Ident, syn::Expr)>,
+}
+
+impl syn::parse::Parse for Invoc {
+ fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result<Self> {
+ use syn::{ext::IdentExt, Token};
+
+ let mut instr = String::new();
+ while !input.is_empty() {
+ if input.parse::<Token![,]>().is_ok() {
+ break;
+ }
+ if let Ok(ident) = syn::Ident::parse_any(input) {
+ instr.push_str(&ident.to_string());
+ continue;
+ }
+ if input.parse::<Token![.]>().is_ok() {
+ instr.push('.');
+ continue;
+ }
+ if let Ok(s) = input.parse::<syn::LitStr>() {
+ instr.push_str(&s.value());
+ continue;
+ }
+ println!("{:?}", input.cursor().token_stream());
+ return Err(input.error("expected an instruction"));
+ }
+ if instr.is_empty() {
+ return Err(input.error("expected an instruction before comma"));
+ }
+ let mut args = Vec::new();
+ while !input.is_empty() {
+ let name = input.parse::<syn::Ident>()?;
+ input.parse::<Token![=]>()?;
+ let expr = input.parse::<syn::Expr>()?;
+ args.push((name, expr));
+
+ if input.parse::<Token![,]>().is_err() {
+ if !input.is_empty() {
+ return Err(input.error("extra tokens at end"));
+ }
+ break;
+ }
+ }
+ Ok(Self { instr, args })
+ }
+}
+
+struct Append<T>(T);
+
+impl<T> quote::ToTokens for Append<T>
+where
+ T: Clone + IntoIterator,
+ T::Item: quote::ToTokens,
+{
+ fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) {
+ for item in self.0.clone() {
+ item.to_tokens(tokens);
+ }
+ }
+}
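As a consumer-side illustration of the macro above (a hypothetical intrinsic wrapper, not one taken from `core_arch`): attaching the attribute leaves the function unchanged and appends a `#[test]` named `assert_<fn>_<instr>` that disassembles a non-inlined shim and checks that the named instruction appears. The generated test additionally needs the `stdarch_test` crate at test time.

```rust
// Hypothetical usage sketch; `my_add` is illustrative, not a real intrinsic.
use core::arch::x86_64::{__m256i, _mm256_add_epi32};

#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn my_add(a: __m256i, b: __m256i) -> __m256i {
    _mm256_add_epi32(a, b)
}
```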
diff --git a/library/stdarch/crates/core_arch/Cargo.toml b/library/stdarch/crates/core_arch/Cargo.toml
new file mode 100644
index 000000000..14b5479d1
--- /dev/null
+++ b/library/stdarch/crates/core_arch/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "core_arch"
+version = "0.1.5"
+authors = [
+ "Alex Crichton <alex@alexcrichton.com>",
+ "Andrew Gallant <jamslam@gmail.com>",
+ "Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>",
+]
+description = "`core::arch` - Rust's core library architecture-specific intrinsics."
+homepage = "https://github.com/rust-lang/stdarch"
+repository = "https://github.com/rust-lang/stdarch"
+readme = "README.md"
+keywords = ["core", "simd", "arch", "intrinsics"]
+categories = ["hardware-support", "no-std"]
+license = "MIT OR Apache-2.0"
+build = "build.rs"
+edition = "2018"
+
+[badges]
+is-it-maintained-issue-resolution = { repository = "rust-lang/stdarch" }
+is-it-maintained-open-issues = { repository = "rust-lang/stdarch" }
+maintenance = { status = "experimental" }
+
+[dev-dependencies]
+stdarch-test = { version = "0.*", path = "../stdarch-test" }
+std_detect = { version = "0.*", path = "../std_detect" }
diff --git a/library/stdarch/crates/core_arch/LICENSE-APACHE b/library/stdarch/crates/core_arch/LICENSE-APACHE
new file mode 100644
index 000000000..16fe87b06
--- /dev/null
+++ b/library/stdarch/crates/core_arch/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/library/stdarch/crates/core_arch/LICENSE-MIT b/library/stdarch/crates/core_arch/LICENSE-MIT
new file mode 100644
index 000000000..52d82415d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2017 The Rust Project Developers
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/library/stdarch/crates/core_arch/MISSING.md b/library/stdarch/crates/core_arch/MISSING.md
new file mode 100644
index 000000000..c948f3f8c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/MISSING.md
@@ -0,0 +1,116 @@
+## The following NEON instructions are currently not implemented in stdarch
+
+### Not implemented on arm:
+
+`vcadd_rot270_f32`
+
+`vcadd_rot90_f32`
+
+`vcaddq_rot270_f32`
+
+`vcaddq_rot90_f32`
+
+`vdot_s32`
+
+`vdot_u32`
+
+`vdotq_s32`
+
+`vdotq_u32`
+
+`vdot_lane_s32`
+
+`vdot_lane_u32`
+
+`vdotq_lane_s32`
+
+`vdotq_lane_u32`
+
+`vcmla_f32`
+
+`vcmla_lane_f32`
+
+`vcmla_laneq_f32`
+
+`vcmla_rot180_f32`
+
+`vcmla_rot180_lane_f32`
+
+`vcmla_rot180_laneq_f32`
+
+`vcmla_rot270_f32`
+
+`vcmla_rot270_lane_f32`
+
+`vcmla_rot270_laneq_f32`
+
+`vcmla_rot90_f32`
+
+`vcmla_rot90_lane_f32`
+
+`vcmla_rot90_laneq_f32`
+
+`vcmlaq_f32`
+
+`vcmlaq_lane_f32`
+
+`vcmlaq_laneq_f32`
+
+`vcmlaq_rot180_f32`
+
+`vcmlaq_rot180_lane_f32`
+
+`vcmlaq_rot180_laneq_f32`
+
+`vcmlaq_rot270_f32`
+
+`vcmlaq_rot270_lane_f32`
+
+`vcmlaq_rot270_laneq_f32`
+
+`vcmlaq_rot90_f32`
+
+`vcmlaq_rot90_lane_f32`
+
+`vcmlaq_rot90_laneq_f32`
+
+### Not implemented in LLVM:
+
+`vrnd32x_f64`
+
+`vrnd32xq_f64`
+
+`vrnd32z_f64`
+
+`vrnd32zq_f64`
+
+`vrnd64x_f64`
+
+`vrnd64xq_f64`
+
+`vrnd64z_f64`
+
+`vrnd64zq_f64`
+
+### LLVM Select errors may occur:
+
+`vsudot_lane_s32`
+
+`vsudot_laneq_s32`
+
+`vsudotq_lane_s32`
+
+`vsudotq_laneq_s32`
+
+`vusdot_lane_s32`
+
+`vusdot_laneq_s32`
+
+`vusdot_s32`
+
+`vusdotq_lane_s32`
+
+`vusdotq_laneq_s32`
+
+`vusdotq_s32v`
+
diff --git a/library/stdarch/crates/core_arch/README.md b/library/stdarch/crates/core_arch/README.md
new file mode 100644
index 000000000..a5f490bcf
--- /dev/null
+++ b/library/stdarch/crates/core_arch/README.md
@@ -0,0 +1,60 @@
+`core::arch` - Rust's core library architecture-specific intrinsics
+=======
+
+The `core::arch` module implements architecture-dependent intrinsics (e.g. SIMD).
+
+# Usage
+
+`core::arch` is available as part of `libcore` and is re-exported by
+`libstd`. Prefer using it via `core::arch` or `std::arch` rather than via
+this crate. Unstable features are often available in nightly Rust via the
+`feature(stdsimd)` feature gate.
+
+Using `core::arch` via this crate requires nightly Rust, and it can (and does)
+break often. The only cases in which you should consider using it via this crate
+are:
+
+* if you need to re-compile `core::arch` yourself, e.g., with particular
+ target-features enabled that are not enabled for `libcore`/`libstd`. Note: if
+ you need to re-compile it for a non-standard target, please prefer using
+ `xargo` and re-compiling `libcore`/`libstd` as appropriate instead of using
+ this crate.
+
+* using some features that might not be available even behind unstable Rust
+ features. We try to keep these to a minimum. If you need to use some of these
+ features, please open an issue so that we can expose them in nightly Rust and
+ you can use them from there.
+
+# Documentation
+
+* [Documentation - i686][i686]
+* [Documentation - x86\_64][x86_64]
+* [Documentation - arm][arm]
+* [Documentation - aarch64][aarch64]
+* [Documentation - powerpc][powerpc]
+* [Documentation - powerpc64][powerpc64]
+* [How to get started][contrib]
+* [How to help implement intrinsics][help-implement]
+
+[contrib]: https://github.com/rust-lang/stdarch/blob/master/CONTRIBUTING.md
+[help-implement]: https://github.com/rust-lang/stdarch/issues/40
+[i686]: https://rust-lang.github.io/stdarch/i686/core_arch/
+[x86_64]: https://rust-lang.github.io/stdarch/x86_64/core_arch/
+[arm]: https://rust-lang.github.io/stdarch/arm/core_arch/
+[aarch64]: https://rust-lang.github.io/stdarch/aarch64/core_arch/
+[powerpc]: https://rust-lang.github.io/stdarch/powerpc/core_arch/
+[powerpc64]: https://rust-lang.github.io/stdarch/powerpc64/core_arch/
+
+# License
+
+`core_arch` is primarily distributed under the terms of both the MIT license and
+the Apache License (Version 2.0), with portions covered by various BSD-like
+licenses.
+
+See LICENSE-APACHE and LICENSE-MIT for details.
+
+# Contribution
+
+Unless you explicitly state otherwise, any contribution intentionally submitted
+for inclusion in `core_arch` by you, as defined in the Apache-2.0 license,
+shall be dual licensed as above, without any additional terms or conditions.
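Since the README recommends reaching these intrinsics through `std::arch` rather than through this crate directly, a typical stable-Rust usage pattern combines runtime feature detection with an `unsafe` intrinsic call. A generic sketch (not code from this repository):

```rust
// Runtime feature detection plus an arch-specific intrinsic via std::arch.
#[cfg(target_arch = "x86_64")]
fn add4(a: &[f32; 4], b: &[f32; 4]) -> [f32; 4] {
    use std::arch::x86_64::{_mm_add_ps, _mm_loadu_ps, _mm_storeu_ps};
    let mut out = [0.0f32; 4];
    if is_x86_feature_detected!("sse2") {
        // Safe to call SSE2 intrinsics: the CPU reported support at runtime.
        unsafe {
            let v = _mm_add_ps(_mm_loadu_ps(a.as_ptr()), _mm_loadu_ps(b.as_ptr()));
            _mm_storeu_ps(out.as_mut_ptr(), v);
        }
    } else {
        for i in 0..4 {
            out[i] = a[i] + b[i];
        }
    }
    out
}
```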
diff --git a/library/stdarch/crates/core_arch/avx512bw.md b/library/stdarch/crates/core_arch/avx512bw.md
new file mode 100644
index 000000000..20c8c2f14
--- /dev/null
+++ b/library/stdarch/crates/core_arch/avx512bw.md
@@ -0,0 +1,764 @@
+<summary>["AVX512BW"]</summary><p>
+
+ * [x] [`_mm512_loadu_epi16`]
+ * [x] [`_mm512_mask_loadu_epi16`] //need i1
+ * [x] [`_mm512_maskz_loadu_epi16`] //need i1
+ * [x] [`_mm_loadu_epi16`]
+ * [x] [`_mm_mask_loadu_epi16`] //need i1
+ * [x] [`_mm_maskz_loadu_epi16`] //need i1
+ * [x] [`_mm256_loadu_epi16`]
+ * [x] [`_mm256_mask_loadu_epi16`] //need i1
+ * [x] [`_mm256_maskz_loadu_epi16`] //need i1
+ * [x] [`_mm512_loadu_epi8`]
+ * [x] [`_mm512_mask_loadu_epi8`] //need i1
+ * [x] [`_mm512_maskz_loadu_epi8`] //need i1
+ * [x] [`_mm_loadu_epi8`]
+ * [x] [`_mm_mask_loadu_epi8`] //need i1
+ * [x] [`_mm_maskz_loadu_epi8`] //need i1
+ * [x] [`_mm256_loadu_epi8`]
+ * [x] [`_mm256_mask_loadu_epi8`] //need i1
+ * [x] [`_mm256_maskz_loadu_epi8`] //need i1
+ * [x] [`_mm512_mask_storeu_epi16`]
+ * [x] [`_mm512_storeu_epi16`]
+ * [x] [`_mm_mask_storeu_epi16`] //need i1
+ * [x] [`_mm_storeu_epi16`]
+ * [x] [`_mm256_mask_storeu_epi16`] //need i1
+ * [x] [`_mm256_storeu_epi16`]
+ * [x] [`_mm512_mask_storeu_epi8`] //need i1
+ * [x] [`_mm512_storeu_epi8`]
+ * [x] [`_mm_mask_storeu_epi8`] //need i1
+ * [x] [`_mm_storeu_epi8`]
+ * [x] [`_mm256_mask_storeu_epi8`] //need i1
+ * [x] [`_mm256_storeu_epi8`]
+ * [x] [`_mm512_abs_epi16`]
+ * [x] [`_mm512_mask_abs_epi16`]
+ * [x] [`_mm512_maskz_abs_epi16`]
+ * [x] [`_mm_mask_abs_epi16`]
+ * [x] [`_mm_maskz_abs_epi16`]
+ * [x] [`_mm256_mask_abs_epi16`]
+ * [x] [`_mm256_maskz_abs_epi16`]
+ * [x] [`_mm512_abs_epi8`]
+ * [x] [`_mm512_mask_abs_epi8`]
+ * [x] [`_mm512_maskz_abs_epi8`]
+ * [x] [`_mm_mask_abs_epi8`]
+ * [x] [`_mm_maskz_abs_epi8`]
+ * [x] [`_mm256_mask_abs_epi8`]
+ * [x] [`_mm256_maskz_abs_epi8`]
+ * [x] [`_mm512_add_epi16`]
+ * [x] [`_mm512_mask_add_epi16`]
+ * [x] [`_mm512_maskz_add_epi16`]
+ * [x] [`_mm_mask_add_epi16`]
+ * [x] [`_mm_maskz_add_epi16`]
+ * [x] [`_mm256_mask_add_epi16`]
+ * [x] [`_mm256_maskz_add_epi16`]
+ * [x] [`_mm512_add_epi8`]
+ * [x] [`_mm512_mask_add_epi8`]
+ * [x] [`_mm512_maskz_add_epi8`]
+ * [x] [`_mm_mask_add_epi8`]
+ * [x] [`_mm_maskz_add_epi8`]
+ * [x] [`_mm256_mask_add_epi8`]
+ * [x] [`_mm256_maskz_add_epi8`]
+ * [x] [`_mm512_adds_epi16`]
+ * [x] [`_mm512_mask_adds_epi16`]
+ * [x] [`_mm512_maskz_adds_epi16`]
+ * [x] [`_mm_mask_adds_epi16`]
+ * [x] [`_mm_maskz_adds_epi16`]
+ * [x] [`_mm256_mask_adds_epi16`]
+ * [x] [`_mm256_maskz_adds_epi16`]
+ * [x] [`_mm512_adds_epi8`]
+ * [x] [`_mm512_mask_adds_epi8`]
+ * [x] [`_mm512_maskz_adds_epi8`]
+ * [x] [`_mm_mask_adds_epi8`]
+ * [x] [`_mm_maskz_adds_epi8`]
+ * [x] [`_mm256_mask_adds_epi8`]
+ * [x] [`_mm256_maskz_adds_epi8`]
+ * [x] [`_mm512_adds_epu16`]
+ * [x] [`_mm512_mask_adds_epu16`]
+ * [x] [`_mm512_maskz_adds_epu16`]
+ * [x] [`_mm_mask_adds_epu16`]
+ * [x] [`_mm_maskz_adds_epu16`]
+ * [x] [`_mm256_mask_adds_epu16`]
+ * [x] [`_mm256_maskz_adds_epu16`]
+ * [x] [`_mm512_adds_epu8`]
+ * [x] [`_mm512_mask_adds_epu8`]
+ * [x] [`_mm512_maskz_adds_epu8`]
+ * [x] [`_mm_mask_adds_epu8`]
+ * [x] [`_mm_maskz_adds_epu8`]
+ * [x] [`_mm256_mask_adds_epu8`]
+ * [x] [`_mm256_maskz_adds_epu8`]
+ * [x] [`_mm512_alignr_epi8`]
+ * [x] [`_mm512_mask_alignr_epi8`]
+ * [x] [`_mm512_maskz_alignr_epi8`]
+ * [x] [`_mm_mask_alignr_epi8`]
+ * [x] [`_mm_maskz_alignr_epi8`]
+ * [x] [`_mm256_mask_alignr_epi8`]
+ * [x] [`_mm256_maskz_alignr_epi8`]
+ * [x] [`_mm512_avg_epu16`]
+ * [x] [`_mm512_mask_avg_epu16`]
+ * [x] [`_mm512_maskz_avg_epu16`]
+ * [x] [`_mm_mask_avg_epu16`]
+ * [x] [`_mm_maskz_avg_epu16`]
+ * [x] [`_mm256_mask_avg_epu16`]
+ * [x] [`_mm256_maskz_avg_epu16`]
+ * [x] [`_mm512_avg_epu8`]
+ * [x] [`_mm512_mask_avg_epu8`]
+ * [x] [`_mm512_maskz_avg_epu8`]
+ * [x] [`_mm_mask_avg_epu8`]
+ * [x] [`_mm_maskz_avg_epu8`]
+ * [x] [`_mm256_mask_avg_epu8`]
+ * [x] [`_mm256_maskz_avg_epu8`]
+ * [x] [`_mm512_mask_blend_epi16`]
+ * [x] [`_mm_mask_blend_epi16`]
+ * [x] [`_mm256_mask_blend_epi16`]
+ * [x] [`_mm512_mask_blend_epi8`]
+ * [x] [`_mm512_broadcastb_epi8`]
+ * [x] [`_mm_mask_blend_epi8`]
+ * [x] [`_mm256_mask_blend_epi8`]
+ * [x] [`_mm512_mask_broadcastb_epi8`]
+ * [x] [`_mm512_maskz_broadcastb_epi8`]
+ * [x] [`_mm_mask_broadcastb_epi8`]
+ * [x] [`_mm_maskz_broadcastb_epi8`]
+ * [x] [`_mm256_mask_broadcastb_epi8`]
+ * [x] [`_mm256_maskz_broadcastb_epi8`]
+ * [x] [`_mm512_broadcastw_epi16`]
+ * [x] [`_mm512_mask_broadcastw_epi16`]
+ * [x] [`_mm512_maskz_broadcastw_epi16`]
+ * [x] [`_mm_mask_broadcastw_epi16`]
+ * [x] [`_mm_maskz_broadcastw_epi16`]
+ * [x] [`_mm256_mask_broadcastw_epi16`]
+ * [x] [`_mm256_maskz_broadcastw_epi16`]
+ * [x] [`_mm512_bslli_epi128`]
+ * [x] [`_mm512_bsrli_epi128`]
+ * [x] [`_mm512_cmp_epi16_mask`]
+ * [x] [`_mm512_mask_cmp_epi16_mask`]
+ * [x] [`_mm_cmp_epi16_mask`]
+ * [x] [`_mm_mask_cmp_epi16_mask`]
+ * [x] [`_mm256_cmp_epi16_mask`]
+ * [x] [`_mm256_mask_cmp_epi16_mask`]
+ * [x] [`_mm512_cmp_epi8_mask`]
+ * [x] [`_mm512_mask_cmp_epi8_mask`]
+ * [x] [`_mm_cmp_epi8_mask`]
+ * [x] [`_mm_mask_cmp_epi8_mask`]
+ * [x] [`_mm256_cmp_epi8_mask`]
+ * [x] [`_mm256_mask_cmp_epi8_mask`]
+ * [x] [`_mm512_cmp_epu16_mask`]
+ * [x] [`_mm512_mask_cmp_epu16_mask`]
+ * [x] [`_mm_cmp_epu16_mask`]
+ * [x] [`_mm_mask_cmp_epu16_mask`]
+ * [x] [`_mm256_cmp_epu16_mask`]
+ * [x] [`_mm256_mask_cmp_epu16_mask`]
+ * [x] [`_mm512_cmp_epu8_mask`]
+ * [x] [`_mm512_mask_cmp_epu8_mask`]
+ * [x] [`_mm_cmp_epu8_mask`]
+ * [x] [`_mm_mask_cmp_epu8_mask`]
+ * [x] [`_mm256_cmp_epu8_mask`]
+ * [x] [`_mm256_mask_cmp_epu8_mask`]
+ * [x] [`_mm512_cmpeq_epi16_mask`]
+ * [x] [`_mm512_mask_cmpeq_epi16_mask`]
+ * [x] [`_mm_cmpeq_epi16_mask`]
+ * [x] [`_mm_mask_cmpeq_epi16_mask`]
+ * [x] [`_mm256_cmpeq_epi16_mask`]
+ * [x] [`_mm256_mask_cmpeq_epi16_mask`]
+ * [x] [`_mm512_cmpeq_epi8_mask`]
+ * [x] [`_mm512_mask_cmpeq_epi8_mask`]
+ * [x] [`_mm_cmpeq_epi8_mask`]
+ * [x] [`_mm_mask_cmpeq_epi8_mask`]
+ * [x] [`_mm256_cmpeq_epi8_mask`]
+ * [x] [`_mm256_mask_cmpeq_epi8_mask`]
+ * [x] [`_mm512_cmpeq_epu16_mask`]
+ * [x] [`_mm512_mask_cmpeq_epu16_mask`]
+ * [x] [`_mm_cmpeq_epu16_mask`]
+ * [x] [`_mm_mask_cmpeq_epu16_mask`]
+ * [x] [`_mm256_cmpeq_epu16_mask`]
+ * [x] [`_mm256_mask_cmpeq_epu16_mask`]
+ * [x] [`_mm512_cmpeq_epu8_mask`]
+ * [x] [`_mm512_mask_cmpeq_epu8_mask`]
+ * [x] [`_mm_cmpeq_epu8_mask`]
+ * [x] [`_mm_mask_cmpeq_epu8_mask`]
+ * [x] [`_mm256_cmpeq_epu8_mask`]
+ * [x] [`_mm256_mask_cmpeq_epu8_mask`]
+ * [x] [`_mm512_cmpge_epi16_mask`]
+ * [x] [`_mm512_mask_cmpge_epi16_mask`]
+ * [x] [`_mm_cmpge_epi16_mask`]
+ * [x] [`_mm_mask_cmpge_epi16_mask`]
+ * [x] [`_mm256_cmpge_epi16_mask`]
+ * [x] [`_mm256_mask_cmpge_epi16_mask`]
+ * [x] [`_mm512_cmpge_epi8_mask`]
+ * [x] [`_mm512_mask_cmpge_epi8_mask`]
+ * [x] [`_mm_cmpge_epi8_mask`]
+ * [x] [`_mm_mask_cmpge_epi8_mask`]
+ * [x] [`_mm256_cmpge_epi8_mask`]
+ * [x] [`_mm256_mask_cmpge_epi8_mask`]
+ * [x] [`_mm512_cmpge_epu16_mask`]
+ * [x] [`_mm512_mask_cmpge_epu16_mask`]
+ * [x] [`_mm_cmpge_epu16_mask`]
+ * [x] [`_mm_mask_cmpge_epu16_mask`]
+ * [x] [`_mm256_cmpge_epu16_mask`]
+ * [x] [`_mm256_mask_cmpge_epu16_mask`]
+ * [x] [`_mm512_cmpge_epu8_mask`]
+ * [x] [`_mm512_mask_cmpge_epu8_mask`]
+ * [x] [`_mm_cmpge_epu8_mask`]
+ * [x] [`_mm_mask_cmpge_epu8_mask`]
+ * [x] [`_mm256_cmpge_epu8_mask`]
+ * [x] [`_mm256_mask_cmpge_epu8_mask`]
+ * [x] [`_mm512_cmpgt_epi16_mask`]
+ * [x] [`_mm512_mask_cmpgt_epi16_mask`]
+ * [x] [`_mm512_cmpgt_epi8_mask`]
+ * [x] [`_mm512_mask_cmpgt_epi8_mask`]
+ * [x] [`_mm_cmpgt_epi8_mask`]
+ * [x] [`_mm_mask_cmpgt_epi8_mask`]
+ * [x] [`_mm256_cmpgt_epi8_mask`]
+ * [x] [`_mm256_mask_cmpgt_epi8_mask`]
+ * [x] [`_mm512_cmpgt_epu16_mask`]
+ * [x] [`_mm512_mask_cmpgt_epu16_mask`]
+ * [x] [`_mm_cmpgt_epu16_mask`]
+ * [x] [`_mm_mask_cmpgt_epu16_mask`]
+ * [x] [`_mm256_cmpgt_epu16_mask`]
+ * [x] [`_mm256_mask_cmpgt_epu16_mask`]
+ * [x] [`_mm512_cmpgt_epu8_mask`]
+ * [x] [`_mm512_mask_cmpgt_epu8_mask`]
+ * [x] [`_mm_cmpgt_epu8_mask`]
+ * [x] [`_mm_mask_cmpgt_epu8_mask`]
+ * [x] [`_mm256_cmpgt_epu8_mask`]
+ * [x] [`_mm256_mask_cmpgt_epu8_mask`]
+ * [x] [`_mm512_cmple_epi16_mask`]
+ * [x] [`_mm512_mask_cmple_epi16_mask`]
+ * [x] [`_mm_cmpgt_epi16_mask`]
+ * [x] [`_mm_mask_cmpgt_epi16_mask`]
+ * [x] [`_mm256_cmpgt_epi16_mask`]
+ * [x] [`_mm256_mask_cmpgt_epi16_mask`]
+ * [x] [`_mm512_cmple_epi8_mask`]
+ * [x] [`_mm512_mask_cmple_epi8_mask`]
+ * [x] [`_mm_cmple_epi8_mask`]
+ * [x] [`_mm_mask_cmple_epi8_mask`]
+ * [x] [`_mm256_cmple_epi8_mask`]
+ * [x] [`_mm256_mask_cmple_epi8_mask`]
+ * [x] [`_mm512_cmple_epu16_mask`]
+ * [x] [`_mm512_mask_cmple_epu16_mask`]
+ * [x] [`_mm_cmple_epu16_mask`]
+ * [x] [`_mm_mask_cmple_epu16_mask`]
+ * [x] [`_mm256_cmple_epu16_mask`]
+ * [x] [`_mm256_mask_cmple_epu16_mask`]
+ * [x] [`_mm512_cmple_epu8_mask`]
+ * [x] [`_mm512_mask_cmple_epu8_mask`]
+ * [x] [`_mm_cmple_epu8_mask`]
+ * [x] [`_mm_mask_cmple_epu8_mask`]
+ * [x] [`_mm256_cmple_epu8_mask`]
+ * [x] [`_mm256_mask_cmple_epu8_mask`]
+ * [x] [`_mm512_cmplt_epi16_mask`]
+ * [x] [`_mm512_mask_cmplt_epi16_mask`]
+ * [x] [`_mm_cmple_epi16_mask`]
+ * [x] [`_mm_mask_cmple_epi16_mask`]
+ * [x] [`_mm256_cmple_epi16_mask`]
+ * [x] [`_mm256_mask_cmple_epi16_mask`]
+ * [x] [`_mm512_cmplt_epi8_mask`]
+ * [x] [`_mm512_mask_cmplt_epi8_mask`]
+ * [x] [`_mm_cmplt_epi8_mask`]
+ * [x] [`_mm_mask_cmplt_epi8_mask`]
+ * [x] [`_mm256_cmplt_epi8_mask`]
+ * [x] [`_mm256_mask_cmplt_epi8_mask`]
+ * [x] [`_mm512_cmplt_epu16_mask`]
+ * [x] [`_mm512_mask_cmplt_epu16_mask`]
+ * [x] [`_mm_cmplt_epu16_mask`]
+ * [x] [`_mm_mask_cmplt_epu16_mask`]
+ * [x] [`_mm256_cmplt_epu16_mask`]
+ * [x] [`_mm256_mask_cmplt_epu16_mask`]
+ * [x] [`_mm512_cmplt_epu8_mask`]
+ * [x] [`_mm512_mask_cmplt_epu8_mask`]
+ * [x] [`_mm_cmplt_epu8_mask`]
+ * [x] [`_mm_mask_cmplt_epu8_mask`]
+ * [x] [`_mm256_cmplt_epu8_mask`]
+ * [x] [`_mm256_mask_cmplt_epu8_mask`]
+ * [x] [`_mm512_cmpneq_epi16_mask`]
+ * [x] [`_mm512_mask_cmpneq_epi16_mask`]
+ * [x] [`_mm_cmpneq_epi16_mask`]
+ * [x] [`_mm_mask_cmpneq_epi16_mask`]
+ * [x] [`_mm256_cmpneq_epi16_mask`]
+ * [x] [`_mm256_mask_cmpneq_epi16_mask`]
+ * [x] [`_mm512_cmpneq_epi8_mask`]
+ * [x] [`_mm512_mask_cmpneq_epi8_mask`]
+ * [x] [`_mm_cmpneq_epi8_mask`]
+ * [x] [`_mm_mask_cmpneq_epi8_mask`]
+ * [x] [`_mm256_cmpneq_epi8_mask`]
+ * [x] [`_mm256_mask_cmpneq_epi8_mask`]
+ * [x] [`_mm512_cmpneq_epu16_mask`]
+ * [x] [`_mm512_mask_cmpneq_epu16_mask`]
+ * [x] [`_mm_cmpneq_epu16_mask`]
+ * [x] [`_mm_mask_cmpneq_epu16_mask`]
+ * [x] [`_mm256_cmpneq_epu16_mask`]
+ * [x] [`_mm256_mask_cmpneq_epu16_mask`]
+ * [x] [`_mm512_cmpneq_epu8_mask`]
+ * [x] [`_mm512_mask_cmpneq_epu8_mask`]
+ * [x] [`_mm_cmpneq_epu8_mask`]
+ * [x] [`_mm_mask_cmpneq_epu8_mask`]
+ * [x] [`_mm256_cmpneq_epu8_mask`]
+ * [x] [`_mm256_mask_cmpneq_epu8_mask`]
+ * [x] [`_mm512_cvtepi16_epi8`]
+ * [x] [`_mm512_mask_cvtepi16_epi8`]
+ * [x] [`_mm512_maskz_cvtepi16_epi8`]
+ * [x] [`_mm512_mask_cvtepi16_storeu_epi8`]
+ * [x] [`_mm_mask_cvtepi16_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtepi16_storeu_epi8`]
+ * [x] [`_mm_cvtepi16_epi8`]
+ * [x] [`_mm_mask_cvtepi16_epi8`]
+ * [x] [`_mm_maskz_cvtepi16_epi8`]
+ * [x] [`_mm256_cvtepi16_epi8`]
+ * [x] [`_mm256_mask_cvtepi16_epi8`]
+ * [x] [`_mm256_maskz_cvtepi16_epi8`]
+ * [x] [`_mm512_cvtepi8_epi16`]
+ * [x] [`_mm512_mask_cvtepi8_epi16`]
+ * [x] [`_mm512_maskz_cvtepi8_epi16`]
+ * [x] [`_mm_mask_cvtepi8_epi16`]
+ * [x] [`_mm_maskz_cvtepi8_epi16`]
+ * [x] [`_mm256_mask_cvtepi8_epi16`]
+ * [x] [`_mm256_maskz_cvtepi8_epi16`]
+ * [x] [`_mm512_cvtsepi16_epi8`]
+ * [x] [`_mm512_mask_cvtsepi16_epi8`]
+ * [x] [`_mm512_maskz_cvtsepi16_epi8`]
+ * [x] [`_mm_cvtsepi16_epi8`]
+ * [x] [`_mm_mask_cvtsepi16_epi8`]
+ * [x] [`_mm_maskz_cvtsepi16_epi8`]
+ * [x] [`_mm256_cvtsepi16_epi8`]
+ * [x] [`_mm256_mask_cvtsepi16_epi8`]
+ * [x] [`_mm256_maskz_cvtsepi16_epi8`]
+ * [x] [`_mm512_mask_cvtsepi16_storeu_epi8`]
+ * [x] [`_mm_mask_cvtsepi16_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtsepi16_storeu_epi8`]
+ * [x] [`_mm512_cvtepu8_epi16`]
+ * [x] [`_mm512_mask_cvtepu8_epi16`]
+ * [x] [`_mm512_maskz_cvtepu8_epi16`]
+ * [x] [`_mm_mask_cvtepu8_epi16`]
+ * [x] [`_mm_maskz_cvtepu8_epi16`]
+ * [x] [`_mm256_mask_cvtepu8_epi16`]
+ * [x] [`_mm256_maskz_cvtepu8_epi16`]
+ * [_] [`_cvtmask32_u32`]
+ * [_] [`_cvtmask64_u64`]
+ * [_] [`_cvtu32_mask32`]
+ * [_] [`_cvtu64_mask64`]
+ * [x] [`_mm512_cvtusepi16_epi8`]
+ * [x] [`_mm512_mask_cvtusepi16_epi8`]
+ * [x] [`_mm512_maskz_cvtusepi16_epi8`]
+ * [x] [`_mm_cvtusepi16_epi8`]
+ * [x] [`_mm_mask_cvtusepi16_epi8`]
+ * [x] [`_mm_maskz_cvtusepi16_epi8`]
+ * [x] [`_mm256_cvtusepi16_epi8`]
+ * [x] [`_mm256_mask_cvtusepi16_epi8`]
+ * [x] [`_mm256_maskz_cvtusepi16_epi8`]
+ * [x] [`_mm512_mask_cvtusepi16_storeu_epi8`]
+ * [x] [`_mm_mask_cvtusepi16_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtusepi16_storeu_epi8`]
+ * [x] [`_mm512_dbsad_epu8`]
+ * [x] [`_mm512_mask_dbsad_epu8`]
+ * [x] [`_mm512_maskz_dbsad_epu8`]
+ * [x] [`_mm_dbsad_epu8`]
+ * [x] [`_mm_mask_dbsad_epu8`]
+ * [x] [`_mm_maskz_dbsad_epu8`]
+ * [x] [`_mm256_dbsad_epu8`]
+ * [x] [`_mm256_mask_dbsad_epu8`]
+ * [x] [`_mm256_maskz_dbsad_epu8`]
+ * [x] [`_kadd_mask32`]
+ * [x] [`_kadd_mask64`]
+ * [x] [`_kand_mask32`]
+ * [x] [`_kand_mask64`]
+ * [x] [`_kandn_mask32`]
+ * [x] [`_kandn_mask64`]
+ * [x] [`_knot_mask32`]
+ * [x] [`_knot_mask64`]
+ * [x] [`_kor_mask32`]
+ * [x] [`_kor_mask64`]
+ * [_] [`_kortest_mask32_u8`]
+ * [_] [`_kortest_mask64_u8`]
+ * [_] [`_kortestc_mask32_u8`]
+ * [_] [`_kortestc_mask64_u8`]
+ * [_] [`_kortestz_mask32_u8`]
+ * [_] [`_kortestz_mask64_u8`]
+ * [_] [`_kshiftli_mask32`]
+ * [_] [`_kshiftli_mask64`]
+ * [_] [`_kshiftri_mask32`]
+ * [_] [`_kshiftri_mask64`]
+ * [_] [`_ktest_mask32_u8`]
+ * [_] [`_ktest_mask64_u8`]
+ * [_] [`_ktestc_mask32_u8`]
+ * [_] [`_ktestc_mask64_u8`]
+ * [_] [`_ktestz_mask32_u8`]
+ * [_] [`_ktestz_mask64_u8`]
+ * [_] [`_mm512_kunpackd`]
+ * [_] [`_mm512_kunpackw`]
+ * [x] [`_kxnor_mask32`]
+ * [x] [`_kxnor_mask64`]
+ * [x] [`_kxor_mask32`]
+ * [x] [`_kxor_mask64`]
+ * [x] [`_load_mask32`]
+ * [x] [`_load_mask64`]
+ * [x] [`_mm512_madd_epi16`]
+ * [x] [`_mm512_mask_madd_epi16`]
+ * [x] [`_mm512_maskz_madd_epi16`]
+ * [x] [`_mm_mask_madd_epi16`]
+ * [x] [`_mm_maskz_madd_epi16`]
+ * [x] [`_mm256_mask_madd_epi16`]
+ * [x] [`_mm256_maskz_madd_epi16`]
+ * [x] [`_mm512_maddubs_epi16`]
+ * [x] [`_mm512_mask_maddubs_epi16`]
+ * [x] [`_mm512_maskz_maddubs_epi16`]
+ * [x] [`_mm_mask_maddubs_epi16`]
+ * [x] [`_mm_maskz_maddubs_epi16`]
+ * [x] [`_mm256_mask_maddubs_epi16`]
+ * [x] [`_mm256_maskz_maddubs_epi16`]
+ * [x] [`_mm512_mask_max_epi16`]
+ * [x] [`_mm512_maskz_max_epi16`]
+ * [x] [`_mm512_max_epi16`]
+ * [x] [`_mm_mask_max_epi16`]
+ * [x] [`_mm_maskz_max_epi16`]
+ * [x] [`_mm256_mask_max_epi16`]
+ * [x] [`_mm256_maskz_max_epi16`]
+ * [x] [`_mm512_mask_max_epi8`]
+ * [x] [`_mm512_maskz_max_epi8`]
+ * [x] [`_mm512_max_epi8`]
+ * [x] [`_mm_mask_max_epi8`]
+ * [x] [`_mm_maskz_max_epi8`]
+ * [x] [`_mm256_mask_max_epi8`]
+ * [x] [`_mm256_maskz_max_epi8`]
+ * [x] [`_mm512_mask_max_epu16`]
+ * [x] [`_mm512_maskz_max_epu16`]
+ * [x] [`_mm512_max_epu16`]
+ * [x] [`_mm_mask_max_epu16`]
+ * [x] [`_mm_maskz_max_epu16`]
+ * [x] [`_mm256_mask_max_epu16`]
+ * [x] [`_mm256_maskz_max_epu16`]
+ * [x] [`_mm512_mask_max_epu8`]
+ * [x] [`_mm512_maskz_max_epu8`]
+ * [x] [`_mm512_max_epu8`]
+ * [x] [`_mm_mask_max_epu8`]
+ * [x] [`_mm_maskz_max_epu8`]
+ * [x] [`_mm256_mask_max_epu8`]
+ * [x] [`_mm256_maskz_max_epu8`]
+ * [x] [`_mm512_mask_min_epi16`]
+ * [x] [`_mm512_maskz_min_epi16`]
+ * [x] [`_mm512_min_epi16`]
+ * [x] [`_mm_mask_min_epi16`]
+ * [x] [`_mm_maskz_min_epi16`]
+ * [x] [`_mm256_mask_min_epi16`]
+ * [x] [`_mm256_maskz_min_epi16`]
+ * [x] [`_mm512_mask_min_epi8`]
+ * [x] [`_mm512_maskz_min_epi8`]
+ * [x] [`_mm512_min_epi8`]
+ * [x] [`_mm_mask_min_epi8`]
+ * [x] [`_mm_maskz_min_epi8`]
+ * [x] [`_mm256_mask_min_epi8`]
+ * [x] [`_mm256_maskz_min_epi8`]
+ * [x] [`_mm512_mask_min_epu16`]
+ * [x] [`_mm512_maskz_min_epu16`]
+ * [x] [`_mm512_min_epu16`]
+ * [x] [`_mm_mask_min_epu16`]
+ * [x] [`_mm_maskz_min_epu16`]
+ * [x] [`_mm256_mask_min_epu16`]
+ * [x] [`_mm256_maskz_min_epu16`]
+ * [x] [`_mm512_mask_min_epu8`]
+ * [x] [`_mm512_maskz_min_epu8`]
+ * [x] [`_mm512_min_epu8`]
+ * [x] [`_mm_mask_min_epu8`]
+ * [x] [`_mm_maskz_min_epu8`]
+ * [x] [`_mm256_mask_min_epu8`]
+ * [x] [`_mm256_maskz_min_epu8`]
+ * [x] [`_mm512_mask_mov_epi16`]
+ * [x] [`_mm512_maskz_mov_epi16`]
+ * [x] [`_mm_mask_mov_epi16`]
+ * [x] [`_mm_maskz_mov_epi16`]
+ * [x] [`_mm256_mask_mov_epi16`]
+ * [x] [`_mm256_maskz_mov_epi16`]
+ * [x] [`_mm512_mask_mov_epi8`]
+ * [x] [`_mm512_maskz_mov_epi8`]
+ * [x] [`_mm_mask_mov_epi8`]
+ * [x] [`_mm_maskz_mov_epi8`]
+ * [x] [`_mm256_mask_mov_epi8`]
+ * [x] [`_mm256_maskz_mov_epi8`]
+ * [x] [`_mm512_movepi16_mask`]
+ * [x] [`_mm_movepi16_mask`]
+ * [x] [`_mm256_movepi16_mask`]
+ * [x] [`_mm512_movepi8_mask`]
+ * [x] [`_mm_movepi8_mask`]
+ * [x] [`_mm256_movepi8_mask`]
+ * [x] [`_mm512_movm_epi16`]
+ * [x] [`_mm_movm_epi16`]
+ * [x] [`_mm256_movm_epi16`]
+ * [x] [`_mm512_movm_epi8`]
+ * [x] [`_mm_movm_epi8`]
+ * [x] [`_mm256_movm_epi8`]
+ * [x] [`_mm512_mask_mulhi_epi16`]
+ * [x] [`_mm512_maskz_mulhi_epi16`]
+ * [x] [`_mm512_mulhi_epi16`]
+ * [x] [`_mm512_mask_mulhi_epu16`]
+ * [x] [`_mm512_maskz_mulhi_epu16`]
+ * [x] [`_mm_mask_mulhi_epi16`]
+ * [x] [`_mm_maskz_mulhi_epi16`]
+ * [x] [`_mm256_mask_mulhi_epi16`]
+ * [x] [`_mm256_maskz_mulhi_epi16`]
+ * [x] [`_mm512_mulhi_epu16`]
+ * [x] [`_mm_mask_mulhi_epu16`]
+ * [x] [`_mm_maskz_mulhi_epu16`]
+ * [x] [`_mm256_mask_mulhi_epu16`]
+ * [x] [`_mm256_maskz_mulhi_epu16`]
+ * [x] [`_mm512_mask_mulhrs_epi16`]
+ * [x] [`_mm512_maskz_mulhrs_epi16`]
+ * [x] [`_mm512_mulhrs_epi16`]
+ * [x] [`_mm_mask_mulhrs_epi16`]
+ * [x] [`_mm_maskz_mulhrs_epi16`]
+ * [x] [`_mm256_mask_mulhrs_epi16`]
+ * [x] [`_mm256_maskz_mulhrs_epi16`]
+ * [x] [`_mm512_mask_mullo_epi16`]
+ * [x] [`_mm512_maskz_mullo_epi16`]
+ * [x] [`_mm512_mullo_epi16`]
+ * [x] [`_mm_mask_mullo_epi16`]
+ * [x] [`_mm_maskz_mullo_epi16`]
+ * [x] [`_mm256_mask_mullo_epi16`]
+ * [x] [`_mm256_maskz_mullo_epi16`]
+ * [x] [`_mm512_mask_packs_epi16`]
+ * [x] [`_mm512_maskz_packs_epi16`]
+ * [x] [`_mm512_packs_epi16`]
+ * [x] [`_mm_mask_packs_epi16`]
+ * [x] [`_mm_maskz_packs_epi16`]
+ * [x] [`_mm256_mask_packs_epi16`]
+ * [x] [`_mm256_maskz_packs_epi16`]
+ * [x] [`_mm512_mask_packs_epi32`]
+ * [x] [`_mm512_maskz_packs_epi32`]
+ * [x] [`_mm512_packs_epi32`]
+ * [x] [`_mm_mask_packs_epi32`]
+ * [x] [`_mm_maskz_packs_epi32`]
+ * [x] [`_mm256_mask_packs_epi32`]
+ * [x] [`_mm256_maskz_packs_epi32`]
+ * [x] [`_mm512_mask_packus_epi16`]
+ * [x] [`_mm512_maskz_packus_epi16`]
+ * [x] [`_mm512_packus_epi16`]
+ * [x] [`_mm_mask_packus_epi16`]
+ * [x] [`_mm_maskz_packus_epi16`]
+ * [x] [`_mm256_mask_packus_epi16`]
+ * [x] [`_mm256_maskz_packus_epi16`]
+ * [x] [`_mm512_mask_packus_epi32`]
+ * [x] [`_mm512_maskz_packus_epi32`]
+ * [x] [`_mm512_packus_epi32`]
+ * [x] [`_mm_mask_packus_epi32`]
+ * [x] [`_mm_maskz_packus_epi32`]
+ * [x] [`_mm256_mask_packus_epi32`]
+ * [x] [`_mm256_maskz_packus_epi32`]
+ * [x] [`_mm512_mask_permutex2var_epi16`]
+ * [x] [`_mm512_mask2_permutex2var_epi16`]
+ * [x] [`_mm512_maskz_permutex2var_epi16`]
+ * [x] [`_mm512_permutex2var_epi16`]
+ * [x] [`_mm_mask_permutex2var_epi16`]
+ * [x] [`_mm_mask2_permutex2var_epi16`]
+ * [x] [`_mm_maskz_permutex2var_epi16`]
+ * [x] [`_mm_permutex2var_epi16`]
+ * [x] [`_mm256_mask_permutex2var_epi16`]
+ * [x] [`_mm256_mask2_permutex2var_epi16`]
+ * [x] [`_mm256_maskz_permutex2var_epi16`]
+ * [x] [`_mm256_permutex2var_epi16`]
+ * [x] [`_mm512_mask_permutexvar_epi16`]
+ * [x] [`_mm512_maskz_permutexvar_epi16`]
+ * [x] [`_mm512_permutexvar_epi16`]
+ * [x] [`_mm_mask_permutexvar_epi16`]
+ * [x] [`_mm_maskz_permutexvar_epi16`]
+ * [x] [`_mm_permutexvar_epi16`]
+ * [x] [`_mm256_mask_permutexvar_epi16`]
+ * [x] [`_mm256_maskz_permutexvar_epi16`]
+ * [x] [`_mm256_permutexvar_epi16`]
+ * [x] [`_mm512_sad_epu8`]
+ * [x] [`_mm512_mask_set1_epi16`]
+ * [x] [`_mm512_maskz_set1_epi16`]
+ * [x] [`_mm_mask_set1_epi16`]
+ * [x] [`_mm_maskz_set1_epi16`]
+ * [x] [`_mm256_mask_set1_epi16`]
+ * [x] [`_mm256_maskz_set1_epi16`]
+ * [x] [`_mm512_mask_set1_epi8`]
+ * [x] [`_mm512_maskz_set1_epi8`]
+ * [x] [`_mm_mask_set1_epi8`]
+ * [x] [`_mm_maskz_set1_epi8`]
+ * [x] [`_mm256_mask_set1_epi8`]
+ * [x] [`_mm256_maskz_set1_epi8`]
+ * [x] [`_mm512_mask_shuffle_epi8`]
+ * [x] [`_mm512_maskz_shuffle_epi8`]
+ * [x] [`_mm512_shuffle_epi8`]
+ * [x] [`_mm_mask_shuffle_epi8`]
+ * [x] [`_mm_maskz_shuffle_epi8`]
+ * [x] [`_mm256_mask_shuffle_epi8`]
+ * [x] [`_mm256_maskz_shuffle_epi8`]
+ * [x] [`_mm512_mask_shufflehi_epi16`]
+ * [x] [`_mm512_maskz_shufflehi_epi16`]
+ * [x] [`_mm512_shufflehi_epi16`]
+ * [x] [`_mm_mask_shufflehi_epi16`]
+ * [x] [`_mm_maskz_shufflehi_epi16`]
+ * [x] [`_mm256_mask_shufflehi_epi16`]
+ * [x] [`_mm256_maskz_shufflehi_epi16`]
+ * [x] [`_mm512_mask_shufflelo_epi16`]
+ * [x] [`_mm512_maskz_shufflelo_epi16`]
+ * [x] [`_mm512_shufflelo_epi16`]
+ * [x] [`_mm_mask_shufflelo_epi16`]
+ * [x] [`_mm_maskz_shufflelo_epi16`]
+ * [x] [`_mm256_mask_shufflelo_epi16`]
+ * [x] [`_mm256_maskz_shufflelo_epi16`]
+ * [x] [`_mm512_mask_sll_epi16`]
+ * [x] [`_mm512_maskz_sll_epi16`]
+ * [x] [`_mm512_sll_epi16`]
+ * [x] [`_mm_mask_sll_epi16`]
+ * [x] [`_mm_maskz_sll_epi16`]
+ * [x] [`_mm256_mask_sll_epi16`]
+ * [x] [`_mm256_maskz_sll_epi16`]
+ * [x] [`_mm512_mask_slli_epi16`]
+ * [x] [`_mm512_maskz_slli_epi16`]
+ * [x] [`_mm512_slli_epi16`]
+ * [x] [`_mm_mask_slli_epi16`]
+ * [x] [`_mm_maskz_slli_epi16`]
+ * [x] [`_mm256_mask_slli_epi16`]
+ * [x] [`_mm256_maskz_slli_epi16`]
+ * [x] [`_mm512_mask_sllv_epi16`]
+ * [x] [`_mm512_maskz_sllv_epi16`]
+ * [x] [`_mm512_sllv_epi16`]
+ * [x] [`_mm_mask_sllv_epi16`]
+ * [x] [`_mm_maskz_sllv_epi16`]
+ * [x] [`_mm_sllv_epi16`]
+ * [x] [`_mm256_mask_sllv_epi16`]
+ * [x] [`_mm256_maskz_sllv_epi16`]
+ * [x] [`_mm256_sllv_epi16`]
+ * [x] [`_mm512_mask_sra_epi16`]
+ * [x] [`_mm512_maskz_sra_epi16`]
+ * [x] [`_mm512_sra_epi16`]
+ * [x] [`_mm_mask_sra_epi16`]
+ * [x] [`_mm_maskz_sra_epi16`]
+ * [x] [`_mm256_mask_sra_epi16`]
+ * [x] [`_mm256_maskz_sra_epi16`]
+ * [x] [`_mm512_mask_srai_epi16`]
+ * [x] [`_mm512_maskz_srai_epi16`]
+ * [x] [`_mm512_srai_epi16`]
+ * [x] [`_mm_mask_srai_epi16`]
+ * [x] [`_mm_maskz_srai_epi16`]
+ * [x] [`_mm256_mask_srai_epi16`]
+ * [x] [`_mm256_maskz_srai_epi16`]
+ * [x] [`_mm512_mask_srav_epi16`]
+ * [x] [`_mm512_maskz_srav_epi16`]
+ * [x] [`_mm512_srav_epi16`]
+ * [x] [`_mm_mask_srav_epi16`]
+ * [x] [`_mm_maskz_srav_epi16`]
+ * [x] [`_mm_srav_epi16`]
+ * [x] [`_mm256_mask_srav_epi16`]
+ * [x] [`_mm256_maskz_srav_epi16`]
+ * [x] [`_mm256_srav_epi16`]
+ * [x] [`_mm512_mask_srl_epi16`]
+ * [x] [`_mm512_maskz_srl_epi16`]
+ * [x] [`_mm512_srl_epi16`]
+ * [x] [`_mm_mask_srl_epi16`]
+ * [x] [`_mm_maskz_srl_epi16`]
+ * [x] [`_mm256_mask_srl_epi16`]
+ * [x] [`_mm256_maskz_srl_epi16`]
+ * [x] [`_mm512_mask_srli_epi16`]
+ * [x] [`_mm512_maskz_srli_epi16`]
+ * [x] [`_mm512_srli_epi16`]
+ * [x] [`_mm_mask_srli_epi16`]
+ * [x] [`_mm_maskz_srli_epi16`]
+ * [x] [`_mm256_mask_srli_epi16`]
+ * [x] [`_mm256_maskz_srli_epi16`]
+ * [x] [`_mm512_mask_srlv_epi16`]
+ * [x] [`_mm512_maskz_srlv_epi16`]
+ * [x] [`_mm512_srlv_epi16`]
+ * [x] [`_mm_mask_srlv_epi16`]
+ * [x] [`_mm_maskz_srlv_epi16`]
+ * [x] [`_mm_srlv_epi16`]
+ * [x] [`_mm256_mask_srlv_epi16`]
+ * [x] [`_mm256_maskz_srlv_epi16`]
+ * [x] [`_mm256_srlv_epi16`]
+ * [x] [`_store_mask32`]
+ * [x] [`_store_mask64`]
+ * [x] [`_mm512_mask_sub_epi16`]
+ * [x] [`_mm512_maskz_sub_epi16`]
+ * [x] [`_mm512_sub_epi16`]
+ * [x] [`_mm_mask_sub_epi16`]
+ * [x] [`_mm_maskz_sub_epi16`]
+ * [x] [`_mm256_mask_sub_epi16`]
+ * [x] [`_mm256_maskz_sub_epi16`]
+ * [x] [`_mm512_mask_sub_epi8`]
+ * [x] [`_mm512_maskz_sub_epi8`]
+ * [x] [`_mm_mask_sub_epi8`]
+ * [x] [`_mm_maskz_sub_epi8`]
+ * [x] [`_mm256_mask_sub_epi8`]
+ * [x] [`_mm256_maskz_sub_epi8`]
+ * [x] [`_mm512_sub_epi8`]
+ * [x] [`_mm512_mask_subs_epi16`]
+ * [x] [`_mm512_maskz_subs_epi16`]
+ * [x] [`_mm512_subs_epi16`]
+ * [x] [`_mm_mask_subs_epi16`]
+ * [x] [`_mm_maskz_subs_epi16`]
+ * [x] [`_mm256_mask_subs_epi16`]
+ * [x] [`_mm256_maskz_subs_epi16`]
+ * [x] [`_mm512_mask_subs_epi8`]
+ * [x] [`_mm512_maskz_subs_epi8`]
+ * [x] [`_mm512_subs_epi8`]
+ * [x] [`_mm_mask_subs_epi8`]
+ * [x] [`_mm_maskz_subs_epi8`]
+ * [x] [`_mm256_mask_subs_epi8`]
+ * [x] [`_mm256_maskz_subs_epi8`]
+ * [x] [`_mm512_mask_subs_epu16`]
+ * [x] [`_mm512_maskz_subs_epu16`]
+ * [x] [`_mm512_subs_epu16`]
+ * [x] [`_mm_mask_subs_epu16`]
+ * [x] [`_mm_maskz_subs_epu16`]
+ * [x] [`_mm256_mask_subs_epu16`]
+ * [x] [`_mm256_maskz_subs_epu16`]
+ * [x] [`_mm512_mask_subs_epu8`]
+ * [x] [`_mm512_maskz_subs_epu8`]
+ * [x] [`_mm512_subs_epu8`]
+ * [x] [`_mm_mask_subs_epu8`]
+ * [x] [`_mm_maskz_subs_epu8`]
+ * [x] [`_mm256_mask_subs_epu8`]
+ * [x] [`_mm256_maskz_subs_epu8`]
+ * [x] [`_mm512_mask_test_epi16_mask`]
+ * [x] [`_mm512_test_epi16_mask`]
+ * [x] [`_mm_mask_test_epi16_mask`]
+ * [x] [`_mm_test_epi16_mask`]
+ * [x] [`_mm256_mask_test_epi16_mask`]
+ * [x] [`_mm256_test_epi16_mask`]
+ * [x] [`_mm512_mask_test_epi8_mask`]
+ * [x] [`_mm512_test_epi8_mask`]
+ * [x] [`_mm_mask_test_epi8_mask`]
+ * [x] [`_mm_test_epi8_mask`]
+ * [x] [`_mm256_mask_test_epi8_mask`]
+ * [x] [`_mm256_test_epi8_mask`]
+ * [x] [`_mm512_mask_testn_epi16_mask`]
+ * [x] [`_mm512_testn_epi16_mask`]
+ * [x] [`_mm_mask_testn_epi16_mask`]
+ * [x] [`_mm_testn_epi16_mask`]
+ * [x] [`_mm256_mask_testn_epi16_mask`]
+ * [x] [`_mm256_testn_epi16_mask`]
+ * [x] [`_mm512_mask_testn_epi8_mask`]
+ * [x] [`_mm512_testn_epi8_mask`]
+ * [x] [`_mm_mask_testn_epi8_mask`]
+ * [x] [`_mm_testn_epi8_mask`]
+ * [x] [`_mm256_mask_testn_epi8_mask`]
+ * [x] [`_mm256_testn_epi8_mask`]
+ * [x] [`_mm512_mask_unpackhi_epi16`]
+ * [x] [`_mm512_maskz_unpackhi_epi16`]
+ * [x] [`_mm512_unpackhi_epi16`]
+ * [x] [`_mm_mask_unpackhi_epi16`]
+ * [x] [`_mm_maskz_unpackhi_epi16`]
+ * [x] [`_mm256_mask_unpackhi_epi16`]
+ * [x] [`_mm256_maskz_unpackhi_epi16`]
+ * [x] [`_mm512_mask_unpackhi_epi8`]
+ * [x] [`_mm512_maskz_unpackhi_epi8`]
+ * [x] [`_mm512_unpackhi_epi8`]
+ * [x] [`_mm_mask_unpackhi_epi8`]
+ * [x] [`_mm_maskz_unpackhi_epi8`]
+ * [x] [`_mm256_mask_unpackhi_epi8`]
+ * [x] [`_mm256_maskz_unpackhi_epi8`]
+ * [x] [`_mm512_mask_unpacklo_epi16`]
+ * [x] [`_mm512_maskz_unpacklo_epi16`]
+ * [x] [`_mm512_unpacklo_epi16`]
+ * [x] [`_mm_mask_unpacklo_epi16`]
+ * [x] [`_mm_maskz_unpacklo_epi16`]
+ * [x] [`_mm256_mask_unpacklo_epi16`]
+ * [x] [`_mm256_maskz_unpacklo_epi16`]
+ * [x] [`_mm512_mask_unpacklo_epi8`]
+ * [x] [`_mm512_maskz_unpacklo_epi8`]
+ * [x] [`_mm512_unpacklo_epi8`]
+ * [x] [`_mm_mask_unpacklo_epi8`]
+ * [x] [`_mm_maskz_unpacklo_epi8`]
+ * [x] [`_mm256_mask_unpacklo_epi8`]
+ * [x] [`_mm256_maskz_unpacklo_epi8`]
+
+</p>
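+
+As a minimal usage sketch (not part of this checklist itself): on a nightly
+toolchain where these unstable `core::arch::x86_64` intrinsics are gated
+behind `feature(stdsimd)`, the checked compare and masked-arithmetic entries
+above compose as follows. `diff_where_gt` is a hypothetical helper name, and
+the example assumes an x86_64 target with runtime AVX512BW support.
+
+```rust
+#![feature(stdsimd)]
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+/// Lanewise: where `a > b` (unsigned 16-bit), return `a - b`; else keep `a`.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512bw")]
+unsafe fn diff_where_gt(a: __m512i, b: __m512i) -> __m512i {
+    // 32 one-bit lane flags from the unsigned 16-bit compare.
+    let gt: __mmask32 = _mm512_cmpgt_epu16_mask(a, b);
+    // Masked subtract: lanes with the flag set get `a - b`; the rest
+    // are copied through unchanged from the first operand.
+    _mm512_mask_sub_epi16(a, gt, a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+fn main() {
+    if is_x86_feature_detected!("avx512bw") {
+        unsafe {
+            let a = _mm512_set1_epi16(7);
+            let b = _mm512_set1_epi16(3);
+            let r = diff_where_gt(a, b); // every lane becomes 4
+            let ok = _mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(4));
+            assert_eq!(ok, u32::MAX);
+        }
+    }
+}
+```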
diff --git a/library/stdarch/crates/core_arch/avx512f.md b/library/stdarch/crates/core_arch/avx512f.md
new file mode 100644
index 000000000..6cb6e6564
--- /dev/null
+++ b/library/stdarch/crates/core_arch/avx512f.md
@@ -0,0 +1,2633 @@
+<summary>["AVX512F"]</summary><p>
+
+ * [x] [`_mm512_abs_epi32`]
+ * [x] [`_mm512_mask_abs_epi32`]
+ * [x] [`_mm512_maskz_abs_epi32`]
+ * [x] [`_mm_mask_abs_epi32`]
+ * [x] [`_mm_maskz_abs_epi32`]
+ * [x] [`_mm256_mask_abs_epi32`]
+ * [x] [`_mm256_maskz_abs_epi32`]
+ * [x] [`_mm512_abs_epi64`]
+ * [x] [`_mm512_mask_abs_epi64`]
+ * [x] [`_mm512_maskz_abs_epi64`]
+ * [x] [`_mm_abs_epi64`]
+ * [x] [`_mm_mask_abs_epi64`]
+ * [x] [`_mm_maskz_abs_epi64`]
+ * [x] [`_mm256_abs_epi64`]
+ * [x] [`_mm256_mask_abs_epi64`]
+ * [x] [`_mm256_maskz_abs_epi64`]
+ * [x] [`_mm512_abs_pd`]
+ * [x] [`_mm512_mask_abs_pd`]
+ * [x] [`_mm512_abs_ps`]
+ * [x] [`_mm512_mask_abs_ps`]
+ * [x] [`_mm512_add_epi32`]
+ * [x] [`_mm512_mask_add_epi32`]
+ * [x] [`_mm512_maskz_add_epi32`]
+ * [x] [`_mm_mask_add_epi32`]
+ * [x] [`_mm_maskz_add_epi32`]
+ * [x] [`_mm256_mask_add_epi32`]
+ * [x] [`_mm256_maskz_add_epi32`]
+ * [x] [`_mm512_add_epi64`]
+ * [x] [`_mm512_mask_add_epi64`]
+ * [x] [`_mm512_maskz_add_epi64`]
+ * [x] [`_mm_mask_add_epi64`]
+ * [x] [`_mm_maskz_add_epi64`]
+ * [x] [`_mm256_mask_add_epi64`]
+ * [x] [`_mm256_maskz_add_epi64`]
+ * [x] [`_mm512_add_ps`]
+ * [x] [`_mm512_mask_add_ps`]
+ * [x] [`_mm512_maskz_add_ps`]
+ * [x] [`_mm_mask_add_ps`]
+ * [x] [`_mm_maskz_add_ps`]
+ * [x] [`_mm256_mask_add_ps`]
+ * [x] [`_mm256_maskz_add_ps`]
+ * [x] [`_mm512_add_pd`]
+ * [x] [`_mm512_mask_add_pd`]
+ * [x] [`_mm512_maskz_add_pd`]
+ * [x] [`_mm_mask_add_pd`]
+ * [x] [`_mm_maskz_add_pd`]
+ * [x] [`_mm256_mask_add_pd`]
+ * [x] [`_mm256_maskz_add_pd`]
+ * [x] [`_mm512_add_round_ps`]
+ * [x] [`_mm512_mask_add_round_ps`]
+ * [x] [`_mm512_maskz_add_round_ps`]
+ * [x] [`_mm512_add_round_pd`]
+ * [x] [`_mm512_mask_add_round_pd`]
+ * [x] [`_mm512_maskz_add_round_pd`]
+ * [x] [`_mm512_sub_epi32`]
+ * [x] [`_mm512_mask_sub_epi32`]
+ * [x] [`_mm512_maskz_sub_epi32`]
+ * [x] [`_mm_mask_sub_epi32`]
+ * [x] [`_mm_maskz_sub_epi32`]
+ * [x] [`_mm256_mask_sub_epi32`]
+ * [x] [`_mm256_maskz_sub_epi32`]
+ * [x] [`_mm512_sub_epi64`]
+ * [x] [`_mm512_mask_sub_epi64`]
+ * [x] [`_mm512_maskz_sub_epi64`]
+ * [x] [`_mm_mask_sub_epi64`]
+ * [x] [`_mm_maskz_sub_epi64`]
+ * [x] [`_mm256_mask_sub_epi64`]
+ * [x] [`_mm256_maskz_sub_epi64`]
+ * [x] [`_mm512_sub_ps`]
+ * [x] [`_mm512_mask_sub_ps`]
+ * [x] [`_mm512_maskz_sub_ps`]
+ * [x] [`_mm_mask_sub_ps`]
+ * [x] [`_mm_maskz_sub_ps`]
+ * [x] [`_mm256_mask_sub_ps`]
+ * [x] [`_mm256_maskz_sub_ps`]
+ * [x] [`_mm512_sub_pd`]
+ * [x] [`_mm512_mask_sub_pd`]
+ * [x] [`_mm512_maskz_sub_pd`]
+ * [x] [`_mm_mask_sub_pd`]
+ * [x] [`_mm_maskz_sub_pd`]
+ * [x] [`_mm256_mask_sub_pd`]
+ * [x] [`_mm256_maskz_sub_pd`]
+ * [x] [`_mm512_sub_round_ps`]
+ * [x] [`_mm512_mask_sub_round_ps`]
+ * [x] [`_mm512_maskz_sub_round_ps`]
+ * [x] [`_mm512_sub_round_pd`]
+ * [x] [`_mm512_mask_sub_round_pd`]
+ * [x] [`_mm512_maskz_sub_round_pd`]
+ * [x] [`_mm512_mul_epi32`]
+ * [x] [`_mm512_mask_mul_epi32`]
+ * [x] [`_mm512_maskz_mul_epi32`]
+ * [x] [`_mm_mask_mul_epi32`]
+ * [x] [`_mm_maskz_mul_epi32`]
+ * [x] [`_mm256_mask_mul_epi32`]
+ * [x] [`_mm256_maskz_mul_epi32`]
+ * [x] [`_mm512_mul_epu32`]
+ * [x] [`_mm512_mask_mul_epu32`]
+ * [x] [`_mm512_maskz_mul_epu32`]
+ * [x] [`_mm_mask_mul_epu32`]
+ * [x] [`_mm_maskz_mul_epu32`]
+ * [x] [`_mm256_mask_mul_epu32`]
+ * [x] [`_mm256_maskz_mul_epu32`]
+ * [x] [`_mm512_mul_ps`]
+ * [x] [`_mm512_mask_mul_ps`]
+ * [x] [`_mm512_maskz_mul_ps`]
+ * [x] [`_mm_mask_mul_ps`]
+ * [x] [`_mm_maskz_mul_ps`]
+ * [x] [`_mm256_mask_mul_ps`]
+ * [x] [`_mm256_maskz_mul_ps`]
+ * [x] [`_mm512_mul_pd`]
+ * [x] [`_mm512_mask_mul_pd`]
+ * [x] [`_mm512_maskz_mul_pd`]
+ * [x] [`_mm_mask_mul_pd`]
+ * [x] [`_mm_maskz_mul_pd`]
+ * [x] [`_mm256_mask_mul_pd`]
+ * [x] [`_mm256_maskz_mul_pd`]
+ * [x] [`_mm512_mul_round_ps`]
+ * [x] [`_mm512_mask_mul_round_ps`]
+ * [x] [`_mm512_maskz_mul_round_ps`]
+ * [x] [`_mm512_mul_round_pd`]
+ * [x] [`_mm512_mask_mul_round_pd`]
+ * [x] [`_mm512_maskz_mul_round_pd`]
+ * [x] [`_mm512_mullo_epi32`]
+ * [x] [`_mm512_mask_mullo_epi32`]
+ * [x] [`_mm512_maskz_mullo_epi32`]
+ * [x] [`_mm_mask_mullo_epi32`]
+ * [x] [`_mm_maskz_mullo_epi32`]
+ * [x] [`_mm256_mask_mullo_epi32`]
+ * [x] [`_mm256_maskz_mullo_epi32`]
+ * [x] [`_mm512_mullox_epi64`]
+ * [x] [`_mm512_mask_mullox_epi64`]
+ * [x] [`_mm512_div_ps`]
+ * [x] [`_mm512_mask_div_ps`]
+ * [x] [`_mm512_maskz_div_ps`]
+ * [x] [`_mm_mask_div_ps`]
+ * [x] [`_mm_maskz_div_ps`]
+ * [x] [`_mm256_mask_div_ps`]
+ * [x] [`_mm256_maskz_div_ps`]
+ * [x] [`_mm512_div_pd`]
+ * [x] [`_mm512_mask_div_pd`]
+ * [x] [`_mm512_maskz_div_pd`]
+ * [x] [`_mm_mask_div_pd`]
+ * [x] [`_mm_maskz_div_pd`]
+ * [x] [`_mm256_mask_div_pd`]
+ * [x] [`_mm256_maskz_div_pd`]
+ * [x] [`_mm512_div_round_ps`]
+ * [x] [`_mm512_mask_div_round_ps`]
+ * [x] [`_mm512_maskz_div_round_ps`]
+ * [x] [`_mm512_div_round_pd`]
+ * [x] [`_mm512_mask_div_round_pd`]
+ * [x] [`_mm512_maskz_div_round_pd`]
+ * [x] [`_mm512_max_epi32`]
+ * [x] [`_mm512_mask_max_epi32`]
+ * [x] [`_mm512_maskz_max_epi32`]
+ * [x] [`_mm_mask_max_epi32`]
+ * [x] [`_mm_maskz_max_epi32`]
+ * [x] [`_mm256_mask_max_epi32`]
+ * [x] [`_mm256_maskz_max_epi32`]
+ * [x] [`_mm512_max_epu32`]
+ * [x] [`_mm512_mask_max_epu32`]
+ * [x] [`_mm512_maskz_max_epu32`]
+ * [x] [`_mm_mask_max_epu32`]
+ * [x] [`_mm_maskz_max_epu32`]
+ * [x] [`_mm256_mask_max_epu32`]
+ * [x] [`_mm256_maskz_max_epu32`]
+ * [x] [`_mm512_max_epi64`]
+ * [x] [`_mm512_mask_max_epi64`]
+ * [x] [`_mm512_maskz_max_epi64`]
+ * [x] [`_mm_mask_max_epi64`]
+ * [x] [`_mm_maskz_max_epi64`]
+ * [x] [`_mm_max_epi64`]
+ * [x] [`_mm256_mask_max_epi64`]
+ * [x] [`_mm256_maskz_max_epi64`]
+ * [x] [`_mm256_max_epi64`]
+ * [x] [`_mm512_max_epu64`]
+ * [x] [`_mm512_mask_max_epu64`]
+ * [x] [`_mm512_maskz_max_epu64`]
+ * [x] [`_mm_mask_max_epu64`]
+ * [x] [`_mm_maskz_max_epu64`]
+ * [x] [`_mm_max_epu64`]
+ * [x] [`_mm256_mask_max_epu64`]
+ * [x] [`_mm256_maskz_max_epu64`]
+ * [x] [`_mm256_max_epu64`]
+ * [x] [`_mm512_max_ps`]
+ * [x] [`_mm512_mask_max_ps`]
+ * [x] [`_mm512_maskz_max_ps`]
+ * [x] [`_mm_mask_max_ps`]
+ * [x] [`_mm_maskz_max_ps`]
+ * [x] [`_mm256_mask_max_ps`]
+ * [x] [`_mm256_maskz_max_ps`]
+ * [x] [`_mm512_max_pd`]
+ * [x] [`_mm512_mask_max_pd`]
+ * [x] [`_mm512_maskz_max_pd`]
+ * [x] [`_mm_mask_max_pd`]
+ * [x] [`_mm_maskz_max_pd`]
+ * [x] [`_mm256_mask_max_pd`]
+ * [x] [`_mm256_maskz_max_pd`]
+ * [x] [`_mm512_max_round_ps`]
+ * [x] [`_mm512_mask_max_round_ps`]
+ * [x] [`_mm512_maskz_max_round_ps`]
+ * [x] [`_mm512_max_round_pd`]
+ * [x] [`_mm512_mask_max_round_pd`]
+ * [x] [`_mm512_maskz_max_round_pd`]
+ * [x] [`_mm512_min_epi32`]
+ * [x] [`_mm512_mask_min_epi32`]
+ * [x] [`_mm512_maskz_min_epi32`]
+ * [x] [`_mm_mask_min_epi32`]
+ * [x] [`_mm_maskz_min_epi32`]
+ * [x] [`_mm256_mask_min_epi32`]
+ * [x] [`_mm256_maskz_min_epi32`]
+ * [x] [`_mm512_min_epi64`]
+ * [x] [`_mm512_mask_min_epi64`]
+ * [x] [`_mm512_maskz_min_epi64`]
+ * [x] [`_mm_mask_min_epi64`]
+ * [x] [`_mm_maskz_min_epi64`]
+ * [x] [`_mm_min_epi64`]
+ * [x] [`_mm256_mask_min_epi64`]
+ * [x] [`_mm256_maskz_min_epi64`]
+ * [x] [`_mm256_min_epi64`]
+ * [x] [`_mm512_min_epu32`]
+ * [x] [`_mm512_mask_min_epu32`]
+ * [x] [`_mm512_maskz_min_epu32`]
+ * [x] [`_mm_mask_min_epu32`]
+ * [x] [`_mm_maskz_min_epu32`]
+ * [x] [`_mm256_mask_min_epu32`]
+ * [x] [`_mm256_maskz_min_epu32`]
+ * [x] [`_mm512_min_epu64`]
+ * [x] [`_mm512_mask_min_epu64`]
+ * [x] [`_mm512_maskz_min_epu64`]
+ * [x] [`_mm_mask_min_epu64`]
+ * [x] [`_mm_maskz_min_epu64`]
+ * [x] [`_mm_min_epu64`]
+ * [x] [`_mm256_mask_min_epu64`]
+ * [x] [`_mm256_maskz_min_epu64`]
+ * [x] [`_mm256_min_epu64`]
+ * [x] [`_mm512_min_ps`]
+ * [x] [`_mm512_mask_min_ps`]
+ * [x] [`_mm512_maskz_min_ps`]
+ * [x] [`_mm_mask_min_ps`]
+ * [x] [`_mm_maskz_min_ps`]
+ * [x] [`_mm256_mask_min_ps`]
+ * [x] [`_mm256_maskz_min_ps`]
+ * [x] [`_mm512_min_pd`]
+ * [x] [`_mm512_mask_min_pd`]
+ * [x] [`_mm512_maskz_min_pd`]
+ * [x] [`_mm_mask_min_pd`]
+ * [x] [`_mm_maskz_min_pd`]
+ * [x] [`_mm256_mask_min_pd`]
+ * [x] [`_mm256_maskz_min_pd`]
+ * [x] [`_mm512_min_round_ps`]
+ * [x] [`_mm512_mask_min_round_ps`]
+ * [x] [`_mm512_maskz_min_round_ps`]
+ * [x] [`_mm512_min_round_pd`]
+ * [x] [`_mm512_mask_min_round_pd`]
+ * [x] [`_mm512_maskz_min_round_pd`]
+ * [x] [`_mm512_sqrt_ps`]
+ * [x] [`_mm512_mask_sqrt_ps`]
+ * [x] [`_mm512_maskz_sqrt_ps`]
+ * [x] [`_mm_mask_sqrt_ps`]
+ * [x] [`_mm_maskz_sqrt_ps`]
+ * [x] [`_mm256_mask_sqrt_ps`]
+ * [x] [`_mm256_maskz_sqrt_ps`]
+ * [x] [`_mm512_sqrt_pd`]
+ * [x] [`_mm512_mask_sqrt_pd`]
+ * [x] [`_mm512_maskz_sqrt_pd`]
+ * [x] [`_mm_mask_sqrt_pd`]
+ * [x] [`_mm_maskz_sqrt_pd`]
+ * [x] [`_mm256_mask_sqrt_pd`]
+ * [x] [`_mm256_maskz_sqrt_pd`]
+ * [x] [`_mm512_sqrt_round_ps`]
+ * [x] [`_mm512_mask_sqrt_round_ps`]
+ * [x] [`_mm512_maskz_sqrt_round_ps`]
+ * [x] [`_mm512_sqrt_round_pd`]
+ * [x] [`_mm512_mask_sqrt_round_pd`]
+ * [x] [`_mm512_maskz_sqrt_round_pd`]
+ * [x] [`_mm512_rsqrt14_ps`]
+ * [x] [`_mm512_mask_rsqrt14_ps`]
+ * [x] [`_mm512_maskz_rsqrt14_ps`]
+ * [x] [`_mm_mask_rsqrt14_ps`]
+ * [x] [`_mm_maskz_rsqrt14_ps`]
+ * [x] [`_mm256_mask_rsqrt14_ps`]
+ * [x] [`_mm256_maskz_rsqrt14_ps`]
+ * [x] [`_mm512_rsqrt14_pd`]
+ * [x] [`_mm512_mask_rsqrt14_pd`]
+ * [x] [`_mm512_maskz_rsqrt14_pd`]
+ * [x] [`_mm_mask_rsqrt14_pd`]
+ * [x] [`_mm_maskz_rsqrt14_pd`]
+ * [x] [`_mm256_mask_rsqrt14_pd`]
+ * [x] [`_mm256_maskz_rsqrt14_pd`]
+ * [x] [`_mm512_rcp14_ps`]
+ * [x] [`_mm512_mask_rcp14_ps`]
+ * [x] [`_mm512_maskz_rcp14_ps`]
+ * [x] [`_mm_mask_rcp14_ps`]
+ * [x] [`_mm_maskz_rcp14_ps`]
+ * [x] [`_mm_rcp14_ps`]
+ * [x] [`_mm256_mask_rcp14_ps`]
+ * [x] [`_mm256_maskz_rcp14_ps`]
+ * [x] [`_mm256_rcp14_ps`]
+ * [x] [`_mm512_rcp14_pd`]
+ * [x] [`_mm512_mask_rcp14_pd`]
+ * [x] [`_mm512_maskz_rcp14_pd`]
+ * [x] [`_mm_mask_rcp14_pd`]
+ * [x] [`_mm_maskz_rcp14_pd`]
+ * [x] [`_mm_rcp14_pd`]
+ * [x] [`_mm256_mask_rcp14_pd`]
+ * [x] [`_mm256_maskz_rcp14_pd`]
+ * [x] [`_mm256_rcp14_pd`]
+ * [x] [`_mm512_getexp_ps`]
+ * [x] [`_mm512_mask_getexp_ps`]
+ * [x] [`_mm512_maskz_getexp_ps`]
+ * [x] [`_mm_getexp_ps`]
+ * [x] [`_mm_mask_getexp_ps`]
+ * [x] [`_mm_maskz_getexp_ps`]
+ * [x] [`_mm256_getexp_ps`]
+ * [x] [`_mm256_mask_getexp_ps`]
+ * [x] [`_mm256_maskz_getexp_ps`]
+ * [x] [`_mm512_getexp_pd`]
+ * [x] [`_mm512_mask_getexp_pd`]
+ * [x] [`_mm512_maskz_getexp_pd`]
+ * [x] [`_mm_getexp_pd`]
+ * [x] [`_mm_mask_getexp_pd`]
+ * [x] [`_mm_maskz_getexp_pd`]
+ * [x] [`_mm256_getexp_pd`]
+ * [x] [`_mm256_mask_getexp_pd`]
+ * [x] [`_mm256_maskz_getexp_pd`]
+ * [x] [`_mm512_getexp_round_ps`]
+ * [x] [`_mm512_mask_getexp_round_ps`]
+ * [x] [`_mm512_maskz_getexp_round_ps`]
+ * [x] [`_mm512_getexp_round_pd`]
+ * [x] [`_mm512_mask_getexp_round_pd`]
+ * [x] [`_mm512_maskz_getexp_round_pd`]
+ * [x] [`_mm512_getmant_ps`]
+ * [x] [`_mm512_mask_getmant_ps`]
+ * [x] [`_mm512_maskz_getmant_ps`]
+ * [x] [`_mm_getmant_ps`]
+ * [x] [`_mm_mask_getmant_ps`]
+ * [x] [`_mm_maskz_getmant_ps`]
+ * [x] [`_mm256_getmant_ps`]
+ * [x] [`_mm256_mask_getmant_ps`]
+ * [x] [`_mm256_maskz_getmant_ps`]
+ * [x] [`_mm512_getmant_pd`]
+ * [x] [`_mm512_mask_getmant_pd`]
+ * [x] [`_mm512_maskz_getmant_pd`]
+ * [x] [`_mm_getmant_pd`]
+ * [x] [`_mm_mask_getmant_pd`]
+ * [x] [`_mm_maskz_getmant_pd`]
+ * [x] [`_mm256_getmant_pd`]
+ * [x] [`_mm256_mask_getmant_pd`]
+ * [x] [`_mm256_maskz_getmant_pd`]
+ * [x] [`_mm512_getmant_round_ps`]
+ * [x] [`_mm512_mask_getmant_round_ps`]
+ * [x] [`_mm512_maskz_getmant_round_ps`]
+ * [x] [`_mm512_getmant_round_pd`]
+ * [x] [`_mm512_mask_getmant_round_pd`]
+ * [x] [`_mm512_maskz_getmant_round_pd`]
+ * [x] [`_mm512_roundscale_ps`]
+ * [x] [`_mm512_mask_roundscale_ps`]
+ * [x] [`_mm512_maskz_roundscale_ps`]
+ * [x] [`_mm_mask_roundscale_ps`]
+ * [x] [`_mm_maskz_roundscale_ps`]
+ * [x] [`_mm_roundscale_ps`]
+ * [x] [`_mm256_mask_roundscale_ps`]
+ * [x] [`_mm256_maskz_roundscale_ps`]
+ * [x] [`_mm256_roundscale_ps`]
+ * [x] [`_mm512_roundscale_pd`]
+ * [x] [`_mm512_mask_roundscale_pd`]
+ * [x] [`_mm512_maskz_roundscale_pd`]
+ * [x] [`_mm_mask_roundscale_pd`]
+ * [x] [`_mm_maskz_roundscale_pd`]
+ * [x] [`_mm_roundscale_pd`]
+ * [x] [`_mm256_mask_roundscale_pd`]
+ * [x] [`_mm256_maskz_roundscale_pd`]
+ * [x] [`_mm256_roundscale_pd`]
+ * [x] [`_mm512_roundscale_round_ps`]
+ * [x] [`_mm512_mask_roundscale_round_ps`]
+ * [x] [`_mm512_maskz_roundscale_round_ps`]
+ * [x] [`_mm512_roundscale_round_pd`]
+ * [x] [`_mm512_mask_roundscale_round_pd`]
+ * [x] [`_mm512_maskz_roundscale_round_pd`]
+ * [x] [`_mm512_scalef_ps`]
+ * [x] [`_mm512_mask_scalef_ps`]
+ * [x] [`_mm512_maskz_scalef_ps`]
+ * [x] [`_mm_mask_scalef_ps`]
+ * [x] [`_mm_maskz_scalef_ps`]
+ * [x] [`_mm_scalef_ps`]
+ * [x] [`_mm256_mask_scalef_ps`]
+ * [x] [`_mm256_maskz_scalef_ps`]
+ * [x] [`_mm256_scalef_ps`]
+ * [x] [`_mm512_scalef_pd`]
+ * [x] [`_mm512_mask_scalef_pd`]
+ * [x] [`_mm512_maskz_scalef_pd`]
+ * [x] [`_mm_mask_scalef_pd`]
+ * [x] [`_mm_maskz_scalef_pd`]
+ * [x] [`_mm_scalef_pd`]
+ * [x] [`_mm256_mask_scalef_pd`]
+ * [x] [`_mm256_maskz_scalef_pd`]
+ * [x] [`_mm256_scalef_pd`]
+ * [x] [`_mm512_scalef_round_ps`]
+ * [x] [`_mm512_mask_scalef_round_ps`]
+ * [x] [`_mm512_maskz_scalef_round_ps`]
+ * [x] [`_mm512_scalef_round_pd`]
+ * [x] [`_mm512_mask_scalef_round_pd`]
+ * [x] [`_mm512_maskz_scalef_round_pd`]
+ * [x] [`_mm512_fixupimm_ps`]
+ * [x] [`_mm512_mask_fixupimm_ps`]
+ * [x] [`_mm512_maskz_fixupimm_ps`]
+ * [x] [`_mm_fixupimm_ps`]
+ * [x] [`_mm_mask_fixupimm_ps`]
+ * [x] [`_mm_maskz_fixupimm_ps`]
+ * [x] [`_mm256_fixupimm_ps`]
+ * [x] [`_mm256_mask_fixupimm_ps`]
+ * [x] [`_mm256_maskz_fixupimm_ps`]
+ * [x] [`_mm512_fixupimm_pd`]
+ * [x] [`_mm512_mask_fixupimm_pd`]
+ * [x] [`_mm512_maskz_fixupimm_pd`]
+ * [x] [`_mm_fixupimm_pd`]
+ * [x] [`_mm_mask_fixupimm_pd`]
+ * [x] [`_mm_maskz_fixupimm_pd`]
+ * [x] [`_mm256_fixupimm_pd`]
+ * [x] [`_mm256_mask_fixupimm_pd`]
+ * [x] [`_mm256_maskz_fixupimm_pd`]
+ * [x] [`_mm512_fixupimm_round_ps`]
+ * [x] [`_mm512_mask_fixupimm_round_ps`]
+ * [x] [`_mm512_maskz_fixupimm_round_ps`]
+ * [x] [`_mm512_fixupimm_round_pd`]
+ * [x] [`_mm512_mask_fixupimm_round_pd`]
+ * [x] [`_mm512_maskz_fixupimm_round_pd`]
+ * [x] [`_mm512_fmadd_ps`]
+ * [x] [`_mm512_mask_fmadd_ps`]
+ * [x] [`_mm512_maskz_fmadd_ps`]
+ * [x] [`_mm512_mask3_fmadd_ps`]
+ * [x] [`_mm_mask_fmadd_ps`]
+ * [x] [`_mm_mask3_fmadd_ps`]
+ * [x] [`_mm_maskz_fmadd_ps`]
+ * [x] [`_mm256_mask_fmadd_ps`]
+ * [x] [`_mm256_mask3_fmadd_ps`]
+ * [x] [`_mm256_maskz_fmadd_ps`]
+ * [x] [`_mm512_fmadd_pd`]
+ * [x] [`_mm512_mask_fmadd_pd`]
+ * [x] [`_mm512_maskz_fmadd_pd`]
+ * [x] [`_mm512_mask3_fmadd_pd`]
+ * [x] [`_mm_mask_fmadd_pd`]
+ * [x] [`_mm_mask3_fmadd_pd`]
+ * [x] [`_mm_maskz_fmadd_pd`]
+ * [x] [`_mm256_mask_fmadd_pd`]
+ * [x] [`_mm256_mask3_fmadd_pd`]
+ * [x] [`_mm256_maskz_fmadd_pd`]
+ * [x] [`_mm512_fmadd_round_ps`]
+ * [x] [`_mm512_mask_fmadd_round_ps`]
+ * [x] [`_mm512_maskz_fmadd_round_ps`]
+ * [x] [`_mm512_mask3_fmadd_round_ps`]
+ * [x] [`_mm512_fmadd_round_pd`]
+ * [x] [`_mm512_mask_fmadd_round_pd`]
+ * [x] [`_mm512_maskz_fmadd_round_pd`]
+ * [x] [`_mm512_mask3_fmadd_round_pd`]
+ * [x] [`_mm512_fmsub_ps`]
+ * [x] [`_mm512_mask_fmsub_ps`]
+ * [x] [`_mm512_maskz_fmsub_ps`]
+ * [x] [`_mm512_mask3_fmsub_ps`]
+ * [x] [`_mm_mask_fmsub_ps`]
+ * [x] [`_mm_mask3_fmsub_ps`]
+ * [x] [`_mm_maskz_fmsub_ps`]
+ * [x] [`_mm256_mask_fmsub_ps`]
+ * [x] [`_mm256_mask3_fmsub_ps`]
+ * [x] [`_mm256_maskz_fmsub_ps`]
+ * [x] [`_mm512_fmsub_pd`]
+ * [x] [`_mm512_mask_fmsub_pd`]
+ * [x] [`_mm512_maskz_fmsub_pd`]
+ * [x] [`_mm512_mask3_fmsub_pd`]
+ * [x] [`_mm_mask_fmsub_pd`]
+ * [x] [`_mm_mask3_fmsub_pd`]
+ * [x] [`_mm_maskz_fmsub_pd`]
+ * [x] [`_mm256_mask_fmsub_pd`]
+ * [x] [`_mm256_mask3_fmsub_pd`]
+ * [x] [`_mm256_maskz_fmsub_pd`]
+ * [x] [`_mm512_fmsub_round_ps`]
+ * [x] [`_mm512_mask_fmsub_round_ps`]
+ * [x] [`_mm512_maskz_fmsub_round_ps`]
+ * [x] [`_mm512_mask3_fmsub_round_ps`]
+ * [x] [`_mm512_fmsub_round_pd`]
+ * [x] [`_mm512_mask_fmsub_round_pd`]
+ * [x] [`_mm512_maskz_fmsub_round_pd`]
+ * [x] [`_mm512_mask3_fmsub_round_pd`]
+ * [x] [`_mm512_fmaddsub_ps`]
+ * [x] [`_mm512_mask_fmaddsub_ps`]
+ * [x] [`_mm512_maskz_fmaddsub_ps`]
+ * [x] [`_mm512_mask3_fmaddsub_ps`]
+ * [x] [`_mm_mask_fmaddsub_ps`]
+ * [x] [`_mm_mask3_fmaddsub_ps`]
+ * [x] [`_mm_maskz_fmaddsub_ps`]
+ * [x] [`_mm256_mask_fmaddsub_ps`]
+ * [x] [`_mm256_mask3_fmaddsub_ps`]
+ * [x] [`_mm256_maskz_fmaddsub_ps`]
+ * [x] [`_mm512_fmaddsub_pd`]
+ * [x] [`_mm512_mask_fmaddsub_pd`]
+ * [x] [`_mm512_maskz_fmaddsub_pd`]
+ * [x] [`_mm512_mask3_fmaddsub_pd`]
+ * [x] [`_mm_mask_fmaddsub_pd`]
+ * [x] [`_mm_mask3_fmaddsub_pd`]
+ * [x] [`_mm_maskz_fmaddsub_pd`]
+ * [x] [`_mm256_mask_fmaddsub_pd`]
+ * [x] [`_mm256_mask3_fmaddsub_pd`]
+ * [x] [`_mm256_maskz_fmaddsub_pd`]
+ * [x] [`_mm512_fmaddsub_round_ps`]
+ * [x] [`_mm512_mask_fmaddsub_round_ps`]
+ * [x] [`_mm512_maskz_fmaddsub_round_ps`]
+ * [x] [`_mm512_mask3_fmaddsub_round_ps`]
+ * [x] [`_mm512_fmaddsub_round_pd`]
+ * [x] [`_mm512_mask_fmaddsub_round_pd`]
+ * [x] [`_mm512_maskz_fmaddsub_round_pd`]
+ * [x] [`_mm512_mask3_fmaddsub_round_pd`]
+ * [x] [`_mm512_fmsubadd_ps`]
+ * [x] [`_mm512_mask_fmsubadd_ps`]
+ * [x] [`_mm512_maskz_fmsubadd_ps`]
+ * [x] [`_mm512_mask3_fmsubadd_ps`]
+ * [x] [`_mm_mask_fmsubadd_ps`]
+ * [x] [`_mm_mask3_fmsubadd_ps`]
+ * [x] [`_mm_maskz_fmsubadd_ps`]
+ * [x] [`_mm256_mask_fmsubadd_ps`]
+ * [x] [`_mm256_mask3_fmsubadd_ps`]
+ * [x] [`_mm256_maskz_fmsubadd_ps`]
+ * [x] [`_mm512_fmsubadd_pd`]
+ * [x] [`_mm512_mask_fmsubadd_pd`]
+ * [x] [`_mm512_maskz_fmsubadd_pd`]
+ * [x] [`_mm512_mask3_fmsubadd_pd`]
+ * [x] [`_mm_mask_fmsubadd_pd`]
+ * [x] [`_mm_mask3_fmsubadd_pd`]
+ * [x] [`_mm_maskz_fmsubadd_pd`]
+ * [x] [`_mm256_mask_fmsubadd_pd`]
+ * [x] [`_mm256_mask3_fmsubadd_pd`]
+ * [x] [`_mm256_maskz_fmsubadd_pd`]
+ * [x] [`_mm512_fmsubadd_round_ps`]
+ * [x] [`_mm512_mask_fmsubadd_round_ps`]
+ * [x] [`_mm512_maskz_fmsubadd_round_ps`]
+ * [x] [`_mm512_mask3_fmsubadd_round_ps`]
+ * [x] [`_mm512_fmsubadd_round_pd`]
+ * [x] [`_mm512_mask_fmsubadd_round_pd`]
+ * [x] [`_mm512_maskz_fmsubadd_round_pd`]
+ * [x] [`_mm512_mask3_fmsubadd_round_pd`]
+ * [x] [`_mm512_fnmadd_ps`]
+ * [x] [`_mm512_mask_fnmadd_ps`]
+ * [x] [`_mm512_maskz_fnmadd_ps`]
+ * [x] [`_mm512_mask3_fnmadd_ps`]
+ * [x] [`_mm_mask_fnmadd_ps`]
+ * [x] [`_mm_mask3_fnmadd_ps`]
+ * [x] [`_mm_maskz_fnmadd_ps`]
+ * [x] [`_mm256_mask_fnmadd_ps`]
+ * [x] [`_mm256_mask3_fnmadd_ps`]
+ * [x] [`_mm256_maskz_fnmadd_ps`]
+ * [x] [`_mm512_fnmadd_pd`]
+ * [x] [`_mm512_mask_fnmadd_pd`]
+ * [x] [`_mm512_maskz_fnmadd_pd`]
+ * [x] [`_mm512_mask3_fnmadd_pd`]
+ * [x] [`_mm_mask_fnmadd_pd`]
+ * [x] [`_mm_mask3_fnmadd_pd`]
+ * [x] [`_mm_maskz_fnmadd_pd`]
+ * [x] [`_mm256_mask_fnmadd_pd`]
+ * [x] [`_mm256_mask3_fnmadd_pd`]
+ * [x] [`_mm256_maskz_fnmadd_pd`]
+ * [x] [`_mm512_fnmadd_round_ps`]
+ * [x] [`_mm512_mask_fnmadd_round_ps`]
+ * [x] [`_mm512_maskz_fnmadd_round_ps`]
+ * [x] [`_mm512_mask3_fnmadd_round_ps`]
+ * [x] [`_mm512_fnmadd_round_pd`]
+ * [x] [`_mm512_mask_fnmadd_round_pd`]
+ * [x] [`_mm512_maskz_fnmadd_round_pd`]
+ * [x] [`_mm512_mask3_fnmadd_round_pd`]
+ * [x] [`_mm512_fnmsub_ps`]
+ * [x] [`_mm512_mask_fnmsub_ps`]
+ * [x] [`_mm512_maskz_fnmsub_ps`]
+ * [x] [`_mm512_mask3_fnmsub_ps`]
+ * [x] [`_mm_mask_fnmsub_ps`]
+ * [x] [`_mm_mask3_fnmsub_ps`]
+ * [x] [`_mm_maskz_fnmsub_ps`]
+ * [x] [`_mm256_mask_fnmsub_ps`]
+ * [x] [`_mm256_mask3_fnmsub_ps`]
+ * [x] [`_mm256_maskz_fnmsub_ps`]
+ * [x] [`_mm512_fnmsub_pd`]
+ * [x] [`_mm512_mask_fnmsub_pd`]
+ * [x] [`_mm512_maskz_fnmsub_pd`]
+ * [x] [`_mm512_mask3_fnmsub_pd`]
+ * [x] [`_mm_mask_fnmsub_pd`]
+ * [x] [`_mm_mask3_fnmsub_pd`]
+ * [x] [`_mm_maskz_fnmsub_pd`]
+ * [x] [`_mm256_mask_fnmsub_pd`]
+ * [x] [`_mm256_mask3_fnmsub_pd`]
+ * [x] [`_mm256_maskz_fnmsub_pd`]
+ * [x] [`_mm512_fnmsub_round_ps`]
+ * [x] [`_mm512_mask_fnmsub_round_ps`]
+ * [x] [`_mm512_maskz_fnmsub_round_ps`]
+ * [x] [`_mm512_mask3_fnmsub_round_ps`]
+ * [x] [`_mm512_fnmsub_round_pd`]
+ * [x] [`_mm512_mask_fnmsub_round_pd`]
+ * [x] [`_mm512_maskz_fnmsub_round_pd`]
+ * [x] [`_mm512_mask3_fnmsub_round_pd`]
+ * [x] [`_mm512_cmp_epi32_mask`]
+ * [x] [`_mm512_mask_cmp_epi32_mask`]
+ * [x] [`_mm_cmp_epi32_mask`]
+ * [x] [`_mm_mask_cmp_epi32_mask`]
+ * [x] [`_mm256_cmp_epi32_mask`]
+ * [x] [`_mm256_mask_cmp_epi32_mask`]
+ * [x] [`_mm512_cmp_epi64_mask`]
+ * [x] [`_mm512_mask_cmp_epi64_mask`]
+ * [x] [`_mm_cmp_epi64_mask`]
+ * [x] [`_mm_mask_cmp_epi64_mask`]
+ * [x] [`_mm256_cmp_epi64_mask`]
+ * [x] [`_mm256_mask_cmp_epi64_mask`]
+ * [x] [`_mm512_cmp_epu32_mask`]
+ * [x] [`_mm512_mask_cmp_epu32_mask`]
+ * [x] [`_mm_cmp_epu32_mask`]
+ * [x] [`_mm_mask_cmp_epu32_mask`]
+ * [x] [`_mm256_cmp_epu32_mask`]
+ * [x] [`_mm256_mask_cmp_epu32_mask`]
+ * [x] [`_mm512_cmp_epu64_mask`]
+ * [x] [`_mm512_mask_cmp_epu64_mask`]
+ * [x] [`_mm_cmp_epu64_mask`]
+ * [x] [`_mm_mask_cmp_epu64_mask`]
+ * [x] [`_mm256_cmp_epu64_mask`]
+ * [x] [`_mm256_mask_cmp_epu64_mask`]
+ * [x] [`_mm512_cmp_ps_mask`]
+ * [x] [`_mm512_mask_cmp_ps_mask`]
+ * [x] [`_mm_cmp_ps_mask`]
+ * [x] [`_mm_mask_cmp_ps_mask`]
+ * [x] [`_mm256_cmp_ps_mask`]
+ * [x] [`_mm256_mask_cmp_ps_mask`]
+ * [x] [`_mm512_cmp_round_ps_mask`]
+ * [x] [`_mm512_mask_cmp_round_ps_mask`]
+ * [x] [`_mm512_cmp_pd_mask`]
+ * [x] [`_mm512_mask_cmp_pd_mask`]
+ * [x] [`_mm_cmp_pd_mask`]
+ * [x] [`_mm_mask_cmp_pd_mask`]
+ * [x] [`_mm256_cmp_pd_mask`]
+ * [x] [`_mm256_mask_cmp_pd_mask`]
+ * [x] [`_mm512_cmp_round_pd_mask`]
+ * [x] [`_mm512_mask_cmp_round_pd_mask`]
+ * [x] [`_mm512_cmpeq_epi32_mask`]
+ * [x] [`_mm512_mask_cmpeq_epi32_mask`]
+ * [x] [`_mm_cmpeq_epi32_mask`]
+ * [x] [`_mm_mask_cmpeq_epi32_mask`]
+ * [x] [`_mm256_cmpeq_epi32_mask`]
+ * [x] [`_mm256_mask_cmpeq_epi32_mask`]
+ * [x] [`_mm512_cmpeq_epi64_mask`]
+ * [x] [`_mm512_mask_cmpeq_epi64_mask`]
+ * [x] [`_mm_cmpeq_epi64_mask`]
+ * [x] [`_mm_mask_cmpeq_epi64_mask`]
+ * [x] [`_mm256_cmpeq_epi64_mask`]
+ * [x] [`_mm256_mask_cmpeq_epi64_mask`]
+ * [x] [`_mm512_cmpeq_epu32_mask`]
+ * [x] [`_mm512_mask_cmpeq_epu32_mask`]
+ * [x] [`_mm_cmpeq_epu32_mask`]
+ * [x] [`_mm_mask_cmpeq_epu32_mask`]
+ * [x] [`_mm256_cmpeq_epu32_mask`]
+ * [x] [`_mm256_mask_cmpeq_epu32_mask`]
+ * [x] [`_mm512_cmpeq_epu64_mask`]
+ * [x] [`_mm512_mask_cmpeq_epu64_mask`]
+ * [x] [`_mm_cmpeq_epu64_mask`]
+ * [x] [`_mm_mask_cmpeq_epu64_mask`]
+ * [x] [`_mm256_cmpeq_epu64_mask`]
+ * [x] [`_mm256_mask_cmpeq_epu64_mask`]
+ * [x] [`_mm512_cmpneq_epi32_mask`]
+ * [x] [`_mm512_mask_cmpneq_epi32_mask`]
+ * [x] [`_mm_cmpneq_epi32_mask`]
+ * [x] [`_mm_mask_cmpneq_epi32_mask`]
+ * [x] [`_mm256_cmpneq_epi32_mask`]
+ * [x] [`_mm256_mask_cmpneq_epi32_mask`]
+ * [x] [`_mm512_cmpneq_epi64_mask`]
+ * [x] [`_mm512_mask_cmpneq_epi64_mask`]
+ * [x] [`_mm_cmpneq_epi64_mask`]
+ * [x] [`_mm_mask_cmpneq_epi64_mask`]
+ * [x] [`_mm256_cmpneq_epi64_mask`]
+ * [x] [`_mm256_mask_cmpneq_epi64_mask`]
+ * [x] [`_mm512_cmpneq_epu32_mask`]
+ * [x] [`_mm512_mask_cmpneq_epu32_mask`]
+ * [x] [`_mm_cmpneq_epu32_mask`]
+ * [x] [`_mm_mask_cmpneq_epu32_mask`]
+ * [x] [`_mm256_cmpneq_epu32_mask`]
+ * [x] [`_mm256_mask_cmpneq_epu32_mask`]
+ * [x] [`_mm512_cmpneq_epu64_mask`]
+ * [x] [`_mm512_mask_cmpneq_epu64_mask`]
+ * [x] [`_mm_cmpneq_epu64_mask`]
+ * [x] [`_mm_mask_cmpneq_epu64_mask`]
+ * [x] [`_mm256_cmpneq_epu64_mask`]
+ * [x] [`_mm256_mask_cmpneq_epu64_mask`]
+ * [x] [`_mm512_cmpge_epi32_mask`]
+ * [x] [`_mm512_mask_cmpge_epi32_mask`]
+ * [x] [`_mm_cmpge_epi32_mask`]
+ * [x] [`_mm_mask_cmpge_epi32_mask`]
+ * [x] [`_mm256_cmpge_epi32_mask`]
+ * [x] [`_mm256_mask_cmpge_epi32_mask`]
+ * [x] [`_mm512_cmpge_epi64_mask`]
+ * [x] [`_mm512_mask_cmpge_epi64_mask`]
+ * [x] [`_mm_cmpge_epi64_mask`]
+ * [x] [`_mm_mask_cmpge_epi64_mask`]
+ * [x] [`_mm256_cmpge_epi64_mask`]
+ * [x] [`_mm256_mask_cmpge_epi64_mask`]
+ * [x] [`_mm512_cmpge_epu32_mask`]
+ * [x] [`_mm512_mask_cmpge_epu32_mask`]
+ * [x] [`_mm_cmpge_epu32_mask`]
+ * [x] [`_mm_mask_cmpge_epu32_mask`]
+ * [x] [`_mm256_cmpge_epu32_mask`]
+ * [x] [`_mm256_mask_cmpge_epu32_mask`]
+ * [x] [`_mm512_cmpge_epu64_mask`]
+ * [x] [`_mm512_mask_cmpge_epu64_mask`]
+ * [x] [`_mm_cmpge_epu64_mask`]
+ * [x] [`_mm_mask_cmpge_epu64_mask`]
+ * [x] [`_mm256_cmpge_epu64_mask`]
+ * [x] [`_mm256_mask_cmpge_epu64_mask`]
+ * [x] [`_mm512_cmpgt_epi32_mask`]
+ * [x] [`_mm512_mask_cmpgt_epi32_mask`]
+ * [x] [`_mm_cmpgt_epi32_mask`]
+ * [x] [`_mm_mask_cmpgt_epi32_mask`]
+ * [x] [`_mm256_cmpgt_epi32_mask`]
+ * [x] [`_mm256_mask_cmpgt_epi32_mask`]
+ * [x] [`_mm512_cmpgt_epi64_mask`]
+ * [x] [`_mm512_mask_cmpgt_epi64_mask`]
+ * [x] [`_mm_cmpgt_epi64_mask`]
+ * [x] [`_mm_mask_cmpgt_epi64_mask`]
+ * [x] [`_mm256_cmpgt_epi64_mask`]
+ * [x] [`_mm256_mask_cmpgt_epi64_mask`]
+ * [x] [`_mm512_cmpgt_epu32_mask`]
+ * [x] [`_mm512_mask_cmpgt_epu32_mask`]
+ * [x] [`_mm_cmpgt_epu32_mask`]
+ * [x] [`_mm_mask_cmpgt_epu32_mask`]
+ * [x] [`_mm256_cmpgt_epu32_mask`]
+ * [x] [`_mm256_mask_cmpgt_epu32_mask`]
+ * [x] [`_mm512_cmpgt_epu64_mask`]
+ * [x] [`_mm512_mask_cmpgt_epu64_mask`]
+ * [x] [`_mm_cmpgt_epu64_mask`]
+ * [x] [`_mm_mask_cmpgt_epu64_mask`]
+ * [x] [`_mm256_cmpgt_epu64_mask`]
+ * [x] [`_mm256_mask_cmpgt_epu64_mask`]
+ * [x] [`_mm512_cmple_epi32_mask`]
+ * [x] [`_mm512_mask_cmple_epi32_mask`]
+ * [x] [`_mm_cmple_epi32_mask`]
+ * [x] [`_mm_mask_cmple_epi32_mask`]
+ * [x] [`_mm256_cmple_epi32_mask`]
+ * [x] [`_mm256_mask_cmple_epi32_mask`]
+ * [x] [`_mm512_cmple_epi64_mask`]
+ * [x] [`_mm512_mask_cmple_epi64_mask`]
+ * [x] [`_mm_cmple_epi64_mask`]
+ * [x] [`_mm_mask_cmple_epi64_mask`]
+ * [x] [`_mm256_cmple_epi64_mask`]
+ * [x] [`_mm256_mask_cmple_epi64_mask`]
+ * [x] [`_mm512_cmple_epu32_mask`]
+ * [x] [`_mm512_mask_cmple_epu32_mask`]
+ * [x] [`_mm_cmple_epu32_mask`]
+ * [x] [`_mm_mask_cmple_epu32_mask`]
+ * [x] [`_mm256_cmple_epu32_mask`]
+ * [x] [`_mm256_mask_cmple_epu32_mask`]
+ * [x] [`_mm512_cmple_epu64_mask`]
+ * [x] [`_mm512_mask_cmple_epu64_mask`]
+ * [x] [`_mm_cmple_epu64_mask`]
+ * [x] [`_mm_mask_cmple_epu64_mask`]
+ * [x] [`_mm256_cmple_epu64_mask`]
+ * [x] [`_mm256_mask_cmple_epu64_mask`]
+ * [x] [`_mm512_cmplt_epi32_mask`]
+ * [x] [`_mm512_mask_cmplt_epi32_mask`]
+ * [x] [`_mm_cmplt_epi32_mask`]
+ * [x] [`_mm_mask_cmplt_epi32_mask`]
+ * [x] [`_mm256_cmplt_epi32_mask`]
+ * [x] [`_mm256_mask_cmplt_epi32_mask`]
+ * [x] [`_mm512_cmplt_epi64_mask`]
+ * [x] [`_mm512_mask_cmplt_epi64_mask`]
+ * [x] [`_mm_cmplt_epi64_mask`]
+ * [x] [`_mm_mask_cmplt_epi64_mask`]
+ * [x] [`_mm256_cmplt_epi64_mask`]
+ * [x] [`_mm256_mask_cmplt_epi64_mask`]
+ * [x] [`_mm512_cmplt_epu32_mask`]
+ * [x] [`_mm512_mask_cmplt_epu32_mask`]
+ * [x] [`_mm_cmplt_epu32_mask`]
+ * [x] [`_mm_mask_cmplt_epu32_mask`]
+ * [x] [`_mm256_cmplt_epu32_mask`]
+ * [x] [`_mm256_mask_cmplt_epu32_mask`]
+ * [x] [`_mm512_cmplt_epu64_mask`]
+ * [x] [`_mm512_mask_cmplt_epu64_mask`]
+ * [x] [`_mm_cmplt_epu64_mask`]
+ * [x] [`_mm_mask_cmplt_epu64_mask`]
+ * [x] [`_mm256_cmplt_epu64_mask`]
+ * [x] [`_mm256_mask_cmplt_epu64_mask`]
+ * [x] [`_mm512_cmpeq_ps_mask`]
+ * [x] [`_mm512_mask_cmpeq_ps_mask`]
+ * [x] [`_mm512_cmpeq_pd_mask`]
+ * [x] [`_mm512_mask_cmpeq_pd_mask`]
+ * [x] [`_mm512_cmpneq_ps_mask`]
+ * [x] [`_mm512_mask_cmpneq_ps_mask`]
+ * [x] [`_mm512_cmpneq_pd_mask`]
+ * [x] [`_mm512_mask_cmpneq_pd_mask`]
+ * [x] [`_mm512_cmple_ps_mask`]
+ * [x] [`_mm512_mask_cmple_ps_mask`]
+ * [x] [`_mm512_cmple_pd_mask`]
+ * [x] [`_mm512_mask_cmple_pd_mask`]
+ * [x] [`_mm512_cmplt_ps_mask`]
+ * [x] [`_mm512_mask_cmplt_ps_mask`]
+ * [x] [`_mm512_cmplt_pd_mask`]
+ * [x] [`_mm512_mask_cmplt_pd_mask`]
+ * [x] [`_mm512_cmpnle_ps_mask`]
+ * [x] [`_mm512_mask_cmpnle_ps_mask`]
+ * [x] [`_mm512_cmpnle_pd_mask`]
+ * [x] [`_mm512_mask_cmpnle_pd_mask`]
+ * [x] [`_mm512_cmpnlt_ps_mask`]
+ * [x] [`_mm512_mask_cmpnlt_ps_mask`]
+ * [x] [`_mm512_cmpnlt_pd_mask`]
+ * [x] [`_mm512_mask_cmpnlt_pd_mask`]
+ * [x] [`_mm512_cmpord_ps_mask`]
+ * [x] [`_mm512_mask_cmpord_ps_mask`]
+ * [x] [`_mm512_cmpord_pd_mask`]
+ * [x] [`_mm512_mask_cmpord_pd_mask`]
+ * [x] [`_mm512_cmpunord_ps_mask`]
+ * [x] [`_mm512_mask_cmpunord_ps_mask`]
+ * [x] [`_mm512_cmpunord_pd_mask`]
+ * [x] [`_mm512_mask_cmpunord_pd_mask`]
+ * [x] [`_mm512_reduce_add_epi32`]
+ * [x] [`_mm512_mask_reduce_add_epi32`]
+ * [x] [`_mm512_reduce_add_epi64`]
+ * [x] [`_mm512_mask_reduce_add_epi64`]
+ * [x] [`_mm512_reduce_add_ps`]
+ * [x] [`_mm512_mask_reduce_add_ps`]
+ * [x] [`_mm512_reduce_add_pd`]
+ * [x] [`_mm512_mask_reduce_add_pd`]
+ * [x] [`_mm512_reduce_and_epi32`]
+ * [x] [`_mm512_mask_reduce_and_epi32`]
+ * [x] [`_mm512_reduce_and_epi64`]
+ * [x] [`_mm512_mask_reduce_and_epi64`]
+ * [x] [`_mm512_reduce_max_epi32`]
+ * [x] [`_mm512_mask_reduce_max_epi32`]
+ * [x] [`_mm512_reduce_max_epi64`]
+ * [x] [`_mm512_mask_reduce_max_epi64`]
+ * [x] [`_mm512_reduce_max_epu32`]
+ * [x] [`_mm512_mask_reduce_max_epu32`]
+ * [x] [`_mm512_reduce_max_epu64`]
+ * [x] [`_mm512_mask_reduce_max_epu64`]
+ * [x] [`_mm512_reduce_max_ps`]
+ * [x] [`_mm512_mask_reduce_max_ps`]
+ * [x] [`_mm512_reduce_max_pd`]
+ * [x] [`_mm512_mask_reduce_max_pd`]
+ * [x] [`_mm512_reduce_min_epi32`]
+ * [x] [`_mm512_mask_reduce_min_epi32`]
+ * [x] [`_mm512_reduce_min_epi64`]
+ * [x] [`_mm512_mask_reduce_min_epi64`]
+ * [x] [`_mm512_reduce_min_epu32`]
+ * [x] [`_mm512_mask_reduce_min_epu32`]
+ * [x] [`_mm512_reduce_min_epu64`]
+ * [x] [`_mm512_mask_reduce_min_epu64`]
+ * [x] [`_mm512_reduce_min_ps`]
+ * [x] [`_mm512_mask_reduce_min_ps`]
+ * [x] [`_mm512_reduce_min_pd`]
+ * [x] [`_mm512_mask_reduce_min_pd`]
+ * [x] [`_mm512_reduce_mul_epi32`]
+ * [x] [`_mm512_mask_reduce_mul_epi32`]
+ * [x] [`_mm512_reduce_mul_epi64`]
+ * [x] [`_mm512_mask_reduce_mul_epi64`]
+ * [x] [`_mm512_reduce_mul_ps`]
+ * [x] [`_mm512_mask_reduce_mul_ps`]
+ * [x] [`_mm512_reduce_mul_pd`]
+ * [x] [`_mm512_mask_reduce_mul_pd`]
+ * [x] [`_mm512_reduce_or_epi32`]
+ * [x] [`_mm512_mask_reduce_or_epi32`]
+ * [x] [`_mm512_reduce_or_epi64`]
+ * [x] [`_mm512_mask_reduce_or_epi64`]
+ * [x] [`_mm512_rol_epi32`]
+ * [x] [`_mm512_mask_rol_epi32`]
+ * [x] [`_mm512_maskz_rol_epi32`]
+ * [x] [`_mm_mask_rol_epi32`]
+ * [x] [`_mm_maskz_rol_epi32`]
+ * [x] [`_mm_rol_epi32`]
+ * [x] [`_mm256_mask_rol_epi32`]
+ * [x] [`_mm256_maskz_rol_epi32`]
+ * [x] [`_mm256_rol_epi32`]
+ * [x] [`_mm512_rol_epi64`]
+ * [x] [`_mm512_mask_rol_epi64`]
+ * [x] [`_mm512_maskz_rol_epi64`]
+ * [x] [`_mm_mask_rol_epi64`]
+ * [x] [`_mm_maskz_rol_epi64`]
+ * [x] [`_mm_rol_epi64`]
+ * [x] [`_mm256_mask_rol_epi64`]
+ * [x] [`_mm256_maskz_rol_epi64`]
+ * [x] [`_mm256_rol_epi64`]
+ * [x] [`_mm512_rolv_epi32`]
+ * [x] [`_mm512_mask_rolv_epi32`]
+ * [x] [`_mm512_maskz_rolv_epi32`]
+ * [x] [`_mm_mask_rolv_epi32`]
+ * [x] [`_mm_maskz_rolv_epi32`]
+ * [x] [`_mm_rolv_epi32`]
+ * [x] [`_mm256_mask_rolv_epi32`]
+ * [x] [`_mm256_maskz_rolv_epi32`]
+ * [x] [`_mm256_rolv_epi32`]
+ * [x] [`_mm512_rolv_epi64`]
+ * [x] [`_mm512_mask_rolv_epi64`]
+ * [x] [`_mm512_maskz_rolv_epi64`]
+ * [x] [`_mm_mask_rolv_epi64`]
+ * [x] [`_mm_maskz_rolv_epi64`]
+ * [x] [`_mm_rolv_epi64`]
+ * [x] [`_mm256_mask_rolv_epi64`]
+ * [x] [`_mm256_maskz_rolv_epi64`]
+ * [x] [`_mm256_rolv_epi64`]
+ * [x] [`_mm512_ror_epi32`]
+ * [x] [`_mm512_mask_ror_epi32`]
+ * [x] [`_mm512_maskz_ror_epi32`]
+ * [x] [`_mm_mask_ror_epi32`]
+ * [x] [`_mm_maskz_ror_epi32`]
+ * [x] [`_mm_ror_epi32`]
+ * [x] [`_mm256_mask_ror_epi32`]
+ * [x] [`_mm256_maskz_ror_epi32`]
+ * [x] [`_mm256_ror_epi32`]
+ * [x] [`_mm512_ror_epi64`]
+ * [x] [`_mm512_mask_ror_epi64`]
+ * [x] [`_mm512_maskz_ror_epi64`]
+ * [x] [`_mm_mask_ror_epi64`]
+ * [x] [`_mm_maskz_ror_epi64`]
+ * [x] [`_mm_ror_epi64`]
+ * [x] [`_mm256_mask_ror_epi64`]
+ * [x] [`_mm256_maskz_ror_epi64`]
+ * [x] [`_mm256_ror_epi64`]
+ * [x] [`_mm512_rorv_epi32`]
+ * [x] [`_mm512_mask_rorv_epi32`]
+ * [x] [`_mm512_maskz_rorv_epi32`]
+ * [x] [`_mm_mask_rorv_epi32`]
+ * [x] [`_mm_maskz_rorv_epi32`]
+ * [x] [`_mm_rorv_epi32`]
+ * [x] [`_mm256_mask_rorv_epi32`]
+ * [x] [`_mm256_maskz_rorv_epi32`]
+ * [x] [`_mm256_rorv_epi32`]
+ * [x] [`_mm512_rorv_epi64`]
+ * [x] [`_mm512_mask_rorv_epi64`]
+ * [x] [`_mm512_maskz_rorv_epi64`]
+ * [x] [`_mm_mask_rorv_epi64`]
+ * [x] [`_mm_maskz_rorv_epi64`]
+ * [x] [`_mm_rorv_epi64`]
+ * [x] [`_mm256_mask_rorv_epi64`]
+ * [x] [`_mm256_maskz_rorv_epi64`]
+ * [x] [`_mm256_rorv_epi64`]
+ * [x] [`_mm512_sll_epi32`]
+ * [x] [`_mm512_mask_sll_epi32`]
+ * [x] [`_mm512_maskz_sll_epi32`]
+ * [x] [`_mm_mask_sll_epi32`]
+ * [x] [`_mm_maskz_sll_epi32`]
+ * [x] [`_mm256_mask_sll_epi32`]
+ * [x] [`_mm256_maskz_sll_epi32`]
+ * [x] [`_mm512_sll_epi64`]
+ * [x] [`_mm512_mask_sll_epi64`]
+ * [x] [`_mm512_maskz_sll_epi64`]
+ * [x] [`_mm_mask_sll_epi64`]
+ * [x] [`_mm_maskz_sll_epi64`]
+ * [x] [`_mm256_mask_sll_epi64`]
+ * [x] [`_mm256_maskz_sll_epi64`]
+ * [x] [`_mm512_slli_epi32`]
+ * [x] [`_mm512_mask_slli_epi32`]
+ * [x] [`_mm512_maskz_slli_epi32`]
+ * [x] [`_mm_mask_slli_epi32`]
+ * [x] [`_mm_maskz_slli_epi32`]
+ * [x] [`_mm256_mask_slli_epi32`]
+ * [x] [`_mm256_maskz_slli_epi32`]
+ * [x] [`_mm512_slli_epi64`]
+ * [x] [`_mm512_mask_slli_epi64`]
+ * [x] [`_mm512_maskz_slli_epi64`]
+ * [x] [`_mm_mask_slli_epi64`]
+ * [x] [`_mm_maskz_slli_epi64`]
+ * [x] [`_mm256_mask_slli_epi64`]
+ * [x] [`_mm256_maskz_slli_epi64`]
+ * [x] [`_mm512_sllv_epi32`]
+ * [x] [`_mm512_mask_sllv_epi32`]
+ * [x] [`_mm512_maskz_sllv_epi32`]
+ * [x] [`_mm_mask_sllv_epi32`]
+ * [x] [`_mm_maskz_sllv_epi32`]
+ * [x] [`_mm256_mask_sllv_epi32`]
+ * [x] [`_mm256_maskz_sllv_epi32`]
+ * [x] [`_mm512_sllv_epi64`]
+ * [x] [`_mm512_mask_sllv_epi64`]
+ * [x] [`_mm512_maskz_sllv_epi64`]
+ * [x] [`_mm_mask_sllv_epi64`]
+ * [x] [`_mm_maskz_sllv_epi64`]
+ * [x] [`_mm256_mask_sllv_epi64`]
+ * [x] [`_mm256_maskz_sllv_epi64`]
+ * [x] [`_mm512_sra_epi32`]
+ * [x] [`_mm512_mask_sra_epi32`]
+ * [x] [`_mm512_maskz_sra_epi32`]
+ * [x] [`_mm_mask_sra_epi32`]
+ * [x] [`_mm_maskz_sra_epi32`]
+ * [x] [`_mm256_mask_sra_epi32`]
+ * [x] [`_mm256_maskz_sra_epi32`]
+ * [x] [`_mm512_sra_epi64`]
+ * [x] [`_mm512_mask_sra_epi64`]
+ * [x] [`_mm512_maskz_sra_epi64`]
+ * [x] [`_mm_mask_sra_epi64`]
+ * [x] [`_mm_maskz_sra_epi64`]
+ * [x] [`_mm_sra_epi64`]
+ * [x] [`_mm256_mask_sra_epi64`]
+ * [x] [`_mm256_maskz_sra_epi64`]
+ * [x] [`_mm256_sra_epi64`]
+ * [x] [`_mm512_srai_epi32`]
+ * [x] [`_mm512_mask_srai_epi32`]
+ * [x] [`_mm512_maskz_srai_epi32`]
+ * [x] [`_mm_mask_srai_epi32`]
+ * [x] [`_mm_maskz_srai_epi32`]
+ * [x] [`_mm256_mask_srai_epi32`]
+ * [x] [`_mm256_maskz_srai_epi32`]
+ * [x] [`_mm512_srai_epi64`]
+ * [x] [`_mm512_mask_srai_epi64`]
+ * [x] [`_mm512_maskz_srai_epi64`]
+ * [x] [`_mm_mask_srai_epi64`]
+ * [x] [`_mm_maskz_srai_epi64`]
+ * [x] [`_mm_srai_epi64`]
+ * [x] [`_mm256_mask_srai_epi64`]
+ * [x] [`_mm256_maskz_srai_epi64`]
+ * [x] [`_mm256_srai_epi64`]
+ * [x] [`_mm512_srav_epi32`]
+ * [x] [`_mm512_mask_srav_epi32`]
+ * [x] [`_mm512_maskz_srav_epi32`]
+ * [x] [`_mm_mask_srav_epi32`]
+ * [x] [`_mm_maskz_srav_epi32`]
+ * [x] [`_mm256_mask_srav_epi32`]
+ * [x] [`_mm256_maskz_srav_epi32`]
+ * [x] [`_mm512_srav_epi64`]
+ * [x] [`_mm512_mask_srav_epi64`]
+ * [x] [`_mm512_maskz_srav_epi64`]
+ * [x] [`_mm_mask_srav_epi64`]
+ * [x] [`_mm_maskz_srav_epi64`]
+ * [x] [`_mm_srav_epi64`]
+ * [x] [`_mm256_mask_srav_epi64`]
+ * [x] [`_mm256_maskz_srav_epi64`]
+ * [x] [`_mm256_srav_epi64`]
+ * [x] [`_mm512_srl_epi32`]
+ * [x] [`_mm512_mask_srl_epi32`]
+ * [x] [`_mm512_maskz_srl_epi32`]
+ * [x] [`_mm_mask_srl_epi32`]
+ * [x] [`_mm_maskz_srl_epi32`]
+ * [x] [`_mm256_mask_srl_epi32`]
+ * [x] [`_mm256_maskz_srl_epi32`]
+ * [x] [`_mm512_srl_epi64`]
+ * [x] [`_mm512_mask_srl_epi64`]
+ * [x] [`_mm512_maskz_srl_epi64`]
+ * [x] [`_mm_mask_srl_epi64`]
+ * [x] [`_mm_maskz_srl_epi64`]
+ * [x] [`_mm256_mask_srl_epi64`]
+ * [x] [`_mm256_maskz_srl_epi64`]
+ * [x] [`_mm512_srli_epi32`]
+ * [x] [`_mm512_mask_srli_epi32`]
+ * [x] [`_mm512_maskz_srli_epi32`]
+ * [x] [`_mm_mask_srli_epi32`]
+ * [x] [`_mm_maskz_srli_epi32`]
+ * [x] [`_mm256_mask_srli_epi32`]
+ * [x] [`_mm256_maskz_srli_epi32`]
+ * [x] [`_mm512_srli_epi64`]
+ * [x] [`_mm512_mask_srli_epi64`]
+ * [x] [`_mm512_maskz_srli_epi64`]
+ * [x] [`_mm_mask_srli_epi64`]
+ * [x] [`_mm_maskz_srli_epi64`]
+ * [x] [`_mm256_mask_srli_epi64`]
+ * [x] [`_mm256_maskz_srli_epi64`]
+ * [x] [`_mm512_srlv_epi32`]
+ * [x] [`_mm512_mask_srlv_epi32`]
+ * [x] [`_mm512_maskz_srlv_epi32`]
+ * [x] [`_mm_mask_srlv_epi32`]
+ * [x] [`_mm_maskz_srlv_epi32`]
+ * [x] [`_mm256_mask_srlv_epi32`]
+ * [x] [`_mm256_maskz_srlv_epi32`]
+ * [x] [`_mm512_srlv_epi64`]
+ * [x] [`_mm512_mask_srlv_epi64`]
+ * [x] [`_mm512_maskz_srlv_epi64`]
+ * [x] [`_mm_mask_srlv_epi64`]
+ * [x] [`_mm_maskz_srlv_epi64`]
+ * [x] [`_mm256_mask_srlv_epi64`]
+ * [x] [`_mm256_maskz_srlv_epi64`]
+ * [x] [`_mm512_mask_mov_epi32`]
+ * [x] [`_mm512_maskz_mov_epi32`]
+ * [x] [`_mm_mask_mov_epi32`]
+ * [x] [`_mm_maskz_mov_epi32`]
+ * [x] [`_mm256_mask_mov_epi32`]
+ * [x] [`_mm256_maskz_mov_epi32`]
+ * [x] [`_mm512_mask_mov_epi64`]
+ * [x] [`_mm512_maskz_mov_epi64`]
+ * [x] [`_mm_mask_mov_epi64`]
+ * [x] [`_mm_maskz_mov_epi64`]
+ * [x] [`_mm256_mask_mov_epi64`]
+ * [x] [`_mm256_maskz_mov_epi64`]
+ * [x] [`_mm512_mask_mov_ps`]
+ * [x] [`_mm512_maskz_mov_ps`]
+ * [x] [`_mm_mask_mov_ps`]
+ * [x] [`_mm_maskz_mov_ps`]
+ * [x] [`_mm256_mask_mov_ps`]
+ * [x] [`_mm256_maskz_mov_ps`]
+ * [x] [`_mm512_mask_mov_pd`]
+ * [x] [`_mm512_maskz_mov_pd`]
+ * [x] [`_mm_mask_mov_pd`]
+ * [x] [`_mm_maskz_mov_pd`]
+ * [x] [`_mm256_mask_mov_pd`]
+ * [x] [`_mm256_maskz_mov_pd`]
+ * [x] [`_mm512_movehdup_ps`]
+ * [x] [`_mm512_mask_movehdup_ps`]
+ * [x] [`_mm512_maskz_movehdup_ps`]
+ * [x] [`_mm_mask_movehdup_ps`]
+ * [x] [`_mm_maskz_movehdup_ps`]
+ * [x] [`_mm256_mask_movehdup_ps`]
+ * [x] [`_mm256_maskz_movehdup_ps`]
+ * [x] [`_mm512_moveldup_ps`]
+ * [x] [`_mm512_mask_moveldup_ps`]
+ * [x] [`_mm512_maskz_moveldup_ps`]
+ * [x] [`_mm_mask_moveldup_ps`]
+ * [x] [`_mm_maskz_moveldup_ps`]
+ * [x] [`_mm256_mask_moveldup_ps`]
+ * [x] [`_mm256_maskz_moveldup_ps`]
+ * [x] [`_mm512_movedup_pd`]
+ * [x] [`_mm512_mask_movedup_pd`]
+ * [x] [`_mm512_maskz_movedup_pd`]
+ * [x] [`_mm_mask_movedup_pd`]
+ * [x] [`_mm_maskz_movedup_pd`]
+ * [x] [`_mm256_mask_movedup_pd`]
+ * [x] [`_mm256_maskz_movedup_pd`]
+ * [x] [`_mm512_or_epi32`]
+ * [x] [`_mm512_mask_or_epi32`]
+ * [x] [`_mm512_maskz_or_epi32`]
+ * [x] [`_mm_mask_or_epi32`]
+ * [x] [`_mm_maskz_or_epi32`]
+ * [x] [`_mm_or_epi32`]
+ * [x] [`_mm256_mask_or_epi32`]
+ * [x] [`_mm256_maskz_or_epi32`]
+ * [x] [`_mm256_or_epi32`]
+ * [x] [`_mm512_or_epi64`]
+ * [x] [`_mm512_mask_or_epi64`]
+ * [x] [`_mm512_maskz_or_epi64`]
+ * [x] [`_mm_mask_or_epi64`]
+ * [x] [`_mm_maskz_or_epi64`]
+ * [x] [`_mm_or_epi64`]
+ * [x] [`_mm256_mask_or_epi64`]
+ * [x] [`_mm256_maskz_or_epi64`]
+ * [x] [`_mm256_or_epi64`]
+ * [x] [`_mm512_or_si512`]
+ * [x] [`_mm512_and_epi32`]
+ * [x] [`_mm512_mask_and_epi32`]
+ * [x] [`_mm512_maskz_and_epi32`]
+ * [x] [`_mm_mask_and_epi32`]
+ * [x] [`_mm_maskz_and_epi32`]
+ * [x] [`_mm256_mask_and_epi32`]
+ * [x] [`_mm256_maskz_and_epi32`]
+ * [x] [`_mm512_and_epi64`]
+ * [x] [`_mm512_mask_and_epi64`]
+ * [x] [`_mm512_maskz_and_epi64`]
+ * [x] [`_mm_mask_and_epi64`]
+ * [x] [`_mm_maskz_and_epi64`]
+ * [x] [`_mm256_mask_and_epi64`]
+ * [x] [`_mm256_maskz_and_epi64`]
+ * [x] [`_mm512_and_si512`]
+ * [x] [`_mm512_xor_epi32`]
+ * [x] [`_mm512_mask_xor_epi32`]
+ * [x] [`_mm512_maskz_xor_epi32`]
+ * [x] [`_mm_mask_xor_epi32`]
+ * [x] [`_mm_maskz_xor_epi32`]
+ * [x] [`_mm_xor_epi32`]
+ * [x] [`_mm256_mask_xor_epi32`]
+ * [x] [`_mm256_maskz_xor_epi32`]
+ * [x] [`_mm256_xor_epi32`]
+ * [x] [`_mm512_xor_epi64`]
+ * [x] [`_mm512_mask_xor_epi64`]
+ * [x] [`_mm512_maskz_xor_epi64`]
+ * [x] [`_mm_mask_xor_epi64`]
+ * [x] [`_mm_maskz_xor_epi64`]
+ * [x] [`_mm_xor_epi64`]
+ * [x] [`_mm256_mask_xor_epi64`]
+ * [x] [`_mm256_maskz_xor_epi64`]
+ * [x] [`_mm256_xor_epi64`]
+ * [x] [`_mm512_xor_si512`]
+ * [x] [`_mm512_andnot_epi32`]
+ * [x] [`_mm512_mask_andnot_epi32`]
+ * [x] [`_mm512_maskz_andnot_epi32`]
+ * [x] [`_mm_mask_andnot_epi32`]
+ * [x] [`_mm_maskz_andnot_epi32`]
+ * [x] [`_mm256_mask_andnot_epi32`]
+ * [x] [`_mm256_maskz_andnot_epi32`]
+ * [x] [`_mm512_andnot_epi64`]
+ * [x] [`_mm512_mask_andnot_epi64`]
+ * [x] [`_mm512_maskz_andnot_epi64`]
+ * [x] [`_mm_mask_andnot_epi64`]
+ * [x] [`_mm_maskz_andnot_epi64`]
+ * [x] [`_mm256_mask_andnot_epi64`]
+ * [x] [`_mm256_maskz_andnot_epi64`]
+ * [x] [`_mm512_andnot_si512`]
+ * [x] [`_mm512_unpackhi_epi32`]
+ * [x] [`_mm512_mask_unpackhi_epi32`]
+ * [x] [`_mm512_maskz_unpackhi_epi32`]
+ * [x] [`_mm_mask_unpackhi_epi32`]
+ * [x] [`_mm_maskz_unpackhi_epi32`]
+ * [x] [`_mm256_mask_unpackhi_epi32`]
+ * [x] [`_mm256_maskz_unpackhi_epi32`]
+ * [x] [`_mm512_unpackhi_epi64`]
+ * [x] [`_mm512_mask_unpackhi_epi64`]
+ * [x] [`_mm512_maskz_unpackhi_epi64`]
+ * [x] [`_mm_mask_unpackhi_epi64`]
+ * [x] [`_mm_maskz_unpackhi_epi64`]
+ * [x] [`_mm256_mask_unpackhi_epi64`]
+ * [x] [`_mm256_maskz_unpackhi_epi64`]
+ * [x] [`_mm512_unpackhi_ps`]
+ * [x] [`_mm512_mask_unpackhi_ps`]
+ * [x] [`_mm512_maskz_unpackhi_ps`]
+ * [x] [`_mm_mask_unpackhi_ps`]
+ * [x] [`_mm_maskz_unpackhi_ps`]
+ * [x] [`_mm256_mask_unpackhi_ps`]
+ * [x] [`_mm256_maskz_unpackhi_ps`]
+ * [x] [`_mm512_unpackhi_pd`]
+ * [x] [`_mm512_mask_unpackhi_pd`]
+ * [x] [`_mm512_maskz_unpackhi_pd`]
+ * [x] [`_mm_mask_unpackhi_pd`]
+ * [x] [`_mm_maskz_unpackhi_pd`]
+ * [x] [`_mm256_mask_unpackhi_pd`]
+ * [x] [`_mm256_maskz_unpackhi_pd`]
+ * [x] [`_mm512_unpacklo_epi32`]
+ * [x] [`_mm512_mask_unpacklo_epi32`]
+ * [x] [`_mm512_maskz_unpacklo_epi32`]
+ * [x] [`_mm_mask_unpacklo_epi32`]
+ * [x] [`_mm_maskz_unpacklo_epi32`]
+ * [x] [`_mm256_mask_unpacklo_epi32`]
+ * [x] [`_mm256_maskz_unpacklo_epi32`]
+ * [x] [`_mm512_unpacklo_epi64`]
+ * [x] [`_mm512_mask_unpacklo_epi64`]
+ * [x] [`_mm512_maskz_unpacklo_epi64`]
+ * [x] [`_mm_mask_unpacklo_epi64`]
+ * [x] [`_mm_maskz_unpacklo_epi64`]
+ * [x] [`_mm256_mask_unpacklo_epi64`]
+ * [x] [`_mm256_maskz_unpacklo_epi64`]
+ * [x] [`_mm512_unpacklo_ps`]
+ * [x] [`_mm512_mask_unpacklo_ps`]
+ * [x] [`_mm512_maskz_unpacklo_ps`]
+ * [x] [`_mm_mask_unpacklo_ps`]
+ * [x] [`_mm_maskz_unpacklo_ps`]
+ * [x] [`_mm256_mask_unpacklo_ps`]
+ * [x] [`_mm256_maskz_unpacklo_ps`]
+ * [x] [`_mm512_unpacklo_pd`]
+ * [x] [`_mm512_mask_unpacklo_pd`]
+ * [x] [`_mm512_maskz_unpacklo_pd`]
+ * [x] [`_mm_mask_unpacklo_pd`]
+ * [x] [`_mm_maskz_unpacklo_pd`]
+ * [x] [`_mm256_mask_unpacklo_pd`]
+ * [x] [`_mm256_maskz_unpacklo_pd`]
+ * [x] [`_mm512_mask_blend_epi32`]
+ * [x] [`_mm_mask_blend_epi32`]
+ * [x] [`_mm256_mask_blend_epi32`]
+ * [x] [`_mm512_mask_blend_epi64`]
+ * [x] [`_mm_mask_blend_epi64`]
+ * [x] [`_mm256_mask_blend_epi64`]
+ * [x] [`_mm512_mask_blend_ps`]
+ * [x] [`_mm_mask_blend_ps`]
+ * [x] [`_mm256_mask_blend_ps`]
+ * [x] [`_mm512_mask_blend_pd`]
+ * [x] [`_mm_mask_blend_pd`]
+ * [x] [`_mm256_mask_blend_pd`]
+ * [x] [`_mm512_broadcast_f32x4`]
+ * [x] [`_mm512_mask_broadcast_f32x4`]
+ * [x] [`_mm512_maskz_broadcast_f32x4`]
+ * [x] [`_mm256_broadcast_f32x4`]
+ * [x] [`_mm256_mask_broadcast_f32x4`]
+ * [x] [`_mm256_maskz_broadcast_f32x4`]
+ * [x] [`_mm512_broadcast_f64x4`]
+ * [x] [`_mm512_mask_broadcast_f64x4`]
+ * [x] [`_mm512_maskz_broadcast_f64x4`]
+ * [x] [`_mm512_broadcast_i32x4`]
+ * [x] [`_mm512_mask_broadcast_i32x4`]
+ * [x] [`_mm512_maskz_broadcast_i32x4`]
+ * [x] [`_mm256_broadcast_i32x4`]
+ * [x] [`_mm256_mask_broadcast_i32x4`]
+ * [x] [`_mm256_maskz_broadcast_i32x4`]
+ * [x] [`_mm512_broadcast_i64x4`]
+ * [x] [`_mm512_mask_broadcast_i64x4`]
+ * [x] [`_mm512_maskz_broadcast_i64x4`]
+ * [x] [`_mm512_broadcastd_epi32`]
+ * [x] [`_mm512_mask_broadcastd_epi32`]
+ * [x] [`_mm512_maskz_broadcastd_epi32`]
+ * [x] [`_mm_mask_broadcastd_epi32`]
+ * [x] [`_mm_maskz_broadcastd_epi32`]
+ * [x] [`_mm256_mask_broadcastd_epi32`]
+ * [x] [`_mm256_maskz_broadcastd_epi32`]
+ * [x] [`_mm512_broadcastq_epi64`]
+ * [x] [`_mm512_mask_broadcastq_epi64`]
+ * [x] [`_mm512_maskz_broadcastq_epi64`]
+ * [x] [`_mm_mask_broadcastq_epi64`]
+ * [x] [`_mm_maskz_broadcastq_epi64`]
+ * [x] [`_mm256_mask_broadcastq_epi64`]
+ * [x] [`_mm256_maskz_broadcastq_epi64`]
+ * [x] [`_mm512_broadcastss_ps`]
+ * [x] [`_mm512_mask_broadcastss_ps`]
+ * [x] [`_mm512_maskz_broadcastss_ps`]
+ * [x] [`_mm_mask_broadcastss_ps`]
+ * [x] [`_mm_maskz_broadcastss_ps`]
+ * [x] [`_mm256_mask_broadcastss_ps`]
+ * [x] [`_mm256_maskz_broadcastss_ps`]
+ * [x] [`_mm512_broadcastsd_pd`]
+ * [x] [`_mm512_mask_broadcastsd_pd`]
+ * [x] [`_mm512_maskz_broadcastsd_pd`]
+ * [x] [`_mm256_mask_broadcastsd_pd`]
+ * [x] [`_mm256_maskz_broadcastsd_pd`]
+ * [x] [`_mm512_shuffle_epi32`]
+ * [x] [`_mm512_mask_shuffle_epi32`]
+ * [x] [`_mm512_maskz_shuffle_epi32`]
+ * [x] [`_mm_mask_shuffle_epi32`]
+ * [x] [`_mm_maskz_shuffle_epi32`]
+ * [x] [`_mm256_mask_shuffle_epi32`]
+ * [x] [`_mm256_maskz_shuffle_epi32`]
+ * [x] [`_mm512_shuffle_ps`]
+ * [x] [`_mm512_mask_shuffle_ps`]
+ * [x] [`_mm512_maskz_shuffle_ps`]
+ * [x] [`_mm_mask_shuffle_ps`]
+ * [x] [`_mm_maskz_shuffle_ps`]
+ * [x] [`_mm256_mask_shuffle_ps`]
+ * [x] [`_mm256_maskz_shuffle_ps`]
+ * [x] [`_mm512_shuffle_pd`]
+ * [x] [`_mm512_mask_shuffle_pd`]
+ * [x] [`_mm512_maskz_shuffle_pd`]
+ * [x] [`_mm_mask_shuffle_pd`]
+ * [x] [`_mm_maskz_shuffle_pd`]
+ * [x] [`_mm256_mask_shuffle_pd`]
+ * [x] [`_mm256_maskz_shuffle_pd`]
+ * [x] [`_mm512_shuffle_i32x4`]
+ * [x] [`_mm512_mask_shuffle_i32x4`]
+ * [x] [`_mm512_maskz_shuffle_i32x4`]
+ * [x] [`_mm256_mask_shuffle_i32x4`]
+ * [x] [`_mm256_maskz_shuffle_i32x4`]
+ * [x] [`_mm256_shuffle_i32x4`]
+ * [x] [`_mm512_shuffle_i64x2`]
+ * [x] [`_mm512_mask_shuffle_i64x2`]
+ * [x] [`_mm512_maskz_shuffle_i64x2`]
+ * [x] [`_mm256_mask_shuffle_i64x2`]
+ * [x] [`_mm256_maskz_shuffle_i64x2`]
+ * [x] [`_mm256_shuffle_i64x2`]
+ * [x] [`_mm512_shuffle_f32x4`]
+ * [x] [`_mm512_mask_shuffle_f32x4`]
+ * [x] [`_mm512_maskz_shuffle_f32x4`]
+ * [x] [`_mm256_mask_shuffle_f32x4`]
+ * [x] [`_mm256_maskz_shuffle_f32x4`]
+ * [x] [`_mm256_shuffle_f32x4`]
+ * [x] [`_mm512_shuffle_f64x2`]
+ * [x] [`_mm512_mask_shuffle_f64x2`]
+ * [x] [`_mm512_maskz_shuffle_f64x2`]
+ * [x] [`_mm256_mask_shuffle_f64x2`]
+ * [x] [`_mm256_maskz_shuffle_f64x2`]
+ * [x] [`_mm256_shuffle_f64x2`]
+ * [x] [`_mm512_alignr_epi32`]
+ * [x] [`_mm512_mask_alignr_epi32`]
+ * [x] [`_mm512_maskz_alignr_epi32`]
+ * [x] [`_mm_alignr_epi32`]
+ * [x] [`_mm_mask_alignr_epi32`]
+ * [x] [`_mm_maskz_alignr_epi32`]
+ * [x] [`_mm256_alignr_epi32`]
+ * [x] [`_mm256_mask_alignr_epi32`]
+ * [x] [`_mm256_maskz_alignr_epi32`]
+ * [x] [`_mm512_alignr_epi64`]
+ * [x] [`_mm512_mask_alignr_epi64`]
+ * [x] [`_mm512_maskz_alignr_epi64`]
+ * [x] [`_mm_alignr_epi64`]
+ * [x] [`_mm_mask_alignr_epi64`]
+ * [x] [`_mm_maskz_alignr_epi64`]
+ * [x] [`_mm256_alignr_epi64`]
+ * [x] [`_mm256_mask_alignr_epi64`]
+ * [x] [`_mm256_maskz_alignr_epi64`]
+ * [x] [`_mm512_permute_ps`]
+ * [x] [`_mm512_mask_permute_ps`]
+ * [x] [`_mm512_maskz_permute_ps`]
+ * [x] [`_mm_mask_permute_ps`]
+ * [x] [`_mm_maskz_permute_ps`]
+ * [x] [`_mm256_mask_permute_ps`]
+ * [x] [`_mm256_maskz_permute_ps`]
+ * [x] [`_mm512_permute_pd`]
+ * [x] [`_mm512_mask_permute_pd`]
+ * [x] [`_mm512_maskz_permute_pd`]
+ * [x] [`_mm_mask_permute_pd`]
+ * [x] [`_mm_maskz_permute_pd`]
+ * [x] [`_mm256_mask_permute_pd`]
+ * [x] [`_mm256_maskz_permute_pd`]
+ * [x] [`_mm512_permutevar_epi32`]
+ * [x] [`_mm512_mask_permutevar_epi32`]
+ * [x] [`_mm512_permutevar_ps`]
+ * [x] [`_mm512_mask_permutevar_ps`]
+ * [x] [`_mm512_maskz_permutevar_ps`]
+ * [x] [`_mm_mask_permutevar_ps`]
+ * [x] [`_mm_maskz_permutevar_ps`]
+ * [x] [`_mm256_mask_permutevar_ps`]
+ * [x] [`_mm256_maskz_permutevar_ps`]
+ * [x] [`_mm512_permutevar_pd`]
+ * [x] [`_mm512_mask_permutevar_pd`]
+ * [x] [`_mm512_maskz_permutevar_pd`]
+ * [x] [`_mm_mask_permutevar_pd`]
+ * [x] [`_mm_maskz_permutevar_pd`]
+ * [x] [`_mm256_mask_permutevar_pd`]
+ * [x] [`_mm256_maskz_permutevar_pd`]
+ * [x] [`_mm512_permutex2var_epi32`]
+ * [x] [`_mm512_mask_permutex2var_epi32`]
+ * [x] [`_mm512_maskz_permutex2var_epi32`]
+ * [x] [`_mm512_mask2_permutex2var_epi32`]
+ * [x] [`_mm_mask_permutex2var_epi32`]
+ * [x] [`_mm_mask2_permutex2var_epi32`]
+ * [x] [`_mm_maskz_permutex2var_epi32`]
+ * [x] [`_mm_permutex2var_epi32`]
+ * [x] [`_mm256_mask_permutex2var_epi32`]
+ * [x] [`_mm256_mask2_permutex2var_epi32`]
+ * [x] [`_mm256_maskz_permutex2var_epi32`]
+ * [x] [`_mm256_permutex2var_epi32`]
+ * [x] [`_mm512_permutex2var_epi64`]
+ * [x] [`_mm512_mask_permutex2var_epi64`]
+ * [x] [`_mm512_maskz_permutex2var_epi64`]
+ * [x] [`_mm512_mask2_permutex2var_epi64`]
+ * [x] [`_mm_mask_permutex2var_epi64`]
+ * [x] [`_mm_mask2_permutex2var_epi64`]
+ * [x] [`_mm_maskz_permutex2var_epi64`]
+ * [x] [`_mm_permutex2var_epi64`]
+ * [x] [`_mm256_mask_permutex2var_epi64`]
+ * [x] [`_mm256_mask2_permutex2var_epi64`]
+ * [x] [`_mm256_maskz_permutex2var_epi64`]
+ * [x] [`_mm256_permutex2var_epi64`]
+ * [x] [`_mm512_permutex2var_ps`]
+ * [x] [`_mm512_mask_permutex2var_ps`]
+ * [x] [`_mm512_maskz_permutex2var_ps`]
+ * [x] [`_mm512_mask2_permutex2var_ps`]
+ * [x] [`_mm_mask_permutex2var_ps`]
+ * [x] [`_mm_mask2_permutex2var_ps`]
+ * [x] [`_mm_maskz_permutex2var_ps`]
+ * [x] [`_mm_permutex2var_ps`]
+ * [x] [`_mm256_mask_permutex2var_ps`]
+ * [x] [`_mm256_mask2_permutex2var_ps`]
+ * [x] [`_mm256_maskz_permutex2var_ps`]
+ * [x] [`_mm256_permutex2var_ps`]
+ * [x] [`_mm512_permutex2var_pd`]
+ * [x] [`_mm512_mask_permutex2var_pd`]
+ * [x] [`_mm512_maskz_permutex2var_pd`]
+ * [x] [`_mm512_mask2_permutex2var_pd`]
+ * [x] [`_mm_mask_permutex2var_pd`]
+ * [x] [`_mm_mask2_permutex2var_pd`]
+ * [x] [`_mm_maskz_permutex2var_pd`]
+ * [x] [`_mm_permutex2var_pd`]
+ * [x] [`_mm256_mask_permutex2var_pd`]
+ * [x] [`_mm256_mask2_permutex2var_pd`]
+ * [x] [`_mm256_maskz_permutex2var_pd`]
+ * [x] [`_mm256_permutex2var_pd`]
+ * [x] [`_mm512_permutex_epi64`]
+ * [x] [`_mm512_mask_permutex_epi64`]
+ * [x] [`_mm512_maskz_permutex_epi64`]
+ * [x] [`_mm256_mask_permutex_epi64`]
+ * [x] [`_mm256_maskz_permutex_epi64`]
+ * [x] [`_mm256_permutex_epi64`]
+ * [x] [`_mm512_permutex_pd`]
+ * [x] [`_mm512_mask_permutex_pd`]
+ * [x] [`_mm512_maskz_permutex_pd`]
+ * [x] [`_mm256_mask_permutex_pd`]
+ * [x] [`_mm256_maskz_permutex_pd`]
+ * [x] [`_mm256_permutex_pd`]
+ * [x] [`_mm512_permutexvar_epi32`]
+ * [x] [`_mm512_mask_permutexvar_epi32`]
+ * [x] [`_mm512_maskz_permutexvar_epi32`]
+ * [x] [`_mm256_mask_permutexvar_epi32`]
+ * [x] [`_mm256_maskz_permutexvar_epi32`]
+ * [x] [`_mm256_permutexvar_epi32`]
+ * [x] [`_mm512_permutexvar_epi64`]
+ * [x] [`_mm512_mask_permutexvar_epi64`]
+ * [x] [`_mm512_maskz_permutexvar_epi64`]
+ * [x] [`_mm256_mask_permutexvar_epi64`]
+ * [x] [`_mm256_maskz_permutexvar_epi64`]
+ * [x] [`_mm256_permutexvar_epi64`]
+ * [x] [`_mm512_permutexvar_ps`]
+ * [x] [`_mm512_mask_permutexvar_ps`]
+ * [x] [`_mm512_maskz_permutexvar_ps`]
+ * [x] [`_mm256_mask_permutexvar_ps`]
+ * [x] [`_mm256_maskz_permutexvar_ps`]
+ * [x] [`_mm256_permutexvar_ps`]
+ * [x] [`_mm512_permutexvar_pd`]
+ * [x] [`_mm512_mask_permutexvar_pd`]
+ * [x] [`_mm512_maskz_permutexvar_pd`]
+ * [x] [`_mm256_mask_permutexvar_pd`]
+ * [x] [`_mm256_maskz_permutexvar_pd`]
+ * [x] [`_mm256_permutexvar_pd`]
+ * [x] [`_mm512_i32gather_epi32`]
+ * [x] [`_mm512_mask_i32gather_epi32`]
+ * [_] [`_mm_mmask_i32gather_epi32`] //need i1
+ * [_] [`_mm256_mmask_i32gather_epi32`] //need i1
+ * [x] [`_mm512_i32gather_epi64`]
+ * [x] [`_mm512_mask_i32gather_epi64`]
+ * [_] [`_mm_mmask_i32gather_epi64`] //need i1
+ * [_] [`_mm256_mmask_i32gather_epi64`] //need i1
+ * [x] [`_mm512_i32gather_ps`]
+ * [x] [`_mm512_mask_i32gather_ps`]
+ * [_] [`_mm_mmask_i32gather_ps`] //need i1
+ * [_] [`_mm256_mmask_i32gather_ps`] //need i1
+ * [x] [`_mm512_i32gather_pd`]
+ * [x] [`_mm512_mask_i32gather_pd`]
+ * [_] [`_mm_mmask_i32gather_pd`] //need i1
+ * [_] [`_mm256_mmask_i32gather_pd`] //need i1
+ * [x] [`_mm512_i64gather_epi32`]
+ * [x] [`_mm512_mask_i64gather_epi32`]
+ * [_] [`_mm_mmask_i64gather_epi32`] //need i1
+ * [_] [`_mm256_mmask_i64gather_epi32`] //need i1
+ * [x] [`_mm512_i64gather_epi64`]
+ * [x] [`_mm512_mask_i64gather_epi64`]
+ * [_] [`_mm_mmask_i64gather_epi64`] //need i1
+ * [_] [`_mm256_mmask_i64gather_epi64`] //need i1
+ * [x] [`_mm512_i64gather_ps`]
+ * [x] [`_mm512_mask_i64gather_ps`]
+ * [_] [`_mm_mmask_i64gather_ps`] //need i1
+ * [_] [`_mm256_mmask_i64gather_ps`] //need i1
+ * [x] [`_mm512_i64gather_pd`]
+ * [x] [`_mm512_mask_i64gather_pd`]
+ * [_] [`_mm_mmask_i64gather_pd`] //need i1
+ * [_] [`_mm256_mmask_i64gather_pd`] //need i1
+ * [ ] [`_mm512_i32extgather_epi32`] //not in llvm
+ * [ ] [`_mm512_mask_i32extgather_epi32`] //not in llvm
+ * [ ] [`_mm512_i32extgather_ps`] // not in llvm
+ * [ ] [`_mm512_mask_i32extgather_ps`] //not in llvm
+ * [ ] [`_mm512_i32loextgather_epi64`] //not in llvm
+ * [ ] [`_mm512_mask_i32loextgather_epi64`] //not in llvm
+ * [ ] [`_mm512_i32loextgather_pd`] //not in llvm
+ * [ ] [`_mm512_mask_i32loextgather_pd`] //not in llvm
+ * [ ] [`_mm512_i32logather_epi64`] //not in llvm
+ * [ ] [`_mm512_mask_i32logather_epi64`] //not in llvm
+ * [ ] [`_mm512_i32logather_pd`] //not in llvm
+ * [ ] [`_mm512_mask_i32logather_pd`] //not in llvm
+ * [x] [`_mm512_i32scatter_epi32`]
+ * [x] [`_mm512_mask_i32scatter_epi32`]
+ * [_] [`_mm_i32scatter_epi32`] //need i1
+ * [_] [`_mm_mask_i32scatter_epi32`] // need i1
+ * [_] [`_mm256_i32scatter_epi32`] //need i1
+ * [_] [`_mm256_mask_i32scatter_epi32`] //need i1
+ * [x] [`_mm512_i32scatter_epi64`]
+ * [x] [`_mm512_mask_i32scatter_epi64`]
+ * [_] [`_mm_i32scatter_epi64`] //need i1
+ * [_] [`_mm_mask_i32scatter_epi64`] //need i1
+ * [_] [`_mm256_i32scatter_epi64`] //need i1
+ * [_] [`_mm256_mask_i32scatter_epi64`] //need i1
+ * [x] [`_mm512_i32scatter_ps`]
+ * [x] [`_mm512_mask_i32scatter_ps`]
+ * [_] [`_mm_i32scatter_ps`] //need i1
+ * [_] [`_mm_mask_i32scatter_ps`] //need i1
+ * [_] [`_mm256_i32scatter_ps`] //need i1
+ * [_] [`_mm256_mask_i32scatter_ps`] //need i1
+ * [x] [`_mm512_i32scatter_pd`]
+ * [x] [`_mm512_mask_i32scatter_pd`]
+ * [_] [`_mm_i32scatter_pd`] //need i1
+ * [_] [`_mm_mask_i32scatter_pd`] //need i1
+ * [_] [`_mm256_i32scatter_pd`] //need i1
+ * [_] [`_mm256_mask_i32scatter_pd`] //need i1
+ * [x] [`_mm512_i64scatter_epi32`]
+ * [x] [`_mm512_mask_i64scatter_epi32`]
+ * [_] [`_mm_i64scatter_epi32`] //need i1
+ * [_] [`_mm_mask_i64scatter_epi32`] //need i1
+ * [_] [`_mm256_i64scatter_epi32`] //need i1
+ * [_] [`_mm256_mask_i64scatter_epi32`] //need i1
+ * [x] [`_mm512_mask_i64scatter_epi64`]
+ * [x] [`_mm512_i64scatter_epi64`]
+ * [_] [`_mm_i64scatter_epi64`] //need i1
+ * [_] [`_mm_mask_i64scatter_epi64`] //need i1
+ * [_] [`_mm256_i64scatter_epi64`] //need i1
+ * [_] [`_mm256_mask_i64scatter_epi64`] //need i1
+ * [x] [`_mm512_i64scatter_ps`]
+ * [x] [`_mm512_mask_i64scatter_ps`]
+ * [_] [`_mm_i64scatter_ps`] //need i1
+ * [_] [`_mm_mask_i64scatter_ps`] //need i1
+ * [_] [`_mm256_i64scatter_ps`] //need i1
+ * [_] [`_mm256_mask_i64scatter_ps`] //need i1
+ * [x] [`_mm512_i64scatter_pd`]
+ * [x] [`_mm512_mask_i64scatter_pd`]
+ * [_] [`_mm_i64scatter_pd`] //need i1
+ * [_] [`_mm_mask_i64scatter_pd`] //need i1
+ * [_] [`_mm256_i64scatter_pd`] //need i1
+ * [_] [`_mm256_mask_i64scatter_pd`] //need i1
+ * [ ] [`_mm512_i32extscatter_epi32`] //not in llvm
+ * [ ] [`_mm512_mask_i32extscatter_epi32`] //not in llvm
+ * [ ] [`_mm512_i32extscatter_ps`] //not in llvm
+ * [ ] [`_mm512_mask_i32extscatter_ps`] //not in llvm
+ * [ ] [`_mm512_i32loextscatter_epi64`] //not in llvm
+ * [ ] [`_mm512_mask_i32loextscatter_epi64`] //not in llvm
+ * [ ] [`_mm512_i32loextscatter_pd`] //not in llvm
+ * [ ] [`_mm512_mask_i32loextscatter_pd`] //not in llvm
+ * [ ] [`_mm512_i32loscatter_epi64`] //not in llvm
+ * [ ] [`_mm512_mask_i32loscatter_epi64`] //not in llvm
+ * [ ] [`_mm512_i32loscatter_pd`] //not in llvm
+ * [ ] [`_mm512_mask_i32loscatter_pd`] //not in llvm
+ * [x] [`_mm512_inserti32x4`]
+ * [x] [`_mm512_mask_inserti32x4`]
+ * [x] [`_mm512_maskz_inserti32x4`]
+ * [x] [`_mm256_inserti32x4`]
+ * [x] [`_mm256_mask_inserti32x4`]
+ * [x] [`_mm256_maskz_inserti32x4`]
+ * [x] [`_mm512_inserti64x4`]
+ * [x] [`_mm512_mask_inserti64x4`]
+ * [x] [`_mm512_maskz_inserti64x4`]
+ * [x] [`_mm512_insertf32x4`]
+ * [x] [`_mm512_mask_insertf32x4`]
+ * [x] [`_mm512_maskz_insertf32x4`]
+ * [x] [`_mm256_insertf32x4`]
+ * [x] [`_mm256_mask_insertf32x4`]
+ * [x] [`_mm256_maskz_insertf32x4`]
+ * [x] [`_mm512_insertf64x4`]
+ * [x] [`_mm512_mask_insertf64x4`]
+ * [x] [`_mm512_maskz_insertf64x4`]
+ * [x] [`_mm512_extracti32x4_epi32`]
+ * [x] [`_mm512_mask_extracti32x4_epi32`]
+ * [x] [`_mm512_maskz_extracti32x4_epi32`]
+ * [x] [`_mm256_extracti32x4_epi32`]
+ * [x] [`_mm256_mask_extracti32x4_epi32`]
+ * [x] [`_mm256_maskz_extracti32x4_epi32`]
+ * [x] [`_mm512_extracti64x4_epi64`]
+ * [x] [`_mm512_mask_extracti64x4_epi64`]
+ * [x] [`_mm512_maskz_extracti64x4_epi64`]
+ * [x] [`_mm512_extractf32x4_ps`]
+ * [x] [`_mm512_mask_extractf32x4_ps`]
+ * [x] [`_mm512_maskz_extractf32x4_ps`]
+ * [x] [`_mm256_extractf32x4_ps`]
+ * [x] [`_mm256_mask_extractf32x4_ps`]
+ * [x] [`_mm256_maskz_extractf32x4_ps`]
+ * [x] [`_mm512_extractf64x4_pd`]
+ * [x] [`_mm512_mask_extractf64x4_pd`]
+ * [x] [`_mm512_maskz_extractf64x4_pd`]
+ * [x] [`_mm512_maskz_compress_epi32`]
+ * [x] [`_mm512_mask_compress_epi32`]
+ * [x] [`_mm_mask_compress_epi32`]
+ * [x] [`_mm_maskz_compress_epi32`]
+ * [x] [`_mm256_mask_compress_epi32`]
+ * [x] [`_mm256_maskz_compress_epi32`]
+ * [x] [`_mm512_mask_compress_epi64`]
+ * [x] [`_mm512_maskz_compress_epi64`]
+ * [x] [`_mm_mask_compress_epi64`]
+ * [x] [`_mm_maskz_compress_epi64`]
+ * [x] [`_mm256_mask_compress_epi64`]
+ * [x] [`_mm256_maskz_compress_epi64`]
+ * [x] [`_mm512_mask_compress_ps`]
+ * [x] [`_mm512_maskz_compress_ps`]
+ * [x] [`_mm_mask_compress_ps`]
+ * [x] [`_mm_maskz_compress_ps`]
+ * [x] [`_mm256_mask_compress_ps`]
+ * [x] [`_mm256_maskz_compress_ps`]
+ * [x] [`_mm512_mask_compress_pd`]
+ * [x] [`_mm512_maskz_compress_pd`]
+ * [x] [`_mm_mask_compress_pd`]
+ * [x] [`_mm_maskz_compress_pd`]
+ * [x] [`_mm256_mask_compress_pd`]
+ * [x] [`_mm256_maskz_compress_pd`]
+ * [x] [`_mm512_mask_compressstoreu_epi32`] //need i1
+ * [x] [`_mm_mask_compressstoreu_epi32`] //need i1
+ * [x] [`_mm256_mask_compressstoreu_epi32`] //need i1
+ * [x] [`_mm512_mask_compressstoreu_epi64`] //need i1
+ * [x] [`_mm_mask_compressstoreu_epi64`] //need i1
+ * [x] [`_mm256_mask_compressstoreu_epi64`] //need i1
+ * [x] [`_mm512_mask_compressstoreu_ps`] //need i1
+ * [x] [`_mm_mask_compressstoreu_ps`] //need i1
+ * [x] [`_mm256_mask_compressstoreu_ps`] //need i1
+ * [x] [`_mm512_mask_compressstoreu_pd`] //need i1
+ * [x] [`_mm_mask_compressstoreu_pd`] //need i1
+ * [x] [`_mm256_mask_compressstoreu_pd`] //need i1
+ * [x] [`_mm512_mask_expand_epi32`]
+ * [x] [`_mm512_maskz_expand_epi32`]
+ * [x] [`_mm_mask_expand_epi32`]
+ * [x] [`_mm_maskz_expand_epi32`]
+ * [x] [`_mm256_mask_expand_epi32`]
+ * [x] [`_mm256_maskz_expand_epi32`]
+ * [x] [`_mm512_mask_expand_epi64`]
+ * [x] [`_mm512_maskz_expand_epi64`]
+ * [x] [`_mm_mask_expand_epi64`]
+ * [x] [`_mm_maskz_expand_epi64`]
+ * [x] [`_mm256_mask_expand_epi64`]
+ * [x] [`_mm256_maskz_expand_epi64`]
+ * [x] [`_mm512_mask_expand_ps`]
+ * [x] [`_mm512_maskz_expand_ps`]
+ * [x] [`_mm_mask_expand_ps`]
+ * [x] [`_mm_maskz_expand_ps`]
+ * [x] [`_mm256_mask_expand_ps`]
+ * [x] [`_mm256_maskz_expand_ps`]
+ * [x] [`_mm512_mask_expand_pd`]
+ * [x] [`_mm512_maskz_expand_pd`]
+ * [x] [`_mm_mask_expand_pd`]
+ * [x] [`_mm_maskz_expand_pd`]
+ * [x] [`_mm256_mask_expand_pd`]
+ * [x] [`_mm256_maskz_expand_pd`]
+ * [x] [`_mm512_mask_expandloadu_epi32`] //need i1
+ * [x] [`_mm512_maskz_expandloadu_epi32`] //need i1
+ * [x] [`_mm_mask_expandloadu_epi32`] //need i1
+ * [x] [`_mm_maskz_expandloadu_epi32`] //need i1
+ * [x] [`_mm256_mask_expandloadu_epi32`] //need i1
+ * [x] [`_mm256_maskz_expandloadu_epi32`] //need i1
+ * [x] [`_mm512_mask_expandloadu_epi64`] //need i1
+ * [x] [`_mm512_maskz_expandloadu_epi64`] //need i1
+ * [x] [`_mm_mask_expandloadu_epi64`] //need i1
+ * [x] [`_mm_maskz_expandloadu_epi64`] //need i1
+ * [x] [`_mm256_mask_expandloadu_epi64`] //need i1
+ * [x] [`_mm256_maskz_expandloadu_epi64`] //need i1
+ * [x] [`_mm512_mask_expandloadu_ps`] //need i1
+ * [x] [`_mm512_maskz_expandloadu_ps`] //need i1
+ * [x] [`_mm_mask_expandloadu_ps`] //need i1
+ * [x] [`_mm_maskz_expandloadu_ps`] //need i1
+ * [x] [`_mm256_mask_expandloadu_ps`] //need i1
+ * [x] [`_mm256_maskz_expandloadu_ps`] //need i1
+ * [x] [`_mm512_mask_expandloadu_pd`] //need i1
+ * [x] [`_mm512_maskz_expandloadu_pd`] //need i1
+ * [x] [`_mm_mask_expandloadu_pd`] //need i1
+ * [x] [`_mm_maskz_expandloadu_pd`] //need i1
+ * [x] [`_mm256_mask_expandloadu_pd`] //need i1
+ * [x] [`_mm256_maskz_expandloadu_pd`] //need i1
+ * [x] [`_mm512_zextpd128_pd512`]
+ * [x] [`_mm512_zextpd256_pd512`]
+ * [x] [`_mm512_zextps128_ps512`]
+ * [x] [`_mm512_zextps256_ps512`]
+ * [x] [`_mm512_zextsi128_si512`]
+ * [x] [`_mm512_zextsi256_si512`]
+ * [x] [`_mm512_undefined_epi32`]
+ * [x] [`_mm512_undefined_pd`]
+ * [x] [`_mm512_undefined_ps`]
+ * [x] [`_mm512_undefined`]
+ * [ ] [`_mm512_svml_round_pd`] //not in llvm
+ * [x] [`_mm512_ternarylogic_epi32`]
+ * [x] [`_mm512_mask_ternarylogic_epi32`]
+ * [x] [`_mm512_maskz_ternarylogic_epi32`]
+ * [x] [`_mm_mask_ternarylogic_epi32`]
+ * [x] [`_mm_maskz_ternarylogic_epi32`]
+ * [x] [`_mm_ternarylogic_epi32`]
+ * [x] [`_mm256_mask_ternarylogic_epi32`]
+ * [x] [`_mm256_maskz_ternarylogic_epi32`]
+ * [x] [`_mm256_ternarylogic_epi32`]
+ * [x] [`_mm512_ternarylogic_epi64`]
+ * [x] [`_mm512_mask_ternarylogic_epi64`]
+ * [x] [`_mm512_maskz_ternarylogic_epi64`]
+ * [x] [`_mm_mask_ternarylogic_epi64`]
+ * [x] [`_mm_maskz_ternarylogic_epi64`]
+ * [x] [`_mm_ternarylogic_epi64`]
+ * [x] [`_mm256_mask_ternarylogic_epi64`]
+ * [x] [`_mm256_maskz_ternarylogic_epi64`]
+ * [x] [`_mm256_ternarylogic_epi64`]
+ * [x] [`_mm512_test_epi32_mask`]
+ * [x] [`_mm512_mask_test_epi32_mask`]
+ * [x] [`_mm_mask_test_epi32_mask`]
+ * [x] [`_mm_test_epi32_mask`]
+ * [x] [`_mm256_mask_test_epi32_mask`]
+ * [x] [`_mm256_test_epi32_mask`]
+ * [x] [`_mm512_test_epi64_mask`]
+ * [x] [`_mm512_mask_test_epi64_mask`]
+ * [x] [`_mm_mask_test_epi64_mask`]
+ * [x] [`_mm_test_epi64_mask`]
+ * [x] [`_mm256_mask_test_epi64_mask`]
+ * [x] [`_mm256_test_epi64_mask`]
+ * [x] [`_mm512_testn_epi32_mask`]
+ * [x] [`_mm512_mask_testn_epi32_mask`]
+ * [x] [`_mm_mask_testn_epi32_mask`]
+ * [x] [`_mm_testn_epi32_mask`]
+ * [x] [`_mm256_mask_testn_epi32_mask`]
+ * [x] [`_mm256_testn_epi32_mask`]
+ * [x] [`_mm512_testn_epi64_mask`]
+ * [x] [`_mm512_mask_testn_epi64_mask`]
+ * [x] [`_mm_mask_testn_epi64_mask`]
+ * [x] [`_mm_testn_epi64_mask`]
+ * [x] [`_mm256_mask_testn_epi64_mask`]
+ * [x] [`_mm256_testn_epi64_mask`]
+ * [x] [`_mm512_set1_epi8`]
+ * [x] [`_mm512_set1_epi16`]
+ * [x] [`_mm512_set1_epi32`]
+ * [x] [`_mm512_mask_set1_epi32`]
+ * [x] [`_mm512_maskz_set1_epi32`]
+ * [x] [`_mm_mask_set1_epi32`]
+ * [x] [`_mm_maskz_set1_epi32`]
+ * [x] [`_mm256_mask_set1_epi32`]
+ * [x] [`_mm256_maskz_set1_epi32`]
+ * [x] [`_mm512_set1_epi64`]
+ * [x] [`_mm512_mask_set1_epi64`]
+ * [x] [`_mm512_maskz_set1_epi64`]
+ * [x] [`_mm_mask_set1_epi64`]
+ * [x] [`_mm_maskz_set1_epi64`]
+ * [x] [`_mm256_mask_set1_epi64`]
+ * [x] [`_mm256_maskz_set1_epi64`]
+ * [x] [`_mm512_set1_ps`]
+ * [x] [`_mm512_set1_pd`]
+ * [x] [`_mm512_set4_epi32`]
+ * [x] [`_mm512_set4_epi64`]
+ * [x] [`_mm512_set4_pd`]
+ * [x] [`_mm512_set4_ps`]
+ * [x] [`_mm512_set_epi16`]
+ * [x] [`_mm512_set_epi32`]
+ * [x] [`_mm512_set_epi64`]
+ * [x] [`_mm512_set_epi8`]
+ * [x] [`_mm512_set_pd`]
+ * [x] [`_mm512_set_ps`]
+ * [x] [`_mm512_setr4_epi32`]
+ * [x] [`_mm512_setr4_epi64`]
+ * [x] [`_mm512_setr4_pd`]
+ * [x] [`_mm512_setr4_ps`]
+ * [x] [`_mm512_setr_epi32`]
+ * [x] [`_mm512_setr_epi64`]
+ * [x] [`_mm512_setr_pd`]
+ * [x] [`_mm512_setr_ps`]
+ * [x] [`_mm512_setzero_epi32`]
+ * [x] [`_mm512_setzero_pd`]
+ * [x] [`_mm512_setzero_ps`]
+ * [x] [`_mm512_setzero_si512`]
+ * [x] [`_mm512_setzero`]
+ * [x] [`_mm512_load_epi32`]
+ * [x] [`_mm512_mask_load_epi32`] //need i1
+ * [x] [`_mm512_maskz_load_epi32`] //need i1
+ * [x] [`_mm_load_epi32`]
+ * [x] [`_mm_mask_load_epi32`] //need i1
+ * [x] [`_mm_maskz_load_epi32`] //need i1
+ * [x] [`_mm256_load_epi32`]
+ * [x] [`_mm256_mask_load_epi32`] //need i1
+ * [x] [`_mm256_maskz_load_epi32`] //need i1
+ * [x] [`_mm512_load_epi64`]
+ * [x] [`_mm512_mask_load_epi64`] //need i1
+ * [x] [`_mm512_maskz_load_epi64`] //need i1
+ * [x] [`_mm_load_epi64`] //need i1
+ * [x] [`_mm_mask_load_epi64`] //need i1
+ * [x] [`_mm_maskz_load_epi64`] //need i1
+ * [x] [`_mm256_load_epi64`] //need i1
+ * [x] [`_mm256_mask_load_epi64`] //need i1
+ * [x] [`_mm256_maskz_load_epi64`] //need i1
+ * [x] [`_mm512_load_ps`]
+ * [x] [`_mm512_mask_load_ps`] //need i1
+ * [x] [`_mm512_maskz_load_ps`] //need i1
+ * [x] [`_mm_mask_load_ps`] //need i1
+ * [x] [`_mm_maskz_load_ps`] //need i1
+ * [x] [`_mm256_mask_load_ps`] //need i1
+ * [x] [`_mm256_maskz_load_ps`] //need i1
+ * [x] [`_mm512_load_pd`]
+ * [x] [`_mm512_mask_load_pd`] //need i1
+ * [x] [`_mm512_maskz_load_pd`] //need i1
+ * [x] [`_mm_mask_load_pd`] //need i1
+ * [x] [`_mm_maskz_load_pd`] //need i1
+ * [x] [`_mm256_mask_load_pd`] //need i1
+ * [x] [`_mm256_maskz_load_pd`] //need i1
+ * [x] [`_mm512_load_si512`]
+ * [x] [`_mm512_loadu_epi32`]
+ * [x] [`_mm512_mask_loadu_epi32`] //need i1
+ * [x] [`_mm_loadu_epi32`]
+ * [x] [`_mm_mask_loadu_epi32`] //need i1
+ * [x] [`_mm_maskz_loadu_epi32`] //need i1
+ * [x] [`_mm512_maskz_loadu_epi32`] //need i1
+ * [x] [`_mm256_loadu_epi32`]
+ * [x] [`_mm256_mask_loadu_epi32`] //need i1
+ * [x] [`_mm256_maskz_loadu_epi32`] //need i1
+ * [x] [`_mm512_loadu_epi64`]
+ * [x] [`_mm512_mask_loadu_epi64`] //need i1
+ * [x] [`_mm512_maskz_loadu_epi64`] //need i1
+ * [x] [`_mm_loadu_epi64`]
+ * [x] [`_mm_mask_loadu_epi64`] //need i1
+ * [x] [`_mm_maskz_loadu_epi64`] //need i1
+ * [x] [`_mm256_loadu_epi64`]
+ * [x] [`_mm256_mask_loadu_epi64`] //need i1
+ * [x] [`_mm256_maskz_loadu_epi64`] //need i1
+ * [x] [`_mm512_loadu_ps`]
+ * [x] [`_mm512_mask_loadu_ps`] //need i1
+ * [x] [`_mm512_maskz_loadu_ps`] //need i1
+ * [x] [`_mm_mask_loadu_ps`] //need i1
+ * [x] [`_mm_maskz_loadu_ps`] //need i1
+ * [x] [`_mm256_mask_loadu_ps`] //need i1
+ * [x] [`_mm256_maskz_loadu_ps`] //need i1
+ * [x] [`_mm512_loadu_pd`]
+ * [x] [`_mm512_mask_loadu_pd`] //need i1
+ * [x] [`_mm512_maskz_loadu_pd`] //need i1
+ * [x] [`_mm_mask_loadu_pd`] //need i1
+ * [x] [`_mm_maskz_loadu_pd`] //need i1
+ * [x] [`_mm256_mask_loadu_pd`] //need i1
+ * [x] [`_mm256_maskz_loadu_pd`] //need i1
+ * [x] [`_mm512_loadu_si512`]
+ * [x] [`_mm512_store_epi32`]
+ * [x] [`_mm512_mask_store_epi32`] //need i1
+ * [x] [`_mm_mask_store_epi32`] //need i1
+ * [x] [`_mm_store_epi32`]
+ * [x] [`_mm256_mask_store_epi32`] //need i1
+ * [x] [`_mm256_store_epi32`]
+ * [x] [`_mm512_store_epi64`]
+ * [x] [`_mm512_mask_store_epi64`] //need i1
+ * [x] [`_mm_mask_store_epi64`] //need i1
+ * [x] [`_mm_store_epi64`]
+ * [x] [`_mm256_mask_store_epi64`] //need i1
+ * [x] [`_mm256_store_epi64`]
+ * [x] [`_mm512_store_ps`]
+ * [x] [`_mm512_mask_store_ps`] //need i1
+ * [x] [`_mm_mask_store_ps`] //need i1
+ * [x] [`_mm256_mask_store_ps`] //need i1
+ * [x] [`_mm512_store_pd`]
+ * [x] [`_mm512_mask_store_pd`] //need i1
+ * [x] [`_mm_mask_store_pd`] //need i1
+ * [x] [`_mm256_mask_store_pd`] //need i1
+ * [x] [`_mm512_store_si512`]
+ * [x] [`_mm512_storeu_epi32`]
+ * [x] [`_mm512_mask_storeu_epi32`] //need i1
+ * [x] [`_mm_mask_storeu_epi32`] //need i1
+ * [x] [`_mm_storeu_epi32`]
+ * [x] [`_mm256_mask_storeu_epi32`] //need i1
+ * [x] [`_mm256_storeu_epi32`]
+ * [x] [`_mm512_storeu_epi64`]
+ * [x] [`_mm512_mask_storeu_epi64`] //need i1
+ * [x] [`_mm_mask_storeu_epi64`] //need i1
+ * [x] [`_mm_storeu_epi64`]
+ * [x] [`_mm256_mask_storeu_epi64`] //need i1
+ * [x] [`_mm256_storeu_epi64`]
+ * [x] [`_mm512_storeu_ps`]
+ * [x] [`_mm512_mask_storeu_ps`] //need i1
+ * [x] [`_mm_mask_storeu_ps`] //need i1
+ * [x] [`_mm256_mask_storeu_ps`] //need i1
+ * [x] [`_mm512_storeu_pd`]
+ * [x] [`_mm512_mask_storeu_pd`] //need i1
+ * [x] [`_mm_mask_storeu_pd`] //need i1
+ * [x] [`_mm256_mask_storeu_pd`] //need i1
+ * [x] [`_mm512_storeu_si512`]
+ * [ ] [`_mm512_stream_load_si512`] //stream_load_si256, ... not implemented yet
+ * [x] [`_mm512_stream_pd`]
+ * [x] [`_mm512_stream_ps`]
+ * [x] [`_mm512_stream_si512`]
+ * [x] [`_mm512_castpd128_pd512`]
+ * [x] [`_mm512_castpd256_pd512`]
+ * [x] [`_mm512_castpd512_pd128`]
+ * [x] [`_mm512_castpd512_pd256`]
+ * [x] [`_mm512_castpd_ps`]
+ * [x] [`_mm512_castpd_si512`]
+ * [x] [`_mm512_castps128_ps512`]
+ * [x] [`_mm512_castps256_ps512`]
+ * [x] [`_mm512_castps512_ps128`]
+ * [x] [`_mm512_castps512_ps256`]
+ * [x] [`_mm512_castps_pd`]
+ * [x] [`_mm512_castps_si512`]
+ * [x] [`_mm512_castsi128_si512`]
+ * [x] [`_mm512_castsi256_si512`]
+ * [x] [`_mm512_castsi512_pd`]
+ * [x] [`_mm512_castsi512_ps`]
+ * [x] [`_mm512_castsi512_si128`]
+ * [x] [`_mm512_castsi512_si256`]
+ * [x] [`_mm512_cvt_roundps_ph`]
+ * [x] [`_mm512_mask_cvt_roundps_ph`]
+ * [x] [`_mm512_maskz_cvt_roundps_ph`]
+ * [x] [`_mm_mask_cvt_roundps_ph`]
+ * [x] [`_mm_maskz_cvt_roundps_ph`]
+ * [x] [`_mm256_mask_cvt_roundps_ph`]
+ * [x] [`_mm256_maskz_cvt_roundps_ph`]
+ * [x] [`_mm512_cvtepi16_epi32`]
+ * [x] [`_mm512_mask_cvtepi16_epi32`]
+ * [x] [`_mm512_maskz_cvtepi16_epi32`]
+ * [x] [`_mm_mask_cvtepi16_epi32`]
+ * [x] [`_mm_maskz_cvtepi16_epi32`]
+ * [x] [`_mm256_mask_cvtepi16_epi32`]
+ * [x] [`_mm256_maskz_cvtepi16_epi32`]
+ * [x] [`_mm512_cvtepi16_epi64`]
+ * [x] [`_mm512_mask_cvtepi16_epi64`]
+ * [x] [`_mm512_maskz_cvtepi16_epi64`]
+ * [x] [`_mm_mask_cvtepi16_epi64`]
+ * [x] [`_mm_maskz_cvtepi16_epi64`]
+ * [x] [`_mm256_mask_cvtepi16_epi64`]
+ * [x] [`_mm256_maskz_cvtepi16_epi64`]
+ * [x] [`_mm512_cvtepi32_epi16`]
+ * [x] [`_mm512_mask_cvtepi32_epi16`]
+ * [x] [`_mm512_maskz_cvtepi32_epi16`]
+ * [x] [`_mm512_mask_cvtepi32_storeu_epi16`]
+ * [x] [`_mm_mask_cvtepi32_storeu_epi16`]
+ * [x] [`_mm256_mask_cvtepi32_storeu_epi16`]
+ * [x] [`_mm_cvtepi32_epi16`]
+ * [x] [`_mm_mask_cvtepi32_epi16`]
+ * [x] [`_mm_maskz_cvtepi32_epi16`]
+ * [x] [`_mm256_cvtepi32_epi16`]
+ * [x] [`_mm256_mask_cvtepi32_epi16`]
+ * [x] [`_mm256_maskz_cvtepi32_epi16`]
+ * [x] [`_mm512_cvtepi32_epi64`]
+ * [x] [`_mm512_mask_cvtepi32_epi64`]
+ * [x] [`_mm512_maskz_cvtepi32_epi64`]
+ * [x] [`_mm_mask_cvtepi32_epi64`]
+ * [x] [`_mm_maskz_cvtepi32_epi64`]
+ * [x] [`_mm256_mask_cvtepi32_epi64`]
+ * [x] [`_mm256_maskz_cvtepi32_epi64`]
+ * [x] [`_mm512_cvtepi32_epi8`]
+ * [x] [`_mm512_mask_cvtepi32_epi8`]
+ * [x] [`_mm512_maskz_cvtepi32_epi8`]
+ * [x] [`_mm512_mask_cvtepi32_storeu_epi8`]
+ * [x] [`_mm_mask_cvtepi32_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtepi32_storeu_epi8`]
+ * [x] [`_mm_cvtepi32_epi8`]
+ * [x] [`_mm_mask_cvtepi32_epi8`]
+ * [x] [`_mm_maskz_cvtepi32_epi8`]
+ * [x] [`_mm256_cvtepi32_epi8`]
+ * [x] [`_mm256_mask_cvtepi32_epi8`]
+ * [x] [`_mm256_maskz_cvtepi32_epi8`]
+ * [x] [`_mm512_cvtepi32_ps`]
+ * [x] [`_mm512_mask_cvtepi32_ps`]
+ * [x] [`_mm512_maskz_cvtepi32_ps`]
+ * [x] [`_mm_mask_cvtepi32_ps`]
+ * [x] [`_mm_maskz_cvtepi32_ps`]
+ * [x] [`_mm256_mask_cvtepi32_ps`]
+ * [x] [`_mm256_maskz_cvtepi32_ps`]
+ * [x] [`_mm512_cvtepi32_pd`]
+ * [x] [`_mm512_mask_cvtepi32_pd`]
+ * [x] [`_mm512_maskz_cvtepi32_pd`]
+ * [x] [`_mm_mask_cvtepi32_pd`]
+ * [x] [`_mm_maskz_cvtepi32_pd`]
+ * [x] [`_mm256_mask_cvtepi32_pd`]
+ * [x] [`_mm256_maskz_cvtepi32_pd`]
+ * [x] [`_mm512_cvtepi32lo_pd`]
+ * [x] [`_mm512_mask_cvtepi32lo_pd`]
+ * [x] [`_mm512_cvtepi64_epi16`]
+ * [x] [`_mm512_mask_cvtepi64_epi16`]
+ * [x] [`_mm512_maskz_cvtepi64_epi16`]
+ * [x] [`_mm_cvtepi64_epi16`]
+ * [x] [`_mm_mask_cvtepi64_epi16`]
+ * [x] [`_mm_maskz_cvtepi64_epi16`]
+ * [x] [`_mm256_cvtepi64_epi16`]
+ * [x] [`_mm256_mask_cvtepi64_epi16`]
+ * [x] [`_mm256_maskz_cvtepi64_epi16`]
+ * [x] [`_mm512_mask_cvtepi64_storeu_epi16`]
+ * [x] [`_mm_mask_cvtepi64_storeu_epi16`]
+ * [x] [`_mm256_mask_cvtepi64_storeu_epi16`]
+ * [x] [`_mm512_cvtepi64_epi8`]
+ * [x] [`_mm512_mask_cvtepi64_epi8`]
+ * [x] [`_mm512_maskz_cvtepi64_epi8`]
+ * [x] [`_mm_cvtepi64_epi8`]
+ * [x] [`_mm_mask_cvtepi64_epi8`]
+ * [x] [`_mm_maskz_cvtepi64_epi8`]
+ * [x] [`_mm256_cvtepi64_epi8`]
+ * [x] [`_mm256_mask_cvtepi64_epi8`]
+ * [x] [`_mm256_maskz_cvtepi64_epi8`]
+ * [x] [`_mm512_mask_cvtepi64_storeu_epi8`]
+ * [x] [`_mm_mask_cvtepi64_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtepi64_storeu_epi8`]
+ * [x] [`_mm512_cvtepi64_epi32`]
+ * [x] [`_mm512_mask_cvtepi64_epi32`]
+ * [x] [`_mm512_maskz_cvtepi64_epi32`]
+ * [x] [`_mm_cvtepi64_epi32`]
+ * [x] [`_mm_mask_cvtepi64_epi32`]
+ * [x] [`_mm_maskz_cvtepi64_epi32`]
+ * [x] [`_mm256_cvtepi64_epi32`]
+ * [x] [`_mm256_mask_cvtepi64_epi32`]
+ * [x] [`_mm256_maskz_cvtepi64_epi32`]
+ * [x] [`_mm512_mask_cvtepi64_storeu_epi32`]
+ * [x] [`_mm_mask_cvtepi64_storeu_epi32`]
+ * [x] [`_mm256_mask_cvtepi64_storeu_epi32`]
+ * [x] [`_mm512_cvtepi8_epi32`]
+ * [x] [`_mm512_mask_cvtepi8_epi32`]
+ * [x] [`_mm512_maskz_cvtepi8_epi32`]
+ * [x] [`_mm_mask_cvtepi8_epi32`]
+ * [x] [`_mm_maskz_cvtepi8_epi32`]
+ * [x] [`_mm256_mask_cvtepi8_epi32`]
+ * [x] [`_mm256_maskz_cvtepi8_epi32`]
+ * [x] [`_mm512_cvtepi8_epi64`]
+ * [x] [`_mm512_mask_cvtepi8_epi64`]
+ * [x] [`_mm512_maskz_cvtepi8_epi64`]
+ * [x] [`_mm_mask_cvtepi8_epi64`]
+ * [x] [`_mm_maskz_cvtepi8_epi64`]
+ * [x] [`_mm256_mask_cvtepi8_epi64`]
+ * [x] [`_mm256_maskz_cvtepi8_epi64`]
+ * [x] [`_mm512_cvtepu16_epi32`]
+ * [x] [`_mm512_mask_cvtepu16_epi32`]
+ * [x] [`_mm512_maskz_cvtepu16_epi32`]
+ * [x] [`_mm_mask_cvtepu16_epi32`]
+ * [x] [`_mm_maskz_cvtepu16_epi32`]
+ * [x] [`_mm256_mask_cvtepu16_epi32`]
+ * [x] [`_mm256_maskz_cvtepu16_epi32`]
+ * [x] [`_mm512_cvtepu16_epi64`]
+ * [x] [`_mm512_mask_cvtepu16_epi64`]
+ * [x] [`_mm512_maskz_cvtepu16_epi64`]
+ * [x] [`_mm_mask_cvtepu16_epi64`]
+ * [x] [`_mm_maskz_cvtepu16_epi64`]
+ * [x] [`_mm256_mask_cvtepu16_epi64`]
+ * [x] [`_mm256_maskz_cvtepu16_epi64`]
+ * [x] [`_mm512_cvtepu32_epi64`]
+ * [x] [`_mm512_mask_cvtepu32_epi64`]
+ * [x] [`_mm512_maskz_cvtepu32_epi64`]
+ * [x] [`_mm_mask_cvtepu32_epi64`]
+ * [x] [`_mm_maskz_cvtepu32_epi64`]
+ * [x] [`_mm256_mask_cvtepu32_epi64`]
+ * [x] [`_mm256_maskz_cvtepu32_epi64`]
+ * [x] [`_mm512_cvtepu32_ps`]
+ * [x] [`_mm512_mask_cvtepu32_ps`]
+ * [x] [`_mm512_maskz_cvtepu32_ps`]
+ * [x] [`_mm512_cvtepu32_pd`]
+ * [x] [`_mm512_mask_cvtepu32_pd`]
+ * [x] [`_mm512_maskz_cvtepu32_pd`]
+ * [x] [`_mm_cvtepu32_pd`]
+ * [x] [`_mm_mask_cvtepu32_pd`]
+ * [x] [`_mm_maskz_cvtepu32_pd`]
+ * [x] [`_mm256_cvtepu32_pd`]
+ * [x] [`_mm256_mask_cvtepu32_pd`]
+ * [x] [`_mm256_maskz_cvtepu32_pd`]
+ * [x] [`_mm512_cvtepu32lo_pd`]
+ * [x] [`_mm512_mask_cvtepu32lo_pd`]
+ * [x] [`_mm512_cvtepu8_epi32`]
+ * [x] [`_mm512_mask_cvtepu8_epi32`]
+ * [x] [`_mm512_maskz_cvtepu8_epi32`]
+ * [x] [`_mm_mask_cvtepu8_epi32`]
+ * [x] [`_mm_maskz_cvtepu8_epi32`]
+ * [x] [`_mm256_mask_cvtepu8_epi32`]
+ * [x] [`_mm256_maskz_cvtepu8_epi32`]
+ * [x] [`_mm512_cvtepu8_epi64`]
+ * [x] [`_mm512_mask_cvtepu8_epi64`]
+ * [x] [`_mm512_maskz_cvtepu8_epi64`]
+ * [x] [`_mm_mask_cvtepu8_epi64`]
+ * [x] [`_mm_maskz_cvtepu8_epi64`]
+ * [x] [`_mm256_mask_cvtepu8_epi64`]
+ * [x] [`_mm256_maskz_cvtepu8_epi64`]
+ * [x] [`_mm512_cvtpd_epi32`]
+ * [x] [`_mm512_mask_cvtpd_epi32`]
+ * [x] [`_mm512_maskz_cvtpd_epi32`]
+ * [x] [`_mm_mask_cvtpd_epi32`]
+ * [x] [`_mm_maskz_cvtpd_epi32`]
+ * [x] [`_mm256_mask_cvtpd_epi32`]
+ * [x] [`_mm256_maskz_cvtpd_epi32`]
+ * [x] [`_mm512_cvtpd_epu32`]
+ * [x] [`_mm512_mask_cvtpd_epu32`]
+ * [x] [`_mm512_maskz_cvtpd_epu32`]
+ * [x] [`_mm_cvtpd_epu32`]
+ * [x] [`_mm_mask_cvtpd_epu32`]
+ * [x] [`_mm_maskz_cvtpd_epu32`]
+ * [x] [`_mm256_cvtpd_epu32`]
+ * [x] [`_mm256_mask_cvtpd_epu32`]
+ * [x] [`_mm256_maskz_cvtpd_epu32`]
+ * [x] [`_mm512_cvtpd_ps`]
+ * [x] [`_mm512_mask_cvtpd_ps`]
+ * [x] [`_mm512_maskz_cvtpd_ps`]
+ * [x] [`_mm_mask_cvtpd_ps`]
+ * [x] [`_mm_maskz_cvtpd_ps`]
+ * [x] [`_mm256_mask_cvtpd_ps`]
+ * [x] [`_mm256_maskz_cvtpd_ps`]
+ * [x] [`_mm512_cvtpd_pslo`]
+ * [x] [`_mm512_mask_cvtpd_pslo`]
+ * [x] [`_mm512_cvtph_ps`]
+ * [x] [`_mm512_mask_cvtph_ps`]
+ * [x] [`_mm512_maskz_cvtph_ps`]
+ * [x] [`_mm_mask_cvtph_ps`]
+ * [x] [`_mm_maskz_cvtph_ps`]
+ * [x] [`_mm256_mask_cvtph_ps`]
+ * [x] [`_mm256_maskz_cvtph_ps`]
+ * [x] [`_mm512_cvtps_epi32`]
+ * [x] [`_mm512_mask_cvtps_epi32`]
+ * [x] [`_mm512_maskz_cvtps_epi32`]
+ * [x] [`_mm_mask_cvtps_epi32`]
+ * [x] [`_mm_maskz_cvtps_epi32`]
+ * [x] [`_mm256_mask_cvtps_epi32`]
+ * [x] [`_mm256_maskz_cvtps_epi32`]
+ * [x] [`_mm512_cvtps_epu32`]
+ * [x] [`_mm512_mask_cvtps_epu32`]
+ * [x] [`_mm512_maskz_cvtps_epu32`]
+ * [x] [`_mm_cvtps_epu32`]
+ * [x] [`_mm_mask_cvtps_epu32`]
+ * [x] [`_mm_maskz_cvtps_epu32`]
+ * [x] [`_mm256_cvtps_epu32`]
+ * [x] [`_mm256_mask_cvtps_epu32`]
+ * [x] [`_mm256_maskz_cvtps_epu32`]
+ * [x] [`_mm512_cvtps_pd`]
+ * [x] [`_mm512_mask_cvtps_pd`]
+ * [x] [`_mm512_maskz_cvtps_pd`]
+ * [x] [`_mm512_cvtps_ph`]
+ * [x] [`_mm512_mask_cvtps_ph`]
+ * [x] [`_mm512_maskz_cvtps_ph`]
+ * [x] [`_mm_mask_cvtps_ph`]
+ * [x] [`_mm_maskz_cvtps_ph`]
+ * [x] [`_mm256_mask_cvtps_ph`]
+ * [x] [`_mm256_maskz_cvtps_ph`]
+ * [x] [`_mm512_cvtpslo_pd`]
+ * [x] [`_mm512_mask_cvtpslo_pd`]
+ * [x] [`_mm512_cvtsepi32_epi16`]
+ * [x] [`_mm512_mask_cvtsepi32_epi16`]
+ * [x] [`_mm512_maskz_cvtsepi32_epi16`]
+ * [x] [`_mm_cvtsepi32_epi16`]
+ * [x] [`_mm_mask_cvtsepi32_epi16`]
+ * [x] [`_mm_maskz_cvtsepi32_epi16`]
+ * [x] [`_mm256_cvtsepi32_epi16`]
+ * [x] [`_mm256_mask_cvtsepi32_epi16`]
+ * [x] [`_mm256_maskz_cvtsepi32_epi16`]
+ * [x] [`_mm512_cvtsepi32_epi8`]
+ * [x] [`_mm512_mask_cvtsepi32_epi8`]
+ * [x] [`_mm512_maskz_cvtsepi32_epi8`]
+ * [x] [`_mm_cvtsepi32_epi8`]
+ * [x] [`_mm_mask_cvtsepi32_epi8`]
+ * [x] [`_mm_maskz_cvtsepi32_epi8`]
+ * [x] [`_mm256_cvtsepi32_epi8`]
+ * [x] [`_mm256_mask_cvtsepi32_epi8`]
+ * [x] [`_mm256_maskz_cvtsepi32_epi8`]
+ * [x] [`_mm512_mask_cvtsepi32_storeu_epi16`]
+ * [x] [`_mm_mask_cvtsepi32_storeu_epi16`]
+ * [x] [`_mm256_mask_cvtsepi32_storeu_epi16`]
+ * [x] [`_mm512_mask_cvtsepi32_storeu_epi8`]
+ * [x] [`_mm_mask_cvtsepi32_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtsepi32_storeu_epi8`]
+ * [x] [`_mm512_cvtsepi64_epi16`]
+ * [x] [`_mm512_mask_cvtsepi64_epi16`]
+ * [x] [`_mm512_maskz_cvtsepi64_epi16`]
+ * [x] [`_mm_cvtsepi64_epi16`]
+ * [x] [`_mm_mask_cvtsepi64_epi16`]
+ * [x] [`_mm_maskz_cvtsepi64_epi16`]
+ * [x] [`_mm256_cvtsepi64_epi16`]
+ * [x] [`_mm256_mask_cvtsepi64_epi16`]
+ * [x] [`_mm256_maskz_cvtsepi64_epi16`]
+ * [x] [`_mm512_cvtsepi64_epi32`]
+ * [x] [`_mm512_mask_cvtsepi64_epi32`]
+ * [x] [`_mm512_maskz_cvtsepi64_epi32`]
+ * [x] [`_mm_cvtsepi64_epi32`]
+ * [x] [`_mm_mask_cvtsepi64_epi32`]
+ * [x] [`_mm_maskz_cvtsepi64_epi32`]
+ * [x] [`_mm256_cvtsepi64_epi32`]
+ * [x] [`_mm256_mask_cvtsepi64_epi32`]
+ * [x] [`_mm256_maskz_cvtsepi64_epi32`]
+ * [x] [`_mm512_cvtsepi64_epi8`]
+ * [x] [`_mm512_mask_cvtsepi64_epi8`]
+ * [x] [`_mm512_maskz_cvtsepi64_epi8`]
+ * [x] [`_mm_cvtsepi64_epi8`]
+ * [x] [`_mm_mask_cvtsepi64_epi8`]
+ * [x] [`_mm_maskz_cvtsepi64_epi8`]
+ * [x] [`_mm256_cvtsepi64_epi8`]
+ * [x] [`_mm256_mask_cvtsepi64_epi8`]
+ * [x] [`_mm256_maskz_cvtsepi64_epi8`]
+ * [x] [`_mm512_mask_cvtsepi64_storeu_epi16`]
+ * [x] [`_mm_mask_cvtsepi64_storeu_epi16`]
+ * [x] [`_mm256_mask_cvtsepi64_storeu_epi16`]
+ * [x] [`_mm512_mask_cvtsepi64_storeu_epi32`]
+ * [x] [`_mm_mask_cvtsepi64_storeu_epi32`]
+ * [x] [`_mm256_mask_cvtsepi64_storeu_epi32`]
+ * [x] [`_mm512_mask_cvtsepi64_storeu_epi8`]
+ * [x] [`_mm_mask_cvtsepi64_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtsepi64_storeu_epi8`]
+ * [x] [`_mm512_cvtusepi32_epi16`]
+ * [x] [`_mm512_mask_cvtusepi32_epi16`]
+ * [x] [`_mm512_maskz_cvtusepi32_epi16`]
+ * [x] [`_mm_cvtusepi32_epi16`]
+ * [x] [`_mm_mask_cvtusepi32_epi16`]
+ * [x] [`_mm_maskz_cvtusepi32_epi16`]
+ * [x] [`_mm256_cvtusepi32_epi16`]
+ * [x] [`_mm256_mask_cvtusepi32_epi16`]
+ * [x] [`_mm256_maskz_cvtusepi32_epi16`]
+ * [x] [`_mm512_cvtusepi32_epi8`]
+ * [x] [`_mm512_mask_cvtusepi32_epi8`]
+ * [x] [`_mm512_maskz_cvtusepi32_epi8`]
+ * [x] [`_mm_cvtusepi32_epi8`]
+ * [x] [`_mm_mask_cvtusepi32_epi8`]
+ * [x] [`_mm_maskz_cvtusepi32_epi8`]
+ * [x] [`_mm256_cvtusepi32_epi8`]
+ * [x] [`_mm256_mask_cvtusepi32_epi8`]
+ * [x] [`_mm256_maskz_cvtusepi32_epi8`]
+ * [x] [`_mm512_mask_cvtusepi32_storeu_epi16`]
+ * [x] [`_mm_mask_cvtusepi32_storeu_epi16`]
+ * [x] [`_mm256_mask_cvtusepi32_storeu_epi16`]
+ * [x] [`_mm512_mask_cvtusepi32_storeu_epi8`]
+ * [x] [`_mm_mask_cvtusepi32_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtusepi32_storeu_epi8`]
+ * [x] [`_mm512_cvtusepi64_epi16`]
+ * [x] [`_mm512_mask_cvtusepi64_epi16`]
+ * [x] [`_mm512_maskz_cvtusepi64_epi16`]
+ * [x] [`_mm_cvtusepi64_epi16`]
+ * [x] [`_mm_mask_cvtusepi64_epi16`]
+ * [x] [`_mm_maskz_cvtusepi64_epi16`]
+ * [x] [`_mm256_cvtusepi64_epi16`]
+ * [x] [`_mm256_mask_cvtusepi64_epi16`]
+ * [x] [`_mm256_maskz_cvtusepi64_epi16`]
+ * [x] [`_mm512_cvtusepi64_epi32`]
+ * [x] [`_mm512_mask_cvtusepi64_epi32`]
+ * [x] [`_mm512_maskz_cvtusepi64_epi32`]
+ * [x] [`_mm_cvtusepi64_epi32`]
+ * [x] [`_mm_mask_cvtusepi64_epi32`]
+ * [x] [`_mm_maskz_cvtusepi64_epi32`]
+ * [x] [`_mm256_cvtusepi64_epi32`]
+ * [x] [`_mm256_mask_cvtusepi64_epi32`]
+ * [x] [`_mm256_maskz_cvtusepi64_epi32`]
+ * [x] [`_mm512_cvtusepi64_epi8`]
+ * [x] [`_mm512_mask_cvtusepi64_epi8`]
+ * [x] [`_mm512_maskz_cvtusepi64_epi8`]
+ * [x] [`_mm_cvtusepi64_epi8`]
+ * [x] [`_mm_mask_cvtusepi64_epi8`]
+ * [x] [`_mm_maskz_cvtusepi64_epi8`]
+ * [x] [`_mm256_cvtusepi64_epi8`]
+ * [x] [`_mm256_mask_cvtusepi64_epi8`]
+ * [x] [`_mm256_maskz_cvtusepi64_epi8`]
+ * [x] [`_mm512_mask_cvtusepi64_storeu_epi16`]
+ * [x] [`_mm_mask_cvtusepi64_storeu_epi16`]
+ * [x] [`_mm256_mask_cvtusepi64_storeu_epi16`]
+ * [x] [`_mm512_mask_cvtusepi64_storeu_epi32`]
+ * [x] [`_mm_mask_cvtusepi64_storeu_epi32`]
+ * [x] [`_mm256_mask_cvtusepi64_storeu_epi32`]
+ * [x] [`_mm512_mask_cvtusepi64_storeu_epi8`]
+ * [x] [`_mm_mask_cvtusepi64_storeu_epi8`]
+ * [x] [`_mm256_mask_cvtusepi64_storeu_epi8`]
+ * [x] [`_mm512_cvtsi512_si32`]
+ * [x] [`_mm512_cvttpd_epi32`]
+ * [x] [`_mm512_mask_cvttpd_epi32`]
+ * [x] [`_mm512_maskz_cvttpd_epi32`]
+ * [x] [`_mm_mask_cvttpd_epi32`]
+ * [x] [`_mm_maskz_cvttpd_epi32`]
+ * [x] [`_mm256_mask_cvttpd_epi32`]
+ * [x] [`_mm256_maskz_cvttpd_epi32`]
+ * [x] [`_mm512_cvttpd_epu32`]
+ * [x] [`_mm512_mask_cvttpd_epu32`]
+ * [x] [`_mm512_maskz_cvttpd_epu32`]
+ * [x] [`_mm_cvttpd_epu32`]
+ * [x] [`_mm_mask_cvttpd_epu32`]
+ * [x] [`_mm_maskz_cvttpd_epu32`]
+ * [x] [`_mm256_cvttpd_epu32`]
+ * [x] [`_mm256_mask_cvttpd_epu32`]
+ * [x] [`_mm256_maskz_cvttpd_epu32`]
+ * [x] [`_mm512_cvttps_epi32`]
+ * [x] [`_mm512_mask_cvttps_epi32`]
+ * [x] [`_mm512_maskz_cvttps_epi32`]
+ * [x] [`_mm_mask_cvttps_epi32`]
+ * [x] [`_mm_maskz_cvttps_epi32`]
+ * [x] [`_mm256_mask_cvttps_epi32`]
+ * [x] [`_mm256_maskz_cvttps_epi32`]
+ * [x] [`_mm512_cvttps_epu32`]
+ * [x] [`_mm512_mask_cvttps_epu32`]
+ * [x] [`_mm512_maskz_cvttps_epu32`]
+ * [x] [`_mm_cvttps_epu32`]
+ * [x] [`_mm_mask_cvttps_epu32`]
+ * [x] [`_mm_maskz_cvttps_epu32`]
+ * [x] [`_mm256_cvttps_epu32`]
+ * [x] [`_mm256_mask_cvttps_epu32`]
+ * [x] [`_mm256_maskz_cvttps_epu32`]
+ * [x] [`_mm512_cvt_roundepi32_ps`]
+ * [x] [`_mm512_mask_cvt_roundepi32_ps`]
+ * [x] [`_mm512_maskz_cvt_roundepi32_ps`]
+ * [x] [`_mm512_cvt_roundepu32_ps`]
+ * [x] [`_mm512_mask_cvt_roundepu32_ps`]
+ * [x] [`_mm512_maskz_cvt_roundepu32_ps`]
+ * [x] [`_mm512_cvt_roundpd_epi32`]
+ * [x] [`_mm512_mask_cvt_roundpd_epi32`]
+ * [x] [`_mm512_maskz_cvt_roundpd_epi32`]
+ * [x] [`_mm512_cvt_roundpd_epu32`]
+ * [x] [`_mm512_mask_cvt_roundpd_epu32`]
+ * [x] [`_mm512_maskz_cvt_roundpd_epu32`]
+ * [x] [`_mm512_cvt_roundpd_ps`]
+ * [x] [`_mm512_mask_cvt_roundpd_ps`]
+ * [x] [`_mm512_maskz_cvt_roundpd_ps`]
+ * [x] [`_mm512_cvt_roundph_ps`]
+ * [x] [`_mm512_mask_cvt_roundph_ps`]
+ * [x] [`_mm512_maskz_cvt_roundph_ps`]
+ * [x] [`_mm512_cvt_roundps_epi32`]
+ * [x] [`_mm512_mask_cvt_roundps_epi32`]
+ * [x] [`_mm512_maskz_cvt_roundps_epi32`]
+ * [x] [`_mm512_cvt_roundps_epu32`]
+ * [x] [`_mm512_mask_cvt_roundps_epu32`]
+ * [x] [`_mm512_maskz_cvt_roundps_epu32`]
+ * [x] [`_mm512_cvt_roundps_pd`]
+ * [x] [`_mm512_mask_cvt_roundps_pd`]
+ * [x] [`_mm512_maskz_cvt_roundps_pd`]
+ * [x] [`_mm512_cvtt_roundpd_epi32`]
+ * [x] [`_mm512_mask_cvtt_roundpd_epi32`]
+ * [x] [`_mm512_maskz_cvtt_roundpd_epi32`]
+ * [x] [`_mm512_cvtt_roundpd_epu32`]
+ * [x] [`_mm512_mask_cvtt_roundpd_epu32`]
+ * [x] [`_mm512_maskz_cvtt_roundpd_epu32`]
+ * [x] [`_mm512_cvtt_roundps_epi32`]
+ * [x] [`_mm512_mask_cvtt_roundps_epi32`]
+ * [x] [`_mm512_maskz_cvtt_roundps_epi32`]
+ * [x] [`_mm512_cvtt_roundps_epu32`]
+ * [x] [`_mm512_mask_cvtt_roundps_epu32`]
+ * [x] [`_mm512_maskz_cvtt_roundps_epu32`]
+ * [x] [`_mm_add_round_sd`]
+ * [x] [`_mm_add_round_ss`]
+ * [x] [`_mm_cmp_round_sd_mask`]
+ * [x] [`_mm_cmp_round_ss_mask`]
+ * [x] [`_mm_cmp_sd_mask`]
+ * [x] [`_mm_cmp_ss_mask`]
+ * [x] [`_mm_comi_round_sd`]
+ * [x] [`_mm_comi_round_ss`]
+ * [x] [`_mm_cvt_roundi32_ss`]
+ * [x] [`_mm_cvt_roundi64_sd`]
+ * [x] [`_mm_cvt_roundi64_ss`]
+ * [x] [`_mm_cvt_roundsd_i32`]
+ * [x] [`_mm_cvt_roundsd_i64`]
+ * [x] [`_mm_cvt_roundsd_si32`]
+ * [x] [`_mm_cvt_roundsd_si64`]
+ * [x] [`_mm_cvt_roundsd_ss`]
+ * [x] [`_mm_cvt_roundsd_u32`]
+ * [x] [`_mm_cvt_roundsd_u64`]
+ * [x] [`_mm_cvt_roundsi32_ss`]
+ * [x] [`_mm_cvt_roundsi64_sd`]
+ * [x] [`_mm_cvt_roundsi64_ss`]
+ * [x] [`_mm_cvt_roundss_i32`]
+ * [x] [`_mm_cvt_roundss_i64`]
+ * [x] [`_mm_cvt_roundss_sd`]
+ * [x] [`_mm_cvt_roundss_si32`]
+ * [x] [`_mm_cvt_roundss_si64`]
+ * [x] [`_mm_cvt_roundss_u32`]
+ * [x] [`_mm_cvt_roundss_u64`]
+ * [x] [`_mm_cvt_roundu32_ss`]
+ * [x] [`_mm_cvt_roundu64_sd`]
+ * [x] [`_mm_cvt_roundu64_ss`]
+ * [x] [`_mm_cvti32_sd`]
+ * [x] [`_mm_cvti32_ss`]
+ * [x] [`_mm_cvti64_sd`]
+ * [x] [`_mm_cvti64_ss`]
+ * [x] [`_mm_cvtsd_i32`]
+ * [x] [`_mm_cvtsd_i64`]
+ * [x] [`_mm_cvtsd_u32`]
+ * [x] [`_mm_cvtsd_u64`]
+ * [x] [`_mm_cvtss_i32`]
+ * [x] [`_mm_cvtss_i64`]
+ * [x] [`_mm_cvtss_u32`]
+ * [x] [`_mm_cvtss_u64`]
+ * [x] [`_mm_cvtt_roundsd_i32`]
+ * [x] [`_mm_cvtt_roundsd_i64`]
+ * [x] [`_mm_cvtt_roundsd_si32`]
+ * [x] [`_mm_cvtt_roundsd_si64`]
+ * [x] [`_mm_cvtt_roundsd_u32`]
+ * [x] [`_mm_cvtt_roundsd_u64`]
+ * [x] [`_mm_cvtt_roundss_i32`]
+ * [x] [`_mm_cvtt_roundss_i64`]
+ * [x] [`_mm_cvtt_roundss_si32`]
+ * [x] [`_mm_cvtt_roundss_si64`]
+ * [x] [`_mm_cvtt_roundss_u32`]
+ * [x] [`_mm_cvtt_roundss_u64`]
+ * [x] [`_mm_cvttsd_i32`]
+ * [x] [`_mm_cvttsd_i64`]
+ * [x] [`_mm_cvttsd_u32`]
+ * [x] [`_mm_cvttsd_u64`]
+ * [x] [`_mm_cvttss_i32`]
+ * [x] [`_mm_cvttss_i64`]
+ * [x] [`_mm_cvttss_u32`]
+ * [x] [`_mm_cvttss_u64`]
+ * [x] [`_mm_cvtu32_sd`]
+ * [x] [`_mm_cvtu32_ss`]
+ * [x] [`_mm_cvtu64_sd`]
+ * [x] [`_mm_cvtu64_ss`]
+ * [x] [`_mm_div_round_sd`]
+ * [x] [`_mm_div_round_ss`]
+ * [x] [`_mm_fixupimm_round_sd`]
+ * [x] [`_mm_fixupimm_round_ss`]
+ * [x] [`_mm_fixupimm_sd`]
+ * [x] [`_mm_fixupimm_ss`]
+ * [x] [`_mm_fmadd_round_sd`]
+ * [x] [`_mm_fmadd_round_ss`]
+ * [x] [`_mm_fmsub_round_sd`]
+ * [x] [`_mm_fmsub_round_ss`]
+ * [x] [`_mm_fnmadd_round_sd`]
+ * [x] [`_mm_fnmadd_round_ss`]
+ * [x] [`_mm_fnmsub_round_sd`]
+ * [x] [`_mm_fnmsub_round_ss`]
+ * [x] [`_mm_getexp_round_sd`]
+ * [x] [`_mm_getexp_round_ss`]
+ * [x] [`_mm_getexp_sd`]
+ * [x] [`_mm_getexp_ss`]
+ * [x] [`_mm_getmant_round_sd`]
+ * [x] [`_mm_getmant_round_ss`]
+ * [x] [`_mm_getmant_sd`]
+ * [x] [`_mm_getmant_ss`]
+ * [x] [`_mm_mask3_fmadd_round_sd`]
+ * [x] [`_mm_mask3_fmadd_round_ss`]
+ * [x] [`_mm_mask3_fmadd_sd`]
+ * [x] [`_mm_mask3_fmadd_ss`]
+ * [x] [`_mm_mask3_fmsub_round_sd`]
+ * [x] [`_mm_mask3_fmsub_round_ss`]
+ * [x] [`_mm_mask3_fmsub_sd`]
+ * [x] [`_mm_mask3_fmsub_ss`]
+ * [x] [`_mm_mask3_fnmadd_round_sd`]
+ * [x] [`_mm_mask3_fnmadd_round_ss`]
+ * [x] [`_mm_mask3_fnmadd_sd`]
+ * [x] [`_mm_mask3_fnmadd_ss`]
+ * [x] [`_mm_mask3_fnmsub_round_sd`]
+ * [x] [`_mm_mask3_fnmsub_round_ss`]
+ * [x] [`_mm_mask3_fnmsub_sd`]
+ * [x] [`_mm_mask3_fnmsub_ss`]
+ * [x] [`_mm_mask_add_round_sd`]
+ * [x] [`_mm_mask_add_round_ss`]
+ * [x] [`_mm_mask_add_sd`]
+ * [x] [`_mm_mask_add_ss`]
+ * [x] [`_mm_mask_cmp_round_sd_mask`]
+ * [x] [`_mm_mask_cmp_round_ss_mask`]
+ * [x] [`_mm_mask_cmp_sd_mask`]
+ * [x] [`_mm_mask_cmp_ss_mask`]
+ * [x] [`_mm_mask_cvt_roundsd_ss`]
+ * [x] [`_mm_mask_cvt_roundss_sd`]
+ * [x] [`_mm_mask_cvtsd_ss`]
+ * [x] [`_mm_mask_cvtss_sd`]
+ * [x] [`_mm_mask_div_round_sd`]
+ * [x] [`_mm_mask_div_round_ss`]
+ * [x] [`_mm_mask_div_sd`]
+ * [x] [`_mm_mask_div_ss`]
+ * [x] [`_mm_mask_fixupimm_round_sd`]
+ * [x] [`_mm_mask_fixupimm_round_ss`]
+ * [x] [`_mm_mask_fixupimm_sd`]
+ * [x] [`_mm_mask_fixupimm_ss`]
+ * [x] [`_mm_mask_fmadd_round_sd`]
+ * [x] [`_mm_mask_fmadd_round_ss`]
+ * [x] [`_mm_mask_fmadd_sd`]
+ * [x] [`_mm_mask_fmadd_ss`]
+ * [x] [`_mm_mask_fmsub_round_sd`]
+ * [x] [`_mm_mask_fmsub_round_ss`]
+ * [x] [`_mm_mask_fmsub_sd`]
+ * [x] [`_mm_mask_fmsub_ss`]
+ * [x] [`_mm_mask_fnmadd_round_sd`]
+ * [x] [`_mm_mask_fnmadd_round_ss`]
+ * [x] [`_mm_mask_fnmadd_sd`]
+ * [x] [`_mm_mask_fnmadd_ss`]
+ * [x] [`_mm_mask_fnmsub_round_sd`]
+ * [x] [`_mm_mask_fnmsub_round_ss`]
+ * [x] [`_mm_mask_fnmsub_sd`]
+ * [x] [`_mm_mask_fnmsub_ss`]
+ * [x] [`_mm_mask_getexp_round_sd`]
+ * [x] [`_mm_mask_getexp_round_ss`]
+ * [x] [`_mm_mask_getexp_sd`]
+ * [x] [`_mm_mask_getexp_ss`]
+ * [x] [`_mm_mask_getmant_round_sd`]
+ * [x] [`_mm_mask_getmant_round_ss`]
+ * [x] [`_mm_mask_getmant_sd`]
+ * [x] [`_mm_mask_getmant_ss`]
+ * [ ] [`_mm_mask_load_sd`] //need i1
+ * [ ] [`_mm_mask_load_ss`] //need i1
+ * [x] [`_mm_mask_max_round_sd`]
+ * [x] [`_mm_mask_max_round_ss`]
+ * [x] [`_mm_mask_max_sd`]
+ * [x] [`_mm_mask_max_ss`]
+ * [x] [`_mm_mask_min_round_sd`]
+ * [x] [`_mm_mask_min_round_ss`]
+ * [x] [`_mm_mask_min_sd`]
+ * [x] [`_mm_mask_min_ss`]
+ * [x] [`_mm_mask_move_sd`]
+ * [x] [`_mm_mask_move_ss`]
+ * [x] [`_mm_mask_mul_round_sd`]
+ * [x] [`_mm_mask_mul_round_ss`]
+ * [x] [`_mm_mask_mul_sd`]
+ * [x] [`_mm_mask_mul_ss`]
+ * [x] [`_mm_mask_rcp14_sd`]
+ * [x] [`_mm_mask_rcp14_ss`]
+ * [x] [`_mm_mask_roundscale_round_sd`]
+ * [x] [`_mm_mask_roundscale_round_ss`]
+ * [x] [`_mm_mask_roundscale_sd`]
+ * [x] [`_mm_mask_roundscale_ss`]
+ * [x] [`_mm_mask_rsqrt14_sd`]
+ * [x] [`_mm_mask_rsqrt14_ss`]
+ * [x] [`_mm_mask_scalef_round_sd`]
+ * [x] [`_mm_mask_scalef_round_ss`]
+ * [x] [`_mm_mask_scalef_sd`]
+ * [x] [`_mm_mask_scalef_ss`]
+ * [x] [`_mm_mask_sqrt_round_sd`]
+ * [x] [`_mm_mask_sqrt_round_ss`]
+ * [x] [`_mm_mask_sqrt_sd`]
+ * [x] [`_mm_mask_sqrt_ss`]
+ * [ ] [`_mm_mask_store_sd`] //need i1
+ * [ ] [`_mm_mask_store_ss`] //need i1
+ * [x] [`_mm_mask_sub_round_sd`]
+ * [x] [`_mm_mask_sub_round_ss`]
+ * [x] [`_mm_mask_sub_sd`]
+ * [x] [`_mm_mask_sub_ss`]
+ * [x] [`_mm_maskz_add_round_sd`]
+ * [x] [`_mm_maskz_add_round_ss`]
+ * [x] [`_mm_maskz_add_sd`]
+ * [x] [`_mm_maskz_add_ss`]
+ * [x] [`_mm_maskz_cvt_roundsd_ss`]
+ * [x] [`_mm_maskz_cvt_roundss_sd`]
+ * [x] [`_mm_maskz_cvtsd_ss`]
+ * [x] [`_mm_maskz_cvtss_sd`]
+ * [x] [`_mm_maskz_div_round_sd`]
+ * [x] [`_mm_maskz_div_round_ss`]
+ * [x] [`_mm_maskz_div_sd`]
+ * [x] [`_mm_maskz_div_ss`]
+ * [x] [`_mm_maskz_fixupimm_round_sd`]
+ * [x] [`_mm_maskz_fixupimm_round_ss`]
+ * [x] [`_mm_maskz_fixupimm_sd`]
+ * [x] [`_mm_maskz_fixupimm_ss`]
+ * [x] [`_mm_maskz_fmadd_round_sd`]
+ * [x] [`_mm_maskz_fmadd_round_ss`]
+ * [x] [`_mm_maskz_fmadd_sd`]
+ * [x] [`_mm_maskz_fmadd_ss`]
+ * [x] [`_mm_maskz_fmsub_round_sd`]
+ * [x] [`_mm_maskz_fmsub_round_ss`]
+ * [x] [`_mm_maskz_fmsub_sd`]
+ * [x] [`_mm_maskz_fmsub_ss`]
+ * [x] [`_mm_maskz_fnmadd_round_sd`]
+ * [x] [`_mm_maskz_fnmadd_round_ss`]
+ * [x] [`_mm_maskz_fnmadd_sd`]
+ * [x] [`_mm_maskz_fnmadd_ss`]
+ * [x] [`_mm_maskz_fnmsub_round_sd`]
+ * [x] [`_mm_maskz_fnmsub_round_ss`]
+ * [x] [`_mm_maskz_fnmsub_sd`]
+ * [x] [`_mm_maskz_fnmsub_ss`]
+ * [x] [`_mm_maskz_getexp_round_sd`]
+ * [x] [`_mm_maskz_getexp_round_ss`]
+ * [x] [`_mm_maskz_getexp_sd`]
+ * [x] [`_mm_maskz_getexp_ss`]
+ * [x] [`_mm_maskz_getmant_round_sd`]
+ * [x] [`_mm_maskz_getmant_round_ss`]
+ * [x] [`_mm_maskz_getmant_sd`]
+ * [x] [`_mm_maskz_getmant_ss`]
+ * [ ] [`_mm_maskz_load_sd`] //need i1
+ * [ ] [`_mm_maskz_load_ss`] //need i1
+ * [x] [`_mm_maskz_max_round_sd`]
+ * [x] [`_mm_maskz_max_round_ss`]
+ * [x] [`_mm_maskz_max_sd`]
+ * [x] [`_mm_maskz_max_ss`]
+ * [x] [`_mm_maskz_min_round_sd`]
+ * [x] [`_mm_maskz_min_round_ss`]
+ * [x] [`_mm_maskz_min_sd`]
+ * [x] [`_mm_maskz_min_ss`]
+ * [x] [`_mm_maskz_move_sd`]
+ * [x] [`_mm_maskz_move_ss`]
+ * [x] [`_mm_maskz_mul_round_sd`]
+ * [x] [`_mm_maskz_mul_round_ss`]
+ * [x] [`_mm_maskz_mul_sd`]
+ * [x] [`_mm_maskz_mul_ss`]
+ * [x] [`_mm_maskz_rcp14_sd`]
+ * [x] [`_mm_maskz_rcp14_ss`]
+ * [x] [`_mm_maskz_roundscale_round_sd`]
+ * [x] [`_mm_maskz_roundscale_round_ss`]
+ * [x] [`_mm_maskz_roundscale_sd`]
+ * [x] [`_mm_maskz_roundscale_ss`]
+ * [x] [`_mm_maskz_rsqrt14_sd`]
+ * [x] [`_mm_maskz_rsqrt14_ss`]
+ * [x] [`_mm_maskz_scalef_round_sd`]
+ * [x] [`_mm_maskz_scalef_round_ss`]
+ * [x] [`_mm_maskz_scalef_sd`]
+ * [x] [`_mm_maskz_scalef_ss`]
+ * [x] [`_mm_maskz_sqrt_round_sd`]
+ * [x] [`_mm_maskz_sqrt_round_ss`]
+ * [x] [`_mm_maskz_sqrt_sd`]
+ * [x] [`_mm_maskz_sqrt_ss`]
+ * [x] [`_mm_maskz_sub_round_sd`]
+ * [x] [`_mm_maskz_sub_round_ss`]
+ * [x] [`_mm_maskz_sub_sd`]
+ * [x] [`_mm_maskz_sub_ss`]
+ * [x] [`_mm_max_round_sd`]
+ * [x] [`_mm_max_round_ss`]
+ * [x] [`_mm_min_round_sd`]
+ * [x] [`_mm_min_round_ss`]
+ * [x] [`_mm_mul_round_sd`]
+ * [x] [`_mm_mul_round_ss`]
+ * [x] [`_mm_rcp14_sd`]
+ * [x] [`_mm_rcp14_ss`]
+ * [x] [`_mm_roundscale_round_sd`]
+ * [x] [`_mm_roundscale_round_ss`]
+ * [x] [`_mm_roundscale_sd`]
+ * [x] [`_mm_roundscale_ss`]
+ * [x] [`_mm_rsqrt14_sd`]
+ * [x] [`_mm_rsqrt14_ss`]
+ * [x] [`_mm_scalef_round_sd`]
+ * [x] [`_mm_scalef_round_ss`]
+ * [x] [`_mm_scalef_sd`]
+ * [x] [`_mm_scalef_ss`]
+ * [x] [`_mm_sqrt_round_sd`]
+ * [x] [`_mm_sqrt_round_ss`]
+ * [x] [`_mm_sub_round_sd`]
+ * [x] [`_mm_sub_round_ss`]
+ * [x] [`_mm512_int2mask`]
+ * [x] [`_mm512_kand`]
+ * [x] [`_mm512_kandn`]
+ * [x] [`_mm512_kmov`]
+ * [x] [`_mm512_knot`]
+ * [x] [`_mm512_kor`]
+ * [x] [`_mm512_kortestc`]
+ * [ ] [`_mm512_kortestz`] //not sure
+ * [x] [`_mm512_kunpackb`]
+ * [x] [`_mm512_kxnor`]
+ * [x] [`_mm512_kxor`]
+ * [x] [`_mm512_mask2int`]
+</p>
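+
+All of the `mask`/`maskz` pairs above follow the same convention: the `mask`
+variant merges lanes that are inactive in the mask from a `src` operand,
+while the `maskz` variant zeroes them. A minimal sketch of the difference,
+using two intrinsics from this list (assuming a nightly toolchain, since
+these APIs are still unstable, and an AVX512F-capable CPU):
+
+```rust
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+unsafe fn mask_vs_maskz(src: __m512i, a: __m512i, b: __m512i) -> (__m512i, __m512i) {
+    let k: __mmask16 = 0b0000_0000_1111_1111; // only the low 8 of 16 lanes active
+    // Merge-masking: lanes whose bit in `k` is 0 keep the value from `src`.
+    let merged = _mm512_mask_and_epi32(src, k, a, b);
+    // Zero-masking: lanes whose bit in `k` is 0 are set to zero.
+    let zeroed = _mm512_maskz_and_epi32(k, a, b);
+    (merged, zeroed)
+}
+```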
diff --git a/library/stdarch/crates/core_arch/build.rs b/library/stdarch/crates/core_arch/build.rs
new file mode 100644
index 000000000..4d65e9ddc
--- /dev/null
+++ b/library/stdarch/crates/core_arch/build.rs
@@ -0,0 +1,4 @@
+fn main() {
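+    // Emitting this cfg lets the crate gate items behind `#[cfg(core_arch_docs)]`.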
+ println!("cargo:rustc-cfg=core_arch_docs");
+}
diff --git a/library/stdarch/crates/core_arch/rustfmt.toml b/library/stdarch/crates/core_arch/rustfmt.toml
new file mode 100644
index 000000000..4ae742ba8
--- /dev/null
+++ b/library/stdarch/crates/core_arch/rustfmt.toml
@@ -0,0 +1,3 @@
+ignore = [
+ "src/simd.rs",
+]
diff --git a/library/stdarch/crates/core_arch/src/aarch64/armclang.rs b/library/stdarch/crates/core_arch/src/aarch64/armclang.rs
new file mode 100644
index 000000000..7ad6ae50c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/armclang.rs
@@ -0,0 +1,30 @@
+//! ARM compiler specific intrinsics
+//!
+//! # References
+//!
+//! - [ARM Compiler v 6.10 - armclang Reference Guide][arm_comp_ref]
+//!
+//! [arm_comp_ref]: https://developer.arm.com/docs/100067/0610
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Inserts a breakpoint instruction.
+///
+/// `VAL` is a compile-time constant integer in range `[0, 65535]`.
+///
+/// The breakpoint instruction inserted is `BRK` on A64.
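+///
+/// A hypothetical call site (not doc-tested, since it traps), assuming an
+/// AArch64 target:
+///
+/// ```ignore
+/// unsafe { __breakpoint::<0>() }; // emits `BRK #0`
+/// ```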
+#[cfg_attr(test, assert_instr(brk, VAL = 0))]
+#[inline(always)]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __breakpoint<const VAL: i32>() {
+ static_assert_imm16!(VAL);
+ crate::arch::asm!("brk {}", const VAL);
+}
diff --git a/library/stdarch/crates/core_arch/src/aarch64/crc.rs b/library/stdarch/crates/core_arch/src/aarch64/crc.rs
new file mode 100644
index 000000000..6e8128534
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/crc.rs
@@ -0,0 +1,46 @@
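+// Bindings to the LLVM intrinsics that lower to the AArch64 `CRC32X` and `CRC32CX` instructions.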
+extern "unadjusted" {
+ #[link_name = "llvm.aarch64.crc32x"]
+ fn crc32x_(crc: u32, data: u64) -> u32;
+
+ #[link_name = "llvm.aarch64.crc32cx"]
+ fn crc32cx_(crc: u32, data: u64) -> u32;
+}
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// CRC32 single round checksum for quad words (64 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32x))]
+pub unsafe fn __crc32d(crc: u32, data: u64) -> u32 {
+ crc32x_(crc, data)
+}
+
+/// CRC32-C single round checksum for quad words (64 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(test, assert_instr(crc32cx))]
+pub unsafe fn __crc32cd(crc: u32, data: u64) -> u32 {
+ crc32cx_(crc, data)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::{aarch64::*, simd::*};
+ use std::mem;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "crc")]
+ unsafe fn test_crc32d() {
+ assert_eq!(__crc32d(0, 0), 0);
+ assert_eq!(__crc32d(0, 18446744073709551615), 1147535477);
+ }
+
+ #[simd_test(enable = "crc")]
+ unsafe fn test_crc32cd() {
+ assert_eq!(__crc32cd(0, 0), 0);
+ assert_eq!(__crc32cd(0, 18446744073709551615), 3293575501);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/aarch64/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/mod.rs
new file mode 100644
index 000000000..0411fc106
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/mod.rs
@@ -0,0 +1,42 @@
+//! AArch64 intrinsics.
+//!
+//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The
+//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful.
+//!
+//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
+//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics
+
+mod v8;
+pub use self::v8::*;
+
+mod neon;
+pub use self::neon::*;
+
+mod tme;
+pub use self::tme::*;
+
+mod crc;
+pub use self::crc::*;
+
+mod prefetch;
+pub use self::prefetch::*;
+
+pub use super::arm_shared::*;
+
+mod armclang;
+
+pub use self::armclang::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Generates the trap instruction `BRK 1`
+#[cfg_attr(test, assert_instr(brk))]
+#[inline]
+pub unsafe fn brk() -> ! {
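+    // `intrinsics::abort` lowers to a trap, which LLVM emits as `brk` on AArch64 (matching the `assert_instr(brk)` above).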
+ crate::intrinsics::abort()
+}
+
+#[cfg(test)]
+pub(crate) mod test_support;
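
`brk` is diverging (`-> !`), so it can terminate any control-flow path much like `unreachable!()`, but with a hardware trap. A hypothetical sketch, assuming a nightly compiler on AArch64:

```rust
// Sketch only: brk() never returns, so the match below still
// type-checks even though the last arm produces no u8.
#[cfg(target_arch = "aarch64")]
unsafe fn checked_increment(x: u8) -> u8 {
    use core::arch::aarch64::brk;
    match x {
        0..=254 => x + 1,
        _ => brk(), // traps instead of wrapping
    }
}
```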
diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
new file mode 100644
index 000000000..74ea2963c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@@ -0,0 +1,25758 @@
+// This code is automatically generated. DO NOT MODIFY.
+//
+// Instead, modify `crates/stdarch-gen/neon.spec` and run the following command to re-generate this file:
+//
+// ```
+// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen -- crates/stdarch-gen/neon.spec
+// ```
+use super::*;
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Three-way exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(eor3))]
+pub unsafe fn veor3q_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v16i8")]
+ fn veor3q_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
+ }
+ veor3q_s8_(a, b, c)
+}
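
`EOR3` fuses a three-way XOR into one instruction; its lane-wise semantics are just `a ^ b ^ c`. A plain-Rust model (runs anywhere, for documentation only):

```rust
// Semantic model of the veor3q_* family: lane-wise three-way XOR.
// Without the `sha3` feature the same result costs two EORs.
fn eor3(a: [u8; 16], b: [u8; 16], c: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = a[i] ^ b[i] ^ c[i];
    }
    out
}
```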
+
+/// Three-way exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(eor3))]
+pub unsafe fn veor3q_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v8i16")]
+ fn veor3q_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
+ }
+ veor3q_s16_(a, b, c)
+}
+
+/// Three-way exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(eor3))]
+pub unsafe fn veor3q_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v4i32")]
+ fn veor3q_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
+ }
+ veor3q_s32_(a, b, c)
+}
+
+/// Three-way exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(eor3))]
+pub unsafe fn veor3q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v2i64")]
+ fn veor3q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t;
+ }
+ veor3q_s64_(a, b, c)
+}
+
+/// Three-way exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(eor3))]
+pub unsafe fn veor3q_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v16i8")]
+ fn veor3q_u8_(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t;
+ }
+ veor3q_u8_(a, b, c)
+}
+
+/// Three-way exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(eor3))]
+pub unsafe fn veor3q_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v8i16")]
+ fn veor3q_u16_(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t;
+ }
+ veor3q_u16_(a, b, c)
+}
+
+/// Three-way exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(eor3))]
+pub unsafe fn veor3q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v4i32")]
+ fn veor3q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
+ }
+ veor3q_u32_(a, b, c)
+}
+
+/// Three-way exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(eor3))]
+pub unsafe fn veor3q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v2i64")]
+ fn veor3q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t;
+ }
+ veor3q_u64_(a, b, c)
+}
+
+/// Floating-point absolute difference
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fabd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fabd.v1f64")]
+ fn vabd_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+ }
+ vabd_f64_(a, b)
+}
+
+/// Floating-point absolute difference
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fabd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fabd.v2f64")]
+ fn vabdq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vabdq_f64_(a, b)
+}
+
+/// Floating-point absolute difference
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fabd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabds_f32(a: f32, b: f32) -> f32 {
+ simd_extract(vabd_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
+}
+
+/// Floating-point absolute difference
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fabd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabdd_f64(a: f64, b: f64) -> f64 {
+ simd_extract(vabd_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
+}
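
The scalar forms reuse the one-lane vector intrinsic by broadcasting with `vdup_n_*` and extracting lane 0; numerically the result is `|a - b|`. A scalar model (NaN propagation aside, this matches `FABD` for ordinary inputs):

```rust
// Semantic model of vabds_f32 / vabdd_f64: floating-point absolute
// difference of two scalars.
fn abd_f64(a: f64, b: f64) -> f64 {
    (a - b).abs()
}
```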
+
+/// Unsigned absolute difference long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uabdl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabdl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
+ let c: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let d: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ simd_cast(vabd_u8(c, d))
+}
+
+/// Unsigned absolute difference long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uabdl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabdl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
+ let c: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let d: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ simd_cast(vabd_u16(c, d))
+}
+
+/// Unsigned absolute difference long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uabdl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabdl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
+ let c: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let d: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ simd_cast(vabd_u32(c, d))
+}
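
The `_high` variants select the upper half of each 128-bit input with a shuffle, take the narrow absolute difference, and widen the result with `simd_cast`. A scalar model for the `u32` case:

```rust
// Semantic model of vabdl_high_u32: absolute difference of the upper
// two lanes (indices 2 and 3), each widened to u64.
fn abdl_high_u32(a: [u32; 4], b: [u32; 4]) -> [u64; 2] {
    let mut out = [0u64; 2];
    for i in 0..2 {
        out[i] = a[i + 2].abs_diff(b[i + 2]) as u64;
    }
    out
}
```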
+
+/// Signed absolute difference long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sabdl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabdl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
+ let c: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let d: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let e: uint8x8_t = simd_cast(vabd_s8(c, d));
+ simd_cast(e)
+}
+
+/// Signed absolute difference long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sabdl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabdl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+ let c: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let d: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let e: uint16x4_t = simd_cast(vabd_s16(c, d));
+ simd_cast(e)
+}
+
+/// Signed absolute difference long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sabdl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabdl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+ let c: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let d: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let e: uint32x2_t = simd_cast(vabd_s32(c, d));
+ simd_cast(e)
+}
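
Note the detour through an unsigned vector in the signed `_high` variants: `|x - y|` is non-negative, so the final widening must zero-extend; casting the signed lane type directly would sign-extend and corrupt any value with the top bit set. A scalar model of the `s32` case under that reading:

```rust
// Semantic model of vabdl_high_s32: the absolute difference is formed
// as an unsigned value, then zero-extended into the signed result.
fn abdl_high_s32(a: [i32; 4], b: [i32; 4]) -> [i64; 2] {
    let mut out = [0i64; 2];
    for i in 0..2 {
        let diff: u32 = a[i + 2].abs_diff(b[i + 2]); // non-negative
        out[i] = diff as i64; // zero-extends via the unsigned type
    }
    out
}
```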
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceq_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceq_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceq_p64(a: poly64x1_t, b: poly64x1_t) -> uint64x1_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqq_p64(a: poly64x2_t, b: poly64x2_t) -> uint64x2_t {
+ simd_eq(a, b)
+}
+
+/// Floating-point compare equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceq_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ simd_eq(a, b)
+}
+
+/// Floating-point compare equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqd_s64(a: i64, b: i64) -> u64 {
+ transmute(vceq_s64(transmute(a), transmute(b)))
+}
+
+/// Compare bitwise equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqd_u64(a: u64, b: u64) -> u64 {
+ transmute(vceq_u64(transmute(a), transmute(b)))
+}
+
+/// Floating-point compare equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqs_f32(a: f32, b: f32) -> u32 {
+ simd_extract(vceq_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
+}
+
+/// Floating-point compare equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqd_f64(a: f64, b: f64) -> u64 {
+ simd_extract(vceq_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
+}
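
All of these comparisons follow the usual NEON mask convention: a true lane is all-ones, a false lane is all-zeros, in the unsigned type of the same width. A scalar model of the `d`-suffixed forms:

```rust
// Mask convention of the vceq* family: u64::MAX for equal, 0 otherwise.
fn ceqd_u64(a: u64, b: u64) -> u64 {
    if a == b { u64::MAX } else { 0 }
}

fn main() {
    assert_eq!(ceqd_u64(7, 7), u64::MAX);
    assert_eq!(ceqd_u64(7, 8), 0);
}
```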
+
+/// Signed compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_s8(a: int8x8_t) -> uint8x8_t {
+ let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Signed compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_s8(a: int8x16_t) -> uint8x16_t {
+ let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Signed compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_s16(a: int16x4_t) -> uint16x4_t {
+ let b: i16x4 = i16x4::new(0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Signed compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_s16(a: int16x8_t) -> uint16x8_t {
+ let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Signed compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_s32(a: int32x2_t) -> uint32x2_t {
+ let b: i32x2 = i32x2::new(0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Signed compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_s32(a: int32x4_t) -> uint32x4_t {
+ let b: i32x4 = i32x4::new(0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Signed compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_s64(a: int64x1_t) -> uint64x1_t {
+ let b: i64x1 = i64x1::new(0);
+ simd_eq(a, transmute(b))
+}
+
+/// Signed compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_s64(a: int64x2_t) -> uint64x2_t {
+ let b: i64x2 = i64x2::new(0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_p8(a: poly8x8_t) -> uint8x8_t {
+ let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_p8(a: poly8x16_t) -> uint8x16_t {
+ let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_p64(a: poly64x1_t) -> uint64x1_t {
+ let b: i64x1 = i64x1::new(0);
+ simd_eq(a, transmute(b))
+}
+
+/// Compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_p64(a: poly64x2_t) -> uint64x2_t {
+ let b: i64x2 = i64x2::new(0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Unsigned compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_u8(a: uint8x8_t) -> uint8x8_t {
+ let b: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Unsigned compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_u8(a: uint8x16_t) -> uint8x16_t {
+ let b: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Unsigned compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_u16(a: uint16x4_t) -> uint16x4_t {
+ let b: u16x4 = u16x4::new(0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Unsigned compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_u16(a: uint16x8_t) -> uint16x8_t {
+ let b: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Unsigned compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_u32(a: uint32x2_t) -> uint32x2_t {
+ let b: u32x2 = u32x2::new(0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Unsigned compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_u32(a: uint32x4_t) -> uint32x4_t {
+ let b: u32x4 = u32x4::new(0, 0, 0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Unsigned compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_u64(a: uint64x1_t) -> uint64x1_t {
+ let b: u64x1 = u64x1::new(0);
+ simd_eq(a, transmute(b))
+}
+
+/// Unsigned compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_u64(a: uint64x2_t) -> uint64x2_t {
+ let b: u64x2 = u64x2::new(0, 0);
+ simd_eq(a, transmute(b))
+}
+
+/// Floating-point compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_f32(a: float32x2_t) -> uint32x2_t {
+ let b: f32x2 = f32x2::new(0.0, 0.0);
+ simd_eq(a, transmute(b))
+}
+
+/// Floating-point compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_f32(a: float32x4_t) -> uint32x4_t {
+ let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0);
+ simd_eq(a, transmute(b))
+}
+
+/// Floating-point compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqz_f64(a: float64x1_t) -> uint64x1_t {
+ let b: f64 = 0.0;
+ simd_eq(a, transmute(b))
+}
+
+/// Floating-point compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmeq))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzq_f64(a: float64x2_t) -> uint64x2_t {
+ let b: f64x2 = f64x2::new(0.0, 0.0);
+ simd_eq(a, transmute(b))
+}
+
+/// Compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzd_s64(a: i64) -> u64 {
+ transmute(vceqz_s64(transmute(a)))
+}
+
+/// Compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzd_u64(a: u64) -> u64 {
+ transmute(vceqz_u64(transmute(a)))
+}
+
+/// Floating-point compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzs_f32(a: f32) -> u32 {
+ simd_extract(vceqz_f32(vdup_n_f32(a)), 0)
+}
+
+/// Floating-point compare bitwise equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vceqzd_f64(a: f64) -> u64 {
+ simd_extract(vceqz_f64(vdup_n_f64(a)), 0)
+}
+
+/// Signed compare bitwise test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmtst))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtst_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t {
+ let c: int64x1_t = simd_and(a, b);
+ let d: i64x1 = i64x1::new(0);
+ simd_ne(c, transmute(d))
+}
+
+/// Signed compare bitwise test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmtst))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtstq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t {
+ let c: int64x2_t = simd_and(a, b);
+ let d: i64x2 = i64x2::new(0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Compare bitwise test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmtst))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtst_p64(a: poly64x1_t, b: poly64x1_t) -> uint64x1_t {
+ let c: poly64x1_t = simd_and(a, b);
+ let d: i64x1 = i64x1::new(0);
+ simd_ne(c, transmute(d))
+}
+
+/// Compare bitwise test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmtst))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtstq_p64(a: poly64x2_t, b: poly64x2_t) -> uint64x2_t {
+ let c: poly64x2_t = simd_and(a, b);
+ let d: i64x2 = i64x2::new(0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Unsigned compare bitwise test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmtst))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtst_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ let c: uint64x1_t = simd_and(a, b);
+ let d: u64x1 = u64x1::new(0);
+ simd_ne(c, transmute(d))
+}
+
+/// Unsigned compare bitwise test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmtst))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtstq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ let c: uint64x2_t = simd_and(a, b);
+ let d: u64x2 = u64x2::new(0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Compare bitwise test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tst))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtstd_s64(a: i64, b: i64) -> u64 {
+ transmute(vtst_s64(transmute(a), transmute(b)))
+}
+
+/// Compare bitwise test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tst))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtstd_u64(a: u64, b: u64) -> u64 {
+ transmute(vtst_u64(transmute(a), transmute(b)))
+}
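
The test-bits family ANDs the operands and compares the result against zero, so the mask is all-ones exactly when the two values share at least one set bit. A scalar model:

```rust
// Semantic model of vtstd_u64: (a & b) != 0, expressed as a mask.
fn tstd_u64(a: u64, b: u64) -> u64 {
    if a & b != 0 { u64::MAX } else { 0 }
}
```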
+
+/// Signed saturating accumulate of unsigned value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqadds_s32(a: i32, b: u32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.suqadd.i32")]
+ fn vuqadds_s32_(a: i32, b: u32) -> i32;
+ }
+ vuqadds_s32_(a, b)
+}
+
+/// Signed saturating accumulate of unsigned value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqaddd_s64(a: i64, b: u64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.suqadd.i64")]
+ fn vuqaddd_s64_(a: i64, b: u64) -> i64;
+ }
+ vuqaddd_s64_(a, b)
+}
+
+/// Signed saturating accumulate of unsigned value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqaddb_s8(a: i8, b: u8) -> i8 {
+ simd_extract(vuqadd_s8(vdup_n_s8(a), vdup_n_u8(b)), 0)
+}
+
+/// Signed saturating accumulate of unsigned value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqaddh_s16(a: i16, b: u16) -> i16 {
+ simd_extract(vuqadd_s16(vdup_n_s16(a), vdup_n_u16(b)), 0)
+}
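
`SUQADD` adds an unsigned addend to a signed accumulator with signed saturation. A widened-arithmetic model of the 64-bit form, computing in `i128` so the clamp is explicit:

```rust
// Semantic model of vuqaddd_s64: signed saturating accumulate of an
// unsigned value.
fn uqaddd_s64(a: i64, b: u64) -> i64 {
    (a as i128 + b as i128).clamp(i64::MIN as i128, i64::MAX as i128) as i64
}
```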
+
+/// Floating-point absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fabs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabs_f64(a: float64x1_t) -> float64x1_t {
+ simd_fabs(a)
+}
+
+/// Floating-point absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fabs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabsq_f64(a: float64x2_t) -> float64x2_t {
+ simd_fabs(a)
+}
+
+/// Compare signed greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgt_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t {
+ simd_gt(a, b)
+}
+
+/// Compare signed greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t {
+ simd_gt(a, b)
+}
+
+/// Compare unsigned higher
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmhi))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgt_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_gt(a, b)
+}
+
+/// Compare unsigned higher
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmhi))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_gt(a, b)
+}
+
+/// Floating-point compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ simd_gt(a, b)
+}
+
+/// Floating-point compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ simd_gt(a, b)
+}
+
+/// Compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtd_s64(a: i64, b: i64) -> u64 {
+ transmute(vcgt_s64(transmute(a), transmute(b)))
+}
+
+/// Compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtd_u64(a: u64, b: u64) -> u64 {
+ transmute(vcgt_u64(transmute(a), transmute(b)))
+}
+
+/// Floating-point compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgts_f32(a: f32, b: f32) -> u32 {
+ simd_extract(vcgt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
+}
+
+/// Floating-point compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtd_f64(a: f64, b: f64) -> u64 {
+ simd_extract(vcgt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
+}
+
+/// Compare signed less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclt_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t {
+ simd_lt(a, b)
+}
+
+/// Compare signed less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t {
+ simd_lt(a, b)
+}
+
+/// Compare unsigned less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmhi))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclt_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_lt(a, b)
+}
+
+/// Compare unsigned less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmhi))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_lt(a, b)
+}
+
+/// Floating-point compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ simd_lt(a, b)
+}
+
+/// Floating-point compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ simd_lt(a, b)
+}
+
+/// Compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltd_s64(a: i64, b: i64) -> u64 {
+ transmute(vclt_s64(transmute(a), transmute(b)))
+}
+
+/// Compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltd_u64(a: u64, b: u64) -> u64 {
+ transmute(vclt_u64(transmute(a), transmute(b)))
+}
+
+/// Floating-point compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclts_f32(a: f32, b: f32) -> u32 {
+ simd_extract(vclt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
+}
+
+/// Floating-point compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltd_f64(a: f64, b: f64) -> u64 {
+ simd_extract(vclt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
+}
+
+/// Compare signed less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcle_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t {
+ simd_le(a, b)
+}
+
+/// Compare signed less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcleq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t {
+ simd_le(a, b)
+}
+
+/// Compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcged_s64(a: i64, b: i64) -> u64 {
+ transmute(vcge_s64(transmute(a), transmute(b)))
+}
+
+/// Compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcged_u64(a: u64, b: u64) -> u64 {
+ transmute(vcge_u64(transmute(a), transmute(b)))
+}
+
+/// Floating-point compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcges_f32(a: f32, b: f32) -> u32 {
+ simd_extract(vcge_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
+}
+
+/// Floating-point compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcged_f64(a: f64, b: f64) -> u64 {
+ simd_extract(vcge_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
+}
+
+/// Compare unsigned less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmhs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcle_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_le(a, b)
+}
+
+/// Compare unsigned less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmhs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcleq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_le(a, b)
+}
+
+/// Floating-point compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcle_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ simd_le(a, b)
+}
+
+/// Floating-point compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcleq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ simd_le(a, b)
+}
+
+/// Compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcled_s64(a: i64, b: i64) -> u64 {
+ transmute(vcle_s64(transmute(a), transmute(b)))
+}
+
+/// Compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcled_u64(a: u64, b: u64) -> u64 {
+ transmute(vcle_u64(transmute(a), transmute(b)))
+}
+
+/// Floating-point compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcles_f32(a: f32, b: f32) -> u32 {
+ simd_extract(vcle_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
+}
+
+/// Floating-point compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcled_f64(a: f64, b: f64) -> u64 {
+ simd_extract(vcle_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
+}
+
+/// Compare signed greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcge_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t {
+ simd_ge(a, b)
+}
+
+/// Compare signed greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgeq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t {
+ simd_ge(a, b)
+}
+
+/// Compare unsigned greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmhs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcge_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_ge(a, b)
+}
+
+/// Compare unsigned greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmhs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgeq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_ge(a, b)
+}
+
+/// Floating-point compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcge_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ simd_ge(a, b)
+}
+
+/// Floating-point compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgeq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ simd_ge(a, b)
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgez_s8(a: int8x8_t) -> uint8x8_t {
+ let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ge(a, transmute(b))
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezq_s8(a: int8x16_t) -> uint8x16_t {
+ let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ge(a, transmute(b))
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgez_s16(a: int16x4_t) -> uint16x4_t {
+ let b: i16x4 = i16x4::new(0, 0, 0, 0);
+ simd_ge(a, transmute(b))
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezq_s16(a: int16x8_t) -> uint16x8_t {
+ let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ge(a, transmute(b))
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgez_s32(a: int32x2_t) -> uint32x2_t {
+ let b: i32x2 = i32x2::new(0, 0);
+ simd_ge(a, transmute(b))
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezq_s32(a: int32x4_t) -> uint32x4_t {
+ let b: i32x4 = i32x4::new(0, 0, 0, 0);
+ simd_ge(a, transmute(b))
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgez_s64(a: int64x1_t) -> uint64x1_t {
+ let b: i64x1 = i64x1::new(0);
+ simd_ge(a, transmute(b))
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezq_s64(a: int64x2_t) -> uint64x2_t {
+ let b: i64x2 = i64x2::new(0, 0);
+ simd_ge(a, transmute(b))
+}
+
+/// Floating-point compare greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgez_f32(a: float32x2_t) -> uint32x2_t {
+ let b: f32x2 = f32x2::new(0.0, 0.0);
+ simd_ge(a, transmute(b))
+}
+
+/// Floating-point compare greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezq_f32(a: float32x4_t) -> uint32x4_t {
+ let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0);
+ simd_ge(a, transmute(b))
+}
+
+/// Floating-point compare greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgez_f64(a: float64x1_t) -> uint64x1_t {
+ let b: f64 = 0.0;
+ simd_ge(a, transmute(b))
+}
+
+/// Floating-point compare greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezq_f64(a: float64x2_t) -> uint64x2_t {
+ let b: f64x2 = f64x2::new(0.0, 0.0);
+ simd_ge(a, transmute(b))
+}
+
+/// Compare signed greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(eor))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezd_s64(a: i64) -> u64 {
+ transmute(vcgez_s64(transmute(a)))
+}
+
+/// Floating-point compare greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezs_f32(a: f32) -> u32 {
+ simd_extract(vcgez_f32(vdup_n_f32(a)), 0)
+}
+
+/// Floating-point compare greater than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgezd_f64(a: f64) -> u64 {
+ simd_extract(vcgez_f64(vdup_n_f64(a)), 0)
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtz_s8(a: int8x8_t) -> uint8x8_t {
+ let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_gt(a, transmute(b))
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzq_s8(a: int8x16_t) -> uint8x16_t {
+ let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_gt(a, transmute(b))
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtz_s16(a: int16x4_t) -> uint16x4_t {
+ let b: i16x4 = i16x4::new(0, 0, 0, 0);
+ simd_gt(a, transmute(b))
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzq_s16(a: int16x8_t) -> uint16x8_t {
+ let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_gt(a, transmute(b))
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtz_s32(a: int32x2_t) -> uint32x2_t {
+ let b: i32x2 = i32x2::new(0, 0);
+ simd_gt(a, transmute(b))
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzq_s32(a: int32x4_t) -> uint32x4_t {
+ let b: i32x4 = i32x4::new(0, 0, 0, 0);
+ simd_gt(a, transmute(b))
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtz_s64(a: int64x1_t) -> uint64x1_t {
+ let b: i64x1 = i64x1::new(0);
+ simd_gt(a, transmute(b))
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzq_s64(a: int64x2_t) -> uint64x2_t {
+ let b: i64x2 = i64x2::new(0, 0);
+ simd_gt(a, transmute(b))
+}
+
+/// Floating-point compare greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtz_f32(a: float32x2_t) -> uint32x2_t {
+ let b: f32x2 = f32x2::new(0.0, 0.0);
+ simd_gt(a, transmute(b))
+}
+
+/// Floating-point compare greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzq_f32(a: float32x4_t) -> uint32x4_t {
+ let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0);
+ simd_gt(a, transmute(b))
+}
+
+/// Floating-point compare greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtz_f64(a: float64x1_t) -> uint64x1_t {
+ let b: f64 = 0.0;
+ simd_gt(a, transmute(b))
+}
+
+/// Floating-point compare greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzq_f64(a: float64x2_t) -> uint64x2_t {
+ let b: f64x2 = f64x2::new(0.0, 0.0);
+ simd_gt(a, transmute(b))
+}
+
+/// Compare signed greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzd_s64(a: i64) -> u64 {
+ transmute(vcgtz_s64(transmute(a)))
+}
+
+/// Floating-point compare greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzs_f32(a: f32) -> u32 {
+ simd_extract(vcgtz_f32(vdup_n_f32(a)), 0)
+}
+
+/// Floating-point compare greater than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcgtzd_f64(a: f64) -> u64 {
+ simd_extract(vcgtz_f64(vdup_n_f64(a)), 0)
+}
+
+/// Compare signed less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclez_s8(a: int8x8_t) -> uint8x8_t {
+ let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_le(a, transmute(b))
+}
+
+/// Compare signed less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezq_s8(a: int8x16_t) -> uint8x16_t {
+ let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_le(a, transmute(b))
+}
+
+/// Compare signed less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclez_s16(a: int16x4_t) -> uint16x4_t {
+ let b: i16x4 = i16x4::new(0, 0, 0, 0);
+ simd_le(a, transmute(b))
+}
+
+/// Compare signed less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezq_s16(a: int16x8_t) -> uint16x8_t {
+ let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_le(a, transmute(b))
+}
+
+/// Compare signed less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclez_s32(a: int32x2_t) -> uint32x2_t {
+ let b: i32x2 = i32x2::new(0, 0);
+ simd_le(a, transmute(b))
+}
+
+/// Compare signed less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezq_s32(a: int32x4_t) -> uint32x4_t {
+ let b: i32x4 = i32x4::new(0, 0, 0, 0);
+ simd_le(a, transmute(b))
+}
+
+/// Compare signed less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclez_s64(a: int64x1_t) -> uint64x1_t {
+ let b: i64x1 = i64x1::new(0);
+ simd_le(a, transmute(b))
+}
+
+/// Compare signed less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezq_s64(a: int64x2_t) -> uint64x2_t {
+ let b: i64x2 = i64x2::new(0, 0);
+ simd_le(a, transmute(b))
+}
+
+/// Floating-point compare less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmle))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclez_f32(a: float32x2_t) -> uint32x2_t {
+ let b: f32x2 = f32x2::new(0.0, 0.0);
+ simd_le(a, transmute(b))
+}
+
+/// Floating-point compare less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmle))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezq_f32(a: float32x4_t) -> uint32x4_t {
+ let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0);
+ simd_le(a, transmute(b))
+}
+
+/// Floating-point compare less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmle))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclez_f64(a: float64x1_t) -> uint64x1_t {
+ let b: f64 = 0.0;
+ simd_le(a, transmute(b))
+}
+
+/// Floating-point compare less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmle))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezq_f64(a: float64x2_t) -> uint64x2_t {
+ let b: f64x2 = f64x2::new(0.0, 0.0);
+ simd_le(a, transmute(b))
+}
+
+/// Compare less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezd_s64(a: i64) -> u64 {
+ transmute(vclez_s64(transmute(a)))
+}
+
+/// Floating-point compare less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezs_f32(a: f32) -> u32 {
+ simd_extract(vclez_f32(vdup_n_f32(a)), 0)
+}
+
+/// Floating-point compare less than or equal to zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vclezd_f64(a: f64) -> u64 {
+ simd_extract(vclez_f64(vdup_n_f64(a)), 0)
+}
+
+/// Compare signed less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltz_s8(a: int8x8_t) -> uint8x8_t {
+ let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_lt(a, transmute(b))
+}
+
+/// Compare signed less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzq_s8(a: int8x16_t) -> uint8x16_t {
+ let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_lt(a, transmute(b))
+}
+
+/// Compare signed less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltz_s16(a: int16x4_t) -> uint16x4_t {
+ let b: i16x4 = i16x4::new(0, 0, 0, 0);
+ simd_lt(a, transmute(b))
+}
+
+/// Compare signed less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzq_s16(a: int16x8_t) -> uint16x8_t {
+ let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_lt(a, transmute(b))
+}
+
+/// Compare signed less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltz_s32(a: int32x2_t) -> uint32x2_t {
+ let b: i32x2 = i32x2::new(0, 0);
+ simd_lt(a, transmute(b))
+}
+
+/// Compare signed less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzq_s32(a: int32x4_t) -> uint32x4_t {
+ let b: i32x4 = i32x4::new(0, 0, 0, 0);
+ simd_lt(a, transmute(b))
+}
+
+/// Compare signed less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltz_s64(a: int64x1_t) -> uint64x1_t {
+ let b: i64x1 = i64x1::new(0);
+ simd_lt(a, transmute(b))
+}
+
+/// Compare signed less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(cmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzq_s64(a: int64x2_t) -> uint64x2_t {
+ let b: i64x2 = i64x2::new(0, 0);
+ simd_lt(a, transmute(b))
+}
+
+/// Floating-point compare less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltz_f32(a: float32x2_t) -> uint32x2_t {
+ let b: f32x2 = f32x2::new(0.0, 0.0);
+ simd_lt(a, transmute(b))
+}
+
+/// Floating-point compare less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzq_f32(a: float32x4_t) -> uint32x4_t {
+ let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0);
+ simd_lt(a, transmute(b))
+}
+
+/// Floating-point compare less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltz_f64(a: float64x1_t) -> uint64x1_t {
+ let b: f64 = 0.0;
+ simd_lt(a, transmute(b))
+}
+
+/// Floating-point compare less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmlt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzq_f64(a: float64x2_t) -> uint64x2_t {
+ let b: f64x2 = f64x2::new(0.0, 0.0);
+ simd_lt(a, transmute(b))
+}
+
+/// Compare less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(asr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzd_s64(a: i64) -> u64 {
+ transmute(vcltz_s64(transmute(a)))
+}
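
The `asr` in the assertion above is not a mistake: for a signed scalar, "less than zero" is just the sign bit, and broadcasting that bit across the register with an arithmetic shift produces the all-ones/all-zeros mask directly. Equivalently:

```rust
// Why vcltzd_s64 can lower to a single arithmetic shift right:
// i64 >> 63 sign-extends to 0 or -1, which reinterprets as the mask.
fn cltzd_s64(a: i64) -> u64 {
    (a >> 63) as u64
}
```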
+
+/// Floating-point compare less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzs_f32(a: f32) -> u32 {
+ simd_extract(vcltz_f32(vdup_n_f32(a)), 0)
+}
+
+/// Floating-point compare less than zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcltzd_f64(a: f64) -> u64 {
+ simd_extract(vcltz_f64(vdup_n_f64(a)), 0)
+}
+
+/// Floating-point absolute compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcagt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facgt.v1i64.v1f64")]
+ fn vcagt_f64_(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
+ }
+ vcagt_f64_(a, b)
+}
+
+/// Floating-point absolute compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcagtq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facgt.v2i64.v2f64")]
+ fn vcagtq_f64_(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
+ }
+ vcagtq_f64_(a, b)
+}
+
+/// Floating-point absolute compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcagts_f32(a: f32, b: f32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facgt.i32.f32")]
+ fn vcagts_f32_(a: f32, b: f32) -> u32;
+ }
+ vcagts_f32_(a, b)
+}
+
+/// Floating-point absolute compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcagtd_f64(a: f64, b: f64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facgt.i64.f64")]
+ fn vcagtd_f64_(a: f64, b: f64) -> u64;
+ }
+ vcagtd_f64_(a, b)
+}
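+
+// Editor's sketch (not upstream source): `facgt` compares absolute values,
+// so `vcagt*` answers |a| > |b| per lane and a large negative operand still
+// compares greater than a smaller positive one. Illustrative test name;
+// assumes an aarch64 target.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vcagt() {
+ assert_eq!(vcagts_f32(-3.0, 2.0), u32::MAX); // |-3.0| > |2.0|
+ assert_eq!(vcagts_f32(1.0, -2.0), 0); // |1.0| < |-2.0|
+}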
+
+/// Floating-point absolute compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcage_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facge.v1i64.v1f64")]
+ fn vcage_f64_(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
+ }
+ vcage_f64_(a, b)
+}
+
+/// Floating-point absolute compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcageq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facge.v2i64.v2f64")]
+ fn vcageq_f64_(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
+ }
+ vcageq_f64_(a, b)
+}
+
+/// Floating-point absolute compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcages_f32(a: f32, b: f32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facge.i32.f32")]
+ fn vcages_f32_(a: f32, b: f32) -> u32;
+ }
+ vcages_f32_(a, b)
+}
+
+/// Floating-point absolute compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcaged_f64(a: f64, b: f64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facge.i64.f64")]
+ fn vcaged_f64_(a: f64, b: f64) -> u64;
+ }
+ vcaged_f64_(a, b)
+}
+
+/// Floating-point absolute compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcalt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ vcagt_f64(b, a)
+}
+
+/// Floating-point absolute compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcaltq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ vcagtq_f64(b, a)
+}
+
+/// Floating-point absolute compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcalts_f32(a: f32, b: f32) -> u32 {
+ vcagts_f32(b, a)
+}
+
+/// Floating-point absolute compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facgt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcaltd_f64(a: f64, b: f64) -> u64 {
+ vcagtd_f64(b, a)
+}
+
+/// Floating-point absolute compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcale_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t {
+ vcage_f64(b, a)
+}
+
+/// Floating-point absolute compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcaleq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t {
+ vcageq_f64(b, a)
+}
+
+/// Floating-point absolute compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcales_f32(a: f32, b: f32) -> u32 {
+ vcages_f32(b, a)
+}
+
+/// Floating-point absolute compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(facge))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcaled_f64(a: f64, b: f64) -> u64 {
+ vcaged_f64(b, a)
+}
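+
+// Editor's note (not upstream source): the `vcalt*`/`vcale*` intrinsics have
+// no dedicated instruction; they call their `vcagt*`/`vcage*` counterparts
+// with the operands swapped, since |a| < |b| is exactly |b| > |a|. That is
+// why their `assert_instr` attributes still expect `facgt`/`facge`. A small
+// illustrative check, assuming an aarch64 target:
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vcalt() {
+ assert_eq!(vcalts_f32(1.0, -2.0), u32::MAX); // |1.0| < |-2.0|
+ assert_eq!(vcales_f32(2.0, 2.0), u32::MAX); // |2.0| <= |2.0|
+}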
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_s8<const LANE1: i32, const LANE2: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm3!(LANE2);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
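+
+// Editor's sketch (not upstream source): `vcopy_lane_s8::<LANE1, LANE2>(a, b)`
+// returns `a` with lane LANE1 replaced by lane LANE2 of `b`; the match above
+// merely selects the one shuffle pattern that performs that single-lane
+// insertion. Illustrative name, assumes an aarch64 target with the module's
+// `vdup_n_s8`/`vset_lane_s8`/`vget_lane_s8` helpers in scope:
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vcopy_lane() {
+ let a = vdup_n_s8(0); // [0; 8]
+ let b = vset_lane_s8::<1>(9, vdup_n_s8(0)); // [0, 9, 0, 0, 0, 0, 0, 0]
+ let r = vcopy_lane_s8::<0, 1>(a, b); // lane 0 of `a` <- lane 1 of `b`
+ assert_eq!(vget_lane_s8::<0>(r), 9);
+ assert_eq!(vget_lane_s8::<1>(r), 0);
+}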
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_s8<const LANE1: i32, const LANE2: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert_imm4!(LANE1);
+ static_assert_imm4!(LANE2);
+ match LANE1 & 0b1111 {
+ 0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+ 9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+ 10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+ 11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+ 12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+ 13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+ 14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+ 15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_s16<const LANE1: i32, const LANE2: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm2!(LANE2);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_s16<const LANE1: i32, const LANE2: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm3!(LANE2);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_s32<const LANE1: i32, const LANE2: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm1!(LANE2);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_s32<const LANE1: i32, const LANE2: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm2!(LANE2);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_s64<const LANE1: i32, const LANE2: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm1!(LANE2);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_u8<const LANE1: i32, const LANE2: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm3!(LANE2);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_u8<const LANE1: i32, const LANE2: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert_imm4!(LANE1);
+ static_assert_imm4!(LANE2);
+ match LANE1 & 0b1111 {
+ 0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+ 9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+ 10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+ 11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+ 12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+ 13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+ 14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+ 15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_u16<const LANE1: i32, const LANE2: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm2!(LANE2);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_u16<const LANE1: i32, const LANE2: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm3!(LANE2);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_u32<const LANE1: i32, const LANE2: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm1!(LANE2);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_u32<const LANE1: i32, const LANE2: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm2!(LANE2);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_u64<const LANE1: i32, const LANE2: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm1!(LANE2);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_p8<const LANE1: i32, const LANE2: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm3!(LANE2);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_p8<const LANE1: i32, const LANE2: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ static_assert_imm4!(LANE1);
+ static_assert_imm4!(LANE2);
+ match LANE1 & 0b1111 {
+ 0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+ 9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+ 10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+ 11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+ 12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+ 13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+ 14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+ 15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_p16<const LANE1: i32, const LANE2: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm2!(LANE2);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_p16<const LANE1: i32, const LANE2: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm3!(LANE2);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_p64<const LANE1: i32, const LANE2: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm1!(LANE2);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_f32<const LANE1: i32, const LANE2: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm1!(LANE2);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_f32<const LANE1: i32, const LANE2: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm2!(LANE2);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_laneq_f64<const LANE1: i32, const LANE2: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm1!(LANE2);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_s8<const LANE1: i32, const LANE2: i32>(a: int8x8_t, b: int8x16_t) -> int8x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm4!(LANE2);
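+ // widen `a` from 8 to 16 lanes (the upper half is a duplicate and unused)
+ // so one shuffle can index lanes of `a` (0..=15) and of `b` (16..=31)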
+ let a: int8x16_t = simd_shuffle16!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_s16<const LANE1: i32, const LANE2: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm3!(LANE2);
+ let a: int16x8_t = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_s32<const LANE1: i32, const LANE2: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm2!(LANE2);
+ let a: int32x4_t = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_u8<const LANE1: i32, const LANE2: i32>(a: uint8x8_t, b: uint8x16_t) -> uint8x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm4!(LANE2);
+ let a: uint8x16_t = simd_shuffle16!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_u16<const LANE1: i32, const LANE2: i32>(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm3!(LANE2);
+ let a: uint16x8_t = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_u32<const LANE1: i32, const LANE2: i32>(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm2!(LANE2);
+ let a: uint32x4_t = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_p8<const LANE1: i32, const LANE2: i32>(a: poly8x8_t, b: poly8x16_t) -> poly8x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm4!(LANE2);
+ let a: poly8x16_t = simd_shuffle16!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_p16<const LANE1: i32, const LANE2: i32>(a: poly16x4_t, b: poly16x8_t) -> poly16x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm3!(LANE2);
+ let a: poly16x8_t = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_f32<const LANE1: i32, const LANE2: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert_imm2!(LANE2);
+ let a: float32x4_t = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_s8<const LANE1: i32, const LANE2: i32>(a: int8x16_t, b: int8x8_t) -> int8x16_t {
+ static_assert_imm4!(LANE1);
+ static_assert_imm3!(LANE2);
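+ // conversely to `vcopy_laneq_*`, widen the 8-lane `b` to 16 lanes so the
+ // 16-lane shuffle can address lanes of `b` at indices 16..=31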
+ let b: int8x16_t = simd_shuffle16!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+ match LANE1 & 0b1111 {
+ 0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+ 9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+ 10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+ 11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+ 12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+ 13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+ 14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+ 15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_s16<const LANE1: i32, const LANE2: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm2!(LANE2);
+ let b: int16x8_t = simd_shuffle8!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_s32<const LANE1: i32, const LANE2: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm1!(LANE2);
+ let b: int32x4_t = simd_shuffle4!(b, b, [0, 1, 2, 3]);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_u8<const LANE1: i32, const LANE2: i32>(a: uint8x16_t, b: uint8x8_t) -> uint8x16_t {
+ static_assert_imm4!(LANE1);
+ static_assert_imm3!(LANE2);
+ let b: uint8x16_t = simd_shuffle16!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+ match LANE1 & 0b1111 {
+ 0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+ 9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+ 10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+ 11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+ 12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+ 13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+ 14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+ 15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_u16<const LANE1: i32, const LANE2: i32>(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm2!(LANE2);
+ let b: uint16x8_t = simd_shuffle8!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_u32<const LANE1: i32, const LANE2: i32>(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm1!(LANE2);
+ let b: uint32x4_t = simd_shuffle4!(b, b, [0, 1, 2, 3]);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_p8<const LANE1: i32, const LANE2: i32>(a: poly8x16_t, b: poly8x8_t) -> poly8x16_t {
+ static_assert_imm4!(LANE1);
+ static_assert_imm3!(LANE2);
+ let b: poly8x16_t = simd_shuffle16!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+ match LANE1 & 0b1111 {
+ 0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+ 9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+ 10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+ 11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+ 12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+ 13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+ 14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+ 15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_p16<const LANE1: i32, const LANE2: i32>(a: poly16x8_t, b: poly16x4_t) -> poly16x8_t {
+ static_assert_imm3!(LANE1);
+ static_assert_imm2!(LANE2);
+ let b: poly16x8_t = simd_shuffle8!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
+ match LANE1 & 0b111 {
+ 0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+ 2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+ 3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+ 4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+ 5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+ 6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+ 7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_s64<const LANE1: i32, const LANE2: i32>(a: int64x2_t, b: int64x1_t) -> int64x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert!(LANE2 : i32 where LANE2 == 0);
+ let b: int64x2_t = simd_shuffle2!(b, b, [0, 1]);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_u64<const LANE1: i32, const LANE2: i32>(a: uint64x2_t, b: uint64x1_t) -> uint64x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert!(LANE2 : i32 where LANE2 == 0);
+ let b: uint64x2_t = simd_shuffle2!(b, b, [0, 1]);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_p64<const LANE1: i32, const LANE2: i32>(a: poly64x2_t, b: poly64x1_t) -> poly64x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert!(LANE2 : i32 where LANE2 == 0);
+ let b: poly64x2_t = simd_shuffle2!(b, b, [0, 1]);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_f32<const LANE1: i32, const LANE2: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
+ static_assert_imm2!(LANE1);
+ static_assert_imm1!(LANE2);
+ let b: float32x4_t = simd_shuffle4!(b, b, [0, 1, 2, 3]);
+ match LANE1 & 0b11 {
+ 0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+ 2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+ 3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopyq_lane_f64<const LANE1: i32, const LANE2: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
+ static_assert_imm1!(LANE1);
+ static_assert!(LANE2 : i32 where LANE2 == 0);
+ let b: float64x2_t = simd_shuffle2!(b, b, [0, 1]);
+ match LANE1 & 0b1 {
+ 0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+ 1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
+ _ => unreachable_unchecked(),
+ }
+}
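+
+// Editorial sketch, not upstream code: the `vcopyq_lane_*` family replaces
+// lane `LANE1` of `a` with lane `LANE2` of `b`. Assuming an aarch64 target,
+// `use core::arch::aarch64::*;`, and an `unsafe` block:
+//
+//     let a = vdupq_n_f32(1.0);               // [1.0, 1.0, 1.0, 1.0]
+//     let b = vdup_n_f32(9.0);                // [9.0, 9.0]
+//     let r = vcopyq_lane_f32::<3, 1>(a, b);  // [1.0, 1.0, 1.0, 9.0]
+//     assert_eq!(vgetq_lane_f32::<3>(r), 9.0);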
+
+/// Create a vector from a 64-bit bit pattern
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcreate_f64(a: u64) -> float64x1_t {
+ transmute(a)
+}
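+
+// Editorial sketch (same assumptions as above): `vcreate_f64` is a pure bit
+// reinterpretation, so a float's raw bits round-trip exactly:
+//
+//     let v = vcreate_f64(2.5_f64.to_bits());
+//     assert_eq!(vget_lane_f64::<0>(v), 2.5);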
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_f64_s64(a: int64x1_t) -> float64x1_t {
+ simd_cast(a)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_f64_s64(a: int64x2_t) -> float64x2_t {
+ simd_cast(a)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_f64_u64(a: uint64x1_t) -> float64x1_t {
+ simd_cast(a)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_f64_u64(a: uint64x2_t) -> float64x2_t {
+ simd_cast(a)
+}
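+
+// Editorial sketch: the four converts above are plain lane-wise
+// integer-to-float casts, e.g.:
+//
+//     let v = vcvtq_f64_s64(vdupq_n_s64(-7));
+//     assert_eq!(vgetq_lane_f64::<0>(v), -7.0);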
+
+/// Floating-point convert to higher precision long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_f64_f32(a: float32x2_t) -> float64x2_t {
+ simd_cast(a)
+}
+
+/// Floating-point convert to higher precision long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_high_f64_f32(a: float32x4_t) -> float64x2_t {
+ let b: float32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ simd_cast(b)
+}
+
+/// Floating-point convert to lower precision narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_f32_f64(a: float64x2_t) -> float32x2_t {
+ simd_cast(a)
+}
+
+/// Floating-point convert to lower precision narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t {
+ simd_shuffle4!(a, simd_cast(b), [0, 1, 2, 3])
+}
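+
+// Editorial sketch: the `_high` variants read or write the upper half of a
+// 128-bit vector, so widening and narrowing compose like this:
+//
+//     let q = vdupq_n_f32(1.5);
+//     let wide = vcvt_high_f64_f32(q);                   // f64x2 from lanes 2..4 of `q`
+//     let r = vcvt_high_f32_f64(vget_low_f32(q), wide);  // low half kept, high half narrowed
+//     assert_eq!(vgetq_lane_f32::<3>(r), 1.5);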
+
+/// Floating-point convert to lower precision narrow, rounding to odd
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtxn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtx_f32_f64(a: float64x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtxn.v2f32.v2f64")]
+ fn vcvtx_f32_f64_(a: float64x2_t) -> float32x2_t;
+ }
+ vcvtx_f32_f64_(a)
+}
+
+/// Floating-point convert to lower precision narrow, rounding to odd
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtxn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtxd_f32_f64(a: f64) -> f32 {
+ simd_extract(vcvtx_f32_f64(vdupq_n_f64(a)), 0)
+}
+
+/// Floating-point convert to lower precision narrow, rounding to odd
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtxn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtx_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t {
+ simd_shuffle4!(a, vcvtx_f32_f64(b), [0, 1, 2, 3])
+}
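+
+// Editorial note: round-to-odd forces any inexact f64 -> f32 result onto an
+// odd mantissa, which lets a later correctly-rounded second narrowing avoid
+// double rounding. A scalar probe, under the same assumptions as above:
+//
+//     // 1.0 + f64::EPSILON is inexact in f32, so the odd neighbour is chosen
+//     assert_eq!(vcvtxd_f32_f64(1.0_f64 + f64::EPSILON), 1.0_f32 + f32::EPSILON);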
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_n_f64_s64<const N: i32>(a: int64x1_t) -> float64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64")]
+ fn vcvt_n_f64_s64_(a: int64x1_t, n: i32) -> float64x1_t;
+ }
+ vcvt_n_f64_s64_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_n_f64_s64<const N: i32>(a: int64x2_t) -> float64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64")]
+ fn vcvtq_n_f64_s64_(a: int64x2_t, n: i32) -> float64x2_t;
+ }
+ vcvtq_n_f64_s64_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvts_n_f32_s32<const N: i32>(a: i32) -> f32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxs2fp.f32.i32")]
+ fn vcvts_n_f32_s32_(a: i32, n: i32) -> f32;
+ }
+ vcvts_n_f32_s32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtd_n_f64_s64<const N: i32>(a: i64) -> f64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxs2fp.f64.i64")]
+ fn vcvtd_n_f64_s64_(a: i64, n: i32) -> f64;
+ }
+ vcvtd_n_f64_s64_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_n_f64_u64<const N: i32>(a: uint64x1_t) -> float64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64")]
+ fn vcvt_n_f64_u64_(a: uint64x1_t, n: i32) -> float64x1_t;
+ }
+ vcvt_n_f64_u64_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_n_f64_u64<const N: i32>(a: uint64x2_t) -> float64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64")]
+ fn vcvtq_n_f64_u64_(a: uint64x2_t, n: i32) -> float64x2_t;
+ }
+ vcvtq_n_f64_u64_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvts_n_f32_u32<const N: i32>(a: u32) -> f32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxu2fp.f32.i32")]
+ fn vcvts_n_f32_u32_(a: u32, n: i32) -> f32;
+ }
+ vcvts_n_f32_u32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtd_n_f64_u64<const N: i32>(a: u64) -> f64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxu2fp.f64.i64")]
+ fn vcvtd_n_f64_u64_(a: u64, n: i32) -> f64;
+ }
+ vcvtd_n_f64_u64_(a, N)
+}
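+
+// Editorial sketch: the `_n_` converts treat the integer as fixed-point with
+// `N` fractional bits, i.e. the result is the input divided by `2^N`:
+//
+//     assert_eq!(vcvtd_n_f64_s64::<3>(20), 2.5); // 20 / 2^3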
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_n_s64_f64<const N: i32>(a: float64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64")]
+ fn vcvt_n_s64_f64_(a: float64x1_t, n: i32) -> int64x1_t;
+ }
+ vcvt_n_s64_f64_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_n_s64_f64<const N: i32>(a: float64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64")]
+ fn vcvtq_n_s64_f64_(a: float64x2_t, n: i32) -> int64x2_t;
+ }
+ vcvtq_n_s64_f64_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvts_n_s32_f32<const N: i32>(a: f32) -> i32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxs.i32.f32")]
+ fn vcvts_n_s32_f32_(a: f32, n: i32) -> i32;
+ }
+ vcvts_n_s32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtd_n_s64_f64<const N: i32>(a: f64) -> i64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxs.i64.f64")]
+ fn vcvtd_n_s64_f64_(a: f64, n: i32) -> i64;
+ }
+ vcvtd_n_s64_f64_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_n_u64_f64<const N: i32>(a: float64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64")]
+ fn vcvt_n_u64_f64_(a: float64x1_t, n: i32) -> uint64x1_t;
+ }
+ vcvt_n_u64_f64_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_n_u64_f64<const N: i32>(a: float64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64")]
+ fn vcvtq_n_u64_f64_(a: float64x2_t, n: i32) -> uint64x2_t;
+ }
+ vcvtq_n_u64_f64_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvts_n_u32_f32<const N: i32>(a: f32) -> u32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxu.i32.f32")]
+ fn vcvts_n_u32_f32_(a: f32, n: i32) -> u32;
+ }
+ vcvts_n_u32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtd_n_u64_f64<const N: i32>(a: f64) -> u64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxu.i64.f64")]
+ fn vcvtd_n_u64_f64_(a: f64, n: i32) -> u64;
+ }
+ vcvtd_n_u64_f64_(a, N)
+}
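+
+// Editorial sketch: the reverse direction scales by `2^N` first and then
+// truncates toward zero:
+//
+//     assert_eq!(vcvtd_n_s64_f64::<3>(2.9), 23); // trunc(2.9 * 8)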
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvts_f32_s32(a: i32) -> f32 {
+ a as f32
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtd_f64_s64(a: i64) -> f64 {
+ a as f64
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvts_f32_u32(a: u32) -> f32 {
+ a as f32
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtd_f64_u64(a: u64) -> f64 {
+ a as f64
+}
+
+/// Floating-point convert to signed fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvts_s32_f32(a: f32) -> i32 {
+ a as i32
+}
+
+/// Floating-point convert to signed fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtd_s64_f64(a: f64) -> i64 {
+ a as i64
+}
+
+/// Floating-point convert to unsigned fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvts_u32_f32(a: f32) -> u32 {
+ a as u32
+}
+
+/// Floating-point convert to unsigned fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtd_u64_f64(a: f64) -> u64 {
+ a as u64
+}
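+
+// Editorial sketch: despite lowering to SCVTF/UCVTF/FCVTZS/FCVTZU, the scalar
+// forms behave exactly like Rust's saturating `as` casts:
+//
+//     assert_eq!(vcvts_s32_f32(-1.7), -1); // truncates toward zero
+//     assert_eq!(vcvts_u32_f32(-1.7), 0);  // negative input saturates to 0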
+
+/// Floating-point convert to signed fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_s64_f64(a: float64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptosi.sat.v1i64.v1f64")]
+ fn vcvt_s64_f64_(a: float64x1_t) -> int64x1_t;
+ }
+ vcvt_s64_f64_(a)
+}
+
+/// Floating-point convert to signed fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_s64_f64(a: float64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptosi.sat.v2i64.v2f64")]
+ fn vcvtq_s64_f64_(a: float64x2_t) -> int64x2_t;
+ }
+ vcvtq_s64_f64_(a)
+}
+
+/// Floating-point convert to unsigned fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_u64_f64(a: float64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptoui.sat.v1i64.v1f64")]
+ fn vcvt_u64_f64_(a: float64x1_t) -> uint64x1_t;
+ }
+ vcvt_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_u64_f64(a: float64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptoui.sat.v2i64.v2f64")]
+ fn vcvtq_u64_f64_(a: float64x2_t) -> uint64x2_t;
+ }
+ vcvtq_u64_f64_(a)
+}
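+
+// Editorial sketch: the vector forms bind to LLVM's saturating conversions,
+// matching FCVTZS/FCVTZU semantics: out-of-range lanes clamp rather than wrap:
+//
+//     let v = vcvtq_s64_f64(vdupq_n_f64(1e300));
+//     assert_eq!(vgetq_lane_s64::<0>(v), i64::MAX);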
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtas))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvta_s32_f32(a: float32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtas.v2i32.v2f32")]
+ fn vcvta_s32_f32_(a: float32x2_t) -> int32x2_t;
+ }
+ vcvta_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtas))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtaq_s32_f32(a: float32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtas.v4i32.v4f32")]
+ fn vcvtaq_s32_f32_(a: float32x4_t) -> int32x4_t;
+ }
+ vcvtaq_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtas))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvta_s64_f64(a: float64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtas.v1i64.v1f64")]
+ fn vcvta_s64_f64_(a: float64x1_t) -> int64x1_t;
+ }
+ vcvta_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtas))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtaq_s64_f64(a: float64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtas.v2i64.v2f64")]
+ fn vcvtaq_s64_f64_(a: float64x2_t) -> int64x2_t;
+ }
+ vcvtaq_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtas))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtas_s32_f32(a: f32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtas.i32.f32")]
+ fn vcvtas_s32_f32_(a: f32) -> i32;
+ }
+ vcvtas_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtas))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtad_s64_f64(a: f64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtas.i64.f64")]
+ fn vcvtad_s64_f64_(a: f64) -> i64;
+ }
+ vcvtad_s64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtau))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtas_u32_f32(a: f32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtau.i32.f32")]
+ fn vcvtas_u32_f32_(a: f32) -> u32;
+ }
+ vcvtas_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtau))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtad_u64_f64(a: f64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtau.i64.f64")]
+ fn vcvtad_u64_f64_(a: f64) -> u64;
+ }
+ vcvtad_u64_f64_(a)
+}
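+
+// Editorial sketch: "ties to away" rounds halfway cases away from zero,
+// unlike the IEEE default ties-to-even:
+//
+//     assert_eq!(vcvtas_s32_f32(2.5), 3);
+//     assert_eq!(vcvtas_s32_f32(-2.5), -3);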
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtns))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtn_s32_f32(a: float32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtns.v2i32.v2f32")]
+ fn vcvtn_s32_f32_(a: float32x2_t) -> int32x2_t;
+ }
+ vcvtn_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtns))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtnq_s32_f32(a: float32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtns.v4i32.v4f32")]
+ fn vcvtnq_s32_f32_(a: float32x4_t) -> int32x4_t;
+ }
+ vcvtnq_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtns))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtn_s64_f64(a: float64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtns.v1i64.v1f64")]
+ fn vcvtn_s64_f64_(a: float64x1_t) -> int64x1_t;
+ }
+ vcvtn_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtns))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtnq_s64_f64(a: float64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtns.v2i64.v2f64")]
+ fn vcvtnq_s64_f64_(a: float64x2_t) -> int64x2_t;
+ }
+ vcvtnq_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtns))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtns_s32_f32(a: f32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtns.i32.f32")]
+ fn vcvtns_s32_f32_(a: f32) -> i32;
+ }
+ vcvtns_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtns))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtnd_s64_f64(a: f64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtns.i64.f64")]
+ fn vcvtnd_s64_f64_(a: f64) -> i64;
+ }
+ vcvtnd_s64_f64_(a)
+}
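+
+// Editorial sketch: ties-to-even picks the even neighbour for halfway cases,
+// so consecutive half-integers round in opposite directions:
+//
+//     assert_eq!(vcvtns_s32_f32(2.5), 2);
+//     assert_eq!(vcvtns_s32_f32(3.5), 4);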
+
+/// Floating-point convert to signed integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtms))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtm_s32_f32(a: float32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtms.v2i32.v2f32")]
+ fn vcvtm_s32_f32_(a: float32x2_t) -> int32x2_t;
+ }
+ vcvtm_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtms))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtmq_s32_f32(a: float32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtms.v4i32.v4f32")]
+ fn vcvtmq_s32_f32_(a: float32x4_t) -> int32x4_t;
+ }
+ vcvtmq_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtms))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtm_s64_f64(a: float64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtms.v1i64.v1f64")]
+ fn vcvtm_s64_f64_(a: float64x1_t) -> int64x1_t;
+ }
+ vcvtm_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtms))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtmq_s64_f64(a: float64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtms.v2i64.v2f64")]
+ fn vcvtmq_s64_f64_(a: float64x2_t) -> int64x2_t;
+ }
+ vcvtmq_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtms))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtms_s32_f32(a: f32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtms.i32.f32")]
+ fn vcvtms_s32_f32_(a: f32) -> i32;
+ }
+ vcvtms_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtms))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtmd_s64_f64(a: f64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtms.i64.f64")]
+ fn vcvtmd_s64_f64_(a: f64) -> i64;
+ }
+ vcvtmd_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtp_s32_f32(a: float32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtps.v2i32.v2f32")]
+ fn vcvtp_s32_f32_(a: float32x2_t) -> int32x2_t;
+ }
+ vcvtp_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtpq_s32_f32(a: float32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtps.v4i32.v4f32")]
+ fn vcvtpq_s32_f32_(a: float32x4_t) -> int32x4_t;
+ }
+ vcvtpq_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtp_s64_f64(a: float64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtps.v1i64.v1f64")]
+ fn vcvtp_s64_f64_(a: float64x1_t) -> int64x1_t;
+ }
+ vcvtp_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtpq_s64_f64(a: float64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtps.v2i64.v2f64")]
+ fn vcvtpq_s64_f64_(a: float64x2_t) -> int64x2_t;
+ }
+ vcvtpq_s64_f64_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtps_s32_f32(a: f32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtps.i32.f32")]
+ fn vcvtps_s32_f32_(a: f32) -> i32;
+ }
+ vcvtps_s32_f32_(a)
+}
+
+/// Floating-point convert to signed integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtpd_s64_f64(a: f64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtps.i64.f64")]
+ fn vcvtpd_s64_f64_(a: f64) -> i64;
+ }
+ vcvtpd_s64_f64_(a)
+}
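+
+// Editorial sketch: FCVTMS and FCVTPS are floor- and ceil-style conversions:
+//
+//     assert_eq!(vcvtms_s32_f32(-1.2), -2); // toward minus infinity
+//     assert_eq!(vcvtps_s32_f32(-1.2), -1); // toward plus infinity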
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtau))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvta_u32_f32(a: float32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtau.v2i32.v2f32")]
+ fn vcvta_u32_f32_(a: float32x2_t) -> uint32x2_t;
+ }
+ vcvta_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtau))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtaq_u32_f32(a: float32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtau.v4i32.v4f32")]
+ fn vcvtaq_u32_f32_(a: float32x4_t) -> uint32x4_t;
+ }
+ vcvtaq_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtau))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvta_u64_f64(a: float64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtau.v1i64.v1f64")]
+ fn vcvta_u64_f64_(a: float64x1_t) -> uint64x1_t;
+ }
+ vcvta_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtau))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtaq_u64_f64(a: float64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtau.v2i64.v2f64")]
+ fn vcvtaq_u64_f64_(a: float64x2_t) -> uint64x2_t;
+ }
+ vcvtaq_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtnu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtn_u32_f32(a: float32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtnu.v2i32.v2f32")]
+ fn vcvtn_u32_f32_(a: float32x2_t) -> uint32x2_t;
+ }
+ vcvtn_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtnu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtnq_u32_f32(a: float32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtnu.v4i32.v4f32")]
+ fn vcvtnq_u32_f32_(a: float32x4_t) -> uint32x4_t;
+ }
+ vcvtnq_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtnu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtn_u64_f64(a: float64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtnu.v1i64.v1f64")]
+ fn vcvtn_u64_f64_(a: float64x1_t) -> uint64x1_t;
+ }
+ vcvtn_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtnu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtnq_u64_f64(a: float64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtnu.v2i64.v2f64")]
+ fn vcvtnq_u64_f64_(a: float64x2_t) -> uint64x2_t;
+ }
+ vcvtnq_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtnu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtns_u32_f32(a: f32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtnu.i32.f32")]
+ fn vcvtns_u32_f32_(a: f32) -> u32;
+ }
+ vcvtns_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtnu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtnd_u64_f64(a: f64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtnu.i64.f64")]
+ fn vcvtnd_u64_f64_(a: f64) -> u64;
+ }
+ vcvtnd_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtmu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtm_u32_f32(a: float32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtmu.v2i32.v2f32")]
+ fn vcvtm_u32_f32_(a: float32x2_t) -> uint32x2_t;
+ }
+ vcvtm_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtmu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtmq_u32_f32(a: float32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtmu.v4i32.v4f32")]
+ fn vcvtmq_u32_f32_(a: float32x4_t) -> uint32x4_t;
+ }
+ vcvtmq_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtmu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtm_u64_f64(a: float64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtmu.v1i64.v1f64")]
+ fn vcvtm_u64_f64_(a: float64x1_t) -> uint64x1_t;
+ }
+ vcvtm_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtmu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtmq_u64_f64(a: float64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtmu.v2i64.v2f64")]
+ fn vcvtmq_u64_f64_(a: float64x2_t) -> uint64x2_t;
+ }
+ vcvtmq_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtmu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtms_u32_f32(a: f32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtmu.i32.f32")]
+ fn vcvtms_u32_f32_(a: f32) -> u32;
+ }
+ vcvtms_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtmu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtmd_u64_f64(a: f64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtmu.i64.f64")]
+ fn vcvtmd_u64_f64_(a: f64) -> u64;
+ }
+ vcvtmd_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtpu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtp_u32_f32(a: float32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtpu.v2i32.v2f32")]
+ fn vcvtp_u32_f32_(a: float32x2_t) -> uint32x2_t;
+ }
+ vcvtp_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtpu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtpq_u32_f32(a: float32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtpu.v4i32.v4f32")]
+ fn vcvtpq_u32_f32_(a: float32x4_t) -> uint32x4_t;
+ }
+ vcvtpq_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtpu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtp_u64_f64(a: float64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtpu.v1i64.v1f64")]
+ fn vcvtp_u64_f64_(a: float64x1_t) -> uint64x1_t;
+ }
+ vcvtp_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtpu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtpq_u64_f64(a: float64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtpu.v2i64.v2f64")]
+ fn vcvtpq_u64_f64_(a: float64x2_t) -> uint64x2_t;
+ }
+ vcvtpq_u64_f64_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtpu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtps_u32_f32(a: f32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtpu.i32.f32")]
+ fn vcvtps_u32_f32_(a: f32) -> u32;
+ }
+ vcvtps_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned integer, rounding toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtpu))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtpd_u64_f64(a: f64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fcvtpu.i64.f64")]
+ fn vcvtpd_u64_f64_(a: f64) -> u64;
+ }
+ vcvtpd_u64_f64_(a)
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupq_laneq_p64<const N: i32>(a: poly64x2_t) -> poly64x2_t {
+ static_assert_imm1!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(dup, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupq_lane_p64<const N: i32>(a: poly64x1_t) -> poly64x2_t {
+ static_assert!(N : i32 where N == 0);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupq_laneq_f64<const N: i32>(a: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(dup, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupq_lane_f64<const N: i32>(a: float64x1_t) -> float64x2_t {
+ static_assert!(N : i32 where N == 0);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdup_lane_p64<const N: i32>(a: poly64x1_t) -> poly64x1_t {
+ static_assert!(N : i32 where N == 0);
+ a
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdup_lane_f64<const N: i32>(a: float64x1_t) -> float64x1_t {
+ static_assert!(N : i32 where N == 0);
+ a
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdup_laneq_p64<const N: i32>(a: poly64x2_t) -> poly64x1_t {
+ static_assert_imm1!(N);
+ transmute::<u64, _>(simd_extract(a, N as u32))
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdup_laneq_f64<const N: i32>(a: float64x2_t) -> float64x1_t {
+ static_assert_imm1!(N);
+ transmute::<f64, _>(simd_extract(a, N as u32))
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupb_lane_s8<const N: i32>(a: int8x8_t) -> i8 {
+ static_assert_imm3!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupb_laneq_s8<const N: i32>(a: int8x16_t) -> i8 {
+ static_assert_imm4!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vduph_lane_s16<const N: i32>(a: int16x4_t) -> i16 {
+ static_assert_imm2!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vduph_laneq_s16<const N: i32>(a: int16x8_t) -> i16 {
+ static_assert_imm3!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdups_lane_s32<const N: i32>(a: int32x2_t) -> i32 {
+ static_assert_imm1!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdups_laneq_s32<const N: i32>(a: int32x4_t) -> i32 {
+ static_assert_imm2!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupd_lane_s64<const N: i32>(a: int64x1_t) -> i64 {
+ static_assert!(N : i32 where N == 0);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupd_laneq_s64<const N: i32>(a: int64x2_t) -> i64 {
+ static_assert_imm1!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupb_lane_u8<const N: i32>(a: uint8x8_t) -> u8 {
+ static_assert_imm3!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupb_laneq_u8<const N: i32>(a: uint8x16_t) -> u8 {
+ static_assert_imm4!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vduph_lane_u16<const N: i32>(a: uint16x4_t) -> u16 {
+ static_assert_imm2!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vduph_laneq_u16<const N: i32>(a: uint16x8_t) -> u16 {
+ static_assert_imm3!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdups_lane_u32<const N: i32>(a: uint32x2_t) -> u32 {
+ static_assert_imm1!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdups_laneq_u32<const N: i32>(a: uint32x4_t) -> u32 {
+ static_assert_imm2!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupd_lane_u64<const N: i32>(a: uint64x1_t) -> u64 {
+ static_assert!(N : i32 where N == 0);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupd_laneq_u64<const N: i32>(a: uint64x2_t) -> u64 {
+ static_assert_imm1!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupb_lane_p8<const N: i32>(a: poly8x8_t) -> p8 {
+ static_assert_imm3!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupb_laneq_p8<const N: i32>(a: poly8x16_t) -> p8 {
+ static_assert_imm4!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vduph_lane_p16<const N: i32>(a: poly16x4_t) -> p16 {
+ static_assert_imm2!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vduph_laneq_p16<const N: i32>(a: poly16x8_t) -> p16 {
+ static_assert_imm3!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdups_lane_f32<const N: i32>(a: float32x2_t) -> f32 {
+ static_assert_imm1!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdups_laneq_f32<const N: i32>(a: float32x4_t) -> f32 {
+ static_assert_imm2!(N);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupd_lane_f64<const N: i32>(a: float64x1_t) -> f64 {
+ static_assert!(N : i32 where N == 0);
+ simd_extract(a, N as u32)
+}
+
+/// Extract an element from a vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupd_laneq_f64<const N: i32>(a: float64x2_t) -> f64 {
+ static_assert_imm1!(N);
+ simd_extract(a, N as u32)
+}
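+
+// Editorial sketch: despite the historical `dup` naming, the scalar-returning
+// forms above simply read one lane, which is why they assert to `nop`:
+//
+//     let v = vdupq_n_s16(7);
+//     assert_eq!(vduph_laneq_s16::<4>(v), 7);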
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ext, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vextq_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ static_assert_imm1!(N);
+ match N & 0b1 {
+ 0 => simd_shuffle2!(a, b, [0, 1]),
+ 1 => simd_shuffle2!(a, b, [1, 2]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ext, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vextq_f64<const N: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(N);
+ match N & 0b1 {
+ 0 => simd_shuffle2!(a, b, [0, 1]),
+ 1 => simd_shuffle2!(a, b, [1, 2]),
+ _ => unreachable_unchecked(),
+ }
+}
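+
+// Editorial sketch: `EXT` concatenates `a:b` and extracts consecutive lanes
+// starting at `N`:
+//
+//     let r = vextq_f64::<1>(vdupq_n_f64(1.0), vdupq_n_f64(2.0));
+//     // r = [a[1], b[0]] = [1.0, 2.0]
+//     assert_eq!(vgetq_lane_f64::<1>(r), 2.0);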
+
+/// Floating-point multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmla_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Floating-point multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ simd_add(a, simd_mul(b, c))
+}
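+
+// Editorial sketch: `vmla*_f64` computes `a + b * c` lane-wise; as the
+// `assert_instr(fmul)` hints, it lowers to a separate multiply and add rather
+// than a fused `fmla`:
+//
+//     let r = vmla_f64(vdup_n_f64(1.0), vdup_n_f64(2.0), vdup_n_f64(3.0));
+//     assert_eq!(vget_lane_f64::<0>(r), 7.0);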
+
+/// Signed multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
+ let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let c: int8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmlal_s8(a, b, c)
+}
+
+/// Signed multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let c: int16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
+ vmlal_s16(a, b, c)
+}
+
+/// Signed multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let c: int32x2_t = simd_shuffle2!(c, c, [2, 3]);
+ vmlal_s32(a, b, c)
+}
+
+/// Unsigned multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t {
+ let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let c: uint8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmlal_u8(a, b, c)
+}
+
+/// Unsigned multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
+ let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let c: uint16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
+ vmlal_u16(a, b, c)
+}
+
+/// Unsigned multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
+ let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let c: uint32x2_t = simd_shuffle2!(c, c, [2, 3]);
+ vmlal_u32(a, b, c)
+}
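+
+// Illustrative sketch: the `_high_` widening multiply-accumulates read only
+// the upper halves of `b` and `c` (the shuffles above), widen each product,
+// and add it to the accumulator. Assumes core::arch::aarch64 in a consumer
+// crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vmlal_high_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn widening_mla_uses_the_high_halves() {
+        unsafe {
+            let acc = vdupq_n_s32(10);
+            let b_src: [i16; 8] = [0, 0, 0, 0, 1, 2, 3, 4];
+            let c_src: [i16; 8] = [0, 0, 0, 0, 5, 6, 7, 8];
+            let r = vmlal_high_s16(acc, vld1q_s16(b_src.as_ptr()), vld1q_s16(c_src.as_ptr()));
+            // Lanes 0..4 of b and c are ignored entirely.
+            assert_eq!(vgetq_lane_s32::<0>(r), 15); // 10 + 1 * 5
+            assert_eq!(vgetq_lane_s32::<3>(r), 42); // 10 + 4 * 8
+        }
+    }
+}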
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
+ vmlal_high_s16(a, b, vdupq_n_s16(c))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
+ vmlal_high_s32(a, b, vdupq_n_s32(c))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t {
+ vmlal_high_u16(a, b, vdupq_n_u16(c))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t {
+ vmlal_high_u32(a, b, vdupq_n_u32(c))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vmlal_high_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(LANE);
+ vmlal_high_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE);
+ vmlal_high_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(LANE);
+ vmlal_high_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ vmlal_high_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
+ static_assert_imm3!(LANE);
+ vmlal_high_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE);
+ vmlal_high_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlal_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
+ static_assert_imm2!(LANE);
+ vmlal_high_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
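+
+// Illustrative sketch: the `_lane_`/`_laneq_` forms broadcast lane LANE of
+// `c` before the widening multiply-accumulate, so the lane form below is
+// equivalent to pairing vdupq_n_s16 with the plain vmlal_high_s16 call.
+// Assumes core::arch::aarch64 in a consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vmlal_high_lane_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn lane_form_matches_explicit_broadcast() {
+        unsafe {
+            let acc = vdupq_n_s32(0);
+            let b_src: [i16; 8] = [0, 0, 0, 0, 1, 2, 3, 4];
+            let b = vld1q_s16(b_src.as_ptr());
+            let c_src: [i16; 4] = [9, 7, 5, 3];
+            let c = vld1_s16(c_src.as_ptr());
+            let by_lane = vmlal_high_lane_s16::<1>(acc, b, c); // c[1] == 7
+            let by_dup = vmlal_high_s16(acc, b, vdupq_n_s16(7));
+            assert_eq!(vgetq_lane_s32::<3>(by_lane), 28); // 0 + 4 * 7
+            assert_eq!(vgetq_lane_s32::<3>(by_dup), 28);
+        }
+    }
+}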
+
+/// Floating-point multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmls_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Floating-point multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Signed multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
+ let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let c: int8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmlsl_s8(a, b, c)
+}
+
+/// Signed multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let c: int16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
+ vmlsl_s16(a, b, c)
+}
+
+/// Signed multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let c: int32x2_t = simd_shuffle2!(c, c, [2, 3]);
+ vmlsl_s32(a, b, c)
+}
+
+/// Unsigned multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t {
+ let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let c: uint8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmlsl_u8(a, b, c)
+}
+
+/// Unsigned multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
+ let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let c: uint16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
+ vmlsl_u16(a, b, c)
+}
+
+/// Unsigned multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
+ let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let c: uint32x2_t = simd_shuffle2!(c, c, [2, 3]);
+ vmlsl_u32(a, b, c)
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
+ vmlsl_high_s16(a, b, vdupq_n_s16(c))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
+ vmlsl_high_s32(a, b, vdupq_n_s32(c))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t {
+ vmlsl_high_u16(a, b, vdupq_n_u16(c))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t {
+ vmlsl_high_u32(a, b, vdupq_n_u32(c))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vmlsl_high_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(LANE);
+ vmlsl_high_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE);
+ vmlsl_high_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(LANE);
+ vmlsl_high_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ vmlsl_high_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
+ static_assert_imm3!(LANE);
+ vmlsl_high_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE);
+ vmlsl_high_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmlsl_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
+ static_assert_imm2!(LANE);
+ vmlsl_high_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
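+
+// Illustrative sketch: the vmlsl_high_* family mirrors vmlal_high_* above,
+// except the widened products are subtracted from the accumulator. Assumes
+// core::arch::aarch64 in a consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vmlsl_high_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn widening_mls_subtracts_high_half_products() {
+        unsafe {
+            let acc = vdupq_n_s32(100);
+            let b_src: [i16; 8] = [0, 0, 0, 0, 1, 2, 3, 4];
+            let c_src: [i16; 8] = [0, 0, 0, 0, 5, 6, 7, 8];
+            let r = vmlsl_high_s16(acc, vld1q_s16(b_src.as_ptr()), vld1q_s16(c_src.as_ptr()));
+            assert_eq!(vgetq_lane_s32::<0>(r), 95); // 100 - 1 * 5
+            assert_eq!(vgetq_lane_s32::<3>(r), 68); // 100 - 4 * 8
+        }
+    }
+}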
+
+/// Extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(xtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t {
+ let c: int8x8_t = simd_cast(b);
+ simd_shuffle16!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(xtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t {
+ let c: int16x4_t = simd_cast(b);
+ simd_shuffle8!(a, c, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(xtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t {
+ let c: int32x2_t = simd_cast(b);
+ simd_shuffle4!(a, c, [0, 1, 2, 3])
+}
+
+/// Extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(xtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
+ let c: uint8x8_t = simd_cast(b);
+ simd_shuffle16!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(xtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
+ let c: uint16x4_t = simd_cast(b);
+ simd_shuffle8!(a, c, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(xtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
+ let c: uint32x2_t = simd_cast(b);
+ simd_shuffle4!(a, c, [0, 1, 2, 3])
+}
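+
+// Illustrative sketch: vmovn_high_* truncates each wide lane of `b` and
+// packs the result into the upper half of the output, keeping `a` as the
+// lower half. Assumes core::arch::aarch64 in a consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vmovn_high_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn narrows_into_the_upper_half() {
+        unsafe {
+            let low_src: [i16; 4] = [1, 2, 3, 4];
+            let wide_src: [i32; 4] = [5, 6, 7, 8];
+            let r = vmovn_high_s32(vld1_s16(low_src.as_ptr()), vld1q_s32(wide_src.as_ptr()));
+            assert_eq!(vgetq_lane_s16::<0>(r), 1); // from `a`
+            assert_eq!(vgetq_lane_s16::<4>(r), 5); // truncated from `b`
+        }
+    }
+}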
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(neg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vneg_s64(a: int64x1_t) -> int64x1_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(neg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vnegq_s64(a: int64x2_t) -> int64x2_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(neg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vnegd_s64(a: i64) -> i64 {
+ a.wrapping_neg()
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fneg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vneg_f64(a: float64x1_t) -> float64x1_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fneg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vnegq_f64(a: float64x2_t) -> float64x2_t {
+ simd_neg(a)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqneg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqneg_s64(a: int64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v1i64")]
+ fn vqneg_s64_(a: int64x1_t) -> int64x1_t;
+ }
+ vqneg_s64_(a)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqneg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqnegq_s64(a: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v2i64")]
+ fn vqnegq_s64_(a: int64x2_t) -> int64x2_t;
+ }
+ vqnegq_s64_(a)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqneg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqnegb_s8(a: i8) -> i8 {
+ simd_extract(vqneg_s8(vdup_n_s8(a)), 0)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqneg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqnegh_s16(a: i16) -> i16 {
+ simd_extract(vqneg_s16(vdup_n_s16(a)), 0)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqneg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqnegs_s32(a: i32) -> i32 {
+ simd_extract(vqneg_s32(vdup_n_s32(a)), 0)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqneg))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqnegd_s64(a: i64) -> i64 {
+ simd_extract(vqneg_s64(vdup_n_s64(a)), 0)
+}
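+
+// Illustrative sketch: contrasts the plain negate (vnegd_s64 wraps at
+// i64::MIN, per the wrapping_neg above) with the saturating forms, which
+// clamp to the type maximum. Assumes core::arch::aarch64 in a consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vqneg_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn saturating_negate_clamps_at_type_min() {
+        unsafe {
+            assert_eq!(vnegd_s64(i64::MIN), i64::MIN); // wraps
+            assert_eq!(vqnegd_s64(i64::MIN), i64::MAX); // saturates
+            assert_eq!(vqnegb_s8(i8::MIN), i8::MAX);
+            assert_eq!(vqnegb_s8(5), -5); // unaffected away from the limit
+        }
+    }
+}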
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqsubb_s8(a: i8, b: i8) -> i8 {
+ let a: int8x8_t = vdup_n_s8(a);
+ let b: int8x8_t = vdup_n_s8(b);
+ simd_extract(vqsub_s8(a, b), 0)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqsubh_s16(a: i16, b: i16) -> i16 {
+ let a: int16x4_t = vdup_n_s16(a);
+ let b: int16x4_t = vdup_n_s16(b);
+ simd_extract(vqsub_s16(a, b), 0)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqsubb_u8(a: u8, b: u8) -> u8 {
+ let a: uint8x8_t = vdup_n_u8(a);
+ let b: uint8x8_t = vdup_n_u8(b);
+ simd_extract(vqsub_u8(a, b), 0)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqsubh_u16(a: u16, b: u16) -> u16 {
+ let a: uint16x4_t = vdup_n_u16(a);
+ let b: uint16x4_t = vdup_n_u16(b);
+ simd_extract(vqsub_u16(a, b), 0)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqsubs_u32(a: u32, b: u32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.i32")]
+ fn vqsubs_u32_(a: u32, b: u32) -> u32;
+ }
+ vqsubs_u32_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqsubd_u64(a: u64, b: u64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.i64")]
+ fn vqsubd_u64_(a: u64, b: u64) -> u64;
+ }
+ vqsubd_u64_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqsubs_s32(a: i32, b: i32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.i32")]
+ fn vqsubs_s32_(a: i32, b: i32) -> i32;
+ }
+ vqsubs_s32_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqsubd_s64(a: i64, b: i64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.i64")]
+ fn vqsubd_s64_(a: i64, b: i64) -> i64;
+ }
+ vqsubd_s64_(a, b)
+}
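+
+// Illustrative sketch: the scalar saturating subtracts clamp at the type
+// limits instead of wrapping. Assumes core::arch::aarch64 in a consumer
+// crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vqsub_scalar_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn subtraction_saturates_instead_of_wrapping() {
+        unsafe {
+            assert_eq!(vqsubb_u8(3, 10), 0); // wrapping would give 249
+            assert_eq!(vqsubb_s8(i8::MIN, 1), i8::MIN);
+            assert_eq!(vqsubd_s64(i64::MIN, 1), i64::MIN);
+        }
+    }
+}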
+
+/// Reverse bit order
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rbit))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrbit_s8(a: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rbit.v8i8")]
+ fn vrbit_s8_(a: int8x8_t) -> int8x8_t;
+ }
+ vrbit_s8_(a)
+}
+
+/// Reverse bit order
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rbit))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrbitq_s8(a: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rbit.v16i8")]
+ fn vrbitq_s8_(a: int8x16_t) -> int8x16_t;
+ }
+ vrbitq_s8_(a)
+}
+
+/// Reverse bit order
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rbit))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrbit_u8(a: uint8x8_t) -> uint8x8_t {
+ transmute(vrbit_s8(transmute(a)))
+}
+
+/// Reverse bit order
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rbit))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrbitq_u8(a: uint8x16_t) -> uint8x16_t {
+ transmute(vrbitq_s8(transmute(a)))
+}
+
+/// Reverse bit order
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rbit))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrbit_p8(a: poly8x8_t) -> poly8x8_t {
+ transmute(vrbit_s8(transmute(a)))
+}
+
+/// Reverse bit order
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rbit))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrbitq_p8(a: poly8x16_t) -> poly8x16_t {
+ transmute(vrbitq_s8(transmute(a)))
+}
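+
+// Illustrative sketch: rbit reverses the bit order independently within each
+// 8-bit lane. Assumes core::arch::aarch64 in a consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vrbit_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn reverses_bits_within_each_byte() {
+        unsafe {
+            let v = vdup_n_u8(0b0000_0011);
+            assert_eq!(vget_lane_u8::<0>(vrbit_u8(v)), 0b1100_0000);
+        }
+    }
+}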
+
+/// Floating-point round to integral exact, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndx_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.rint.v2f32")]
+ fn vrndx_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrndx_f32_(a)
+}
+
+/// Floating-point round to integral exact, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndxq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.rint.v4f32")]
+ fn vrndxq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrndxq_f32_(a)
+}
+
+/// Floating-point round to integral exact, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndx_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.rint.v1f64")]
+ fn vrndx_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrndx_f64_(a)
+}
+
+/// Floating-point round to integral exact, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndxq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.rint.v2f64")]
+ fn vrndxq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrndxq_f64_(a)
+}
+
+/// Floating-point round to integral, to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frinta))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrnda_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.round.v2f32")]
+ fn vrnda_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrnda_f32_(a)
+}
+
+/// Floating-point round to integral, to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frinta))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndaq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.round.v4f32")]
+ fn vrndaq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrndaq_f32_(a)
+}
+
+/// Floating-point round to integral, to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frinta))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrnda_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.round.v1f64")]
+ fn vrnda_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrnda_f64_(a)
+}
+
+/// Floating-point round to integral, to nearest with ties to away
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frinta))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndaq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.round.v2f64")]
+ fn vrndaq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrndaq_f64_(a)
+}
+
+/// Floating-point round to integral, to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndn_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v1f64")]
+ fn vrndn_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrndn_f64_(a)
+}
+
+/// Floating-point round to integral, to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndnq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v2f64")]
+ fn vrndnq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrndnq_f64_(a)
+}
+
+/// Floating-point round to integral, to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndns_f32(a: f32) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.roundeven.f32")]
+ fn vrndns_f32_(a: f32) -> f32;
+ }
+ vrndns_f32_(a)
+}
+
+/// Floating-point round to integral, toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintm))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndm_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.floor.v2f32")]
+ fn vrndm_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrndm_f32_(a)
+}
+
+/// Floating-point round to integral, toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintm))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndmq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.floor.v4f32")]
+ fn vrndmq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrndmq_f32_(a)
+}
+
+/// Floating-point round to integral, toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintm))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndm_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.floor.v1f64")]
+ fn vrndm_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrndm_f64_(a)
+}
+
+/// Floating-point round to integral, toward minus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintm))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndmq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.floor.v2f64")]
+ fn vrndmq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrndmq_f64_(a)
+}
+
+/// Floating-point round to integral, toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndp_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ceil.v2f32")]
+ fn vrndp_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrndp_f32_(a)
+}
+
+/// Floating-point round to integral, toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndpq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ceil.v4f32")]
+ fn vrndpq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrndpq_f32_(a)
+}
+
+/// Floating-point round to integral, toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndp_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ceil.v1f64")]
+ fn vrndp_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrndp_f64_(a)
+}
+
+/// Floating-point round to integral, toward plus infinity
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndpq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ceil.v2f64")]
+ fn vrndpq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrndpq_f64_(a)
+}
+
+/// Floating-point round to integral, toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintz))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrnd_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.trunc.v2f32")]
+ fn vrnd_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrnd_f32_(a)
+}
+
+/// Floating-point round to integral, toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintz))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.trunc.v4f32")]
+ fn vrndq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrndq_f32_(a)
+}
+
+/// Floating-point round to integral, toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintz))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrnd_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.trunc.v1f64")]
+ fn vrnd_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrnd_f64_(a)
+}
+
+/// Floating-point round to integral, toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frintz))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.trunc.v2f64")]
+ fn vrndq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrndq_f64_(a)
+}
+
+/// Floating-point round to integral, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frinti))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndi_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.nearbyint.v2f32")]
+ fn vrndi_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrndi_f32_(a)
+}
+
+/// Floating-point round to integral, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frinti))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndiq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.nearbyint.v4f32")]
+ fn vrndiq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrndiq_f32_(a)
+}
+
+/// Floating-point round to integral, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frinti))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndi_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.nearbyint.v1f64")]
+ fn vrndi_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrndi_f64_(a)
+}
+
+/// Floating-point round to integral, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frinti))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrndiq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.nearbyint.v2f64")]
+ fn vrndiq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrndiq_f64_(a)
+}
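+
+// Illustrative sketch contrasting the rounding families above: vrnda*
+// (frinta) rounds ties away from zero, vrndn*/vrndns (frintn) rounds ties to
+// even, vrndm*/vrndp* floor/ceil, and vrnd* (frintz) truncates toward zero.
+// Assumes core::arch::aarch64 in a consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vrnd_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn tie_breaking_differs_between_frinta_and_frintn() {
+        unsafe {
+            assert_eq!(vget_lane_f32::<0>(vrnda_f32(vdup_n_f32(2.5))), 3.0);
+            assert_eq!(vrndns_f32(2.5), 2.0); // ties to even
+            assert_eq!(vget_lane_f32::<0>(vrnd_f32(vdup_n_f32(-2.5))), -2.0);
+        }
+    }
+}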
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqaddb_s8(a: i8, b: i8) -> i8 {
+ let a: int8x8_t = vdup_n_s8(a);
+ let b: int8x8_t = vdup_n_s8(b);
+ simd_extract(vqadd_s8(a, b), 0)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqaddh_s16(a: i16, b: i16) -> i16 {
+ let a: int16x4_t = vdup_n_s16(a);
+ let b: int16x4_t = vdup_n_s16(b);
+ simd_extract(vqadd_s16(a, b), 0)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqaddb_u8(a: u8, b: u8) -> u8 {
+ let a: uint8x8_t = vdup_n_u8(a);
+ let b: uint8x8_t = vdup_n_u8(b);
+ simd_extract(vqadd_u8(a, b), 0)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqaddh_u16(a: u16, b: u16) -> u16 {
+ let a: uint16x4_t = vdup_n_u16(a);
+ let b: uint16x4_t = vdup_n_u16(b);
+ simd_extract(vqadd_u16(a, b), 0)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqadds_u32(a: u32, b: u32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.i32")]
+ fn vqadds_u32_(a: u32, b: u32) -> u32;
+ }
+ vqadds_u32_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqaddd_u64(a: u64, b: u64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.i64")]
+ fn vqaddd_u64_(a: u64, b: u64) -> u64;
+ }
+ vqaddd_u64_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqadds_s32(a: i32, b: i32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.i32")]
+ fn vqadds_s32_(a: i32, b: i32) -> i32;
+ }
+ vqadds_s32_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqaddd_s64(a: i64, b: i64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.i64")]
+ fn vqaddd_s64_(a: i64, b: i64) -> i64;
+ }
+ vqaddd_s64_(a, b)
+}
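+
+// Illustrative sketch: the scalar saturating adds mirror the saturating
+// subtracts above, clamping at the type limits. Assumes core::arch::aarch64
+// in a consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vqadd_scalar_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn addition_saturates_at_the_type_limits() {
+        unsafe {
+            assert_eq!(vqaddb_u8(200, 100), u8::MAX); // wrapping would give 44
+            assert_eq!(vqaddb_s8(100, 100), i8::MAX);
+            assert_eq!(vqaddd_s64(i64::MAX, 1), i64::MAX);
+        }
+    }
+}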
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_f64_x2(a: *const f64) -> float64x1x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v1f64.p0f64")]
+ fn vld1_f64_x2_(a: *const f64) -> float64x1x2_t;
+ }
+ vld1_f64_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_f64_x2(a: *const f64) -> float64x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v2f64.p0f64")]
+ fn vld1q_f64_x2_(a: *const f64) -> float64x2x2_t;
+ }
+ vld1q_f64_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_f64_x3(a: *const f64) -> float64x1x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v1f64.p0f64")]
+ fn vld1_f64_x3_(a: *const f64) -> float64x1x3_t;
+ }
+ vld1_f64_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_f64_x3(a: *const f64) -> float64x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v2f64.p0f64")]
+ fn vld1q_f64_x3_(a: *const f64) -> float64x2x3_t;
+ }
+ vld1q_f64_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_f64_x4(a: *const f64) -> float64x1x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v1f64.p0f64")]
+ fn vld1_f64_x4_(a: *const f64) -> float64x1x4_t;
+ }
+ vld1_f64_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_f64_x4(a: *const f64) -> float64x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v2f64.p0f64")]
+ fn vld1q_f64_x4_(a: *const f64) -> float64x2x4_t;
+ }
+ vld1q_f64_x4_(a)
+}
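+
+// Illustrative sketch: the vld1_*_x2/_x3/_x4 loads read that many consecutive
+// vectors from a single pointer without de-interleaving (contrast with the
+// vld2/vld3 loads below). Assumes core::arch::aarch64 in a consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vld1_x2_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn loads_consecutive_vectors() {
+        unsafe {
+            let data: [f64; 2] = [1.0, 2.0];
+            let r = vld1_f64_x2(data.as_ptr());
+            assert_eq!(vget_lane_f64::<0>(r.0), 1.0);
+            assert_eq!(vget_lane_f64::<0>(r.1), 2.0);
+        }
+    }
+}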
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_s64(a: *const i64) -> int64x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2i64.p0v2i64")]
+ fn vld2q_s64_(ptr: *const int64x2_t) -> int64x2x2_t;
+ }
+ vld2q_s64_(a as _)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_u64(a: *const u64) -> uint64x2x2_t {
+ transmute(vld2q_s64(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_p64(a: *const p64) -> poly64x2x2_t {
+ transmute(vld2q_s64(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v1f64.p0v1f64")]
+ fn vld2_f64_(ptr: *const float64x1_t) -> float64x1x2_t;
+ }
+ vld2_f64_(a as _)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_f64(a: *const f64) -> float64x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2f64.p0v2f64")]
+ fn vld2q_f64_(ptr: *const float64x2_t) -> float64x2x2_t;
+ }
+ vld2q_f64_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2i64.p0i64")]
+ fn vld2q_dup_s64_(ptr: *const i64) -> int64x2x2_t;
+ }
+ vld2q_dup_s64_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_dup_u64(a: *const u64) -> uint64x2x2_t {
+ transmute(vld2q_dup_s64(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_dup_p64(a: *const p64) -> poly64x2x2_t {
+ transmute(vld2q_dup_s64(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_dup_f64(a: *const f64) -> float64x1x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1f64.p0f64")]
+ fn vld2_dup_f64_(ptr: *const f64) -> float64x1x2_t;
+ }
+ vld2_dup_f64_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_dup_f64(a: *const f64) -> float64x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2f64.p0f64")]
+ fn vld2q_dup_f64_(ptr: *const f64) -> float64x2x2_t;
+ }
+ vld2q_dup_f64_(a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x2_t) -> int8x16x2_t {
+ static_assert_imm4!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v16i8.p0i8")]
+ fn vld2q_lane_s8_(a: int8x16_t, b: int8x16_t, n: i64, ptr: *const i8) -> int8x16x2_t;
+ }
+ vld2q_lane_s8_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_lane_s64<const LANE: i32>(a: *const i64, b: int64x1x2_t) -> int64x1x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v1i64.p0i8")]
+ fn vld2_lane_s64_(a: int64x1_t, b: int64x1_t, n: i64, ptr: *const i8) -> int64x1x2_t;
+ }
+ vld2_lane_s64_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_s64<const LANE: i32>(a: *const i64, b: int64x2x2_t) -> int64x2x2_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2i64.p0i8")]
+ fn vld2q_lane_s64_(a: int64x2_t, b: int64x2_t, n: i64, ptr: *const i8) -> int64x2x2_t;
+ }
+ vld2q_lane_s64_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_lane_p64<const LANE: i32>(a: *const p64, b: poly64x1x2_t) -> poly64x1x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vld2_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_p64<const LANE: i32>(a: *const p64, b: poly64x2x2_t) -> poly64x2x2_t {
+ static_assert_imm1!(LANE);
+ transmute(vld2q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_u8<const LANE: i32>(a: *const u8, b: uint8x16x2_t) -> uint8x16x2_t {
+ static_assert_imm4!(LANE);
+ transmute(vld2q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_lane_u64<const LANE: i32>(a: *const u64, b: uint64x1x2_t) -> uint64x1x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vld2_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_u64<const LANE: i32>(a: *const u64, b: uint64x2x2_t) -> uint64x2x2_t {
+ static_assert_imm1!(LANE);
+ transmute(vld2q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_p8<const LANE: i32>(a: *const p8, b: poly8x16x2_t) -> poly8x16x2_t {
+ static_assert_imm4!(LANE);
+ transmute(vld2q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_lane_f64<const LANE: i32>(a: *const f64, b: float64x1x2_t) -> float64x1x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v1f64.p0i8")]
+ fn vld2_lane_f64_(a: float64x1_t, b: float64x1_t, n: i64, ptr: *const i8) -> float64x1x2_t;
+ }
+ vld2_lane_f64_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x2_t) -> float64x2x2_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2f64.p0i8")]
+ fn vld2q_lane_f64_(a: float64x2_t, b: float64x2_t, n: i64, ptr: *const i8) -> float64x2x2_t;
+ }
+ vld2q_lane_f64_(b.0, b.1, LANE as i64, a as _)
+}
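+
+// Illustrative sketch: unlike the vld1_*_x2 loads, vld2 de-interleaves:
+// even-indexed elements land in the first register of the pair and
+// odd-indexed elements in the second. Assumes core::arch::aarch64 in a
+// consumer crate.
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vld2_sketch {
+    use core::arch::aarch64::*;
+
+    #[test]
+    fn deinterleaves_element_pairs() {
+        unsafe {
+            let data: [i64; 4] = [1, 2, 3, 4];
+            let r = vld2q_s64(data.as_ptr());
+            assert_eq!(vgetq_lane_s64::<0>(r.0), 1);
+            assert_eq!(vgetq_lane_s64::<1>(r.0), 3);
+            assert_eq!(vgetq_lane_s64::<0>(r.1), 2);
+            assert_eq!(vgetq_lane_s64::<1>(r.1), 4);
+        }
+    }
+}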
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_s64(a: *const i64) -> int64x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2i64.p0v2i64")]
+ fn vld3q_s64_(ptr: *const int64x2_t) -> int64x2x3_t;
+ }
+ vld3q_s64_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_u64(a: *const u64) -> uint64x2x3_t {
+ transmute(vld3q_s64(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_p64(a: *const p64) -> poly64x2x3_t {
+ transmute(vld3q_s64(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v1f64.p0v1f64")]
+ fn vld3_f64_(ptr: *const float64x1_t) -> float64x1x3_t;
+ }
+ vld3_f64_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_f64(a: *const f64) -> float64x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2f64.p0v2f64")]
+ fn vld3q_f64_(ptr: *const float64x2_t) -> float64x2x3_t;
+ }
+ vld3q_f64_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_dup_s64(a: *const i64) -> int64x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2i64.p0i64")]
+ fn vld3q_dup_s64_(ptr: *const i64) -> int64x2x3_t;
+ }
+ vld3q_dup_s64_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_dup_u64(a: *const u64) -> uint64x2x3_t {
+ transmute(vld3q_dup_s64(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_dup_p64(a: *const p64) -> poly64x2x3_t {
+ transmute(vld3q_dup_s64(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_dup_f64(a: *const f64) -> float64x1x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v1f64.p0f64")]
+ fn vld3_dup_f64_(ptr: *const f64) -> float64x1x3_t;
+ }
+ vld3_dup_f64_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_dup_f64(a: *const f64) -> float64x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2f64.p0f64")]
+ fn vld3q_dup_f64_(ptr: *const f64) -> float64x2x3_t;
+ }
+ vld3q_dup_f64_(a as _)
+}
+
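+// Usage sketch (illustrative, not part of the generated bindings; the example
+// function name and test values are assumed): the _dup variants read a single
+// 3-element structure and broadcast each element to every lane of its
+// destination register.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vld3q_dup_f64_example() {
+    let xyz: [f64; 3] = [1.0, 2.0, 3.0];
+    let v = vld3q_dup_f64(xyz.as_ptr());
+    // Both lanes of each register hold the same replicated element.
+    assert_eq!(vgetq_lane_f64::<0>(v.1), 2.0);
+    assert_eq!(vgetq_lane_f64::<1>(v.1), 2.0);
+}
+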
+/// Load multiple 3-element structures to three registers

+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x3_t) -> int8x16x3_t {
+ static_assert_imm4!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v16i8.p0i8")]
+ fn vld3q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, n: i64, ptr: *const i8) -> int8x16x3_t;
+ }
+ vld3q_lane_s8_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_lane_s64<const LANE: i32>(a: *const i64, b: int64x1x3_t) -> int64x1x3_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v1i64.p0i8")]
+ fn vld3_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, n: i64, ptr: *const i8) -> int64x1x3_t;
+ }
+ vld3_lane_s64_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_s64<const LANE: i32>(a: *const i64, b: int64x2x3_t) -> int64x2x3_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2i64.p0i8")]
+ fn vld3q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, n: i64, ptr: *const i8) -> int64x2x3_t;
+ }
+ vld3q_lane_s64_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_lane_p64<const LANE: i32>(a: *const p64, b: poly64x1x3_t) -> poly64x1x3_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vld3_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_p64<const LANE: i32>(a: *const p64, b: poly64x2x3_t) -> poly64x2x3_t {
+ static_assert_imm1!(LANE);
+ transmute(vld3q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_p8<const LANE: i32>(a: *const p8, b: poly8x16x3_t) -> poly8x16x3_t {
+ static_assert_imm4!(LANE);
+ transmute(vld3q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_u8<const LANE: i32>(a: *const u8, b: uint8x16x3_t) -> uint8x16x3_t {
+ static_assert_imm4!(LANE);
+ transmute(vld3q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_lane_u64<const LANE: i32>(a: *const u64, b: uint64x1x3_t) -> uint64x1x3_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vld3_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_u64<const LANE: i32>(a: *const u64, b: uint64x2x3_t) -> uint64x2x3_t {
+ static_assert_imm1!(LANE);
+ transmute(vld3q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_lane_f64<const LANE: i32>(a: *const f64, b: float64x1x3_t) -> float64x1x3_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v1f64.p0i8")]
+ fn vld3_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, n: i64, ptr: *const i8) -> float64x1x3_t;
+ }
+ vld3_lane_f64_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x3_t) -> float64x2x3_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2f64.p0i8")]
+ fn vld3q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, n: i64, ptr: *const i8) -> float64x2x3_t;
+ }
+ vld3q_lane_f64_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_s64(a: *const i64) -> int64x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2i64.p0v2i64")]
+ fn vld4q_s64_(ptr: *const int64x2_t) -> int64x2x4_t;
+ }
+ vld4q_s64_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_u64(a: *const u64) -> uint64x2x4_t {
+ transmute(vld4q_s64(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_p64(a: *const p64) -> poly64x2x4_t {
+ transmute(vld4q_s64(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1f64.p0v1f64")]
+ fn vld4_f64_(ptr: *const float64x1_t) -> float64x1x4_t;
+ }
+ vld4_f64_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_f64(a: *const f64) -> float64x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2f64.p0v2f64")]
+ fn vld4q_f64_(ptr: *const float64x2_t) -> float64x2x4_t;
+ }
+ vld4q_f64_(a as _)
+}
+
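+// Usage sketch (illustrative, not part of the generated bindings; the example
+// function name and test values are assumed): vld4q_f64 de-interleaves eight
+// consecutive f64 values into four registers, one per structure field, e.g.
+// RGBA data split into separate R, G, B and A registers.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vld4q_f64_example() {
+    let rgba: [f64; 8] = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+    let v = vld4q_f64(rgba.as_ptr());
+    assert_eq!(vgetq_lane_f64::<0>(v.0), 1.0); // first R
+    assert_eq!(vgetq_lane_f64::<1>(v.0), 5.0); // second R
+    assert_eq!(vgetq_lane_f64::<1>(v.3), 8.0); // second A
+}
+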
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_dup_s64(a: *const i64) -> int64x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2i64.p0i64")]
+ fn vld4q_dup_s64_(ptr: *const i64) -> int64x2x4_t;
+ }
+ vld4q_dup_s64_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_dup_u64(a: *const u64) -> uint64x2x4_t {
+ transmute(vld4q_dup_s64(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_dup_p64(a: *const p64) -> poly64x2x4_t {
+ transmute(vld4q_dup_s64(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_dup_f64(a: *const f64) -> float64x1x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v1f64.p0f64")]
+ fn vld4_dup_f64_(ptr: *const f64) -> float64x1x4_t;
+ }
+ vld4_dup_f64_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_dup_f64(a: *const f64) -> float64x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2f64.p0f64")]
+ fn vld4q_dup_f64_(ptr: *const f64) -> float64x2x4_t;
+ }
+ vld4q_dup_f64_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x4_t) -> int8x16x4_t {
+ static_assert_imm4!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v16i8.p0i8")]
+ fn vld4q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, n: i64, ptr: *const i8) -> int8x16x4_t;
+ }
+ vld4q_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_lane_s64<const LANE: i32>(a: *const i64, b: int64x1x4_t) -> int64x1x4_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v1i64.p0i8")]
+ fn vld4_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, n: i64, ptr: *const i8) -> int64x1x4_t;
+ }
+ vld4_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_s64<const LANE: i32>(a: *const i64, b: int64x2x4_t) -> int64x2x4_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2i64.p0i8")]
+ fn vld4q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, n: i64, ptr: *const i8) -> int64x2x4_t;
+ }
+ vld4q_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_lane_p64<const LANE: i32>(a: *const p64, b: poly64x1x4_t) -> poly64x1x4_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vld4_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_p64<const LANE: i32>(a: *const p64, b: poly64x2x4_t) -> poly64x2x4_t {
+ static_assert_imm1!(LANE);
+ transmute(vld4q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_p8<const LANE: i32>(a: *const p8, b: poly8x16x4_t) -> poly8x16x4_t {
+ static_assert_imm4!(LANE);
+ transmute(vld4q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_u8<const LANE: i32>(a: *const u8, b: uint8x16x4_t) -> uint8x16x4_t {
+ static_assert_imm4!(LANE);
+ transmute(vld4q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_lane_u64<const LANE: i32>(a: *const u64, b: uint64x1x4_t) -> uint64x1x4_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vld4_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_u64<const LANE: i32>(a: *const u64, b: uint64x2x4_t) -> uint64x2x4_t {
+ static_assert_imm1!(LANE);
+ transmute(vld4q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_lane_f64<const LANE: i32>(a: *const f64, b: float64x1x4_t) -> float64x1x4_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v1f64.p0i8")]
+ fn vld4_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, n: i64, ptr: *const i8) -> float64x1x4_t;
+ }
+ vld4_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x4_t) -> float64x2x4_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2f64.p0i8")]
+ fn vld4q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, n: i64, ptr: *const i8) -> float64x2x4_t;
+ }
+ vld4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
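+// Usage sketch (illustrative, not part of the generated bindings; the example
+// function name and test values are assumed): vst1q_lane stores one selected
+// lane of a vector back to memory as a plain scalar write.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vst1q_lane_f64_example() {
+    let v = vld1q_f64([1.0f64, 2.0].as_ptr());
+    let mut out = 0.0f64;
+    // Write only lane 1 of the vector to `out`.
+    vst1q_lane_f64::<1>(&mut out, v);
+    assert_eq!(out, 2.0);
+}
+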
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_f64_x2(a: *mut f64, b: float64x1x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v1f64.p0f64")]
+ fn vst1_f64_x2_(a: float64x1_t, b: float64x1_t, ptr: *mut f64);
+ }
+ vst1_f64_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_f64_x2(a: *mut f64, b: float64x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2f64.p0f64")]
+ fn vst1q_f64_x2_(a: float64x2_t, b: float64x2_t, ptr: *mut f64);
+ }
+ vst1q_f64_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_f64_x3(a: *mut f64, b: float64x1x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v1f64.p0f64")]
+ fn vst1_f64_x3_(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut f64);
+ }
+ vst1_f64_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_f64_x3(a: *mut f64, b: float64x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2f64.p0f64")]
+ fn vst1q_f64_x3_(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut f64);
+ }
+ vst1q_f64_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_f64_x4(a: *mut f64, b: float64x1x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v1f64.p0f64")]
+ fn vst1_f64_x4_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, ptr: *mut f64);
+ }
+ vst1_f64_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_f64_x4(a: *mut f64, b: float64x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v2f64.p0f64")]
+ fn vst1q_f64_x4_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, ptr: *mut f64);
+ }
+ vst1q_f64_x4_(b.0, b.1, b.2, b.3, a)
+}
+
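+// Usage sketch (illustrative, not part of the generated bindings; the example
+// function name and test values are assumed): the _x4 store writes four
+// registers to consecutive memory without interleaving, i.e. eight f64 values
+// laid out register by register.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vst1q_f64_x4_example() {
+    let b = float64x2x4_t(
+        vdupq_n_f64(1.0),
+        vdupq_n_f64(2.0),
+        vdupq_n_f64(3.0),
+        vdupq_n_f64(4.0),
+    );
+    let mut out = [0.0f64; 8];
+    vst1q_f64_x4(out.as_mut_ptr(), b);
+    assert_eq!(out, [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0]);
+}
+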
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_s64(a: *mut i64, b: int64x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2i64.p0i8")]
+ fn vst2q_s64_(a: int64x2_t, b: int64x2_t, ptr: *mut i8);
+ }
+ vst2q_s64_(b.0, b.1, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_u64(a: *mut u64, b: uint64x2x2_t) {
+ transmute(vst2q_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_p64(a: *mut p64, b: poly64x2x2_t) {
+ transmute(vst2q_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_f64(a: *mut f64, b: float64x1x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v1f64.p0i8")]
+ fn vst2_f64_(a: float64x1_t, b: float64x1_t, ptr: *mut i8);
+ }
+ vst2_f64_(b.0, b.1, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2f64.p0i8")]
+ fn vst2q_f64_(a: float64x2_t, b: float64x2_t, ptr: *mut i8);
+ }
+ vst2q_f64_(b.0, b.1, a as _)
+}
+
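+// Usage sketch (illustrative, not part of the generated bindings; the example
+// function name and test values are assumed): unlike the _x2 stores above,
+// vst2q_f64 interleaves its two registers element-wise, so a vld2q/vst2q
+// round trip preserves structure-of-arrays data.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vst2q_f64_example() {
+    let b = float64x2x2_t(vdupq_n_f64(1.0), vdupq_n_f64(2.0));
+    let mut out = [0.0f64; 4];
+    vst2q_f64(out.as_mut_ptr(), b);
+    // Elements alternate between the two source registers.
+    assert_eq!(out, [1.0, 2.0, 1.0, 2.0]);
+}
+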
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x2_t) {
+ static_assert_imm4!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v16i8.p0i8")]
+ fn vst2q_lane_s8_(a: int8x16_t, b: int8x16_t, n: i64, ptr: *mut i8);
+ }
+ vst2q_lane_s8_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x2_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v1i64.p0i8")]
+ fn vst2_lane_s64_(a: int64x1_t, b: int64x1_t, n: i64, ptr: *mut i8);
+ }
+ vst2_lane_s64_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x2_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2i64.p0i8")]
+ fn vst2q_lane_s64_(a: int64x2_t, b: int64x2_t, n: i64, ptr: *mut i8);
+ }
+ vst2q_lane_s64_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x16x2_t) {
+ static_assert_imm4!(LANE);
+ transmute(vst2q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x2_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vst2_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x2x2_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst2q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x16x2_t) {
+ static_assert_imm4!(LANE);
+ transmute(vst2q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x1x2_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vst2_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x2x2_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst2q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x2_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v1f64.p0i8")]
+ fn vst2_lane_f64_(a: float64x1_t, b: float64x1_t, n: i64, ptr: *mut i8);
+ }
+ vst2_lane_f64_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x2_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2f64.p0i8")]
+ fn vst2q_lane_f64_(a: float64x2_t, b: float64x2_t, n: i64, ptr: *mut i8);
+ }
+ vst2q_lane_f64_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_s64(a: *mut i64, b: int64x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2i64.p0i8")]
+ fn vst3q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, ptr: *mut i8);
+ }
+ vst3q_s64_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_u64(a: *mut u64, b: uint64x2x3_t) {
+ transmute(vst3q_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_p64(a: *mut p64, b: poly64x2x3_t) {
+ transmute(vst3q_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_f64(a: *mut f64, b: float64x1x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v1f64.p0i8")]
+ fn vst3_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut i8);
+ }
+ vst3_f64_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2f64.p0i8")]
+ fn vst3q_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut i8);
+ }
+ vst3q_f64_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x3_t) {
+ static_assert_imm4!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v16i8.p0i8")]
+ fn vst3q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, n: i64, ptr: *mut i8);
+ }
+ vst3q_lane_s8_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x3_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v1i64.p0i8")]
+ fn vst3_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, n: i64, ptr: *mut i8);
+ }
+ vst3_lane_s64_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x3_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2i64.p0i8")]
+ fn vst3q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, n: i64, ptr: *mut i8);
+ }
+ vst3q_lane_s64_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x16x3_t) {
+ static_assert_imm4!(LANE);
+ transmute(vst3q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x3_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vst3_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x2x3_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst3q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x16x3_t) {
+ static_assert_imm4!(LANE);
+ transmute(vst3q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x1x3_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vst3_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x2x3_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst3q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x3_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v1f64.p0i8")]
+ fn vst3_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, n: i64, ptr: *mut i8);
+ }
+ vst3_lane_f64_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x3_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2f64.p0i8")]
+ fn vst3q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, n: i64, ptr: *mut i8);
+ }
+ vst3q_lane_f64_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_s64(a: *mut i64, b: int64x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2i64.p0i8")]
+ fn vst4q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, ptr: *mut i8);
+ }
+ vst4q_s64_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_u64(a: *mut u64, b: uint64x2x4_t) {
+ transmute(vst4q_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_p64(a: *mut p64, b: poly64x2x4_t) {
+ transmute(vst4q_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_f64(a: *mut f64, b: float64x1x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v1f64.p0i8")]
+ fn vst4_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, ptr: *mut i8);
+ }
+ vst4_f64_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2f64.p0i8")]
+ fn vst4q_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, ptr: *mut i8);
+ }
+ vst4q_f64_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x4_t) {
+ static_assert_imm4!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v16i8.p0i8")]
+ fn vst4q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, n: i64, ptr: *mut i8);
+ }
+ vst4q_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x4_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v1i64.p0i8")]
+ fn vst4_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, n: i64, ptr: *mut i8);
+ }
+ vst4_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x4_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2i64.p0i8")]
+ fn vst4q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, n: i64, ptr: *mut i8);
+ }
+ vst4q_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x16x4_t) {
+ static_assert_imm4!(LANE);
+ transmute(vst4q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x4_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vst4_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x2x4_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst4q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x16x4_t) {
+ static_assert_imm4!(LANE);
+ transmute(vst4q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x1x4_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ transmute(vst4_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x2x4_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst4q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x4_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v1f64.p0i8")]
+ fn vst4_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, n: i64, ptr: *mut i8);
+ }
+ vst4_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x4_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2f64.p0i8")]
+ fn vst4q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, n: i64, ptr: *mut i8);
+ }
+ vst4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmul_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ simd_mul(a, b)
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_mul(a, b)
+}
+
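+// Usage sketch (illustrative, not part of the generated bindings; the example
+// function name and test values are assumed): lane-wise multiplication of two
+// f64 vectors.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vmulq_f64_example() {
+    let a = vld1q_f64([2.0f64, 3.0].as_ptr());
+    let b = vld1q_f64([10.0f64, 100.0].as_ptr());
+    let r = vmulq_f64(a, b);
+    assert_eq!(vgetq_lane_f64::<0>(r), 20.0);
+    assert_eq!(vgetq_lane_f64::<1>(r), 300.0);
+}
+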
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmul_n_f64(a: float64x1_t, b: f64) -> float64x1_t {
+ simd_mul(a, vdup_n_f64(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulq_n_f64(a: float64x2_t, b: f64) -> float64x2_t {
+ simd_mul(a, vdupq_n_f64(b))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmul_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_mul(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmul_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) -> float64x1_t {
+ static_assert_imm1!(LANE);
+ simd_mul(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(LANE);
+ simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmuls_lane_f32<const LANE: i32>(a: f32, b: float32x2_t) -> f32 {
+ static_assert_imm1!(LANE);
+ let b: f32 = simd_extract(b, LANE as u32);
+ a * b
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmuls_laneq_f32<const LANE: i32>(a: f32, b: float32x4_t) -> f32 {
+ static_assert_imm2!(LANE);
+ let b: f32 = simd_extract(b, LANE as u32);
+ a * b
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmuld_lane_f64<const LANE: i32>(a: f64, b: float64x1_t) -> f64 {
+ static_assert!(LANE : i32 where LANE == 0);
+ let b: f64 = simd_extract(b, LANE as u32);
+ a * b
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmuld_laneq_f64<const LANE: i32>(a: f64, b: float64x2_t) -> f64 {
+ static_assert_imm1!(LANE);
+ let b: f64 = simd_extract(b, LANE as u32);
+ a * b
+}
+
+/// Signed multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
+ let a: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmull_s8(a, b)
+}
+
+/// Signed multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+ let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ vmull_s16(a, b)
+}
+
+/// Signed multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+ let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ vmull_s32(a, b)
+}
+
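+// Usage sketch (illustrative, not part of the generated bindings; the example
+// function name and test values are assumed): the _high variants multiply the
+// upper halves of their inputs and widen the result, here i16 lanes 4..8
+// producing four i32 products.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vmull_high_s16_example() {
+    let a = vld1q_s16([0i16, 0, 0, 0, 100, 200, 300, 400].as_ptr());
+    let b = vdupq_n_s16(1000);
+    // Only lanes 4..8 participate; the products exceed i16::MAX but do not
+    // wrap because the result lanes are 32 bits wide.
+    let r = vmull_high_s16(a, b);
+    assert_eq!(vgetq_lane_s32::<0>(r), 100_000);
+    assert_eq!(vgetq_lane_s32::<3>(r), 400_000);
+}
+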
+/// Unsigned multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
+ let a: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmull_u8(a, b)
+}
+
+/// Unsigned multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
+ let a: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ vmull_u16(a, b)
+}
+
+/// Unsigned multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
+ let a: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ vmull_u32(a, b)
+}
+
+/// Polynomial multiply long
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(pmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_p64(a: p64, b: p64) -> p128 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmull64")]
+ fn vmull_p64_(a: p64, b: p64) -> int8x16_t;
+ }
+ transmute(vmull_p64_(a, b))
+}
+
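+// Usage sketch (illustrative, not part of the generated bindings; the example
+// function name and test values are assumed): vmull_p64 is a carry-less
+// (polynomial, GF(2)) multiply, the building block of GHASH- and CRC-style
+// algorithms; XOR plays the role of addition, so no carries propagate
+// between bit positions.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "neon,aes")]
+unsafe fn vmull_p64_example() {
+    // (x + 1) * (x + 1) = x^2 + 1 over GF(2): 0b11 * 0b11 == 0b101, not 9.
+    assert_eq!(vmull_p64(0b11, 0b11), 0b101);
+}
+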
+/// Polynomial multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(pmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t {
+ let a: poly8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let b: poly8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmull_p8(a, b)
+}
+
+/// Polynomial multiply long
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(pmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_p64(a: poly64x2_t, b: poly64x2_t) -> p128 {
+ vmull_p64(simd_extract(a, 1), simd_extract(b, 1))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t {
+ vmull_high_s16(a, vdupq_n_s16(b))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t {
+ vmull_high_s32(a, vdupq_n_s32(b))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_n_u16(a: uint16x8_t, b: u16) -> uint32x4_t {
+ vmull_high_u16(a, vdupq_n_u16(b))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_n_u32(a: uint32x4_t, b: u32) -> uint64x2_t {
+ vmull_high_u32(a, vdupq_n_u32(b))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vmull_high_s16(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(LANE);
+ vmull_high_s16(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE);
+ vmull_high_s32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(LANE);
+ vmull_high_s32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ vmull_high_u16(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
+ static_assert_imm3!(LANE);
+ vmull_high_u16(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE);
+ vmull_high_u32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmull_high_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
+ static_assert_imm2!(LANE);
+ vmull_high_u32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
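+
+// The `_lane_`/`_laneq_` forms are the `_n_` forms with the scalar taken from
+// a vector lane chosen at compile time (hedged sketch):
+//     vmull_high_lane_s16::<1>(a, b) // == vmull_high_n_s16(a, vget_lane_s16::<1>(b))
+// The static_assert_imm* guards reject out-of-range LANE values at compile time.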
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulx_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v2f32")]
+ fn vmulx_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vmulx_f32_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v4f32")]
+ fn vmulxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vmulxq_f32_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulx_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v1f64")]
+ fn vmulx_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+ }
+ vmulx_f64_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v2f64")]
+ fn vmulxq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vmulxq_f64_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulx_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ vmulx_f64(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulx_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) -> float64x1_t {
+ static_assert_imm1!(LANE);
+ vmulx_f64(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulx_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ vmulx_f32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulx_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
+ static_assert_imm2!(LANE);
+ vmulx_f32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ vmulxq_f32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE);
+ vmulxq_f32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ vmulxq_f64(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(LANE);
+ vmulxq_f64(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxs_f32(a: f32, b: f32) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.f32")]
+ fn vmulxs_f32_(a: f32, b: f32) -> f32;
+ }
+ vmulxs_f32_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxd_f64(a: f64, b: f64) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.f64")]
+ fn vmulxd_f64_(a: f64, b: f64) -> f64;
+ }
+ vmulxd_f64_(a, b)
+}
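+
+// Hedged sketch of why `fmulx` exists: it behaves like an ordinary multiply
+// except that (+/-)0.0 * (+/-)Inf returns (+/-)2.0 instead of NaN, which keeps
+// reciprocal-estimate algorithms well-defined:
+//     assert_eq!(vmulxs_f32(0.0, f32::INFINITY), 2.0); // fmul would give NaN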
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxs_lane_f32<const LANE: i32>(a: f32, b: float32x2_t) -> f32 {
+ static_assert_imm1!(LANE);
+ vmulxs_f32(a, simd_extract(b, LANE as u32))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxs_laneq_f32<const LANE: i32>(a: f32, b: float32x4_t) -> f32 {
+ static_assert_imm2!(LANE);
+ vmulxs_f32(a, simd_extract(b, LANE as u32))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxd_lane_f64<const LANE: i32>(a: f64, b: float64x1_t) -> f64 {
+ static_assert!(LANE : i32 where LANE == 0);
+ vmulxd_f64(a, simd_extract(b, LANE as u32))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmulxd_laneq_f64<const LANE: i32>(a: f64, b: float64x2_t) -> f64 {
+ static_assert_imm1!(LANE);
+ vmulxd_f64(a, simd_extract(b, LANE as u32))
+}
+
+/// Floating-point fused multiply-add to accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfma_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v1f64")]
+ fn vfma_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
+ }
+ vfma_f64_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f64")]
+ fn vfmaq_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
+ }
+ vfmaq_f64_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
+ vfma_f64(a, b, vdup_n_f64(c))
+}
+
+/// Floating-point fused multiply-add to accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
+ vfmaq_f64(a, b, vdupq_n_f64(c))
+}
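+
+// Note the accumulator-first argument order: vfma_f64(a, b, c) computes
+// a + b * c with a single rounding. Hedged numeric sketch:
+//     let r = vfma_n_f64(vdup_n_f64(1.0), vdup_n_f64(2.0), 3.0);
+//     // the f64 lane of r is 1.0 + 2.0 * 3.0 == 7.0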
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfma_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfma_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+ static_assert_imm2!(LANE);
+ vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmaq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE);
+ vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmadd, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfma_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfma_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t {
+ static_assert_imm1!(LANE);
+ vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmaq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmaq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(LANE);
+ vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
+}
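+
+// The lane forms broadcast c[LANE] before fusing, so (hedged sketch)
+// vfma_lane_f32::<1>(a, b, c) accumulates a + b * c[1] into every lane, with
+// the lane index checked at compile time by the static_assert_imm* guards.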
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmas_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f32")]
+ fn vfmas_lane_f32_(a: f32, b: f32, c: f32) -> f32;
+ }
+ static_assert_imm1!(LANE);
+ let c: f32 = simd_extract(c, LANE as u32);
+ vfmas_lane_f32_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmas_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f32")]
+ fn vfmas_laneq_f32_(a: f32, b: f32, c: f32) -> f32;
+ }
+ static_assert_imm2!(LANE);
+ let c: f32 = simd_extract(c, LANE as u32);
+ vfmas_laneq_f32_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmadd, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmad_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f64")]
+ fn vfmad_lane_f64_(a: f64, b: f64, c: f64) -> f64;
+ }
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: f64 = simd_extract(c, LANE as u32);
+ vfmad_lane_f64_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmad_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f64")]
+ fn vfmad_laneq_f64_(a: f64, b: f64, c: f64) -> f64;
+ }
+ static_assert_imm1!(LANE);
+ let c: f64 = simd_extract(c, LANE as u32);
+ vfmad_laneq_f64_(b, c, a)
+}
+
+/// Floating-point fused multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfms_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+ let b: float64x1_t = simd_neg(b);
+ vfma_f64(a, b, c)
+}
+
+/// Floating-point fused multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ let b: float64x2_t = simd_neg(b);
+ vfmaq_f64(a, b, c)
+}
+
+/// Floating-point fused multiply-subtract from accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfms_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
+ vfms_f64(a, b, vdup_n_f64(c))
+}
+
+/// Floating-point fused multiply-subtract from accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmsq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
+ vfmsq_f64(a, b, vdupq_n_f64(c))
+}
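+
+// The `vfms*` family negates `b` and reuses the fused add, so it computes
+// a - b * c. Hedged numeric sketch:
+//     let r = vfms_n_f64(vdup_n_f64(10.0), vdup_n_f64(2.0), 3.0);
+//     // the f64 lane of r is 10.0 - 2.0 * 3.0 == 4.0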
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfms_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfms_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+ static_assert_imm2!(LANE);
+ vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmsq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmsq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE);
+ vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmsub, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfms_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfms_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t {
+ static_assert_imm1!(LANE);
+ vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmsq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmsq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(LANE);
+ vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmss_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 {
+ vfmas_lane_f32::<LANE>(a, -b, c)
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmss_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 {
+ vfmas_laneq_f32::<LANE>(a, -b, c)
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmsub, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmsd_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 {
+ vfmad_lane_f64::<LANE>(a, -b, c)
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vfmsd_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 {
+ vfmad_laneq_f64::<LANE>(a, -b, c)
+}
+
+/// Divide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fdiv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdiv_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_div(a, b)
+}
+
+/// Divide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fdiv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdivq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_div(a, b)
+}
+
+/// Divide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fdiv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdiv_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ simd_div(a, b)
+}
+
+/// Divide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fdiv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdivq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_div(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsub_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsub))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubd_s64(a: i64, b: i64) -> i64 {
+ a.wrapping_sub(b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubd_u64(a: u64, b: u64) -> u64 {
+ a.wrapping_sub(b)
+}
+
+/// Add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 {
+ a.wrapping_add(b)
+}
+
+/// Add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 {
+ a.wrapping_add(b)
+}
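+
+// These scalar forms wrap on overflow, matching the two's-complement
+// behaviour of the underlying register arithmetic (hedged sketch):
+//     assert_eq!(vaddd_s64(i64::MAX, 1), i64::MIN);
+//     assert_eq!(vsubd_u64(0, 1), u64::MAX);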
+
+/// Floating-point add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(faddp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddv_f32(a: float32x2_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.faddv.f32.v2f32")]
+ fn vaddv_f32_(a: float32x2_t) -> f32;
+ }
+ vaddv_f32_(a)
+}
+
+/// Floating-point add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(faddp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_f32(a: float32x4_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.faddv.f32.v4f32")]
+ fn vaddvq_f32_(a: float32x4_t) -> f32;
+ }
+ vaddvq_f32_(a)
+}
+
+/// Floating-point add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(faddp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_f64(a: float64x2_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.faddv.f64.v2f64")]
+ fn vaddvq_f64_(a: float64x2_t) -> f64;
+ }
+ vaddvq_f64_(a)
+}
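+
+// Hedged sketch of the horizontal reduction: all lanes collapse to one scalar.
+//     let v = vld1q_f32([1.0f32, 2.0, 3.0, 4.0].as_ptr());
+//     assert_eq!(vaddvq_f32(v), 10.0);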
+
+/// Signed Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(saddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlv_s16(a: int16x4_t) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlv.i32.v4i16")]
+ fn vaddlv_s16_(a: int16x4_t) -> i32;
+ }
+ vaddlv_s16_(a)
+}
+
+/// Signed Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(saddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlvq_s16(a: int16x8_t) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlv.i32.v8i16")]
+ fn vaddlvq_s16_(a: int16x8_t) -> i32;
+ }
+ vaddlvq_s16_(a)
+}
+
+/// Signed Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(saddlp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlv_s32(a: int32x2_t) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlv.i64.v2i32")]
+ fn vaddlv_s32_(a: int32x2_t) -> i64;
+ }
+ vaddlv_s32_(a)
+}
+
+/// Signed Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(saddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlvq_s32(a: int32x4_t) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlv.i64.v4i32")]
+ fn vaddlvq_s32_(a: int32x4_t) -> i64;
+ }
+ vaddlvq_s32_(a)
+}
+
+/// Unsigned Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uaddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlv_u16(a: uint16x4_t) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlv.i32.v4i16")]
+ fn vaddlv_u16_(a: uint16x4_t) -> u32;
+ }
+ vaddlv_u16_(a)
+}
+
+/// Unsigned Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uaddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlvq_u16(a: uint16x8_t) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlv.i32.v8i16")]
+ fn vaddlvq_u16_(a: uint16x8_t) -> u32;
+ }
+ vaddlvq_u16_(a)
+}
+
+/// Unsigned Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uaddlp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlv_u32(a: uint32x2_t) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlv.i64.v2i32")]
+ fn vaddlv_u32_(a: uint32x2_t) -> u64;
+ }
+ vaddlv_u32_(a)
+}
+
+/// Unsigned Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uaddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlvq_u32(a: uint32x4_t) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlv.i64.v4i32")]
+ fn vaddlvq_u32_(a: uint32x4_t) -> u64;
+ }
+ vaddlvq_u32_(a)
+}
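+
+// The "long" reductions widen before summing, so the total cannot overflow
+// the element type (hedged sketch):
+//     let v = vdup_n_s16(i16::MAX); // [32767; 4]
+//     assert_eq!(vaddlv_s16(v), 131068); // 4 * 32767, too large for an i16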
+
+/// Signed Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ssubw))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
+ let c: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ simd_sub(a, simd_cast(c))
+}
+
+/// Signed Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ssubw))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
+ let c: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ simd_sub(a, simd_cast(c))
+}
+
+/// Signed Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ssubw))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
+ let c: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ simd_sub(a, simd_cast(c))
+}
+
+/// Unsigned Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usubw))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
+ let c: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ simd_sub(a, simd_cast(c))
+}
+
+/// Unsigned Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usubw))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
+ let c: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ simd_sub(a, simd_cast(c))
+}
+
+/// Unsigned Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usubw))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
+ let c: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ simd_sub(a, simd_cast(c))
+}
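+
+// The `_high` wide forms use only the upper half of the narrow operand: each
+// upper lane of `b` is sign-extended (zero-extended for the unsigned forms)
+// and subtracted from the matching wide lane of `a`. Hedged sketch:
+//     // a = [100; 8] (i16), b = [0, 1, ..., 15] (i8)
+//     // vsubw_high_s8(a, b) yields [92, 91, 90, 89, 88, 87, 86, 85]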
+
+/// Signed Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ssubl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
+ let c: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let d: int16x8_t = simd_cast(c);
+ let e: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let f: int16x8_t = simd_cast(e);
+ simd_sub(d, f)
+}
+
+/// Signed Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ssubl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+ let c: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let d: int32x4_t = simd_cast(c);
+ let e: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let f: int32x4_t = simd_cast(e);
+ simd_sub(d, f)
+}
+
+/// Signed Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ssubl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+ let c: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let d: int64x2_t = simd_cast(c);
+ let e: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let f: int64x2_t = simd_cast(e);
+ simd_sub(d, f)
+}
+
+/// Unsigned Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usubl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
+ let c: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let d: uint16x8_t = simd_cast(c);
+ let e: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let f: uint16x8_t = simd_cast(e);
+ simd_sub(d, f)
+}
+
+/// Unsigned Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usubl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
+ let c: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let d: uint32x4_t = simd_cast(c);
+ let e: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let f: uint32x4_t = simd_cast(e);
+ simd_sub(d, f)
+}
+
+/// Unsigned Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usubl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
+ let c: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let d: uint64x2_t = simd_cast(c);
+ let e: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let f: uint64x2_t = simd_cast(e);
+ simd_sub(d, f)
+}
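+
+// The long forms widen both upper halves before subtracting, so differences
+// that overflow the narrow type stay exact (hedged sketch): with i8 lanes
+// 127 and -1, vsubl_high_s8 produces 128 in the corresponding i16 lane.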
+
+/// Bit clear and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(bcax))]
+pub unsafe fn vbcaxq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v16i8")]
+ fn vbcaxq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
+ }
+ vbcaxq_s8_(a, b, c)
+}
+
+/// Bit clear and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(bcax))]
+pub unsafe fn vbcaxq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v8i16")]
+ fn vbcaxq_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
+ }
+ vbcaxq_s16_(a, b, c)
+}
+
+/// Bit clear and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(bcax))]
+pub unsafe fn vbcaxq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v4i32")]
+ fn vbcaxq_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
+ }
+ vbcaxq_s32_(a, b, c)
+}
+
+/// Bit clear and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(bcax))]
+pub unsafe fn vbcaxq_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v2i64")]
+ fn vbcaxq_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t;
+ }
+ vbcaxq_s64_(a, b, c)
+}
+
+/// Bit clear and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(bcax))]
+pub unsafe fn vbcaxq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v16i8")]
+ fn vbcaxq_u8_(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t;
+ }
+ vbcaxq_u8_(a, b, c)
+}
+
+/// Bit clear and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(bcax))]
+pub unsafe fn vbcaxq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v8i16")]
+ fn vbcaxq_u16_(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t;
+ }
+ vbcaxq_u16_(a, b, c)
+}
+
+/// Bit clear and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(bcax))]
+pub unsafe fn vbcaxq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v4i32")]
+ fn vbcaxq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
+ }
+ vbcaxq_u32_(a, b, c)
+}
+
+/// Bit clear and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(bcax))]
+pub unsafe fn vbcaxq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v2i64")]
+ fn vbcaxq_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t;
+ }
+ vbcaxq_u64_(a, b, c)
+}
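+
+// Hedged sketch of the boolean identity (per the ACLE description of BCAX):
+// each result bit is a ^ (b & !c), i.e. `b` is "bit-cleared" by `c` before
+// being XORed into `a`. For example, with single-byte lanes
+// a = 0b1100, b = 0b1010, c = 0b0110: b & !c = 0b1000, result = 0b0100.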
+
+/// Floating-point complex add
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcadd))]
+pub unsafe fn vcadd_rot270_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcadd.rot270.v2f32")]
+ fn vcadd_rot270_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vcadd_rot270_f32_(a, b)
+}
+
+/// Floating-point complex add
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcadd))]
+pub unsafe fn vcaddq_rot270_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcadd.rot270.v4f32")]
+ fn vcaddq_rot270_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vcaddq_rot270_f32_(a, b)
+}
+
+/// Floating-point complex add
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcadd))]
+pub unsafe fn vcaddq_rot270_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcadd.rot270.v2f64")]
+ fn vcaddq_rot270_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vcaddq_rot270_f64_(a, b)
+}
+
+/// Floating-point complex add
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcadd))]
+pub unsafe fn vcadd_rot90_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcadd.rot90.v2f32")]
+ fn vcadd_rot90_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vcadd_rot90_f32_(a, b)
+}
+
+/// Floating-point complex add
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcadd))]
+pub unsafe fn vcaddq_rot90_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcadd.rot90.v4f32")]
+ fn vcaddq_rot90_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vcaddq_rot90_f32_(a, b)
+}
+
+/// Floating-point complex add
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcadd))]
+pub unsafe fn vcaddq_rot90_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcadd.rot90.v2f64")]
+ fn vcaddq_rot90_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vcaddq_rot90_f64_(a, b)
+}
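+
+// Lanes are treated as (real, imaginary) pairs; the rotation is applied to
+// `b` in the complex plane before adding (hedged reading of the FCADD spec):
+//     rot90:  (a.re - b.im, a.im + b.re)   // a + i*b
+//     rot270: (a.re + b.im, a.im - b.re)   // a - i*b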
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmla_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot0.v2f32")]
+ fn vcmla_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
+ }
+ vcmla_f32_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot0.v4f32")]
+ fn vcmlaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
+ }
+ vcmlaq_f32_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot0.v2f64")]
+ fn vcmlaq_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
+ }
+ vcmlaq_f64_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmla_rot90_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot90.v2f32")]
+ fn vcmla_rot90_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
+ }
+ vcmla_rot90_f32_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmlaq_rot90_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot90.v4f32")]
+ fn vcmlaq_rot90_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
+ }
+ vcmlaq_rot90_f32_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmlaq_rot90_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot90.v2f64")]
+ fn vcmlaq_rot90_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
+ }
+ vcmlaq_rot90_f64_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmla_rot180_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot180.v2f32")]
+ fn vcmla_rot180_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
+ }
+ vcmla_rot180_f32_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmlaq_rot180_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot180.v4f32")]
+ fn vcmlaq_rot180_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
+ }
+ vcmlaq_rot180_f32_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmlaq_rot180_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot180.v2f64")]
+ fn vcmlaq_rot180_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
+ }
+ vcmlaq_rot180_f64_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmla_rot270_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot270.v2f32")]
+ fn vcmla_rot270_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
+ }
+ vcmla_rot270_f32_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmlaq_rot270_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot270.v4f32")]
+ fn vcmlaq_rot270_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
+ }
+ vcmlaq_rot270_f32_(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla))]
+pub unsafe fn vcmlaq_rot270_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcmla.rot270.v2f64")]
+ fn vcmlaq_rot270_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
+ }
+ vcmlaq_rot270_f64_(a, b, c)
+}
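+
+// A single FCMLA covers only half of a complex multiply; the usual pattern
+// pairs the rot0 and rot90 forms to accumulate a full complex product
+// (hedged sketch of the standard ACLE idiom):
+//     let acc = vcmla_f32(acc, b, c);        // acc += partial of b * c
+//     let acc = vcmla_rot90_f32(acc, b, c);  // acc now holds the full b * c added in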
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmla_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: float32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmla_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmla_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ let c: float32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmla_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmlaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: float32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmlaq_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmlaq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ let c: float32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmlaq_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmla_rot90_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: float32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmla_rot90_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmla_rot90_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ let c: float32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmla_rot90_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmlaq_rot90_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: float32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmlaq_rot90_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmlaq_rot90_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ let c: float32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmlaq_rot90_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmla_rot180_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: float32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmla_rot180_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmla_rot180_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ let c: float32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmla_rot180_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmlaq_rot180_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: float32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmlaq_rot180_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmlaq_rot180_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ let c: float32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmlaq_rot180_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmla_rot270_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: float32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmla_rot270_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmla_rot270_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ let c: float32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmla_rot270_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmlaq_rot270_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ let c: float32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmlaq_rot270_f32(a, b, c)
+}
+
+/// Floating-point complex multiply accumulate
+#[inline]
+#[target_feature(enable = "neon,fcma")]
+#[cfg_attr(test, assert_instr(fcmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vcmlaq_rot270_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ let c: float32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]);
+ vcmlaq_rot270_f32(a, b, c)
+}
+
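+// Editorial sketch: the rotation variants above follow the Armv8.3 FCMLA
+// expansion, treating lane pairs as complex numbers (re, im); the lane/laneq
+// forms broadcast one selected pair of `c`. Per that expansion:
+//   rot 0:   r.re = a.re + b.re*c.re,  r.im = a.im + b.re*c.im
+//   rot 90:  r.re = a.re - b.im*c.im,  r.im = a.im + b.im*c.re
+//   rot 180: r.re = a.re - b.re*c.re,  r.im = a.im - b.re*c.im
+//   rot 270: r.re = a.re + b.im*c.im,  r.im = a.im - b.im*c.re
+// so chaining rot 0 and rot 90 yields a full complex multiply-accumulate.
+// The helper below is hypothetical (not generator output) and checks the
+// rot 180 case for LANE = 0:
+#[allow(dead_code)]
+#[target_feature(enable = "neon,fcma")]
+unsafe fn vcmla_rot180_lane_f32_sketch() {
+ let a: float32x2_t = core::mem::transmute([10.0_f32, 20.0]); // (a.re, a.im)
+ let b: float32x2_t = core::mem::transmute([2.0_f32, 3.0]); // (b.re, b.im)
+ let c: float32x2_t = core::mem::transmute([4.0_f32, 5.0]); // pair selected by LANE = 0
+ // rot 180: r.re = 10 - 2*4 = 2, r.im = 20 - 2*5 = 10
+ let r: [f32; 2] = core::mem::transmute(vcmla_rot180_lane_f32::<0>(a, b, c));
+ assert_eq!(r, [2.0, 10.0]);
+}
+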
+/// Dot product arithmetic (vector)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(sdot))]
+pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")]
+ fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t;
+ }
+ vdot_s32_(a, b, c)
+}
+
+/// Dot product arithmetic (vector)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(sdot))]
+pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")]
+ fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t;
+ }
+ vdotq_s32_(a, b, c)
+}
+
+/// Dot product arithmetic (vector)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(udot))]
+pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")]
+ fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t;
+ }
+ vdot_u32_(a, b, c)
+}
+
+/// Dot product arithmetic (vector)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(udot))]
+pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")]
+ fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t;
+ }
+ vdotq_u32_(a, b, c)
+}
+
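+// Editorial sketch (hypothetical helper, not generator output): each 32-bit
+// lane of an SDOT/UDOT result accumulates the dot product of the four
+// corresponding 8-bit lanes of `b` and `c`:
+#[allow(dead_code)]
+#[target_feature(enable = "neon,dotprod")]
+unsafe fn vdot_s32_sketch() {
+ let a: int32x2_t = core::mem::transmute([1_i32, 2]);
+ let b: int8x8_t = core::mem::transmute([1_i8, 2, 3, 4, 5, 6, 7, 8]);
+ let c: int8x8_t = core::mem::transmute([1_i8, 1, 1, 1, 2, 2, 2, 2]);
+ // lane 0: 1 + (1 + 2 + 3 + 4) = 11; lane 1: 2 + 2*(5 + 6 + 7 + 8) = 54
+ let r: [i32; 2] = core::mem::transmute(vdot_s32(a, b, c));
+ assert_eq!(r, [11, 54]);
+}
+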
+/// Dot product arithmetic (indexed)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(sdot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ let c: int8x8_t = simd_shuffle8!(c, c, <const LANE: i32> [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
+ vdot_s32(a, b, c)
+}
+
+/// Dot product arithmetic (indexed)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(sdot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x16_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ let c: int8x8_t = simd_shuffle8!(c, c, <const LANE: i32> [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
+ vdot_s32(a, b, c)
+}
+
+/// Dot product arithmetic (indexed)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(sdot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ let c: int8x16_t = simd_shuffle16!(c, c, <const LANE: i32> [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
+ vdotq_s32(a, b, c)
+}
+
+/// Dot product arithmetic (indexed)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(sdot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ let c: int8x16_t = simd_shuffle16!(c, c, <const LANE: i32> [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
+ vdotq_s32(a, b, c)
+}
+
+/// Dot product arithmetic (indexed)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(udot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
+ static_assert_imm1!(LANE);
+ let c: uint8x8_t = simd_shuffle8!(c, c, <const LANE: i32> [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
+ vdot_u32(a, b, c)
+}
+
+/// Dot product arithmetic (indexed)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(udot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vdot_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x16_t) -> uint32x2_t {
+ static_assert_imm2!(LANE);
+ let c: uint8x8_t = simd_shuffle8!(c, c, <const LANE: i32> [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
+ vdot_u32(a, b, c)
+}
+
+/// Dot product arithmetic (indexed)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(udot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t {
+ static_assert_imm1!(LANE);
+ let c: uint8x16_t = simd_shuffle16!(c, c, <const LANE: i32> [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
+ vdotq_u32(a, b, c)
+}
+
+/// Dot product arithmetic (indexed)
+#[inline]
+#[target_feature(enable = "neon,dotprod")]
+#[cfg_attr(test, assert_instr(udot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vdotq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ let c: uint8x16_t = simd_shuffle16!(c, c, <const LANE: i32> [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
+ vdotq_u32(a, b, c)
+}
+
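+// Editorial sketch (hypothetical helper): the lane/laneq forms above reuse a
+// single four-byte group of `c`, selected by LANE, for every accumulator lane:
+#[allow(dead_code)]
+#[target_feature(enable = "neon,dotprod")]
+unsafe fn vdot_lane_s32_sketch() {
+ let a: int32x2_t = core::mem::transmute([0_i32, 0]);
+ let b: int8x8_t = core::mem::transmute([1_i8, 1, 1, 1, 2, 2, 2, 2]);
+ let c: int8x8_t = core::mem::transmute([9_i8, 9, 9, 9, 1, 2, 3, 4]);
+ // LANE = 1 selects the byte group [1, 2, 3, 4] of `c` for both lanes:
+ // lane 0: 1*(1+2+3+4) = 10; lane 1: 2*(1+2+3+4) = 20
+ let r: [i32; 2] = core::mem::transmute(vdot_lane_s32::<1>(a, b, c));
+ assert_eq!(r, [10, 20]);
+}
+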
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmax))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmax_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v1f64")]
+ fn vmax_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+ }
+ vmax_f64_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmax))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v2f64")]
+ fn vmaxq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vmaxq_f64_(a, b)
+}
+
+/// Floating-point Maximum Number (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnm))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v1f64")]
+ fn vmaxnm_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+ }
+ vmaxnm_f64_(a, b)
+}
+
+/// Floating-point Maximum Number (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnm))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v2f64")]
+ fn vmaxnmq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vmaxnmq_f64_(a, b)
+}
+
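+// Editorial sketch (hypothetical helper): FMAX and FMAXNM differ only when a
+// NaN is involved. FMAX propagates the NaN; FMAXNM implements IEEE 754-2008
+// maxNum and returns the number when exactly one operand is a quiet NaN:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vmax_vs_vmaxnm_sketch() {
+ let x: float64x1_t = core::mem::transmute([f64::NAN]);
+ let y: float64x1_t = core::mem::transmute([1.0_f64]);
+ let max: [f64; 1] = core::mem::transmute(vmax_f64(x, y));
+ let maxnm: [f64; 1] = core::mem::transmute(vmaxnm_f64(x, y));
+ assert!(max[0].is_nan()); // fmax propagates the NaN
+ assert_eq!(maxnm[0], 1.0); // fmaxnm returns the non-NaN operand
+}
+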
+/// Floating-point maximum number across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxnmv_f32(a: float32x2_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnmv.f32.v2f32")]
+ fn vmaxnmv_f32_(a: float32x2_t) -> f32;
+ }
+ vmaxnmv_f32_(a)
+}
+
+/// Floating-point maximum number across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxnmvq_f64(a: float64x2_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnmv.f64.v2f64")]
+ fn vmaxnmvq_f64_(a: float64x2_t) -> f64;
+ }
+ vmaxnmvq_f64_(a)
+}
+
+/// Floating-point maximum number across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnmv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxnmvq_f32(a: float32x4_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnmv.f32.v4f32")]
+ fn vmaxnmvq_f32_(a: float32x4_t) -> f32;
+ }
+ vmaxnmvq_f32_(a)
+}
+
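+// Editorial note with a hypothetical helper: the `v`-suffixed forms reduce a
+// whole vector to a scalar maximum; a two-lane reduction is a single pairwise
+// fold, which is presumably why the two-lane variants assert fmaxnmp rather
+// than fmaxnmv:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vmaxnmvq_f32_sketch() {
+ let a: float32x4_t = core::mem::transmute([1.0_f32, 5.0, 3.0, 2.0]);
+ assert_eq!(vmaxnmvq_f32(a), 5.0);
+}
+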
+/// Floating-point Maximum Number Pairwise (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnmp.v2f32")]
+ fn vpmaxnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vpmaxnm_f32_(a, b)
+}
+
+/// Floating-point Maximum Number Pairwise (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnmp.v2f64")]
+ fn vpmaxnmq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vpmaxnmq_f64_(a, b)
+}
+
+/// Floating-point Maximum Number Pairwise (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnmp.v4f32")]
+ fn vpmaxnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vpmaxnmq_f32_(a, b)
+}
+
+/// Floating-point maximum number pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxnms_f32(a: float32x2_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnmv.f32.v2f32")]
+ fn vpmaxnms_f32_(a: float32x2_t) -> f32;
+ }
+ vpmaxnms_f32_(a)
+}
+
+/// Floating-point maximum number pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxnmqd_f64(a: float64x2_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnmv.f64.v2f64")]
+ fn vpmaxnmqd_f64_(a: float64x2_t) -> f64;
+ }
+ vpmaxnmqd_f64_(a)
+}
+
+/// Floating-point maximum pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxs_f32(a: float32x2_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxv.f32.v2f32")]
+ fn vpmaxs_f32_(a: float32x2_t) -> f32;
+ }
+ vpmaxs_f32_(a)
+}
+
+/// Floating-point maximum pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxqd_f64(a: float64x2_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxv.f64.v2f64")]
+ fn vpmaxqd_f64_(a: float64x2_t) -> f64;
+ }
+ vpmaxqd_f64_(a)
+}
+
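+// Editorial sketch (hypothetical helper): the pairwise forms concatenate
+// their sources and fold adjacent lane pairs, so vpmaxnm_f32(a, b) yields
+// [maxnm(a0, a1), maxnm(b0, b1)], while the scalar forms fold a single pair:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vpmaxnm_f32_sketch() {
+ let a: float32x2_t = core::mem::transmute([1.0_f32, 4.0]);
+ let b: float32x2_t = core::mem::transmute([3.0_f32, 2.0]);
+ let r: [f32; 2] = core::mem::transmute(vpmaxnm_f32(a, b));
+ assert_eq!(r, [4.0, 3.0]);
+ assert_eq!(vpmaxs_f32(a), 4.0); // scalar fold of one pair
+}
+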
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmin))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmin_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v1f64")]
+ fn vmin_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+ }
+ vmin_f64_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmin))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v2f64")]
+ fn vminq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vminq_f64_(a, b)
+}
+
+/// Floating-point Minimum Number (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnm))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v1f64")]
+ fn vminnm_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+ }
+ vminnm_f64_(a, b)
+}
+
+/// Floating-point Minimum Number (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnm))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v2f64")]
+ fn vminnmq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vminnmq_f64_(a, b)
+}
+
+/// Floating-point minimum number across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminnmv_f32(a: float32x2_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnmv.f32.v2f32")]
+ fn vminnmv_f32_(a: float32x2_t) -> f32;
+ }
+ vminnmv_f32_(a)
+}
+
+/// Floating-point minimum number across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminnmvq_f64(a: float64x2_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnmv.f64.v2f64")]
+ fn vminnmvq_f64_(a: float64x2_t) -> f64;
+ }
+ vminnmvq_f64_(a)
+}
+
+/// Floating-point minimum number across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnmv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminnmvq_f32(a: float32x4_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnmv.f32.v4f32")]
+ fn vminnmvq_f32_(a: float32x4_t) -> f32;
+ }
+ vminnmvq_f32_(a)
+}
+
+/// Vector move (sign-extend the high half)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sxtl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovl_high_s8(a: int8x16_t) -> int16x8_t {
+ let a: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmovl_s8(a)
+}
+
+/// Vector move (sign-extend the high half)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sxtl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovl_high_s16(a: int16x8_t) -> int32x4_t {
+ let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ vmovl_s16(a)
+}
+
+/// Vector move (sign-extend the high half)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sxtl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovl_high_s32(a: int32x4_t) -> int64x2_t {
+ let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ vmovl_s32(a)
+}
+
+/// Vector move (zero-extend the high half)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uxtl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovl_high_u8(a: uint8x16_t) -> uint16x8_t {
+ let a: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vmovl_u8(a)
+}
+
+/// Vector move (zero-extend the high half)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uxtl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovl_high_u16(a: uint16x8_t) -> uint32x4_t {
+ let a: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ vmovl_u16(a)
+}
+
+/// Vector move (zero-extend the high half)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uxtl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovl_high_u32(a: uint32x4_t) -> uint64x2_t {
+ let a: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ vmovl_u32(a)
+}
+
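+// Editorial sketch (hypothetical helper): the movl_high family widens the
+// upper half of a vector, with sxtl2 sign-extending and uxtl2 zero-extending:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vmovl_high_s8_sketch() {
+ let a: int8x16_t = core::mem::transmute([0_i8, 0, 0, 0, 0, 0, 0, 0, -1, 2, -3, 4, -5, 6, -7, 8]);
+ // Only lanes 8..16 contribute; each is sign-extended to 16 bits.
+ let r: [i16; 8] = core::mem::transmute(vmovl_high_s8(a));
+ assert_eq!(r, [-1, 2, -3, 4, -5, 6, -7, 8]);
+}
+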
+/// Floating-point add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(faddp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.faddp.v4f32")]
+ fn vpaddq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vpaddq_f32_(a, b)
+}
+
+/// Floating-point add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(faddp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.faddp.v2f64")]
+ fn vpaddq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vpaddq_f64_(a, b)
+}
+
+/// Floating-point add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpadds_f32(a: float32x2_t) -> f32 {
+ let a1: f32 = simd_extract(a, 0);
+ let a2: f32 = simd_extract(a, 1);
+ a1 + a2
+}
+
+/// Floating-point add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddd_f64(a: float64x2_t) -> f64 {
+ let a1: f64 = simd_extract(a, 0);
+ let a2: f64 = simd_extract(a, 1);
+ a1 + a2
+}
+
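+// Editorial sketch (hypothetical helper): faddp adds adjacent lane pairs of
+// the concatenated sources, while the scalar forms above simply sum the two
+// lanes of one vector:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vpaddq_f32_sketch() {
+ let a: float32x4_t = core::mem::transmute([1.0_f32, 2.0, 3.0, 4.0]);
+ let b: float32x4_t = core::mem::transmute([10.0_f32, 20.0, 30.0, 40.0]);
+ let r: [f32; 4] = core::mem::transmute(vpaddq_f32(a, b));
+ assert_eq!(r, [3.0, 7.0, 30.0, 70.0]);
+ assert_eq!(vpaddd_f64(core::mem::transmute([1.5_f64, 2.5])), 4.0);
+}
+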
+/// Floating-point Minimum Number Pairwise (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnmp.v2f32")]
+ fn vpminnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vpminnm_f32_(a, b)
+}
+
+/// Floating-point Minimum Number Pairwise (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnmp.v2f64")]
+ fn vpminnmq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vpminnmq_f64_(a, b)
+}
+
+/// Floating-point Minimum Number Pairwise (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnmp.v4f32")]
+ fn vpminnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vpminnmq_f32_(a, b)
+}
+
+/// Floating-point minimum number pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminnms_f32(a: float32x2_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnmv.f32.v2f32")]
+ fn vpminnms_f32_(a: float32x2_t) -> f32;
+ }
+ vpminnms_f32_(a)
+}
+
+/// Floating-point minimum number pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminnmp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminnmqd_f64(a: float64x2_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnmv.f64.v2f64")]
+ fn vpminnmqd_f64_(a: float64x2_t) -> f64;
+ }
+ vpminnmqd_f64_(a)
+}
+
+/// Floating-point minimum pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmins_f32(a: float32x2_t) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminv.f32.v2f32")]
+ fn vpmins_f32_(a: float32x2_t) -> f32;
+ }
+ vpmins_f32_(a)
+}
+
+/// Floating-point minimum pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminqd_f64(a: float64x2_t) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminv.f64.v2f64")]
+ fn vpminqd_f64_(a: float64x2_t) -> f64;
+ }
+ vpminqd_f64_(a)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmullh_s16(a: i16, b: i16) -> i32 {
+ let a: int16x4_t = vdup_n_s16(a);
+ let b: int16x4_t = vdup_n_s16(b);
+ simd_extract(vqdmull_s16(a, b), 0)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulls_s32(a: i32, b: i32) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulls.scalar")]
+ fn vqdmulls_s32_(a: i32, b: i32) -> i64;
+ }
+ vqdmulls_s32_(a, b)
+}
+
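+// Editorial sketch (hypothetical helper): sqdmull doubles the widened product
+// and saturates; MIN * MIN is the one input pair whose doubled product
+// overflows the wide result type:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vqdmull_scalar_sketch() {
+ assert_eq!(vqdmullh_s16(3, 4), 24); // 2 * 3 * 4
+ // 2 * (-32768) * (-32768) = 2^31 overflows i32 and saturates:
+ assert_eq!(vqdmullh_s16(i16::MIN, i16::MIN), i32::MAX);
+ assert_eq!(vqdmulls_s32(i32::MIN, i32::MIN), i64::MAX);
+}
+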
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+ let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ vqdmull_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+ let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t {
+ let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let b: int16x4_t = vdup_n_s16(b);
+ vqdmull_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t {
+ let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let b: int32x2_t = vdup_n_s32(b);
+ vqdmull_s32(a, b)
+}
+
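+// Editorial sketch (hypothetical helper): the _high variants above take only
+// the upper halves of their inputs, matching the shuffles in their bodies:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vqdmull_high_s16_sketch() {
+ let a: int16x8_t = core::mem::transmute([9_i16, 9, 9, 9, 1, 2, 3, 4]);
+ let b: int16x8_t = core::mem::transmute([9_i16, 9, 9, 9, 10, 10, 10, 10]);
+ // Only lanes 4..8 contribute: 2 * [1, 2, 3, 4] * 10
+ let r: [i32; 4] = core::mem::transmute(vqdmull_high_s16(a, b));
+ assert_eq!(r, [20, 40, 60, 80]);
+}
+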
+/// Vector saturating doubling long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_laneq_s16<const N: i32>(a: int16x4_t, b: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(N);
+ let b: int16x4_t = simd_shuffle4!(b, b, <const N: i32> [N as u32, N as u32, N as u32, N as u32]);
+ vqdmull_s16(a, b)
+}
+
+/// Vector saturating doubling long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_laneq_s32<const N: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(N);
+ let b: int32x2_t = simd_shuffle2!(b, b, <const N: i32> [N as u32, N as u32]);
+ vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmullh_lane_s16<const N: i32>(a: i16, b: int16x4_t) -> i32 {
+ static_assert_imm2!(N);
+ let b: i16 = simd_extract(b, N as u32);
+ vqdmullh_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmullh_laneq_s16<const N: i32>(a: i16, b: int16x8_t) -> i32 {
+ static_assert_imm3!(N);
+ let b: i16 = simd_extract(b, N as u32);
+ vqdmullh_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulls_lane_s32<const N: i32>(a: i32, b: int32x2_t) -> i64 {
+ static_assert_imm1!(N);
+ let b: i32 = simd_extract(b, N as u32);
+ vqdmulls_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulls_laneq_s32<const N: i32>(a: i32, b: int32x4_t) -> i64 {
+ static_assert_imm2!(N);
+ let b: i32 = simd_extract(b, N as u32);
+ vqdmulls_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_high_lane_s16<const N: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let b: int16x4_t = simd_shuffle4!(b, b, <const N: i32> [N as u32, N as u32, N as u32, N as u32]);
+ vqdmull_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_high_lane_s32<const N: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let b: int32x2_t = simd_shuffle2!(b, b, <const N: i32> [N as u32, N as u32]);
+ vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_high_laneq_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(N);
+ let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let b: int16x4_t = simd_shuffle4!(b, b, <const N: i32> [N as u32, N as u32, N as u32, N as u32]);
+ vqdmull_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmull_high_laneq_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(N);
+ let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let b: int32x2_t = simd_shuffle2!(b, b, <const N: i32> [N as u32, N as u32]);
+ vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ vqaddq_s32(a, vqdmull_high_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ vqaddq_s64(a, vqdmull_high_s32(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
+ vqaddq_s32(a, vqdmull_high_n_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
+ vqaddq_s64(a, vqdmull_high_n_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal, N = 2))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_laneq_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(N);
+ vqaddq_s32(a, vqdmull_laneq_s16::<N>(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_laneq_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(N);
+ vqaddq_s64(a, vqdmull_laneq_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_high_lane_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ vqaddq_s32(a, vqdmull_high_lane_s16::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_high_laneq_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(N);
+ vqaddq_s32(a, vqdmull_high_laneq_s16::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_high_lane_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ vqaddq_s64(a, vqdmull_high_lane_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlal_high_laneq_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(N);
+ vqaddq_s64(a, vqdmull_high_laneq_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlalh_s16(a: i32, b: i16, c: i16) -> i32 {
+ let x: int32x4_t = vqdmull_s16(vdup_n_s16(b), vdup_n_s16(c));
+ vqadds_s32(a, simd_extract(x, 0))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlals_s32(a: i64, b: i32, c: i32) -> i64 {
+ let x: int64x2_t = vqdmull_s32(vdup_n_s32(b), vdup_n_s32(c));
+ vqaddd_s64(a, simd_extract(x, 0))
+}
+
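+// Editorial sketch (hypothetical helper): the scalar multiply-accumulate
+// composes two saturating steps, x = sat(2*b*c) followed by sat(a + x):
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vqdmlalh_s16_sketch() {
+ assert_eq!(vqdmlalh_s16(1, 2, 3), 13); // 1 + 2*2*3
+ assert_eq!(vqdmlalh_s16(i32::MAX, 1, 1), i32::MAX); // the accumulate saturates too
+}
+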
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlalh_lane_s16<const LANE: i32>(a: i32, b: i16, c: int16x4_t) -> i32 {
+ static_assert_imm2!(LANE);
+ vqdmlalh_s16(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlalh_laneq_s16<const LANE: i32>(a: i32, b: i16, c: int16x8_t) -> i32 {
+ static_assert_imm3!(LANE);
+ vqdmlalh_s16(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlals_lane_s32<const LANE: i32>(a: i64, b: i32, c: int32x2_t) -> i64 {
+ static_assert_imm1!(LANE);
+ vqdmlals_s32(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlals_laneq_s32<const LANE: i32>(a: i64, b: i32, c: int32x4_t) -> i64 {
+ static_assert_imm2!(LANE);
+ vqdmlals_s32(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ vqsubq_s32(a, vqdmull_high_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ vqsubq_s64(a, vqdmull_high_s32(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
+ vqsubq_s32(a, vqdmull_high_n_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
+ vqsubq_s64(a, vqdmull_high_n_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl, N = 2))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_laneq_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(N);
+ vqsubq_s32(a, vqdmull_laneq_s16::<N>(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_laneq_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(N);
+ vqsubq_s64(a, vqdmull_laneq_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_high_lane_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ vqsubq_s32(a, vqdmull_high_lane_s16::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_high_laneq_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(N);
+ vqsubq_s32(a, vqdmull_high_laneq_s16::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_high_lane_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ vqsubq_s64(a, vqdmull_high_lane_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsl_high_laneq_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(N);
+ vqsubq_s64(a, vqdmull_high_laneq_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlslh_s16(a: i32, b: i16, c: i16) -> i32 {
+ let x: int32x4_t = vqdmull_s16(vdup_n_s16(b), vdup_n_s16(c));
+ vqsubs_s32(a, simd_extract(x, 0))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsls_s32(a: i64, b: i32, c: i32) -> i64 {
+ let x: int64x2_t = vqdmull_s32(vdup_n_s32(b), vdup_n_s32(c));
+ vqsubd_s64(a, simd_extract(x, 0))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlslh_lane_s16<const LANE: i32>(a: i32, b: i16, c: int16x4_t) -> i32 {
+ static_assert_imm2!(LANE);
+ vqdmlslh_s16(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlslh_laneq_s16<const LANE: i32>(a: i32, b: i16, c: int16x8_t) -> i32 {
+ static_assert_imm3!(LANE);
+ vqdmlslh_s16(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsls_lane_s32<const LANE: i32>(a: i64, b: i32, c: int32x2_t) -> i64 {
+ static_assert_imm1!(LANE);
+ vqdmlsls_s32(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmlsls_laneq_s32<const LANE: i32>(a: i64, b: i32, c: int32x4_t) -> i64 {
+ static_assert_imm2!(LANE);
+ vqdmlsls_s32(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulhh_s16(a: i16, b: i16) -> i16 {
+ let a: int16x4_t = vdup_n_s16(a);
+ let b: int16x4_t = vdup_n_s16(b);
+ simd_extract(vqdmulh_s16(a, b), 0)
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulhs_s32(a: i32, b: i32) -> i32 {
+ let a: int32x2_t = vdup_n_s32(a);
+ let b: int32x2_t = vdup_n_s32(b);
+ simd_extract(vqdmulh_s32(a, b), 0)
+}
+
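+// Editorial sketch (hypothetical helper): sqdmulh keeps the high half of the
+// doubled, saturated product, i.e. Q15/Q31 fixed-point multiplication
+// (r = sat(2*a*b) >> 16 for 16-bit lanes):
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vqdmulhh_s16_sketch() {
+ // 0.5 * 0.5 in Q15: (2 * 16384 * 16384) >> 16 = 8192, i.e. 0.25
+ assert_eq!(vqdmulhh_s16(16384, 16384), 8192);
+ // MIN * MIN saturates the doubled product, so the high half is i16::MAX:
+ assert_eq!(vqdmulhh_s16(i16::MIN, i16::MIN), i16::MAX);
+}
+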
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulhh_lane_s16<const N: i32>(a: i16, b: int16x4_t) -> i16 {
+ static_assert_imm2!(N);
+ let b: i16 = simd_extract(b, N as u32);
+ vqdmulhh_s16(a, b)
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulhh_laneq_s16<const N: i32>(a: i16, b: int16x8_t) -> i16 {
+ static_assert_imm3!(N);
+ let b: i16 = simd_extract(b, N as u32);
+ vqdmulhh_s16(a, b)
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulhs_lane_s32<const N: i32>(a: i32, b: int32x2_t) -> i32 {
+ static_assert_imm1!(N);
+ let b: i32 = simd_extract(b, N as u32);
+ vqdmulhs_s32(a, b)
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulhs_laneq_s32<const N: i32>(a: i32, b: int32x4_t) -> i32 {
+ static_assert_imm2!(N);
+ let b: i32 = simd_extract(b, N as u32);
+ vqdmulhs_s32(a, b)
+}
+
+/// Vector saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulh_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ vqdmulh_s16(a, vdup_n_s16(simd_extract(b, LANE as u32)))
+}
+
+/// Vector saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulhq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(LANE);
+ vqdmulhq_s16(a, vdupq_n_s16(simd_extract(b, LANE as u32)))
+}
+
+/// Vector saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulh_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ vqdmulh_s32(a, vdup_n_s32(simd_extract(b, LANE as u32)))
+}
+
+/// Vector saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqdmulhq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ vqdmulhq_s32(a, vdupq_n_s32(simd_extract(b, LANE as u32)))
+}
+
+/// Saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovnh_s16(a: i16) -> i8 {
+ simd_extract(vqmovn_s16(vdupq_n_s16(a)), 0)
+}
+
+/// Saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovns_s32(a: i32) -> i16 {
+ simd_extract(vqmovn_s32(vdupq_n_s32(a)), 0)
+}
+
+/// Saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqxtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovnh_u16(a: u16) -> u8 {
+ simd_extract(vqmovn_u16(vdupq_n_u16(a)), 0)
+}
+
+/// Saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqxtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovns_u32(a: u32) -> u16 {
+ simd_extract(vqmovn_u32(vdupq_n_u32(a)), 0)
+}
+
+/// Saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovnd_s64(a: i64) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.scalar.sqxtn.i32.i64")]
+ fn vqmovnd_s64_(a: i64) -> i32;
+ }
+ vqmovnd_s64_(a)
+}
+
+/// Saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqxtn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovnd_u64(a: u64) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.scalar.uqxtn.i32.i64")]
+ fn vqmovnd_u64_(a: u64) -> u32;
+ }
+ vqmovnd_u64_(a)
+}
+
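+// Editorial sketch (hypothetical helper): the scalar narrows clamp a wide
+// value into the range of the next smaller type:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vqmovn_scalar_sketch() {
+ assert_eq!(vqmovnh_s16(300), i8::MAX); // 300 > 127, saturates
+ assert_eq!(vqmovnh_s16(-300), i8::MIN);
+ assert_eq!(vqmovnd_u64(u64::MAX), u32::MAX);
+}
+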
+/// Signed saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t {
+ simd_shuffle16!(a, vqmovn_s16(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Signed saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t {
+ simd_shuffle8!(a, vqmovn_s32(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Signed saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t {
+ simd_shuffle4!(a, vqmovn_s64(b), [0, 1, 2, 3])
+}
+
+/// Unsigned saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqxtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
+ simd_shuffle16!(a, vqmovn_u16(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Unsigned saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqxtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
+ simd_shuffle8!(a, vqmovn_u32(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Unsigned saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqxtn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
+ simd_shuffle4!(a, vqmovn_u64(b), [0, 1, 2, 3])
+}
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtun))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovunh_s16(a: i16) -> u8 {
+ simd_extract(vqmovun_s16(vdupq_n_s16(a)), 0)
+}
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtun))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovuns_s32(a: i32) -> u16 {
+ simd_extract(vqmovun_s32(vdupq_n_s32(a)), 0)
+}
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtun))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovund_s64(a: i64) -> u32 {
+ simd_extract(vqmovun_s64(vdupq_n_s64(a)), 0)
+}
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtun2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovun_high_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t {
+ simd_shuffle16!(a, vqmovun_s16(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtun2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovun_high_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t {
+ simd_shuffle8!(a, vqmovun_s32(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtun2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqmovun_high_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t {
+ simd_shuffle4!(a, vqmovun_s64(b), [0, 1, 2, 3])
+}
+
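+// Editorial sketch (hypothetical helper): sqxtun narrows a signed source to
+// an unsigned result, clamping negative inputs to zero:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vqmovun_scalar_sketch() {
+ assert_eq!(vqmovunh_s16(-5), 0); // negative clamps to 0
+ assert_eq!(vqmovunh_s16(300), u8::MAX);
+ assert_eq!(vqmovunh_s16(200), 200); // in range, unchanged
+}
+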
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrdmulh))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrdmulhh_s16(a: i16, b: i16) -> i16 {
+ simd_extract(vqrdmulh_s16(vdup_n_s16(a), vdup_n_s16(b)), 0)
+}
+
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrdmulh))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrdmulhs_s32(a: i32, b: i32) -> i32 {
+ simd_extract(vqrdmulh_s32(vdup_n_s32(a), vdup_n_s32(b)), 0)
+}
+
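+// Editorial sketch (hypothetical helper): sqrdmulh is sqdmulh plus rounding,
+// r = sat((2*a*b + 0x8000) >> 16) for 16-bit lanes, which becomes visible on
+// half-ULP products:
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vqrdmulhh_s16_sketch() {
+ // 2 * 16384 * 1 = 0x8000: truncating sqdmulh gives 0, rounding gives 1.
+ assert_eq!(vqdmulhh_s16(16384, 1), 0);
+ assert_eq!(vqrdmulhh_s16(16384, 1), 1);
+}
+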
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrdmulhh_lane_s16<const LANE: i32>(a: i16, b: int16x4_t) -> i16 {
+ static_assert_imm2!(LANE);
+ vqrdmulhh_s16(a, simd_extract(b, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrdmulhh_laneq_s16<const LANE: i32>(a: i16, b: int16x8_t) -> i16 {
+ static_assert_imm3!(LANE);
+ vqrdmulhh_s16(a, simd_extract(b, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrdmulhs_lane_s32<const LANE: i32>(a: i32, b: int32x2_t) -> i32 {
+ static_assert_imm1!(LANE);
+ vqrdmulhs_s32(a, simd_extract(b, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrdmulhs_laneq_s32<const LANE: i32>(a: i32, b: int32x4_t) -> i32 {
+ static_assert_imm2!(LANE);
+ vqrdmulhs_s32(a, simd_extract(b, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlah.v4i16")]
+ fn vqrdmlah_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
+ }
+ vqrdmlah_s16_(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlah.v8i16")]
+ fn vqrdmlahq_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
+ }
+ vqrdmlahq_s16_(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlah.v2i32")]
+ fn vqrdmlah_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
+ }
+ vqrdmlah_s32_(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlah.v4i32")]
+ fn vqrdmlahq_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
+ }
+ vqrdmlahq_s32_(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahh_s16(a: i16, b: i16, c: i16) -> i16 {
+ let a: int16x4_t = vdup_n_s16(a);
+ let b: int16x4_t = vdup_n_s16(b);
+ let c: int16x4_t = vdup_n_s16(c);
+ simd_extract(vqrdmlah_s16(a, b, c), 0)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32 {
+ let a: int32x2_t = vdup_n_s32(a);
+ let b: int32x2_t = vdup_n_s32(b);
+ let c: int32x2_t = vdup_n_s32(c);
+ simd_extract(vqrdmlah_s32(a, b, c), 0)
+}
+
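+// Sketch of the SQRDMLAH arithmetic (informal; accumulate, round, and
+// saturate all happen in the widened domain before the final shift):
+//
+//     res = sat_i16((((a as i32) << 16) + 2 * b * c + 0x8000) >> 16)
+//
+// i.e. roughly `a + rounded_high_half(2 * b * c)`, with a single saturation
+// at the end rather than one per step.
+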
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlah_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ let c: int16x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlah_s16(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlah_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(LANE);
+ let c: int16x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlah_s16(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(LANE);
+ let c: int16x8_t = simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlahq_s16(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ let c: int16x8_t = simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlahq_s16(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlah_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ let c: int32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]);
+ vqrdmlah_s32(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlah_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ let c: int32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]);
+ vqrdmlah_s32(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ let c: int32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlahq_s32(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ let c: int32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlahq_s32(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahh_lane_s16<const LANE: i32>(a: i16, b: i16, c: int16x4_t) -> i16 {
+ static_assert_imm2!(LANE);
+ vqrdmlahh_s16(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahh_laneq_s16<const LANE: i32>(a: i16, b: i16, c: int16x8_t) -> i16 {
+ static_assert_imm3!(LANE);
+ vqrdmlahh_s16(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahs_lane_s32<const LANE: i32>(a: i32, b: i32, c: int32x2_t) -> i32 {
+ static_assert_imm1!(LANE);
+ vqrdmlahs_s32(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlahs_laneq_s32<const LANE: i32>(a: i32, b: i32, c: int32x4_t) -> i32 {
+ static_assert_imm2!(LANE);
+ vqrdmlahs_s32(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlsh_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlsh.v4i16")]
+ fn vqrdmlsh_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
+ }
+ vqrdmlsh_s16_(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlsh.v8i16")]
+ fn vqrdmlshq_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
+ }
+ vqrdmlshq_s16_(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlsh_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlsh.v2i32")]
+ fn vqrdmlsh_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
+ }
+ vqrdmlsh_s32_(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlsh.v4i32")]
+ fn vqrdmlshq_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
+ }
+ vqrdmlshq_s32_(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshh_s16(a: i16, b: i16, c: i16) -> i16 {
+ let a: int16x4_t = vdup_n_s16(a);
+ let b: int16x4_t = vdup_n_s16(b);
+ let c: int16x4_t = vdup_n_s16(c);
+ simd_extract(vqrdmlsh_s16(a, b, c), 0)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh))]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshs_s32(a: i32, b: i32, c: i32) -> i32 {
+ let a: int32x2_t = vdup_n_s32(a);
+ let b: int32x2_t = vdup_n_s32(b);
+ let c: int32x2_t = vdup_n_s32(c);
+ simd_extract(vqrdmlsh_s32(a, b, c), 0)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlsh_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ let c: int16x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlsh_s16(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlsh_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(LANE);
+ let c: int16x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlsh_s16(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(LANE);
+ let c: int16x8_t = simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlshq_s16(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ let c: int16x8_t = simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlshq_s16(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlsh_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ let c: int32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]);
+ vqrdmlsh_s32(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlsh_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ let c: int32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]);
+ vqrdmlsh_s32(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ let c: int32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlshq_s32(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ let c: int32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmlshq_s32(a, b, c)
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshh_lane_s16<const LANE: i32>(a: i16, b: i16, c: int16x4_t) -> i16 {
+ static_assert_imm2!(LANE);
+ vqrdmlshh_s16(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshh_laneq_s16<const LANE: i32>(a: i16, b: i16, c: int16x8_t) -> i16 {
+ static_assert_imm3!(LANE);
+ vqrdmlshh_s16(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshs_lane_s32<const LANE: i32>(a: i32, b: i32, c: int32x2_t) -> i32 {
+ static_assert_imm1!(LANE);
+ vqrdmlshs_s32(a, b, simd_extract(c, LANE as u32))
+}
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+#[inline]
+#[target_feature(enable = "rdm")]
+#[cfg_attr(test, assert_instr(sqrdmlsh, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[stable(feature = "rdm_intrinsics", since = "1.62.0")]
+pub unsafe fn vqrdmlshs_laneq_s32<const LANE: i32>(a: i32, b: i32, c: int32x4_t) -> i32 {
+ static_assert_imm2!(LANE);
+ vqrdmlshs_s32(a, b, simd_extract(c, LANE as u32))
+}
+
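+// SQRDMLSH mirrors SQRDMLAH with a subtraction in the widened domain
+// (informal sketch):
+//
+//     res = sat_i16((((a as i32) << 16) - 2 * b * c + 0x8000) >> 16)
+//
+// so `vqrdmlshh_s16(a, b, c)` is roughly `a - rounded_high_half(2 * b * c)`.
+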
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshls_s32(a: i32, b: i32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.i32")]
+ fn vqrshls_s32_(a: i32, b: i32) -> i32;
+ }
+ vqrshls_s32_(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshld_s64(a: i64, b: i64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.i64")]
+ fn vqrshld_s64_(a: i64, b: i64) -> i64;
+ }
+ vqrshld_s64_(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshlb_s8(a: i8, b: i8) -> i8 {
+ let a: int8x8_t = vdup_n_s8(a);
+ let b: int8x8_t = vdup_n_s8(b);
+ simd_extract(vqrshl_s8(a, b), 0)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshlh_s16(a: i16, b: i16) -> i16 {
+ let a: int16x4_t = vdup_n_s16(a);
+ let b: int16x4_t = vdup_n_s16(b);
+ simd_extract(vqrshl_s16(a, b), 0)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshls_u32(a: u32, b: i32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.i32")]
+ fn vqrshls_u32_(a: u32, b: i32) -> u32;
+ }
+ vqrshls_u32_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshld_u64(a: u64, b: i64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.i64")]
+ fn vqrshld_u64_(a: u64, b: i64) -> u64;
+ }
+ vqrshld_u64_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshlb_u8(a: u8, b: i8) -> u8 {
+ let a: uint8x8_t = vdup_n_u8(a);
+ let b: int8x8_t = vdup_n_s8(b);
+ simd_extract(vqrshl_u8(a, b), 0)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshlh_u16(a: u16, b: i16) -> u16 {
+ let a: uint16x4_t = vdup_n_u16(a);
+ let b: int16x4_t = vdup_n_s16(b);
+ simd_extract(vqrshl_u16(a, b), 0)
+}
+
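+// Usage sketch for the saturating rounding shifts (illustrative values): the
+// shift operand is signed, so a negative `b` becomes a rounding right shift:
+//
+//     unsafe {
+//         assert_eq!(vqrshlh_s16(11, -2), 3);           // (11 + 2) >> 2, rounded
+//         assert_eq!(vqrshlh_s16(0x4000, 2), i16::MAX); // left shift saturates
+//         assert_eq!(vqrshlb_u8(200, 1), u8::MAX);      // unsigned saturation
+//     }
+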
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrnh_n_s16<const N: i32>(a: i16) -> i8 {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ let a: int16x8_t = vdupq_n_s16(a);
+ simd_extract(vqrshrn_n_s16::<N>(a), 0)
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrns_n_s32<const N: i32>(a: i32) -> i16 {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ let a: int32x4_t = vdupq_n_s32(a);
+ simd_extract(vqrshrn_n_s32::<N>(a), 0)
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrnd_n_s64<const N: i32>(a: i64) -> i32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ let a: int64x2_t = vdupq_n_s64(a);
+ simd_extract(vqrshrn_n_s64::<N>(a), 0)
+}
+
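+// Worked example of the rounded shift-right-narrow (informal):
+//
+//     res = sat_narrow((a + (1 << (N - 1))) >> N)
+//
+//     unsafe {
+//         assert_eq!(vqrshrns_n_s32::<8>(1000), 4);           // (1000 + 128) >> 8
+//         assert_eq!(vqrshrnh_n_s16::<1>(i16::MAX), i8::MAX); // narrows, then saturates
+//     }
+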
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vqrshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vqrshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vqrshrn_n_s64::<N>(b), [0, 1, 2, 3])
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrnh_n_u16<const N: i32>(a: u16) -> u8 {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ let a: uint16x8_t = vdupq_n_u16(a);
+ simd_extract(vqrshrn_n_u16::<N>(a), 0)
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrns_n_u32<const N: i32>(a: u32) -> u16 {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ let a: uint32x4_t = vdupq_n_u32(a);
+ simd_extract(vqrshrn_n_u32::<N>(a), 0)
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrnd_n_u64<const N: i32>(a: u64) -> u32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ let a: uint64x2_t = vdupq_n_u64(a);
+ simd_extract(vqrshrn_n_u64::<N>(a), 0)
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vqrshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vqrshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vqrshrn_n_u64::<N>(b), [0, 1, 2, 3])
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrunh_n_s16<const N: i32>(a: i16) -> u8 {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ let a: int16x8_t = vdupq_n_s16(a);
+ simd_extract(vqrshrun_n_s16::<N>(a), 0)
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshruns_n_s32<const N: i32>(a: i32) -> u16 {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ let a: int32x4_t = vdupq_n_s32(a);
+ simd_extract(vqrshrun_n_s32::<N>(a), 0)
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrund_n_s64<const N: i32>(a: i64) -> u32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ let a: int64x2_t = vdupq_n_s64(a);
+ simd_extract(vqrshrun_n_s64::<N>(a), 0)
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrun_high_n_s16<const N: i32>(a: uint8x8_t, b: int16x8_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vqrshrun_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrun_high_n_s32<const N: i32>(a: uint16x4_t, b: int32x4_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vqrshrun_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrun_high_n_s64<const N: i32>(a: uint32x2_t, b: int64x2_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vqrshrun_n_s64::<N>(b), [0, 1, 2, 3])
+}
+
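+// The unsigned-narrow (`run`) forms take a signed input and clamp into the
+// unsigned range (illustrative):
+//
+//     unsafe {
+//         assert_eq!(vqrshrunh_n_s16::<2>(-100), 0);       // negatives clamp to 0
+//         assert_eq!(vqrshrunh_n_s16::<2>(1023), u8::MAX); // (1023 + 2) >> 2 == 256, clamps
+//     }
+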
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshld_s64(a: i64, b: i64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.i64")]
+ fn vqshld_s64_(a: i64, b: i64) -> i64;
+ }
+ vqshld_s64_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlb_s8(a: i8, b: i8) -> i8 {
+ let c: int8x8_t = vqshl_s8(vdup_n_s8(a), vdup_n_s8(b));
+ simd_extract(c, 0)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlh_s16(a: i16, b: i16) -> i16 {
+ let c: int16x4_t = vqshl_s16(vdup_n_s16(a), vdup_n_s16(b));
+ simd_extract(c, 0)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshls_s32(a: i32, b: i32) -> i32 {
+ let c: int32x2_t = vqshl_s32(vdup_n_s32(a), vdup_n_s32(b));
+ simd_extract(c, 0)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshld_u64(a: u64, b: i64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.i64")]
+ fn vqshld_u64_(a: u64, b: i64) -> u64;
+ }
+ vqshld_u64_(a, b)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlb_u8(a: u8, b: i8) -> u8 {
+ let c: uint8x8_t = vqshl_u8(vdup_n_u8(a), vdup_n_s8(b));
+ simd_extract(c, 0)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlh_u16(a: u16, b: i16) -> u16 {
+ let c: uint16x4_t = vqshl_u16(vdup_n_u16(a), vdup_n_s16(b));
+ simd_extract(c, 0)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshls_u32(a: u32, b: i32) -> u32 {
+ let c: uint32x2_t = vqshl_u32(vdup_n_u32(a), vdup_n_s32(b));
+ simd_extract(c, 0)
+}
+
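+// Unlike SQRSHL above, SQSHL truncates when the (signed) shift operand is
+// negative; only left-shift overflow saturates (illustrative):
+//
+//     unsafe {
+//         assert_eq!(vqshlh_s16(11, -2), 2);           // 11 >> 2, truncated (SQRSHL gives 3)
+//         assert_eq!(vqshlh_s16(0x4000, 2), i16::MAX); // saturated
+//     }
+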
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlb_n_s8<const N: i32>(a: i8) -> i8 {
+ static_assert_imm3!(N);
+ simd_extract(vqshl_n_s8::<N>(vdup_n_s8(a)), 0)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlh_n_s16<const N: i32>(a: i16) -> i16 {
+ static_assert_imm4!(N);
+ simd_extract(vqshl_n_s16::<N>(vdup_n_s16(a)), 0)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshls_n_s32<const N: i32>(a: i32) -> i32 {
+ static_assert_imm5!(N);
+ simd_extract(vqshl_n_s32::<N>(vdup_n_s32(a)), 0)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshld_n_s64<const N: i32>(a: i64) -> i64 {
+ static_assert_imm6!(N);
+ simd_extract(vqshl_n_s64::<N>(vdup_n_s64(a)), 0)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlb_n_u8<const N: i32>(a: u8) -> u8 {
+ static_assert_imm3!(N);
+ simd_extract(vqshl_n_u8::<N>(vdup_n_u8(a)), 0)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlh_n_u16<const N: i32>(a: u16) -> u16 {
+ static_assert_imm4!(N);
+ simd_extract(vqshl_n_u16::<N>(vdup_n_u16(a)), 0)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshls_n_u32<const N: i32>(a: u32) -> u32 {
+ static_assert_imm5!(N);
+ simd_extract(vqshl_n_u32::<N>(vdup_n_u32(a)), 0)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshld_n_u64<const N: i32>(a: u64) -> u64 {
+ static_assert_imm6!(N);
+ simd_extract(vqshl_n_u64::<N>(vdup_n_u64(a)), 0)
+}
+
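+// The `_n_` forms take the shift as a const generic, supplied with turbofish
+// syntax (illustrative):
+//
+//     unsafe {
+//         assert_eq!(vqshlb_n_s8::<2>(0x30), i8::MAX); // 0x30 << 2 == 0xC0, saturates
+//         assert_eq!(vqshld_n_u64::<1>(1), 2);
+//     }
+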
+/// Signed saturating shift left unsigned
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlub_n_s8<const N: i32>(a: i8) -> u8 {
+ static_assert_imm3!(N);
+ simd_extract(vqshlu_n_s8::<N>(vdup_n_s8(a)), 0)
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshluh_n_s16<const N: i32>(a: i16) -> u16 {
+ static_assert_imm4!(N);
+ simd_extract(vqshlu_n_s16::<N>(vdup_n_s16(a)), 0)
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlus_n_s32<const N: i32>(a: i32) -> u32 {
+ static_assert_imm5!(N);
+ simd_extract(vqshlu_n_s32::<N>(vdup_n_s32(a)), 0)
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlud_n_s64<const N: i32>(a: i64) -> u64 {
+ static_assert_imm6!(N);
+ simd_extract(vqshlu_n_s64::<N>(vdup_n_s64(a)), 0)
+}
+
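+// SQSHLU shifts a signed value into the unsigned range, so negative inputs
+// clamp to zero (illustrative):
+//
+//     unsafe {
+//         assert_eq!(vqshlub_n_s8::<1>(-3), 0);
+//         assert_eq!(vqshlub_n_s8::<1>(100), 200); // fits in u8, no saturation
+//     }
+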
+/// Signed saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrnd_n_s64<const N: i32>(a: i64) -> i32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.i32")]
+ fn vqshrnd_n_s64_(a: i64, n: i32) -> i32;
+ }
+ vqshrnd_n_s64_(a, N)
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrnh_n_s16<const N: i32>(a: i16) -> i8 {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_extract(vqshrn_n_s16::<N>(vdupq_n_s16(a)), 0)
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrns_n_s32<const N: i32>(a: i32) -> i16 {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_extract(vqshrn_n_s32::<N>(vdupq_n_s32(a)), 0)
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vqshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vqshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vqshrn_n_s64::<N>(b), [0, 1, 2, 3])
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrnd_n_u64<const N: i32>(a: u64) -> u32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.i32")]
+ fn vqshrnd_n_u64_(a: u64, n: i32) -> u32;
+ }
+ vqshrnd_n_u64_(a, N)
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrnh_n_u16<const N: i32>(a: u16) -> u8 {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_extract(vqshrn_n_u16::<N>(vdupq_n_u16(a)), 0)
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrns_n_u32<const N: i32>(a: u32) -> u16 {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_extract(vqshrn_n_u32::<N>(vdupq_n_u32(a)), 0)
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vqshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vqshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vqshrn_n_u64::<N>(b), [0, 1, 2, 3])
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrunh_n_s16<const N: i32>(a: i16) -> u8 {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_extract(vqshrun_n_s16::<N>(vdupq_n_s16(a)), 0)
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshruns_n_s32<const N: i32>(a: i32) -> u16 {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_extract(vqshrun_n_s32::<N>(vdupq_n_s32(a)), 0)
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrund_n_s64<const N: i32>(a: i64) -> u32 {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_extract(vqshrun_n_s64::<N>(vdupq_n_s64(a)), 0)
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrun_high_n_s16<const N: i32>(a: uint8x8_t, b: int16x8_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vqshrun_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrun_high_n_s32<const N: i32>(a: uint16x4_t, b: int32x4_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vqshrun_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrun_high_n_s64<const N: i32>(a: uint32x2_t, b: int64x2_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vqshrun_n_s64::<N>(b), [0, 1, 2, 3])
+}
+
+/// Unsigned saturating accumulate of signed value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqaddb_u8(a: u8, b: i8) -> u8 {
+ simd_extract(vsqadd_u8(vdup_n_u8(a), vdup_n_s8(b)), 0)
+}
+
+/// Unsigned saturating accumulate of signed value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqaddh_u16(a: u16, b: i16) -> u16 {
+ simd_extract(vsqadd_u16(vdup_n_u16(a), vdup_n_s16(b)), 0)
+}
+
+/// Unsigned saturating accumulate of signed value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqadds_u32(a: u32, b: i32) -> u32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usqadd.i32")]
+ fn vsqadds_u32_(a: u32, b: i32) -> u32;
+ }
+ vsqadds_u32_(a, b)
+}
+
+/// Unsigned saturating accumulate of signed value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqaddd_u64(a: u64, b: i64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usqadd.i64")]
+ fn vsqaddd_u64_(a: u64, b: i64) -> u64;
+ }
+ vsqaddd_u64_(a, b)
+}
+
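+// USQADD adds a signed value into an unsigned accumulator, saturating at both
+// ends of the unsigned range (illustrative):
+//
+//     unsafe {
+//         assert_eq!(vsqaddb_u8(240, 32), u8::MAX); // 272 clamps to 255
+//         assert_eq!(vsqaddb_u8(4, -8), 0);         // -4 clamps to 0
+//     }
+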
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqrt_f32(a: float32x2_t) -> float32x2_t {
+ simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqrtq_f32(a: float32x4_t) -> float32x4_t {
+ simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqrt_f64(a: float64x1_t) -> float64x1_t {
+ simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqrtq_f64(a: float64x2_t) -> float64x2_t {
+ simd_fsqrt(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrte))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsqrte_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v1f64")]
+ fn vrsqrte_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrsqrte_f64_(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrte))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsqrteq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f64")]
+ fn vrsqrteq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrsqrteq_f64_(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrte))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsqrtes_f32(a: f32) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.f32")]
+ fn vrsqrtes_f32_(a: f32) -> f32;
+ }
+ vrsqrtes_f32_(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrte))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsqrted_f64(a: f64) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.f64")]
+ fn vrsqrted_f64_(a: f64) -> f64;
+ }
+ vrsqrted_f64_(a)
+}
+
+/// Floating-point reciprocal square root step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrts))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsqrts_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrts.v1f64")]
+ fn vrsqrts_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+ }
+ vrsqrts_f64_(a, b)
+}
+
+/// Floating-point reciprocal square root step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrts))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsqrtsq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrts.v2f64")]
+ fn vrsqrtsq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vrsqrtsq_f64_(a, b)
+}
+
+/// Floating-point reciprocal square root step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrts))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsqrtss_f32(a: f32, b: f32) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrts.f32")]
+ fn vrsqrtss_f32_(a: f32, b: f32) -> f32;
+ }
+ vrsqrtss_f32_(a, b)
+}
+
+/// Floating-point reciprocal square root step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrts))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsqrtsd_f64(a: f64, b: f64) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrts.f64")]
+ fn vrsqrtsd_f64_(a: f64, b: f64) -> f64;
+ }
+ vrsqrtsd_f64_(a, b)
+}
+
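+// FRSQRTE/FRSQRTS pair up for Newton-Raphson refinement of 1/sqrt(x): the
+// step computes (3 - a*b) / 2, and each iteration roughly doubles the
+// accurate bits of the ~8-bit initial estimate (a sketch, not a library API):
+//
+//     #[target_feature(enable = "neon")]
+//     unsafe fn rsqrt_approx(x: f32) -> f32 {
+//         let mut e = vrsqrtes_f32(x);      // coarse estimate
+//         e *= vrsqrtss_f32(x * e, e);      // e * (3 - x*e*e) / 2
+//         e *= vrsqrtss_f32(x * e, e);      // second refinement step
+//         e
+//     }
+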
+/// Reciprocal estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecpe))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecpe_f64(a: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v1f64")]
+ fn vrecpe_f64_(a: float64x1_t) -> float64x1_t;
+ }
+ vrecpe_f64_(a)
+}
+
+/// Reciprocal estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecpe))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecpeq_f64(a: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v2f64")]
+ fn vrecpeq_f64_(a: float64x2_t) -> float64x2_t;
+ }
+ vrecpeq_f64_(a)
+}
+
+/// Reciprocal estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecpe))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecpes_f32(a: f32) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.f32")]
+ fn vrecpes_f32_(a: f32) -> f32;
+ }
+ vrecpes_f32_(a)
+}
+
+/// Reciprocal estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecpe))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecped_f64(a: f64) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.f64")]
+ fn vrecped_f64_(a: f64) -> f64;
+ }
+ vrecped_f64_(a)
+}
+
+/// Floating-point reciprocal step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecps_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecps.v1f64")]
+ fn vrecps_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+ }
+ vrecps_f64_(a, b)
+}
+
+/// Floating-point reciprocal step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecpsq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecps.v2f64")]
+ fn vrecpsq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+ }
+ vrecpsq_f64_(a, b)
+}
+
+/// Floating-point reciprocal step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecpss_f32(a: f32, b: f32) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecps.f32")]
+ fn vrecpss_f32_(a: f32, b: f32) -> f32;
+ }
+ vrecpss_f32_(a, b)
+}
+
+/// Floating-point reciprocal step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecps))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecpsd_f64(a: f64, b: f64) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecps.f64")]
+ fn vrecpsd_f64_(a: f64, b: f64) -> f64;
+ }
+ vrecpsd_f64_(a, b)
+}
+
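+// FRECPE/FRECPS refine the same way for 1/x: the step computes 2 - a*b, so
+// y' = y * (2 - x*y) is one Newton-Raphson iteration (a sketch):
+//
+//     #[target_feature(enable = "neon")]
+//     unsafe fn recip_approx(x: f32) -> f32 {
+//         let mut e = vrecpes_f32(x);   // coarse estimate
+//         e *= vrecpss_f32(x, e);       // e * (2 - x*e)
+//         e *= vrecpss_f32(x, e);       // second refinement step
+//         e
+//     }
+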
+/// Floating-point reciprocal exponent
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecpx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecpxs_f32(a: f32) -> f32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpx.f32")]
+ fn vrecpxs_f32_(a: f32) -> f32;
+ }
+ vrecpxs_f32_(a)
+}
+
+/// Floating-point reciprocal exponent
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frecpx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrecpxd_f64(a: f64) -> f64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpx.f64")]
+ fn vrecpxd_f64_(a: f64) -> f64;
+ }
+ vrecpxd_f64_(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_s64_p64(a: poly64x1_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_u64_p64(a: poly64x1_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_p64_s64(a: int64x1_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_p64_u64(a: uint64x1_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_s64_p64(a: poly64x2_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_u64_p64(a: poly64x2_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_p64_s64(a: int64x2_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_p64_u64(a: uint64x2_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_s8_f64(a: float64x1_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_s16_f64(a: float64x1_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_s32_f64(a: float64x1_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_s64_f64(a: float64x1_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_s8_f64(a: float64x2_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_s16_f64(a: float64x2_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_s32_f64(a: float64x2_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_s64_f64(a: float64x2_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_u8_f64(a: float64x1_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_u16_f64(a: float64x1_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_u32_f64(a: float64x1_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_u64_f64(a: float64x1_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_u8_f64(a: float64x2_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_u16_f64(a: float64x2_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_u32_f64(a: float64x2_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_u64_f64(a: float64x2_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_p8_f64(a: float64x1_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_p16_f64(a: float64x1_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_p64_f32(a: float32x2_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_p64_f64(a: float64x1_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_p8_f64(a: float64x2_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_p16_f64(a: float64x2_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_p64_f32(a: float32x4_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_p64_f64(a: float64x2_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_p128_f64(a: float64x2_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_s8(a: int8x8_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_s16(a: int16x4_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_s32(a: int32x2_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_s64(a: int64x1_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_s8(a: int8x16_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_s16(a: int16x8_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_s32(a: int32x4_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_s64(a: int64x2_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_p8(a: poly8x8_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_u16(a: uint16x4_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_u32(a: uint32x2_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_u64(a: uint64x1_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_p8(a: poly8x16_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_u16(a: uint16x8_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_u32(a: uint32x4_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_u64(a: uint64x2_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_u8(a: uint8x8_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_p16(a: poly16x4_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_p64(a: poly64x1_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f32_p64(a: poly64x1_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_u8(a: uint8x16_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_p16(a: poly16x8_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_p64(a: poly64x2_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f32_p64(a: poly64x2_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_p128(a: p128) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f64_f32(a: float32x2_t) -> float64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpret_f32_f64(a: float64x1_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f64_f32(a: float32x4_t) -> float64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vreinterpretq_f32_f64(a: float64x2_t) -> float32x4_t {
+ transmute(a)
+}
+
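+// A minimal sketch: every `vreinterpret*` above is a bit-level `transmute`
+// between equally sized vectors and compiles to no instruction. For example,
+// viewing the IEEE 754 bit pattern of a `float64x1_t` as a `uint64x1_t`
+// (assumes `vdup_n_f64` and `vget_lane_u64` from elsewhere in this crate):
+unsafe fn f64_bits_sketch(x: f64) -> u64 {
+    let v: uint64x1_t = vreinterpret_u64_f64(vdup_n_f64(x));
+    vget_lane_u64::<0>(v)    // equivalent to x.to_bits()
+}
+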
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(srshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshld_s64(a: i64, b: i64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.i64")]
+ fn vrshld_s64_(a: i64, b: i64) -> i64;
+ }
+ vrshld_s64_(a, b)
+}
+
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(urshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshld_u64(a: u64, b: i64) -> u64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.i64")]
+ fn vrshld_u64_(a: u64, b: i64) -> u64;
+ }
+ vrshld_u64_(a, b)
+}
+
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrd_n_s64<const N: i32>(a: i64) -> i64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ vrshld_s64(a, -N as i64)
+}
+
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrd_n_u64<const N: i32>(a: u64) -> u64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ vrshld_u64(a, -N as i64)
+}
+
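+// A minimal sketch of the rounding behaviour: a rounding shift right by `N`
+// adds `1 << (N - 1)` before shifting, so it rounds to nearest (ties round
+// up) instead of truncating like a plain `>>`.
+unsafe fn rounding_shift_sketch() {
+    assert_eq!(vrshrd_n_s64::<2>(6), 2);    // (6 + 2) >> 2; plain 6 >> 2 == 1
+    assert_eq!(vrshrd_n_s64::<2>(5), 1);    // (5 + 2) >> 2 == 1
+}
+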
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vrshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vrshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vrshrn_n_s64::<N>(b), [0, 1, 2, 3])
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vrshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vrshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vrshrn_n_u64::<N>(b), [0, 1, 2, 3])
+}
+
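+// A minimal sketch of the `_high_` pattern: plain `vrshrn_n_*` (from the
+// shared ARM module) narrows a 128-bit vector into 64 bits, and the `_high_`
+// form packs a second narrowed result into the upper half, so two calls
+// narrow a pair of 128-bit inputs into one full 128-bit vector.
+unsafe fn narrow_pair_sketch(lo: int16x8_t, hi: int16x8_t) -> int8x16_t {
+    let low_half: int8x8_t = vrshrn_n_s16::<4>(lo);    // lanes 0..=7
+    vrshrn_high_n_s16::<4>(low_half, hi)               // lanes 8..=15
+}
+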
+/// Signed rounding shift right and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ let b: i64 = vrshrd_n_s64::<N>(b);
+ a.wrapping_add(b)
+}
+
+/// Unsigned rounding shift right and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsrad_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ let b: u64 = vrshrd_n_u64::<N>(b);
+ a.wrapping_add(b)
+}
+
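+// A minimal sketch: `vrsrad_n_*` fuses a rounding shift right with an
+// accumulate, adding a rounded, down-scaled correction onto a running sum
+// in one step.
+unsafe fn rsra_sketch(acc: i64, delta: i64) -> i64 {
+    vrsrad_n_s64::<8>(acc, delta)    // acc + ((delta + 128) >> 8)
+}
+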
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rsubhn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
+ let x: int8x8_t = vrsubhn_s16(b, c);
+ simd_shuffle16!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rsubhn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
+ let x: int16x4_t = vrsubhn_s32(b, c);
+ simd_shuffle8!(a, x, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rsubhn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
+ let x: int32x2_t = vrsubhn_s64(b, c);
+ simd_shuffle4!(a, x, [0, 1, 2, 3])
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rsubhn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
+ let x: uint8x8_t = vrsubhn_u16(b, c);
+ simd_shuffle16!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rsubhn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
+ let x: uint16x4_t = vrsubhn_u32(b, c);
+ simd_shuffle8!(a, x, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rsubhn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
+ let x: uint32x2_t = vrsubhn_u64(b, c);
+ simd_shuffle4!(a, x, [0, 1, 2, 3])
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vset_lane_f64<const LANE: i32>(a: f64, b: float64x1_t) -> float64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsetq_lane_f64<const LANE: i32>(a: f64, b: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
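+// A minimal sketch: building a `float64x2_t` lane by lane with `vsetq_lane`
+// (assumes `vdupq_n_f64` from elsewhere in this module).
+unsafe fn from_scalars_sketch(a: f64, b: f64) -> float64x2_t {
+    let v = vdupq_n_f64(a);      // [a, a]
+    vsetq_lane_f64::<1>(b, v)    // [a, b]
+}
+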
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sshl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshld_s64(a: i64, b: i64) -> i64 {
+ transmute(vshl_s64(transmute(a), transmute(b)))
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ushl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshld_u64(a: u64, b: i64) -> u64 {
+ transmute(vshl_u64(transmute(a), transmute(b)))
+}
+
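+// A minimal sketch: the `b` operand of `vshld_*` is a signed shift count, so
+// a negative value shifts right instead.
+unsafe fn shld_sketch() {
+    assert_eq!(vshld_s64(1, 3), 8);     // 1 << 3
+    assert_eq!(vshld_s64(8, -3), 1);    // 8 >> 3
+}
+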
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sshll2, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshll_high_n_s8<const N: i32>(a: int8x16_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 0 && N <= 8);
+ let b: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vshll_n_s8::<N>(b)
+}
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sshll2, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshll_high_n_s16<const N: i32>(a: int16x8_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 0 && N <= 16);
+ let b: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ vshll_n_s16::<N>(b)
+}
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sshll2, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshll_high_n_s32<const N: i32>(a: int32x4_t) -> int64x2_t {
+ static_assert!(N : i32 where N >= 0 && N <= 32);
+ let b: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ vshll_n_s32::<N>(b)
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ushll2, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshll_high_n_u8<const N: i32>(a: uint8x16_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 0 && N <= 8);
+ let b: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ vshll_n_u8::<N>(b)
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ushll2, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshll_high_n_u16<const N: i32>(a: uint16x8_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 0 && N <= 16);
+ let b: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ vshll_n_u16::<N>(b)
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ushll2, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshll_high_n_u32<const N: i32>(a: uint32x4_t) -> uint64x2_t {
+ static_assert!(N : i32 where N >= 0 && N <= 32);
+ let b: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ vshll_n_u32::<N>(b)
+}
+
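+// A minimal sketch of the widening pair: `vshll_n_*` (shared ARM module)
+// widens the low eight lanes and `vshll_high_n_*` the upper eight, so
+// together they widen all sixteen (assumes `vget_low_s8` and `vshll_n_s8`).
+unsafe fn widen_all_sketch(a: int8x16_t) -> (int16x8_t, int16x8_t) {
+    (vshll_n_s8::<0>(vget_low_s8(a)), vshll_high_n_s8::<0>(a))
+}
+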
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(shrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(shrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(shrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vshrn_n_s64::<N>(b), [0, 1, 2, 3])
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(shrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_shuffle16!(a, vshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(shrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_shuffle8!(a, vshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(shrn2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_shuffle4!(a, vshrn_n_u64::<N>(b), [0, 1, 2, 3])
+}
+
+/// SM3PARTW1
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm3partw1))]
+pub unsafe fn vsm3partw1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3partw1")]
+ fn vsm3partw1q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
+ }
+ vsm3partw1q_u32_(a, b, c)
+}
+
+/// SM3PARTW2
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm3partw2))]
+pub unsafe fn vsm3partw2q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3partw2")]
+ fn vsm3partw2q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
+ }
+ vsm3partw2q_u32_(a, b, c)
+}
+
+/// SM3SS1
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm3ss1))]
+pub unsafe fn vsm3ss1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3ss1")]
+ fn vsm3ss1q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
+ }
+ vsm3ss1q_u32_(a, b, c)
+}
+
+/// SM4 key
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm4ekey))]
+pub unsafe fn vsm4ekeyq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm4ekey")]
+ fn vsm4ekeyq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+ vsm4ekeyq_u32_(a, b)
+}
+
+/// SM4 encode
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm4e))]
+pub unsafe fn vsm4eq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm4e")]
+ fn vsm4eq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+ vsm4eq_u32_(a, b)
+}
+
+/// Rotate and exclusive OR
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(rax1))]
+pub unsafe fn vrax1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.rax1")]
+ fn vrax1q_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
+ }
+ vrax1q_u64_(a, b)
+}
+
+/// SHA512 hash update part 1
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(sha512h))]
+pub unsafe fn vsha512hq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512h")]
+ fn vsha512hq_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t;
+ }
+ vsha512hq_u64_(a, b, c)
+}
+
+/// SHA512 hash update part 2
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(sha512h2))]
+pub unsafe fn vsha512h2q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512h2")]
+ fn vsha512h2q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t;
+ }
+ vsha512h2q_u64_(a, b, c)
+}
+
+/// SHA512 schedule update 0
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(sha512su0))]
+pub unsafe fn vsha512su0q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512su0")]
+ fn vsha512su0q_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
+ }
+ vsha512su0q_u64_(a, b)
+}
+
+/// SHA512 schedule update 1
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(sha512su1))]
+pub unsafe fn vsha512su1q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512su1")]
+ fn vsha512su1q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t;
+ }
+ vsha512su1q_u64_(a, b, c)
+}
+
+/// Floating-point round to 32-bit integer, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint32x))]
+pub unsafe fn vrnd32x_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32x.v2f32")]
+ fn vrnd32x_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrnd32x_f32_(a)
+}
+
+/// Floating-point round to 32-bit integer, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint32x))]
+pub unsafe fn vrnd32xq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32x.v4f32")]
+ fn vrnd32xq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrnd32xq_f32_(a)
+}
+
+/// Floating-point round to 32-bit integer toward zero
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint32z))]
+pub unsafe fn vrnd32z_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32z.v2f32")]
+ fn vrnd32z_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrnd32z_f32_(a)
+}
+
+/// Floating-point round to 32-bit integer toward zero
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint32z))]
+pub unsafe fn vrnd32zq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32z.v4f32")]
+ fn vrnd32zq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrnd32zq_f32_(a)
+}
+
+/// Floating-point round to 64-bit integer, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint64x))]
+pub unsafe fn vrnd64x_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64x.v2f32")]
+ fn vrnd64x_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrnd64x_f32_(a)
+}
+
+/// Floating-point round to 64-bit integer, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint64x))]
+pub unsafe fn vrnd64xq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64x.v4f32")]
+ fn vrnd64xq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrnd64xq_f32_(a)
+}
+
+/// Floating-point round to 64-bit integer toward zero
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint64z))]
+pub unsafe fn vrnd64z_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64z.v2f32")]
+ fn vrnd64z_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrnd64z_f32_(a)
+}
+
+/// Floating-point round to 64-bit integer toward zero
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint64z))]
+pub unsafe fn vrnd64zq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64z.v4f32")]
+ fn vrnd64zq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrnd64zq_f32_(a)
+}
+
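+// A minimal sketch: unlike the generic `frintz` round, the `frint32*` and
+// `frint64*` forms also force each lane into the 32-/64-bit integer range
+// (out-of-range lanes become the most negative representable value), which
+// makes a following narrowing convert safe.
+unsafe fn rnd32z_sketch(v: float32x4_t) -> float32x4_t {
+    vrnd32zq_f32(v)    // e.g. a 1.75 lane becomes 1.0
+}
+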
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_shuffle4!(a, b, [0, 4, 2, 6])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_shuffle4!(a, b, [0, 4, 2, 6])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_shuffle4!(a, b, [0, 4, 2, 6])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_shuffle4!(a, b, [0, 4, 2, 6])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ simd_shuffle4!(a, b, [0, 4, 2, 6])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_shuffle4!(a, b, [0, 4, 2, 6])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_shuffle4!(a, b, [1, 5, 3, 7])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_shuffle4!(a, b, [1, 5, 3, 7])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_shuffle4!(a, b, [1, 5, 3, 7])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_shuffle4!(a, b, [1, 5, 3, 7])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ simd_shuffle4!(a, b, [1, 5, 3, 7])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(trn2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_shuffle4!(a, b, [1, 5, 3, 7])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Transpose vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtrn2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
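+// A minimal sketch: `trn1` keeps the even-indexed lanes of each source and
+// `trn2` the odd-indexed ones, together transposing 2x2 blocks of lanes.
+unsafe fn trn_sketch(a: int16x4_t, b: int16x4_t) -> (int16x4_t, int16x4_t) {
+    // a = [a0, a1, a2, a3], b = [b0, b1, b2, b3]
+    (vtrn1_s16(a, b),    // [a0, b0, a2, b2]
+     vtrn2_s16(a, b))    // [a1, b1, a3, b3]
+}
+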
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_shuffle4!(a, b, [0, 4, 1, 5])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_shuffle4!(a, b, [0, 4, 1, 5])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_shuffle4!(a, b, [0, 4, 1, 5])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_shuffle4!(a, b, [0, 4, 1, 5])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ simd_shuffle4!(a, b, [0, 4, 1, 5])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_shuffle4!(a, b, [0, 4, 1, 5])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
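+// A minimal sketch: `zip1` interleaves the low halves of its two sources and
+// `zip2` (defined below) the high halves.
+unsafe fn zip_sketch(a: int16x4_t, b: int16x4_t) -> (int16x4_t, int16x4_t) {
+    // a = [a0, a1, a2, a3], b = [b0, b1, b2, b3]
+    (vzip1_s16(a, b),    // [a0, b0, a1, b1]
+     vzip2_s16(a, b))    // [a2, b2, a3, b3]
+}
+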
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_shuffle4!(a, b, [2, 6, 3, 7])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_shuffle4!(a, b, [2, 6, 3, 7])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_shuffle4!(a, b, [2, 6, 3, 7])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_shuffle4!(a, b, [2, 6, 3, 7])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ simd_shuffle4!(a, b, [2, 6, 3, 7])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_shuffle4!(a, b, [2, 6, 3, 7])
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vzip2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
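+
+// Illustrative sketch (mirrors the test style below): `vzip2*` is the
+// high-half counterpart of `vzip1*`, interleaving lanes n/2..n of each input:
+//
+//     let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+//     let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+//     let r: i8x8 = transmute(vzip2_s8(transmute(a), transmute(b)));
+//     assert_eq!(r, i8x8::new(8, 9, 10, 11, 12, 13, 14, 15));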
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_shuffle4!(a, b, [0, 2, 4, 6])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_shuffle4!(a, b, [0, 2, 4, 6])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_shuffle4!(a, b, [0, 2, 4, 6])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_shuffle4!(a, b, [0, 2, 4, 6])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ simd_shuffle4!(a, b, [0, 2, 4, 6])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_shuffle4!(a, b, [0, 2, 4, 6])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_shuffle2!(a, b, [0, 2])
+}
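+
+// Illustrative sketch (mirrors the test style below): `vuzp1*` keeps the
+// even-numbered lanes of the concatenation of `a` and `b`, i.e.
+// a[0], a[2], ..., b[0], b[2], ...
+//
+//     let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+//     let b: i8x8 = i8x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+//     let r: i8x8 = transmute(vuzp1_s8(transmute(a), transmute(b)));
+//     assert_eq!(r, i8x8::new(0, 2, 4, 6, 8, 10, 12, 14));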
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_shuffle4!(a, b, [1, 3, 5, 7])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_shuffle4!(a, b, [1, 3, 5, 7])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_shuffle4!(a, b, [1, 3, 5, 7])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_shuffle4!(a, b, [1, 3, 5, 7])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ simd_shuffle4!(a, b, [1, 3, 5, 7])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uzp2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_shuffle4!(a, b, [1, 3, 5, 7])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(zip2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuzp2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_shuffle2!(a, b, [1, 3])
+}
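+
+// Illustrative sketch: `vuzp2*` keeps the odd-numbered lanes of the
+// concatenation of `a` and `b` instead, so with the same inputs as the
+// `vuzp1` sketch above:
+//
+//     let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+//     let b: i8x8 = i8x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+//     let r: i8x8 = transmute(vuzp2_s8(transmute(a), transmute(b)));
+//     assert_eq!(r, i8x8::new(1, 3, 5, 7, 9, 11, 13, 15));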
+
+/// Unsigned Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uabal))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t {
+ let d: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let e: uint8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let f: uint8x8_t = vabd_u8(d, e);
+ simd_add(a, simd_cast(f))
+}
+
+/// Unsigned Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uabal))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
+ let d: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let e: uint16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
+ let f: uint16x4_t = vabd_u16(d, e);
+ simd_add(a, simd_cast(f))
+}
+
+/// Unsigned Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uabal))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
+ let d: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let e: uint32x2_t = simd_shuffle2!(c, c, [2, 3]);
+ let f: uint32x2_t = vabd_u32(d, e);
+ simd_add(a, simd_cast(f))
+}
+
+/// Signed Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sabal))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
+ let d: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let e: int8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let f: int8x8_t = vabd_s8(d, e);
+ let f: uint8x8_t = simd_cast(f);
+ simd_add(a, simd_cast(f))
+}
+
+/// Signed Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sabal))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+ let d: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let e: int16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
+ let f: int16x4_t = vabd_s16(d, e);
+ let f: uint16x4_t = simd_cast(f);
+ simd_add(a, simd_cast(f))
+}
+
+/// Signed Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sabal))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+ let d: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let e: int32x2_t = simd_shuffle2!(c, c, [2, 3]);
+ let f: int32x2_t = vabd_s32(d, e);
+ let f: uint32x2_t = simd_cast(f);
+ simd_add(a, simd_cast(f))
+}
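+
+// Illustrative sketch (values follow the test style below): the
+// `vabal_high_*` intrinsics take the absolute difference of the *high*
+// halves of `b` and `c`, widen each result, and accumulate into `a`:
+//
+//     let a: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+//     let b: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8);
+//     let c: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10);
+//     let r: u16x8 = transmute(vabal_high_u8(transmute(a), transmute(b), transmute(c)));
+//     assert_eq!(r, u16x8::new(10, 9, 8, 7, 6, 5, 4, 3));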
+
+/// Signed saturating absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqabs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqabs_s64(a: int64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v1i64")]
+ fn vqabs_s64_(a: int64x1_t) -> int64x1_t;
+ }
+ vqabs_s64_(a)
+}
+
+/// Signed saturating absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqabs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqabsq_s64(a: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v2i64")]
+ fn vqabsq_s64_(a: int64x2_t) -> int64x2_t;
+ }
+ vqabsq_s64_(a)
+}
+
+/// Signed saturating absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqabs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqabsb_s8(a: i8) -> i8 {
+ simd_extract(vqabs_s8(vdup_n_s8(a)), 0)
+}
+
+/// Signed saturating absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqabs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqabsh_s16(a: i16) -> i16 {
+ simd_extract(vqabs_s16(vdup_n_s16(a)), 0)
+}
+
+/// Signed saturating absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqabs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqabss_s32(a: i32) -> i32 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.i32")]
+ fn vqabss_s32_(a: i32) -> i32;
+ }
+ vqabss_s32_(a)
+}
+
+/// Signed saturating absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqabs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqabsd_s64(a: i64) -> i64 {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.i64")]
+ fn vqabsd_s64_(a: i64) -> i64;
+ }
+ vqabsd_s64_(a)
+}
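+
+// Illustrative sketch: saturation is what separates `vqabs*` from a plain
+// absolute value. `i64::MIN` has no positive counterpart in `i64`, so it
+// saturates to `i64::MAX` rather than wrapping back to itself:
+//
+//     assert_eq!(vqabsd_s64(-7), 7);
+//     assert_eq!(vqabsd_s64(i64::MIN), i64::MAX);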
+
+/// Shift left and insert
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vslid_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
+ static_assert!(N : i32 where N >= 0 && N <= 63);
+ transmute(vsli_n_s64::<N>(transmute(a), transmute(b)))
+}
+
+/// Shift left and insert
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vslid_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
+ static_assert!(N : i32 where N >= 0 && N <= 63);
+ transmute(vsli_n_u64::<N>(transmute(a), transmute(b)))
+}
+
+/// Shift right and insert
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsrid_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ transmute(vsri_n_s64::<N>(transmute(a), transmute(b)))
+}
+
+/// Shift right and insert
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsrid_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ transmute(vsri_n_u64::<N>(transmute(a), transmute(b)))
+}
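+
+// Illustrative sketch: `vslid_n_u64::<N>` shifts `b` left by N and fills the
+// vacated low N bits from `a`; `vsrid_n_u64::<N>` mirrors it, shifting `b`
+// right by N and keeping the top N bits of `a`. For N = 8:
+//
+//     assert_eq!(vslid_n_u64::<8>(0xFF, 0x12), (0x12 << 8) | 0xFF);
+//     assert_eq!(vsrid_n_u64::<8>(0xFF00_0000_0000_0000, 0x1200),
+//                0xFF00_0000_0000_0012);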
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use crate::core_arch::simd::*;
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_veor3q_s8() {
+ let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let c: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let r: i8x16 = transmute(veor3q_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_veor3q_s16() {
+ let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let c: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: i16x8 = transmute(veor3q_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_veor3q_s32() {
+ let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+ let c: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: i32x4 = transmute(veor3q_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_veor3q_s64() {
+ let a: i64x2 = i64x2::new(0x00, 0x01);
+ let b: i64x2 = i64x2::new(0x00, 0x00);
+ let c: i64x2 = i64x2::new(0x00, 0x00);
+ let e: i64x2 = i64x2::new(0x00, 0x01);
+ let r: i64x2 = transmute(veor3q_s64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_veor3q_u8() {
+ let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let c: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let r: u8x16 = transmute(veor3q_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_veor3q_u16() {
+ let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let c: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: u16x8 = transmute(veor3q_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_veor3q_u32() {
+ let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
+ let c: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: u32x4 = transmute(veor3q_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_veor3q_u64() {
+ let a: u64x2 = u64x2::new(0x00, 0x01);
+ let b: u64x2 = u64x2::new(0x00, 0x00);
+ let c: u64x2 = u64x2::new(0x00, 0x00);
+ let e: u64x2 = u64x2::new(0x00, 0x01);
+ let r: u64x2 = transmute(veor3q_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabd_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 9.0;
+ let e: f64 = 8.0;
+ let r: f64 = transmute(vabd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 2.0);
+ let b: f64x2 = f64x2::new(9.0, 3.0);
+ let e: f64x2 = f64x2::new(8.0, 1.0);
+ let r: f64x2 = transmute(vabdq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabds_f32() {
+ let a: f32 = 1.0;
+ let b: f32 = 9.0;
+ let e: f32 = 8.0;
+ let r: f32 = transmute(vabds_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdd_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 9.0;
+ let e: f64 = 8.0;
+ let r: f64 = transmute(vabdd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_high_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10);
+ let e: u16x8 = u16x8::new(1, 0, 1, 2, 3, 4, 5, 6);
+ let r: u16x8 = transmute(vabdl_high_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_high_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 8, 9, 11, 12);
+ let b: u16x8 = u16x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ let e: u32x4 = u32x4::new(2, 1, 1, 2);
+ let r: u32x4 = transmute(vabdl_high_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_high_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(10, 10, 10, 10);
+ let e: u64x2 = u64x2::new(7, 6);
+ let r: u64x2 = transmute(vabdl_high_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_high_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10);
+ let e: i16x8 = i16x8::new(1, 0, 1, 2, 3, 4, 5, 6);
+ let r: i16x8 = transmute(vabdl_high_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_high_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 9, 10, 11, 12);
+ let b: i16x8 = i16x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ let e: i32x4 = i32x4::new(1, 0, 1, 2);
+ let r: i32x4 = transmute(vabdl_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_high_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(10, 10, 10, 10);
+ let e: i64x2 = i64x2::new(7, 6);
+ let r: i64x2 = transmute(vabdl_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let b: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u64x1 = u64x1::new(0);
+ let b: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_u64() {
+ let a: u64x2 = u64x2::new(0, 0x01);
+ let b: u64x2 = u64x2::new(0, 0x01);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vceqq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u64x2 = u64x2::new(0, 0);
+ let b: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let r: u64x2 = transmute(vceqq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let b: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let b: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, 0x01);
+ let b: i64x2 = i64x2::new(-9223372036854775808, 0x01);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vceqq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i64x2 = i64x2::new(-9223372036854775808, -9223372036854775808);
+ let b: i64x2 = i64x2::new(-9223372036854775808, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let r: u64x2 = transmute(vceqq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_p64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let b: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceq_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let b: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceq_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_p64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, 0x01);
+ let b: i64x2 = i64x2::new(-9223372036854775808, 0x01);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vceqq_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i64x2 = i64x2::new(-9223372036854775808, -9223372036854775808);
+ let b: i64x2 = i64x2::new(-9223372036854775808, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let r: u64x2 = transmute(vceqq_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_f64() {
+ let a: f64 = 1.2;
+ let b: f64 = 1.2;
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_f64() {
+ let a: f64x2 = f64x2::new(1.2, 3.4);
+ let b: f64x2 = f64x2::new(1.2, 3.4);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vceqq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqd_s64() {
+ let a: i64 = 1;
+ let b: i64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vceqd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqd_u64() {
+ let a: u64 = 1;
+ let b: u64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vceqd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqs_f32() {
+ let a: f32 = 1.;
+ let b: f32 = 2.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vceqs_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqd_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 2.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vceqd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_s8() {
+ let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u8x8 = u8x8::new(0, 0xFF, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vceqz_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_s8() {
+ let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+ let e: u8x16 = u8x16::new(0, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vceqzq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_s16() {
+ let a: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
+ let e: u16x4 = u16x4::new(0, 0xFF_FF, 0, 0);
+ let r: u16x4 = transmute(vceqz_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_s16() {
+ let a: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u16x8 = u16x8::new(0, 0xFF_FF, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vceqzq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, 0x00);
+ let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vceqz_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02);
+ let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0);
+ let r: u32x4 = transmute(vceqzq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vceqz_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, 0x00);
+ let e: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vceqzq_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_p8() {
+ let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u8x8 = u8x8::new(0, 0xFF, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vceqz_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_p8() {
+ let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+ let e: u8x16 = u8x16::new(0, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vceqzq_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_p64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vceqz_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_p64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, 0x00);
+ let e: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vceqzq_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_u8() {
+ let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vceqz_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_u8() {
+ let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vceqzq_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_u16() {
+ let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0, 0);
+ let r: u16x4 = transmute(vceqz_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_u16() {
+ let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vceqzq_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_u32() {
+ let a: u32x2 = u32x2::new(0, 0x00);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vceqz_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_u32() {
+ let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0, 0);
+ let r: u32x4 = transmute(vceqzq_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceqz_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_u64() {
+ let a: u64x2 = u64x2::new(0, 0x00);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vceqzq_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_f32() {
+ let a: f32x2 = f32x2::new(0.0, 1.2);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let r: u32x2 = transmute(vceqz_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_f32() {
+ let a: f32x4 = f32x4::new(0.0, 1.2, 3.4, 5.6);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0, 0);
+ let r: u32x4 = transmute(vceqzq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqz_f64() {
+ let a: f64 = 0.0;
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vceqz_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzq_f64() {
+ let a: f64x2 = f64x2::new(0.0, 1.2);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let r: u64x2 = transmute(vceqzq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzd_s64() {
+ let a: i64 = 1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vceqzd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzd_u64() {
+ let a: u64 = 1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vceqzd_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzs_f32() {
+ let a: f32 = 1.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vceqzs_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqzd_f64() {
+ let a: f64 = 1.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vceqzd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let b: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vtst_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, 0x00);
+ let b: i64x2 = i64x2::new(-9223372036854775808, 0x00);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let r: u64x2 = transmute(vtstq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_p64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let b: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vtst_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_p64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, 0x00);
+ let b: i64x2 = i64x2::new(-9223372036854775808, 0x00);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let r: u64x2 = transmute(vtstq_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let b: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vtst_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_u64() {
+ let a: u64x2 = u64x2::new(0, 0x00);
+ let b: u64x2 = u64x2::new(0, 0x00);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vtstq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstd_s64() {
+ let a: i64 = 0;
+ let b: i64 = 0;
+ let e: u64 = 0;
+ let r: u64 = transmute(vtstd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstd_u64() {
+ let a: u64 = 0;
+ let b: u64 = 0;
+ let e: u64 = 0;
+ let r: u64 = transmute(vtstd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqadds_s32() {
+ let a: i32 = 1;
+ let b: u32 = 1;
+ let e: i32 = 2;
+ let r: i32 = transmute(vuqadds_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqaddd_s64() {
+ let a: i64 = 1;
+ let b: u64 = 1;
+ let e: i64 = 2;
+ let r: i64 = transmute(vuqaddd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqaddb_s8() {
+ let a: i8 = 1;
+ let b: u8 = 2;
+ let e: i8 = 3;
+ let r: i8 = transmute(vuqaddb_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqaddh_s16() {
+ let a: i16 = 1;
+ let b: u16 = 2;
+ let e: i16 = 3;
+ let r: i16 = transmute(vuqaddh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabs_f64() {
+ let a: f64 = -0.1;
+ let e: f64 = 0.1;
+ let r: f64 = transmute(vabs_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabsq_f64() {
+ let a: f64x2 = f64x2::new(-0.1, -2.2);
+ let e: f64x2 = f64x2::new(0.1, 2.2);
+ let r: f64x2 = transmute(vabsq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(0);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcgt_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcgtq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcgt_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcgtq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_f64() {
+ let a: f64 = 1.2;
+ let b: f64 = 0.1;
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcgt_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_f64() {
+ let a: f64x2 = f64x2::new(1.2, 2.3);
+ let b: f64x2 = f64x2::new(0.1, 1.2);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcgtq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtd_s64() {
+ let a: i64 = 1;
+ let b: i64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcgtd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtd_u64() {
+ let a: u64 = 1;
+ let b: u64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcgtd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgts_f32() {
+ let a: f32 = 1.;
+ let b: f32 = 2.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vcgts_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtd_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 2.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcgtd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let b: i64x1 = i64x1::new(1);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vclt_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let b: i64x2 = i64x2::new(1, 2);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcltq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let b: u64x1 = u64x1::new(1);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vclt_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let b: u64x2 = u64x2::new(1, 2);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcltq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_f64() {
+ let a: f64 = 0.1;
+ let b: f64 = 1.2;
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vclt_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_f64() {
+ let a: f64x2 = f64x2::new(0.1, 1.2);
+ let b: f64x2 = f64x2::new(1.2, 2.3);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcltq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltd_s64() {
+ let a: i64 = 2;
+ let b: i64 = 1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcltd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltd_u64() {
+ let a: u64 = 2;
+ let b: u64 = 1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcltd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclts_f32() {
+ let a: f32 = 2.;
+ let b: f32 = 1.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vclts_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltd_f64() {
+ let a: f64 = 2.;
+ let b: f64 = 1.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcltd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let b: i64x1 = i64x1::new(1);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcle_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let b: i64x2 = i64x2::new(1, 2);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcleq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcged_s64() {
+ let a: i64 = 1;
+ let b: i64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcged_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcged_u64() {
+ let a: u64 = 1;
+ let b: u64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcged_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcges_f32() {
+ let a: f32 = 1.;
+ let b: f32 = 2.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vcges_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcged_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 2.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcged_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let b: u64x1 = u64x1::new(1);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcle_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let b: u64x2 = u64x2::new(1, 2);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcleq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_f64() {
+ let a: f64 = 0.1;
+ let b: f64 = 1.2;
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcle_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_f64() {
+ let a: f64x2 = f64x2::new(0.1, 1.2);
+ let b: f64x2 = f64x2::new(1.2, 2.3);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcleq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcled_s64() {
+ let a: i64 = 2;
+ let b: i64 = 1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcled_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcled_u64() {
+ let a: u64 = 2;
+ let b: u64 = 1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcled_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcles_f32() {
+ let a: f32 = 2.;
+ let b: f32 = 1.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vcles_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcled_f64() {
+ let a: f64 = 2.;
+ let b: f64 = 1.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcled_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(0);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcge_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcgeq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcge_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcgeq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_f64() {
+ let a: f64 = 1.2;
+ let b: f64 = 0.1;
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcge_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_f64() {
+ let a: f64x2 = f64x2::new(1.2, 2.3);
+ let b: f64x2 = f64x2::new(0.1, 1.2);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcgeq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgez_s8() {
+ let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05);
+ let e: u8x8 = u8x8::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vcgez_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezq_s8() {
+ let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x7F);
+ let e: u8x16 = u8x16::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcgezq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgez_s16() {
+ let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01);
+ let e: u16x4 = u16x4::new(0, 0, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vcgez_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezq_s16() {
+ let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05);
+ let e: u16x8 = u16x8::new(0, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcgezq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgez_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, -1);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vcgez_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01);
+ let e: u32x4 = u32x4::new(0, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgezq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgez_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vcgez_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, -1);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vcgezq_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgez_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 0.0);
+ let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcgez_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezq_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+ let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgezq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgez_f64() {
+ let a: f64 = -1.2;
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vcgez_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezq_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 0.0);
+ let e: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcgezq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezd_s64() {
+ let a: i64 = -1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcgezd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezs_f32() {
+ let a: f32 = -1.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vcgezs_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgezd_f64() {
+ let a: f64 = -1.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcgezd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
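+ // vcgtz*: lane-wise "greater than zero" compare; the compare is strict, so a zero lane yields all zeros.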
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtz_s8() {
+ let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vcgtz_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzq_s8() {
+ let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x7F);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcgtzq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtz_s16() {
+ let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01);
+ let e: u16x4 = u16x4::new(0, 0, 0, 0xFF_FF);
+ let r: u16x4 = transmute(vcgtz_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzq_s16() {
+ let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcgtzq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtz_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, -1);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vcgtz_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01);
+ let e: u32x4 = u32x4::new(0, 0, 0, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgtzq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtz_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vcgtz_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, -1);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vcgtzq_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtz_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 0.0);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vcgtz_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzq_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+ let e: u32x4 = u32x4::new(0, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgtzq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtz_f64() {
+ let a: f64 = -1.2;
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vcgtz_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzq_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 0.0);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vcgtzq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzd_s64() {
+ let a: i64 = -1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcgtzd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzs_f32() {
+ let a: f32 = -1.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vcgtzs_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtzd_f64() {
+ let a: f64 = -1.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcgtzd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
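+ // vclez*: lane-wise "less than or equal to zero" compare; zero lanes yield all ones.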
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclez_s8() {
+ let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vclez_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezq_s8() {
+ let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x7F);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vclezq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclez_s16() {
+ let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0);
+ let r: u16x4 = transmute(vclez_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezq_s16() {
+ let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vclezq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclez_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, -1);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vclez_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0);
+ let r: u32x4 = transmute(vclezq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclez_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vclez_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, -1);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vclezq_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclez_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 0.0);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vclez_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezq_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0, 0);
+ let r: u32x4 = transmute(vclezq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclez_f64() {
+ let a: f64 = -1.2;
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vclez_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezq_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 0.0);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vclezq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezd_s64() {
+ let a: i64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vclezd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezs_f32() {
+ let a: f32 = 2.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vclezs_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclezd_f64() {
+ let a: f64 = 2.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vclezd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
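+ // vcltz*: lane-wise "less than zero" compare; the compare is strict, so zero lanes yield all zeros.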
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltz_s8() {
+ let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vcltz_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzq_s8() {
+ let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x7F);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vcltzq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltz_s16() {
+ let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0, 0);
+ let r: u16x4 = transmute(vcltz_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzq_s16() {
+ let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vcltzq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltz_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, -1);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcltz_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0, 0);
+ let r: u32x4 = transmute(vcltzq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltz_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcltz_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, -1);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcltzq_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltz_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 0.0);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let r: u32x2 = transmute(vcltz_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzq_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0, 0);
+ let r: u32x4 = transmute(vcltzq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltz_f64() {
+ let a: f64 = -1.2;
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcltz_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzq_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 0.0);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let r: u64x2 = transmute(vcltzq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzd_s64() {
+ let a: i64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcltzd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzs_f32() {
+ let a: f32 = 2.;
+ let e: u32 = 0;
+ let r: u32 = transmute(vcltzs_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltzd_f64() {
+ let a: f64 = 2.;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcltzd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
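+ // vcagt/vcage/vcalt/vcale: absolute-value compares, e.g. vcagt sets a lane to all ones when |a| > |b|.
+ // With a = -1.2 and b = -1.1, |a| > |b| holds, so the vcagt/vcage tests expect true and vcalt/vcale expect false.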
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcagt_f64() {
+ let a: f64 = -1.2;
+ let b: f64 = -1.1;
+ let e: u64x1 = u64x1::new(!0);
+ let r: u64x1 = transmute(vcagt_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcagtq_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 0.0);
+ let b: f64x2 = f64x2::new(-1.1, 0.0);
+ let e: u64x2 = u64x2::new(!0, 0);
+ let r: u64x2 = transmute(vcagtq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcagts_f32() {
+ let a: f32 = -1.2;
+ let b: f32 = -1.1;
+ let e: u32 = !0;
+ let r: u32 = transmute(vcagts_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcagtd_f64() {
+ let a: f64 = -1.2;
+ let b: f64 = -1.1;
+ let e: u64 = !0;
+ let r: u64 = transmute(vcagtd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcage_f64() {
+ let a: f64 = -1.2;
+ let b: f64 = -1.1;
+ let e: u64x1 = u64x1::new(!0);
+ let r: u64x1 = transmute(vcage_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcageq_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 0.0);
+ let b: f64x2 = f64x2::new(-1.1, 0.0);
+ let e: u64x2 = u64x2::new(!0, !0);
+ let r: u64x2 = transmute(vcageq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcages_f32() {
+ let a: f32 = -1.2;
+ let b: f32 = -1.1;
+ let e: u32 = !0;
+ let r: u32 = transmute(vcages_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaged_f64() {
+ let a: f64 = -1.2;
+ let b: f64 = -1.1;
+ let e: u64 = !0;
+ let r: u64 = transmute(vcaged_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcalt_f64() {
+ let a: f64 = -1.2;
+ let b: f64 = -1.1;
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vcalt_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaltq_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 0.0);
+ let b: f64x2 = f64x2::new(-1.1, 0.0);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vcaltq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcalts_f32() {
+ let a: f32 = -1.2;
+ let b: f32 = -1.1;
+ let e: u32 = 0;
+ let r: u32 = transmute(vcalts_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaltd_f64() {
+ let a: f64 = -1.2;
+ let b: f64 = -1.1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcaltd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcale_f64() {
+ let a: f64 = -1.2;
+ let b: f64 = -1.1;
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vcale_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaleq_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 0.0);
+ let b: f64x2 = f64x2::new(-1.1, 0.0);
+ let e: u64x2 = u64x2::new(0, !0);
+ let r: u64x2 = transmute(vcaleq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcales_f32() {
+ let a: f32 = -1.2;
+ let b: f32 = -1.1;
+ let e: u32 = 0;
+ let r: u32 = transmute(vcales_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaled_f64() {
+ let a: f64 = -1.2;
+ let b: f64 = -1.1;
+ let e: u64 = 0;
+ let r: u64 = transmute(vcaled_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
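+ // vcopy{q}_lane{q}: the const generics <LANE1, LANE2> select the destination lane in a and the source lane
+ // in b; lane LANE2 of b replaces lane LANE1 of a while every other lane of a is kept unchanged.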
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(0, 0x7F, 0, 0, 0, 0, 0, 0);
+ let e: i8x8 = i8x8::new(0x7F, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vcopy_lane_s8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(0, 0x7F, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i8x16 = i8x16::new(0x7F, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vcopyq_laneq_s8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(0, 0x7F_FF, 0, 0);
+ let e: i16x4 = i16x4::new(0x7F_FF, 2, 3, 4);
+ let r: i16x4 = transmute(vcopy_lane_s16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(0, 0x7F_FF, 0, 0, 0, 0, 0, 0);
+ let e: i16x8 = i16x8::new(0x7F_FF, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vcopyq_laneq_s16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(0, 0x7F_FF_FF_FF);
+ let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, 2);
+ let r: i32x2 = transmute(vcopy_lane_s32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(0, 0x7F_FF_FF_FF, 0, 0);
+ let e: i32x4 = i32x4::new(0x7F_FF_FF_FF, 2, 3, 4);
+ let r: i32x4 = transmute(vcopyq_laneq_s32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(0, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 2);
+ let r: i64x2 = transmute(vcopyq_laneq_s64::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(0, 0xFF, 0, 0, 0, 0, 0, 0);
+ let e: u8x8 = u8x8::new(0xFF, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vcopy_lane_u8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(0, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: u8x16 = u8x16::new(0xFF, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vcopyq_laneq_u8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(0, 0xFF_FF, 0, 0);
+ let e: u16x4 = u16x4::new(0xFF_FF, 2, 3, 4);
+ let r: u16x4 = transmute(vcopy_lane_u16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(0, 0xFF_FF, 0, 0, 0, 0, 0, 0);
+ let e: u16x8 = u16x8::new(0xFF_FF, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vcopyq_laneq_u16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 2);
+ let r: u32x2 = transmute(vcopy_lane_u32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 2, 3, 4);
+ let r: u32x4 = transmute(vcopyq_laneq_u32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 2);
+ let r: u64x2 = transmute(vcopyq_laneq_u64::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_p8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(0, 0x7F, 0, 0, 0, 0, 0, 0);
+ let e: i8x8 = i8x8::new(0x7F, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vcopy_lane_p8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_p8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(0, 0x7F, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i8x16 = i8x16::new(0x7F, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vcopyq_laneq_p8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_p16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(0, 0x7F_FF, 0, 0);
+ let e: i16x4 = i16x4::new(0x7F_FF, 2, 3, 4);
+ let r: i16x4 = transmute(vcopy_lane_p16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_p16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(0, 0x7F_FF, 0, 0, 0, 0, 0, 0);
+ let e: i16x8 = i16x8::new(0x7F_FF, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vcopyq_laneq_p16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_p64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(0, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 2);
+ let r: i64x2 = transmute(vcopyq_laneq_p64::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x2 = f32x2::new(0., 0.5);
+ let e: f32x2 = f32x2::new(0.5, 2.);
+ let r: f32x2 = transmute(vcopy_lane_f32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x4 = f32x4::new(0., 0.5, 0., 0.);
+ let e: f32x4 = f32x4::new(0.5, 2., 3., 4.);
+ let r: f32x4 = transmute(vcopyq_laneq_f32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_laneq_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64x2 = f64x2::new(0., 0.5);
+ let e: f64x2 = f64x2::new(0.5, 2.);
+ let r: f64x2 = transmute(vcopyq_laneq_f64::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x16 = i8x16::new(0, 0x7F, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i8x8 = i8x8::new(0x7F, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vcopy_laneq_s8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 0x7F_FF, 0, 0, 0, 0, 0, 0);
+ let e: i16x4 = i16x4::new(0x7F_FF, 2, 3, 4);
+ let r: i16x4 = transmute(vcopy_laneq_s16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x4 = i32x4::new(0, 0x7F_FF_FF_FF, 0, 0);
+ let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, 2);
+ let r: i32x2 = transmute(vcopy_laneq_s32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x16 = u8x16::new(0, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: u8x8 = u8x8::new(0xFF, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vcopy_laneq_u8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x8 = u16x8::new(0, 0xFF_FF, 0, 0, 0, 0, 0, 0);
+ let e: u16x4 = u16x4::new(0xFF_FF, 2, 3, 4);
+ let r: u16x4 = transmute(vcopy_laneq_u16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 2);
+ let r: u32x2 = transmute(vcopy_laneq_u32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_p8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x16 = i8x16::new(0, 0x7F, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i8x8 = i8x8::new(0x7F, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vcopy_laneq_p8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_p16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 0x7F_FF, 0, 0, 0, 0, 0, 0);
+ let e: i16x4 = i16x4::new(0x7F_FF, 2, 3, 4);
+ let r: i16x4 = transmute(vcopy_laneq_p16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x4 = f32x4::new(0., 0.5, 0., 0.);
+ let e: f32x2 = f32x2::new(0.5, 2.);
+ let r: f32x2 = transmute(vcopy_laneq_f32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x8 = i8x8::new(0, 0x7F, 0, 0, 0, 0, 0, 0);
+ let e: i8x16 = i8x16::new(0x7F, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vcopyq_lane_s8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x4 = i16x4::new(0, 0x7F_FF, 0, 0);
+ let e: i16x8 = i16x8::new(0x7F_FF, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vcopyq_lane_s16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x2 = i32x2::new(0, 0x7F_FF_FF_FF);
+ let e: i32x4 = i32x4::new(0x7F_FF_FF_FF, 2, 3, 4);
+ let r: i32x4 = transmute(vcopyq_lane_s32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x8 = u8x8::new(0, 0xFF, 0, 0, 0, 0, 0, 0);
+ let e: u8x16 = u8x16::new(0xFF, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vcopyq_lane_u8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x4 = u16x4::new(0, 0xFF_FF, 0, 0);
+ let e: u16x8 = u16x8::new(0xFF_FF, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vcopyq_lane_u16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 2, 3, 4);
+ let r: u32x4 = transmute(vcopyq_lane_u32::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_p8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x8 = i8x8::new(0, 0x7F, 0, 0, 0, 0, 0, 0);
+ let e: i8x16 = i8x16::new(0x7F, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vcopyq_lane_p8::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_p16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x4 = i16x4::new(0, 0x7F_FF, 0, 0);
+ let e: i16x8 = i16x8::new(0x7F_FF, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vcopyq_lane_p16::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i64x2 = i64x2::new(1, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let r: i64x2 = transmute(vcopyq_lane_s64::<1, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let e: u64x2 = u64x2::new(1, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x2 = transmute(vcopyq_lane_u64::<1, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_p64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i64x2 = i64x2::new(1, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let r: i64x2 = transmute(vcopyq_lane_p64::<1, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x2 = f32x2::new(0.5, 0.);
+ let e: f32x4 = f32x4::new(1., 0.5, 3., 4.);
+ let r: f32x4 = transmute(vcopyq_lane_f32::<1, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopyq_lane_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64 = 0.5;
+ let e: f64x2 = f64x2::new(1., 0.5);
+ let r: f64x2 = transmute(vcopyq_lane_f64::<1, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
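+ // vcreate_f64 reinterprets a u64 bit pattern as a float64x1_t. The vcvt* tests below cover integer<->float
+ // and f32<->f64 conversions; the *_high variants read from (or write to) the upper half of a 128-bit vector,
+ // and vcvtx* converts f64 to f32 with round-to-odd.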
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_f64() {
+ let a: u64 = 0;
+ let e: f64 = 0.;
+ let r: f64 = transmute(vcreate_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_f64_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let e: f64 = 1.;
+ let r: f64 = transmute(vcvt_f64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_f64_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let e: f64x2 = f64x2::new(1., 2.);
+ let r: f64x2 = transmute(vcvtq_f64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_f64_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let e: f64 = 1.;
+ let r: f64 = transmute(vcvt_f64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_f64_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let e: f64x2 = f64x2::new(1., 2.);
+ let r: f64x2 = transmute(vcvtq_f64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_f64_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 1.2);
+ let e: f64x2 = f64x2::new(-1.2f32 as f64, 1.2f32 as f64);
+ let r: f64x2 = transmute(vcvt_f64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_high_f64_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 1.2, 2.3, 3.4);
+ let e: f64x2 = f64x2::new(2.3f32 as f64, 3.4f32 as f64);
+ let r: f64x2 = transmute(vcvt_high_f64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_f32_f64() {
+ let a: f64x2 = f64x2::new(-1.2, 1.2);
+ let e: f32x2 = f32x2::new(-1.2f64 as f32, 1.2f64 as f32);
+ let r: f32x2 = transmute(vcvt_f32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_high_f32_f64() {
+ let a: f32x2 = f32x2::new(-1.2, 1.2);
+ let b: f64x2 = f64x2::new(-2.3, 3.4);
+ let e: f32x4 = f32x4::new(-1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32);
+ let r: f32x4 = transmute(vcvt_high_f32_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtx_f32_f64() {
+ let a: f64x2 = f64x2::new(-1.0, 2.0);
+ let e: f32x2 = f32x2::new(-1.0, 2.0);
+ let r: f32x2 = transmute(vcvtx_f32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtxd_f32_f64() {
+ let a: f64 = -1.0;
+ let e: f32 = -1.0;
+ let r: f32 = transmute(vcvtxd_f32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtx_high_f32_f64() {
+ let a: f32x2 = f32x2::new(-1.0, 2.0);
+ let b: f64x2 = f64x2::new(-3.0, 4.0);
+ let e: f32x4 = f32x4::new(-1.0, 2.0, -3.0, 4.0);
+ let r: f32x4 = transmute(vcvtx_high_f32_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
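+ // vcvt*_n_*: fixed-point conversions; the const generic N is the number of fraction bits, so converting
+ // the integer 1 with N = 2 yields 1 / 2^2 = 0.25, and converting 0.25 back with N = 2 yields 1.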
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_n_f64_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let e: f64 = 0.25;
+ let r: f64 = transmute(vcvt_n_f64_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_n_f64_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let e: f64x2 = f64x2::new(0.25, 0.5);
+ let r: f64x2 = transmute(vcvtq_n_f64_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvts_n_f32_s32() {
+ let a: i32 = 1;
+ let e: f32 = 0.25;
+ let r: f32 = transmute(vcvts_n_f32_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtd_n_f64_s64() {
+ let a: i64 = 1;
+ let e: f64 = 0.25;
+ let r: f64 = transmute(vcvtd_n_f64_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_n_f64_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let e: f64 = 0.25;
+ let r: f64 = transmute(vcvt_n_f64_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_n_f64_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let e: f64x2 = f64x2::new(0.25, 0.5);
+ let r: f64x2 = transmute(vcvtq_n_f64_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvts_n_f32_u32() {
+ let a: u32 = 1;
+ let e: f32 = 0.25;
+ let r: f32 = transmute(vcvts_n_f32_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtd_n_f64_u64() {
+ let a: u64 = 1;
+ let e: f64 = 0.25;
+ let r: f64 = transmute(vcvtd_n_f64_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_n_s64_f64() {
+ let a: f64 = 0.25;
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vcvt_n_s64_f64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_n_s64_f64() {
+ let a: f64x2 = f64x2::new(0.25, 0.5);
+ let e: i64x2 = i64x2::new(1, 2);
+ let r: i64x2 = transmute(vcvtq_n_s64_f64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvts_n_s32_f32() {
+ let a: f32 = 0.25;
+ let e: i32 = 1;
+ let r: i32 = transmute(vcvts_n_s32_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtd_n_s64_f64() {
+ let a: f64 = 0.25;
+ let e: i64 = 1;
+ let r: i64 = transmute(vcvtd_n_s64_f64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_n_u64_f64() {
+ let a: f64 = 0.25;
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vcvt_n_u64_f64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_n_u64_f64() {
+ let a: f64x2 = f64x2::new(0.25, 0.5);
+ let e: u64x2 = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vcvtq_n_u64_f64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvts_n_u32_f32() {
+ let a: f32 = 0.25;
+ let e: u32 = 1;
+ let r: u32 = transmute(vcvts_n_u32_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtd_n_u64_f64() {
+ let a: f64 = 0.25;
+ let e: u64 = 1;
+ let r: u64 = transmute(vcvtd_n_u64_f64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvts_f32_s32() {
+ let a: i32 = 1;
+ let e: f32 = 1.;
+ let r: f32 = transmute(vcvts_f32_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtd_f64_s64() {
+ let a: i64 = 1;
+ let e: f64 = 1.;
+ let r: f64 = transmute(vcvtd_f64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvts_f32_u32() {
+ let a: u32 = 1;
+ let e: f32 = 1.;
+ let r: f32 = transmute(vcvts_f32_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtd_f64_u64() {
+ let a: u64 = 1;
+ let e: f64 = 1.;
+ let r: f64 = transmute(vcvtd_f64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvts_s32_f32() {
+ let a: f32 = 1.;
+ let e: i32 = 1;
+ let r: i32 = transmute(vcvts_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtd_s64_f64() {
+ let a: f64 = 1.;
+ let e: i64 = 1;
+ let r: i64 = transmute(vcvtd_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvts_u32_f32() {
+ let a: f32 = 1.;
+ let e: u32 = 1;
+ let r: u32 = transmute(vcvts_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtd_u64_f64() {
+ let a: f64 = 1.;
+ let e: u64 = 1;
+ let r: u64 = transmute(vcvtd_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_s64_f64() {
+ let a: f64 = -1.1;
+ let e: i64x1 = i64x1::new(-1);
+ let r: i64x1 = transmute(vcvt_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_s64_f64() {
+ let a: f64x2 = f64x2::new(-1.1, 2.1);
+ let e: i64x2 = i64x2::new(-1, 2);
+ let r: i64x2 = transmute(vcvtq_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_u64_f64() {
+ let a: f64 = 1.1;
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vcvt_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_u64_f64() {
+ let a: f64x2 = f64x2::new(1.1, 2.1);
+ let e: u64x2 = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vcvtq_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
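+ // Rounding conversions: vcvta* rounds to nearest with ties away from zero, vcvtn* to nearest with ties to
+ // even, vcvtm* toward minus infinity (floor), and vcvtp* toward plus infinity (ceiling); plain vcvt* above
+ // truncates toward zero.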
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvta_s32_f32() {
+ let a: f32x2 = f32x2::new(-1.1, 2.1);
+ let e: i32x2 = i32x2::new(-1, 2);
+ let r: i32x2 = transmute(vcvta_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtaq_s32_f32() {
+ let a: f32x4 = f32x4::new(-1.1, 2.1, -2.9, 3.9);
+ let e: i32x4 = i32x4::new(-1, 2, -3, 4);
+ let r: i32x4 = transmute(vcvtaq_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvta_s64_f64() {
+ let a: f64 = -1.1;
+ let e: i64x1 = i64x1::new(-1);
+ let r: i64x1 = transmute(vcvta_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtaq_s64_f64() {
+ let a: f64x2 = f64x2::new(-1.1, 2.1);
+ let e: i64x2 = i64x2::new(-1, 2);
+ let r: i64x2 = transmute(vcvtaq_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtas_s32_f32() {
+ let a: f32 = 2.9;
+ let e: i32 = 3;
+ let r: i32 = transmute(vcvtas_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtad_s64_f64() {
+ let a: f64 = 2.9;
+ let e: i64 = 3;
+ let r: i64 = transmute(vcvtad_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtas_u32_f32() {
+ let a: f32 = 2.9;
+ let e: u32 = 3;
+ let r: u32 = transmute(vcvtas_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtad_u64_f64() {
+ let a: f64 = 2.9;
+ let e: u64 = 3;
+ let r: u64 = transmute(vcvtad_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtn_s32_f32() {
+ let a: f32x2 = f32x2::new(-1.5, 2.1);
+ let e: i32x2 = i32x2::new(-2, 2);
+ let r: i32x2 = transmute(vcvtn_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtnq_s32_f32() {
+ let a: f32x4 = f32x4::new(-1.5, 2.1, -2.9, 3.9);
+ let e: i32x4 = i32x4::new(-2, 2, -3, 4);
+ let r: i32x4 = transmute(vcvtnq_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtn_s64_f64() {
+ let a: f64 = -1.5;
+ let e: i64x1 = i64x1::new(-2);
+ let r: i64x1 = transmute(vcvtn_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtnq_s64_f64() {
+ let a: f64x2 = f64x2::new(-1.5, 2.1);
+ let e: i64x2 = i64x2::new(-2, 2);
+ let r: i64x2 = transmute(vcvtnq_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtns_s32_f32() {
+ let a: f32 = -1.5;
+ let e: i32 = -2;
+ let r: i32 = transmute(vcvtns_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtnd_s64_f64() {
+ let a: f64 = -1.5;
+ let e: i64 = -2;
+ let r: i64 = transmute(vcvtnd_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtm_s32_f32() {
+ let a: f32x2 = f32x2::new(-1.1, 2.1);
+ let e: i32x2 = i32x2::new(-2, 2);
+ let r: i32x2 = transmute(vcvtm_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtmq_s32_f32() {
+ let a: f32x4 = f32x4::new(-1.1, 2.1, -2.9, 3.9);
+ let e: i32x4 = i32x4::new(-2, 2, -3, 3);
+ let r: i32x4 = transmute(vcvtmq_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtm_s64_f64() {
+ let a: f64 = -1.1;
+ let e: i64x1 = i64x1::new(-2);
+ let r: i64x1 = transmute(vcvtm_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtmq_s64_f64() {
+ let a: f64x2 = f64x2::new(-1.1, 2.1);
+ let e: i64x2 = i64x2::new(-2, 2);
+ let r: i64x2 = transmute(vcvtmq_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtms_s32_f32() {
+ let a: f32 = -1.1;
+ let e: i32 = -2;
+ let r: i32 = transmute(vcvtms_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtmd_s64_f64() {
+ let a: f64 = -1.1;
+ let e: i64 = -2;
+ let r: i64 = transmute(vcvtmd_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtp_s32_f32() {
+ let a: f32x2 = f32x2::new(-1.1, 2.1);
+ let e: i32x2 = i32x2::new(-1, 3);
+ let r: i32x2 = transmute(vcvtp_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtpq_s32_f32() {
+ let a: f32x4 = f32x4::new(-1.1, 2.1, -2.9, 3.9);
+ let e: i32x4 = i32x4::new(-1, 3, -2, 4);
+ let r: i32x4 = transmute(vcvtpq_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtp_s64_f64() {
+ let a: f64 = -1.1;
+ let e: i64x1 = i64x1::new(-1);
+ let r: i64x1 = transmute(vcvtp_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtpq_s64_f64() {
+ let a: f64x2 = f64x2::new(-1.1, 2.1);
+ let e: i64x2 = i64x2::new(-1, 3);
+ let r: i64x2 = transmute(vcvtpq_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtps_s32_f32() {
+ let a: f32 = -1.1;
+ let e: i32 = -1;
+ let r: i32 = transmute(vcvtps_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtpd_s64_f64() {
+ let a: f64 = -1.1;
+ let e: i64 = -1;
+ let r: i64 = transmute(vcvtpd_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvta_u32_f32() {
+ let a: f32x2 = f32x2::new(1.1, 2.1);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vcvta_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtaq_u32_f32() {
+ let a: f32x4 = f32x4::new(1.1, 2.1, 2.9, 3.9);
+ let e: u32x4 = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vcvtaq_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvta_u64_f64() {
+ let a: f64 = 1.1;
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vcvta_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtaq_u64_f64() {
+ let a: f64x2 = f64x2::new(1.1, 2.1);
+ let e: u64x2 = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vcvtaq_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtn_u32_f32() {
+ let a: f32x2 = f32x2::new(1.5, 2.1);
+ let e: u32x2 = u32x2::new(2, 2);
+ let r: u32x2 = transmute(vcvtn_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtnq_u32_f32() {
+ let a: f32x4 = f32x4::new(1.5, 2.1, 2.9, 3.9);
+ let e: u32x4 = u32x4::new(2, 2, 3, 4);
+ let r: u32x4 = transmute(vcvtnq_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtn_u64_f64() {
+ let a: f64 = 1.5;
+ let e: u64x1 = u64x1::new(2);
+ let r: u64x1 = transmute(vcvtn_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtnq_u64_f64() {
+ let a: f64x2 = f64x2::new(1.5, 2.1);
+ let e: u64x2 = u64x2::new(2, 2);
+ let r: u64x2 = transmute(vcvtnq_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtns_u32_f32() {
+ let a: f32 = 1.5;
+ let e: u32 = 2;
+ let r: u32 = transmute(vcvtns_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtnd_u64_f64() {
+ let a: f64 = 1.5;
+ let e: u64 = 2;
+ let r: u64 = transmute(vcvtnd_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtm_u32_f32() {
+ let a: f32x2 = f32x2::new(1.1, 2.1);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vcvtm_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtmq_u32_f32() {
+ let a: f32x4 = f32x4::new(1.1, 2.1, 2.9, 3.9);
+ let e: u32x4 = u32x4::new(1, 2, 2, 3);
+ let r: u32x4 = transmute(vcvtmq_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtm_u64_f64() {
+ let a: f64 = 1.1;
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vcvtm_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtmq_u64_f64() {
+ let a: f64x2 = f64x2::new(1.1, 2.1);
+ let e: u64x2 = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vcvtmq_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtms_u32_f32() {
+ let a: f32 = 1.1;
+ let e: u32 = 1;
+ let r: u32 = transmute(vcvtms_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtmd_u64_f64() {
+ let a: f64 = 1.1;
+ let e: u64 = 1;
+ let r: u64 = transmute(vcvtmd_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtp_u32_f32() {
+ let a: f32x2 = f32x2::new(1.1, 2.1);
+ let e: u32x2 = u32x2::new(2, 3);
+ let r: u32x2 = transmute(vcvtp_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtpq_u32_f32() {
+ let a: f32x4 = f32x4::new(1.1, 2.1, 2.9, 3.9);
+ let e: u32x4 = u32x4::new(2, 3, 3, 4);
+ let r: u32x4 = transmute(vcvtpq_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtp_u64_f64() {
+ let a: f64 = 1.1;
+ let e: u64x1 = u64x1::new(2);
+ let r: u64x1 = transmute(vcvtp_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtpq_u64_f64() {
+ let a: f64x2 = f64x2::new(1.1, 2.1);
+ let e: u64x2 = u64x2::new(2, 3);
+ let r: u64x2 = transmute(vcvtpq_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtps_u32_f32() {
+ let a: f32 = 1.1;
+ let e: u32 = 2;
+ let r: u32 = transmute(vcvtps_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtpd_u64_f64() {
+ let a: f64 = 1.1;
+ let e: u64 = 2;
+ let r: u64 = transmute(vcvtpd_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
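+ // vdup*_lane{q}: duplicate the lane selected by the const generic; the vector forms broadcast it across the
+ // result, while the scalar forms (vdupb/vduph/vdups/vdupd) return the lane value itself.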
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_p64() {
+ let a: i64x2 = i64x2::new(1, 1);
+ let e: i64x2 = i64x2::new(1, 1);
+ let r: i64x2 = transmute(vdupq_laneq_p64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_p64() {
+ let a: i64x1 = i64x1::new(1);
+ let e: i64x2 = i64x2::new(1, 1);
+ let r: i64x2 = transmute(vdupq_lane_p64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_f64() {
+ let a: f64x2 = f64x2::new(1., 1.);
+ let e: f64x2 = f64x2::new(1., 1.);
+ let r: f64x2 = transmute(vdupq_laneq_f64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_f64() {
+ let a: f64 = 1.;
+ let e: f64x2 = f64x2::new(1., 1.);
+ let r: f64x2 = transmute(vdupq_lane_f64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vdup_lane_p64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_f64() {
+ let a: f64 = 0.;
+ let e: f64 = 0.;
+ let r: f64 = transmute(vdup_lane_f64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vdup_laneq_p64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_f64() {
+ let a: f64x2 = f64x2::new(0., 1.);
+ let e: f64 = 1.;
+ let r: f64 = transmute(vdup_laneq_f64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupb_lane_s8() {
+ let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i8 = 1;
+ let r: i8 = transmute(vdupb_lane_s8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupb_laneq_s8() {
+ let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8 = 1;
+ let r: i8 = transmute(vdupb_laneq_s8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vduph_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 4);
+ let e: i16 = 1;
+ let r: i16 = transmute(vduph_lane_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vduph_laneq_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i16 = 1;
+ let r: i16 = transmute(vduph_laneq_s16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdups_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let e: i32 = 1;
+ let r: i32 = transmute(vdups_lane_s32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdups_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 4);
+ let e: i32 = 1;
+ let r: i32 = transmute(vdups_laneq_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupd_lane_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let e: i64 = 1;
+ let r: i64 = transmute(vdupd_lane_s64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupd_laneq_s64() {
+ let a: i64x2 = i64x2::new(1, 1);
+ let e: i64 = 1;
+ let r: i64 = transmute(vdupd_laneq_s64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupb_lane_u8() {
+ let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: u8 = 1;
+ let r: u8 = transmute(vdupb_lane_u8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupb_laneq_u8() {
+ let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8 = 1;
+ let r: u8 = transmute(vdupb_laneq_u8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vduph_lane_u16() {
+ let a: u16x4 = u16x4::new(1, 1, 1, 4);
+ let e: u16 = 1;
+ let r: u16 = transmute(vduph_lane_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vduph_laneq_u16() {
+ let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: u16 = 1;
+ let r: u16 = transmute(vduph_laneq_u16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdups_lane_u32() {
+ let a: u32x2 = u32x2::new(1, 1);
+ let e: u32 = 1;
+ let r: u32 = transmute(vdups_lane_u32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdups_laneq_u32() {
+ let a: u32x4 = u32x4::new(1, 1, 1, 4);
+ let e: u32 = 1;
+ let r: u32 = transmute(vdups_laneq_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupd_lane_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let e: u64 = 1;
+ let r: u64 = transmute(vdupd_lane_u64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupd_laneq_u64() {
+ let a: u64x2 = u64x2::new(1, 1);
+ let e: u64 = 1;
+ let r: u64 = transmute(vdupd_laneq_u64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupb_lane_p8() {
+ let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: p8 = 1;
+ let r: p8 = transmute(vdupb_lane_p8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupb_laneq_p8() {
+ let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: p8 = 1;
+ let r: p8 = transmute(vdupb_laneq_p8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vduph_lane_p16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 4);
+ let e: p16 = 1;
+ let r: p16 = transmute(vduph_lane_p16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vduph_laneq_p16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: p16 = 1;
+ let r: p16 = transmute(vduph_laneq_p16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdups_lane_f32() {
+ let a: f32x2 = f32x2::new(1., 1.);
+ let e: f32 = 1.;
+ let r: f32 = transmute(vdups_lane_f32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdups_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., 1., 1., 4.);
+ let e: f32 = 1.;
+ let r: f32 = transmute(vdups_laneq_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupd_lane_f64() {
+ let a: f64 = 1.;
+ let e: f64 = 1.;
+ let r: f64 = transmute(vdupd_lane_f64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupd_laneq_f64() {
+ let a: f64x2 = f64x2::new(1., 1.);
+ let e: f64 = 1.;
+ let r: f64 = transmute(vdupd_laneq_f64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
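+ // vext concatenates (a, b) and extracts a window of consecutive elements starting at
+ // the const lane index, e.g. vextq_f64::<1> yields (a[1], b[0]).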
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_p64() {
+ let a: i64x2 = i64x2::new(0, 8);
+ let b: i64x2 = i64x2::new(9, 11);
+ let e: i64x2 = i64x2::new(8, 9);
+ let r: i64x2 = transmute(vextq_p64::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_f64() {
+ let a: f64x2 = f64x2::new(0., 2.);
+ let b: f64x2 = f64x2::new(3., 4.);
+ let e: f64x2 = f64x2::new(2., 3.);
+ let r: f64x2 = transmute(vextq_f64::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
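+ // vmla is multiply-accumulate: each result element is a + b * c.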
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_f64() {
+ let a: f64 = 0.;
+ let b: f64 = 2.;
+ let c: f64 = 3.;
+ let e: f64 = 6.;
+ let r: f64 = transmute(vmla_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_f64() {
+ let a: f64x2 = f64x2::new(0., 1.);
+ let b: f64x2 = f64x2::new(2., 2.);
+ let c: f64x2 = f64x2::new(3., 3.);
+ let e: f64x2 = f64x2::new(6., 7.);
+ let r: f64x2 = transmute(vmlaq_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
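+ // vmlal_high is a widening multiply-accumulate that reads only the high halves of the
+ // 128-bit b and c operands: result[i] = a[i] + b[half + i] * c[half + i].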
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_s8() {
+ let a: i16x8 = i16x8::new(8, 7, 6, 5, 4, 3, 2, 1);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i8x16 = i8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i16x8 = transmute(vmlal_high_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_s16() {
+ let a: i32x4 = i32x4::new(8, 7, 6, 5);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let e: i32x4 = i32x4::new(8, 9, 10, 11);
+ let r: i32x4 = transmute(vmlal_high_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_s32() {
+ let a: i64x2 = i64x2::new(8, 7);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32x4 = i32x4::new(3, 3, 0, 1);
+ let e: i64x2 = i64x2::new(8, 9);
+ let r: i64x2 = transmute(vmlal_high_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_u8() {
+ let a: u16x8 = u16x8::new(8, 7, 6, 5, 4, 3, 2, 1);
+ let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u8x16 = u8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+ let r: u16x8 = transmute(vmlal_high_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_u16() {
+ let a: u32x4 = u32x4::new(8, 7, 6, 5);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(8, 9, 10, 11);
+ let r: u32x4 = transmute(vmlal_high_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_u32() {
+ let a: u64x2 = u64x2::new(8, 7);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32x4 = u32x4::new(3, 3, 0, 1);
+ let e: u64x2 = u64x2::new(8, 9);
+ let r: u64x2 = transmute(vmlal_high_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
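+ // The _n variants broadcast the scalar c to every lane before the widening
+ // multiply-accumulate.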
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_n_s16() {
+ let a: i32x4 = i32x4::new(8, 7, 6, 5);
+ let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: i16 = 2;
+ let e: i32x4 = i32x4::new(8, 9, 10, 11);
+ let r: i32x4 = transmute(vmlal_high_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_n_s32() {
+ let a: i64x2 = i64x2::new(8, 7);
+ let b: i32x4 = i32x4::new(3, 3, 0, 1);
+ let c: i32 = 2;
+ let e: i64x2 = i64x2::new(8, 9);
+ let r: i64x2 = transmute(vmlal_high_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_n_u16() {
+ let a: u32x4 = u32x4::new(8, 7, 6, 5);
+ let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: u16 = 2;
+ let e: u32x4 = u32x4::new(8, 9, 10, 11);
+ let r: u32x4 = transmute(vmlal_high_n_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_n_u32() {
+ let a: u64x2 = u64x2::new(8, 7);
+ let b: u32x4 = u32x4::new(3, 3, 0, 1);
+ let c: u32 = 2;
+ let e: u64x2 = u64x2::new(8, 9);
+ let r: u64x2 = transmute(vmlal_high_n_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
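+ // The _lane/_laneq variants multiply by one lane of c, selected by the const index
+ // (lane 1, holding 2, in the tests below).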
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_lane_s16() {
+ let a: i32x4 = i32x4::new(8, 7, 6, 5);
+ let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(8, 9, 10, 11);
+ let r: i32x4 = transmute(vmlal_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_laneq_s16() {
+ let a: i32x4 = i32x4::new(8, 7, 6, 5);
+ let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i32x4 = i32x4::new(8, 9, 10, 11);
+ let r: i32x4 = transmute(vmlal_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_lane_s32() {
+ let a: i64x2 = i64x2::new(8, 7);
+ let b: i32x4 = i32x4::new(3, 3, 0, 1);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(8, 9);
+ let r: i64x2 = transmute(vmlal_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_laneq_s32() {
+ let a: i64x2 = i64x2::new(8, 7);
+ let b: i32x4 = i32x4::new(3, 3, 0, 1);
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i64x2 = i64x2::new(8, 9);
+ let r: i64x2 = transmute(vmlal_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_lane_u16() {
+ let a: u32x4 = u32x4::new(8, 7, 6, 5);
+ let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: u16x4 = u16x4::new(0, 2, 0, 0);
+ let e: u32x4 = u32x4::new(8, 9, 10, 11);
+ let r: u32x4 = transmute(vmlal_high_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_laneq_u16() {
+ let a: u32x4 = u32x4::new(8, 7, 6, 5);
+ let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: u32x4 = u32x4::new(8, 9, 10, 11);
+ let r: u32x4 = transmute(vmlal_high_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_lane_u32() {
+ let a: u64x2 = u64x2::new(8, 7);
+ let b: u32x4 = u32x4::new(3, 3, 0, 1);
+ let c: u32x2 = u32x2::new(0, 2);
+ let e: u64x2 = u64x2::new(8, 9);
+ let r: u64x2 = transmute(vmlal_high_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_high_laneq_u32() {
+ let a: u64x2 = u64x2::new(8, 7);
+ let b: u32x4 = u32x4::new(3, 3, 0, 1);
+ let c: u32x4 = u32x4::new(0, 2, 0, 0);
+ let e: u64x2 = u64x2::new(8, 9);
+ let r: u64x2 = transmute(vmlal_high_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
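+ // vmls is multiply-subtract: each result element is a - b * c.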
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_f64() {
+ let a: f64 = 6.;
+ let b: f64 = 2.;
+ let c: f64 = 3.;
+ let e: f64 = 0.;
+ let r: f64 = transmute(vmls_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_f64() {
+ let a: f64x2 = f64x2::new(6., 7.);
+ let b: f64x2 = f64x2::new(2., 2.);
+ let c: f64x2 = f64x2::new(3., 3.);
+ let e: f64x2 = f64x2::new(0., 1.);
+ let r: f64x2 = transmute(vmlsq_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
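+ // vmlsl_high is the widening multiply-subtract counterpart of vmlal_high, again
+ // reading only the high halves of b and c.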
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_s8() {
+ let a: i16x8 = i16x8::new(14, 15, 16, 17, 18, 19, 20, 21);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i8x16 = i8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(14, 13, 12, 11, 10, 9, 8, 7);
+ let r: i16x8 = transmute(vmlsl_high_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_s16() {
+ let a: i32x4 = i32x4::new(14, 15, 16, 17);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let e: i32x4 = i32x4::new(14, 13, 12, 11);
+ let r: i32x4 = transmute(vmlsl_high_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_s32() {
+ let a: i64x2 = i64x2::new(14, 15);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32x4 = i32x4::new(3, 3, 0, 1);
+ let e: i64x2 = i64x2::new(14, 13);
+ let r: i64x2 = transmute(vmlsl_high_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_u8() {
+ let a: u16x8 = u16x8::new(14, 15, 16, 17, 18, 19, 20, 21);
+ let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u8x16 = u8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(14, 13, 12, 11, 10, 9, 8, 7);
+ let r: u16x8 = transmute(vmlsl_high_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_u16() {
+ let a: u32x4 = u32x4::new(14, 15, 16, 17);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(14, 13, 12, 11);
+ let r: u32x4 = transmute(vmlsl_high_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_u32() {
+ let a: u64x2 = u64x2::new(14, 15);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32x4 = u32x4::new(3, 3, 0, 1);
+ let e: u64x2 = u64x2::new(14, 13);
+ let r: u64x2 = transmute(vmlsl_high_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_n_s16() {
+ let a: i32x4 = i32x4::new(14, 15, 16, 17);
+ let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: i16 = 2;
+ let e: i32x4 = i32x4::new(14, 13, 12, 11);
+ let r: i32x4 = transmute(vmlsl_high_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_n_s32() {
+ let a: i64x2 = i64x2::new(14, 15);
+ let b: i32x4 = i32x4::new(3, 3, 0, 1);
+ let c: i32 = 2;
+ let e: i64x2 = i64x2::new(14, 13);
+ let r: i64x2 = transmute(vmlsl_high_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_n_u16() {
+ let a: u32x4 = u32x4::new(14, 15, 16, 17);
+ let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: u16 = 2;
+ let e: u32x4 = u32x4::new(14, 13, 12, 11);
+ let r: u32x4 = transmute(vmlsl_high_n_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_n_u32() {
+ let a: u64x2 = u64x2::new(14, 15);
+ let b: u32x4 = u32x4::new(3, 3, 0, 1);
+ let c: u32 = 2;
+ let e: u64x2 = u64x2::new(14, 13);
+ let r: u64x2 = transmute(vmlsl_high_n_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_lane_s16() {
+ let a: i32x4 = i32x4::new(14, 15, 16, 17);
+ let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(14, 13, 12, 11);
+ let r: i32x4 = transmute(vmlsl_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_laneq_s16() {
+ let a: i32x4 = i32x4::new(14, 15, 16, 17);
+ let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i32x4 = i32x4::new(14, 13, 12, 11);
+ let r: i32x4 = transmute(vmlsl_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_lane_s32() {
+ let a: i64x2 = i64x2::new(14, 15);
+ let b: i32x4 = i32x4::new(3, 3, 0, 1);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(14, 13);
+ let r: i64x2 = transmute(vmlsl_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_laneq_s32() {
+ let a: i64x2 = i64x2::new(14, 15);
+ let b: i32x4 = i32x4::new(3, 3, 0, 1);
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i64x2 = i64x2::new(14, 13);
+ let r: i64x2 = transmute(vmlsl_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_lane_u16() {
+ let a: u32x4 = u32x4::new(14, 15, 16, 17);
+ let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: u16x4 = u16x4::new(0, 2, 0, 0);
+ let e: u32x4 = u32x4::new(14, 13, 12, 11);
+ let r: u32x4 = transmute(vmlsl_high_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_laneq_u16() {
+ let a: u32x4 = u32x4::new(14, 15, 16, 17);
+ let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+ let c: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: u32x4 = u32x4::new(14, 13, 12, 11);
+ let r: u32x4 = transmute(vmlsl_high_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_lane_u32() {
+ let a: u64x2 = u64x2::new(14, 15);
+ let b: u32x4 = u32x4::new(3, 3, 0, 1);
+ let c: u32x2 = u32x2::new(0, 2);
+ let e: u64x2 = u64x2::new(14, 13);
+ let r: u64x2 = transmute(vmlsl_high_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_high_laneq_u32() {
+ let a: u64x2 = u64x2::new(14, 15);
+ let b: u32x4 = u32x4::new(3, 3, 0, 1);
+ let c: u32x4 = u32x4::new(0, 2, 0, 0);
+ let e: u64x2 = u64x2::new(14, 13);
+ let r: u64x2 = transmute(vmlsl_high_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
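+ // vmovn_high narrows each element of b to half width and returns a 128-bit vector
+ // whose low half is a and whose high half is the narrowed b.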
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_high_s16() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 2, 3, 4, 5);
+ let b: i16x8 = i16x8::new(2, 3, 4, 5, 12, 13, 14, 15);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vmovn_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_high_s32() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(2, 3, 4, 5);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 2, 3, 4, 5);
+ let r: i16x8 = transmute(vmovn_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_high_s64() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i64x2 = i64x2::new(2, 3);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmovn_high_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_high_u16() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 2, 3, 4, 5);
+ let b: u16x8 = u16x8::new(2, 3, 4, 5, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vmovn_high_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_high_u32() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: u32x4 = u32x4::new(2, 3, 4, 5);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 2, 3, 4, 5);
+ let r: u16x8 = transmute(vmovn_high_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_high_u64() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u64x2 = u64x2::new(2, 3);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmovn_high_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
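+ // vneg negates each element; vnegd_s64 is the scalar form.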
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vneg_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vneg_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vnegq_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i64x2 = i64x2::new(0, -1);
+ let r: i64x2 = transmute(vnegq_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vnegd_s64() {
+ let a: i64 = 1;
+ let e: i64 = -1;
+ let r: i64 = transmute(vnegd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vneg_f64() {
+ let a: f64 = 0.;
+ let e: f64 = 0.;
+ let r: f64 = transmute(vneg_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vnegq_f64() {
+ let a: f64x2 = f64x2::new(0., 1.);
+ let e: f64x2 = f64x2::new(0., -1.);
+ let r: f64x2 = transmute(vnegq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
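+ // vqneg is saturating negation: i64::MIN cannot be negated in two's complement, so it
+ // saturates to i64::MAX (0x7F_FF_FF_FF_FF_FF_FF_FF) instead of wrapping.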
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqneg_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let r: i64x1 = transmute(vqneg_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqnegq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, 0);
+ let e: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0);
+ let r: i64x2 = transmute(vqnegq_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqnegb_s8() {
+ let a: i8 = 1;
+ let e: i8 = -1;
+ let r: i8 = transmute(vqnegb_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqnegh_s16() {
+ let a: i16 = 1;
+ let e: i16 = -1;
+ let r: i16 = transmute(vqnegh_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqnegs_s32() {
+ let a: i32 = 1;
+ let e: i32 = -1;
+ let r: i32 = transmute(vqnegs_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqnegd_s64() {
+ let a: i64 = 1;
+ let e: i64 = -1;
+ let r: i64 = transmute(vqnegd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
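+ // Scalar saturating subtraction (the b/h/s/d forms operate on a single element).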
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubb_s8() {
+ let a: i8 = 42;
+ let b: i8 = 1;
+ let e: i8 = 41;
+ let r: i8 = transmute(vqsubb_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubh_s16() {
+ let a: i16 = 42;
+ let b: i16 = 1;
+ let e: i16 = 41;
+ let r: i16 = transmute(vqsubh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubb_u8() {
+ let a: u8 = 42;
+ let b: u8 = 1;
+ let e: u8 = 41;
+ let r: u8 = transmute(vqsubb_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubh_u16() {
+ let a: u16 = 42;
+ let b: u16 = 1;
+ let e: u16 = 41;
+ let r: u16 = transmute(vqsubh_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubs_u32() {
+ let a: u32 = 42;
+ let b: u32 = 1;
+ let e: u32 = 41;
+ let r: u32 = transmute(vqsubs_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubd_u64() {
+ let a: u64 = 42;
+ let b: u64 = 1;
+ let e: u64 = 41;
+ let r: u64 = transmute(vqsubd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubs_s32() {
+ let a: i32 = 42;
+ let b: i32 = 1;
+ let e: i32 = 41;
+ let r: i32 = transmute(vqsubs_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubd_s64() {
+ let a: i64 = 42;
+ let b: i64 = 1;
+ let e: i64 = 41;
+ let r: i64 = transmute(vqsubd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
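+ // vrbit reverses the bit order within each byte, e.g. 2 = 0b0000_0010 becomes
+ // 0b0100_0000 = 64.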
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrbit_s8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let e: i8x8 = i8x8::new(0, 64, 32, 96, 16, 80, 48, 112);
+ let r: i8x8 = transmute(vrbit_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrbitq_s8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let e: i8x16 = i8x16::new(0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120);
+ let r: i8x16 = transmute(vrbitq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrbit_u8() {
+ let a: u8x8 = u8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let e: u8x8 = u8x8::new(0, 64, 32, 96, 16, 80, 48, 112);
+ let r: u8x8 = transmute(vrbit_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrbitq_u8() {
+ let a: u8x16 = u8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let e: u8x16 = u8x16::new(0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120);
+ let r: u8x16 = transmute(vrbitq_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrbit_p8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let e: i8x8 = i8x8::new(0, 64, 32, 96, 16, 80, 48, 112);
+ let r: i8x8 = transmute(vrbit_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrbitq_p8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let e: i8x16 = i8x16::new(0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120);
+ let r: i8x16 = transmute(vrbitq_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
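+ // Floating-point rounding family: vrndx rounds using the current mode (round to
+ // nearest even by default, hence -1.5 -> -2.0 and 0.5 -> 0.0), vrnda rounds ties away
+ // from zero, vrndn rounds ties to even, vrndm rounds toward negative infinity (floor),
+ // vrndp toward positive infinity (ceiling), vrnd toward zero (truncate), and vrndi
+ // rounds with the current mode without raising the inexact exception.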
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndx_f32() {
+ let a: f32x2 = f32x2::new(-1.5, 0.5);
+ let e: f32x2 = f32x2::new(-2.0, 0.0);
+ let r: f32x2 = transmute(vrndx_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndxq_f32() {
+ let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
+ let e: f32x4 = f32x4::new(-2.0, 0.0, 2.0, 2.0);
+ let r: f32x4 = transmute(vrndxq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndx_f64() {
+ let a: f64 = -1.5;
+ let e: f64 = -2.0;
+ let r: f64 = transmute(vrndx_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndxq_f64() {
+ let a: f64x2 = f64x2::new(-1.5, 0.5);
+ let e: f64x2 = f64x2::new(-2.0, 0.0);
+ let r: f64x2 = transmute(vrndxq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrnda_f32() {
+ let a: f32x2 = f32x2::new(-1.5, 0.5);
+ let e: f32x2 = f32x2::new(-2.0, 1.0);
+ let r: f32x2 = transmute(vrnda_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndaq_f32() {
+ let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
+ let e: f32x4 = f32x4::new(-2.0, 1.0, 2.0, 3.0);
+ let r: f32x4 = transmute(vrndaq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrnda_f64() {
+ let a: f64 = -1.5;
+ let e: f64 = -2.0;
+ let r: f64 = transmute(vrnda_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndaq_f64() {
+ let a: f64x2 = f64x2::new(-1.5, 0.5);
+ let e: f64x2 = f64x2::new(-2.0, 1.0);
+ let r: f64x2 = transmute(vrndaq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndn_f64() {
+ let a: f64 = -1.5;
+ let e: f64 = -2.0;
+ let r: f64 = transmute(vrndn_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndnq_f64() {
+ let a: f64x2 = f64x2::new(-1.5, 0.5);
+ let e: f64x2 = f64x2::new(-2.0, 0.0);
+ let r: f64x2 = transmute(vrndnq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndns_f32() {
+ let a: f32 = -1.5;
+ let e: f32 = -2.0;
+ let r: f32 = transmute(vrndns_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndm_f32() {
+ let a: f32x2 = f32x2::new(-1.5, 0.5);
+ let e: f32x2 = f32x2::new(-2.0, 0.0);
+ let r: f32x2 = transmute(vrndm_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndmq_f32() {
+ let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
+ let e: f32x4 = f32x4::new(-2.0, 0.0, 1.0, 2.0);
+ let r: f32x4 = transmute(vrndmq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndm_f64() {
+ let a: f64 = -1.5;
+ let e: f64 = -2.0;
+ let r: f64 = transmute(vrndm_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndmq_f64() {
+ let a: f64x2 = f64x2::new(-1.5, 0.5);
+ let e: f64x2 = f64x2::new(-2.0, 0.0);
+ let r: f64x2 = transmute(vrndmq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndp_f32() {
+ let a: f32x2 = f32x2::new(-1.5, 0.5);
+ let e: f32x2 = f32x2::new(-1.0, 1.0);
+ let r: f32x2 = transmute(vrndp_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndpq_f32() {
+ let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
+ let e: f32x4 = f32x4::new(-1.0, 1.0, 2.0, 3.0);
+ let r: f32x4 = transmute(vrndpq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndp_f64() {
+ let a: f64 = -1.5;
+ let e: f64 = -1.0;
+ let r: f64 = transmute(vrndp_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndpq_f64() {
+ let a: f64x2 = f64x2::new(-1.5, 0.5);
+ let e: f64x2 = f64x2::new(-1.0, 1.0);
+ let r: f64x2 = transmute(vrndpq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrnd_f32() {
+ let a: f32x2 = f32x2::new(-1.5, 0.5);
+ let e: f32x2 = f32x2::new(-1.0, 0.0);
+ let r: f32x2 = transmute(vrnd_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndq_f32() {
+ let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
+ let e: f32x4 = f32x4::new(-1.0, 0.0, 1.0, 2.0);
+ let r: f32x4 = transmute(vrndq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrnd_f64() {
+ let a: f64 = -1.5;
+ let e: f64 = -1.0;
+ let r: f64 = transmute(vrnd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndq_f64() {
+ let a: f64x2 = f64x2::new(-1.5, 0.5);
+ let e: f64x2 = f64x2::new(-1.0, 0.0);
+ let r: f64x2 = transmute(vrndq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndi_f32() {
+ let a: f32x2 = f32x2::new(-1.5, 0.5);
+ let e: f32x2 = f32x2::new(-2.0, 0.0);
+ let r: f32x2 = transmute(vrndi_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndiq_f32() {
+ let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
+ let e: f32x4 = f32x4::new(-2.0, 0.0, 2.0, 2.0);
+ let r: f32x4 = transmute(vrndiq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndi_f64() {
+ let a: f64 = -1.5;
+ let e: f64 = -2.0;
+ let r: f64 = transmute(vrndi_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndiq_f64() {
+ let a: f64x2 = f64x2::new(-1.5, 0.5);
+ let e: f64x2 = f64x2::new(-2.0, 0.0);
+ let r: f64x2 = transmute(vrndiq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
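+ // Scalar saturating addition, mirroring the vqsub tests above.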
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddb_s8() {
+ let a: i8 = 42;
+ let b: i8 = 1;
+ let e: i8 = 43;
+ let r: i8 = transmute(vqaddb_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddh_s16() {
+ let a: i16 = 42;
+ let b: i16 = 1;
+ let e: i16 = 43;
+ let r: i16 = transmute(vqaddh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddb_u8() {
+ let a: u8 = 42;
+ let b: u8 = 1;
+ let e: u8 = 43;
+ let r: u8 = transmute(vqaddb_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddh_u16() {
+ let a: u16 = 42;
+ let b: u16 = 1;
+ let e: u16 = 43;
+ let r: u16 = transmute(vqaddh_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadds_u32() {
+ let a: u32 = 42;
+ let b: u32 = 1;
+ let e: u32 = 43;
+ let r: u32 = transmute(vqadds_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddd_u64() {
+ let a: u64 = 42;
+ let b: u64 = 1;
+ let e: u64 = 43;
+ let r: u64 = transmute(vqaddd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadds_s32() {
+ let a: i32 = 42;
+ let b: i32 = 1;
+ let e: i32 = 43;
+ let r: i32 = transmute(vqadds_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddd_s64() {
+ let a: i64 = 42;
+ let b: i64 = 1;
+ let e: i64 = 43;
+ let r: i64 = transmute(vqaddd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
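+ // vld1_*_x2/_x3/_x4 load 2, 3 or 4 consecutive vectors from memory without any
+ // de-interleaving; the tests skip the first array element so the expected values
+ // start at 1.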
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_f64_x2() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let e: [f64; 2] = [1., 2.];
+ let r: [f64; 2] = transmute(vld1_f64_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_f64_x2() {
+ let a: [f64; 5] = [0., 1., 2., 3., 4.];
+ let e: [f64x2; 2] = [f64x2::new(1., 2.), f64x2::new(3., 4.)];
+ let r: [f64x2; 2] = transmute(vld1q_f64_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_f64_x3() {
+ let a: [f64; 4] = [0., 1., 2., 3.];
+ let e: [f64; 3] = [1., 2., 3.];
+ let r: [f64; 3] = transmute(vld1_f64_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_f64_x3() {
+ let a: [f64; 7] = [0., 1., 2., 3., 4., 5., 6.];
+ let e: [f64x2; 3] = [f64x2::new(1., 2.), f64x2::new(3., 4.), f64x2::new(5., 6.)];
+ let r: [f64x2; 3] = transmute(vld1q_f64_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_f64_x4() {
+ let a: [f64; 5] = [0., 1., 2., 3., 4.];
+ let e: [f64; 4] = [1., 2., 3., 4.];
+ let r: [f64; 4] = transmute(vld1_f64_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_f64_x4() {
+ let a: [f64; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
+ let e: [f64x2; 4] = [f64x2::new(1., 2.), f64x2::new(3., 4.), f64x2::new(5., 6.), f64x2::new(7., 8.)];
+ let r: [f64x2; 4] = transmute(vld1q_f64_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
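+ // vld2/vld3/vld4 perform de-interleaving loads: vld2 splits alternating elements
+ // into two registers, vld3 every third element into three, and so on.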
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_s64() {
+ let a: [i64; 5] = [0, 1, 2, 2, 3];
+ let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(2, 3)];
+ let r: [i64x2; 2] = transmute(vld2q_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_u64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 3];
+ let e: [u64x2; 2] = [u64x2::new(1, 2), u64x2::new(2, 3)];
+ let r: [u64x2; 2] = transmute(vld2q_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_p64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 3];
+ let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(2, 3)];
+ let r: [i64x2; 2] = transmute(vld2q_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_f64() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let e: [f64; 2] = [1., 2.];
+ let r: [f64; 2] = transmute(vld2_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_f64() {
+ let a: [f64; 5] = [0., 1., 2., 2., 3.];
+ let e: [f64x2; 2] = [f64x2::new(1., 2.), f64x2::new(2., 3.)];
+ let r: [f64x2; 2] = transmute(vld2q_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
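+ // The _dup variants load one element per register and broadcast it to every lane.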
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_s64() {
+ let a: [i64; 5] = [0, 1, 1, 2, 3];
+ let e: [i64x2; 2] = [i64x2::new(1, 1), i64x2::new(1, 1)];
+ let r: [i64x2; 2] = transmute(vld2q_dup_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_u64() {
+ let a: [u64; 5] = [0, 1, 1, 2, 3];
+ let e: [u64x2; 2] = [u64x2::new(1, 1), u64x2::new(1, 1)];
+ let r: [u64x2; 2] = transmute(vld2q_dup_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_p64() {
+ let a: [u64; 5] = [0, 1, 1, 2, 3];
+ let e: [i64x2; 2] = [i64x2::new(1, 1), i64x2::new(1, 1)];
+ let r: [i64x2; 2] = transmute(vld2q_dup_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_f64() {
+ let a: [f64; 3] = [0., 1., 1.];
+ let e: [f64; 2] = [1., 1.];
+ let r: [f64; 2] = transmute(vld2_dup_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_f64() {
+ let a: [f64; 5] = [0., 1., 1., 2., 3.];
+ let e: [f64x2; 2] = [f64x2::new(1., 1.), f64x2::new(1., 1.)];
+ let r: [f64x2; 2] = transmute(vld2q_dup_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
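+ // The _lane variants load one element per register into the lane selected by the
+ // const index, leaving the other lanes of b untouched.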
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_s8() {
+ let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x16; 2] = [i8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i8x16; 2] = [i8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i8x16; 2] = transmute(vld2q_lane_s8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_s64() {
+ let a: [i64; 3] = [0, 1, 2];
+ let b: [i64x1; 2] = [i64x1::new(0), i64x1::new(2)];
+ let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)];
+ let r: [i64x1; 2] = transmute(vld2_lane_s64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_s64() {
+ let a: [i64; 5] = [0, 1, 2, 3, 4];
+ let b: [i64x2; 2] = [i64x2::new(0, 2), i64x2::new(2, 14)];
+ let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(2, 14)];
+ let r: [i64x2; 2] = transmute(vld2q_lane_s64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_p64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let b: [i64x1; 2] = [i64x1::new(0), i64x1::new(2)];
+ let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)];
+ let r: [i64x1; 2] = transmute(vld2_lane_p64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_p64() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let b: [i64x2; 2] = [i64x2::new(0, 2), i64x2::new(2, 14)];
+ let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(2, 14)];
+ let r: [i64x2; 2] = transmute(vld2q_lane_p64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_u8() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u8x16; 2] = [u8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [u8x16; 2] = [u8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [u8x16; 2] = transmute(vld2q_lane_u8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_u64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let b: [u64x1; 2] = [u64x1::new(0), u64x1::new(2)];
+ let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)];
+ let r: [u64x1; 2] = transmute(vld2_lane_u64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_u64() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let b: [u64x2; 2] = [u64x2::new(0, 2), u64x2::new(2, 14)];
+ let e: [u64x2; 2] = [u64x2::new(1, 2), u64x2::new(2, 14)];
+ let r: [u64x2; 2] = transmute(vld2q_lane_u64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_p8() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x16; 2] = [i8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i8x16; 2] = [i8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i8x16; 2] = transmute(vld2q_lane_p8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_f64() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let b: [f64; 2] = [0., 2.];
+ let e: [f64; 2] = [1., 2.];
+ let r: [f64; 2] = transmute(vld2_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_f64() {
+ let a: [f64; 5] = [0., 1., 2., 3., 4.];
+ let b: [f64x2; 2] = [f64x2::new(0., 2.), f64x2::new(2., 14.)];
+ let e: [f64x2; 2] = [f64x2::new(1., 2.), f64x2::new(2., 14.)];
+ let r: [f64x2; 2] = transmute(vld2q_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_s64() {
+ let a: [i64; 7] = [0, 1, 2, 2, 2, 4, 4];
+ let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(2, 4), i64x2::new(2, 4)];
+ let r: [i64x2; 3] = transmute(vld3q_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_u64() {
+ let a: [u64; 7] = [0, 1, 2, 2, 2, 4, 4];
+ let e: [u64x2; 3] = [u64x2::new(1, 2), u64x2::new(2, 4), u64x2::new(2, 4)];
+ let r: [u64x2; 3] = transmute(vld3q_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_p64() {
+ let a: [u64; 7] = [0, 1, 2, 2, 2, 4, 4];
+ let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(2, 4), i64x2::new(2, 4)];
+ let r: [i64x2; 3] = transmute(vld3q_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_f64() {
+ let a: [f64; 4] = [0., 1., 2., 2.];
+ let e: [f64; 3] = [1., 2., 2.];
+ let r: [f64; 3] = transmute(vld3_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_f64() {
+ let a: [f64; 7] = [0., 1., 2., 2., 2., 4., 4.];
+ let e: [f64x2; 3] = [f64x2::new(1., 2.), f64x2::new(2., 4.), f64x2::new(2., 4.)];
+ let r: [f64x2; 3] = transmute(vld3q_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_s64() {
+ let a: [i64; 7] = [0, 1, 1, 1, 3, 1, 4];
+ let e: [i64x2; 3] = [i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1)];
+ let r: [i64x2; 3] = transmute(vld3q_dup_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_u64() {
+ let a: [u64; 7] = [0, 1, 1, 1, 3, 1, 4];
+ let e: [u64x2; 3] = [u64x2::new(1, 1), u64x2::new(1, 1), u64x2::new(1, 1)];
+ let r: [u64x2; 3] = transmute(vld3q_dup_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_p64() {
+ let a: [u64; 7] = [0, 1, 1, 1, 3, 1, 4];
+ let e: [i64x2; 3] = [i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1)];
+ let r: [i64x2; 3] = transmute(vld3q_dup_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_f64() {
+ let a: [f64; 4] = [0., 1., 1., 1.];
+ let e: [f64; 3] = [1., 1., 1.];
+ let r: [f64; 3] = transmute(vld3_dup_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_f64() {
+ let a: [f64; 7] = [0., 1., 1., 1., 3., 1., 4.];
+ let e: [f64x2; 3] = [f64x2::new(1., 1.), f64x2::new(1., 1.), f64x2::new(1., 1.)];
+ let r: [f64x2; 3] = transmute(vld3q_dup_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_s8() {
+ let a: [i8; 49] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x16; 3] = [i8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)];
+ let e: [i8x16; 3] = [i8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)];
+ let r: [i8x16; 3] = transmute(vld3q_lane_s8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_s64() {
+ let a: [i64; 4] = [0, 1, 2, 2];
+ let b: [i64x1; 3] = [i64x1::new(0), i64x1::new(2), i64x1::new(2)];
+ let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)];
+ let r: [i64x1; 3] = transmute(vld3_lane_s64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_s64() {
+ let a: [i64; 7] = [0, 1, 2, 2, 4, 5, 6];
+ let b: [i64x2; 3] = [i64x2::new(0, 2), i64x2::new(2, 14), i64x2::new(2, 16)];
+ let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(2, 14), i64x2::new(2, 16)];
+ let r: [i64x2; 3] = transmute(vld3q_lane_s64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_p64() {
+ let a: [u64; 4] = [0, 1, 2, 2];
+ let b: [i64x1; 3] = [i64x1::new(0), i64x1::new(2), i64x1::new(2)];
+ let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)];
+ let r: [i64x1; 3] = transmute(vld3_lane_p64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_p64() {
+ let a: [u64; 7] = [0, 1, 2, 2, 4, 5, 6];
+ let b: [i64x2; 3] = [i64x2::new(0, 2), i64x2::new(2, 14), i64x2::new(2, 16)];
+ let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(2, 14), i64x2::new(2, 16)];
+ let r: [i64x2; 3] = transmute(vld3q_lane_p64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_p8() {
+ let a: [u8; 49] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x16; 3] = [i8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)];
+ let e: [i8x16; 3] = [i8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)];
+ let r: [i8x16; 3] = transmute(vld3q_lane_p8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_u8() {
+ let a: [u8; 49] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u8x16; 3] = [u8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)];
+ let e: [u8x16; 3] = [u8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), u8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)];
+ let r: [u8x16; 3] = transmute(vld3q_lane_u8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_u64() {
+ let a: [u64; 4] = [0, 1, 2, 2];
+ let b: [u64x1; 3] = [u64x1::new(0), u64x1::new(2), u64x1::new(2)];
+ let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(2)];
+ let r: [u64x1; 3] = transmute(vld3_lane_u64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_u64() {
+ let a: [u64; 7] = [0, 1, 2, 2, 4, 5, 6];
+ let b: [u64x2; 3] = [u64x2::new(0, 2), u64x2::new(2, 14), u64x2::new(2, 16)];
+ let e: [u64x2; 3] = [u64x2::new(1, 2), u64x2::new(2, 14), u64x2::new(2, 16)];
+ let r: [u64x2; 3] = transmute(vld3q_lane_u64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_f64() {
+ let a: [f64; 4] = [0., 1., 2., 2.];
+ let b: [f64; 3] = [0., 2., 2.];
+ let e: [f64; 3] = [1., 2., 2.];
+ let r: [f64; 3] = transmute(vld3_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_f64() {
+ let a: [f64; 7] = [0., 1., 2., 2., 4., 5., 6.];
+ let b: [f64x2; 3] = [f64x2::new(0., 2.), f64x2::new(2., 14.), f64x2::new(9., 16.)];
+ let e: [f64x2; 3] = [f64x2::new(1., 2.), f64x2::new(2., 14.), f64x2::new(2., 16.)];
+ let r: [f64x2; 3] = transmute(vld3q_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_s64() {
+ let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(2, 6), i64x2::new(2, 6), i64x2::new(6, 8)];
+ let r: [i64x2; 4] = transmute(vld4q_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_u64() {
+ let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [u64x2; 4] = [u64x2::new(1, 2), u64x2::new(2, 6), u64x2::new(2, 6), u64x2::new(6, 8)];
+ let r: [u64x2; 4] = transmute(vld4q_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_p64() {
+ let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(2, 6), i64x2::new(2, 6), i64x2::new(6, 8)];
+ let r: [i64x2; 4] = transmute(vld4q_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_f64() {
+ let a: [f64; 5] = [0., 1., 2., 2., 6.];
+ let e: [f64; 4] = [1., 2., 2., 6.];
+ let r: [f64; 4] = transmute(vld4_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_f64() {
+ let a: [f64; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
+ let e: [f64x2; 4] = [f64x2::new(1., 2.), f64x2::new(2., 6.), f64x2::new(2., 6.), f64x2::new(6., 8.)];
+ let r: [f64x2; 4] = transmute(vld4q_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_s64() {
+ let a: [i64; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5];
+ let e: [i64x2; 4] = [i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1)];
+ let r: [i64x2; 4] = transmute(vld4q_dup_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_u64() {
+ let a: [u64; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5];
+ let e: [u64x2; 4] = [u64x2::new(1, 1), u64x2::new(1, 1), u64x2::new(1, 1), u64x2::new(1, 1)];
+ let r: [u64x2; 4] = transmute(vld4q_dup_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_p64() {
+ let a: [u64; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5];
+ let e: [i64x2; 4] = [i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1)];
+ let r: [i64x2; 4] = transmute(vld4q_dup_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_f64() {
+ let a: [f64; 5] = [0., 1., 1., 1., 1.];
+ let e: [f64; 4] = [1., 1., 1., 1.];
+ let r: [f64; 4] = transmute(vld4_dup_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_f64() {
+ let a: [f64; 9] = [0., 1., 1., 1., 1., 6., 4., 3., 5.];
+ let e: [f64x2; 4] = [f64x2::new(1., 1.), f64x2::new(1., 1.), f64x2::new(1., 1.), f64x2::new(1., 1.)];
+ let r: [f64x2; 4] = transmute(vld4q_dup_f64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_s8() {
+ let a: [i8; 65] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16];
+ let b: [i8x16; 4] = [i8x16::new(0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), i8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)];
+ let e: [i8x16; 4] = [i8x16::new(1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), i8x16::new(2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [i8x16; 4] = transmute(vld4q_lane_s8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_s64() {
+ let a: [i64; 5] = [0, 1, 2, 2, 2];
+ let b: [i64x1; 4] = [i64x1::new(0), i64x1::new(2), i64x1::new(2), i64x1::new(2)];
+ let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(2)];
+ let r: [i64x1; 4] = transmute(vld4_lane_s64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_s64() {
+ let a: [i64; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8];
+ let b: [i64x2; 4] = [i64x2::new(0, 2), i64x2::new(2, 2), i64x2::new(2, 16), i64x2::new(2, 18)];
+ let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(2, 2), i64x2::new(2, 16), i64x2::new(2, 18)];
+ let r: [i64x2; 4] = transmute(vld4q_lane_s64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_p64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 2];
+ let b: [i64x1; 4] = [i64x1::new(0), i64x1::new(2), i64x1::new(2), i64x1::new(2)];
+ let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(2)];
+ let r: [i64x1; 4] = transmute(vld4_lane_p64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_p64() {
+ let a: [u64; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8];
+ let b: [i64x2; 4] = [i64x2::new(0, 2), i64x2::new(2, 2), i64x2::new(2, 16), i64x2::new(2, 18)];
+ let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(2, 2), i64x2::new(2, 16), i64x2::new(2, 18)];
+ let r: [i64x2; 4] = transmute(vld4q_lane_p64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_p8() {
+ let a: [u8; 65] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16];
+ let b: [i8x16; 4] = [i8x16::new(0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), i8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)];
+ let e: [i8x16; 4] = [i8x16::new(1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), i8x16::new(2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [i8x16; 4] = transmute(vld4q_lane_p8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_u8() {
+ let a: [u8; 65] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16];
+ let b: [u8x16; 4] = [u8x16::new(0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), u8x16::new(11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), u8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)];
+ let e: [u8x16; 4] = [u8x16::new(1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), u8x16::new(2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), u8x16::new(2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [u8x16; 4] = transmute(vld4q_lane_u8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_u64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 2];
+ let b: [u64x1; 4] = [u64x1::new(0), u64x1::new(2), u64x1::new(2), u64x1::new(2)];
+ let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(2), u64x1::new(2)];
+ let r: [u64x1; 4] = transmute(vld4_lane_u64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_u64() {
+ let a: [u64; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8];
+ let b: [u64x2; 4] = [u64x2::new(0, 2), u64x2::new(2, 2), u64x2::new(2, 16), u64x2::new(2, 18)];
+ let e: [u64x2; 4] = [u64x2::new(1, 2), u64x2::new(2, 2), u64x2::new(2, 16), u64x2::new(2, 18)];
+ let r: [u64x2; 4] = transmute(vld4q_lane_u64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_f64() {
+ let a: [f64; 5] = [0., 1., 2., 2., 2.];
+ let b: [f64; 4] = [0., 2., 2., 2.];
+ let e: [f64; 4] = [1., 2., 2., 2.];
+ let r: [f64; 4] = transmute(vld4_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_f64() {
+ let a: [f64; 9] = [0., 1., 2., 2., 2., 5., 6., 7., 8.];
+ let b: [f64x2; 4] = [f64x2::new(0., 2.), f64x2::new(2., 2.), f64x2::new(2., 16.), f64x2::new(2., 18.)];
+ let e: [f64x2; 4] = [f64x2::new(1., 2.), f64x2::new(2., 2.), f64x2::new(2., 16.), f64x2::new(2., 18.)];
+ let r: [f64x2; 4] = transmute(vld4q_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
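+ // The store tests below build their vector argument with read_unaligned from
+ // a[1..]: skipping the first element keeps the source pointer element-aligned
+ // but off the vector's natural alignment, so the unaligned path is exercised.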
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_f64() {
+ let a: [f64; 2] = [0., 1.];
+ let e: [f64; 1] = [1.];
+ let mut r: [f64; 1] = [0f64; 1];
+ vst1_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_f64() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let e: [f64; 2] = [1., 0.];
+ let mut r: [f64; 2] = [0f64; 2];
+ vst1q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_f64_x2() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let e: [f64; 2] = [1., 2.];
+ let mut r: [f64; 2] = [0f64; 2];
+ vst1_f64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_f64_x2() {
+ let a: [f64; 5] = [0., 1., 2., 3., 4.];
+ let e: [f64; 4] = [1., 2., 3., 4.];
+ let mut r: [f64; 4] = [0f64; 4];
+ vst1q_f64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_f64_x3() {
+ let a: [f64; 4] = [0., 1., 2., 3.];
+ let e: [f64; 3] = [1., 2., 3.];
+ let mut r: [f64; 3] = [0f64; 3];
+ vst1_f64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_f64_x3() {
+ let a: [f64; 7] = [0., 1., 2., 3., 4., 5., 6.];
+ let e: [f64; 6] = [1., 2., 3., 4., 5., 6.];
+ let mut r: [f64; 6] = [0f64; 6];
+ vst1q_f64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_f64_x4() {
+ let a: [f64; 5] = [0., 1., 2., 3., 4.];
+ let e: [f64; 4] = [1., 2., 3., 4.];
+ let mut r: [f64; 4] = [0f64; 4];
+ vst1_f64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_f64_x4() {
+ let a: [f64; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
+ let e: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let mut r: [f64; 8] = [0f64; 8];
+ vst1q_f64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_s64() {
+ let a: [i64; 5] = [0, 1, 2, 2, 3];
+ let e: [i64; 4] = [1, 2, 2, 3];
+ let mut r: [i64; 4] = [0i64; 4];
+ vst2q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_u64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 3];
+ let e: [u64; 4] = [1, 2, 2, 3];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst2q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_p64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 3];
+ let e: [u64; 4] = [1, 2, 2, 3];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst2q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_f64() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let e: [f64; 2] = [1., 2.];
+ let mut r: [f64; 2] = [0f64; 2];
+ vst2_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_f64() {
+ let a: [f64; 5] = [0., 1., 2., 2., 3.];
+ let e: [f64; 4] = [1., 2., 2., 3.];
+ let mut r: [f64; 4] = [0f64; 4];
+ vst2q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
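+ // The _lane store tests write only lane 0 of each register, so just the first
+ // two (vst2), three (vst3) or four (vst4) elements of r end up non-zero.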
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_s8() {
+ let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+ let e: [i8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i8; 32] = [0i8; 32];
+ vst2q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_s64() {
+ let a: [i64; 3] = [0, 1, 2];
+ let e: [i64; 2] = [1, 2];
+ let mut r: [i64; 2] = [0i64; 2];
+ vst2_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_s64() {
+ let a: [i64; 5] = [0, 1, 2, 2, 3];
+ let e: [i64; 4] = [1, 2, 0, 0];
+ let mut r: [i64; 4] = [0i64; 4];
+ vst2q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_u8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+ let e: [u8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst2q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_u64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64; 2] = [1, 2];
+ let mut r: [u64; 2] = [0u64; 2];
+ vst2_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_u64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 3];
+ let e: [u64; 4] = [1, 2, 0, 0];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst2q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_p8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+ let e: [u8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst2q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_p64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64; 2] = [1, 2];
+ let mut r: [u64; 2] = [0u64; 2];
+ vst2_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_p64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 3];
+ let e: [u64; 4] = [1, 2, 0, 0];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst2q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_f64() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let e: [f64; 2] = [1., 2.];
+ let mut r: [f64; 2] = [0f64; 2];
+ vst2_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_f64() {
+ let a: [f64; 5] = [0., 1., 2., 2., 3.];
+ let e: [f64; 4] = [1., 2., 0., 0.];
+ let mut r: [f64; 4] = [0f64; 4];
+ vst2q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_s64() {
+ let a: [i64; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [i64; 6] = [1, 2, 2, 2, 4, 4];
+ let mut r: [i64; 6] = [0i64; 6];
+ vst3q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_u64() {
+ let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [u64; 6] = [1, 2, 2, 2, 4, 4];
+ let mut r: [u64; 6] = [0u64; 6];
+ vst3q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_p64() {
+ let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [u64; 6] = [1, 2, 2, 2, 4, 4];
+ let mut r: [u64; 6] = [0u64; 6];
+ vst3q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_f64() {
+ let a: [f64; 4] = [0., 1., 2., 2.];
+ let e: [f64; 3] = [1., 2., 2.];
+ let mut r: [f64; 3] = [0f64; 3];
+ vst3_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_f64() {
+ let a: [f64; 7] = [0., 1., 2., 2., 4., 2., 4.];
+ let e: [f64; 6] = [1., 2., 2., 2., 4., 4.];
+ let mut r: [f64; 6] = [0f64; 6];
+ vst3q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_s8() {
+ let a: [i8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+ let e: [i8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i8; 48] = [0i8; 48];
+ vst3q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_s64() {
+ let a: [i64; 4] = [0, 1, 2, 2];
+ let e: [i64; 3] = [1, 2, 2];
+ let mut r: [i64; 3] = [0i64; 3];
+ vst3_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_s64() {
+ let a: [i64; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [i64; 6] = [1, 2, 2, 0, 0, 0];
+ let mut r: [i64; 6] = [0i64; 6];
+ vst3q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_u8() {
+ let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+ let e: [u8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 48] = [0u8; 48];
+ vst3q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_u64() {
+ let a: [u64; 4] = [0, 1, 2, 2];
+ let e: [u64; 3] = [1, 2, 2];
+ let mut r: [u64; 3] = [0u64; 3];
+ vst3_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_u64() {
+ let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [u64; 6] = [1, 2, 2, 0, 0, 0];
+ let mut r: [u64; 6] = [0u64; 6];
+ vst3q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_p8() {
+ let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+ let e: [u8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 48] = [0u8; 48];
+ vst3q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_p64() {
+ let a: [u64; 4] = [0, 1, 2, 2];
+ let e: [u64; 3] = [1, 2, 2];
+ let mut r: [u64; 3] = [0u64; 3];
+ vst3_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_p64() {
+ let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [u64; 6] = [1, 2, 2, 0, 0, 0];
+ let mut r: [u64; 6] = [0u64; 6];
+ vst3q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_f64() {
+ let a: [f64; 4] = [0., 1., 2., 2.];
+ let e: [f64; 3] = [1., 2., 2.];
+ let mut r: [f64; 3] = [0f64; 3];
+ vst3_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_f64() {
+ let a: [f64; 7] = [0., 1., 2., 2., 3., 2., 3.];
+ let e: [f64; 6] = [1., 2., 2., 0., 0., 0.];
+ let mut r: [f64; 6] = [0f64; 6];
+ vst3q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_s64() {
+ let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [i64; 8] = [1, 2, 2, 6, 2, 6, 6, 8];
+ let mut r: [i64; 8] = [0i64; 8];
+ vst4q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_u64() {
+ let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [u64; 8] = [1, 2, 2, 6, 2, 6, 6, 8];
+ let mut r: [u64; 8] = [0u64; 8];
+ vst4q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_p64() {
+ let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [u64; 8] = [1, 2, 2, 6, 2, 6, 6, 8];
+ let mut r: [u64; 8] = [0u64; 8];
+ vst4q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_f64() {
+ let a: [f64; 5] = [0., 1., 2., 2., 6.];
+ let e: [f64; 4] = [1., 2., 2., 6.];
+ let mut r: [f64; 4] = [0f64; 4];
+ vst4_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_f64() {
+ let a: [f64; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
+ let e: [f64; 8] = [1., 2., 2., 6., 2., 6., 6., 8.];
+ let mut r: [f64; 8] = [0f64; 8];
+ vst4q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_s8() {
+ let a: [i8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [i8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i8; 64] = [0i8; 64];
+ vst4q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_s64() {
+ let a: [i64; 5] = [0, 1, 2, 2, 6];
+ let e: [i64; 4] = [1, 2, 2, 6];
+ let mut r: [i64; 4] = [0i64; 4];
+ vst4_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_s64() {
+ let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [i64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+ let mut r: [i64; 8] = [0i64; 8];
+ vst4q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_u8() {
+ let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [u8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 64] = [0u8; 64];
+ vst4q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_u64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 6];
+ let e: [u64; 4] = [1, 2, 2, 6];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst4_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_u64() {
+ let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [u64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+ let mut r: [u64; 8] = [0u64; 8];
+ vst4q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_p8() {
+ let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [u8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 64] = [0u8; 64];
+ vst4q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_p64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 6];
+ let e: [u64; 4] = [1, 2, 2, 6];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst4_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_p64() {
+ let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [u64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+ let mut r: [u64; 8] = [0u64; 8];
+ vst4q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_f64() {
+ let a: [f64; 5] = [0., 1., 2., 2., 6.];
+ let e: [f64; 4] = [1., 2., 2., 6.];
+ let mut r: [f64; 4] = [0f64; 4];
+ vst4_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_f64() {
+ let a: [f64; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
+ let e: [f64; 8] = [1., 2., 2., 6., 0., 0., 0., 0.];
+ let mut r: [f64; 8] = [0f64; 8];
+ vst4q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 2.0;
+ let e: f64 = 2.0;
+ let r: f64 = transmute(vmul_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 2.0);
+ let b: f64x2 = f64x2::new(2.0, 3.0);
+ let e: f64x2 = f64x2::new(2.0, 6.0);
+ let r: f64x2 = transmute(vmulq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_n_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 2.;
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmul_n_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_n_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64 = 2.;
+ let e: f64x2 = f64x2::new(2., 4.);
+ let r: f64x2 = transmute(vmulq_n_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_lane_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 2.;
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmul_lane_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_laneq_f64() {
+ let a: f64 = 1.;
+ let b: f64x2 = f64x2::new(2., 0.);
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmul_laneq_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_lane_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64 = 2.;
+ let e: f64x2 = f64x2::new(2., 4.);
+ let r: f64x2 = transmute(vmulq_lane_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_laneq_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64x2 = f64x2::new(2., 0.);
+ let e: f64x2 = f64x2::new(2., 4.);
+ let r: f64x2 = transmute(vmulq_laneq_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmuls_lane_f32() {
+ let a: f32 = 1.;
+ let b: f32x2 = f32x2::new(2., 0.);
+ let e: f32 = 2.;
+ let r: f32 = transmute(vmuls_lane_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmuls_laneq_f32() {
+ let a: f32 = 1.;
+ let b: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32 = 2.;
+ let r: f32 = transmute(vmuls_laneq_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmuld_lane_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 2.;
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmuld_lane_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmuld_laneq_f64() {
+ let a: f64 = 1.;
+ let b: f64x2 = f64x2::new(2., 0.);
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmuld_laneq_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
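+ // vmull_high_* widens and multiplies the high halves: here the upper eight
+ // lanes of a, (9, 10, ..., 16), times (1, 2, 1, 2, ...) give (9, 20, ..., 32).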
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i16x8 = i16x8::new(9, 20, 11, 24, 13, 28, 15, 32);
+ let r: i16x8 = transmute(vmull_high_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+ let b: i16x8 = i16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i32x4 = i32x4::new(9, 20, 11, 24);
+ let r: i32x4 = transmute(vmull_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 9, 10);
+ let b: i32x4 = i32x4::new(1, 2, 1, 2);
+ let e: i64x2 = i64x2::new(9, 20);
+ let r: i64x2 = transmute(vmull_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u16x8 = u16x8::new(9, 20, 11, 24, 13, 28, 15, 32);
+ let r: u16x8 = transmute(vmull_high_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+ let b: u16x8 = u16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u32x4 = u32x4::new(9, 20, 11, 24);
+ let r: u32x4 = transmute(vmull_high_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 9, 10);
+ let b: u32x4 = u32x4::new(1, 2, 1, 2);
+ let e: u64x2 = u64x2::new(9, 20);
+ let r: u64x2 = transmute(vmull_high_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
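+ // vmull_p64 is a carry-less (polynomial) multiply:
+ // 0b1111 clmul 0b11 = 0b1111 ^ (0b1111 << 1) = 0b10001 = 17.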
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_p64() {
+ let a: p64 = 15;
+ let b: p64 = 3;
+ let e: p128 = 17;
+ let r: p128 = transmute(vmull_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
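+ // Per-byte carry-less products of the high halves, e.g.
+ // 14 clmul 3 = 0b1110 ^ 0b11100 = 0b10010 = 18.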
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_p8() {
+ let a: i8x16 = i8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3);
+ let e: i16x8 = i16x8::new(9, 30, 11, 20, 13, 18, 15, 48);
+ let r: i16x8 = transmute(vmull_high_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_p64() {
+ let a: i64x2 = i64x2::new(1, 15);
+ let b: i64x2 = i64x2::new(1, 3);
+ let e: p128 = 17;
+ let r: p128 = transmute(vmull_high_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_n_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+ let b: i16 = 2;
+ let e: i32x4 = i32x4::new(18, 20, 22, 24);
+ let r: i32x4 = transmute(vmull_high_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_n_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 9, 10);
+ let b: i32 = 2;
+ let e: i64x2 = i64x2::new(18, 20);
+ let r: i64x2 = transmute(vmull_high_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_n_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+ let b: u16 = 2;
+ let e: u32x4 = u32x4::new(18, 20, 22, 24);
+ let r: u32x4 = transmute(vmull_high_n_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_n_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 9, 10);
+ let b: u32 = 2;
+ let e: u64x2 = u64x2::new(18, 20);
+ let r: u64x2 = transmute(vmull_high_n_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_lane_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+ let b: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(18, 20, 22, 24);
+ let r: i32x4 = transmute(vmull_high_lane_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_laneq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+ let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i32x4 = i32x4::new(18, 20, 22, 24);
+ let r: i32x4 = transmute(vmull_high_laneq_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_lane_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 9, 10);
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(18, 20);
+ let r: i64x2 = transmute(vmull_high_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 9, 10);
+ let b: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i64x2 = i64x2::new(18, 20);
+ let r: i64x2 = transmute(vmull_high_laneq_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_lane_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+ let b: u16x4 = u16x4::new(0, 2, 0, 0);
+ let e: u32x4 = u32x4::new(18, 20, 22, 24);
+ let r: u32x4 = transmute(vmull_high_lane_u16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_laneq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+ let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: u32x4 = u32x4::new(18, 20, 22, 24);
+ let r: u32x4 = transmute(vmull_high_laneq_u16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_lane_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 9, 10);
+ let b: u32x2 = u32x2::new(0, 2);
+ let e: u64x2 = u64x2::new(18, 20);
+ let r: u64x2 = transmute(vmull_high_lane_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_high_laneq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 9, 10);
+ let b: u32x4 = u32x4::new(0, 2, 0, 0);
+ let e: u64x2 = u64x2::new(18, 20);
+ let r: u64x2 = transmute(vmull_high_laneq_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
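+ // vmulx multiplies like vmul; it differs only in the ±0.0 * ±infinity special
+ // case (FMULX yields ±2.0 there), which these finite inputs never hit.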
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulx_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let e: f32x2 = f32x2::new(2., 4.);
+ let r: f32x2 = transmute(vmulx_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let e: f32x4 = f32x4::new(2., 4., 6., 8.);
+ let r: f32x4 = transmute(vmulxq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulx_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 2.;
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmulx_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxq_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64x2 = f64x2::new(2., 2.);
+ let e: f64x2 = f64x2::new(2., 4.);
+ let r: f64x2 = transmute(vmulxq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulx_lane_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 2.;
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmulx_lane_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulx_laneq_f64() {
+ let a: f64 = 1.;
+ let b: f64x2 = f64x2::new(2., 0.);
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmulx_laneq_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulx_lane_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x2 = f32x2::new(2., 0.);
+ let e: f32x2 = f32x2::new(2., 4.);
+ let r: f32x2 = transmute(vmulx_lane_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulx_laneq_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32x2 = f32x2::new(2., 4.);
+ let r: f32x2 = transmute(vmulx_laneq_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxq_lane_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x2 = f32x2::new(2., 0.);
+ let e: f32x4 = f32x4::new(2., 4., 6., 8.);
+ let r: f32x4 = transmute(vmulxq_lane_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxq_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32x4 = f32x4::new(2., 4., 6., 8.);
+ let r: f32x4 = transmute(vmulxq_laneq_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxq_lane_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64 = 2.;
+ let e: f64x2 = f64x2::new(2., 4.);
+ let r: f64x2 = transmute(vmulxq_lane_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxq_laneq_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64x2 = f64x2::new(2., 0.);
+ let e: f64x2 = f64x2::new(2., 4.);
+ let r: f64x2 = transmute(vmulxq_laneq_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxs_f32() {
+ let a: f32 = 2.;
+ let b: f32 = 3.;
+ let e: f32 = 6.;
+ let r: f32 = transmute(vmulxs_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxd_f64() {
+ let a: f64 = 2.;
+ let b: f64 = 3.;
+ let e: f64 = 6.;
+ let r: f64 = transmute(vmulxd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxs_lane_f32() {
+ let a: f32 = 2.;
+ let b: f32x2 = f32x2::new(3., 0.);
+ let e: f32 = 6.;
+ let r: f32 = transmute(vmulxs_lane_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxs_laneq_f32() {
+ let a: f32 = 2.;
+ let b: f32x4 = f32x4::new(3., 0., 0., 0.);
+ let e: f32 = 6.;
+ let r: f32 = transmute(vmulxs_laneq_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxd_lane_f64() {
+ let a: f64 = 2.;
+ let b: f64 = 3.;
+ let e: f64 = 6.;
+ let r: f64 = transmute(vmulxd_lane_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulxd_laneq_f64() {
+ let a: f64 = 2.;
+ let b: f64x2 = f64x2::new(3., 0.);
+ let e: f64 = 6.;
+ let r: f64 = transmute(vmulxd_laneq_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
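+ // vfma computes a + b * c with a single rounding: 8.0 + 6.0 * 2.0 = 20.0.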
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfma_f64() {
+ let a: f64 = 8.0;
+ let b: f64 = 6.0;
+ let c: f64 = 2.0;
+ let e: f64 = 20.0;
+ let r: f64 = transmute(vfma_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmaq_f64() {
+ let a: f64x2 = f64x2::new(8.0, 18.0);
+ let b: f64x2 = f64x2::new(6.0, 4.0);
+ let c: f64x2 = f64x2::new(2.0, 3.0);
+ let e: f64x2 = f64x2::new(20.0, 30.0);
+ let r: f64x2 = transmute(vfmaq_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfma_n_f64() {
+ let a: f64 = 2.0;
+ let b: f64 = 6.0;
+ let c: f64 = 8.0;
+ let e: f64 = 50.0;
+ let r: f64 = transmute(vfma_n_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmaq_n_f64() {
+ let a: f64x2 = f64x2::new(2.0, 3.0);
+ let b: f64x2 = f64x2::new(6.0, 4.0);
+ let c: f64 = 8.0;
+ let e: f64x2 = f64x2::new(50.0, 35.0);
+ let r: f64x2 = transmute(vfmaq_n_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfma_lane_f32() {
+ let a: f32x2 = f32x2::new(2., 3.);
+ let b: f32x2 = f32x2::new(6., 4.);
+ let c: f32x2 = f32x2::new(2., 0.);
+ let e: f32x2 = f32x2::new(14., 11.);
+ let r: f32x2 = transmute(vfma_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfma_laneq_f32() {
+ let a: f32x2 = f32x2::new(2., 3.);
+ let b: f32x2 = f32x2::new(6., 4.);
+ let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32x2 = f32x2::new(14., 11.);
+ let r: f32x2 = transmute(vfma_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmaq_lane_f32() {
+ let a: f32x4 = f32x4::new(2., 3., 4., 5.);
+ let b: f32x4 = f32x4::new(6., 4., 7., 8.);
+ let c: f32x2 = f32x2::new(2., 0.);
+ let e: f32x4 = f32x4::new(14., 11., 18., 21.);
+ let r: f32x4 = transmute(vfmaq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmaq_laneq_f32() {
+ let a: f32x4 = f32x4::new(2., 3., 4., 5.);
+ let b: f32x4 = f32x4::new(6., 4., 7., 8.);
+ let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32x4 = f32x4::new(14., 11., 18., 21.);
+ let r: f32x4 = transmute(vfmaq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfma_lane_f64() {
+ let a: f64 = 2.;
+ let b: f64 = 6.;
+ let c: f64 = 2.;
+ let e: f64 = 14.;
+ let r: f64 = transmute(vfma_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfma_laneq_f64() {
+ let a: f64 = 2.;
+ let b: f64 = 6.;
+ let c: f64x2 = f64x2::new(2., 0.);
+ let e: f64 = 14.;
+ let r: f64 = transmute(vfma_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmaq_lane_f64() {
+ let a: f64x2 = f64x2::new(2., 3.);
+ let b: f64x2 = f64x2::new(6., 4.);
+ let c: f64 = 2.;
+ let e: f64x2 = f64x2::new(14., 11.);
+ let r: f64x2 = transmute(vfmaq_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmaq_laneq_f64() {
+ let a: f64x2 = f64x2::new(2., 3.);
+ let b: f64x2 = f64x2::new(6., 4.);
+ let c: f64x2 = f64x2::new(2., 0.);
+ let e: f64x2 = f64x2::new(14., 11.);
+ let r: f64x2 = transmute(vfmaq_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmas_lane_f32() {
+ let a: f32 = 2.;
+ let b: f32 = 6.;
+ let c: f32x2 = f32x2::new(3., 0.);
+ let e: f32 = 20.;
+ let r: f32 = transmute(vfmas_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmas_laneq_f32() {
+ let a: f32 = 2.;
+ let b: f32 = 6.;
+ let c: f32x4 = f32x4::new(3., 0., 0., 0.);
+ let e: f32 = 20.;
+ let r: f32 = transmute(vfmas_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmad_lane_f64() {
+ let a: f64 = 2.;
+ let b: f64 = 6.;
+ let c: f64 = 3.;
+ let e: f64 = 20.;
+ let r: f64 = transmute(vfmad_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmad_laneq_f64() {
+ let a: f64 = 2.;
+ let b: f64 = 6.;
+ let c: f64x2 = f64x2::new(3., 0.);
+ let e: f64 = 20.;
+ let r: f64 = transmute(vfmad_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
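+ // vfms computes a - b * c: 20.0 - 6.0 * 2.0 = 8.0.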
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfms_f64() {
+ let a: f64 = 20.0;
+ let b: f64 = 6.0;
+ let c: f64 = 2.0;
+ let e: f64 = 8.0;
+ let r: f64 = transmute(vfms_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsq_f64() {
+ let a: f64x2 = f64x2::new(20.0, 30.0);
+ let b: f64x2 = f64x2::new(6.0, 4.0);
+ let c: f64x2 = f64x2::new(2.0, 3.0);
+ let e: f64x2 = f64x2::new(8.0, 18.0);
+ let r: f64x2 = transmute(vfmsq_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfms_n_f64() {
+ let a: f64 = 50.0;
+ let b: f64 = 6.0;
+ let c: f64 = 8.0;
+ let e: f64 = 2.0;
+ let r: f64 = transmute(vfms_n_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsq_n_f64() {
+ let a: f64x2 = f64x2::new(50.0, 35.0);
+ let b: f64x2 = f64x2::new(6.0, 4.0);
+ let c: f64 = 8.0;
+ let e: f64x2 = f64x2::new(2.0, 3.0);
+ let r: f64x2 = transmute(vfmsq_n_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfms_lane_f32() {
+ let a: f32x2 = f32x2::new(14., 11.);
+ let b: f32x2 = f32x2::new(6., 4.);
+ let c: f32x2 = f32x2::new(2., 0.);
+ let e: f32x2 = f32x2::new(2., 3.);
+ let r: f32x2 = transmute(vfms_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfms_laneq_f32() {
+ let a: f32x2 = f32x2::new(14., 11.);
+ let b: f32x2 = f32x2::new(6., 4.);
+ let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32x2 = f32x2::new(2., 3.);
+ let r: f32x2 = transmute(vfms_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsq_lane_f32() {
+ let a: f32x4 = f32x4::new(14., 11., 18., 21.);
+ let b: f32x4 = f32x4::new(6., 4., 7., 8.);
+ let c: f32x2 = f32x2::new(2., 0.);
+ let e: f32x4 = f32x4::new(2., 3., 4., 5.);
+ let r: f32x4 = transmute(vfmsq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsq_laneq_f32() {
+ let a: f32x4 = f32x4::new(14., 11., 18., 21.);
+ let b: f32x4 = f32x4::new(6., 4., 7., 8.);
+ let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32x4 = f32x4::new(2., 3., 4., 5.);
+ let r: f32x4 = transmute(vfmsq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfms_lane_f64() {
+ let a: f64 = 14.;
+ let b: f64 = 6.;
+ let c: f64 = 2.;
+ let e: f64 = 2.;
+ let r: f64 = transmute(vfms_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfms_laneq_f64() {
+ let a: f64 = 14.;
+ let b: f64 = 6.;
+ let c: f64x2 = f64x2::new(2., 0.);
+ let e: f64 = 2.;
+ let r: f64 = transmute(vfms_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsq_lane_f64() {
+ let a: f64x2 = f64x2::new(14., 11.);
+ let b: f64x2 = f64x2::new(6., 4.);
+ let c: f64 = 2.;
+ let e: f64x2 = f64x2::new(2., 3.);
+ let r: f64x2 = transmute(vfmsq_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsq_laneq_f64() {
+ let a: f64x2 = f64x2::new(14., 11.);
+ let b: f64x2 = f64x2::new(6., 4.);
+ let c: f64x2 = f64x2::new(2., 0.);
+ let e: f64x2 = f64x2::new(2., 3.);
+ let r: f64x2 = transmute(vfmsq_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmss_lane_f32() {
+ let a: f32 = 14.;
+ let b: f32 = 6.;
+ let c: f32x2 = f32x2::new(2., 0.);
+ let e: f32 = 2.;
+ let r: f32 = transmute(vfmss_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmss_laneq_f32() {
+ let a: f32 = 14.;
+ let b: f32 = 6.;
+ let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32 = 2.;
+ let r: f32 = transmute(vfmss_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsd_lane_f64() {
+ let a: f64 = 14.;
+ let b: f64 = 6.;
+ let c: f64 = 2.;
+ let e: f64 = 2.;
+ let r: f64 = transmute(vfmsd_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsd_laneq_f64() {
+ let a: f64 = 14.;
+ let b: f64 = 6.;
+ let c: f64x2 = f64x2::new(2., 0.);
+ let e: f64 = 2.;
+ let r: f64 = transmute(vfmsd_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdiv_f32() {
+ let a: f32x2 = f32x2::new(2.0, 6.0);
+ let b: f32x2 = f32x2::new(1.0, 2.0);
+ let e: f32x2 = f32x2::new(2.0, 3.0);
+ let r: f32x2 = transmute(vdiv_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdivq_f32() {
+ let a: f32x4 = f32x4::new(2.0, 6.0, 4.0, 10.0);
+ let b: f32x4 = f32x4::new(1.0, 2.0, 1.0, 2.0);
+ let e: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+ let r: f32x4 = transmute(vdivq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdiv_f64() {
+ let a: f64 = 2.0;
+ let b: f64 = 1.0;
+ let e: f64 = 2.0;
+ let r: f64 = transmute(vdiv_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdivq_f64() {
+ let a: f64x2 = f64x2::new(2.0, 6.0);
+ let b: f64x2 = f64x2::new(1.0, 2.0);
+ let e: f64x2 = f64x2::new(2.0, 3.0);
+ let r: f64x2 = transmute(vdivq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 1.0;
+ let e: f64 = 0.0;
+ let r: f64 = transmute(vsub_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 4.0);
+ let b: f64x2 = f64x2::new(1.0, 2.0);
+ let e: f64x2 = f64x2::new(0.0, 2.0);
+ let r: f64x2 = transmute(vsubq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubd_s64() {
+ let a: i64 = 3;
+ let b: i64 = 2;
+ let e: i64 = 1;
+ let r: i64 = transmute(vsubd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubd_u64() {
+ let a: u64 = 3;
+ let b: u64 = 2;
+ let e: u64 = 1;
+ let r: u64 = transmute(vsubd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddd_s64() {
+ let a: i64 = 1;
+ let b: i64 = 2;
+ let e: i64 = 3;
+ let r: i64 = transmute(vaddd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddd_u64() {
+ let a: u64 = 1;
+ let b: u64 = 2;
+ let e: u64 = 3;
+ let r: u64 = transmute(vaddd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddv_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let e: f32 = 3.;
+ let r: f32 = transmute(vaddv_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 0., 0.);
+ let e: f32 = 3.;
+ let r: f32 = transmute(vaddvq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let e: f64 = 3.;
+ let r: f64 = transmute(vaddvq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
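+ // vaddlv_* is a widening horizontal sum: 1 + 2 + 3 + 4 = 10, returned as i32.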
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlv_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i32 = 10;
+ let r: i32 = transmute(vaddlv_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlvq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i32 = 36;
+ let r: i32 = transmute(vaddlvq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlv_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let e: i64 = 3;
+ let r: i64 = transmute(vaddlv_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlvq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i64 = 10;
+ let r: i64 = transmute(vaddlvq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlv_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u32 = 10;
+ let r: u32 = transmute(vaddlv_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlvq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u32 = 36;
+ let r: u32 = transmute(vaddlvq_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlv_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let e: u64 = 3;
+ let r: u64 = transmute(vaddlv_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlvq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u64 = 10;
+ let r: u64 = transmute(vaddlvq_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
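+ // vsubw_high subtracts the widened high half of b from a; a was chosen equal
+ // to that high half, so every result lane is 0.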
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_high_s8() {
+ let a: i16x8 = i16x8::new(8, 9, 10, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vsubw_high_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_high_s16() {
+ let a: i32x4 = i32x4::new(8, 9, 10, 11);
+ let b: i16x8 = i16x8::new(0, 1, 2, 3, 8, 9, 10, 11);
+ let e: i32x4 = i32x4::new(0, 0, 0, 0);
+ let r: i32x4 = transmute(vsubw_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_high_s32() {
+ let a: i64x2 = i64x2::new(8, 9);
+ let b: i32x4 = i32x4::new(6, 7, 8, 9);
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vsubw_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_high_u8() {
+ let a: u16x8 = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+ let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vsubw_high_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_high_u16() {
+ let a: u32x4 = u32x4::new(8, 9, 10, 11);
+ let b: u16x8 = u16x8::new(0, 1, 2, 3, 8, 9, 10, 11);
+ let e: u32x4 = u32x4::new(0, 0, 0, 0);
+ let r: u32x4 = transmute(vsubw_high_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_high_u32() {
+ let a: u64x2 = u64x2::new(8, 9);
+ let b: u32x4 = u32x4::new(6, 7, 8, 9);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vsubw_high_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
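+ // vsubl_high widens the high halves of both operands before subtracting:
+ // (8, 9, ..., 15) - (2, 2, ..., 2) = (6, 7, ..., 13).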
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_high_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: i16x8 = transmute(vsubl_high_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_high_s16() {
+ let a: i16x8 = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i16x8 = i16x8::new(6, 6, 6, 6, 8, 8, 8, 8);
+ let e: i32x4 = i32x4::new(4, 5, 6, 7);
+ let r: i32x4 = transmute(vsubl_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_high_s32() {
+ let a: i32x4 = i32x4::new(12, 13, 14, 15);
+ let b: i32x4 = i32x4::new(6, 6, 8, 8);
+ let e: i64x2 = i64x2::new(6, 7);
+ let r: i64x2 = transmute(vsubl_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_high_u8() {
+ let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: u16x8 = transmute(vsubl_high_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_high_u16() {
+ let a: u16x8 = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+ let b: u16x8 = u16x8::new(6, 6, 6, 6, 8, 8, 8, 8);
+ let e: u32x4 = u32x4::new(4, 5, 6, 7);
+ let r: u32x4 = transmute(vsubl_high_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_high_u32() {
+ let a: u32x4 = u32x4::new(12, 13, 14, 15);
+ let b: u32x4 = u32x4::new(6, 6, 8, 8);
+ let e: u64x2 = u64x2::new(6, 7);
+ let r: u64x2 = transmute(vsubl_high_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
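+ // BCAX computes a ^ (b & !c); with c = 1 in every lane, bit 0 of b is cleared
+ // before the XOR, e.g. a = 1, b = 2, c = 1 gives 1 ^ 2 = 3.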
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vbcaxq_s8() {
+ let a: i8x16 = i8x16::new(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0);
+ let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let c: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let e: i8x16 = i8x16::new(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ let r: i8x16 = transmute(vbcaxq_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vbcaxq_s16() {
+ let a: i16x8 = i16x8::new(1, 0, 1, 0, 1, 0, 1, 0);
+ let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let c: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let e: i16x8 = i16x8::new(1, 0, 3, 2, 5, 4, 7, 6);
+ let r: i16x8 = transmute(vbcaxq_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vbcaxq_s32() {
+ let a: i32x4 = i32x4::new(1, 0, 1, 0);
+ let b: i32x4 = i32x4::new(0, 1, 2, 3);
+ let c: i32x4 = i32x4::new(1, 1, 1, 1);
+ let e: i32x4 = i32x4::new(1, 0, 3, 2);
+ let r: i32x4 = transmute(vbcaxq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vbcaxq_s64() {
+ let a: i64x2 = i64x2::new(1, 0);
+ let b: i64x2 = i64x2::new(0, 1);
+ let c: i64x2 = i64x2::new(1, 1);
+ let e: i64x2 = i64x2::new(1, 0);
+ let r: i64x2 = transmute(vbcaxq_s64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vbcaxq_u8() {
+ let a: u8x16 = u8x16::new(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0);
+ let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let c: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let e: u8x16 = u8x16::new(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ let r: u8x16 = transmute(vbcaxq_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vbcaxq_u16() {
+ let a: u16x8 = u16x8::new(1, 0, 1, 0, 1, 0, 1, 0);
+ let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let c: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let e: u16x8 = u16x8::new(1, 0, 3, 2, 5, 4, 7, 6);
+ let r: u16x8 = transmute(vbcaxq_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vbcaxq_u32() {
+ let a: u32x4 = u32x4::new(1, 0, 1, 0);
+ let b: u32x4 = u32x4::new(0, 1, 2, 3);
+ let c: u32x4 = u32x4::new(1, 1, 1, 1);
+ let e: u32x4 = u32x4::new(1, 0, 3, 2);
+ let r: u32x4 = transmute(vbcaxq_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vbcaxq_u64() {
+ let a: u64x2 = u64x2::new(1, 0);
+ let b: u64x2 = u64x2::new(0, 1);
+ let c: u64x2 = u64x2::new(1, 1);
+ let e: u64x2 = u64x2::new(1, 0);
+ let r: u64x2 = transmute(vbcaxq_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
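+ // FCADD treats each lane pair as a complex number (re, im): rot270 computes
+ // (a.re + b.im, a.im - b.re), rot90 computes (a.re - b.im, a.im + b.re).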
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcadd_rot270_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let e: f32x2 = f32x2::new(2., 0.);
+ let r: f32x2 = transmute(vcadd_rot270_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaddq_rot270_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let e: f32x4 = f32x4::new(2., 0., 2., 0.);
+ let r: f32x4 = transmute(vcaddq_rot270_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaddq_rot270_f64() {
+ let a: f64x2 = f64x2::new(1., -1.);
+ let b: f64x2 = f64x2::new(-1., 1.);
+ let e: f64x2 = f64x2::new(2., 0.);
+ let r: f64x2 = transmute(vcaddq_rot270_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcadd_rot90_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let e: f32x2 = f32x2::new(0., -2.);
+ let r: f32x2 = transmute(vcadd_rot90_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaddq_rot90_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let e: f32x4 = f32x4::new(0., -2., 0., -2.);
+ let r: f32x4 = transmute(vcaddq_rot90_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaddq_rot90_f64() {
+ let a: f64x2 = f64x2::new(1., -1.);
+ let b: f64x2 = f64x2::new(-1., 1.);
+ let e: f64x2 = f64x2::new(0., -2.);
+ let r: f64x2 = transmute(vcaddq_rot90_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
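+ // FCMLA with rotation 0 accumulates b.re * c onto each complex pair:
+ // (a.re + b.re * c.re, a.im + b.re * c.im) = (1 - 1, -1 - 1) = (0, -2).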
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(0., -2.);
+ let r: f32x2 = transmute(vcmla_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x4 = f32x4::new(0., -2., 2., 0.);
+ let r: f32x4 = transmute(vcmlaq_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_f64() {
+ let a: f64x2 = f64x2::new(1., -1.);
+ let b: f64x2 = f64x2::new(-1., 1.);
+ let c: f64x2 = f64x2::new(1., 1.);
+ let e: f64x2 = f64x2::new(0., -2.);
+ let r: f64x2 = transmute(vcmlaq_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot90_f32() {
+ let a: f32x2 = f32x2::new(1., 1.);
+ let b: f32x2 = f32x2::new(1., -1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(2., 0.);
+ let r: f32x2 = transmute(vcmla_rot90_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot90_f32() {
+ let a: f32x4 = f32x4::new(1., 1., 1., 1.);
+ let b: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let c: f32x4 = f32x4::new(1., 1., 1., 1.);
+ let e: f32x4 = f32x4::new(2., 0., 2., 0.);
+ let r: f32x4 = transmute(vcmlaq_rot90_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot90_f64() {
+ let a: f64x2 = f64x2::new(1., 1.);
+ let b: f64x2 = f64x2::new(1., -1.);
+ let c: f64x2 = f64x2::new(1., 1.);
+ let e: f64x2 = f64x2::new(2., 0.);
+ let r: f64x2 = transmute(vcmlaq_rot90_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot180_f32() {
+ let a: f32x2 = f32x2::new(1., 1.);
+ let b: f32x2 = f32x2::new(1., -1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vcmla_rot180_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot180_f32() {
+ let a: f32x4 = f32x4::new(1., 1., 1., 1.);
+ let b: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let c: f32x4 = f32x4::new(1., 1., 1., 1.);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vcmlaq_rot180_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot180_f64() {
+ let a: f64x2 = f64x2::new(1., 1.);
+ let b: f64x2 = f64x2::new(1., -1.);
+ let c: f64x2 = f64x2::new(1., 1.);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vcmlaq_rot180_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot270_f32() {
+ let a: f32x2 = f32x2::new(1., 1.);
+ let b: f32x2 = f32x2::new(1., -1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(0., 2.);
+ let r: f32x2 = transmute(vcmla_rot270_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot270_f32() {
+ let a: f32x4 = f32x4::new(1., 1., 1., 1.);
+ let b: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let c: f32x4 = f32x4::new(1., 1., 1., 1.);
+ let e: f32x4 = f32x4::new(0., 2., 0., 2.);
+ let r: f32x4 = transmute(vcmlaq_rot270_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot270_f64() {
+ let a: f64x2 = f64x2::new(1., 1.);
+ let b: f64x2 = f64x2::new(1., -1.);
+ let c: f64x2 = f64x2::new(1., 1.);
+ let e: f64x2 = f64x2::new(0., 2.);
+ let r: f64x2 = transmute(vcmlaq_rot270_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
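+ // The _lane/_laneq variants reuse the same arithmetic but broadcast a single
+ // complex pair selected from c by the const generic; every test below picks
+ // pair 0, i.e. c = (1., 1.).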
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_lane_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(0., -2.);
+ let r: f32x2 = transmute(vcmla_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_laneq_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x2 = f32x2::new(0., -2.);
+ let r: f32x2 = transmute(vcmla_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_lane_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x4 = f32x4::new(0., -2., 0., -2.);
+ let r: f32x4 = transmute(vcmlaq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x4 = f32x4::new(0., -2., 0., -2.);
+ let r: f32x4 = transmute(vcmlaq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot90_lane_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vcmla_rot90_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot90_laneq_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vcmla_rot90_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot90_lane_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vcmlaq_rot90_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot90_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vcmlaq_rot90_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot180_lane_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(2., 0.);
+ let r: f32x2 = transmute(vcmla_rot180_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot180_laneq_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x2 = f32x2::new(2., 0.);
+ let r: f32x2 = transmute(vcmla_rot180_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot180_lane_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x4 = f32x4::new(2., 0., 2., 0.);
+ let r: f32x4 = transmute(vcmlaq_rot180_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot180_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x4 = f32x4::new(2., 0., 2., 0.);
+ let r: f32x4 = transmute(vcmlaq_rot180_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot270_lane_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(2., -2.);
+ let r: f32x2 = transmute(vcmla_rot270_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmla_rot270_laneq_f32() {
+ let a: f32x2 = f32x2::new(1., -1.);
+ let b: f32x2 = f32x2::new(-1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x2 = f32x2::new(2., -2.);
+ let r: f32x2 = transmute(vcmla_rot270_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot270_lane_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x2 = f32x2::new(1., 1.);
+ let e: f32x4 = f32x4::new(2., -2., 2., -2.);
+ let r: f32x4 = transmute(vcmlaq_rot270_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcmlaq_rot270_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., -1., 1., -1.);
+ let b: f32x4 = f32x4::new(-1., 1., -1., 1.);
+ let c: f32x4 = f32x4::new(1., 1., -1., -1.);
+ let e: f32x4 = f32x4::new(2., -2., 2., -2.);
+ let r: f32x4 = transmute(vcmlaq_rot270_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
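+ // vdot/vdotq accumulate a four-element dot product of 8-bit lanes into each
+ // 32-bit lane: with b = c = (1..=8), lane 0 is 1 + (1+4+9+16) = 31 and
+ // lane 1 is 2 + (25+36+49+64) = 176.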
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdot_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i32x2 = i32x2::new(31, 176);
+ let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdotq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 1, 2);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i32x4 = i32x4::new(31, 176, 31, 176);
+ let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdot_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u32x2 = u32x2::new(31, 176);
+ let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdotq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 1, 2);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u32x4 = u32x4::new(31, 176, 31, 176);
+ let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
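+ // The _lane dot-product variants take all four bytes of c from the selected
+ // group; with lane 0 (bytes 1..=4), lane 1 of the result becomes
+ // 2 + (5+12+21+32) = 72 instead of 176.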
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdot_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i32x2 = i32x2::new(31, 72);
+ let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdot_laneq_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i32x2 = i32x2::new(31, 72);
+ let r: i32x2 = transmute(vdot_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdotq_lane_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 1, 2);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i32x4 = i32x4::new(31, 72, 31, 72);
+ let r: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdotq_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 1, 2);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i32x4 = i32x4::new(31, 72, 31, 72);
+ let r: i32x4 = transmute(vdotq_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdot_lane_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u32x2 = u32x2::new(31, 72);
+ let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdot_laneq_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u32x2 = u32x2::new(31, 72);
+ let r: u32x2 = transmute(vdot_laneq_u32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdotq_lane_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 1, 2);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u32x4 = u32x4::new(31, 72, 31, 72);
+ let r: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdotq_laneq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 1, 2);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u32x4 = u32x4::new(31, 72, 31, 72);
+ let r: u32x4 = transmute(vdotq_laneq_u32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
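+ // vmax/vmin lower to FMAX/FMIN, while the *nm variants lower to
+ // FMAXNM/FMINNM, which implement IEEE 754 maxNum/minNum (a number wins
+ // against a NaN). None of the inputs below contain NaNs, so both families
+ // reduce to the plain maximum/minimum here.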
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmax_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 0.0;
+ let e: f64 = 1.0;
+ let r: f64 = transmute(vmax_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxq_f64() {
+ let a: f64x2 = f64x2::new(1.0, -2.0);
+ let b: f64x2 = f64x2::new(0.0, 3.0);
+ let e: f64x2 = f64x2::new(1.0, 3.0);
+ let r: f64x2 = transmute(vmaxq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxnm_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 8.0;
+ let e: f64 = 8.0;
+ let r: f64 = transmute(vmaxnm_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxnmq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 2.0);
+ let b: f64x2 = f64x2::new(8.0, 16.0);
+ let e: f64x2 = f64x2::new(8.0, 16.0);
+ let r: f64x2 = transmute(vmaxnmq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxnmv_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let e: f32 = 2.;
+ let r: f32 = transmute(vmaxnmv_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxnmvq_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let e: f64 = 2.;
+ let r: f64 = transmute(vmaxnmvq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxnmvq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 0., 1.);
+ let e: f32 = 2.;
+ let r: f32 = transmute(vmaxnmvq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxnm_f32() {
+ let a: f32x2 = f32x2::new(1.0, 2.0);
+ let b: f32x2 = f32x2::new(6.0, -3.0);
+ let e: f32x2 = f32x2::new(2.0, 6.0);
+ let r: f32x2 = transmute(vpmaxnm_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxnmq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 2.0);
+ let b: f64x2 = f64x2::new(6.0, -3.0);
+ let e: f64x2 = f64x2::new(2.0, 6.0);
+ let r: f64x2 = transmute(vpmaxnmq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxnmq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 2.0, 3.0, -4.0);
+ let b: f32x4 = f32x4::new(8.0, 16.0, -1.0, 6.0);
+ let e: f32x4 = f32x4::new(2.0, 3.0, 16.0, 6.0);
+ let r: f32x4 = transmute(vpmaxnmq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxnms_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let e: f32 = 2.;
+ let r: f32 = transmute(vpmaxnms_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxnmqd_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let e: f64 = 2.;
+ let r: f64 = transmute(vpmaxnmqd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxs_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let e: f32 = 2.;
+ let r: f32 = transmute(vpmaxs_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxqd_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let e: f64 = 2.;
+ let r: f64 = transmute(vpmaxqd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmin_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 0.0;
+ let e: f64 = 0.0;
+ let r: f64 = transmute(vmin_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminq_f64() {
+ let a: f64x2 = f64x2::new(1.0, -2.0);
+ let b: f64x2 = f64x2::new(0.0, 3.0);
+ let e: f64x2 = f64x2::new(0.0, -2.0);
+ let r: f64x2 = transmute(vminq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminnm_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 8.0;
+ let e: f64 = 1.0;
+ let r: f64 = transmute(vminnm_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminnmq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 2.0);
+ let b: f64x2 = f64x2::new(8.0, 16.0);
+ let e: f64x2 = f64x2::new(1.0, 2.0);
+ let r: f64x2 = transmute(vminnmq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminnmv_f32() {
+ let a: f32x2 = f32x2::new(1., 0.);
+ let e: f32 = 0.;
+ let r: f32 = transmute(vminnmv_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminnmvq_f64() {
+ let a: f64x2 = f64x2::new(1., 0.);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vminnmvq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminnmvq_f32() {
+ let a: f32x4 = f32x4::new(1., 0., 2., 3.);
+ let e: f32 = 0.;
+ let r: f32 = transmute(vminnmvq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
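+ // vmovl_high_* sign- or zero-extend the upper half of the source vector, so
+ // only the last eight (or four, or two) elements survive into the result.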
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_high_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 3, 4, 5, 6, 3, 4, 5, 6, 7, 8, 9, 10);
+ let e: i16x8 = i16x8::new(3, 4, 5, 6, 7, 8, 9, 10);
+ let r: i16x8 = transmute(vmovl_high_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_high_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 3, 4, 5, 6);
+ let e: i32x4 = i32x4::new(3, 4, 5, 6);
+ let r: i32x4 = transmute(vmovl_high_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_high_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i64x2 = i64x2::new(3, 4);
+ let r: i64x2 = transmute(vmovl_high_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_high_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 3, 4, 5, 6, 3, 4, 5, 6, 7, 8, 9, 10);
+ let e: u16x8 = u16x8::new(3, 4, 5, 6, 7, 8, 9, 10);
+ let r: u16x8 = transmute(vmovl_high_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_high_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 3, 4, 5, 6);
+ let e: u32x4 = u32x4::new(3, 4, 5, 6);
+ let r: u32x4 = transmute(vmovl_high_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_high_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u64x2 = u64x2::new(3, 4);
+ let r: u64x2 = transmute(vmovl_high_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
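+ // vpaddq adds adjacent element pairs and concatenates the results from a and
+ // b: (1., 2., 3., 4.) and (3., 4., 5., 6.) pairwise-add to (3., 7.) and
+ // (7., 11.). The scalar vpadds/vpaddd forms sum the two lanes of one vector.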
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x4 = f32x4::new(3., 4., 5., 6.);
+ let e: f32x4 = f32x4::new(3., 7., 7., 11.);
+ let r: f32x4 = transmute(vpaddq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let b: f64x2 = f64x2::new(3., 4.);
+ let e: f64x2 = f64x2::new(3., 7.);
+ let r: f64x2 = transmute(vpaddq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadds_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let e: f32 = 3.;
+ let r: f32 = transmute(vpadds_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddd_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let e: f64 = 3.;
+ let r: f64 = transmute(vpaddd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminnm_f32() {
+ let a: f32x2 = f32x2::new(1.0, 2.0);
+ let b: f32x2 = f32x2::new(6.0, -3.0);
+ let e: f32x2 = f32x2::new(1.0, -3.0);
+ let r: f32x2 = transmute(vpminnm_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminnmq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 2.0);
+ let b: f64x2 = f64x2::new(6.0, -3.0);
+ let e: f64x2 = f64x2::new(1.0, -3.0);
+ let r: f64x2 = transmute(vpminnmq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminnmq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 2.0, 3.0, -4.0);
+ let b: f32x4 = f32x4::new(8.0, 16.0, -1.0, 6.0);
+ let e: f32x4 = f32x4::new(1.0, -4.0, 8.0, -1.0);
+ let r: f32x4 = transmute(vpminnmq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminnms_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let e: f32 = 1.;
+ let r: f32 = transmute(vpminnms_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminnmqd_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let e: f64 = 1.;
+ let r: f64 = transmute(vpminnmqd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmins_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let e: f32 = 1.;
+ let r: f32 = transmute(vpmins_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminqd_f64() {
+ let a: f64x2 = f64x2::new(1., 2.);
+ let e: f64 = 1.;
+ let r: f64 = transmute(vpminqd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
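+ // vqdmull* compute a saturating doubling multiply long: each product is
+ // widened and doubled, so 2 * 3 yields 12. Saturation only triggers when
+ // both operands are the most negative value, which these tests avoid.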
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmullh_s16() {
+ let a: i16 = 2;
+ let b: i16 = 3;
+ let e: i32 = 12;
+ let r: i32 = transmute(vqdmullh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulls_s32() {
+ let a: i32 = 2;
+ let b: i32 = 3;
+ let e: i64 = 12;
+ let r: i64 = transmute(vqdmulls_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_high_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(1, 2, 5, 6, 5, 6, 7, 8);
+ let e: i32x4 = i32x4::new(40, 60, 84, 112);
+ let r: i32x4 = transmute(vqdmull_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_high_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 4, 5);
+ let b: i32x4 = i32x4::new(1, 2, 5, 6);
+ let e: i64x2 = i64x2::new(40, 60);
+ let r: i64x2 = transmute(vqdmull_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_high_n_s16() {
+ let a: i16x8 = i16x8::new(0, 2, 8, 10, 8, 10, 12, 14);
+ let b: i16 = 2;
+ let e: i32x4 = i32x4::new(32, 40, 48, 56);
+ let r: i32x4 = transmute(vqdmull_high_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_high_n_s32() {
+ let a: i32x4 = i32x4::new(0, 2, 8, 10);
+ let b: i32 = 2;
+ let e: i64x2 = i64x2::new(32, 40);
+ let r: i64x2 = transmute(vqdmull_high_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_laneq_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0);
+ let e: i32x4 = i32x4::new(4, 8, 12, 16);
+ let r: i32x4 = transmute(vqdmull_laneq_s16::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_laneq_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x4 = i32x4::new(0, 2, 2, 0);
+ let e: i64x2 = i64x2::new(4, 8);
+ let r: i64x2 = transmute(vqdmull_laneq_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmullh_lane_s16() {
+ let a: i16 = 2;
+ let b: i16x4 = i16x4::new(0, 2, 2, 0);
+ let e: i32 = 8;
+ let r: i32 = transmute(vqdmullh_lane_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmullh_laneq_s16() {
+ let a: i16 = 2;
+ let b: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0);
+ let e: i32 = 8;
+ let r: i32 = transmute(vqdmullh_laneq_s16::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulls_lane_s32() {
+ let a: i32 = 2;
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i64 = 8;
+ let r: i64 = transmute(vqdmulls_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulls_laneq_s32() {
+ let a: i32 = 2;
+ let b: i32x4 = i32x4::new(0, 2, 2, 0);
+ let e: i64 = 8;
+ let r: i64 = transmute(vqdmulls_laneq_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_high_lane_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let b: i16x4 = i16x4::new(0, 2, 2, 0);
+ let e: i32x4 = i32x4::new(16, 20, 24, 28);
+ let r: i32x4 = transmute(vqdmull_high_lane_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_high_lane_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 4, 5);
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(16, 20);
+ let r: i64x2 = transmute(vqdmull_high_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_high_laneq_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0);
+ let e: i32x4 = i32x4::new(16, 20, 24, 28);
+ let r: i32x4 = transmute(vqdmull_high_laneq_s16::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_high_laneq_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 4, 5);
+ let b: i32x4 = i32x4::new(0, 2, 2, 0);
+ let e: i64x2 = i64x2::new(16, 20);
+ let r: i64x2 = transmute(vqdmull_high_laneq_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
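+ // vqdmlal* accumulate on top of that: result = a + 2*b*c (widened,
+ // saturating). The _high forms multiply the upper halves, e.g.
+ // 1 + 2*4*5 = 41 in lane 0.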
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_high_s16() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let c: i16x8 = i16x8::new(1, 2, 5, 6, 5, 6, 7, 8);
+ let e: i32x4 = i32x4::new(41, 62, 87, 116);
+ let r: i32x4 = transmute(vqdmlal_high_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_high_s32() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i32x4 = i32x4::new(0, 1, 4, 5);
+ let c: i32x4 = i32x4::new(1, 2, 5, 6);
+ let e: i64x2 = i64x2::new(41, 62);
+ let r: i64x2 = transmute(vqdmlal_high_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_high_n_s16() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 2, 8, 10, 8, 10, 12, 14);
+ let c: i16 = 2;
+ let e: i32x4 = i32x4::new(33, 42, 51, 60);
+ let r: i32x4 = transmute(vqdmlal_high_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_high_n_s32() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i32x4 = i32x4::new(0, 2, 8, 10);
+ let c: i32 = 2;
+ let e: i64x2 = i64x2::new(33, 42);
+ let r: i64x2 = transmute(vqdmlal_high_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_laneq_s16() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0);
+ let e: i32x4 = i32x4::new(5, 10, 15, 20);
+ let r: i32x4 = transmute(vqdmlal_laneq_s16::<2>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_laneq_s32() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32x4 = i32x4::new(0, 2, 2, 0);
+ let e: i64x2 = i64x2::new(5, 10);
+ let r: i64x2 = transmute(vqdmlal_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_high_lane_s16() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(17, 22, 27, 32);
+ let r: i32x4 = transmute(vqdmlal_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_high_laneq_s16() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i32x4 = i32x4::new(17, 22, 27, 32);
+ let r: i32x4 = transmute(vqdmlal_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_high_lane_s32() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i32x4 = i32x4::new(0, 1, 4, 5);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(17, 22);
+ let r: i64x2 = transmute(vqdmlal_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_high_laneq_s32() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i32x4 = i32x4::new(0, 1, 4, 5);
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i64x2 = i64x2::new(17, 22);
+ let r: i64x2 = transmute(vqdmlal_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlalh_s16() {
+ let a: i32 = 1;
+ let b: i16 = 1;
+ let c: i16 = 2;
+ let e: i32 = 5;
+ let r: i32 = transmute(vqdmlalh_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlals_s32() {
+ let a: i64 = 1;
+ let b: i32 = 1;
+ let c: i32 = 2;
+ let e: i64 = 5;
+ let r: i64 = transmute(vqdmlals_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlalh_lane_s16() {
+ let a: i32 = 1;
+ let b: i16 = 1;
+ let c: i16x4 = i16x4::new(2, 1, 1, 1);
+ let e: i32 = 5;
+ let r: i32 = transmute(vqdmlalh_lane_s16::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlalh_laneq_s16() {
+ let a: i32 = 1;
+ let b: i16 = 1;
+ let c: i16x8 = i16x8::new(2, 1, 1, 1, 1, 1, 1, 1);
+ let e: i32 = 5;
+ let r: i32 = transmute(vqdmlalh_laneq_s16::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlals_lane_s32() {
+ let a: i64 = 1;
+ let b: i32 = 1;
+ let c: i32x2 = i32x2::new(2, 1);
+ let e: i64 = 5;
+ let r: i64 = transmute(vqdmlals_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlals_laneq_s32() {
+ let a: i64 = 1;
+ let b: i32 = 1;
+ let c: i32x4 = i32x4::new(2, 1, 1, 1);
+ let e: i64 = 5;
+ let r: i64 = transmute(vqdmlals_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
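+ // vqdmlsl* are the subtracting counterparts: result = a - 2*b*c, so the
+ // accumulators below are chosen to land on small negative values.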
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_high_s16() {
+ let a: i32x4 = i32x4::new(39, 58, 81, 108);
+ let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let c: i16x8 = i16x8::new(1, 2, 5, 6, 5, 6, 7, 8);
+ let e: i32x4 = i32x4::new(-1, -2, -3, -4);
+ let r: i32x4 = transmute(vqdmlsl_high_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_high_s32() {
+ let a: i64x2 = i64x2::new(39, 58);
+ let b: i32x4 = i32x4::new(0, 1, 4, 5);
+ let c: i32x4 = i32x4::new(1, 2, 5, 6);
+ let e: i64x2 = i64x2::new(-1, -2);
+ let r: i64x2 = transmute(vqdmlsl_high_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_high_n_s16() {
+ let a: i32x4 = i32x4::new(31, 38, 45, 52);
+ let b: i16x8 = i16x8::new(0, 2, 8, 10, 8, 10, 12, 14);
+ let c: i16 = 2;
+ let e: i32x4 = i32x4::new(-1, -2, -3, -4);
+ let r: i32x4 = transmute(vqdmlsl_high_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_high_n_s32() {
+ let a: i64x2 = i64x2::new(31, 38);
+ let b: i32x4 = i32x4::new(0, 2, 8, 10);
+ let c: i32 = 2;
+ let e: i64x2 = i64x2::new(-1, -2);
+ let r: i64x2 = transmute(vqdmlsl_high_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_laneq_s16() {
+ let a: i32x4 = i32x4::new(3, 6, 9, 12);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0);
+ let e: i32x4 = i32x4::new(-1, -2, -3, -4);
+ let r: i32x4 = transmute(vqdmlsl_laneq_s16::<2>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_laneq_s32() {
+ let a: i64x2 = i64x2::new(3, 6);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32x4 = i32x4::new(0, 2, 2, 0);
+ let e: i64x2 = i64x2::new(-1, -2);
+ let r: i64x2 = transmute(vqdmlsl_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_high_lane_s16() {
+ let a: i32x4 = i32x4::new(15, 18, 21, 24);
+ let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(-1, -2, -3, -4);
+ let r: i32x4 = transmute(vqdmlsl_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_high_laneq_s16() {
+ let a: i32x4 = i32x4::new(15, 18, 21, 24);
+ let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7);
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i32x4 = i32x4::new(-1, -2, -3, -4);
+ let r: i32x4 = transmute(vqdmlsl_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_high_lane_s32() {
+ let a: i64x2 = i64x2::new(15, 18);
+ let b: i32x4 = i32x4::new(0, 1, 4, 5);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(-1, -2);
+ let r: i64x2 = transmute(vqdmlsl_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_high_laneq_s32() {
+ let a: i64x2 = i64x2::new(15, 18);
+ let b: i32x4 = i32x4::new(0, 1, 4, 5);
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i64x2 = i64x2::new(-1, -2);
+ let r: i64x2 = transmute(vqdmlsl_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlslh_s16() {
+ let a: i32 = 10;
+ let b: i16 = 1;
+ let c: i16 = 2;
+ let e: i32 = 6;
+ let r: i32 = transmute(vqdmlslh_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsls_s32() {
+ let a: i64 = 10;
+ let b: i32 = 1;
+ let c: i32 = 2;
+ let e: i64 = 6;
+ let r: i64 = transmute(vqdmlsls_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlslh_lane_s16() {
+ let a: i32 = 10;
+ let b: i16 = 1;
+ let c: i16x4 = i16x4::new(2, 1, 1, 1);
+ let e: i32 = 6;
+ let r: i32 = transmute(vqdmlslh_lane_s16::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlslh_laneq_s16() {
+ let a: i32 = 10;
+ let b: i16 = 1;
+ let c: i16x8 = i16x8::new(2, 1, 1, 1, 1, 1, 1, 1);
+ let e: i32 = 6;
+ let r: i32 = transmute(vqdmlslh_laneq_s16::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsls_lane_s32() {
+ let a: i64 = 10;
+ let b: i32 = 1;
+ let c: i32x2 = i32x2::new(2, 1);
+ let e: i64 = 6;
+ let r: i64 = transmute(vqdmlsls_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsls_laneq_s32() {
+ let a: i64 = 10;
+ let b: i32 = 1;
+ let c: i32x4 = i32x4::new(2, 1, 1, 1);
+ let e: i64 = 6;
+ let r: i64 = transmute(vqdmlsls_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
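+ // vqdmulh* keep only the high half of the doubling product, effectively
+ // (2*a*b) >> 16 for s16 (>> 32 for s32): 1 * 2 truncates to 0, while
+ // 2 * 0x7F_FF gives (2*2*0x7F_FF) >> 16 = 1.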
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhh_s16() {
+ let a: i16 = 1;
+ let b: i16 = 2;
+ let e: i16 = 0;
+ let r: i16 = transmute(vqdmulhh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhs_s32() {
+ let a: i32 = 1;
+ let b: i32 = 2;
+ let e: i32 = 0;
+ let r: i32 = transmute(vqdmulhs_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhh_lane_s16() {
+ let a: i16 = 2;
+ let b: i16x4 = i16x4::new(0, 0, 0x7F_FF, 0);
+ let e: i16 = 1;
+ let r: i16 = transmute(vqdmulhh_lane_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhh_laneq_s16() {
+ let a: i16 = 2;
+ let b: i16x8 = i16x8::new(0, 0, 0x7F_FF, 0, 0, 0, 0, 0);
+ let e: i16 = 1;
+ let r: i16 = transmute(vqdmulhh_laneq_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhs_lane_s32() {
+ let a: i32 = 2;
+ let b: i32x2 = i32x2::new(0, 0x7F_FF_FF_FF);
+ let e: i32 = 1;
+ let r: i32 = transmute(vqdmulhs_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhs_laneq_s32() {
+ let a: i32 = 2;
+ let b: i32x4 = i32x4::new(0, 0x7F_FF_FF_FF, 0, 0);
+ let e: i32 = 1;
+ let r: i32 = transmute(vqdmulhs_laneq_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulh_lane_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x4 = i16x4::new(2, 1, 1, 1);
+ let e: i16x4 = i16x4::new(1, 1, 1, 1);
+ let r: i16x4 = transmute(vqdmulh_lane_s16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhq_lane_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x4 = i16x4::new(2, 1, 1, 1);
+ let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i16x8 = transmute(vqdmulhq_lane_s16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulh_lane_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x2 = i32x2::new(2, 1);
+ let e: i32x2 = i32x2::new(1, 1);
+ let r: i32x2 = transmute(vqdmulh_lane_s32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhq_lane_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x2 = i32x2::new(2, 1);
+ let e: i32x4 = i32x4::new(1, 1, 1, 1);
+ let r: i32x4 = transmute(vqdmulhq_lane_s32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
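+ // vqmovn* narrow with signed/unsigned saturation; the _high forms narrow b
+ // and append it to the already-narrowed low half a, which is why 0x7F_FF
+ // squeezed into an i8 saturates to 0x7F across the whole expected vector.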
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovnh_s16() {
+ let a: i16 = 1;
+ let e: i8 = 1;
+ let r: i8 = transmute(vqmovnh_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovns_s32() {
+ let a: i32 = 1;
+ let e: i16 = 1;
+ let r: i16 = transmute(vqmovns_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovnh_u16() {
+ let a: u16 = 1;
+ let e: u8 = 1;
+ let r: u8 = transmute(vqmovnh_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovns_u32() {
+ let a: u32 = 1;
+ let e: u16 = 1;
+ let r: u16 = transmute(vqmovns_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovnd_s64() {
+ let a: i64 = 1;
+ let e: i32 = 1;
+ let r: i32 = transmute(vqmovnd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovnd_u64() {
+ let a: u64 = 1;
+ let e: u32 = 1;
+ let r: u32 = transmute(vqmovnd_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_high_s16() {
+ let a: i8x8 = i8x8::new(0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
+ let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let e: i8x16 = i8x16::new(0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
+ let r: i8x16 = transmute(vqmovn_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_high_s32() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let e: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let r: i16x8 = transmute(vqmovn_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_high_s64() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let r: i32x4 = transmute(vqmovn_high_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_high_u16() {
+ let a: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let b: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vqmovn_high_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_high_u32() {
+ let a: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let b: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vqmovn_high_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_high_u64() {
+ let a: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let b: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vqmovn_high_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
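+ // vqmovun* narrow signed input to unsigned output, clamping negatives to 0;
+ // hence the vectors of -1 below narrow to all-zero results.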
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovunh_s16() {
+ let a: i16 = 1;
+ let e: u8 = 1;
+ let r: u8 = transmute(vqmovunh_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovuns_s32() {
+ let a: i32 = 1;
+ let e: u16 = 1;
+ let r: u16 = transmute(vqmovuns_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovund_s64() {
+ let a: i64 = 1;
+ let e: u32 = 1;
+ let r: u32 = transmute(vqmovund_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovun_high_s16() {
+ let a: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let b: i16x8 = i16x8::new(-1, -1, -1, -1, -1, -1, -1, -1);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vqmovun_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovun_high_s32() {
+ let a: u16x4 = u16x4::new(0, 0, 0, 0);
+ let b: i32x4 = i32x4::new(-1, -1, -1, -1);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vqmovun_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovun_high_s64() {
+ let a: u32x2 = u32x2::new(0, 0);
+ let b: i64x2 = i64x2::new(-1, -1);
+ let e: u32x4 = u32x4::new(0, 0, 0, 0);
+ let r: u32x4 = transmute(vqmovun_high_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
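+ // vqrdmulh* add a rounding constant before taking the high half:
+ // (2*a*b + 0x80_00) >> 16 for s16. For 1 * 2 that is (4 + 32768) >> 16 = 0,
+ // matching the expected zeros.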
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhh_s16() {
+ let a: i16 = 1;
+ let b: i16 = 2;
+ let e: i16 = 0;
+ let r: i16 = transmute(vqrdmulhh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhs_s32() {
+ let a: i32 = 1;
+ let b: i32 = 2;
+ let e: i32 = 0;
+ let r: i32 = transmute(vqrdmulhs_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhh_lane_s16() {
+ let a: i16 = 1;
+ let b: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16 = 0;
+ let r: i16 = transmute(vqrdmulhh_lane_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhh_laneq_s16() {
+ let a: i16 = 1;
+ let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16 = 0;
+ let r: i16 = transmute(vqrdmulhh_laneq_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhs_lane_s32() {
+ let a: i32 = 1;
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i32 = 0;
+ let r: i32 = transmute(vqrdmulhs_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhs_laneq_s32() {
+ let a: i32 = 1;
+ let b: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32 = 0;
+ let r: i32 = transmute(vqrdmulhs_laneq_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
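+ // vqrdmlah* fuse the rounding doubling high-half multiply with a saturating
+ // accumulate: r = sat(a + vqrdmulh(b, c)). With b = 0x7F_FF and c = 2 the
+ // multiply contributes (2*0x7F_FF*2 + 0x80_00) >> 16 = 2, so 1 becomes 3.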
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlah_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i16x4 = i16x4::new(3, 3, 3, 3);
+ let r: i16x4 = transmute(vqrdmlah_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahq_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let r: i16x8 = transmute(vqrdmlahq_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlah_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x2 = i32x2::new(2, 2);
+ let e: i32x2 = i32x2::new(3, 3);
+ let r: i32x2 = transmute(vqrdmlah_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahq_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(3, 3, 3, 3);
+ let r: i32x4 = transmute(vqrdmlahq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahh_s16() {
+ let a: i16 = 1;
+ let b: i16 = 1;
+ let c: i16 = 2;
+ let e: i16 = 1;
+ let r: i16 = transmute(vqrdmlahh_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahs_s32() {
+ let a: i32 = 1;
+ let b: i32 = 1;
+ let c: i32 = 2;
+ let e: i32 = 1;
+ let r: i32 = transmute(vqrdmlahs_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlah_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16x4 = i16x4::new(3, 3, 3, 3);
+ let r: i16x4 = transmute(vqrdmlah_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlah_laneq_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16x4 = i16x4::new(3, 3, 3, 3);
+ let r: i16x4 = transmute(vqrdmlah_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahq_lane_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let r: i16x8 = transmute(vqrdmlahq_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahq_laneq_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let r: i16x8 = transmute(vqrdmlahq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlah_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i32x2 = i32x2::new(3, 3);
+ let r: i32x2 = transmute(vqrdmlah_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlah_laneq_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32x2 = i32x2::new(3, 3);
+ let r: i32x2 = transmute(vqrdmlah_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahq_lane_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i32x4 = i32x4::new(3, 3, 3, 3);
+ let r: i32x4 = transmute(vqrdmlahq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahq_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(3, 3, 3, 3);
+ let r: i32x4 = transmute(vqrdmlahq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahh_lane_s16() {
+ let a: i16 = 1;
+ let b: i16 = 1;
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16 = 1;
+ let r: i16 = transmute(vqrdmlahh_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahh_laneq_s16() {
+ let a: i16 = 1;
+ let b: i16 = 1;
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16 = 1;
+ let r: i16 = transmute(vqrdmlahh_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahs_lane_s32() {
+ let a: i32 = 1;
+ let b: i32 = 1;
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i32 = 1;
+ let r: i32 = transmute(vqrdmlahs_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlahs_laneq_s32() {
+ let a: i32 = 1;
+ let b: i32 = 1;
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32 = 1;
+ let r: i32 = transmute(vqrdmlahs_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
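+ // vqrdmlsh* subtract instead: r = sat(a - vqrdmulh(b, c)), turning the
+ // accumulated 3 from the tests above into 1 - 2 = -1.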
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlsh_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i16x4 = i16x4::new(-1, -1, -1, -1);
+ let r: i16x4 = transmute(vqrdmlsh_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshq_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(-1, -1, -1, -1, -1, -1, -1, -1);
+ let r: i16x8 = transmute(vqrdmlshq_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlsh_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x2 = i32x2::new(2, 2);
+ let e: i32x2 = i32x2::new(-1, -1);
+ let r: i32x2 = transmute(vqrdmlsh_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshq_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(-1, -1, -1, -1);
+ let r: i32x4 = transmute(vqrdmlshq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshh_s16() {
+ let a: i16 = 1;
+ let b: i16 = 1;
+ let c: i16 = 2;
+ let e: i16 = 1;
+ let r: i16 = transmute(vqrdmlshh_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshs_s32() {
+ let a: i32 = 1;
+ let b: i32 = 1;
+ let c: i32 = 2;
+ let e: i32 = 1;
+ let r: i32 = transmute(vqrdmlshs_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlsh_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16x4 = i16x4::new(-1, -1, -1, -1);
+ let r: i16x4 = transmute(vqrdmlsh_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlsh_laneq_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16x4 = i16x4::new(-1, -1, -1, -1);
+ let r: i16x4 = transmute(vqrdmlsh_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshq_lane_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16x8 = i16x8::new(-1, -1, -1, -1, -1, -1, -1, -1);
+ let r: i16x8 = transmute(vqrdmlshq_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshq_laneq_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16x8 = i16x8::new(-1, -1, -1, -1, -1, -1, -1, -1);
+ let r: i16x8 = transmute(vqrdmlshq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlsh_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i32x2 = i32x2::new(-1, -1);
+ let r: i32x2 = transmute(vqrdmlsh_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlsh_laneq_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32x2 = i32x2::new(-1, -1);
+ let r: i32x2 = transmute(vqrdmlsh_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshq_lane_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i32x4 = i32x4::new(-1, -1, -1, -1);
+ let r: i32x4 = transmute(vqrdmlshq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshq_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(-1, -1, -1, -1);
+ let r: i32x4 = transmute(vqrdmlshq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshh_lane_s16() {
+ let a: i16 = 1;
+ let b: i16 = 1;
+ let c: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16 = 1;
+ let r: i16 = transmute(vqrdmlshh_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshh_laneq_s16() {
+ let a: i16 = 1;
+ let b: i16 = 1;
+ let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16 = 1;
+ let r: i16 = transmute(vqrdmlshh_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshs_lane_s32() {
+ let a: i32 = 1;
+ let b: i32 = 1;
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i32 = 1;
+ let r: i32 = transmute(vqrdmlshs_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmlshs_laneq_s32() {
+ let a: i32 = 1;
+ let b: i32 = 1;
+ let c: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32 = 1;
+ let r: i32 = transmute(vqrdmlshs_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
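+ // Scalar saturating rounding shift left (SQRSHL/UQRSHL): a positive shift in
+ // `b` shifts left, e.g. 2 << 2 == 8 below; a negative shift would be a
+ // rounding right shift, and out-of-range results saturate.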
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshls_s32() {
+ let a: i32 = 2;
+ let b: i32 = 2;
+ let e: i32 = 8;
+ let r: i32 = transmute(vqrshls_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshld_s64() {
+ let a: i64 = 2;
+ let b: i64 = 2;
+ let e: i64 = 8;
+ let r: i64 = transmute(vqrshld_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlb_s8() {
+ let a: i8 = 1;
+ let b: i8 = 2;
+ let e: i8 = 4;
+ let r: i8 = transmute(vqrshlb_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlh_s16() {
+ let a: i16 = 1;
+ let b: i16 = 2;
+ let e: i16 = 4;
+ let r: i16 = transmute(vqrshlh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshls_u32() {
+ let a: u32 = 2;
+ let b: i32 = 2;
+ let e: u32 = 8;
+ let r: u32 = transmute(vqrshls_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshld_u64() {
+ let a: u64 = 2;
+ let b: i64 = 2;
+ let e: u64 = 8;
+ let r: u64 = transmute(vqrshld_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlb_u8() {
+ let a: u8 = 1;
+ let b: i8 = 2;
+ let e: u8 = 4;
+ let r: u8 = transmute(vqrshlb_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlh_u16() {
+ let a: u16 = 1;
+ let b: i16 = 2;
+ let e: u16 = 4;
+ let r: u16 = transmute(vqrshlh_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
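+ // Saturating rounding shift right narrow (SQRSHRN/UQRSHRN): the rounding
+ // constant 1 << (N - 1) is added before the shift, so 4 narrows to 1 here.
+ // The `_high_` variants pack the narrowed result into the upper half,
+ // keeping `a` as the lower half.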
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrnh_n_s16() {
+ let a: i16 = 4;
+ let e: i8 = 1;
+ let r: i8 = transmute(vqrshrnh_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrns_n_s32() {
+ let a: i32 = 4;
+ let e: i16 = 1;
+ let r: i16 = transmute(vqrshrns_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrnd_n_s64() {
+ let a: i64 = 4;
+ let e: i32 = 1;
+ let r: i32 = transmute(vqrshrnd_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_high_n_s16() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 2, 3, 6, 7);
+ let b: i16x8 = i16x8::new(8, 12, 24, 28, 48, 52, 56, 60);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vqrshrn_high_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_high_n_s32() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(8, 12, 24, 28);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 2, 3, 6, 7);
+ let r: i16x8 = transmute(vqrshrn_high_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_high_n_s64() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i64x2 = i64x2::new(8, 12);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vqrshrn_high_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrnh_n_u16() {
+ let a: u16 = 4;
+ let e: u8 = 1;
+ let r: u8 = transmute(vqrshrnh_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrns_n_u32() {
+ let a: u32 = 4;
+ let e: u16 = 1;
+ let r: u16 = transmute(vqrshrns_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrnd_n_u64() {
+ let a: u64 = 4;
+ let e: u32 = 1;
+ let r: u32 = transmute(vqrshrnd_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_high_n_u16() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 2, 3, 6, 7);
+ let b: u16x8 = u16x8::new(8, 12, 24, 28, 48, 52, 56, 60);
+ let e: u8x16 = u8x16::new(0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vqrshrn_high_n_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_high_n_u32() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: u32x4 = u32x4::new(8, 12, 24, 28);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 2, 3, 6, 7);
+ let r: u16x8 = transmute(vqrshrn_high_n_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_high_n_u64() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u64x2 = u64x2::new(8, 12);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vqrshrn_high_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
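+ // Saturating rounding shift right unsigned narrow (SQRSHRUN): signed input,
+ // unsigned output, with negative results saturating to 0.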
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrunh_n_s16() {
+ let a: i16 = 4;
+ let e: u8 = 1;
+ let r: u8 = transmute(vqrshrunh_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshruns_n_s32() {
+ let a: i32 = 4;
+ let e: u16 = 1;
+ let r: u16 = transmute(vqrshruns_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrund_n_s64() {
+ let a: i64 = 4;
+ let e: u32 = 1;
+ let r: u32 = transmute(vqrshrund_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrun_high_n_s16() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 2, 3, 6, 7);
+ let b: i16x8 = i16x8::new(8, 12, 24, 28, 48, 52, 56, 60);
+ let e: u8x16 = u8x16::new(0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vqrshrun_high_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrun_high_n_s32() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(8, 12, 24, 28);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 2, 3, 6, 7);
+ let r: u16x8 = transmute(vqrshrun_high_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrun_high_n_s64() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: i64x2 = i64x2::new(8, 12);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vqrshrun_high_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
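+ // Scalar saturating shift left (SQSHL/UQSHL), first with a run-time shift
+ // amount in `b`, then the `_n` forms with a const-generic immediate.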
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshld_s64() {
+ let a: i64 = 0;
+ let b: i64 = 2;
+ let e: i64 = 0;
+ let r: i64 = transmute(vqshld_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlb_s8() {
+ let a: i8 = 1;
+ let b: i8 = 2;
+ let e: i8 = 4;
+ let r: i8 = transmute(vqshlb_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlh_s16() {
+ let a: i16 = 1;
+ let b: i16 = 2;
+ let e: i16 = 4;
+ let r: i16 = transmute(vqshlh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshls_s32() {
+ let a: i32 = 1;
+ let b: i32 = 2;
+ let e: i32 = 4;
+ let r: i32 = transmute(vqshls_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshld_u64() {
+ let a: u64 = 0;
+ let b: i64 = 2;
+ let e: u64 = 0;
+ let r: u64 = transmute(vqshld_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlb_u8() {
+ let a: u8 = 1;
+ let b: i8 = 2;
+ let e: u8 = 4;
+ let r: u8 = transmute(vqshlb_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlh_u16() {
+ let a: u16 = 1;
+ let b: i16 = 2;
+ let e: u16 = 4;
+ let r: u16 = transmute(vqshlh_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshls_u32() {
+ let a: u32 = 1;
+ let b: i32 = 2;
+ let e: u32 = 4;
+ let r: u32 = transmute(vqshls_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlb_n_s8() {
+ let a: i8 = 1;
+ let e: i8 = 4;
+ let r: i8 = transmute(vqshlb_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlh_n_s16() {
+ let a: i16 = 1;
+ let e: i16 = 4;
+ let r: i16 = transmute(vqshlh_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshls_n_s32() {
+ let a: i32 = 1;
+ let e: i32 = 4;
+ let r: i32 = transmute(vqshls_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshld_n_s64() {
+ let a: i64 = 1;
+ let e: i64 = 4;
+ let r: i64 = transmute(vqshld_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlb_n_u8() {
+ let a: u8 = 1;
+ let e: u8 = 4;
+ let r: u8 = transmute(vqshlb_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlh_n_u16() {
+ let a: u16 = 1;
+ let e: u16 = 4;
+ let r: u16 = transmute(vqshlh_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshls_n_u32() {
+ let a: u32 = 1;
+ let e: u32 = 4;
+ let r: u32 = transmute(vqshls_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshld_n_u64() {
+ let a: u64 = 1;
+ let e: u64 = 4;
+ let r: u64 = transmute(vqshld_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
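+ // Signed saturating shift left unsigned (SQSHLU): signed input, unsigned
+ // saturated result.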
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlub_n_s8() {
+ let a: i8 = 1;
+ let e: u8 = 4;
+ let r: u8 = transmute(vqshlub_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshluh_n_s16() {
+ let a: i16 = 1;
+ let e: u16 = 4;
+ let r: u16 = transmute(vqshluh_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlus_n_s32() {
+ let a: i32 = 1;
+ let e: u32 = 4;
+ let r: u32 = transmute(vqshlus_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlud_n_s64() {
+ let a: i64 = 1;
+ let e: u64 = 4;
+ let r: u64 = transmute(vqshlud_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
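+ // Saturating shift right narrow (SQSHRN/UQSHRN) truncates rather than
+ // rounds, unlike the vqrshrn_n tests above.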
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrnd_n_s64() {
+ let a: i64 = 0;
+ let e: i32 = 0;
+ let r: i32 = transmute(vqshrnd_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrnh_n_s16() {
+ let a: i16 = 4;
+ let e: i8 = 1;
+ let r: i8 = transmute(vqshrnh_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrns_n_s32() {
+ let a: i32 = 4;
+ let e: i16 = 1;
+ let r: i16 = transmute(vqshrns_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_high_n_s16() {
+ let a: i8x8 = i8x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let b: i16x8 = i16x8::new(32, 36, 40, 44, 48, 52, 56, 60);
+ let e: i8x16 = i8x16::new(0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vqshrn_high_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_high_n_s32() {
+ let a: i16x4 = i16x4::new(0, 1, 8, 9);
+ let b: i32x4 = i32x4::new(32, 36, 40, 44);
+ let e: i16x8 = i16x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let r: i16x8 = transmute(vqshrn_high_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_high_n_s64() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i64x2 = i64x2::new(32, 36);
+ let e: i32x4 = i32x4::new(0, 1, 8, 9);
+ let r: i32x4 = transmute(vqshrn_high_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrnd_n_u64() {
+ let a: u64 = 0;
+ let e: u32 = 0;
+ let r: u32 = transmute(vqshrnd_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrnh_n_u16() {
+ let a: u16 = 4;
+ let e: u8 = 1;
+ let r: u8 = transmute(vqshrnh_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrns_n_u32() {
+ let a: u32 = 4;
+ let e: u16 = 1;
+ let r: u16 = transmute(vqshrns_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_high_n_u16() {
+ let a: u8x8 = u8x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let b: u16x8 = u16x8::new(32, 36, 40, 44, 48, 52, 56, 60);
+ let e: u8x16 = u8x16::new(0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vqshrn_high_n_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_high_n_u32() {
+ let a: u16x4 = u16x4::new(0, 1, 8, 9);
+ let b: u32x4 = u32x4::new(32, 36, 40, 44);
+ let e: u16x8 = u16x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let r: u16x8 = transmute(vqshrn_high_n_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_high_n_u64() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u64x2 = u64x2::new(32, 36);
+ let e: u32x4 = u32x4::new(0, 1, 8, 9);
+ let r: u32x4 = transmute(vqshrn_high_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
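+ // Saturating shift right unsigned narrow (SQSHRUN): a signed input narrowed
+ // to an unsigned result, again without rounding.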
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrunh_n_s16() {
+ let a: i16 = 4;
+ let e: u8 = 1;
+ let r: u8 = transmute(vqshrunh_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshruns_n_s32() {
+ let a: i32 = 4;
+ let e: u16 = 1;
+ let r: u16 = transmute(vqshruns_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrund_n_s64() {
+ let a: i64 = 4;
+ let e: u32 = 1;
+ let r: u32 = transmute(vqshrund_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrun_high_n_s16() {
+ let a: u8x8 = u8x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let b: i16x8 = i16x8::new(32, 36, 40, 44, 48, 52, 56, 60);
+ let e: u8x16 = u8x16::new(0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vqshrun_high_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrun_high_n_s32() {
+ let a: u16x4 = u16x4::new(0, 1, 8, 9);
+ let b: i32x4 = i32x4::new(32, 36, 40, 44);
+ let e: u16x8 = u16x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let r: u16x8 = transmute(vqshrun_high_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrun_high_n_s64() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: i64x2 = i64x2::new(32, 36);
+ let e: u32x4 = u32x4::new(0, 1, 8, 9);
+ let r: u32x4 = transmute(vqshrun_high_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
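+ // Unsigned saturating add of a signed value (USQADD): `a` is unsigned, `b`
+ // is signed, and the sum saturates to the unsigned range.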
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqaddb_u8() {
+ let a: u8 = 2;
+ let b: i8 = 2;
+ let e: u8 = 4;
+ let r: u8 = transmute(vsqaddb_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqaddh_u16() {
+ let a: u16 = 2;
+ let b: i16 = 2;
+ let e: u16 = 4;
+ let r: u16 = transmute(vsqaddh_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqadds_u32() {
+ let a: u32 = 2;
+ let b: i32 = 2;
+ let e: u32 = 4;
+ let r: u32 = transmute(vsqadds_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqaddd_u64() {
+ let a: u64 = 2;
+ let b: i64 = 2;
+ let e: u64 = 4;
+ let r: u64 = transmute(vsqaddd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
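+ // Exact IEEE square root (FSQRT), in contrast to the vrsqrte estimates
+ // tested below.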
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqrt_f32() {
+ let a: f32x2 = f32x2::new(4.0, 9.0);
+ let e: f32x2 = f32x2::new(2.0, 3.0);
+ let r: f32x2 = transmute(vsqrt_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqrtq_f32() {
+ let a: f32x4 = f32x4::new(4.0, 9.0, 16.0, 25.0);
+ let e: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+ let r: f32x4 = transmute(vsqrtq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqrt_f64() {
+ let a: f64 = 4.0;
+ let e: f64 = 2.0;
+ let r: f64 = transmute(vsqrt_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqrtq_f64() {
+ let a: f64x2 = f64x2::new(4.0, 9.0);
+ let e: f64x2 = f64x2::new(2.0, 3.0);
+ let r: f64x2 = transmute(vsqrtq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
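+ // Reciprocal square-root estimate (FRSQRTE) and step (FRSQRTS). The
+ // estimate carries roughly 8 bits of precision, hence 0.998046875 rather
+ // than 1.0 for an input of 1.0; vrsqrts computes the Newton-Raphson step
+ // (3 - a*b) / 2, so (1, 2) x (1, 2) gives (1.0, -0.5).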
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrte_f64() {
+ let a: f64 = 1.0;
+ let e: f64 = 0.998046875;
+ let r: f64 = transmute(vrsqrte_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrteq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 2.0);
+ let e: f64x2 = f64x2::new(0.998046875, 0.705078125);
+ let r: f64x2 = transmute(vrsqrteq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrtes_f32() {
+ let a: f32 = 1.0;
+ let e: f32 = 0.998046875;
+ let r: f32 = transmute(vrsqrtes_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrted_f64() {
+ let a: f64 = 1.0;
+ let e: f64 = 0.998046875;
+ let r: f64 = transmute(vrsqrted_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrts_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 1.0;
+ let e: f64 = 1.;
+ let r: f64 = transmute(vrsqrts_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrtsq_f64() {
+ let a: f64x2 = f64x2::new(1.0, 2.0);
+ let b: f64x2 = f64x2::new(1.0, 2.0);
+ let e: f64x2 = f64x2::new(1., -0.5);
+ let r: f64x2 = transmute(vrsqrtsq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrtss_f32() {
+ let a: f32 = 1.0;
+ let b: f32 = 1.0;
+ let e: f32 = 1.;
+ let r: f32 = transmute(vrsqrtss_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrtsd_f64() {
+ let a: f64 = 1.0;
+ let b: f64 = 1.0;
+ let e: f64 = 1.;
+ let r: f64 = transmute(vrsqrtsd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
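+ // Reciprocal estimate (FRECPE), Newton-Raphson step (FRECPS, computing
+ // 2 - a*b, hence 2 - 4*4 == -14 below) and reciprocal exponent (FRECPX,
+ // which flips the exponent, mapping 4.0 to 0.5).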
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpe_f64() {
+ let a: f64 = 4.0;
+ let e: f64 = 0.24951171875;
+ let r: f64 = transmute(vrecpe_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpeq_f64() {
+ let a: f64x2 = f64x2::new(4.0, 3.0);
+ let e: f64x2 = f64x2::new(0.24951171875, 0.3330078125);
+ let r: f64x2 = transmute(vrecpeq_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpes_f32() {
+ let a: f32 = 4.0;
+ let e: f32 = 0.24951171875;
+ let r: f32 = transmute(vrecpes_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecped_f64() {
+ let a: f64 = 4.0;
+ let e: f64 = 0.24951171875;
+ let r: f64 = transmute(vrecped_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecps_f64() {
+ let a: f64 = 4.0;
+ let b: f64 = 4.0;
+ let e: f64 = -14.;
+ let r: f64 = transmute(vrecps_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpsq_f64() {
+ let a: f64x2 = f64x2::new(4.0, 3.0);
+ let b: f64x2 = f64x2::new(4.0, 3.0);
+ let e: f64x2 = f64x2::new(-14., -7.);
+ let r: f64x2 = transmute(vrecpsq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpss_f32() {
+ let a: f32 = 4.0;
+ let b: f32 = 4.0;
+ let e: f32 = -14.;
+ let r: f32 = transmute(vrecpss_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpsd_f64() {
+ let a: f64 = 4.0;
+ let b: f64 = 4.0;
+ let e: f64 = -14.;
+ let r: f64 = transmute(vrecpsd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpxs_f32() {
+ let a: f32 = 4.0;
+ let e: f32 = 0.5;
+ let r: f32 = transmute(vrecpxs_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpxd_f64() {
+ let a: f64 = 4.0;
+ let e: f64 = 0.5;
+ let r: f64 = transmute(vrecpxd_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
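+ // vreinterpret is a pure bit-pattern cast: no conversion is performed, so
+ // each test checks that a zero or small-integer pattern round-trips
+ // unchanged through the reinterpretation.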
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_f64() {
+ let a: f64 = 0.;
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_f64() {
+ let a: f64 = 0.;
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_s16_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_f64() {
+ let a: f64 = 0.;
+ let e: i32x2 = i32x2::new(0, 0);
+ let r: i32x2 = transmute(vreinterpret_s32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_f64() {
+ let a: f64 = 0.;
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_s16_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: i32x4 = i32x4::new(0, 0, 0, 0);
+ let r: i32x4 = transmute(vreinterpretq_s32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vreinterpretq_s64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_f64() {
+ let a: f64 = 0.;
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_f64() {
+ let a: f64 = 0.;
+ let e: u16x4 = u16x4::new(0, 0, 0, 0);
+ let r: u16x4 = transmute(vreinterpret_u16_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_f64() {
+ let a: f64 = 0.;
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vreinterpret_u32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_f64() {
+ let a: f64 = 0.;
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vreinterpretq_u16_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: u32x4 = u32x4::new(0, 0, 0, 0);
+ let r: u32x4 = transmute(vreinterpretq_u32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vreinterpretq_u64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_f64() {
+ let a: f64 = 0.;
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_f64() {
+ let a: f64 = 0.;
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_p16_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_f64() {
+ let a: f64 = 0.;
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_p16_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vreinterpretq_p64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vreinterpretq_p64_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_s16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_s32() {
+ let a: i32x2 = i32x2::new(0, 0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_s32() {
+ let a: i32x4 = i32x4::new(0, 0, 0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_s64() {
+ let a: i64x2 = i64x2::new(0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_u16() {
+ let a: u16x4 = u16x4::new(0, 0, 0, 0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_u32() {
+ let a: u32x2 = u32x2::new(0, 0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_u32() {
+ let a: u32x4 = u32x4::new(0, 0, 0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_u64() {
+ let a: u64x2 = u64x2::new(0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_p16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_p16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_p64() {
+ let a: i64x2 = i64x2::new(0, 0);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_p64() {
+ let a: i64x2 = i64x2::new(0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_p128() {
+ let a: p128 = 0;
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f64_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: f64 = 0.;
+ let r: f64 = transmute(vreinterpret_f64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_f64() {
+ let a: f64 = 0.;
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f64_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: f64x2 = f64x2::new(0., 0.);
+ let r: f64x2 = transmute(vreinterpretq_f64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_f64() {
+ let a: f64x2 = f64x2::new(0., 0.);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
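+ // Rounding shifts without saturation: scalar rounding shift left
+ // (SRSHL/URSHL), rounding shift right by immediate (SRSHR/URSHR), and the
+ // narrowing `_high` forms (RSHRN2) that pack into the upper half.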
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshld_s64() {
+ let a: i64 = 1;
+ let b: i64 = 2;
+ let e: i64 = 4;
+ let r: i64 = transmute(vrshld_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshld_u64() {
+ let a: u64 = 1;
+ let b: i64 = 2;
+ let e: u64 = 4;
+ let r: u64 = transmute(vrshld_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrd_n_s64() {
+ let a: i64 = 4;
+ let e: i64 = 1;
+ let r: i64 = transmute(vrshrd_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrd_n_u64() {
+ let a: u64 = 4;
+ let e: u64 = 1;
+ let r: u64 = transmute(vrshrd_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_high_n_s16() {
+ let a: i8x8 = i8x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let b: i16x8 = i16x8::new(32, 36, 40, 44, 48, 52, 56, 60);
+ let e: i8x16 = i8x16::new(0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vrshrn_high_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_high_n_s32() {
+ let a: i16x4 = i16x4::new(0, 1, 8, 9);
+ let b: i32x4 = i32x4::new(32, 36, 40, 44);
+ let e: i16x8 = i16x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let r: i16x8 = transmute(vrshrn_high_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_high_n_s64() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i64x2 = i64x2::new(32, 36);
+ let e: i32x4 = i32x4::new(0, 1, 8, 9);
+ let r: i32x4 = transmute(vrshrn_high_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_high_n_u16() {
+ let a: u8x8 = u8x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let b: u16x8 = u16x8::new(32, 36, 40, 44, 48, 52, 56, 60);
+ let e: u8x16 = u8x16::new(0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vrshrn_high_n_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_high_n_u32() {
+ let a: u16x4 = u16x4::new(0, 1, 8, 9);
+ let b: u32x4 = u32x4::new(32, 36, 40, 44);
+ let e: u16x8 = u16x8::new(0, 1, 8, 9, 8, 9, 10, 11);
+ let r: u16x8 = transmute(vrshrn_high_n_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_high_n_u64() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u64x2 = u64x2::new(32, 36);
+ let e: u32x4 = u32x4::new(0, 1, 8, 9);
+ let r: u32x4 = transmute(vrshrn_high_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
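+ // Rounding shift right and accumulate (SRSRA/URSRA): 1 + (4 >> 2) == 2.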
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsrad_n_s64() {
+ let a: i64 = 1;
+ let b: i64 = 4;
+ let e: i64 = 2;
+ let r: i64 = transmute(vrsrad_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsrad_n_u64() {
+ let a: u64 = 1;
+ let b: u64 = 4;
+ let e: u64 = 2;
+ let r: u64 = transmute(vrsrad_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
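+ // Rounding subtract returning high narrow (RSUBHN2): the rounded high half
+ // of `b - c` is narrowed into the upper lanes, with `a` kept as the lower
+ // half; with b == c every new lane is zero.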
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_high_s16() {
+ let a: i8x8 = i8x8::new(1, 2, 0, 0, 0, 0, 0, 0);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x16 = i8x16::new(1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vrsubhn_high_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_high_s32() {
+ let a: i16x4 = i16x4::new(1, 2, 0, 0);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let c: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i16x8 = i16x8::new(1, 2, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vrsubhn_high_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_high_s64() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i64x2 = i64x2::new(1, 2);
+ let c: i64x2 = i64x2::new(1, 2);
+ let e: i32x4 = i32x4::new(1, 2, 0, 0);
+ let r: i32x4 = transmute(vrsubhn_high_s64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_high_u16() {
+ let a: u8x8 = u8x8::new(1, 2, 0, 0, 0, 0, 0, 0);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x16 = u8x16::new(1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vrsubhn_high_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_high_u32() {
+ let a: u16x4 = u16x4::new(1, 2, 0, 0);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let c: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u16x8 = u16x8::new(1, 2, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vrsubhn_high_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_high_u64() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u64x2 = u64x2::new(1, 2);
+ let c: u64x2 = u64x2::new(1, 2);
+ let e: u32x4 = u32x4::new(1, 2, 0, 0);
+ let r: u32x4 = transmute(vrsubhn_high_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
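+ // Insert a scalar into the selected lane of a vector.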
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 0.;
+ let e: f64 = 1.;
+ let r: f64 = transmute(vset_lane_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_f64() {
+ let a: f64 = 1.;
+ let b: f64x2 = f64x2::new(0., 2.);
+ let e: f64x2 = f64x2::new(1., 2.);
+ let r: f64x2 = transmute(vsetq_lane_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
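+ // Plain (non-saturating, non-rounding) shifts: scalar shift left
+ // (SSHL/USHL), shift left long from the upper half (SSHLL2/USHLL2), and
+ // shift right narrow into the upper half (SHRN2).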
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshld_s64() {
+ let a: i64 = 1;
+ let b: i64 = 2;
+ let e: i64 = 4;
+ let r: i64 = transmute(vshld_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshld_u64() {
+ let a: u64 = 1;
+ let b: i64 = 2;
+ let e: u64 = 4;
+ let r: u64 = transmute(vshld_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_high_n_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: i16x8 = transmute(vshll_high_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_high_n_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 1, 2, 1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(4, 8, 12, 16);
+ let r: i32x4 = transmute(vshll_high_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_high_n_s32() {
+ let a: i32x4 = i32x4::new(0, 0, 1, 2);
+ let e: i64x2 = i64x2::new(4, 8);
+ let r: i64x2 = transmute(vshll_high_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_high_n_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: u16x8 = transmute(vshll_high_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_high_n_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 1, 2, 1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(4, 8, 12, 16);
+ let r: u32x4 = transmute(vshll_high_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_high_n_u32() {
+ let a: u32x4 = u32x4::new(0, 0, 1, 2);
+ let e: u64x2 = u64x2::new(4, 8);
+ let r: u64x2 = transmute(vshll_high_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_high_n_s16() {
+ let a: i8x8 = i8x8::new(1, 2, 5, 6, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(20, 24, 28, 32, 52, 56, 60, 64);
+ let e: i8x16 = i8x16::new(1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vshrn_high_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_high_n_s32() {
+ let a: i16x4 = i16x4::new(1, 2, 5, 6);
+ let b: i32x4 = i32x4::new(20, 24, 28, 32);
+ let e: i16x8 = i16x8::new(1, 2, 5, 6, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vshrn_high_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_high_n_s64() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i64x2 = i64x2::new(20, 24);
+ let e: i32x4 = i32x4::new(1, 2, 5, 6);
+ let r: i32x4 = transmute(vshrn_high_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_high_n_u16() {
+ let a: u8x8 = u8x8::new(1, 2, 5, 6, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(20, 24, 28, 32, 52, 56, 60, 64);
+ let e: u8x16 = u8x16::new(1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vshrn_high_n_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_high_n_u32() {
+ let a: u16x4 = u16x4::new(1, 2, 5, 6);
+ let b: u32x4 = u32x4::new(20, 24, 28, 32);
+ let e: u16x8 = u16x8::new(1, 2, 5, 6, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vshrn_high_n_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_high_n_u64() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u64x2 = u64x2::new(20, 24);
+ let e: u32x4 = u32x4::new(1, 2, 5, 6);
+ let r: u32x4 = transmute(vshrn_high_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
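+ // SM3/SM4 crypto extension intrinsics (SM3PARTW1, SM3PARTW2, SM3SS1,
+ // SM4EKEY, SM4E); the expected vectors are the fixed instruction outputs
+ // for the given inputs.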
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm3partw1q_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let c: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(2147549312, 3221323968, 131329, 2684362752);
+ let r: u32x4 = transmute(vsm3partw1q_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm3partw2q_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let c: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(128, 256, 384, 1077977696);
+ let r: u32x4 = transmute(vsm3partw2q_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm3ss1q_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let c: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(0, 0, 0, 2098176);
+ let r: u32x4 = transmute(vsm3ss1q_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm4ekeyq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(1784948604, 136020997, 2940231695, 3789947679);
+ let r: u32x4 = transmute(vsm4ekeyq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm4eq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(1093874472, 3616769504, 3878330411, 2765298765);
+ let r: u32x4 = transmute(vsm4eq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
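+ // SHA3/SHA512 extension intrinsics. RAX1 computes a ^ rol(b, 1), so inputs
+ // (1, 2) and (3, 4) give (1 ^ 6, 2 ^ 8) == (7, 10); the SHA512H/H2/SU0/SU1
+ // expectations are fixed instruction outputs.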
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vrax1q_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(3, 4);
+ let e: u64x2 = u64x2::new(7, 10);
+ let r: u64x2 = transmute(vrax1q_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vsha512hq_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(3, 4);
+ let c: u64x2 = u64x2::new(5, 6);
+ let e: u64x2 = u64x2::new(11189044327219203, 7177611956453380);
+ let r: u64x2 = transmute(vsha512hq_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vsha512h2q_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(3, 4);
+ let c: u64x2 = u64x2::new(5, 6);
+ let e: u64x2 = u64x2::new(5770237651009406214, 349133864969);
+ let r: u64x2 = transmute(vsha512h2q_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vsha512su0q_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(3, 4);
+ let e: u64x2 = u64x2::new(144115188075855874, 9439544818968559619);
+ let r: u64x2 = transmute(vsha512su0q_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vsha512su1q_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(3, 4);
+ let c: u64x2 = u64x2::new(5, 6);
+ let e: u64x2 = u64x2::new(105553116266526, 140737488355368);
+ let r: u64x2 = transmute(vsha512su1q_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
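+ // FRINT32*/FRINT64*: round to an integral value bounded to the 32-/64-bit
+ // integer range. The X forms use the current rounding mode (ties-to-even by
+ // default, so 1.9 -> 2.0 and -1.7 -> -2.0); the Z forms round toward zero.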
+ #[simd_test(enable = "neon,frintts")]
+ unsafe fn test_vrnd32x_f32() {
+ let a: f32x2 = f32x2::new(1.1, 1.9);
+ let e: f32x2 = f32x2::new(1.0, 2.0);
+ let r: f32x2 = transmute(vrnd32x_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,frintts")]
+ unsafe fn test_vrnd32xq_f32() {
+ let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3);
+ let e: f32x4 = f32x4::new(1.0, 2.0, -2.0, -2.0);
+ let r: f32x4 = transmute(vrnd32xq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,frintts")]
+ unsafe fn test_vrnd32z_f32() {
+ let a: f32x2 = f32x2::new(1.1, 1.9);
+ let e: f32x2 = f32x2::new(1.0, 1.0);
+ let r: f32x2 = transmute(vrnd32z_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,frintts")]
+ unsafe fn test_vrnd32zq_f32() {
+ let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3);
+ let e: f32x4 = f32x4::new(1.0, 1.0, -1.0, -2.0);
+ let r: f32x4 = transmute(vrnd32zq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,frintts")]
+ unsafe fn test_vrnd64x_f32() {
+ let a: f32x2 = f32x2::new(1.1, 1.9);
+ let e: f32x2 = f32x2::new(1.0, 2.0);
+ let r: f32x2 = transmute(vrnd64x_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,frintts")]
+ unsafe fn test_vrnd64xq_f32() {
+ let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3);
+ let e: f32x4 = f32x4::new(1.0, 2.0, -2.0, -2.0);
+ let r: f32x4 = transmute(vrnd64xq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,frintts")]
+ unsafe fn test_vrnd64z_f32() {
+ let a: f32x2 = f32x2::new(1.1, 1.9);
+ let e: f32x2 = f32x2::new(1.0, 1.0);
+ let r: f32x2 = transmute(vrnd64z_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,frintts")]
+ unsafe fn test_vrnd64zq_f32() {
+ let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3);
+ let e: f32x4 = f32x4::new(1.0, 1.0, -1.0, -2.0);
+ let r: f32x4 = transmute(vrnd64zq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
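+ // Transpose intrinsics: vtrn1 interleaves the even-numbered lanes of the
+ // two inputs, vtrn2 the odd-numbered lanes.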
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_s8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i8x8 = i8x8::new(0, 1, 4, 5, 8, 9, 12, 13);
+ let r: i8x8 = transmute(vtrn1_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_s8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: i8x16 = i8x16::new(0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29);
+ let r: i8x16 = transmute(vtrn1q_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_s16() {
+ let a: i16x4 = i16x4::new(0, 2, 4, 6);
+ let b: i16x4 = i16x4::new(1, 3, 5, 7);
+ let e: i16x4 = i16x4::new(0, 1, 4, 5);
+ let r: i16x4 = transmute(vtrn1_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_s16() {
+ let a: i16x8 = i16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i16x8 = i16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i16x8 = i16x8::new(0, 1, 4, 5, 8, 9, 12, 13);
+ let r: i16x8 = transmute(vtrn1q_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_s32() {
+ let a: i32x4 = i32x4::new(0, 2, 4, 6);
+ let b: i32x4 = i32x4::new(1, 3, 5, 7);
+ let e: i32x4 = i32x4::new(0, 1, 4, 5);
+ let r: i32x4 = transmute(vtrn1q_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_u8() {
+ let a: u8x8 = u8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: u8x8 = u8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: u8x8 = u8x8::new(0, 1, 4, 5, 8, 9, 12, 13);
+ let r: u8x8 = transmute(vtrn1_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_u8() {
+ let a: u8x16 = u8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: u8x16 = u8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: u8x16 = u8x16::new(0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29);
+ let r: u8x16 = transmute(vtrn1q_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_u16() {
+ let a: u16x4 = u16x4::new(0, 2, 4, 6);
+ let b: u16x4 = u16x4::new(1, 3, 5, 7);
+ let e: u16x4 = u16x4::new(0, 1, 4, 5);
+ let r: u16x4 = transmute(vtrn1_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_u16() {
+ let a: u16x8 = u16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: u16x8 = u16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: u16x8 = u16x8::new(0, 1, 4, 5, 8, 9, 12, 13);
+ let r: u16x8 = transmute(vtrn1q_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_u32() {
+ let a: u32x4 = u32x4::new(0, 2, 4, 6);
+ let b: u32x4 = u32x4::new(1, 3, 5, 7);
+ let e: u32x4 = u32x4::new(0, 1, 4, 5);
+ let r: u32x4 = transmute(vtrn1q_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_p8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i8x8 = i8x8::new(0, 1, 4, 5, 8, 9, 12, 13);
+ let r: i8x8 = transmute(vtrn1_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_p8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: i8x16 = i8x16::new(0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29);
+ let r: i8x16 = transmute(vtrn1q_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_p16() {
+ let a: i16x4 = i16x4::new(0, 2, 4, 6);
+ let b: i16x4 = i16x4::new(1, 3, 5, 7);
+ let e: i16x4 = i16x4::new(0, 1, 4, 5);
+ let r: i16x4 = transmute(vtrn1_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_p16() {
+ let a: i16x8 = i16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i16x8 = i16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i16x8 = i16x8::new(0, 1, 4, 5, 8, 9, 12, 13);
+ let r: i16x8 = transmute(vtrn1q_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_s32() {
+ let a: i32x2 = i32x2::new(0, 2);
+ let b: i32x2 = i32x2::new(1, 3);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vtrn1_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_s64() {
+ let a: i64x2 = i64x2::new(0, 2);
+ let b: i64x2 = i64x2::new(1, 3);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vtrn1q_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_u32() {
+ let a: u32x2 = u32x2::new(0, 2);
+ let b: u32x2 = u32x2::new(1, 3);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vtrn1_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_u64() {
+ let a: u64x2 = u64x2::new(0, 2);
+ let b: u64x2 = u64x2::new(1, 3);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vtrn1q_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_p64() {
+ let a: i64x2 = i64x2::new(0, 2);
+ let b: i64x2 = i64x2::new(1, 3);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vtrn1q_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_f32() {
+ let a: f32x4 = f32x4::new(0., 2., 4., 6.);
+ let b: f32x4 = f32x4::new(1., 3., 5., 7.);
+ let e: f32x4 = f32x4::new(0., 1., 4., 5.);
+ let r: f32x4 = transmute(vtrn1q_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1_f32() {
+ let a: f32x2 = f32x2::new(0., 2.);
+ let b: f32x2 = f32x2::new(1., 3.);
+ let e: f32x2 = f32x2::new(0., 1.);
+ let r: f32x2 = transmute(vtrn1_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn1q_f64() {
+ let a: f64x2 = f64x2::new(0., 2.);
+ let b: f64x2 = f64x2::new(1., 3.);
+ let e: f64x2 = f64x2::new(0., 1.);
+ let r: f64x2 = transmute(vtrn1q_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
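+    // `vtrn2` (TRN2) is the odd-lane counterpart: r[2*i] = a[2*i+1] and
+    // r[2*i+1] = b[2*i+1], giving expected vectors such as (2, 3, 6, 7, ...).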
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_s8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i8x8 = i8x8::new(2, 3, 6, 7, 10, 11, 14, 15);
+ let r: i8x8 = transmute(vtrn2_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_s8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: i8x16 = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
+ let r: i8x16 = transmute(vtrn2q_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_s16() {
+ let a: i16x4 = i16x4::new(0, 2, 4, 6);
+ let b: i16x4 = i16x4::new(1, 3, 5, 7);
+ let e: i16x4 = i16x4::new(2, 3, 6, 7);
+ let r: i16x4 = transmute(vtrn2_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_s16() {
+ let a: i16x8 = i16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i16x8 = i16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i16x8 = i16x8::new(2, 3, 6, 7, 10, 11, 14, 15);
+ let r: i16x8 = transmute(vtrn2q_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_s32() {
+ let a: i32x4 = i32x4::new(0, 2, 4, 6);
+ let b: i32x4 = i32x4::new(1, 3, 5, 7);
+ let e: i32x4 = i32x4::new(2, 3, 6, 7);
+ let r: i32x4 = transmute(vtrn2q_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_u8() {
+ let a: u8x8 = u8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: u8x8 = u8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: u8x8 = u8x8::new(2, 3, 6, 7, 10, 11, 14, 15);
+ let r: u8x8 = transmute(vtrn2_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_u8() {
+ let a: u8x16 = u8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: u8x16 = u8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: u8x16 = u8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
+ let r: u8x16 = transmute(vtrn2q_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_u16() {
+ let a: u16x4 = u16x4::new(0, 2, 4, 6);
+ let b: u16x4 = u16x4::new(1, 3, 5, 7);
+ let e: u16x4 = u16x4::new(2, 3, 6, 7);
+ let r: u16x4 = transmute(vtrn2_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_u16() {
+ let a: u16x8 = u16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: u16x8 = u16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: u16x8 = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15);
+ let r: u16x8 = transmute(vtrn2q_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_u32() {
+ let a: u32x4 = u32x4::new(0, 2, 4, 6);
+ let b: u32x4 = u32x4::new(1, 3, 5, 7);
+ let e: u32x4 = u32x4::new(2, 3, 6, 7);
+ let r: u32x4 = transmute(vtrn2q_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_p8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i8x8 = i8x8::new(2, 3, 6, 7, 10, 11, 14, 15);
+ let r: i8x8 = transmute(vtrn2_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_p8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: i8x16 = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
+ let r: i8x16 = transmute(vtrn2q_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_p16() {
+ let a: i16x4 = i16x4::new(0, 2, 4, 6);
+ let b: i16x4 = i16x4::new(1, 3, 5, 7);
+ let e: i16x4 = i16x4::new(2, 3, 6, 7);
+ let r: i16x4 = transmute(vtrn2_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_p16() {
+ let a: i16x8 = i16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i16x8 = i16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i16x8 = i16x8::new(2, 3, 6, 7, 10, 11, 14, 15);
+ let r: i16x8 = transmute(vtrn2q_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_s32() {
+ let a: i32x2 = i32x2::new(0, 2);
+ let b: i32x2 = i32x2::new(1, 3);
+ let e: i32x2 = i32x2::new(2, 3);
+ let r: i32x2 = transmute(vtrn2_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_s64() {
+ let a: i64x2 = i64x2::new(0, 2);
+ let b: i64x2 = i64x2::new(1, 3);
+ let e: i64x2 = i64x2::new(2, 3);
+ let r: i64x2 = transmute(vtrn2q_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_u32() {
+ let a: u32x2 = u32x2::new(0, 2);
+ let b: u32x2 = u32x2::new(1, 3);
+ let e: u32x2 = u32x2::new(2, 3);
+ let r: u32x2 = transmute(vtrn2_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_u64() {
+ let a: u64x2 = u64x2::new(0, 2);
+ let b: u64x2 = u64x2::new(1, 3);
+ let e: u64x2 = u64x2::new(2, 3);
+ let r: u64x2 = transmute(vtrn2q_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_p64() {
+ let a: i64x2 = i64x2::new(0, 2);
+ let b: i64x2 = i64x2::new(1, 3);
+ let e: i64x2 = i64x2::new(2, 3);
+ let r: i64x2 = transmute(vtrn2q_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_f32() {
+ let a: f32x4 = f32x4::new(0., 2., 4., 6.);
+ let b: f32x4 = f32x4::new(1., 3., 5., 7.);
+ let e: f32x4 = f32x4::new(2., 3., 6., 7.);
+ let r: f32x4 = transmute(vtrn2q_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2_f32() {
+ let a: f32x2 = f32x2::new(0., 2.);
+ let b: f32x2 = f32x2::new(1., 3.);
+ let e: f32x2 = f32x2::new(2., 3.);
+ let r: f32x2 = transmute(vtrn2_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn2q_f64() {
+ let a: f64x2 = f64x2::new(0., 2.);
+ let b: f64x2 = f64x2::new(1., 3.);
+ let e: f64x2 = f64x2::new(2., 3.);
+ let r: f64x2 = transmute(vtrn2q_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
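+    // `vzip1` (ZIP1) interleaves the low halves of the two inputs:
+    // r = (a[0], b[0], a[1], b[1], ...), so these tests expect 0, 1, 2, 3, ...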
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_s8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vzip1_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_s8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vzip1q_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_s16() {
+ let a: i16x4 = i16x4::new(0, 2, 4, 6);
+ let b: i16x4 = i16x4::new(1, 3, 5, 7);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vzip1_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_s16() {
+ let a: i16x8 = i16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i16x8 = i16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vzip1q_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_s32() {
+ let a: i32x2 = i32x2::new(0, 2);
+ let b: i32x2 = i32x2::new(1, 3);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vzip1_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_s32() {
+ let a: i32x4 = i32x4::new(0, 2, 4, 6);
+ let b: i32x4 = i32x4::new(1, 3, 5, 7);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vzip1q_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_s64() {
+ let a: i64x2 = i64x2::new(0, 2);
+ let b: i64x2 = i64x2::new(1, 3);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vzip1q_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_u8() {
+ let a: u8x8 = u8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: u8x8 = u8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u8x8 = transmute(vzip1_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_u8() {
+ let a: u8x16 = u8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: u8x16 = u8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vzip1q_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_u16() {
+ let a: u16x4 = u16x4::new(0, 2, 4, 6);
+ let b: u16x4 = u16x4::new(1, 3, 5, 7);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vzip1_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_u16() {
+ let a: u16x8 = u16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: u16x8 = u16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vzip1q_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_u32() {
+ let a: u32x2 = u32x2::new(0, 2);
+ let b: u32x2 = u32x2::new(1, 3);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vzip1_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_u32() {
+ let a: u32x4 = u32x4::new(0, 2, 4, 6);
+ let b: u32x4 = u32x4::new(1, 3, 5, 7);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vzip1q_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_u64() {
+ let a: u64x2 = u64x2::new(0, 2);
+ let b: u64x2 = u64x2::new(1, 3);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vzip1q_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_p8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vzip1_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_p8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vzip1q_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_p16() {
+ let a: i16x4 = i16x4::new(0, 2, 4, 6);
+ let b: i16x4 = i16x4::new(1, 3, 5, 7);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vzip1_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_p16() {
+ let a: i16x8 = i16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i16x8 = i16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vzip1q_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_p64() {
+ let a: i64x2 = i64x2::new(0, 2);
+ let b: i64x2 = i64x2::new(1, 3);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vzip1q_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1_f32() {
+ let a: f32x2 = f32x2::new(0., 2.);
+ let b: f32x2 = f32x2::new(1., 3.);
+ let e: f32x2 = f32x2::new(0., 1.);
+ let r: f32x2 = transmute(vzip1_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_f32() {
+ let a: f32x4 = f32x4::new(0., 2., 4., 6.);
+ let b: f32x4 = f32x4::new(1., 3., 5., 7.);
+ let e: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let r: f32x4 = transmute(vzip1q_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip1q_f64() {
+ let a: f64x2 = f64x2::new(0., 2.);
+ let b: f64x2 = f64x2::new(1., 3.);
+ let e: f64x2 = f64x2::new(0., 1.);
+ let r: f64x2 = transmute(vzip1q_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
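+    // `vzip2` (ZIP2) interleaves the high halves instead:
+    // r = (a[n/2], b[n/2], a[n/2 + 1], b[n/2 + 1], ...).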
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_s8() {
+ let a: i8x8 = i8x8::new(0, 16, 16, 18, 16, 18, 20, 22);
+ let b: i8x8 = i8x8::new(1, 17, 17, 19, 17, 19, 21, 23);
+ let e: i8x8 = i8x8::new(16, 17, 18, 19, 20, 21, 22, 23);
+ let r: i8x8 = transmute(vzip2_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_s8() {
+ let a: i8x16 = i8x16::new(0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: i8x16 = i8x16::new(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r: i8x16 = transmute(vzip2q_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_s16() {
+ let a: i16x4 = i16x4::new(0, 16, 16, 18);
+ let b: i16x4 = i16x4::new(1, 17, 17, 19);
+ let e: i16x4 = i16x4::new(16, 17, 18, 19);
+ let r: i16x4 = transmute(vzip2_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_s16() {
+ let a: i16x8 = i16x8::new(0, 16, 16, 18, 16, 18, 20, 22);
+ let b: i16x8 = i16x8::new(1, 17, 17, 19, 17, 19, 21, 23);
+ let e: i16x8 = i16x8::new(16, 17, 18, 19, 20, 21, 22, 23);
+ let r: i16x8 = transmute(vzip2q_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_s32() {
+ let a: i32x2 = i32x2::new(0, 16);
+ let b: i32x2 = i32x2::new(1, 17);
+ let e: i32x2 = i32x2::new(16, 17);
+ let r: i32x2 = transmute(vzip2_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_s32() {
+ let a: i32x4 = i32x4::new(0, 16, 16, 18);
+ let b: i32x4 = i32x4::new(1, 17, 17, 19);
+ let e: i32x4 = i32x4::new(16, 17, 18, 19);
+ let r: i32x4 = transmute(vzip2q_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_s64() {
+ let a: i64x2 = i64x2::new(0, 16);
+ let b: i64x2 = i64x2::new(1, 17);
+ let e: i64x2 = i64x2::new(16, 17);
+ let r: i64x2 = transmute(vzip2q_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_u8() {
+ let a: u8x8 = u8x8::new(0, 16, 16, 18, 16, 18, 20, 22);
+ let b: u8x8 = u8x8::new(1, 17, 17, 19, 17, 19, 21, 23);
+ let e: u8x8 = u8x8::new(16, 17, 18, 19, 20, 21, 22, 23);
+ let r: u8x8 = transmute(vzip2_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_u8() {
+ let a: u8x16 = u8x16::new(0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: u8x16 = u8x16::new(1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: u8x16 = u8x16::new(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r: u8x16 = transmute(vzip2q_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_u16() {
+ let a: u16x4 = u16x4::new(0, 16, 16, 18);
+ let b: u16x4 = u16x4::new(1, 17, 17, 19);
+ let e: u16x4 = u16x4::new(16, 17, 18, 19);
+ let r: u16x4 = transmute(vzip2_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_u16() {
+ let a: u16x8 = u16x8::new(0, 16, 16, 18, 16, 18, 20, 22);
+ let b: u16x8 = u16x8::new(1, 17, 17, 19, 17, 19, 21, 23);
+ let e: u16x8 = u16x8::new(16, 17, 18, 19, 20, 21, 22, 23);
+ let r: u16x8 = transmute(vzip2q_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_u32() {
+ let a: u32x2 = u32x2::new(0, 16);
+ let b: u32x2 = u32x2::new(1, 17);
+ let e: u32x2 = u32x2::new(16, 17);
+ let r: u32x2 = transmute(vzip2_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_u32() {
+ let a: u32x4 = u32x4::new(0, 16, 16, 18);
+ let b: u32x4 = u32x4::new(1, 17, 17, 19);
+ let e: u32x4 = u32x4::new(16, 17, 18, 19);
+ let r: u32x4 = transmute(vzip2q_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_u64() {
+ let a: u64x2 = u64x2::new(0, 16);
+ let b: u64x2 = u64x2::new(1, 17);
+ let e: u64x2 = u64x2::new(16, 17);
+ let r: u64x2 = transmute(vzip2q_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_p8() {
+ let a: i8x8 = i8x8::new(0, 16, 16, 18, 16, 18, 20, 22);
+ let b: i8x8 = i8x8::new(1, 17, 17, 19, 17, 19, 21, 23);
+ let e: i8x8 = i8x8::new(16, 17, 18, 19, 20, 21, 22, 23);
+ let r: i8x8 = transmute(vzip2_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_p8() {
+ let a: i8x16 = i8x16::new(0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: i8x16 = i8x16::new(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r: i8x16 = transmute(vzip2q_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_p16() {
+ let a: i16x4 = i16x4::new(0, 16, 16, 18);
+ let b: i16x4 = i16x4::new(1, 17, 17, 19);
+ let e: i16x4 = i16x4::new(16, 17, 18, 19);
+ let r: i16x4 = transmute(vzip2_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_p16() {
+ let a: i16x8 = i16x8::new(0, 16, 16, 18, 16, 18, 20, 22);
+ let b: i16x8 = i16x8::new(1, 17, 17, 19, 17, 19, 21, 23);
+ let e: i16x8 = i16x8::new(16, 17, 18, 19, 20, 21, 22, 23);
+ let r: i16x8 = transmute(vzip2q_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_p64() {
+ let a: i64x2 = i64x2::new(0, 16);
+ let b: i64x2 = i64x2::new(1, 17);
+ let e: i64x2 = i64x2::new(16, 17);
+ let r: i64x2 = transmute(vzip2q_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2_f32() {
+ let a: f32x2 = f32x2::new(0., 8.);
+ let b: f32x2 = f32x2::new(1., 9.);
+ let e: f32x2 = f32x2::new(8., 9.);
+ let r: f32x2 = transmute(vzip2_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_f32() {
+ let a: f32x4 = f32x4::new(0., 8., 8., 10.);
+ let b: f32x4 = f32x4::new(1., 9., 9., 11.);
+ let e: f32x4 = f32x4::new(8., 9., 10., 11.);
+ let r: f32x4 = transmute(vzip2q_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip2q_f64() {
+ let a: f64x2 = f64x2::new(0., 8.);
+ let b: f64x2 = f64x2::new(1., 9.);
+ let e: f64x2 = f64x2::new(8., 9.);
+ let r: f64x2 = transmute(vzip2q_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
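+    // `vuzp1` (UZP1) concatenates the even-indexed lanes of `a` with the
+    // even-indexed lanes of `b`: r = (a[0], a[2], ..., b[0], b[2], ...).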
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_s8() {
+ let a: i8x8 = i8x8::new(1, 0, 2, 0, 2, 0, 3, 0);
+ let b: i8x8 = i8x8::new(2, 0, 3, 0, 7, 0, 8, 0);
+ let e: i8x8 = i8x8::new(1, 2, 2, 3, 2, 3, 7, 8);
+ let r: i8x8 = transmute(vuzp1_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_s8() {
+ let a: i8x16 = i8x16::new(1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0);
+ let b: i8x16 = i8x16::new(2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0);
+ let e: i8x16 = i8x16::new(1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vuzp1q_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_s16() {
+ let a: i16x4 = i16x4::new(1, 0, 2, 0);
+ let b: i16x4 = i16x4::new(2, 0, 3, 0);
+ let e: i16x4 = i16x4::new(1, 2, 2, 3);
+ let r: i16x4 = transmute(vuzp1_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_s16() {
+ let a: i16x8 = i16x8::new(1, 0, 2, 0, 2, 0, 3, 0);
+ let b: i16x8 = i16x8::new(2, 0, 3, 0, 7, 0, 8, 0);
+ let e: i16x8 = i16x8::new(1, 2, 2, 3, 2, 3, 7, 8);
+ let r: i16x8 = transmute(vuzp1q_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_s32() {
+ let a: i32x4 = i32x4::new(1, 0, 2, 0);
+ let b: i32x4 = i32x4::new(2, 0, 3, 0);
+ let e: i32x4 = i32x4::new(1, 2, 2, 3);
+ let r: i32x4 = transmute(vuzp1q_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_u8() {
+ let a: u8x8 = u8x8::new(1, 0, 2, 0, 2, 0, 3, 0);
+ let b: u8x8 = u8x8::new(2, 0, 3, 0, 7, 0, 8, 0);
+ let e: u8x8 = u8x8::new(1, 2, 2, 3, 2, 3, 7, 8);
+ let r: u8x8 = transmute(vuzp1_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_u8() {
+ let a: u8x16 = u8x16::new(1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0);
+ let b: u8x16 = u8x16::new(2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0);
+ let e: u8x16 = u8x16::new(1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vuzp1q_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_u16() {
+ let a: u16x4 = u16x4::new(1, 0, 2, 0);
+ let b: u16x4 = u16x4::new(2, 0, 3, 0);
+ let e: u16x4 = u16x4::new(1, 2, 2, 3);
+ let r: u16x4 = transmute(vuzp1_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_u16() {
+ let a: u16x8 = u16x8::new(1, 0, 2, 0, 2, 0, 3, 0);
+ let b: u16x8 = u16x8::new(2, 0, 3, 0, 7, 0, 8, 0);
+ let e: u16x8 = u16x8::new(1, 2, 2, 3, 2, 3, 7, 8);
+ let r: u16x8 = transmute(vuzp1q_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_u32() {
+ let a: u32x4 = u32x4::new(1, 0, 2, 0);
+ let b: u32x4 = u32x4::new(2, 0, 3, 0);
+ let e: u32x4 = u32x4::new(1, 2, 2, 3);
+ let r: u32x4 = transmute(vuzp1q_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_p8() {
+ let a: i8x8 = i8x8::new(1, 0, 2, 0, 2, 0, 3, 0);
+ let b: i8x8 = i8x8::new(2, 0, 3, 0, 7, 0, 8, 0);
+ let e: i8x8 = i8x8::new(1, 2, 2, 3, 2, 3, 7, 8);
+ let r: i8x8 = transmute(vuzp1_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_p8() {
+ let a: i8x16 = i8x16::new(1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0);
+ let b: i8x16 = i8x16::new(2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0);
+ let e: i8x16 = i8x16::new(1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vuzp1q_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_p16() {
+ let a: i16x4 = i16x4::new(1, 0, 2, 0);
+ let b: i16x4 = i16x4::new(2, 0, 3, 0);
+ let e: i16x4 = i16x4::new(1, 2, 2, 3);
+ let r: i16x4 = transmute(vuzp1_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_p16() {
+ let a: i16x8 = i16x8::new(1, 0, 2, 0, 2, 0, 3, 0);
+ let b: i16x8 = i16x8::new(2, 0, 3, 0, 7, 0, 8, 0);
+ let e: i16x8 = i16x8::new(1, 2, 2, 3, 2, 3, 7, 8);
+ let r: i16x8 = transmute(vuzp1q_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_s32() {
+ let a: i32x2 = i32x2::new(1, 0);
+ let b: i32x2 = i32x2::new(2, 0);
+ let e: i32x2 = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vuzp1_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_s64() {
+ let a: i64x2 = i64x2::new(1, 0);
+ let b: i64x2 = i64x2::new(2, 0);
+ let e: i64x2 = i64x2::new(1, 2);
+ let r: i64x2 = transmute(vuzp1q_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_u32() {
+ let a: u32x2 = u32x2::new(1, 0);
+ let b: u32x2 = u32x2::new(2, 0);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vuzp1_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_u64() {
+ let a: u64x2 = u64x2::new(1, 0);
+ let b: u64x2 = u64x2::new(2, 0);
+ let e: u64x2 = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vuzp1q_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_p64() {
+ let a: i64x2 = i64x2::new(1, 0);
+ let b: i64x2 = i64x2::new(2, 0);
+ let e: i64x2 = i64x2::new(1, 2);
+ let r: i64x2 = transmute(vuzp1q_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_f32() {
+ let a: f32x4 = f32x4::new(0., 8., 1., 9.);
+ let b: f32x4 = f32x4::new(1., 10., 3., 11.);
+ let e: f32x4 = f32x4::new(0., 1., 1., 3.);
+ let r: f32x4 = transmute(vuzp1q_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1_f32() {
+ let a: f32x2 = f32x2::new(0., 8.);
+ let b: f32x2 = f32x2::new(1., 10.);
+ let e: f32x2 = f32x2::new(0., 1.);
+ let r: f32x2 = transmute(vuzp1_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp1q_f64() {
+ let a: f64x2 = f64x2::new(0., 8.);
+ let b: f64x2 = f64x2::new(1., 10.);
+ let e: f64x2 = f64x2::new(0., 1.);
+ let r: f64x2 = transmute(vuzp1q_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
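+    // `vuzp2` (UZP2) does the same with the odd-indexed lanes:
+    // r = (a[1], a[3], ..., b[1], b[3], ...).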
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_s8() {
+ let a: i8x8 = i8x8::new(0, 17, 0, 18, 0, 18, 0, 19);
+ let b: i8x8 = i8x8::new(0, 18, 0, 19, 0, 23, 0, 24);
+ let e: i8x8 = i8x8::new(17, 18, 18, 19, 18, 19, 23, 24);
+ let r: i8x8 = transmute(vuzp2_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_s8() {
+ let a: i8x16 = i8x16::new(0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24);
+ let b: i8x16 = i8x16::new(0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32);
+ let e: i8x16 = i8x16::new(17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32);
+ let r: i8x16 = transmute(vuzp2q_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_s16() {
+ let a: i16x4 = i16x4::new(0, 17, 0, 18);
+ let b: i16x4 = i16x4::new(0, 18, 0, 19);
+ let e: i16x4 = i16x4::new(17, 18, 18, 19);
+ let r: i16x4 = transmute(vuzp2_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_s16() {
+ let a: i16x8 = i16x8::new(0, 17, 0, 18, 0, 18, 0, 19);
+ let b: i16x8 = i16x8::new(0, 18, 0, 19, 0, 23, 0, 24);
+ let e: i16x8 = i16x8::new(17, 18, 18, 19, 18, 19, 23, 24);
+ let r: i16x8 = transmute(vuzp2q_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_s32() {
+ let a: i32x4 = i32x4::new(0, 17, 0, 18);
+ let b: i32x4 = i32x4::new(0, 18, 0, 19);
+ let e: i32x4 = i32x4::new(17, 18, 18, 19);
+ let r: i32x4 = transmute(vuzp2q_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_u8() {
+ let a: u8x8 = u8x8::new(0, 17, 0, 18, 0, 18, 0, 19);
+ let b: u8x8 = u8x8::new(0, 18, 0, 19, 0, 23, 0, 24);
+ let e: u8x8 = u8x8::new(17, 18, 18, 19, 18, 19, 23, 24);
+ let r: u8x8 = transmute(vuzp2_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_u8() {
+ let a: u8x16 = u8x16::new(0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24);
+ let b: u8x16 = u8x16::new(0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32);
+ let e: u8x16 = u8x16::new(17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32);
+ let r: u8x16 = transmute(vuzp2q_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_u16() {
+ let a: u16x4 = u16x4::new(0, 17, 0, 18);
+ let b: u16x4 = u16x4::new(0, 18, 0, 19);
+ let e: u16x4 = u16x4::new(17, 18, 18, 19);
+ let r: u16x4 = transmute(vuzp2_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_u16() {
+ let a: u16x8 = u16x8::new(0, 17, 0, 18, 0, 18, 0, 19);
+ let b: u16x8 = u16x8::new(0, 18, 0, 19, 0, 23, 0, 24);
+ let e: u16x8 = u16x8::new(17, 18, 18, 19, 18, 19, 23, 24);
+ let r: u16x8 = transmute(vuzp2q_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_u32() {
+ let a: u32x4 = u32x4::new(0, 17, 0, 18);
+ let b: u32x4 = u32x4::new(0, 18, 0, 19);
+ let e: u32x4 = u32x4::new(17, 18, 18, 19);
+ let r: u32x4 = transmute(vuzp2q_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_p8() {
+ let a: i8x8 = i8x8::new(0, 17, 0, 18, 0, 18, 0, 19);
+ let b: i8x8 = i8x8::new(0, 18, 0, 19, 0, 23, 0, 24);
+ let e: i8x8 = i8x8::new(17, 18, 18, 19, 18, 19, 23, 24);
+ let r: i8x8 = transmute(vuzp2_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_p8() {
+ let a: i8x16 = i8x16::new(0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24);
+ let b: i8x16 = i8x16::new(0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32);
+ let e: i8x16 = i8x16::new(17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32);
+ let r: i8x16 = transmute(vuzp2q_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_p16() {
+ let a: i16x4 = i16x4::new(0, 17, 0, 18);
+ let b: i16x4 = i16x4::new(0, 18, 0, 19);
+ let e: i16x4 = i16x4::new(17, 18, 18, 19);
+ let r: i16x4 = transmute(vuzp2_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_p16() {
+ let a: i16x8 = i16x8::new(0, 17, 0, 18, 0, 18, 0, 19);
+ let b: i16x8 = i16x8::new(0, 18, 0, 19, 0, 23, 0, 24);
+ let e: i16x8 = i16x8::new(17, 18, 18, 19, 18, 19, 23, 24);
+ let r: i16x8 = transmute(vuzp2q_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_s32() {
+ let a: i32x2 = i32x2::new(0, 17);
+ let b: i32x2 = i32x2::new(0, 18);
+ let e: i32x2 = i32x2::new(17, 18);
+ let r: i32x2 = transmute(vuzp2_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_s64() {
+ let a: i64x2 = i64x2::new(0, 17);
+ let b: i64x2 = i64x2::new(0, 18);
+ let e: i64x2 = i64x2::new(17, 18);
+ let r: i64x2 = transmute(vuzp2q_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_u32() {
+ let a: u32x2 = u32x2::new(0, 17);
+ let b: u32x2 = u32x2::new(0, 18);
+ let e: u32x2 = u32x2::new(17, 18);
+ let r: u32x2 = transmute(vuzp2_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_u64() {
+ let a: u64x2 = u64x2::new(0, 17);
+ let b: u64x2 = u64x2::new(0, 18);
+ let e: u64x2 = u64x2::new(17, 18);
+ let r: u64x2 = transmute(vuzp2q_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_p64() {
+ let a: i64x2 = i64x2::new(0, 17);
+ let b: i64x2 = i64x2::new(0, 18);
+ let e: i64x2 = i64x2::new(17, 18);
+ let r: i64x2 = transmute(vuzp2q_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_f32() {
+ let a: f32x4 = f32x4::new(0., 8., 1., 9.);
+ let b: f32x4 = f32x4::new(2., 9., 3., 11.);
+ let e: f32x4 = f32x4::new(8., 9., 9., 11.);
+ let r: f32x4 = transmute(vuzp2q_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2_f32() {
+ let a: f32x2 = f32x2::new(0., 8.);
+ let b: f32x2 = f32x2::new(2., 9.);
+ let e: f32x2 = f32x2::new(8., 9.);
+ let r: f32x2 = transmute(vuzp2_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp2q_f64() {
+ let a: f64x2 = f64x2::new(0., 8.);
+ let b: f64x2 = f64x2::new(2., 9.);
+ let e: f64x2 = f64x2::new(8., 9.);
+ let r: f64x2 = transmute(vuzp2q_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
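+    // `vabal_high_*` is a widening absolute-difference-and-accumulate over the
+    // high halves: r[i] = a[i] + |b[n/2 + i] - c[n/2 + i]|. In the u8 test the
+    // per-lane differences run 11 down to 4 against a = 9..=16, so every lane
+    // accumulates to exactly 20.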
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_high_u8() {
+ let a: u16x8 = u16x8::new(9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let c: u8x16 = u8x16::new(10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12);
+ let e: u16x8 = u16x8::new(20, 20, 20, 20, 20, 20, 20, 20);
+ let r: u16x8 = transmute(vabal_high_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_high_u16() {
+ let a: u32x4 = u32x4::new(9, 10, 11, 12);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 9, 10, 11, 12);
+ let c: u16x8 = u16x8::new(10, 10, 10, 10, 20, 0, 2, 4);
+ let e: u32x4 = u32x4::new(20, 20, 20, 20);
+ let r: u32x4 = transmute(vabal_high_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_high_u32() {
+ let a: u64x2 = u64x2::new(15, 16);
+ let b: u32x4 = u32x4::new(1, 2, 15, 16);
+ let c: u32x4 = u32x4::new(10, 10, 10, 12);
+ let e: u64x2 = u64x2::new(20, 20);
+ let r: u64x2 = transmute(vabal_high_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_high_s8() {
+ let a: i16x8 = i16x8::new(9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let c: i8x16 = i8x16::new(10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12);
+ let e: i16x8 = i16x8::new(20, 20, 20, 20, 20, 20, 20, 20);
+ let r: i16x8 = transmute(vabal_high_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_high_s16() {
+ let a: i32x4 = i32x4::new(9, 10, 11, 12);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 9, 10, 11, 12);
+ let c: i16x8 = i16x8::new(10, 10, 10, 10, 20, 0, 2, 4);
+ let e: i32x4 = i32x4::new(20, 20, 20, 20);
+ let r: i32x4 = transmute(vabal_high_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_high_s32() {
+ let a: i64x2 = i64x2::new(15, 16);
+ let b: i32x4 = i32x4::new(1, 2, 15, 16);
+ let c: i32x4 = i32x4::new(10, 10, 10, 12);
+ let e: i64x2 = i64x2::new(20, 20);
+ let r: i64x2 = transmute(vabal_high_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
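+    // `vqabs` is the saturating absolute value: where a plain `abs` of
+    // `i64::MIN` would overflow, `vqabs` clamps the result to `i64::MAX`
+    // (0x7FFF_FFFF_FFFF_FFFF), which is exactly what these tests verify.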
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabs_s64() {
+ let a: i64x1 = i64x1::new(-9223372036854775808);
+ let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let r: i64x1 = transmute(vqabs_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabsq_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, -7);
+ let e: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 7);
+ let r: i64x2 = transmute(vqabsq_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabsb_s8() {
+ let a: i8 = -7;
+ let e: i8 = 7;
+ let r: i8 = transmute(vqabsb_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabsh_s16() {
+ let a: i16 = -7;
+ let e: i16 = 7;
+ let r: i16 = transmute(vqabsh_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabss_s32() {
+ let a: i32 = -7;
+ let e: i32 = 7;
+ let r: i32 = transmute(vqabss_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabsd_s64() {
+ let a: i64 = -7;
+ let e: i64 = 7;
+ let r: i64 = transmute(vqabsd_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
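+    // `vslid_n`/`vsrid_n` are shift-and-insert operations. SLI shifts `b`
+    // left by N and keeps the low N bits of `a`: 2042 << 2 = 8168, plus
+    // 333 & 0b11 = 1, gives 8169. SRI shifts `b` right by N and keeps the
+    // high N bits of `a`: 2042 >> 2 = 510, and the top two bits of 333 as a
+    // 64-bit value are 0, so the result is 510.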
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vslid_n_s64() {
+ let a: i64 = 333;
+ let b: i64 = 2042;
+ let e: i64 = 8169;
+ let r: i64 = transmute(vslid_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vslid_n_u64() {
+ let a: u64 = 333;
+ let b: u64 = 2042;
+ let e: u64 = 8169;
+ let r: u64 = transmute(vslid_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsrid_n_s64() {
+ let a: i64 = 333;
+ let b: i64 = 2042;
+ let e: i64 = 510;
+ let r: i64 = transmute(vsrid_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsrid_n_u64() {
+ let a: u64 = 333;
+ let b: u64 = 2042;
+ let e: u64 = 510;
+ let r: u64 = transmute(vsrid_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
new file mode 100644
index 000000000..65ba527ee
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
@@ -0,0 +1,5440 @@
+//! ARMv8 Advanced SIMD (ASIMD) intrinsics
+
+#![allow(non_camel_case_types)]
+
+#[rustfmt::skip]
+mod generated;
+#[rustfmt::skip]
+pub use self::generated::*;
+
+// FIXME: replace neon with asimd
+
+use crate::{
+ core_arch::{arm_shared::*, simd::*, simd_llvm::*},
+ hint::unreachable_unchecked,
+ mem::{transmute, zeroed},
+ ptr::{read_unaligned, write_unaligned},
+};
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+types! {
+ /// ARM-specific 64-bit wide vector of one packed `f64`.
+ #[stable(feature = "neon_intrinsics", since = "1.59.0")]
+ pub struct float64x1_t(f64); // FIXME: check this!
+ /// ARM-specific 128-bit wide vector of two packed `f64`.
+ #[stable(feature = "neon_intrinsics", since = "1.59.0")]
+ pub struct float64x2_t(f64, f64);
+}
+
+/// ARM-specific type containing two `float64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub struct float64x1x2_t(pub float64x1_t, pub float64x1_t);
+/// ARM-specific type containing three `float64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub struct float64x1x3_t(pub float64x1_t, pub float64x1_t, pub float64x1_t);
+/// ARM-specific type containing four `float64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub struct float64x1x4_t(
+ pub float64x1_t,
+ pub float64x1_t,
+ pub float64x1_t,
+ pub float64x1_t,
+);
+
+/// ARM-specific type containing two `float64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub struct float64x2x2_t(pub float64x2_t, pub float64x2_t);
+/// ARM-specific type containing three `float64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub struct float64x2x3_t(pub float64x2_t, pub float64x2_t, pub float64x2_t);
+/// ARM-specific type containing four `float64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub struct float64x2x4_t(
+ pub float64x2_t,
+ pub float64x2_t,
+ pub float64x2_t,
+ pub float64x2_t,
+);
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ // absolute value
+ #[link_name = "llvm.aarch64.neon.abs.i64"]
+ fn vabsd_s64_(a: i64) -> i64;
+ #[link_name = "llvm.aarch64.neon.abs.v1i64"]
+ fn vabs_s64_(a: int64x1_t) -> int64x1_t;
+ #[link_name = "llvm.aarch64.neon.abs.v2i64"]
+ fn vabsq_s64_(a: int64x2_t) -> int64x2_t;
+
+ #[link_name = "llvm.aarch64.neon.suqadd.v8i8"]
+ fn vuqadd_s8_(a: int8x8_t, b: uint8x8_t) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.suqadd.v16i8"]
+ fn vuqaddq_s8_(a: int8x16_t, b: uint8x16_t) -> int8x16_t;
+ #[link_name = "llvm.aarch64.neon.suqadd.v4i16"]
+ fn vuqadd_s16_(a: int16x4_t, b: uint16x4_t) -> int16x4_t;
+ #[link_name = "llvm.aarch64.neon.suqadd.v8i16"]
+ fn vuqaddq_s16_(a: int16x8_t, b: uint16x8_t) -> int16x8_t;
+ #[link_name = "llvm.aarch64.neon.suqadd.v2i32"]
+ fn vuqadd_s32_(a: int32x2_t, b: uint32x2_t) -> int32x2_t;
+ #[link_name = "llvm.aarch64.neon.suqadd.v4i32"]
+ fn vuqaddq_s32_(a: int32x4_t, b: uint32x4_t) -> int32x4_t;
+ #[link_name = "llvm.aarch64.neon.suqadd.v1i64"]
+ fn vuqadd_s64_(a: int64x1_t, b: uint64x1_t) -> int64x1_t;
+ #[link_name = "llvm.aarch64.neon.suqadd.v2i64"]
+ fn vuqaddq_s64_(a: int64x2_t, b: uint64x2_t) -> int64x2_t;
+
+ #[link_name = "llvm.aarch64.neon.usqadd.v8i8"]
+ fn vsqadd_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+ #[link_name = "llvm.aarch64.neon.usqadd.v16i8"]
+ fn vsqaddq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+ #[link_name = "llvm.aarch64.neon.usqadd.v4i16"]
+ fn vsqadd_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+ #[link_name = "llvm.aarch64.neon.usqadd.v8i16"]
+ fn vsqaddq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+ #[link_name = "llvm.aarch64.neon.usqadd.v2i32"]
+ fn vsqadd_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+ #[link_name = "llvm.aarch64.neon.usqadd.v4i32"]
+ fn vsqaddq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
+ #[link_name = "llvm.aarch64.neon.usqadd.v1i64"]
+ fn vsqadd_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+ #[link_name = "llvm.aarch64.neon.usqadd.v2i64"]
+ fn vsqaddq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
+
+ #[link_name = "llvm.aarch64.neon.addp.v8i16"]
+ fn vpaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ #[link_name = "llvm.aarch64.neon.addp.v4i32"]
+ fn vpaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ #[link_name = "llvm.aarch64.neon.addp.v2i64"]
+ fn vpaddq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+ #[link_name = "llvm.aarch64.neon.addp.v16i8"]
+ fn vpaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.saddv.i32.v4i16"]
+ fn vaddv_s16_(a: int16x4_t) -> i16;
+ #[link_name = "llvm.aarch64.neon.saddv.i32.v2i32"]
+ fn vaddv_s32_(a: int32x2_t) -> i32;
+ #[link_name = "llvm.aarch64.neon.saddv.i32.v8i8"]
+ fn vaddv_s8_(a: int8x8_t) -> i8;
+ #[link_name = "llvm.aarch64.neon.uaddv.i32.v4i16"]
+ fn vaddv_u16_(a: uint16x4_t) -> u16;
+ #[link_name = "llvm.aarch64.neon.uaddv.i32.v2i32"]
+ fn vaddv_u32_(a: uint32x2_t) -> u32;
+ #[link_name = "llvm.aarch64.neon.uaddv.i32.v8i8"]
+ fn vaddv_u8_(a: uint8x8_t) -> u8;
+ #[link_name = "llvm.aarch64.neon.saddv.i32.v8i16"]
+ fn vaddvq_s16_(a: int16x8_t) -> i16;
+ #[link_name = "llvm.aarch64.neon.saddv.i32.v4i32"]
+ fn vaddvq_s32_(a: int32x4_t) -> i32;
+ #[link_name = "llvm.aarch64.neon.saddv.i32.v16i8"]
+ fn vaddvq_s8_(a: int8x16_t) -> i8;
+ #[link_name = "llvm.aarch64.neon.uaddv.i32.v8i16"]
+ fn vaddvq_u16_(a: uint16x8_t) -> u16;
+ #[link_name = "llvm.aarch64.neon.uaddv.i32.v4i32"]
+ fn vaddvq_u32_(a: uint32x4_t) -> u32;
+ #[link_name = "llvm.aarch64.neon.uaddv.i32.v16i8"]
+ fn vaddvq_u8_(a: uint8x16_t) -> u8;
+ #[link_name = "llvm.aarch64.neon.saddv.i64.v2i64"]
+ fn vaddvq_s64_(a: int64x2_t) -> i64;
+ #[link_name = "llvm.aarch64.neon.uaddv.i64.v2i64"]
+ fn vaddvq_u64_(a: uint64x2_t) -> u64;
+
+ #[link_name = "llvm.aarch64.neon.saddlv.i32.v8i8"]
+ fn vaddlv_s8_(a: int8x8_t) -> i32;
+ #[link_name = "llvm.aarch64.neon.uaddlv.i32.v8i8"]
+ fn vaddlv_u8_(a: uint8x8_t) -> u32;
+ #[link_name = "llvm.aarch64.neon.saddlv.i32.v16i8"]
+ fn vaddlvq_s8_(a: int8x16_t) -> i32;
+ #[link_name = "llvm.aarch64.neon.uaddlv.i32.v16i8"]
+ fn vaddlvq_u8_(a: uint8x16_t) -> u32;
+
+ #[link_name = "llvm.aarch64.neon.smaxv.i8.v8i8"]
+ fn vmaxv_s8_(a: int8x8_t) -> i8;
+ #[link_name = "llvm.aarch64.neon.smaxv.i8.6i8"]
+ fn vmaxvq_s8_(a: int8x16_t) -> i8;
+ #[link_name = "llvm.aarch64.neon.smaxv.i16.v4i16"]
+ fn vmaxv_s16_(a: int16x4_t) -> i16;
+ #[link_name = "llvm.aarch64.neon.smaxv.i16.v8i16"]
+ fn vmaxvq_s16_(a: int16x8_t) -> i16;
+ #[link_name = "llvm.aarch64.neon.smaxv.i32.v2i32"]
+ fn vmaxv_s32_(a: int32x2_t) -> i32;
+ #[link_name = "llvm.aarch64.neon.smaxv.i32.v4i32"]
+ fn vmaxvq_s32_(a: int32x4_t) -> i32;
+
+ #[link_name = "llvm.aarch64.neon.umaxv.i8.v8i8"]
+ fn vmaxv_u8_(a: uint8x8_t) -> u8;
+ #[link_name = "llvm.aarch64.neon.umaxv.i8.6i8"]
+ fn vmaxvq_u8_(a: uint8x16_t) -> u8;
+ #[link_name = "llvm.aarch64.neon.umaxv.i16.v4i16"]
+ fn vmaxv_u16_(a: uint16x4_t) -> u16;
+ #[link_name = "llvm.aarch64.neon.umaxv.i16.v8i16"]
+ fn vmaxvq_u16_(a: uint16x8_t) -> u16;
+ #[link_name = "llvm.aarch64.neon.umaxv.i32.v2i32"]
+ fn vmaxv_u32_(a: uint32x2_t) -> u32;
+ #[link_name = "llvm.aarch64.neon.umaxv.i32.v4i32"]
+ fn vmaxvq_u32_(a: uint32x4_t) -> u32;
+
+ #[link_name = "llvm.aarch64.neon.fmaxv.f32.v2f32"]
+ fn vmaxv_f32_(a: float32x2_t) -> f32;
+ #[link_name = "llvm.aarch64.neon.fmaxv.f32.v4f32"]
+ fn vmaxvq_f32_(a: float32x4_t) -> f32;
+ #[link_name = "llvm.aarch64.neon.fmaxv.f64.v2f64"]
+ fn vmaxvq_f64_(a: float64x2_t) -> f64;
+
+ #[link_name = "llvm.aarch64.neon.sminv.i8.v8i8"]
+ fn vminv_s8_(a: int8x8_t) -> i8;
+ #[link_name = "llvm.aarch64.neon.sminv.i8.6i8"]
+ fn vminvq_s8_(a: int8x16_t) -> i8;
+ #[link_name = "llvm.aarch64.neon.sminv.i16.v4i16"]
+ fn vminv_s16_(a: int16x4_t) -> i16;
+ #[link_name = "llvm.aarch64.neon.sminv.i16.v8i16"]
+ fn vminvq_s16_(a: int16x8_t) -> i16;
+ #[link_name = "llvm.aarch64.neon.sminv.i32.v2i32"]
+ fn vminv_s32_(a: int32x2_t) -> i32;
+ #[link_name = "llvm.aarch64.neon.sminv.i32.v4i32"]
+ fn vminvq_s32_(a: int32x4_t) -> i32;
+
+ #[link_name = "llvm.aarch64.neon.uminv.i8.v8i8"]
+ fn vminv_u8_(a: uint8x8_t) -> u8;
+ #[link_name = "llvm.aarch64.neon.uminv.i8.6i8"]
+ fn vminvq_u8_(a: uint8x16_t) -> u8;
+ #[link_name = "llvm.aarch64.neon.uminv.i16.v4i16"]
+ fn vminv_u16_(a: uint16x4_t) -> u16;
+ #[link_name = "llvm.aarch64.neon.uminv.i16.v8i16"]
+ fn vminvq_u16_(a: uint16x8_t) -> u16;
+ #[link_name = "llvm.aarch64.neon.uminv.i32.v2i32"]
+ fn vminv_u32_(a: uint32x2_t) -> u32;
+ #[link_name = "llvm.aarch64.neon.uminv.i32.v4i32"]
+ fn vminvq_u32_(a: uint32x4_t) -> u32;
+
+ #[link_name = "llvm.aarch64.neon.fminv.f32.v2f32"]
+ fn vminv_f32_(a: float32x2_t) -> f32;
+ #[link_name = "llvm.aarch64.neon.fminv.f32.v4f32"]
+ fn vminvq_f32_(a: float32x4_t) -> f32;
+ #[link_name = "llvm.aarch64.neon.fminv.f64.v2f64"]
+ fn vminvq_f64_(a: float64x2_t) -> f64;
+
+ #[link_name = "llvm.aarch64.neon.sminp.v16i8"]
+ fn vpminq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ #[link_name = "llvm.aarch64.neon.sminp.v8i16"]
+ fn vpminq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ #[link_name = "llvm.aarch64.neon.sminp.v4i32"]
+ fn vpminq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ #[link_name = "llvm.aarch64.neon.uminp.v16i8"]
+ fn vpminq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ #[link_name = "llvm.aarch64.neon.uminp.v8i16"]
+ fn vpminq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ #[link_name = "llvm.aarch64.neon.uminp.v4i32"]
+ fn vpminq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ #[link_name = "llvm.aarch64.neon.fminp.4f32"]
+ fn vpminq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ #[link_name = "llvm.aarch64.neon.fminp.v2f64"]
+ fn vpminq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+
+ #[link_name = "llvm.aarch64.neon.smaxp.v16i8"]
+ fn vpmaxq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ #[link_name = "llvm.aarch64.neon.smaxp.v8i16"]
+ fn vpmaxq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ #[link_name = "llvm.aarch64.neon.smaxp.v4i32"]
+ fn vpmaxq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ #[link_name = "llvm.aarch64.neon.umaxp.v16i8"]
+ fn vpmaxq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ #[link_name = "llvm.aarch64.neon.umaxp.v8i16"]
+ fn vpmaxq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ #[link_name = "llvm.aarch64.neon.umaxp.v4i32"]
+ fn vpmaxq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ #[link_name = "llvm.aarch64.neon.fmaxp.4f32"]
+ fn vpmaxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ #[link_name = "llvm.aarch64.neon.fmaxp.v2f64"]
+ fn vpmaxq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+
+ #[link_name = "llvm.aarch64.neon.tbl1.v8i8"]
+ fn vqtbl1(a: int8x16_t, b: uint8x8_t) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.tbl1.v16i8"]
+ fn vqtbl1q(a: int8x16_t, b: uint8x16_t) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.tbx1.v8i8"]
+ fn vqtbx1(a: int8x8_t, b: int8x16_t, c: uint8x8_t) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.tbx1.v16i8"]
+ fn vqtbx1q(a: int8x16_t, b: int8x16_t, c: uint8x16_t) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.tbl2.v8i8"]
+ fn vqtbl2(a0: int8x16_t, a1: int8x16_t, b: uint8x8_t) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.tbl2.v16i8"]
+ fn vqtbl2q(a0: int8x16_t, a1: int8x16_t, b: uint8x16_t) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.tbx2.v8i8"]
+ fn vqtbx2(a: int8x8_t, b0: int8x16_t, b1: int8x16_t, c: uint8x8_t) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.tbx2.v16i8"]
+ fn vqtbx2q(a: int8x16_t, b0: int8x16_t, b1: int8x16_t, c: uint8x16_t) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.tbl3.v8i8"]
+ fn vqtbl3(a0: int8x16_t, a1: int8x16_t, a2: int8x16_t, b: uint8x8_t) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.tbl3.v16i8"]
+ fn vqtbl3q(a0: int8x16_t, a1: int8x16_t, a2: int8x16_t, b: uint8x16_t) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.tbx3.v8i8"]
+ fn vqtbx3(a: int8x8_t, b0: int8x16_t, b1: int8x16_t, b2: int8x16_t, c: uint8x8_t) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.tbx3.v16i8"]
+ fn vqtbx3q(
+ a: int8x16_t,
+ b0: int8x16_t,
+ b1: int8x16_t,
+ b2: int8x16_t,
+ c: uint8x16_t,
+ ) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.tbl4.v8i8"]
+ fn vqtbl4(a0: int8x16_t, a1: int8x16_t, a2: int8x16_t, a3: int8x16_t, b: uint8x8_t)
+ -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.tbl4.v16i8"]
+ fn vqtbl4q(
+ a0: int8x16_t,
+ a1: int8x16_t,
+ a2: int8x16_t,
+ a3: int8x16_t,
+ b: uint8x16_t,
+ ) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.tbx4.v8i8"]
+ fn vqtbx4(
+ a: int8x8_t,
+ b0: int8x16_t,
+ b1: int8x16_t,
+ b2: int8x16_t,
+ b3: int8x16_t,
+ c: uint8x8_t,
+ ) -> int8x8_t;
+
+ #[link_name = "llvm.aarch64.neon.tbx4.v16i8"]
+ fn vqtbx4q(
+ a: int8x16_t,
+ b0: int8x16_t,
+ b1: int8x16_t,
+ b2: int8x16_t,
+ b3: int8x16_t,
+ c: uint8x16_t,
+ ) -> int8x16_t;
+
+ #[link_name = "llvm.aarch64.neon.vsli.v8i8"]
+ fn vsli_n_s8_(a: int8x8_t, b: int8x8_t, n: i32) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.vsli.v16i8"]
+ fn vsliq_n_s8_(a: int8x16_t, b: int8x16_t, n: i32) -> int8x16_t;
+ #[link_name = "llvm.aarch64.neon.vsli.v4i16"]
+ fn vsli_n_s16_(a: int16x4_t, b: int16x4_t, n: i32) -> int16x4_t;
+ #[link_name = "llvm.aarch64.neon.vsli.v8i16"]
+ fn vsliq_n_s16_(a: int16x8_t, b: int16x8_t, n: i32) -> int16x8_t;
+ #[link_name = "llvm.aarch64.neon.vsli.v2i32"]
+ fn vsli_n_s32_(a: int32x2_t, b: int32x2_t, n: i32) -> int32x2_t;
+ #[link_name = "llvm.aarch64.neon.vsli.v4i32"]
+ fn vsliq_n_s32_(a: int32x4_t, b: int32x4_t, n: i32) -> int32x4_t;
+ #[link_name = "llvm.aarch64.neon.vsli.v1i64"]
+ fn vsli_n_s64_(a: int64x1_t, b: int64x1_t, n: i32) -> int64x1_t;
+ #[link_name = "llvm.aarch64.neon.vsli.v2i64"]
+ fn vsliq_n_s64_(a: int64x2_t, b: int64x2_t, n: i32) -> int64x2_t;
+
+ #[link_name = "llvm.aarch64.neon.vsri.v8i8"]
+ fn vsri_n_s8_(a: int8x8_t, b: int8x8_t, n: i32) -> int8x8_t;
+ #[link_name = "llvm.aarch64.neon.vsri.v16i8"]
+ fn vsriq_n_s8_(a: int8x16_t, b: int8x16_t, n: i32) -> int8x16_t;
+ #[link_name = "llvm.aarch64.neon.vsri.v4i16"]
+ fn vsri_n_s16_(a: int16x4_t, b: int16x4_t, n: i32) -> int16x4_t;
+ #[link_name = "llvm.aarch64.neon.vsri.v8i16"]
+ fn vsriq_n_s16_(a: int16x8_t, b: int16x8_t, n: i32) -> int16x8_t;
+ #[link_name = "llvm.aarch64.neon.vsri.v2i32"]
+ fn vsri_n_s32_(a: int32x2_t, b: int32x2_t, n: i32) -> int32x2_t;
+ #[link_name = "llvm.aarch64.neon.vsri.v4i32"]
+ fn vsriq_n_s32_(a: int32x4_t, b: int32x4_t, n: i32) -> int32x4_t;
+ #[link_name = "llvm.aarch64.neon.vsri.v1i64"]
+ fn vsri_n_s64_(a: int64x1_t, b: int64x1_t, n: i32) -> int64x1_t;
+ #[link_name = "llvm.aarch64.neon.vsri.v2i64"]
+ fn vsriq_n_s64_(a: int64x2_t, b: int64x2_t, n: i32) -> int64x2_t;
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_s64<const N1: i32, const N2: i32>(
+ _a: int64x1_t,
+ b: int64x1_t,
+) -> int64x1_t {
+ static_assert!(N1 : i32 where N1 == 0);
+ static_assert!(N2 : i32 where N2 == 0);
+ b
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_u64<const N1: i32, const N2: i32>(
+ _a: uint64x1_t,
+ b: uint64x1_t,
+) -> uint64x1_t {
+ static_assert!(N1 : i32 where N1 == 0);
+ static_assert!(N2 : i32 where N2 == 0);
+ b
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_p64<const N1: i32, const N2: i32>(
+ _a: poly64x1_t,
+ b: poly64x1_t,
+) -> poly64x1_t {
+ static_assert!(N1 : i32 where N1 == 0);
+ static_assert!(N2 : i32 where N2 == 0);
+ b
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_lane_f64<const N1: i32, const N2: i32>(
+ _a: float64x1_t,
+ b: float64x1_t,
+) -> float64x1_t {
+ static_assert!(N1 : i32 where N1 == 0);
+ static_assert!(N2 : i32 where N2 == 0);
+ b
+}
+
+/// Duplicate vector element to vector or scalar
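+///
+/// A minimal usage sketch (not part of the upstream docs; it assumes an
+/// aarch64 target with NEON, so the block is not run as a doc test):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // Copy lane 1 of `b` into lane 0 of the one-element result.
+///     let b = vld1q_s64([10, 20].as_ptr());
+///     let r = vcopy_laneq_s64::<0, 1>(vdup_n_s64(0), b);
+///     assert_eq!(vget_lane_s64::<0>(r), 20);
+/// }
+/// ```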
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_s64<const LANE1: i32, const LANE2: i32>(
+ _a: int64x1_t,
+ b: int64x2_t,
+) -> int64x1_t {
+ static_assert!(LANE1 : i32 where LANE1 == 0);
+ static_assert_imm1!(LANE2);
+ transmute::<i64, _>(simd_extract(b, LANE2 as u32))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_u64<const LANE1: i32, const LANE2: i32>(
+ _a: uint64x1_t,
+ b: uint64x2_t,
+) -> uint64x1_t {
+ static_assert!(LANE1 : i32 where LANE1 == 0);
+ static_assert_imm1!(LANE2);
+ transmute::<u64, _>(simd_extract(b, LANE2 as u32))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_p64<const LANE1: i32, const LANE2: i32>(
+ _a: poly64x1_t,
+ b: poly64x2_t,
+) -> poly64x1_t {
+ static_assert!(LANE1 : i32 where LANE1 == 0);
+ static_assert_imm1!(LANE2);
+ transmute::<u64, _>(simd_extract(b, LANE2 as u32))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))]
+#[rustc_legacy_const_generics(1, 3)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcopy_laneq_f64<const LANE1: i32, const LANE2: i32>(
+ _a: float64x1_t,
+ b: float64x2_t,
+) -> float64x1_t {
+ static_assert!(LANE1 : i32 where LANE1 == 0);
+ static_assert_imm1!(LANE2);
+ transmute::<f64, _>(simd_extract(b, LANE2 as u32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
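+///
+/// A minimal sketch, assuming an aarch64 target with NEON (kept out of the
+/// doc tests, hence `ignore`):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let data: [i8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+///     // The load has no alignment requirement on `data`.
+///     let v = vld1_s8(data.as_ptr());
+///     assert_eq!(vget_lane_s8::<7>(v), 7);
+/// }
+/// ```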
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_p64(ptr: *const p64) -> poly64x1_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_f64(ptr: *const f64) -> float64x1_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_f64(ptr: *const f64) -> float64x2_t {
+ read_unaligned(ptr.cast())
+}
+
+/// Load one single-element structure and replicate to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ldr))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_dup_f64(ptr: *const f64) -> float64x1_t {
+ vld1_f64(ptr)
+}
+
+/// Load one single-element structure and replicate to all lanes of one register.
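+///
+/// A usage sketch (not upstream documentation; assumes aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let x = 3.5_f64;
+///     // Both lanes receive the loaded value.
+///     let v = vld1q_dup_f64(&x);
+///     assert_eq!(vgetq_lane_f64::<0>(v), 3.5);
+///     assert_eq!(vgetq_lane_f64::<1>(v), 3.5);
+/// }
+/// ```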
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld1r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_dup_f64(ptr: *const f64) -> float64x2_t {
+ let x = vld1q_lane_f64::<0>(ptr, transmute(f64x2::splat(0.)));
+ simd_shuffle2!(x, x, [0, 0])
+}
+
+/// Load one single-element structure to one lane of one register.
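+///
+/// A sketch of the lane load (illustrative only; assumes aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let x = 9.0_f64;
+///     // Lane 0 of `src` is overwritten with `x`; any other lanes (none in
+///     // a one-element vector) would be preserved.
+///     let v = vld1_lane_f64::<0>(&x, vdup_n_f64(1.0));
+///     assert_eq!(vget_lane_f64::<0>(v), 9.0);
+/// }
+/// ```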
+#[inline]
+#[target_feature(enable = "neon")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(ldr, LANE = 0))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1_lane_f64<const LANE: i32>(ptr: *const f64, src: float64x1_t) -> float64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(ld1, LANE = 1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld1q_lane_f64<const LANE: i32>(ptr: *const f64, src: float64x2_t) -> float64x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
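+///
+/// A minimal sketch of the store (not from the upstream docs; assumes
+/// aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let mut out = [0i8; 8];
+///     // Like the loads, the store has no alignment requirement.
+///     vst1_s8(out.as_mut_ptr(), vdup_n_s8(7));
+///     assert_eq!(out, [7i8; 8]);
+/// }
+/// ```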
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s8(ptr: *mut i8, a: int8x8_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s8(ptr: *mut i8, a: int8x16_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s16(ptr: *mut i16, a: int16x4_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s16(ptr: *mut i16, a: int16x8_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s32(ptr: *mut i32, a: int32x2_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s32(ptr: *mut i32, a: int32x4_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s64(ptr: *mut i64, a: int64x1_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s64(ptr: *mut i64, a: int64x2_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_u8(ptr: *mut u8, a: uint8x8_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_u8(ptr: *mut u8, a: uint8x16_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_u16(ptr: *mut u16, a: uint16x4_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_u16(ptr: *mut u16, a: uint16x8_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_u32(ptr: *mut u32, a: uint32x2_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_u32(ptr: *mut u32, a: uint32x4_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_u64(ptr: *mut u64, a: uint64x1_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_u64(ptr: *mut u64, a: uint64x2_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_p8(ptr: *mut p8, a: poly8x8_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_p8(ptr: *mut p8, a: poly8x16_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_p16(ptr: *mut p16, a: poly16x4_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_p64(ptr: *mut p64, a: poly64x2_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_f32(ptr: *mut f32, a: float32x2_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_f32(ptr: *mut f32, a: float32x4_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_f64(ptr: *mut f64, a: float64x1_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(str))]
+#[allow(clippy::cast_ptr_alignment)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_f64(ptr: *mut f64, a: float64x2_t) {
+ write_unaligned(ptr.cast(), a);
+}
+
+/// Absolute Value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(abs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabsd_s64(a: i64) -> i64 {
+ vabsd_s64_(a)
+}
+/// Absolute Value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(abs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabs_s64(a: int64x1_t) -> int64x1_t {
+ vabs_s64_(a)
+}
+/// Absolute Value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(abs))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vabsq_s64(a: int64x2_t) -> int64x2_t {
+ vabsq_s64_(a)
+}
+
+/// Bitwise Select. This instruction sets each bit in the destination SIMD&FP register
+/// to the corresponding bit from the first source SIMD&FP register when the original
+/// destination bit was 1, otherwise from the second source SIMD&FP register.
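+///
+/// A sketch of the bit-selection behaviour (illustrative; assumes aarch64
+/// with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // An all-ones mask selects every bit from `b`; an all-zeros mask
+///     // would select every bit from `c`.
+///     let mask = vdup_n_u64(u64::MAX);
+///     let b = vdup_n_f64(1.0);
+///     let c = vdup_n_f64(2.0);
+///     assert_eq!(vget_lane_f64::<0>(vbsl_f64(mask, b, c)), 1.0);
+/// }
+/// ```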
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(bsl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vbsl_f64(a: uint64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+ simd_select(transmute::<_, int64x1_t>(a), b, c)
+}
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(bsl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vbsl_p64(a: poly64x1_t, b: poly64x1_t, c: poly64x1_t) -> poly64x1_t {
+ simd_select(transmute::<_, int64x1_t>(a), b, c)
+}
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(bsl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vbslq_f64(a: uint64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+ simd_select(transmute::<_, int64x2_t>(a), b, c)
+}
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(bsl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vbslq_p64(a: poly64x2_t, b: poly64x2_t, c: poly64x2_t) -> poly64x2_t {
+ simd_select(transmute::<_, int64x2_t>(a), b, c)
+}
+
+/// Signed saturating Accumulate of Unsigned value.
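+///
+/// A sketch of the saturating behaviour (illustrative; assumes aarch64 with
+/// NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // i8::MAX + 1 saturates at i8::MAX instead of wrapping.
+///     let r = vuqadd_s8(vdup_n_s8(i8::MAX), vdup_n_u8(1));
+///     assert_eq!(vget_lane_s8::<0>(r), i8::MAX);
+/// }
+/// ```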
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqadd_s8(a: int8x8_t, b: uint8x8_t) -> int8x8_t {
+ vuqadd_s8_(a, b)
+}
+/// Signed saturating Accumulate of Unsigned value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqaddq_s8(a: int8x16_t, b: uint8x16_t) -> int8x16_t {
+ vuqaddq_s8_(a, b)
+}
+/// Signed saturating Accumulate of Unsigned value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqadd_s16(a: int16x4_t, b: uint16x4_t) -> int16x4_t {
+ vuqadd_s16_(a, b)
+}
+/// Signed saturating Accumulate of Unsigned value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqaddq_s16(a: int16x8_t, b: uint16x8_t) -> int16x8_t {
+ vuqaddq_s16_(a, b)
+}
+/// Signed saturating Accumulate of Unsigned value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqadd_s32(a: int32x2_t, b: uint32x2_t) -> int32x2_t {
+ vuqadd_s32_(a, b)
+}
+/// Signed saturating Accumulate of Unsigned value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqaddq_s32(a: int32x4_t, b: uint32x4_t) -> int32x4_t {
+ vuqaddq_s32_(a, b)
+}
+/// Signed saturating Accumulate of Unsigned value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqadd_s64(a: int64x1_t, b: uint64x1_t) -> int64x1_t {
+ vuqadd_s64_(a, b)
+}
+/// Signed saturating Accumulate of Unsigned value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(suqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vuqaddq_s64(a: int64x2_t, b: uint64x2_t) -> int64x2_t {
+ vuqaddq_s64_(a, b)
+}
+
+/// Unsigned saturating Accumulate of Signed value.
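+///
+/// The unsigned counterpart saturates at zero; a sketch (assumes aarch64
+/// with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // 1 + (-2) saturates at 0 rather than wrapping to u8::MAX.
+///     let r = vsqadd_u8(vdup_n_u8(1), vdup_n_s8(-2));
+///     assert_eq!(vget_lane_u8::<0>(r), 0);
+/// }
+/// ```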
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqadd_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+ vsqadd_u8_(a, b)
+}
+/// Unsigned saturating Accumulate of Signed value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqaddq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+ vsqaddq_u8_(a, b)
+}
+/// Unsigned saturating Accumulate of Signed value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqadd_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+ vsqadd_u16_(a, b)
+}
+/// Unsigned saturating Accumulate of Signed value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqaddq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+ vsqaddq_u16_(a, b)
+}
+/// Unsigned saturating Accumulate of Signed value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqadd_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+ vsqadd_u32_(a, b)
+}
+/// Unsigned saturating Accumulate of Signed value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqaddq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
+ vsqaddq_u32_(a, b)
+}
+/// Unsigned saturating Accumulate of Signed value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqadd_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+ vsqadd_u64_(a, b)
+}
+/// Unsigned saturating Accumulate of Signed value.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(usqadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsqaddq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
+ vsqaddq_u64_(a, b)
+}
+
+/// Add pairwise
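+///
+/// A sketch of the pairwise layout (illustrative; assumes aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vld1q_s16([1, 2, 3, 4, 5, 6, 7, 8].as_ptr());
+///     // The low half holds the pairwise sums of the first operand, the
+///     // high half those of the second; lane 0 is a[0] + a[1].
+///     let r = vpaddq_s16(a, a);
+///     assert_eq!(vgetq_lane_s16::<0>(r), 3);
+/// }
+/// ```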
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ vpaddq_s16_(a, b)
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ transmute(vpaddq_s16_(transmute(a), transmute(b)))
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ vpaddq_s32_(a, b)
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ transmute(vpaddq_s32_(transmute(a), transmute(b)))
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ vpaddq_s64_(a, b)
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ transmute(vpaddq_s64_(transmute(a), transmute(b)))
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ vpaddq_s8_(a, b)
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ transmute(vpaddq_s8_(transmute(a), transmute(b)))
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddd_s64(a: int64x2_t) -> i64 {
+ transmute(vaddvq_u64_(transmute(a)))
+}
+/// Add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpaddd_u64(a: uint64x2_t) -> u64 {
+ transmute(vaddvq_u64_(transmute(a)))
+}
+
+/// Add across vector
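+///
+/// A reduction sketch (illustrative; assumes aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vld1_s16([1, 2, 3, 4].as_ptr());
+///     // Sums all four lanes into a scalar.
+///     assert_eq!(vaddv_s16(a), 10);
+/// }
+/// ```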
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddv_s16(a: int16x4_t) -> i16 {
+ vaddv_s16_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddv_s32(a: int32x2_t) -> i32 {
+ vaddv_s32_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddv_s8(a: int8x8_t) -> i8 {
+ vaddv_s8_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddv_u16(a: uint16x4_t) -> u16 {
+ vaddv_u16_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddv_u32(a: uint32x2_t) -> u32 {
+ vaddv_u32_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddv_u8(a: uint8x8_t) -> u8 {
+ vaddv_u8_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_s16(a: int16x8_t) -> i16 {
+ vaddvq_s16_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_s32(a: int32x4_t) -> i32 {
+ vaddvq_s32_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_s8(a: int8x16_t) -> i8 {
+ vaddvq_s8_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_u16(a: uint16x8_t) -> u16 {
+ vaddvq_u16_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_u32(a: uint32x4_t) -> u32 {
+ vaddvq_u32_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_u8(a: uint8x16_t) -> u8 {
+ vaddvq_u8_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_s64(a: int64x2_t) -> i64 {
+ vaddvq_s64_(a)
+}
+
+/// Add across vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(addp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddvq_u64(a: uint64x2_t) -> u64 {
+ vaddvq_u64_(a)
+}
+
+/// Signed Add Long across Vector
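+///
+/// A sketch showing why the widened result matters (illustrative; assumes
+/// aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // 8 * 127 == 1016 does not fit in i8, but the i16 result holds it.
+///     assert_eq!(vaddlv_s8(vdup_n_s8(i8::MAX)), 1016);
+/// }
+/// ```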
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(saddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlv_s8(a: int8x8_t) -> i16 {
+ vaddlv_s8_(a) as i16
+}
+
+/// Signed Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(saddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlvq_s8(a: int8x16_t) -> i16 {
+ vaddlvq_s8_(a) as i16
+}
+
+/// Unsigned Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uaddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlv_u8(a: uint8x8_t) -> u16 {
+ vaddlv_u8_(a) as u16
+}
+
+/// Unsigned Add Long across Vector
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uaddlv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddlvq_u8(a: uint8x16_t) -> u16 {
+ vaddlvq_u8_(a) as u16
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vadd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fadd))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(add))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(add))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
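+///
+/// The scalar form wraps on overflow; a sketch (assumes aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // Two's-complement wrap-around, matching i64::wrapping_add.
+///     assert_eq!(vaddd_s64(i64::MAX, 1), i64::MIN);
+/// }
+/// ```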
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(add))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 {
+ a.wrapping_add(b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(add))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 {
+ a.wrapping_add(b)
+}
+
+/// Horizontal vector max.
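+///
+/// A reduction sketch (illustrative; assumes aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vld1_s8([1, -2, 3, -4, 5, -6, 7, -8].as_ptr());
+///     // Returns the largest of the eight lanes.
+///     assert_eq!(vmaxv_s8(a), 7);
+/// }
+/// ```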
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxv_s8(a: int8x8_t) -> i8 {
+ vmaxv_s8_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxvq_s8(a: int8x16_t) -> i8 {
+ vmaxvq_s8_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxv_s16(a: int16x4_t) -> i16 {
+ vmaxv_s16_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxvq_s16(a: int16x8_t) -> i16 {
+ vmaxvq_s16_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxv_s32(a: int32x2_t) -> i32 {
+ vmaxv_s32_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxvq_s32(a: int32x4_t) -> i32 {
+ vmaxvq_s32_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxv_u8(a: uint8x8_t) -> u8 {
+ vmaxv_u8_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxvq_u8(a: uint8x16_t) -> u8 {
+ vmaxvq_u8_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxv_u16(a: uint16x4_t) -> u16 {
+ vmaxv_u16_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxvq_u16(a: uint16x8_t) -> u16 {
+ vmaxvq_u16_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxv_u32(a: uint32x2_t) -> u32 {
+ vmaxv_u32_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxvq_u32(a: uint32x4_t) -> u32 {
+ vmaxvq_u32_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxv_f32(a: float32x2_t) -> f32 {
+ vmaxv_f32_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxvq_f32(a: float32x4_t) -> f32 {
+ vmaxvq_f32_(a)
+}
+
+/// Horizontal vector max.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmaxvq_f64(a: float64x2_t) -> f64 {
+ vmaxvq_f64_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminv_s8(a: int8x8_t) -> i8 {
+ vminv_s8_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminvq_s8(a: int8x16_t) -> i8 {
+ vminvq_s8_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminv_s16(a: int16x4_t) -> i16 {
+ vminv_s16_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminvq_s16(a: int16x8_t) -> i16 {
+ vminvq_s16_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminv_s32(a: int32x2_t) -> i32 {
+ vminv_s32_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminvq_s32(a: int32x4_t) -> i32 {
+ vminvq_s32_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminv_u8(a: uint8x8_t) -> u8 {
+ vminv_u8_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminvq_u8(a: uint8x16_t) -> u8 {
+ vminvq_u8_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminv_u16(a: uint16x4_t) -> u16 {
+ vminv_u16_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminvq_u16(a: uint16x8_t) -> u16 {
+ vminvq_u16_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminv_u32(a: uint32x2_t) -> u32 {
+ vminv_u32_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminvq_u32(a: uint32x4_t) -> u32 {
+ vminvq_u32_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminv_f32(a: float32x2_t) -> f32 {
+ vminv_f32_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminv))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminvq_f32(a: float32x4_t) -> f32 {
+ vminvq_f32_(a)
+}
+
+/// Horizontal vector min.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vminvq_f64(a: float64x2_t) -> f64 {
+ vminvq_f64_(a)
+}
+
+/// Folding minimum of adjacent pairs
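+///
+/// A sketch of the pairwise folding (illustrative; assumes aarch64 with
+/// NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vld1q_s8([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16].as_ptr());
+///     // Lane 0 holds min(a[0], a[1]); the high half folds the second operand.
+///     let r = vpminq_s8(a, a);
+///     assert_eq!(vgetq_lane_s8::<0>(r), 1);
+/// }
+/// ```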
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ vpminq_s8_(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ vpminq_s16_(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ vpminq_s32_(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ vpminq_u8_(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ vpminq_u16_(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ vpminq_u32_(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ vpminq_f32_(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fminp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ vpminq_f64_(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ vpmaxq_s8_(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ vpmaxq_s16_(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ vpmaxq_s32_(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ vpmaxq_u8_(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ vpmaxq_u16_(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ vpmaxq_u32_(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ vpmaxq_f32_(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmaxp))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+ vpmaxq_f64_(a, b)
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vext_p64<const N: i32>(a: poly64x1_t, _b: poly64x1_t) -> poly64x1_t {
+ static_assert!(N : i32 where N == 0);
+ a
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vext_f64<const N: i32>(a: float64x1_t, _b: float64x1_t) -> float64x1_t {
+ static_assert!(N : i32 where N == 0);
+ a
+}
+/// Vector combine
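+///
+/// A sketch of joining two halves (illustrative; assumes aarch64 with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let v = vcombine_s8(vdup_n_s8(1), vdup_n_s8(2));
+///     // `low` fills lanes 0..8, `high` fills lanes 8..16.
+///     assert_eq!(vgetq_lane_s8::<0>(v), 1);
+///     assert_eq!(vgetq_lane_s8::<15>(v), 2);
+/// }
+/// ```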
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t {
+ simd_shuffle16!(
+ low,
+ high,
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ )
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t {
+ simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t {
+ simd_shuffle4!(low, high, [0, 1, 2, 3])
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t {
+ simd_shuffle2!(low, high, [0, 1])
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t {
+ simd_shuffle16!(
+ low,
+ high,
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ )
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t {
+ simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t {
+ simd_shuffle4!(low, high, [0, 1, 2, 3])
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t {
+ simd_shuffle2!(low, high, [0, 1])
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_p64(low: poly64x1_t, high: poly64x1_t) -> poly64x2_t {
+ simd_shuffle2!(low, high, [0, 1])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdup_n_p64(value: p64) -> poly64x1_t {
+ transmute(u64x1::new(value))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdup_n_f64(value: f64) -> float64x1_t {
+ float64x1_t(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(dup))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupq_n_p64(value: p64) -> poly64x2_t {
+ transmute(u64x2::new(value, value))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(dup))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vdupq_n_f64(value: f64) -> float64x2_t {
+ float64x2_t(value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmov_n_p64(value: p64) -> poly64x1_t {
+ vdup_n_p64(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmov_n_f64(value: f64) -> float64x1_t {
+ vdup_n_f64(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(dup))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovq_n_p64(value: p64) -> poly64x2_t {
+ vdupq_n_p64(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(dup))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vmovq_n_f64(value: f64) -> float64x2_t {
+ vdupq_n_f64(value)
+}
+
+/// Duplicate vector element to vector or scalar
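+///
+/// A sketch of splitting a 128-bit vector (illustrative; assumes aarch64
+/// with NEON):
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let v = vld1q_f64([1.0, 2.0].as_ptr());
+///     // The high half is lane 1.
+///     assert_eq!(vget_lane_f64::<0>(vget_high_f64(v)), 2.0);
+/// }
+/// ```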
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vget_high_f64(a: float64x2_t) -> float64x1_t {
+ float64x1_t(simd_extract(a, 1))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ext))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vget_high_p64(a: poly64x2_t) -> poly64x1_t {
+ transmute(u64x1::new(simd_extract(a, 1)))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vget_low_f64(a: float64x2_t) -> float64x1_t {
+ float64x1_t(simd_extract(a, 0))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vget_low_p64(a: poly64x2_t) -> poly64x1_t {
+ transmute(u64x1::new(simd_extract(a, 0)))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, IMM5 = 0))]
+pub unsafe fn vget_lane_f64<const IMM5: i32>(v: float64x1_t) -> f64 {
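+ // float64x1_t has a single lane, so 0 is the only valid index.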
+ static_assert!(IMM5: i32 where IMM5 == 0);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Extract one element from a vector to a scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, IMM5 = 0))]
+pub unsafe fn vgetq_lane_f64<const IMM5: i32>(v: float64x2_t) -> f64 {
+ static_assert_imm1!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/* FIXME: 16-bit float
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+pub unsafe fn vcombine_f16 ( low: float16x4_t, high: float16x4_t) -> float16x8_t {
+ simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+*/
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t {
+ simd_shuffle4!(low, high, [0, 1, 2, 3])
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t {
+ simd_shuffle16!(
+ low,
+ high,
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ )
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t {
+ simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Vector combine
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(mov))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcombine_f64(low: float64x1_t, high: float64x1_t) -> float64x2_t {
+ simd_shuffle2!(low, high, [0, 1])
+}
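+
+// A minimal sketch (illustrative only) of building a 128-bit vector from two
+// halves and taking one half back out:
+//
+//     let v = vcombine_f64(vdup_n_f64(1.0), vdup_n_f64(2.0)); // { 1.0, 2.0 }
+//     let hi = vget_high_f64(v);                              // { 2.0 }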
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
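+ // AArch64 TBL always reads a full 128-bit table register, so the 64-bit
+ // table is zero-extended; out-of-range indices (>= 8) then read the zeroed
+ // half and yield 0, exactly the vtbl1 contract.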
+ vqtbl1_s8(vcombine_s8(a, zeroed()), transmute(b))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ vqtbl1_u8(vcombine_u8(a, zeroed()), b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t {
+ vqtbl1_p8(vcombine_p8(a, zeroed()), b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t {
+ vqtbl1_s8(vcombine_s8(a.0, a.1), transmute(b))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t {
+ vqtbl1_u8(vcombine_u8(a.0, a.1), b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t {
+ vqtbl1_p8(vcombine_p8(a.0, a.1), b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t {
+ vqtbl2_s8(
+ int8x16x2_t(vcombine_s8(a.0, a.1), vcombine_s8(a.2, zeroed())),
+ transmute(b),
+ )
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t {
+ vqtbl2_u8(
+ uint8x16x2_t(vcombine_u8(a.0, a.1), vcombine_u8(a.2, zeroed())),
+ b,
+ )
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t {
+ vqtbl2_p8(
+ poly8x16x2_t(vcombine_p8(a.0, a.1), vcombine_p8(a.2, zeroed())),
+ b,
+ )
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t {
+ vqtbl2_s8(
+ int8x16x2_t(vcombine_s8(a.0, a.1), vcombine_s8(a.2, a.3)),
+ transmute(b),
+ )
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t {
+ vqtbl2_u8(
+ uint8x16x2_t(vcombine_u8(a.0, a.1), vcombine_u8(a.2, a.3)),
+ b,
+ )
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t {
+ vqtbl2_p8(
+ poly8x16x2_t(vcombine_p8(a.0, a.1), vcombine_p8(a.2, a.3)),
+ b,
+ )
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
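+ // TBX on the zero-extended table keeps `a` for idx >= 16, but indices in
+ // 8..16 would read the zeroed upper half instead of leaving the destination
+ // untouched, so those lanes are masked back to `a`.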
+ let r = vqtbx1_s8(a, vcombine_s8(b, zeroed()), transmute(c));
+ let m: int8x8_t = simd_lt(c, transmute(i8x8::splat(8)));
+ simd_select(m, r, a)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+ let r = vqtbx1_u8(a, vcombine_u8(b, zeroed()), c);
+ let m: int8x8_t = simd_lt(c, transmute(u8x8::splat(8)));
+ simd_select(m, r, a)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t {
+ let r = vqtbx1_p8(a, vcombine_p8(b, zeroed()), c);
+ let m: int8x8_t = simd_lt(c, transmute(u8x8::splat(8)));
+ simd_select(m, r, a)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t {
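+ // Two 8-byte tables concatenate to exactly 16 bytes, so TBX's own
+ // out-of-range rule (idx >= 16 keeps `a`) already matches vtbx2; no mask is
+ // needed.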
+ vqtbx1_s8(a, vcombine_s8(b.0, b.1), transmute(c))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t {
+ vqtbx1_u8(a, vcombine_u8(b.0, b.1), c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t {
+ vqtbx1_p8(a, vcombine_p8(b.0, b.1), c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t {
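+ // Three 8-byte tables are zero-padded out to 32 bytes; indices in 24..32
+ // would hit the padding, so those lanes are masked back to `a`.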
+ let r = vqtbx2_s8(
+ a,
+ int8x16x2_t(vcombine_s8(b.0, b.1), vcombine_s8(b.2, zeroed())),
+ transmute(c),
+ );
+ let m: int8x8_t = simd_lt(c, transmute(i8x8::splat(24)));
+ simd_select(m, r, a)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t {
+ let r = vqtbx2_u8(
+ a,
+ uint8x16x2_t(vcombine_u8(b.0, b.1), vcombine_u8(b.2, zeroed())),
+ c,
+ );
+ let m: int8x8_t = simd_lt(c, transmute(u8x8::splat(24)));
+ simd_select(m, r, a)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t {
+ let r = vqtbx2_p8(
+ a,
+ poly8x16x2_t(vcombine_p8(b.0, b.1), vcombine_p8(b.2, zeroed())),
+ c,
+ );
+ let m: int8x8_t = simd_lt(c, transmute(u8x8::splat(24)));
+ simd_select(m, r, a)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t {
+ vqtbx2_s8(
+ a,
+ int8x16x2_t(vcombine_s8(b.0, b.1), vcombine_s8(b.2, b.3)),
+ transmute(c),
+ )
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t {
+ vqtbx2_u8(
+ a,
+ uint8x16x2_t(vcombine_u8(b.0, b.1), vcombine_u8(b.2, b.3)),
+ c,
+ )
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t {
+ vqtbx2_p8(
+ a,
+ poly8x16x2_t(vcombine_p8(b.0, b.1), vcombine_p8(b.2, b.3)),
+ c,
+ )
+}
+
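+// The vqtbl*/vqtbx* forms operate on full 128-bit table registers and lower
+// to single TBL/TBX instructions; the suffix-less vqtbl1/vqtbx1 helpers they
+// call are thin wrappers (declared elsewhere in this module) around the
+// corresponding LLVM intrinsics.
+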
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl1_s8(t: int8x16_t, idx: uint8x8_t) -> int8x8_t {
+ vqtbl1(t, idx)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl1q_s8(t: int8x16_t, idx: uint8x16_t) -> int8x16_t {
+ vqtbl1q(t, idx)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl1_u8(t: uint8x16_t, idx: uint8x8_t) -> uint8x8_t {
+ transmute(vqtbl1(transmute(t), transmute(idx)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t {
+ transmute(vqtbl1q(transmute(t), transmute(idx)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl1_p8(t: poly8x16_t, idx: uint8x8_t) -> poly8x8_t {
+ transmute(vqtbl1(transmute(t), transmute(idx)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl1q_p8(t: poly8x16_t, idx: uint8x16_t) -> poly8x16_t {
+ transmute(vqtbl1q(transmute(t), transmute(idx)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx1_s8(a: int8x8_t, t: int8x16_t, idx: uint8x8_t) -> int8x8_t {
+ vqtbx1(a, t, idx)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx1q_s8(a: int8x16_t, t: int8x16_t, idx: uint8x16_t) -> int8x16_t {
+ vqtbx1q(a, t, idx)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx1_u8(a: uint8x8_t, t: uint8x16_t, idx: uint8x8_t) -> uint8x8_t {
+ transmute(vqtbx1(transmute(a), transmute(t), transmute(idx)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx1q_u8(a: uint8x16_t, t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t {
+ transmute(vqtbx1q(transmute(a), transmute(t), transmute(idx)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx1_p8(a: poly8x8_t, t: poly8x16_t, idx: uint8x8_t) -> poly8x8_t {
+ transmute(vqtbx1(transmute(a), transmute(t), transmute(idx)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx1q_p8(a: poly8x16_t, t: poly8x16_t, idx: uint8x16_t) -> poly8x16_t {
+ transmute(vqtbx1q(transmute(a), transmute(t), transmute(idx)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl2_s8(t: int8x16x2_t, idx: uint8x8_t) -> int8x8_t {
+ vqtbl2(t.0, t.1, idx)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl2q_s8(t: int8x16x2_t, idx: uint8x16_t) -> int8x16_t {
+ vqtbl2q(t.0, t.1, idx)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl2_u8(t: uint8x16x2_t, idx: uint8x8_t) -> uint8x8_t {
+ transmute(vqtbl2(transmute(t.0), transmute(t.1), transmute(idx)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl2q_u8(t: uint8x16x2_t, idx: uint8x16_t) -> uint8x16_t {
+ transmute(vqtbl2q(transmute(t.0), transmute(t.1), transmute(idx)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl2_p8(t: poly8x16x2_t, idx: uint8x8_t) -> poly8x8_t {
+ transmute(vqtbl2(transmute(t.0), transmute(t.1), transmute(idx)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl2q_p8(t: poly8x16x2_t, idx: uint8x16_t) -> poly8x16_t {
+ transmute(vqtbl2q(transmute(t.0), transmute(t.1), transmute(idx)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx2_s8(a: int8x8_t, t: int8x16x2_t, idx: uint8x8_t) -> int8x8_t {
+ vqtbx2(a, t.0, t.1, idx)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx2q_s8(a: int8x16_t, t: int8x16x2_t, idx: uint8x16_t) -> int8x16_t {
+ vqtbx2q(a, t.0, t.1, idx)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx2_u8(a: uint8x8_t, t: uint8x16x2_t, idx: uint8x8_t) -> uint8x8_t {
+ transmute(vqtbx2(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx2q_u8(a: uint8x16_t, t: uint8x16x2_t, idx: uint8x16_t) -> uint8x16_t {
+ transmute(vqtbx2q(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx2_p8(a: poly8x8_t, t: poly8x16x2_t, idx: uint8x8_t) -> poly8x8_t {
+ transmute(vqtbx2(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx2q_p8(a: poly8x16_t, t: poly8x16x2_t, idx: uint8x16_t) -> poly8x16_t {
+ transmute(vqtbx2q(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(idx),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl3_s8(t: int8x16x3_t, idx: uint8x8_t) -> int8x8_t {
+ vqtbl3(t.0, t.1, t.2, idx)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl3q_s8(t: int8x16x3_t, idx: uint8x16_t) -> int8x16_t {
+ vqtbl3q(t.0, t.1, t.2, idx)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl3_u8(t: uint8x16x3_t, idx: uint8x8_t) -> uint8x8_t {
+ transmute(vqtbl3(
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(idx),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl3q_u8(t: uint8x16x3_t, idx: uint8x16_t) -> uint8x16_t {
+ transmute(vqtbl3q(
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(idx),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl3_p8(t: poly8x16x3_t, idx: uint8x8_t) -> poly8x8_t {
+ transmute(vqtbl3(
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(idx),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl3q_p8(t: poly8x16x3_t, idx: uint8x16_t) -> poly8x16_t {
+ transmute(vqtbl3q(
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx3_s8(a: int8x8_t, t: int8x16x3_t, idx: uint8x8_t) -> int8x8_t {
+ vqtbx3(a, t.0, t.1, t.2, idx)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx3q_s8(a: int8x16_t, t: int8x16x3_t, idx: uint8x16_t) -> int8x16_t {
+ vqtbx3q(a, t.0, t.1, t.2, idx)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx3_u8(a: uint8x8_t, t: uint8x16x3_t, idx: uint8x8_t) -> uint8x8_t {
+ transmute(vqtbx3(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx3q_u8(a: uint8x16_t, t: uint8x16x3_t, idx: uint8x16_t) -> uint8x16_t {
+ transmute(vqtbx3q(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx3_p8(a: poly8x8_t, t: poly8x16x3_t, idx: uint8x8_t) -> poly8x8_t {
+ transmute(vqtbx3(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx3q_p8(a: poly8x16_t, t: poly8x16x3_t, idx: uint8x16_t) -> poly8x16_t {
+ transmute(vqtbx3q(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(idx),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl4_s8(t: int8x16x4_t, idx: uint8x8_t) -> int8x8_t {
+ vqtbl4(t.0, t.1, t.2, t.3, idx)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl4q_s8(t: int8x16x4_t, idx: uint8x16_t) -> int8x16_t {
+ vqtbl4q(t.0, t.1, t.2, t.3, idx)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl4_u8(t: uint8x16x4_t, idx: uint8x8_t) -> uint8x8_t {
+ transmute(vqtbl4(
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(t.3),
+ transmute(idx),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl4q_u8(t: uint8x16x4_t, idx: uint8x16_t) -> uint8x16_t {
+ transmute(vqtbl4q(
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(t.3),
+ transmute(idx),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl4_p8(t: poly8x16x4_t, idx: uint8x8_t) -> poly8x8_t {
+ transmute(vqtbl4(
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(t.3),
+ transmute(idx),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbl))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbl4q_p8(t: poly8x16x4_t, idx: uint8x16_t) -> poly8x16_t {
+ transmute(vqtbl4q(
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(t.3),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx4_s8(a: int8x8_t, t: int8x16x4_t, idx: uint8x8_t) -> int8x8_t {
+ vqtbx4(a, t.0, t.1, t.2, t.3, idx)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx4q_s8(a: int8x16_t, t: int8x16x4_t, idx: uint8x16_t) -> int8x16_t {
+ vqtbx4q(a, t.0, t.1, t.2, t.3, idx)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx4_u8(a: uint8x8_t, t: uint8x16x4_t, idx: uint8x8_t) -> uint8x8_t {
+ transmute(vqtbx4(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(t.3),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx4q_u8(a: uint8x16_t, t: uint8x16x4_t, idx: uint8x16_t) -> uint8x16_t {
+ transmute(vqtbx4q(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(t.3),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx4_p8(a: poly8x8_t, t: poly8x16x4_t, idx: uint8x8_t) -> poly8x8_t {
+ transmute(vqtbx4(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(t.3),
+ transmute(idx),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(tbx))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqtbx4q_p8(a: poly8x16_t, t: poly8x16x4_t, idx: uint8x16_t) -> poly8x16_t {
+ transmute(vqtbx4q(
+ transmute(a),
+ transmute(t.0),
+ transmute(t.1),
+ transmute(t.2),
+ transmute(t.3),
+ transmute(idx),
+ ))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshld_n_s64<const N: i32>(a: i64) -> i64 {
+ static_assert_imm6!(N);
+ a << N
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshld_n_u64<const N: i32>(a: u64) -> u64 {
+ static_assert_imm6!(N);
+ a << N
+}
+
+/// Signed shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshrd_n_s64<const N: i32>(a: i64) -> i64 {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
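+ // Shifting an i64 right by 64 is UB in Rust, and an arithmetic shift by 63
+ // already yields the fully sign-extended result, so N == 64 is clamped.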
+ let n: i32 = if N == 64 { 63 } else { N };
+ a >> n
+}
+
+/// Unsigned shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vshrd_n_u64<const N: i32>(a: u64) -> u64 {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
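+ // A logical shift by 64 always produces 0, and the shift itself would be UB
+ // in Rust, so N == 64 returns 0 directly.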
+ let n: i32 = if N == 64 {
+ return 0;
+ } else {
+ N
+ };
+ a >> n
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
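+ // SSRA accumulates with wrapping (two's-complement) addition.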
+ a.wrapping_add(vshrd_n_s64::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsrad_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
+ a.wrapping_add(vshrd_n_u64::<N>(b))
+}
+
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(N);
+ vsli_n_s8_(a, b, N)
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert_imm3!(N);
+ vsliq_n_s8_(a, b, N)
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert_imm4!(N);
+ vsli_n_s16_(a, b, N)
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm4!(N);
+ vsliq_n_s16_(a, b, N)
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ vsli_n_s32_(a, b, N)
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ vsliq_n_s32_(a, b, N)
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ static_assert!(N: i32 where N >= 0 && N <= 63);
+ vsli_n_s64_(a, b, N)
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 63);
+ vsliq_n_s64_(a, b, N)
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ transmute(vsli_n_s8_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert_imm3!(N);
+ transmute(vsliq_n_s8_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert_imm4!(N);
+ transmute(vsli_n_s16_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert_imm4!(N);
+ transmute(vsliq_n_s16_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ transmute(vsli_n_s32_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ transmute(vsliq_n_s32_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N: i32 where N >= 0 && N <= 63);
+ transmute(vsli_n_s64_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 63);
+ transmute(vsliq_n_s64_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ static_assert_imm3!(N);
+ transmute(vsli_n_s8_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ static_assert_imm3!(N);
+ transmute(vsliq_n_s8_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ static_assert_imm4!(N);
+ transmute(vsli_n_s16_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ static_assert_imm4!(N);
+ transmute(vsliq_n_s16_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsli_n_p64<const N: i32>(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t {
+ static_assert!(N: i32 where N >= 0 && N <= 63);
+ transmute(vsli_n_s64_(transmute(a), transmute(b), N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(sli, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsliq_n_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 63);
+ transmute(vsliq_n_s64_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert!(N: i32 where N >= 1 && N <= 8);
+ vsri_n_s8_(a, b, N)
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert!(N: i32 where N >= 1 && N <= 8);
+ vsriq_n_s8_(a, b, N)
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert!(N: i32 where N >= 1 && N <= 16);
+ vsri_n_s16_(a, b, N)
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert!(N: i32 where N >= 1 && N <= 16);
+ vsriq_n_s16_(a, b, N)
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert!(N: i32 where N >= 1 && N <= 32);
+ vsri_n_s32_(a, b, N)
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert!(N: i32 where N >= 1 && N <= 32);
+ vsriq_n_s32_(a, b, N)
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
+ vsri_n_s64_(a, b, N)
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
+ vsriq_n_s64_(a, b, N)
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert!(N: i32 where N >= 1 && N <= 8);
+ transmute(vsri_n_s8_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert!(N: i32 where N >= 1 && N <= 8);
+ transmute(vsriq_n_s8_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert!(N: i32 where N >= 1 && N <= 16);
+ transmute(vsri_n_s16_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert!(N: i32 where N >= 1 && N <= 16);
+ transmute(vsriq_n_s16_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert!(N: i32 where N >= 1 && N <= 32);
+ transmute(vsri_n_s32_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert!(N: i32 where N >= 1 && N <= 32);
+ transmute(vsriq_n_s32_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
+ transmute(vsri_n_s64_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
+ transmute(vsriq_n_s64_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ static_assert!(N: i32 where N >= 1 && N <= 8);
+ transmute(vsri_n_s8_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ static_assert!(N: i32 where N >= 1 && N <= 8);
+ transmute(vsriq_n_s8_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ static_assert!(N: i32 where N >= 1 && N <= 16);
+ transmute(vsri_n_s16_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ static_assert!(N: i32 where N >= 1 && N <= 16);
+ transmute(vsriq_n_s16_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsri_n_p64<const N: i32>(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
+ transmute(vsri_n_s64_(transmute(a), transmute(b), N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(sri, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vsriq_n_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ static_assert!(N: i32 where N >= 1 && N <= 64);
+ transmute(vsriq_n_s64_(transmute(a), transmute(b), N))
+}
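+
+// SLI/SRI merge shifted bits into an existing value, which avoids a separate
+// mask step. A minimal sketch (illustrative only, assuming `a` and `b` are
+// uint32x4_t values already in scope):
+//
+//     // keep the low 16 bits of each lane of `a` and insert `b << 16` above
+//     let packed = vsliq_n_u32::<16>(a, b);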
+
+/// SM3TT1A
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm3tt1a, IMM2 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vsm3tt1aq_u32<const IMM2: i32>(
+ a: uint32x4_t,
+ b: uint32x4_t,
+ c: uint32x4_t,
+) -> uint32x4_t {
+ static_assert_imm2!(IMM2);
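+ // IMM2 selects which 32-bit lane of `c` feeds the round function:
+ // sm3tt1a Vd.4S, Vn.4S, Vm.S[imm2].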
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt1a")]
+ fn vsm3tt1aq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t;
+ }
+ vsm3tt1aq_u32_(a, b, c, IMM2 as i64)
+}
+
+/// SM3TT1B
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm3tt1b, IMM2 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vsm3tt1bq_u32<const IMM2: i32>(
+ a: uint32x4_t,
+ b: uint32x4_t,
+ c: uint32x4_t,
+) -> uint32x4_t {
+ static_assert_imm2!(IMM2);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt1b")]
+ fn vsm3tt1bq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t;
+ }
+ vsm3tt1bq_u32_(a, b, c, IMM2 as i64)
+}
+
+/// SM3TT2A
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm3tt2a, IMM2 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vsm3tt2aq_u32<const IMM2: i32>(
+ a: uint32x4_t,
+ b: uint32x4_t,
+ c: uint32x4_t,
+) -> uint32x4_t {
+ static_assert_imm2!(IMM2);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt2a")]
+ fn vsm3tt2aq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t;
+ }
+ vsm3tt2aq_u32_(a, b, c, IMM2 as i64)
+}
+
+/// SM3TT2B
+#[inline]
+#[target_feature(enable = "neon,sm4")]
+#[cfg_attr(test, assert_instr(sm3tt2b, IMM2 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vsm3tt2bq_u32<const IMM2: i32>(
+ a: uint32x4_t,
+ b: uint32x4_t,
+ c: uint32x4_t,
+) -> uint32x4_t {
+ static_assert_imm2!(IMM2);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt2b")]
+ fn vsm3tt2bq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t;
+ }
+ vsm3tt2bq_u32_(a, b, c, IMM2 as i64)
+}
+
+/// Exclusive OR and rotate
+#[inline]
+#[target_feature(enable = "neon,sha3")]
+#[cfg_attr(test, assert_instr(xar, IMM6 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vxarq_u64<const IMM6: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert_imm6!(IMM6);
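+ // XAR computes (a ^ b) and rotates each 64-bit lane right by IMM6; it was
+ // introduced with the SHA-3 extension for Keccak-style rotations.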
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.xar")]
+ fn vxarq_u64_(a: uint64x2_t, b: uint64x2_t, n: i64) -> uint64x2_t;
+ }
+ vxarq_u64_(a, b, IMM6 as i64)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::aarch64::test_support::*;
+ use crate::core_arch::arm_shared::test_support::*;
+ use crate::core_arch::{aarch64::neon::*, aarch64::*, simd::*};
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+
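+ // The tests build portable `i8x8`-style vectors from `core_arch::simd`,
+ // `transmute` them into the corresponding NEON types, and compare the
+ // results lane-wise.
+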
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqadd_s8() {
+ let a = i8x8::new(i8::MIN, -3, -2, -1, 0, 1, 2, i8::MAX);
+ let b = u8x8::new(u8::MAX, 1, 2, 3, 4, 5, 6, 7);
+ let e = i8x8::new(i8::MAX, -2, 0, 2, 4, 6, 8, i8::MAX);
+ let r: i8x8 = transmute(vuqadd_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqaddq_s8() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ i8::MIN, -7, -6, -5, -4, -3, -2, -1,
+ 0, 1, 2, 3, 4, 5, 6, i8::MAX,
+ );
+ let b = u8x16::new(u8::MAX, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let e = i8x16::new(
+ i8::MAX, -6, -4, -2, 0, 2, 4, 6,
+ 8, 10, 12, 14, 16, 18, 20, i8::MAX,
+ );
+ let r: i8x16 = transmute(vuqaddq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqadd_s16() {
+ let a = i16x4::new(i16::MIN, -1, 0, i16::MAX);
+ let b = u16x4::new(u16::MAX, 1, 2, 3);
+ let e = i16x4::new(i16::MAX, 0, 2, i16::MAX);
+ let r: i16x4 = transmute(vuqadd_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqaddq_s16() {
+ let a = i16x8::new(i16::MIN, -3, -2, -1, 0, 1, 2, i16::MAX);
+ let b = u16x8::new(u16::MAX, 1, 2, 3, 4, 5, 6, 7);
+ let e = i16x8::new(i16::MAX, -2, 0, 2, 4, 6, 8, i16::MAX);
+ let r: i16x8 = transmute(vuqaddq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqadd_s32() {
+ let a = i32x2::new(i32::MIN, i32::MAX);
+ let b = u32x2::new(u32::MAX, 1);
+ let e = i32x2::new(i32::MAX, i32::MAX);
+ let r: i32x2 = transmute(vuqadd_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqaddq_s32() {
+ let a = i32x4::new(i32::MIN, -1, 0, i32::MAX);
+ let b = u32x4::new(u32::MAX, 1, 2, 3);
+ let e = i32x4::new(i32::MAX, 0, 2, i32::MAX);
+ let r: i32x4 = transmute(vuqaddq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqadd_s64() {
+ let a = i64x1::new(i64::MIN);
+ let b = u64x1::new(u64::MAX);
+ let e = i64x1::new(i64::MAX);
+ let r: i64x1 = transmute(vuqadd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuqaddq_s64() {
+ let a = i64x2::new(i64::MIN, i64::MAX);
+ let b = u64x2::new(u64::MAX, 1);
+ let e = i64x2::new(i64::MAX, i64::MAX);
+ let r: i64x2 = transmute(vuqaddq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqadd_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, u8::MAX);
+ let b = i8x8::new(i8::MIN, -3, -2, -1, 0, 1, 2, 3);
+ let e = u8x8::new(0, 0, 0, 2, 4, 6, 8, u8::MAX);
+ let r: u8x8 = transmute(vsqadd_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqaddq_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, u8::MAX);
+ let b = i8x16::new(i8::MIN, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7);
+ let e = u8x16::new(0, 0, 0, 0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, u8::MAX);
+ let r: u8x16 = transmute(vsqaddq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqadd_u16() {
+ let a = u16x4::new(0, 1, 2, u16::MAX);
+ let b = i16x4::new(i16::MIN, -1, 0, 1);
+ let e = u16x4::new(0, 0, 2, u16::MAX);
+ let r: u16x4 = transmute(vsqadd_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqaddq_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, u16::MAX);
+ let b = i16x8::new(i16::MIN, -3, -2, -1, 0, 1, 2, 3);
+ let e = u16x8::new(0, 0, 0, 2, 4, 6, 8, u16::MAX);
+ let r: u16x8 = transmute(vsqaddq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqadd_u32() {
+ let a = u32x2::new(0, u32::MAX);
+ let b = i32x2::new(i32::MIN, 1);
+ let e = u32x2::new(0, u32::MAX);
+ let r: u32x2 = transmute(vsqadd_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqaddq_u32() {
+ let a = u32x4::new(0, 1, 2, u32::MAX);
+ let b = i32x4::new(i32::MIN, -1, 0, 1);
+ let e = u32x4::new(0, 0, 2, u32::MAX);
+ let r: u32x4 = transmute(vsqaddq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqadd_u64() {
+ let a = u64x1::new(0);
+ let b = i64x1::new(i64::MIN);
+ let e = u64x1::new(0);
+ let r: u64x1 = transmute(vsqadd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsqaddq_u64() {
+ let a = u64x2::new(0, u64::MAX);
+ let b = i64x2::new(i64::MIN, 1);
+ let e = u64x2::new(0, u64::MAX);
+ let r: u64x2 = transmute(vsqaddq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_s16() {
+ let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = i16x8::new(0, -1, -2, -3, -4, -5, -6, -7);
+ let r: i16x8 = transmute(vpaddq_s16(transmute(a), transmute(b)));
+ let e = i16x8::new(3, 7, 11, 15, -1, -5, -9, -13);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_s32() {
+ let a = i32x4::new(1, 2, 3, 4);
+ let b = i32x4::new(0, -1, -2, -3);
+ let r: i32x4 = transmute(vpaddq_s32(transmute(a), transmute(b)));
+ let e = i32x4::new(3, 7, -1, -5);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_s64() {
+ let a = i64x2::new(1, 2);
+ let b = i64x2::new(0, -1);
+ let r: i64x2 = transmute(vpaddq_s64(transmute(a), transmute(b)));
+ let e = i64x2::new(3, -1);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_s8() {
+ let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = i8x16::new(
+ 0, -1, -2, -3, -4, -5, -6, -7, -8, -8, -10, -11, -12, -13, -14, -15,
+ );
+ let r: i8x16 = transmute(vpaddq_s8(transmute(a), transmute(b)));
+ let e = i8x16::new(
+ 3, 7, 11, 15, 19, 23, 27, 31, -1, -5, -9, -13, -16, -21, -25, -29,
+ );
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = u16x8::new(17, 18, 19, 20, 20, 21, 22, 23);
+ let r: u16x8 = transmute(vpaddq_u16(transmute(a), transmute(b)));
+ let e = u16x8::new(1, 5, 9, 13, 35, 39, 41, 45);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_u32() {
+ let a = u32x4::new(0, 1, 2, 3);
+ let b = u32x4::new(17, 18, 19, 20);
+ let r: u32x4 = transmute(vpaddq_u32(transmute(a), transmute(b)));
+ let e = u32x4::new(1, 5, 35, 39);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_u64() {
+ let a = u64x2::new(0, 1);
+ let b = u64x2::new(17, 18);
+ let r: u64x2 = transmute(vpaddq_u64(transmute(a), transmute(b)));
+ let e = u64x2::new(1, 35);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddq_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = u8x16::new(
+ 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 26, 27, 29, 29, 30, 31,
+ );
+ let e = u8x16::new(1, 5, 9, 13, 17, 21, 25, 29, 35, 39, 41, 45, 49, 53, 58, 61);
+ let r: u8x16 = transmute(vpaddq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddd_s64() {
+ let a = i64x2::new(2, -3);
+ let r: i64 = transmute(vpaddd_s64(transmute(a)));
+ let e = -1_i64;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddd_u64() {
+ let a = i64x2::new(2, 3);
+ let r: u64 = transmute(vpaddd_u64(transmute(a)));
+ let e = 5_u64;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_f64() {
+ let a = 1.;
+ let b = 8.;
+ let e = 9.;
+ let r: f64 = transmute(vadd_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_f64() {
+ let a = f64x2::new(1., 2.);
+ let b = f64x2::new(8., 7.);
+ let e = f64x2::new(9., 9.);
+ let r: f64x2 = transmute(vaddq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_s64() {
+ let a = 1_i64;
+ let b = 8_i64;
+ let e = 9_i64;
+ let r: i64 = transmute(vadd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_u64() {
+ let a = 1_u64;
+ let b = 8_u64;
+ let e = 9_u64;
+ let r: u64 = transmute(vadd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddd_s64() {
+ let a = 1_i64;
+ let b = 8_i64;
+ let e = 9_i64;
+ let r: i64 = transmute(vaddd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddd_u64() {
+ let a = 1_u64;
+ let b = 8_u64;
+ let e = 9_u64;
+ let r: u64 = transmute(vaddd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxv_s8() {
+ let r = vmaxv_s8(transmute(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5)));
+ assert_eq!(r, 7_i8);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxvq_s8() {
+ #[rustfmt::skip]
+ let r = vmaxvq_s8(transmute(i8x16::new(
+ 1, 2, 3, 4,
+ -16, 6, 7, 5,
+ 8, 1, 1, 1,
+ 1, 1, 1, 1,
+ )));
+ assert_eq!(r, 8_i8);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxv_s16() {
+ let r = vmaxv_s16(transmute(i16x4::new(1, 2, -4, 3)));
+ assert_eq!(r, 3_i16);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxvq_s16() {
+ let r = vmaxvq_s16(transmute(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5)));
+ assert_eq!(r, 7_i16);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxv_s32() {
+ let r = vmaxv_s32(transmute(i32x2::new(1, -4)));
+ assert_eq!(r, 1_i32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxvq_s32() {
+ let r = vmaxvq_s32(transmute(i32x4::new(1, 2, -32, 4)));
+ assert_eq!(r, 4_i32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxv_u8() {
+ let r = vmaxv_u8(transmute(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5)));
+ assert_eq!(r, 8_u8);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxvq_u8() {
+ #[rustfmt::skip]
+ let r = vmaxvq_u8(transmute(u8x16::new(
+ 1, 2, 3, 4,
+ 16, 6, 7, 5,
+ 8, 1, 1, 1,
+ 1, 1, 1, 1,
+ )));
+ assert_eq!(r, 16_u8);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxv_u16() {
+ let r = vmaxv_u16(transmute(u16x4::new(1, 2, 4, 3)));
+ assert_eq!(r, 4_u16);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxvq_u16() {
+ let r = vmaxvq_u16(transmute(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5)));
+ assert_eq!(r, 16_u16);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxv_u32() {
+ let r = vmaxv_u32(transmute(u32x2::new(1, 4)));
+ assert_eq!(r, 4_u32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxvq_u32() {
+ let r = vmaxvq_u32(transmute(u32x4::new(1, 2, 32, 4)));
+ assert_eq!(r, 32_u32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxv_f32() {
+ let r = vmaxv_f32(transmute(f32x2::new(1., 4.)));
+ assert_eq!(r, 4_f32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxvq_f32() {
+ let r = vmaxvq_f32(transmute(f32x4::new(1., 2., 32., 4.)));
+ assert_eq!(r, 32_f32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxvq_f64() {
+ let r = vmaxvq_f64(transmute(f64x2::new(1., 4.)));
+ assert_eq!(r, 4_f64);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminv_s8() {
+ let r = vminv_s8(transmute(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5)));
+ assert_eq!(r, -8_i8);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminvq_s8() {
+ #[rustfmt::skip]
+ let r = vminvq_s8(transmute(i8x16::new(
+ 1, 2, 3, 4,
+ -16, 6, 7, 5,
+ 8, 1, 1, 1,
+ 1, 1, 1, 1,
+ )));
+ assert_eq!(r, -16_i8);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminv_s16() {
+ let r = vminv_s16(transmute(i16x4::new(1, 2, -4, 3)));
+ assert_eq!(r, -4_i16);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminvq_s16() {
+ let r = vminvq_s16(transmute(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5)));
+ assert_eq!(r, -16_i16);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminv_s32() {
+ let r = vminv_s32(transmute(i32x2::new(1, -4)));
+ assert_eq!(r, -4_i32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminvq_s32() {
+ let r = vminvq_s32(transmute(i32x4::new(1, 2, -32, 4)));
+ assert_eq!(r, -32_i32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminv_u8() {
+ let r = vminv_u8(transmute(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5)));
+ assert_eq!(r, 1_u8);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminvq_u8() {
+ #[rustfmt::skip]
+ let r = vminvq_u8(transmute(u8x16::new(
+ 1, 2, 3, 4,
+ 16, 6, 7, 5,
+ 8, 1, 1, 1,
+ 1, 1, 1, 1,
+ )));
+ assert_eq!(r, 1_u8);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminv_u16() {
+ let r = vminv_u16(transmute(u16x4::new(1, 2, 4, 3)));
+ assert_eq!(r, 1_u16);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminvq_u16() {
+ let r = vminvq_u16(transmute(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5)));
+ assert_eq!(r, 1_u16);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminv_u32() {
+ let r = vminv_u32(transmute(u32x2::new(1, 4)));
+ assert_eq!(r, 1_u32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminvq_u32() {
+ let r = vminvq_u32(transmute(u32x4::new(1, 2, 32, 4)));
+ assert_eq!(r, 1_u32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminv_f32() {
+ let r = vminv_f32(transmute(f32x2::new(1., 4.)));
+ assert_eq!(r, 1_f32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminvq_f32() {
+ let r = vminvq_f32(transmute(f32x4::new(1., 2., 32., 4.)));
+ assert_eq!(r, 1_f32);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminvq_f64() {
+ let r = vminvq_f64(transmute(f64x2::new(1., 4.)));
+ assert_eq!(r, 1_f64);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminq_s8() {
+ #[rustfmt::skip]
+ let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ #[rustfmt::skip]
+ let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
+ #[rustfmt::skip]
+ let e = i8x16::new(-2, -4, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
+ let r: i8x16 = transmute(vpminq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminq_s16() {
+ let a = i16x8::new(1, -2, 3, 4, 5, 6, 7, 8);
+ let b = i16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let e = i16x8::new(-2, 3, 5, 7, 0, 2, 4, 6);
+ let r: i16x8 = transmute(vpminq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminq_s32() {
+ let a = i32x4::new(1, -2, 3, 4);
+ let b = i32x4::new(0, 3, 2, 5);
+ let e = i32x4::new(-2, 3, 0, 2);
+ let r: i32x4 = transmute(vpminq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminq_u8() {
+ #[rustfmt::skip]
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ #[rustfmt::skip]
+ let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
+ #[rustfmt::skip]
+ let e = u8x16::new(1, 3, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
+ let r: u8x16 = transmute(vpminq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminq_u16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = u16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let e = u16x8::new(1, 3, 5, 7, 0, 2, 4, 6);
+ let r: u16x8 = transmute(vpminq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminq_u32() {
+ let a = u32x4::new(1, 2, 3, 4);
+ let b = u32x4::new(0, 3, 2, 5);
+ let e = u32x4::new(1, 3, 0, 2);
+ let r: u32x4 = transmute(vpminq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminq_f32() {
+ let a = f32x4::new(1., -2., 3., 4.);
+ let b = f32x4::new(0., 3., 2., 5.);
+ let e = f32x4::new(-2., 3., 0., 2.);
+ let r: f32x4 = transmute(vpminq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpminq_f64() {
+ let a = f64x2::new(1., -2.);
+ let b = f64x2::new(0., 3.);
+ let e = f64x2::new(-2., 0.);
+ let r: f64x2 = transmute(vpminq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxq_s8() {
+ #[rustfmt::skip]
+ let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ #[rustfmt::skip]
+ let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
+ #[rustfmt::skip]
+ let e = i8x16::new(1, 3, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
+ let r: i8x16 = transmute(vpmaxq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxq_s16() {
+ let a = i16x8::new(1, -2, 3, 4, 5, 6, 7, 8);
+ let b = i16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let e = i16x8::new(1, 4, 6, 8, 3, 5, 7, 9);
+ let r: i16x8 = transmute(vpmaxq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxq_s32() {
+ let a = i32x4::new(1, -2, 3, 4);
+ let b = i32x4::new(0, 3, 2, 5);
+ let e = i32x4::new(1, 4, 3, 5);
+ let r: i32x4 = transmute(vpmaxq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxq_u8() {
+ #[rustfmt::skip]
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
+ #[rustfmt::skip]
+ let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
+ #[rustfmt::skip]
+ let e = u8x16::new(2, 4, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
+ let r: u8x16 = transmute(vpmaxq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxq_u16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = u16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let e = u16x8::new(2, 4, 6, 8, 3, 5, 7, 9);
+ let r: u16x8 = transmute(vpmaxq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxq_u32() {
+ let a = u32x4::new(1, 2, 3, 4);
+ let b = u32x4::new(0, 3, 2, 5);
+ let e = u32x4::new(2, 4, 3, 5);
+ let r: u32x4 = transmute(vpmaxq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxq_f32() {
+ let a = f32x4::new(1., -2., 3., 4.);
+ let b = f32x4::new(0., 3., 2., 5.);
+ let e = f32x4::new(1., 4., 3., 5.);
+ let r: f32x4 = transmute(vpmaxq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmaxq_f64() {
+ let a = f64x2::new(1., -2.);
+ let b = f64x2::new(0., 3.);
+ let e = f64x2::new(1., 3.);
+ let r: f64x2 = transmute(vpmaxq_f64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let b: i64x1 = i64x1::new(1);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vext_p64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_f64() {
+ let a: f64x1 = f64x1::new(0.);
+ let b: f64x1 = f64x1::new(1.);
+ let e: f64x1 = f64x1::new(0.);
+ let r: f64x1 = transmute(vext_f64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshld_n_s64() {
+ let a: i64 = 1;
+ let e: i64 = 4;
+ let r: i64 = vshld_n_s64::<2>(a);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshld_n_u64() {
+ let a: u64 = 1;
+ let e: u64 = 4;
+ let r: u64 = vshld_n_u64::<2>(a);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrd_n_s64() {
+ let a: i64 = 4;
+ let e: i64 = 1;
+ let r: i64 = vshrd_n_s64::<2>(a);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrd_n_u64() {
+ let a: u64 = 4;
+ let e: u64 = 1;
+ let r: u64 = vshrd_n_u64::<2>(a);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsrad_n_s64() {
+ let a: i64 = 1;
+ let b: i64 = 4;
+ let e: i64 = 2;
+ let r: i64 = vsrad_n_s64::<2>(a, b);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsrad_n_u64() {
+ let a: u64 = 1;
+ let b: u64 = 4;
+ let e: u64 = 2;
+ let r: u64 = vsrad_n_u64::<2>(a, b);
+ assert_eq!(r, e);
+ }
+
+ macro_rules! test_vcombine {
+ ($test_id:ident => $fn_id:ident ([$($a:expr),*], [$($b:expr),*])) => {
+ #[allow(unused_assignments)]
+ #[simd_test(enable = "neon")]
+ unsafe fn $test_id() {
+ let a = [$($a),*];
+ let b = [$($b),*];
+ let e = [$($a),* $(, $b)*];
+ let c = $fn_id(transmute(a), transmute(b));
+ // `d` is initialized from `e` only to pin down its array type and
+ // length; its value is immediately overwritten with the intrinsic result.
+ let mut d = e;
+ d = transmute(c);
+ assert_eq!(d, e);
+ }
+ }
+ }
+
+ test_vcombine!(test_vcombine_s8 => vcombine_s8([3_i8, -4, 5, -6, 7, 8, 9, 10], [13_i8, -14, 15, -16, 17, 18, 19, 110]));
+ test_vcombine!(test_vcombine_u8 => vcombine_u8([3_u8, 4, 5, 6, 7, 8, 9, 10], [13_u8, 14, 15, 16, 17, 18, 19, 110]));
+ test_vcombine!(test_vcombine_p8 => vcombine_p8([3_u8, 4, 5, 6, 7, 8, 9, 10], [13_u8, 14, 15, 16, 17, 18, 19, 110]));
+
+ test_vcombine!(test_vcombine_s16 => vcombine_s16([3_i16, -4, 5, -6], [13_i16, -14, 15, -16]));
+ test_vcombine!(test_vcombine_u16 => vcombine_u16([3_u16, 4, 5, 6], [13_u16, 14, 15, 16]));
+ test_vcombine!(test_vcombine_p16 => vcombine_p16([3_u16, 4, 5, 6], [13_u16, 14, 15, 16]));
+ // FIXME: 16-bit floats
+ // test_vcombine!(test_vcombine_f16 => vcombine_f16([3_f16, 4., 5., 6.],
+ // [13_f16, 14., 15., 16.]));
+
+ test_vcombine!(test_vcombine_s32 => vcombine_s32([3_i32, -4], [13_i32, -14]));
+ test_vcombine!(test_vcombine_u32 => vcombine_u32([3_u32, 4], [13_u32, 14]));
+ // note: poly32x4 does not exist, and neither does vcombine_p32
+ test_vcombine!(test_vcombine_f32 => vcombine_f32([3_f32, -4.], [13_f32, -14.]));
+
+ test_vcombine!(test_vcombine_s64 => vcombine_s64([-3_i64], [13_i64]));
+ test_vcombine!(test_vcombine_u64 => vcombine_u64([3_u64], [13_u64]));
+ test_vcombine!(test_vcombine_p64 => vcombine_p64([3_u64], [13_u64]));
+ test_vcombine!(test_vcombine_f64 => vcombine_f64([-3_f64], [13_f64]));
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_f64() {
+ let a: f64 = 3.3;
+ let e = f64x1::new(3.3);
+ let r: f64x1 = transmute(vdup_n_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_p64() {
+ let a: u64 = 3;
+ let e = u64x1::new(3);
+ let r: u64x1 = transmute(vdup_n_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_f64() {
+ let a: f64 = 3.3;
+ let e = f64x2::new(3.3, 3.3);
+ let r: f64x2 = transmute(vdupq_n_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_p64() {
+ let a: u64 = 3;
+ let e = u64x2::new(3, 3);
+ let r: u64x2 = transmute(vdupq_n_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_p64() {
+ let a: u64 = 3;
+ let e = u64x1::new(3);
+ let r: u64x1 = transmute(vmov_n_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_f64() {
+ let a: f64 = 3.3;
+ let e = f64x1::new(3.3);
+ let r: f64x1 = transmute(vmov_n_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_p64() {
+ let a: u64 = 3;
+ let e = u64x2::new(3, 3);
+ let r: u64x2 = transmute(vmovq_n_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_f64() {
+ let a: f64 = 3.3;
+ let e = f64x2::new(3.3, 3.3);
+ let r: f64x2 = transmute(vmovq_n_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_f64() {
+ let a = f64x2::new(1.0, 2.0);
+ let e = f64x1::new(2.0);
+ let r: f64x1 = transmute(vget_high_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_p64() {
+ let a = u64x2::new(1, 2);
+ let e = u64x1::new(2);
+ let r: u64x1 = transmute(vget_high_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_f64() {
+ let a = f64x2::new(1.0, 2.0);
+ let e = f64x1::new(1.0);
+ let r: f64x1 = transmute(vget_low_f64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_p64() {
+ let a = u64x2::new(1, 2);
+ let e = u64x1::new(1);
+ let r: u64x1 = transmute(vget_low_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_f64() {
+ let v = f64x1::new(1.0);
+ let r = vget_lane_f64::<0>(transmute(v));
+ assert_eq!(r, 1.0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_f64() {
+ let v = f64x2::new(0.0, 1.0);
+ let r = vgetq_lane_f64::<1>(transmute(v));
+ assert_eq!(r, 1.0);
+ let r = vgetq_lane_f64::<0>(transmute(v));
+ assert_eq!(r, 0.0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let r: i64x1 = transmute(vcopy_lane_s64::<0, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcopy_lane_u64::<0, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_p64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let r: i64x1 = transmute(vcopy_lane_p64::<0, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_lane_f64() {
+ let a: f64 = 1.;
+ let b: f64 = 0.;
+ let e: f64 = 0.;
+ let r: f64 = transmute(vcopy_lane_f64::<0, 0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x2 = i64x2::new(0, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let r: i64x1 = transmute(vcopy_laneq_s64::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let r: u64x1 = transmute(vcopy_laneq_u64::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_p64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x2 = i64x2::new(0, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let r: i64x1 = transmute(vcopy_laneq_p64::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcopy_laneq_f64() {
+ let a: f64 = 1.;
+ let b: f64x2 = f64x2::new(0., 0.5);
+ let e: f64 = 0.5;
+ let r: f64 = transmute(vcopy_laneq_f64::<0, 1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_u64() {
+ test_cmp_u64(
+ |i, j| vceq_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a == b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_u64() {
+ testq_cmp_u64(
+ |i, j| vceqq_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a == b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_s64() {
+ test_cmp_s64(
+ |i, j| vceq_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a == b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_s64() {
+ testq_cmp_s64(
+ |i, j| vceqq_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a == b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_p64() {
+ test_cmp_p64(
+ |i, j| vceq_p64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a == b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_p64() {
+ testq_cmp_p64(
+ |i, j| vceqq_p64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a == b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_f64() {
+ test_cmp_f64(
+ |i, j| vceq_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a == b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_f64() {
+ testq_cmp_f64(
+ |i, j| vceqq_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a == b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_s64() {
+ test_cmp_s64(
+ |i, j| vcgt_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a > b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_s64() {
+ testq_cmp_s64(
+ |i, j| vcgtq_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a > b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_u64() {
+ test_cmp_u64(
+ |i, j| vcgt_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a > b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_u64() {
+ testq_cmp_u64(
+ |i, j| vcgtq_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a > b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_f64() {
+ test_cmp_f64(
+ |i, j| vcgt_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a > b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_f64() {
+ testq_cmp_f64(
+ |i, j| vcgtq_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a > b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_s64() {
+ test_cmp_s64(
+ |i, j| vclt_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a < b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_s64() {
+ testq_cmp_s64(
+ |i, j| vcltq_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a < b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_u64() {
+ test_cmp_u64(
+ |i, j| vclt_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a < b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_u64() {
+ testq_cmp_u64(
+ |i, j| vcltq_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a < b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_f64() {
+ test_cmp_f64(
+ |i, j| vclt_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a < b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_f64() {
+ testq_cmp_f64(
+ |i, j| vcltq_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a < b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_s64() {
+ test_cmp_s64(
+ |i, j| vcle_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a <= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_s64() {
+ testq_cmp_s64(
+ |i, j| vcleq_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a <= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_u64() {
+ test_cmp_u64(
+ |i, j| vcle_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a <= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_u64() {
+ testq_cmp_u64(
+ |i, j| vcleq_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a <= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_f64() {
+ test_cmp_f64(
+ |i, j| vcle_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a <= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_f64() {
+ testq_cmp_f64(
+ |i, j| vcleq_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a <= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_s64() {
+ test_cmp_s64(
+ |i, j| vcge_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a >= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_s64() {
+ testq_cmp_s64(
+ |i, j| vcgeq_s64(i, j),
+ |a: i64, b: i64| -> u64 {
+ if a >= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_u64() {
+ test_cmp_u64(
+ |i, j| vcge_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a >= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_u64() {
+ testq_cmp_u64(
+ |i, j| vcgeq_u64(i, j),
+ |a: u64, b: u64| -> u64 {
+ if a >= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_f64() {
+ test_cmp_f64(
+ |i, j| vcge_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a >= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_f64() {
+ testq_cmp_f64(
+ |i, j| vcgeq_f64(i, j),
+ |a: f64, b: f64| -> u64 {
+ if a >= b {
+ 0xFFFFFFFFFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_f64() {
+ test_ari_f64(|i, j| vmul_f64(i, j), |a: f64, b: f64| -> f64 { a * b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_f64() {
+ testq_ari_f64(|i, j| vmulq_f64(i, j), |a: f64, b: f64| -> f64 { a * b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_f64() {
+ test_ari_f64(|i, j| vsub_f64(i, j), |a: f64, b: f64| -> f64 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_f64() {
+ testq_ari_f64(|i, j| vsubq_f64(i, j), |a: f64, b: f64| -> f64 { a - b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabsd_s64() {
+ assert_eq!(vabsd_s64(-1), 1);
+ assert_eq!(vabsd_s64(0), 0);
+ assert_eq!(vabsd_s64(1), 1);
+ assert_eq!(vabsd_s64(i64::MIN), i64::MIN);
+ assert_eq!(vabsd_s64(i64::MIN + 1), i64::MAX);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabs_s64() {
+ let a = i64x1::new(i64::MIN);
+ let r: i64x1 = transmute(vabs_s64(transmute(a)));
+ let e = i64x1::new(i64::MIN);
+ assert_eq!(r, e);
+ let a = i64x1::new(i64::MIN + 1);
+ let r: i64x1 = transmute(vabs_s64(transmute(a)));
+ let e = i64x1::new(i64::MAX);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabsq_s64() {
+ let a = i64x2::new(i64::MIN, i64::MIN + 1);
+ let r: i64x2 = transmute(vabsq_s64(transmute(a)));
+ let e = i64x2::new(i64::MIN, i64::MAX);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_f64() {
+ let a = u64x1::new(u64::MAX);
+ let b = f64x1::new(f64::MAX);
+ let c = f64x1::new(f64::MIN);
+ let e = f64x1::new(f64::MAX);
+ let r: f64x1 = transmute(vbsl_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_p64() {
+ let a = u64x1::new(u64::MAX);
+ let b = u64x1::new(u64::MAX);
+ let c = u64x1::new(u64::MIN);
+ let e = u64x1::new(u64::MAX);
+ let r: u64x1 = transmute(vbsl_p64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_f64() {
+ let a = u64x2::new(u64::MAX, 0);
+ let b = f64x2::new(f64::MAX, f64::MAX);
+ let c = f64x2::new(f64::MIN, f64::MIN);
+ let e = f64x2::new(f64::MAX, f64::MIN);
+ let r: f64x2 = transmute(vbslq_f64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_p64() {
+ let a = u64x2::new(u64::MAX, 0);
+ let b = u64x2::new(u64::MAX, u64::MAX);
+ let c = u64x2::new(u64::MIN, u64::MIN);
+ let e = u64x2::new(u64::MAX, u64::MIN);
+ let r: u64x2 = transmute(vbslq_p64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddv_s16() {
+ let a = i16x4::new(1, 2, 3, -4);
+ let r: i16 = transmute(vaddv_s16(transmute(a)));
+ let e = 2_i16;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddv_u16() {
+ let a = u16x4::new(1, 2, 3, 4);
+ let r: u16 = transmute(vaddv_u16(transmute(a)));
+ let e = 10_u16;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddv_s32() {
+ let a = i32x2::new(1, -2);
+ let r: i32 = transmute(vaddv_s32(transmute(a)));
+ let e = -1_i32;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddv_u32() {
+ let a = u32x2::new(1, 2);
+ let r: u32 = transmute(vaddv_u32(transmute(a)));
+ let e = 3_u32;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddv_s8() {
+ let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, -8);
+ let r: i8 = transmute(vaddv_s8(transmute(a)));
+ let e = 20_i8;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddv_u8() {
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8 = transmute(vaddv_u8(transmute(a)));
+ let e = 36_u8;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_s16() {
+ let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, -8);
+ let r: i16 = transmute(vaddvq_s16(transmute(a)));
+ let e = 20_i16;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_u16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16 = transmute(vaddvq_u16(transmute(a)));
+ let e = 36_u16;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_s32() {
+ let a = i32x4::new(1, 2, 3, -4);
+ let r: i32 = transmute(vaddvq_s32(transmute(a)));
+ let e = 2_i32;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_u32() {
+ let a = u32x4::new(1, 2, 3, 4);
+ let r: u32 = transmute(vaddvq_u32(transmute(a)));
+ let e = 10_u32;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_s8() {
+ let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -16);
+ let r: i8 = transmute(vaddvq_s8(transmute(a)));
+ let e = 104_i8;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_u8() {
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8 = transmute(vaddvq_u8(transmute(a)));
+ let e = 136_u8;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_s64() {
+ let a = i64x2::new(1, -2);
+ let r: i64 = transmute(vaddvq_s64(transmute(a)));
+ let e = -1_i64;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddvq_u64() {
+ let a = u64x2::new(1, 2);
+ let r: u64 = transmute(vaddvq_u64(transmute(a)));
+ let e = 3_u64;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlv_s8() {
+ let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, -8);
+ let r: i16 = vaddlv_s8(transmute(a));
+ let e = 20_i16;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlv_u8() {
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16 = vaddlv_u8(transmute(a));
+ let e = 36_u16;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlvq_s8() {
+ let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -16);
+ let r: i16 = vaddlvq_s8(transmute(a));
+ let e = 104_i16;
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddlvq_u8() {
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u16 = vaddlvq_u8(transmute(a));
+ let e = 136_u16;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_f64() {
+ let a: [f64; 2] = [0., 1.];
+ let e = f64x1::new(1.);
+ let r: f64x1 = transmute(vld1_f64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_f64() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let e = f64x2::new(1., 2.);
+ let r: f64x2 = transmute(vld1q_f64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_f64() {
+ let a: [f64; 2] = [1., 42.];
+ let e = f64x1::new(42.);
+ let r: f64x1 = transmute(vld1_dup_f64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_f64() {
+ let elem: f64 = 42.;
+ let e = f64x2::new(42., 42.);
+ let r: f64x2 = transmute(vld1q_dup_f64(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_f64() {
+ let a = f64x1::new(0.);
+ let elem: f64 = 42.;
+ let e = f64x1::new(42.);
+ let r: f64x1 = transmute(vld1_lane_f64::<0>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_f64() {
+ let a = f64x2::new(0., 1.);
+ let elem: f64 = 42.;
+ let e = f64x2::new(0., 42.);
+ let r: f64x2 = transmute(vld1q_lane_f64::<1>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_f64() {
+ let mut vals = [0_f64; 2];
+ let a = f64x1::new(1.);
+
+ vst1_f64(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0.);
+ assert_eq!(vals[1], 1.);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_f64() {
+ let mut vals = [0_f64; 3];
+ let a = f64x2::new(1., 2.);
+
+ vst1q_f64(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0.);
+ assert_eq!(vals[1], 1.);
+ assert_eq!(vals[2], 2.);
+ }
+
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm3tt1aq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let c: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(2, 1536, 4, 16395);
+ let r: u32x4 = transmute(vsm3tt1aq_u32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm3tt1bq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let c: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(2, 1536, 4, 16392);
+ let r: u32x4 = transmute(vsm3tt1bq_u32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm3tt2aq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let c: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(2, 1572864, 4, 1447435);
+ let r: u32x4 = transmute(vsm3tt2aq_u32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sm4")]
+ unsafe fn test_vsm3tt2bq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let c: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(2, 1572864, 4, 1052680);
+ let r: u32x4 = transmute(vsm3tt2bq_u32::<0>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,sha3")]
+ unsafe fn test_vxarq_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(3, 4);
+ let e: u64x2 = u64x2::new(2, 6);
+ let r: u64x2 = transmute(vxarq_u64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+}
+
+#[cfg(test)]
+#[cfg(target_endian = "little")]
+#[path = "../../arm_shared/neon/table_lookup_tests.rs"]
+mod table_lookup_tests;
+
+#[cfg(test)]
+#[path = "../../arm_shared/neon/shift_and_insert_tests.rs"]
+mod shift_and_insert_tests;
+
+#[cfg(test)]
+#[path = "../../arm_shared/neon/load_tests.rs"]
+mod load_tests;
+
+#[cfg(test)]
+#[path = "../../arm_shared/neon/store_tests.rs"]
+mod store_tests;
diff --git a/library/stdarch/crates/core_arch/src/aarch64/prefetch.rs b/library/stdarch/crates/core_arch/src/aarch64/prefetch.rs
new file mode 100644
index 000000000..3ae0ef506
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/prefetch.rs
@@ -0,0 +1,73 @@
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+extern "unadjusted" {
+ #[link_name = "llvm.prefetch"]
+ fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
+}
+
+/// See [`_prefetch`](fn._prefetch.html).
+pub const _PREFETCH_READ: i32 = 0;
+
+/// See [`_prefetch`](fn._prefetch.html).
+pub const _PREFETCH_WRITE: i32 = 1;
+
+/// See [`_prefetch`](fn._prefetch.html).
+pub const _PREFETCH_LOCALITY0: i32 = 0;
+
+/// See [`_prefetch`](fn._prefetch.html).
+pub const _PREFETCH_LOCALITY1: i32 = 1;
+
+/// See [`_prefetch`](fn._prefetch.html).
+pub const _PREFETCH_LOCALITY2: i32 = 2;
+
+/// See [`_prefetch`](fn._prefetch.html).
+pub const _PREFETCH_LOCALITY3: i32 = 3;
+
+/// Fetch the cache line that contains address `p` using the given `RW` and `LOCALITY`.
+///
+/// The `RW` must be one of:
+///
+/// * [`_PREFETCH_READ`](constant._PREFETCH_READ.html): the prefetch is preparing
+/// for a read.
+///
+/// * [`_PREFETCH_WRITE`](constant._PREFETCH_WRITE.html): the prefetch is preparing
+/// for a write.
+///
+/// The `LOCALITY` must be one of:
+///
+/// * [`_PREFETCH_LOCALITY0`](constant._PREFETCH_LOCALITY0.html): Streaming or
+/// non-temporal prefetch, for data that is used only once.
+///
+/// * [`_PREFETCH_LOCALITY1`](constant._PREFETCH_LOCALITY1.html): Fetch into level 3 cache.
+///
+/// * [`_PREFETCH_LOCALITY2`](constant._PREFETCH_LOCALITY2.html): Fetch into level 2 cache.
+///
+/// * [`_PREFETCH_LOCALITY3`](constant._PREFETCH_LOCALITY3.html): Fetch into level 1 cache.
+///
+/// The prefetch memory instructions signal to the memory system that memory accesses
+/// from a specified address are likely to occur in the near future. The memory system
+/// can respond by taking actions that are expected to speed up the memory access when
+/// they do occur, such as preloading the specified address into one or more caches.
+/// Because these signals are only hints, it is valid for a particular CPU to treat
+/// any or all prefetch instructions as a NOP.
+///
+/// [Arm's documentation](https://developer.arm.com/documentation/den0024/a/the-a64-instruction-set/memory-access-instructions/prefetching-memory?lang=en)
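+///
+/// # Examples
+///
+/// A minimal usage sketch (illustrative; the slice and the chunked loop are
+/// assumptions, not part of this API): hint the next cache line into L1
+/// ahead of a sequential pass over a byte buffer.
+///
+/// ```rust,ignore
+/// use core::arch::aarch64::{_prefetch, _PREFETCH_READ, _PREFETCH_LOCALITY3};
+///
+/// unsafe fn sum(data: &[i8]) -> i64 {
+///     let mut total = 0_i64;
+///     for chunk in data.chunks(64) {
+///         // Request the line one chunk ahead; the CPU may treat this as a NOP.
+///         _prefetch::<_PREFETCH_READ, _PREFETCH_LOCALITY3>(chunk.as_ptr().wrapping_add(64));
+///         total += chunk.iter().map(|&b| b as i64).sum::<i64>();
+///     }
+///     total
+/// }
+/// ```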
+#[inline(always)]
+#[cfg_attr(test, assert_instr("prfm pldl1strm", RW = _PREFETCH_READ, LOCALITY = _PREFETCH_LOCALITY0))]
+#[cfg_attr(test, assert_instr("prfm pldl3keep", RW = _PREFETCH_READ, LOCALITY = _PREFETCH_LOCALITY1))]
+#[cfg_attr(test, assert_instr("prfm pldl2keep", RW = _PREFETCH_READ, LOCALITY = _PREFETCH_LOCALITY2))]
+#[cfg_attr(test, assert_instr("prfm pldl1keep", RW = _PREFETCH_READ, LOCALITY = _PREFETCH_LOCALITY3))]
+#[cfg_attr(test, assert_instr("prfm pstl1strm", RW = _PREFETCH_WRITE, LOCALITY = _PREFETCH_LOCALITY0))]
+#[cfg_attr(test, assert_instr("prfm pstl3keep", RW = _PREFETCH_WRITE, LOCALITY = _PREFETCH_LOCALITY1))]
+#[cfg_attr(test, assert_instr("prfm pstl2keep", RW = _PREFETCH_WRITE, LOCALITY = _PREFETCH_LOCALITY2))]
+#[cfg_attr(test, assert_instr("prfm pstl1keep", RW = _PREFETCH_WRITE, LOCALITY = _PREFETCH_LOCALITY3))]
+#[rustc_legacy_const_generics(1, 2)]
+// FIXME: Replace this with the standard ACLE __pld/__pldx/__pli/__plix intrinsics
+pub unsafe fn _prefetch<const RW: i32, const LOCALITY: i32>(p: *const i8) {
+ // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
+ static_assert_imm1!(RW);
+ static_assert_imm2!(LOCALITY);
+ prefetch(p, RW, LOCALITY, 1);
+}
diff --git a/library/stdarch/crates/core_arch/src/aarch64/test_support.rs b/library/stdarch/crates/core_arch/src/aarch64/test_support.rs
new file mode 100644
index 000000000..9c5994b15
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/test_support.rs
@@ -0,0 +1,184 @@
+use crate::core_arch::{aarch64::neon::*, arm_shared::*, simd::*};
+use std::{i16, i32, i8, mem::transmute, u16, u32, u8, vec::Vec};
+
+macro_rules! V_u64 {
+ () => {
+ vec![
+ 0x0000000000000000u64,
+ 0x0101010101010101u64,
+ 0x0202020202020202u64,
+ 0x0F0F0F0F0F0F0F0Fu64,
+ 0x8080808080808080u64,
+ 0xF0F0F0F0F0F0F0F0u64,
+ 0xFFFFFFFFFFFFFFFFu64,
+ ]
+ };
+}
+
+macro_rules! V_f64 {
+ () => {
+ vec![
+ 0.0f64,
+ 1.0f64,
+ -1.0f64,
+ 1.2f64,
+ 2.4f64,
+ std::f64::MAX,
+ std::f64::MIN,
+ std::f64::INFINITY,
+ std::f64::NEG_INFINITY,
+ std::f64::NAN,
+ ]
+ };
+}
+
+macro_rules! to64 {
+ ($t : ident) => {
+ |v: $t| -> u64 { transmute(v) }
+ };
+}
+
+macro_rules! to128 {
+ ($t : ident) => {
+ |v: $t| -> u128 { transmute(v) }
+ };
+}
+
+pub(crate) fn test<T, U, V, W, X>(
+ vals: Vec<T>,
+ fill1: fn(T) -> V,
+ fill2: fn(U) -> W,
+ cast: fn(W) -> X,
+ test_fun: fn(V, V) -> W,
+ verify_fun: fn(T, T) -> U,
+) where
+ T: Copy + core::fmt::Debug,
+ U: Copy + core::fmt::Debug + std::cmp::PartialEq,
+ V: Copy + core::fmt::Debug,
+ W: Copy + core::fmt::Debug,
+ X: Copy + core::fmt::Debug + std::cmp::PartialEq,
+{
+ // Note: zipping `vals` with itself pairs each element with itself, so the
+ // harness only ever compares equal inputs (i == j).
+ let pairs = vals.iter().zip(vals.iter());
+
+ for (i, j) in pairs {
+ let a: V = fill1(*i);
+ let b: V = fill1(*j);
+
+ let actual_pre: W = test_fun(a, b);
+ let expected_pre: W = fill2(verify_fun(*i, *j));
+
+ let actual: X = cast(actual_pre);
+ let expected: X = cast(expected_pre);
+
+ assert_eq!(
+ actual, expected,
+ "[{:?}:{:?}] :\nf({:?}, {:?}) = {:?}\ng({:?}, {:?}) = {:?}\n",
+ *i, *j, &a, &b, actual_pre, &a, &b, expected_pre
+ );
+ }
+}
+
+macro_rules! gen_test_fn {
+ ($n: ident, $t: ident, $u: ident, $v: ident, $w: ident, $x: ident, $vals: expr, $fill1: expr, $fill2: expr, $cast: expr) => {
+ pub(crate) fn $n(test_fun: fn($v, $v) -> $w, verify_fun: fn($t, $t) -> $u) {
+ unsafe {
+ test::<$t, $u, $v, $w, $x>($vals, $fill1, $fill2, $cast, test_fun, verify_fun)
+ };
+ }
+ };
+}
+
+macro_rules! gen_fill_fn {
+ ($id: ident, $el_width: expr, $num_els: expr, $in_t : ident, $out_t: ident, $cmp_t: ident) => {
+ pub(crate) fn $id(val: $in_t) -> $out_t {
+ let initial: [$in_t; $num_els] = [val; $num_els];
+ let result: $cmp_t = unsafe { transmute(initial) };
+ let result_out: $out_t = unsafe { transmute(result) };
+
+ result_out
+ }
+ };
+}
+
+gen_fill_fn!(fill_u64, 64, 1, u64, uint64x1_t, u64);
+gen_fill_fn!(fillq_u64, 64, 2, u64, uint64x2_t, u128);
+gen_fill_fn!(fill_f64, 64, 1, f64, float64x1_t, u64);
+gen_fill_fn!(fillq_f64, 64, 2, f64, float64x2_t, u128);
+gen_fill_fn!(fill_p64, 64, 1, u64, poly64x1_t, u64);
+gen_fill_fn!(fillq_p64, 64, 2, u64, poly64x2_t, u128);
+
+gen_test_fn!(
+ test_ari_f64,
+ f64,
+ f64,
+ float64x1_t,
+ float64x1_t,
+ u64,
+ V_f64!(),
+ fill_f64,
+ fill_f64,
+ to64!(float64x1_t)
+);
+gen_test_fn!(
+ test_cmp_f64,
+ f64,
+ u64,
+ float64x1_t,
+ uint64x1_t,
+ u64,
+ V_f64!(),
+ fill_f64,
+ fill_u64,
+ to64!(uint64x1_t)
+);
+gen_test_fn!(
+ testq_ari_f64,
+ f64,
+ f64,
+ float64x2_t,
+ float64x2_t,
+ u128,
+ V_f64!(),
+ fillq_f64,
+ fillq_f64,
+ to128!(float64x2_t)
+);
+gen_test_fn!(
+ testq_cmp_f64,
+ f64,
+ u64,
+ float64x2_t,
+ uint64x2_t,
+ u128,
+ V_f64!(),
+ fillq_f64,
+ fillq_u64,
+ to128!(uint64x2_t)
+);
+
+gen_test_fn!(
+ test_cmp_p64,
+ u64,
+ u64,
+ poly64x1_t,
+ uint64x1_t,
+ u64,
+ V_u64!(),
+ fill_p64,
+ fill_u64,
+ to64!(uint64x1_t)
+);
+gen_test_fn!(
+ testq_cmp_p64,
+ u64,
+ u64,
+ poly64x2_t,
+ uint64x2_t,
+ u128,
+ V_u64!(),
+ fillq_p64,
+ fillq_u64,
+ to128!(uint64x2_t)
+);
diff --git a/library/stdarch/crates/core_arch/src/aarch64/tme.rs b/library/stdarch/crates/core_arch/src/aarch64/tme.rs
new file mode 100644
index 000000000..d1b2cf334
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/tme.rs
@@ -0,0 +1,179 @@
+//! ARM's Transactional Memory Extensions (TME).
+//!
+//! This CPU feature is available on AArch64 (the A-profile architecture)
+//! and is independent of the NEON feature set. TME-specific vendor
+//! documentation can be found in the [TME Intrinsics Introduction][tme_intrinsics_intro].
+//!
+//! The reference is [ACLE Q4 2019][acle_q4_2019_ref].
+//!
+//! ACLE has a section for TME extensions and state masks for aborts and failure codes.
+//! [ARM A64 Architecture Register Datasheet][a_profile_future] also describes possible failure code scenarios.
+//!
+//! [acle_q4_2019_ref]: https://static.docs.arm.com/101028/0010/ACLE_2019Q4_release-0010.pdf
+//! [tme_intrinsics_intro]: https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics
+//! [llvm_aarch64_int]: https://github.com/llvm/llvm-project/commit/a36d31478c182903523e04eb271bbf102bfab2cc#diff-ff24e1c35f4d54f1110ce5d90c709319R626-R646
+//! [a_profile_future]: https://static.docs.arm.com/ddi0601/a/SysReg_xml_futureA-2019-04.pdf?_ga=2.116560387.441514988.1590524918-1110153136.1588469296
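+//!
+//! # Examples
+//!
+//! A sketch of the usual begin/commit pattern with bounded retries; the
+//! retry count and the fallback path are illustrative assumptions, not
+//! requirements of the extension:
+//!
+//! ```rust,ignore
+//! unsafe {
+//!     let mut committed = false;
+//!     for _ in 0..8 {
+//!         let status = __tstart();
+//!         if status == _TMSTART_SUCCESS {
+//!             // ... transactional loads and stores go here ...
+//!             __tcommit();
+//!             committed = true;
+//!             break;
+//!         } else if status & _TMFAILURE_RTRY == 0 {
+//!             // Permanent failure: retrying cannot succeed.
+//!             break;
+//!         }
+//!     }
+//!     if !committed {
+//!         // Fall back to a non-transactional path, e.g. a lock.
+//!     }
+//! }
+//! ```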
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+extern "unadjusted" {
+ #[link_name = "llvm.aarch64.tstart"]
+ fn aarch64_tstart() -> u64;
+ #[link_name = "llvm.aarch64.tcommit"]
+ fn aarch64_tcommit() -> ();
+ #[link_name = "llvm.aarch64.tcancel"]
+ fn aarch64_tcancel(imm0: u64) -> ();
+ #[link_name = "llvm.aarch64.ttest"]
+ fn aarch64_ttest() -> u64;
+}
+
+/// Transaction successfully started.
+pub const _TMSTART_SUCCESS: u64 = 0x00_u64;
+
+/// Extraction mask for the failure reason.
+pub const _TMFAILURE_REASON: u64 = 0x00007FFF_u64;
+
+/// Transaction retry is possible.
+pub const _TMFAILURE_RTRY: u64 = 1 << 15;
+
+/// Transaction executed a TCANCEL instruction.
+pub const _TMFAILURE_CNCL: u64 = 1 << 16;
+
+/// Transaction aborted because a conflict occurred.
+pub const _TMFAILURE_MEM: u64 = 1 << 17;
+
+/// Fallback error type for any other reason.
+pub const _TMFAILURE_IMP: u64 = 1 << 18;
+
+/// Transaction aborted because a non-permissible operation was attempted.
+pub const _TMFAILURE_ERR: u64 = 1 << 19;
+
+/// Transaction aborted because the read or write set limit was exceeded.
+pub const _TMFAILURE_SIZE: u64 = 1 << 20;
+
+/// Transaction aborted because the transactional nesting level was exceeded.
+pub const _TMFAILURE_NEST: u64 = 1 << 21;
+
+/// Transaction aborted due to a debug trap.
+pub const _TMFAILURE_DBG: u64 = 1 << 22;
+
+/// Transaction failed because of an interrupt.
+pub const _TMFAILURE_INT: u64 = 1 << 23;
+
+/// Indicates that a TRIVIAL version of TME is available.
+pub const _TMFAILURE_TRIVIAL: u64 = 1 << 24;
+
+/// Starts a new transaction. When the transaction starts successfully the return value is 0.
+/// If the transaction fails, all state modifications are discarded and the cause of the failure
+/// is encoded in the return value.
+///
+/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics).
+#[inline]
+#[target_feature(enable = "tme")]
+#[cfg_attr(test, assert_instr(tstart))]
+pub unsafe fn __tstart() -> u64 {
+ aarch64_tstart()
+}
+
+/// Commits the current transaction. For a nested transaction, the only effect is that the
+/// transactional nesting depth is decreased. For an outer transaction, the state modifications
+/// performed transactionally are committed to the architectural state.
+///
+/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics).
+#[inline]
+#[target_feature(enable = "tme")]
+#[cfg_attr(test, assert_instr(tcommit))]
+pub unsafe fn __tcommit() {
+ aarch64_tcommit()
+}
+
+/// Cancels the current transaction and discards all state modifications that were performed transactionally.
+///
+/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics).
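+///
+/// A sketch (the reason code `0x42` is an arbitrary, implementation-chosen
+/// value, not a defined constant): cancel while marking the transaction as
+/// retryable.
+///
+/// ```rust,ignore
+/// unsafe { __tcancel::<{ (0x42 & _TMFAILURE_REASON) | _TMFAILURE_RTRY }>() };
+/// ```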
+#[inline]
+#[target_feature(enable = "tme")]
+#[cfg_attr(test, assert_instr(tcancel, IMM16 = 0x0))]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __tcancel<const IMM16: u64>() {
+ static_assert!(IMM16: u64 where IMM16 <= 65535);
+ aarch64_tcancel(IMM16);
+}
+
+/// Tests if executing inside a transaction. If no transaction is currently executing,
+/// the return value is 0. Otherwise, this intrinsic returns the depth of the transaction.
+///
+/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics).
+#[inline]
+#[target_feature(enable = "tme")]
+#[cfg_attr(test, assert_instr(ttest))]
+pub unsafe fn __ttest() -> u64 {
+ aarch64_ttest()
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::aarch64::*;
+
+ const CANCEL_CODE: u64 = 0x123 & _TMFAILURE_REASON;
+
+ #[simd_test(enable = "tme")]
+ unsafe fn test_tstart() {
+ let mut x = 0;
+ for i in 0..10 {
+ let code = tme::__tstart();
+ if code == _TMSTART_SUCCESS {
+ x += 1;
+ assert_eq!(x, i + 1);
+ break;
+ }
+ assert_eq!(x, 0);
+ }
+ }
+
+ #[simd_test(enable = "tme")]
+ unsafe fn test_tcommit() {
+ let mut x = 0;
+ for i in 0..10 {
+ let code = tme::__tstart();
+ if code == _TMSTART_SUCCESS {
+ x += 1;
+ assert_eq!(x, i + 1);
+ tme::__tcommit();
+ }
+ assert_eq!(x, i + 1);
+ }
+ }
+
+ #[simd_test(enable = "tme")]
+ unsafe fn test_tcancel() {
+ let mut x = 0;
+
+ for i in 0..10 {
+ let code = tme::__tstart();
+ if code == _TMSTART_SUCCESS {
+ x += 1;
+ assert_eq!(x, i + 1);
+ tme::__tcancel::<CANCEL_CODE>();
+ break;
+ }
+ }
+
+ assert_eq!(x, 0);
+ }
+
+ #[simd_test(enable = "tme")]
+ unsafe fn test_ttest() {
+ for _ in 0..10 {
+ let code = tme::__tstart();
+ if code == _TMSTART_SUCCESS {
+ if tme::__ttest() == 2 {
+ tme::__tcancel::<CANCEL_CODE>();
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/aarch64/v8.rs b/library/stdarch/crates/core_arch/src/aarch64/v8.rs
new file mode 100644
index 000000000..778721c68
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/v8.rs
@@ -0,0 +1,104 @@
+//! ARMv8 intrinsics.
+//!
+//! The reference is [ARMv8-A Reference Manual][armv8].
+//!
+//! [armv8]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0487a.k_10775/index.html
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Reverse the order of the bytes.
+#[inline]
+#[cfg_attr(test, assert_instr(rev))]
+pub unsafe fn _rev_u64(x: u64) -> u64 {
+ x.swap_bytes()
+}
+
+/// Count Leading Zeros.
+#[inline]
+#[cfg_attr(test, assert_instr(clz))]
+pub unsafe fn _clz_u64(x: u64) -> u64 {
+ x.leading_zeros() as u64
+}
+
+/// Reverse the bit order.
+#[inline]
+#[cfg_attr(test, assert_instr(rbit))]
+pub unsafe fn _rbit_u64(x: u64) -> u64 {
+ crate::intrinsics::bitreverse(x)
+}
+
+/// Counts the leading bits that match the value of the most significant
+/// (sign) bit, not counting the sign bit itself.
+///
+/// When all bits of the operand are set (or all are clear) it returns one
+/// less than the size of the operand in bits.
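+///
+/// For example, `0xE000_0000` has a sign bit of one followed by two more
+/// ones, so `_cls_u32(0xE000_0000)` returns `2`.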
+#[inline]
+#[cfg_attr(test, assert_instr(cls))]
+pub unsafe fn _cls_u32(x: u32) -> u32 {
+ // XOR each bit with a sign-extension of the top bit so that leading
+ // copies of the sign bit become leading zeros; `<< 1 | 1` drops the
+ // sign bit itself and guarantees a terminating one bit.
+ u32::leading_zeros((((((x as i32) >> 31) as u32) ^ x) << 1) | 1)
+}
+
+/// Counts the leading bits that match the value of the most significant
+/// (sign) bit, not counting the sign bit itself.
+///
+/// When all bits of the operand are set (or all are clear) it returns one
+/// less than the size of the operand in bits.
+#[inline]
+#[cfg_attr(test, assert_instr(cls))]
+pub unsafe fn _cls_u64(x: u64) -> u64 {
+ u64::leading_zeros((((((x as i64) >> 63) as u64) ^ x) << 1) | 1) as u64
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::aarch64::v8;
+
+ #[test]
+ fn _rev_u64() {
+ unsafe {
+ assert_eq!(
+ v8::_rev_u64(0b0000_0000_1111_1111_0000_0000_1111_1111_u64),
+ 0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64
+ );
+ }
+ }
+
+ #[test]
+ fn _clz_u64() {
+ unsafe {
+ assert_eq!(v8::_clz_u64(0b0000_1010u64), 60u64);
+ }
+ }
+
+ #[test]
+ fn _rbit_u64() {
+ unsafe {
+ assert_eq!(
+ v8::_rbit_u64(0b0000_0000_1111_1101_0000_0000_1111_1111_u64),
+ 0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64
+ );
+ }
+ }
+
+ #[test]
+ fn _cls_u32() {
+ unsafe {
+ assert_eq!(
+ v8::_cls_u32(0b1111_1111_1111_1111_0000_0000_1111_1111_u32),
+ 15_u32
+ );
+ }
+ }
+
+ #[test]
+ fn _cls_u64() {
+ unsafe {
+ assert_eq!(
+ v8::_cls_u64(
+ 0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64
+ ),
+ 15_u64
+ );
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/armclang.rs b/library/stdarch/crates/core_arch/src/arm/armclang.rs
new file mode 100644
index 000000000..e68c02d02
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/armclang.rs
@@ -0,0 +1,35 @@
+//! ARM compiler specific intrinsics
+//!
+//! # References
+//!
+//! - [ARM Compiler v 6.10 - armclang Reference Guide][arm_comp_ref]
+//!
+//! [arm_comp_ref]: https://developer.arm.com/docs/100067/0610
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Inserts a breakpoint instruction.
+///
+/// `VAL` is a compile-time constant integer in range `[0, 255]`.
+///
+/// The breakpoint instruction inserted is `BKPT` on A32/T32.
+///
+/// # Note
+///
+/// [ARM's documentation][arm_docs] defines that `__breakpoint` accepts the
+/// following values for `VAL`:
+///
+/// - `0...65535` when compiling as A32,
+/// - `0...255` when compiling as T32.
+///
+/// The current implementation only accepts values in range `[0, 255]`.
+///
+/// [arm_docs]: https://developer.arm.com/docs/100067/latest/compiler-specific-intrinsics/__breakpoint-intrinsic
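+///
+/// # Examples
+///
+/// A minimal sketch (illustrative): trap into an attached debugger with an
+/// arbitrary tag in the accepted range.
+///
+/// ```rust,ignore
+/// unsafe { __breakpoint::<42>() };
+/// ```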
+#[cfg_attr(test, assert_instr(bkpt, VAL = 0))]
+#[inline(always)]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __breakpoint<const VAL: i32>() {
+ static_assert_imm8!(VAL);
+ crate::arch::asm!("bkpt #{}", const VAL);
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/dsp.rs b/library/stdarch/crates/core_arch/src/arm/dsp.rs
new file mode 100644
index 000000000..6720f97a5
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/dsp.rs
@@ -0,0 +1,384 @@
+//! # References:
+//!
+//! - ACLE Section 8.3 "16-bit multiplications"
+//!
+//! Intrinsics that could live here:
+//!
+//! - \[x\] __smulbb
+//! - \[x\] __smulbt
+//! - \[x\] __smultb
+//! - \[x\] __smultt
+//! - \[x\] __smulwb
+//! - \[x\] __smulwt
+//! - \[x\] __qadd
+//! - \[x\] __qsub
+//! - \[x\] __qdbl
+//! - \[x\] __smlabb
+//! - \[x\] __smlabt
+//! - \[x\] __smlatb
+//! - \[x\] __smlatt
+//! - \[x\] __smlawb
+//! - \[x\] __smlawt
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::mem::transmute;
+
+types! {
+ /// ARM-specific 32-bit wide vector of two packed `i16`.
+ pub struct int16x2_t(i16, i16);
+ /// ARM-specific 32-bit wide vector of two packed `u16`.
+ pub struct uint16x2_t(u16, u16);
+}
+
+extern "unadjusted" {
+ #[link_name = "llvm.arm.smulbb"]
+ fn arm_smulbb(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smulbt"]
+ fn arm_smulbt(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smultb"]
+ fn arm_smultb(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smultt"]
+ fn arm_smultt(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smulwb"]
+ fn arm_smulwb(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smulwt"]
+ fn arm_smulwt(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qadd"]
+ fn arm_qadd(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub"]
+ fn arm_qsub(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlabb"]
+ fn arm_smlabb(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlabt"]
+ fn arm_smlabt(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlatb"]
+ fn arm_smlatb(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlatt"]
+ fn arm_smlatt(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlawb"]
+ fn arm_smlawb(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlawt"]
+ fn arm_smlawt(a: i32, b: i32, c: i32) -> i32;
+}
+
+/// Insert a SMULBB instruction
+///
+/// Returns the equivalent of a\[0\] * b\[0\]
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+#[inline]
+#[cfg_attr(test, assert_instr(smulbb))]
+pub unsafe fn __smulbb(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smulbb(transmute(a), transmute(b))
+}
+
+/// Insert a SMULTB instruction
+///
+/// Returns the equivalent of a\[1\] * b\[0\]
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+#[inline]
+#[cfg_attr(test, assert_instr(smultb))]
+pub unsafe fn __smultb(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smultb(transmute(a), transmute(b))
+}
+
+/// Insert a SMULBT instruction
+///
+/// Returns the equivalent of a\[0\] * b\[1\]
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+#[inline]
+#[cfg_attr(test, assert_instr(smulbt))]
+pub unsafe fn __smulbt(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smulbt(transmute(a), transmute(b))
+}
+
+/// Insert a SMULTT instruction
+///
+/// Returns the equivalent of a\[1\] * b\[1\]
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+#[inline]
+#[cfg_attr(test, assert_instr(smultt))]
+pub unsafe fn __smultt(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smultt(transmute(a), transmute(b))
+}
+
+/// Insert a SMULWB instruction
+///
+/// Multiplies the 32-bit signed first operand with the low halfword
+/// (as a 16-bit signed integer) of the second operand.
+/// Returns the top 32 bits of the 48-bit product.
+#[inline]
+#[cfg_attr(test, assert_instr(smulwb))]
+pub unsafe fn __smulwb(a: int16x2_t, b: i32) -> i32 {
+ arm_smulwb(transmute(a), b)
+}
+
+/// Insert a SMULWT instruction
+///
+/// Multiplies the 32-bit signed first operand with the high halfword
+/// (as a 16-bit signed integer) of the second operand.
+/// Returns the top 32 bits of the 48-bit product.
+#[inline]
+#[cfg_attr(test, assert_instr(smulwt))]
+pub unsafe fn __smulwt(a: int16x2_t, b: i32) -> i32 {
+ arm_smulwt(transmute(a), b)
+}
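+
+// Worked example (illustrative, not part of the upstream source): with
+// a = (10, 20), `transmute(a)` is the 32-bit value (20 << 16) | 10, so
+// __smulwb(a, 30) computes (((20 << 16) | 10) * 30) >> 16 = 20 * 30 = 600;
+// the low partial product 10 * 30 is shifted out entirely.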
+
+/// Signed saturating addition
+///
+/// Returns the 32-bit saturating signed equivalent of a + b.
+/// Sets the Q flag if saturation occurs.
+#[inline]
+#[cfg_attr(test, assert_instr(qadd))]
+pub unsafe fn __qadd(a: i32, b: i32) -> i32 {
+ arm_qadd(a, b)
+}
+
+/// Signed saturating subtraction
+///
+/// Returns the 32-bit saturating signed equivalent of a - b.
+/// Sets the Q flag if saturation occurs.
+#[inline]
+#[cfg_attr(test, assert_instr(qsub))]
+pub unsafe fn __qsub(a: i32, b: i32) -> i32 {
+ arm_qsub(a, b)
+}
+
+/// Insert a QADD instruction (saturating doubling)
+///
+/// Returns the 32-bit saturating signed equivalent of a + a.
+/// Sets the Q flag if saturation occurs.
+#[inline]
+#[cfg_attr(test, assert_instr(qadd))]
+pub unsafe fn __qdbl(a: i32) -> i32 {
+ arm_qadd(a, a)
+}
+
+/// Insert a SMLABB instruction
+///
+/// Returns the equivalent of a\[0\] * b\[0\] + c
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlabb))]
+pub unsafe fn __smlabb(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlabb(transmute(a), transmute(b), c)
+}
+
+/// Insert a SMLABT instruction
+///
+/// Returns the equivalent of a\[0\] * b\[1\] + c
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlabt))]
+pub unsafe fn __smlabt(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlabt(transmute(a), transmute(b), c)
+}
+
+/// Insert a SMLATB instruction
+///
+/// Returns the equivalent of a\[1\] * b\[0\] + c
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlatb))]
+pub unsafe fn __smlatb(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlatb(transmute(a), transmute(b), c)
+}
+
+/// Insert a SMLATT instruction
+///
+/// Returns the equivalent of a\[1\] * b\[1\] + c
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlatt))]
+pub unsafe fn __smlatt(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlatt(transmute(a), transmute(b), c)
+}
+
+/// Insert a SMLAWB instruction
+///
+/// Returns the equivalent of (a * b\[0\] + (c << 16)) >> 16
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlawb))]
+pub unsafe fn __smlawb(a: i32, b: int16x2_t, c: i32) -> i32 {
+ arm_smlawb(a, transmute(b), c)
+}
+
+/// Insert a SMLAWT instruction
+///
+/// Returns the equivalent of (a * b\[1\] + (c << 16)) >> 16
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlawt))]
+pub unsafe fn __smlawt(a: i32, b: int16x2_t, c: i32) -> i32 {
+ arm_smlawt(a, transmute(b), c)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::{
+ arm::*,
+ simd::{i16x2, i8x4, u8x4},
+ };
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+
+ #[test]
+ fn smulbb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ assert_eq!(super::__smulbb(transmute(a), transmute(b)), 10 * 30);
+ }
+ }
+
+ #[test]
+ fn smulbt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ assert_eq!(super::__smulbt(transmute(a), transmute(b)), 10 * 40);
+ }
+ }
+
+ #[test]
+ fn smultb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ assert_eq!(super::__smultb(transmute(a), transmute(b)), 20 * 30);
+ }
+ }
+
+ #[test]
+ fn smultt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ assert_eq!(super::__smultt(transmute(a), transmute(b)), 20 * 40);
+ }
+ }
+
+ #[test]
+ fn smulwb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = 30;
+ assert_eq!(super::__smulwb(transmute(a), b), 20 * b);
+ }
+ }
+
+ #[test]
+ fn smulwt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = 30;
+ assert_eq!(super::__smulwt(transmute(a), b), (10 * b) >> 16);
+ }
+ }
+
+ #[test]
+ fn qadd() {
+ unsafe {
+ assert_eq!(super::__qadd(-10, 60), 50);
+ assert_eq!(super::__qadd(i32::MAX, 10), i32::MAX);
+ assert_eq!(super::__qadd(i32::MIN, -10), i32::MIN);
+ }
+ }
+
+ #[test]
+ fn qsub() {
+ unsafe {
+ assert_eq!(super::__qsub(10, 60), -50);
+ assert_eq!(super::__qsub(i32::MAX, -10), i32::MAX);
+ assert_eq!(super::__qsub(i32::MIN, 10), i32::MIN);
+ }
+ }
+
+ #[test]
+ fn qdbl() {
+ unsafe {
+ assert_eq!(super::__qdbl(10), 20);
+ assert_eq!(super::__qdbl(i32::MAX), i32::MAX);
+ }
+ }
+
+ #[test]
+ fn smlabb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ let c = 50;
+ let r = (10 * 30) + c;
+ assert_eq!(super::__smlabb(transmute(a), transmute(b), c), r);
+ }
+ }
+
+ #[test]
+ fn smlabt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ let c = 50;
+ let r = (10 * 40) + c;
+ assert_eq!(super::__smlabt(transmute(a), transmute(b), c), r);
+ }
+ }
+
+ #[test]
+ fn smlatb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ let c = 50;
+ let r = (20 * 30) + c;
+ assert_eq!(super::__smlatb(transmute(a), transmute(b), c), r);
+ }
+ }
+
+ #[test]
+ fn smlatt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ let c = 50;
+ let r = (20 * 40) + c;
+ assert_eq!(super::__smlatt(transmute(a), transmute(b), c), r);
+ }
+ }
+
+ #[test]
+ fn smlawb() {
+ unsafe {
+ let a: i32 = 10;
+ let b = i16x2::new(30, 40);
+ let c: i32 = 50;
+ let r: i32 = ((a * 30) + (c << 16)) >> 16;
+ assert_eq!(super::__smlawb(a, transmute(b), c), r);
+ }
+ }
+
+ #[test]
+ fn smlawt() {
+ unsafe {
+ let a: i32 = 10;
+ let b = i16x2::new(30, 40);
+ let c: i32 = 50;
+ let r: i32 = ((a * 40) + (c << 16)) >> 16;
+ assert_eq!(super::__smlawt(a, transmute(b), c), r);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/ex.rs b/library/stdarch/crates/core_arch/src/arm/ex.rs
new file mode 100644
index 000000000..75f378642
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/ex.rs
@@ -0,0 +1,125 @@
+// Reference: Section 5.4.4 "LDREX / STREX" of ACLE
+
+/// Removes the exclusive lock created by LDREX
+// Supported: v6, v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6-M
+// NOTE: there's no dedicated CLREX instruction in v6 (<v6k); to clear the exclusive monitor users
+// have to do a dummy STREX operation
+#[cfg(any(
+ all(target_feature = "v6k", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+ doc
+))]
+pub unsafe fn __clrex() {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.clrex"]
+ fn clrex();
+ }
+
+ clrex()
+}
+
+/// Executes an exclusive LDR instruction for an 8 bit value.
+// Supported: v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6, v6-M
+#[cfg(any(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+ doc
+))]
+pub unsafe fn __ldrexb(p: *const u8) -> u8 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.ldrex.p0i8"]
+ fn ldrex8(p: *const u8) -> u32;
+ }
+
+ ldrex8(p) as u8
+}
+
+/// Executes an exclusive LDR instruction for a 16 bit value.
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(any(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+ doc
+))]
+pub unsafe fn __ldrexh(p: *const u16) -> u16 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.ldrex.p0i16"]
+ fn ldrex16(p: *const u16) -> u32;
+ }
+
+ ldrex16(p) as u16
+}
+
+/// Executes an exclusive LDR instruction for a 32 bit value.
+// Supported: v6, v7-M, v6K, v7-A, v7-R, v8
+// Not supported: v5, v6-M
+#[cfg(any(
+ all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+ doc
+))]
+pub unsafe fn __ldrex(p: *const u32) -> u32 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.ldrex.p0i32"]
+ fn ldrex32(p: *const u32) -> u32;
+ }
+
+ ldrex32(p)
+}
+
+/// Executes an exclusive STR instruction for 8 bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6, v6-M
+#[cfg(any(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+ doc
+))]
+pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.strex.p0i8"]
+ fn strex8(value: u32, addr: *mut u8) -> u32;
+ }
+
+ strex8(value, addr)
+}
+
+/// Executes an exclusive STR instruction for 16 bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(target_feature = "aarch64")]
+#[cfg(any(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+ doc
+))]
+pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.strex.p0i16"]
+ fn strex16(value: u32, addr: *mut u16) -> u32;
+ }
+
+ strex16(value as u32, addr)
+}
+
+/// Executes an exclusive STR instruction for 32 bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6, v7-M, v6K, v7-A, v7-R, v8
+// Not supported: v5, v6-M
+#[cfg(any(
+ all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+ doc
+))]
+pub unsafe fn __strex(value: u32, addr: *mut u32) -> u32 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.strex.p0i32"]
+ fn strex32(value: u32, addr: *mut u32) -> u32;
+ }
+
+ strex32(value, addr)
+}
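+
+// Usage sketch (illustrative, not part of the upstream source): LDREX/STREX
+// pair into a retry loop; STREX returns non-zero when another observer broke
+// the exclusive reservation between the two instructions.
+//
+//     unsafe fn atomic_increment(p: *mut u32) {
+//         loop {
+//             let v = __ldrex(p);
+//             if __strex(v.wrapping_add(1), p) == 0 {
+//                 break; // the store landed while the reservation was held
+//             }
+//         }
+//     }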
diff --git a/library/stdarch/crates/core_arch/src/arm/mod.rs b/library/stdarch/crates/core_arch/src/arm/mod.rs
new file mode 100644
index 000000000..efe0068d4
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/mod.rs
@@ -0,0 +1,113 @@
+//! ARM intrinsics.
+//!
+//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The
+//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful.
+//!
+//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
+//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics
+
+mod armclang;
+pub use self::armclang::*;
+
+mod v6;
+pub use self::v6::*;
+
+// Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT)
+#[cfg(any(target_feature = "v6", doc))]
+mod sat;
+
+#[cfg(any(target_feature = "v6", doc))]
+pub use self::sat::*;
+
+// Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD)
+// We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see
+// section 5.4.7)
+// Here we work around the difference between LLVM's +dsp and ACLE's __ARM_FEATURE_DSP by gating on
+// '+v5te' rather than on '+dsp'
+#[cfg(any(
+ // >= v5TE but excludes v7-M
+ all(target_feature = "v5te", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ doc,
+))]
+pub mod dsp;
+
+#[cfg(any(
+ // >= v5TE but excludes v7-M
+ all(target_feature = "v5te", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ doc,
+))]
+pub use self::dsp::*;
+
+// Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says
+// Section 5.4.9 of ACLE. We'll expose these for the A profile even if deprecated
+#[cfg(any(
+ // v7-A, v7-R
+ all(target_feature = "v6", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ doc,
+))]
+mod simd32;
+
+#[cfg(any(
+ // v7-A, v7-R
+ all(target_feature = "v6", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ doc,
+))]
+pub use self::simd32::*;
+
+#[cfg(any(target_feature = "v7", doc))]
+mod v7;
+#[cfg(any(target_feature = "v7", doc))]
+pub use self::v7::*;
+
+mod ex;
+pub use self::ex::*;
+
+pub use crate::core_arch::arm_shared::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[cfg(any(target_feature = "v7", doc))]
+pub(crate) mod neon;
+#[cfg(any(target_feature = "v7", doc))]
+pub use neon::*;
+
+/// Generates the trap instruction `UDF`
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(udf))]
+#[inline]
+pub unsafe fn udf() -> ! {
+ crate::intrinsics::abort()
+}
+
+/// Generates a DBG instruction.
+///
+/// This provides a hint to debugging and related systems. The argument must be
+/// a constant integer from 0 to 15 inclusive. See implementation documentation
+/// for the effect (if any) of this instruction and the meaning of the
+/// argument. This is available only when compiling for AArch32.
+// Section 10.1 of ACLE says that the supported arches are: 7, 7-M
+// "The DBG hint instruction is added in ARMv7. It is UNDEFINED in the ARMv6 base architecture, and
+// executes as a NOP instruction in ARMv6K and ARMv6T2." - ARM Architecture Reference Manual ARMv7-A
+// and ARMv7-R edition (ARM DDI 0406C.c) sections D12.4.1 "ARM instruction set support" and D12.4.2
+// "Thumb instruction set support"
+#[cfg(any(target_feature = "v7", doc))]
+#[inline(always)]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __dbg<const IMM4: i32>() {
+ static_assert_imm4!(IMM4);
+ dbg(IMM4);
+}
+
+extern "unadjusted" {
+ #[link_name = "llvm.arm.dbg"]
+ fn dbg(_: i32);
+}
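+
+// Usage sketch (illustrative, not part of the upstream source): the hint level
+// is a const generic in [0, 15]; on current implementations DBG behaves as a
+// NOP unless external debug hardware consumes the hint.
+//
+//     unsafe { __dbg::<0>() };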
diff --git a/library/stdarch/crates/core_arch/src/arm/neon.rs b/library/stdarch/crates/core_arch/src/arm/neon.rs
new file mode 100644
index 000000000..a0ad92c33
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/neon.rs
@@ -0,0 +1,1369 @@
+use crate::core_arch::arm_shared::neon::*;
+use crate::core_arch::simd::{f32x4, i32x4, u32x4};
+use crate::core_arch::simd_llvm::*;
+use crate::mem::{align_of, transmute};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(non_camel_case_types)]
+pub(crate) type p8 = u8;
+#[allow(non_camel_case_types)]
+pub(crate) type p16 = u16;
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ #[link_name = "llvm.arm.neon.vbsl.v8i8"]
+ fn vbsl_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vbsl.v16i8"]
+ fn vbslq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
+ #[link_name = "llvm.arm.neon.vpadals.v4i16.v8i8"]
+ pub(crate) fn vpadal_s8_(a: int16x4_t, b: int8x8_t) -> int16x4_t;
+ #[link_name = "llvm.arm.neon.vpadals.v2i32.v4i16"]
+ pub(crate) fn vpadal_s16_(a: int32x2_t, b: int16x4_t) -> int32x2_t;
+ #[link_name = "llvm.arm.neon.vpadals.v1i64.v2i32"]
+ pub(crate) fn vpadal_s32_(a: int64x1_t, b: int32x2_t) -> int64x1_t;
+ #[link_name = "llvm.arm.neon.vpadals.v8i16.v16i8"]
+ pub(crate) fn vpadalq_s8_(a: int16x8_t, b: int8x16_t) -> int16x8_t;
+ #[link_name = "llvm.arm.neon.vpadals.v4i32.v8i16"]
+ pub(crate) fn vpadalq_s16_(a: int32x4_t, b: int16x8_t) -> int32x4_t;
+ #[link_name = "llvm.arm.neon.vpadals.v2i64.v4i32"]
+ pub(crate) fn vpadalq_s32_(a: int64x2_t, b: int32x4_t) -> int64x2_t;
+
+ #[link_name = "llvm.arm.neon.vpadalu.v4i16.v8i8"]
+ pub(crate) fn vpadal_u8_(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v2i32.v4i16"]
+ pub(crate) fn vpadal_u16_(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v1i64.v2i32"]
+ pub(crate) fn vpadal_u32_(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v8i16.v16i8"]
+ pub(crate) fn vpadalq_u8_(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v4i32.v8i16"]
+ pub(crate) fn vpadalq_u16_(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v2i64.v4i32"]
+ pub(crate) fn vpadalq_u32_(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t;
+
+ #[link_name = "llvm.arm.neon.vtbl1"]
+ fn vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vtbl2"]
+ fn vtbl2(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vtbl3"]
+ fn vtbl3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vtbl4"]
+ fn vtbl4(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t;
+
+ #[link_name = "llvm.arm.neon.vtbx1"]
+ fn vtbx1(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vtbx2"]
+ fn vtbx2(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vtbx3"]
+ fn vtbx3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vtbx4"]
+ fn vtbx4(
+ a: int8x8_t,
+ b: int8x8_t,
+ c: int8x8_t,
+ d: int8x8_t,
+ e: int8x8_t,
+ f: int8x8_t,
+ ) -> int8x8_t;
+
+ #[link_name = "llvm.arm.neon.vshiftins.v8i8"]
+ fn vshiftins_v8i8(a: int8x8_t, b: int8x8_t, shift: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v16i8"]
+ fn vshiftins_v16i8(a: int8x16_t, b: int8x16_t, shift: int8x16_t) -> int8x16_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v4i16"]
+ fn vshiftins_v4i16(a: int16x4_t, b: int16x4_t, shift: int16x4_t) -> int16x4_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v8i16"]
+ fn vshiftins_v8i16(a: int16x8_t, b: int16x8_t, shift: int16x8_t) -> int16x8_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v2i32"]
+ fn vshiftins_v2i32(a: int32x2_t, b: int32x2_t, shift: int32x2_t) -> int32x2_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v4i32"]
+ fn vshiftins_v4i32(a: int32x4_t, b: int32x4_t, shift: int32x4_t) -> int32x4_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v1i64"]
+ fn vshiftins_v1i64(a: int64x1_t, b: int64x1_t, shift: int64x1_t) -> int64x1_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v2i64"]
+ fn vshiftins_v2i64(a: int64x2_t, b: int64x2_t, shift: int64x2_t) -> int64x2_t;
+
+ #[link_name = "llvm.arm.neon.vld1.v8i8.p0i8"]
+ fn vld1_v8i8(addr: *const i8, align: i32) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vld1.v16i8.p0i8"]
+ fn vld1q_v16i8(addr: *const i8, align: i32) -> int8x16_t;
+ #[link_name = "llvm.arm.neon.vld1.v4i16.p0i8"]
+ fn vld1_v4i16(addr: *const i8, align: i32) -> int16x4_t;
+ #[link_name = "llvm.arm.neon.vld1.v8i16.p0i8"]
+ fn vld1q_v8i16(addr: *const i8, align: i32) -> int16x8_t;
+ #[link_name = "llvm.arm.neon.vld1.v2i32.p0i8"]
+ fn vld1_v2i32(addr: *const i8, align: i32) -> int32x2_t;
+ #[link_name = "llvm.arm.neon.vld1.v4i32.p0i8"]
+ fn vld1q_v4i32(addr: *const i8, align: i32) -> int32x4_t;
+ #[link_name = "llvm.arm.neon.vld1.v1i64.p0i8"]
+ fn vld1_v1i64(addr: *const i8, align: i32) -> int64x1_t;
+ #[link_name = "llvm.arm.neon.vld1.v2i64.p0i8"]
+ fn vld1q_v2i64(addr: *const i8, align: i32) -> int64x2_t;
+ #[link_name = "llvm.arm.neon.vld1.v2f32.p0i8"]
+ fn vld1_v2f32(addr: *const i8, align: i32) -> float32x2_t;
+ #[link_name = "llvm.arm.neon.vld1.v4f32.p0i8"]
+ fn vld1q_v4f32(addr: *const i8, align: i32) -> float32x4_t;
+
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v8i8"]
+ fn vst1_v8i8(addr: *const i8, val: int8x8_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v16i8"]
+ fn vst1q_v16i8(addr: *const i8, val: int8x16_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v4i16"]
+ fn vst1_v4i16(addr: *const i8, val: int16x4_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v8i16"]
+ fn vst1q_v8i16(addr: *const i8, val: int16x8_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v2i32"]
+ fn vst1_v2i32(addr: *const i8, val: int32x2_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v4i32"]
+ fn vst1q_v4i32(addr: *const i8, val: int32x4_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v1i64"]
+ fn vst1_v1i64(addr: *const i8, val: int64x1_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v2i64"]
+ fn vst1q_v2i64(addr: *const i8, val: int64x2_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v2f32"]
+ fn vst1_v2f32(addr: *const i8, val: float32x2_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v4f32"]
+ fn vst1q_v4f32(addr: *const i8, val: float32x4_t, align: i32);
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t {
+ vld1_v8i8(ptr as *const i8, align_of::<i8>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t {
+ vld1q_v16i8(ptr as *const i8, align_of::<i8>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t {
+ vld1_v4i16(ptr as *const i8, align_of::<i16>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t {
+ vld1q_v8i16(ptr as *const i8, align_of::<i16>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t {
+ vld1_v2i32(ptr as *const i8, align_of::<i32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t {
+ vld1q_v4i32(ptr as *const i8, align_of::<i32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t {
+ vld1_v1i64(ptr as *const i8, align_of::<i64>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.64"))]
+pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t {
+ vld1q_v2i64(ptr as *const i8, align_of::<i64>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t {
+ transmute(vld1_v8i8(ptr as *const i8, align_of::<u8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t {
+ transmute(vld1q_v16i8(ptr as *const i8, align_of::<u8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t {
+ transmute(vld1_v4i16(ptr as *const i8, align_of::<u16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t {
+ transmute(vld1q_v8i16(ptr as *const i8, align_of::<u16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t {
+ transmute(vld1_v2i32(ptr as *const i8, align_of::<u32>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t {
+ transmute(vld1q_v4i32(ptr as *const i8, align_of::<u32>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t {
+ transmute(vld1_v1i64(ptr as *const i8, align_of::<u64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.64"))]
+pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t {
+ transmute(vld1q_v2i64(ptr as *const i8, align_of::<u64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t {
+ transmute(vld1_v8i8(ptr as *const i8, align_of::<p8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t {
+ transmute(vld1q_v16i8(ptr as *const i8, align_of::<p8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t {
+ transmute(vld1_v4i16(ptr as *const i8, align_of::<p16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t {
+ transmute(vld1q_v8i16(ptr as *const i8, align_of::<p16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_p64(ptr: *const p64) -> poly64x1_t {
+ transmute(vld1_v1i64(ptr as *const i8, align_of::<p64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr("vld1.64"))]
+pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t {
+ transmute(vld1q_v2i64(ptr as *const i8, align_of::<p64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t {
+ vld1_v2f32(ptr as *const i8, align_of::<f32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t {
+ vld1q_v4f32(ptr as *const i8, align_of::<f32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1_s8(ptr: *mut i8, a: int8x8_t) {
+ vst1_v8i8(ptr as *const i8, a, align_of::<i8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1q_s8(ptr: *mut i8, a: int8x16_t) {
+ vst1q_v16i8(ptr as *const i8, a, align_of::<i8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1_s16(ptr: *mut i16, a: int16x4_t) {
+ vst1_v4i16(ptr as *const i8, a, align_of::<i16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1q_s16(ptr: *mut i16, a: int16x8_t) {
+ vst1q_v8i16(ptr as *const i8, a, align_of::<i16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1_s32(ptr: *mut i32, a: int32x2_t) {
+ vst1_v2i32(ptr as *const i8, a, align_of::<i32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1q_s32(ptr: *mut i32, a: int32x4_t) {
+ vst1q_v4i32(ptr as *const i8, a, align_of::<i32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1_s64(ptr: *mut i64, a: int64x1_t) {
+ vst1_v1i64(ptr as *const i8, a, align_of::<i64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1q_s64(ptr: *mut i64, a: int64x2_t) {
+ vst1q_v2i64(ptr as *const i8, a, align_of::<i64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1_u8(ptr: *mut u8, a: uint8x8_t) {
+ vst1_v8i8(ptr as *const i8, transmute(a), align_of::<u8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1q_u8(ptr: *mut u8, a: uint8x16_t) {
+ vst1q_v16i8(ptr as *const i8, transmute(a), align_of::<u8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1_u16(ptr: *mut u16, a: uint16x4_t) {
+ vst1_v4i16(ptr as *const i8, transmute(a), align_of::<u16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1q_u16(ptr: *mut u16, a: uint16x8_t) {
+ vst1q_v8i16(ptr as *const i8, transmute(a), align_of::<u16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1_u32(ptr: *mut u32, a: uint32x2_t) {
+ vst1_v2i32(ptr as *const i8, transmute(a), align_of::<u32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1q_u32(ptr: *mut u32, a: uint32x4_t) {
+ vst1q_v4i32(ptr as *const i8, transmute(a), align_of::<u32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1_u64(ptr: *mut u64, a: uint64x1_t) {
+ vst1_v1i64(ptr as *const i8, transmute(a), align_of::<u64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1q_u64(ptr: *mut u64, a: uint64x2_t) {
+ vst1q_v2i64(ptr as *const i8, transmute(a), align_of::<u64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1_p8(ptr: *mut p8, a: poly8x8_t) {
+ vst1_v8i8(ptr as *const i8, transmute(a), align_of::<p8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1q_p8(ptr: *mut p8, a: poly8x16_t) {
+ vst1q_v16i8(ptr as *const i8, transmute(a), align_of::<p8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1_p16(ptr: *mut p16, a: poly16x4_t) {
+ vst1_v4i16(ptr as *const i8, transmute(a), align_of::<p16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t) {
+ vst1q_v8i16(ptr as *const i8, transmute(a), align_of::<p16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes,v8")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) {
+ vst1_v1i64(ptr as *const i8, transmute(a), align_of::<p64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes,v8")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1q_p64(ptr: *mut p64, a: poly64x2_t) {
+ vst1q_v2i64(ptr as *const i8, transmute(a), align_of::<p64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1_f32(ptr: *mut f32, a: float32x2_t) {
+ vst1_v2f32(ptr as *const i8, a, align_of::<f32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1q_f32(ptr: *mut f32, a: float32x4_t) {
+ vst1q_v4f32(ptr as *const i8, a, align_of::<f32>() as i32)
+}
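+
+// Usage sketch (illustrative, not part of the upstream source): a vld1/vst1
+// round trip. The wrappers pass `align_of::<T>()` to the LLVM builtins, i.e.
+// only element alignment is assumed for the pointer.
+//
+//     unsafe {
+//         let src = [1.0f32, 2.0, 3.0, 4.0];
+//         let v = vld1q_f32(src.as_ptr());
+//         let mut dst = [0.0f32; 4];
+//         vst1q_f32(dst.as_mut_ptr(), v);
+//         assert_eq!(src, dst);
+//     }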
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ vtbl1(a, b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vtbl1(transmute(a), transmute(b)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t {
+ transmute(vtbl1(transmute(a), transmute(b)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t {
+ vtbl2(a.0, a.1, b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t {
+ transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t {
+ vtbl3(a.0, a.1, a.2, b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vtbl3(
+ transmute(a.0),
+ transmute(a.1),
+ transmute(a.2),
+ transmute(b),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t {
+ transmute(vtbl3(
+ transmute(a.0),
+ transmute(a.1),
+ transmute(a.2),
+ transmute(b),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t {
+ vtbl4(a.0, a.1, a.2, a.3, b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vtbl4(
+ transmute(a.0),
+ transmute(a.1),
+ transmute(a.2),
+ transmute(a.3),
+ transmute(b),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t {
+ transmute(vtbl4(
+ transmute(a.0),
+ transmute(a.1),
+ transmute(a.2),
+ transmute(a.3),
+ transmute(b),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+ vtbx1(a, b, c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+ transmute(vtbx1(transmute(a), transmute(b), transmute(c)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t {
+ transmute(vtbx1(transmute(a), transmute(b), transmute(c)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t {
+ vtbx2(a, b.0, b.1, c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t {
+ transmute(vtbx2(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t {
+ transmute(vtbx2(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t {
+ vtbx3(a, b.0, b.1, b.2, c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t {
+ transmute(vtbx3(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(b.2),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t {
+ transmute(vtbx3(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(b.2),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t {
+ vtbx4(a, b.0, b.1, b.2, b.3, c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t {
+ transmute(vtbx4(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(b.2),
+ transmute(b.3),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t {
+ transmute(vtbx4(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(b.2),
+ transmute(b.3),
+ transmute(c),
+ ))
+}
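+
+// Semantics sketch (illustrative, not part of the upstream source): each lane
+// of the index vector selects a byte of the table. Out-of-range indices give 0
+// for vtbl*, while vtbx* leaves the corresponding lane of `a` unchanged.
+//
+//     unsafe {
+//         let table = int8x8_t(70, 71, 72, 73, 74, 75, 76, 77);
+//         let idx = int8x8_t(3, 0, 7, 1, 2, 4, 5, 100); // 100 is out of range
+//         let r = vtbl1_s8(table, idx); // lanes: 73, 70, 77, 71, 72, 74, 75, 0
+//     }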
+
+// These float-to-int implementations have undefined behaviour when `a` overflows
+// the destination type. Clang has the same problem: https://llvm.org/PR47510
+
+/// Floating-point Convert to Signed fixed-point, rounding toward Zero (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[target_feature(enable = "v7")]
+#[cfg_attr(test, assert_instr("vcvt.s32.f32"))]
+pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t {
+ transmute(simd_cast::<_, i32x4>(transmute::<_, f32x4>(a)))
+}
+
+/// Floating-point Convert to Unsigned fixed-point, rounding toward Zero (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[target_feature(enable = "v7")]
+#[cfg_attr(test, assert_instr("vcvt.u32.f32"))]
+pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t {
+ transmute(simd_cast::<_, u32x4>(transmute::<_, f32x4>(a)))
+}
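+
+// Behaviour sketch (illustrative, not part of the upstream source): the
+// conversion truncates toward zero, e.g. 1.9 -> 1 and -1.9 -> -1; per the note
+// above, lanes outside the destination range are undefined behaviour.
+//
+//     unsafe {
+//         let v: float32x4_t = transmute(f32x4::new(1.9, -1.9, 0.5, -0.5));
+//         let r = vcvtq_s32_f32(v); // lanes: 1, -1, 0, 0
+//     }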
+
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ vshiftins_v16i8(
+ a,
+ b,
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ )
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ vshiftins_v4i16(a, b, int16x4_t(n, n, n, n))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ vshiftins_v2i32(a, b, int32x2_t(N, N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ vshiftins_v4i32(a, b, int32x4_t(N, N, N, N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ vshiftins_v1i64(a, b, int64x1_t(N as i64))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ vshiftins_v2i64(a, b, int64x2_t(N as i64, N as i64))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ transmute(vshiftins_v8i8(
+ transmute(a),
+ transmute(b),
+ int8x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ transmute(vshiftins_v16i8(
+ transmute(a),
+ transmute(b),
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ transmute(vshiftins_v4i16(
+ transmute(a),
+ transmute(b),
+ int16x4_t(n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ transmute(vshiftins_v8i16(
+ transmute(a),
+ transmute(b),
+ int16x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ transmute(vshiftins_v2i32(transmute(a), transmute(b), int32x2_t(N, N)))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ transmute(vshiftins_v4i32(
+ transmute(a),
+ transmute(b),
+ int32x4_t(N, N, N, N),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ transmute(vshiftins_v1i64(
+ transmute(a),
+ transmute(b),
+ int64x1_t(N as i64),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ transmute(vshiftins_v2i64(
+ transmute(a),
+ transmute(b),
+ int64x2_t(N as i64, N as i64),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ transmute(vshiftins_v8i8(
+ transmute(a),
+ transmute(b),
+ int8x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ transmute(vshiftins_v16i8(
+ transmute(a),
+ transmute(b),
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ transmute(vshiftins_v4i16(
+ transmute(a),
+ transmute(b),
+ int16x4_t(n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ transmute(vshiftins_v8i16(
+ transmute(a),
+ transmute(b),
+ int16x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7,aes")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_p64<const N: i32>(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ transmute(vshiftins_v1i64(
+ transmute(a),
+ transmute(b),
+ int64x1_t(N as i64),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7,aes")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ transmute(vshiftins_v2i64(
+ transmute(a),
+ transmute(b),
+ int64x2_t(N as i64, N as i64),
+ ))
+}
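+
+// Semantics sketch (illustrative, not part of the upstream source): VSLI
+// shifts each lane of `b` left by N and inserts the result into `a`, keeping
+// the low N bits of `a`: lane-wise result = (b << N) | (a & ((1 << N) - 1)).
+//
+//     unsafe {
+//         let a = int8x8_t(0b0000_0011, 0, 0, 0, 0, 0, 0, 0);
+//         let b = int8x8_t(0b0000_0001, 0, 0, 0, 0, 0, 0, 0);
+//         let r = vsli_n_s8::<4>(a, b); // lane 0: 0b0001_0011
+//     }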
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ vshiftins_v16i8(
+ a,
+ b,
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ )
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ vshiftins_v4i16(a, b, int16x4_t(n, n, n, n))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 32);
+ vshiftins_v2i32(a, b, int32x2_t(-N, -N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 32);
+ vshiftins_v4i32(a, b, int32x4_t(-N, -N, -N, -N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ vshiftins_v1i64(a, b, int64x1_t(-N as i64))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ vshiftins_v2i64(a, b, int64x2_t(-N as i64, -N as i64))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ transmute(vshiftins_v8i8(
+ transmute(a),
+ transmute(b),
+ int8x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ transmute(vshiftins_v16i8(
+ transmute(a),
+ transmute(b),
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ transmute(vshiftins_v4i16(
+ transmute(a),
+ transmute(b),
+ int16x4_t(n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ transmute(vshiftins_v8i16(
+ transmute(a),
+ transmute(b),
+ int16x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 32);
+ transmute(vshiftins_v2i32(
+ transmute(a),
+ transmute(b),
+ int32x2_t(-N, -N),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 32);
+ transmute(vshiftins_v4i32(
+ transmute(a),
+ transmute(b),
+ int32x4_t(-N, -N, -N, -N),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ transmute(vshiftins_v1i64(
+ transmute(a),
+ transmute(b),
+ int64x1_t(-N as i64),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ transmute(vshiftins_v2i64(
+ transmute(a),
+ transmute(b),
+ int64x2_t(-N as i64, -N as i64),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ transmute(vshiftins_v8i8(
+ transmute(a),
+ transmute(b),
+ int8x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ transmute(vshiftins_v16i8(
+ transmute(a),
+ transmute(b),
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ transmute(vshiftins_v4i16(
+ transmute(a),
+ transmute(b),
+ int16x4_t(n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ transmute(vshiftins_v8i16(
+ transmute(a),
+ transmute(b),
+ int16x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7,aes")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_p64<const N: i32>(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ transmute(vshiftins_v1i64(
+ transmute(a),
+ transmute(b),
+ int64x1_t(-N as i64),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7,aes")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ transmute(vshiftins_v2i64(
+ transmute(a),
+ transmute(b),
+ int64x2_t(-N as i64, -N as i64),
+ ))
+}
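+
+// A hedged scalar model of the VSRI lane semantics above (illustrative
+// only, not part of the API): for a shift of `n`, the top `n` bits of each
+// lane are kept from `a` and the remaining bits are filled with `b >> n`.
+//
+//     fn vsri_lane_u8(a: u8, b: u8, n: u32) -> u8 {
+//         if n == 8 {
+//             return a; // all result bits come from `a`
+//         }
+//         let mask = !(u8::MAX >> n); // top `n` bits kept from `a`
+//         (a & mask) | (b >> n)
+//     }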
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::core_arch::{arm::*, simd::*};
+ use crate::mem::transmute;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_s32_f32() {
+ let f = f32x4::new(-1., 2., 3., 4.);
+ let e = i32x4::new(-1, 2, 3, 4);
+ let r: i32x4 = transmute(vcvtq_s32_f32(transmute(f)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_u32_f32() {
+ let f = f32x4::new(1., 2., 3., 4.);
+ let e = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vcvtq_u32_f32(transmute(f)));
+ assert_eq!(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/sat.rs b/library/stdarch/crates/core_arch/src/arm/sat.rs
new file mode 100644
index 000000000..38c98d734
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/sat.rs
@@ -0,0 +1,8 @@
+//! # References:
+//!
+//! - Section 8.4 "Saturating intrinsics"
+//!
+//! Intrinsics that could live here:
+//!
+//! - __ssat
+//! - __usat
diff --git a/library/stdarch/crates/core_arch/src/arm/simd32.rs b/library/stdarch/crates/core_arch/src/arm/simd32.rs
new file mode 100644
index 000000000..2d867acc8
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/simd32.rs
@@ -0,0 +1,728 @@
+//! # References
+//!
+//! - Section 8.5 "32-bit SIMD intrinsics" of ACLE
+//!
+//! Intrinsics that could live here
+//!
+//! - \[x\] __sel
+//! - \[ \] __ssat16
+//! - \[ \] __usat16
+//! - \[ \] __sxtab16
+//! - \[ \] __sxtb16
+//! - \[ \] __uxtab16
+//! - \[ \] __uxtb16
+//! - \[x\] __qadd8
+//! - \[x\] __qsub8
+//! - \[x\] __sadd8
+//! - \[x\] __shadd8
+//! - \[x\] __shsub8
+//! - \[x\] __ssub8
+//! - \[ \] __uadd8
+//! - \[ \] __uhadd8
+//! - \[ \] __uhsub8
+//! - \[ \] __uqadd8
+//! - \[ \] __uqsub8
+//! - \[x\] __usub8
+//! - \[x\] __usad8
+//! - \[x\] __usada8
+//! - \[x\] __qadd16
+//! - \[x\] __qasx
+//! - \[x\] __qsax
+//! - \[x\] __qsub16
+//! - \[x\] __sadd16
+//! - \[x\] __sasx
+//! - \[x\] __shadd16
+//! - \[ \] __shasx
+//! - \[ \] __shsax
+//! - \[x\] __shsub16
+//! - \[ \] __ssax
+//! - \[ \] __ssub16
+//! - \[ \] __uadd16
+//! - \[ \] __uasx
+//! - \[ \] __uhadd16
+//! - \[ \] __uhasx
+//! - \[ \] __uhsax
+//! - \[ \] __uhsub16
+//! - \[ \] __uqadd16
+//! - \[ \] __uqasx
+//! - \[x\] __uqsax
+//! - \[ \] __uqsub16
+//! - \[ \] __usax
+//! - \[ \] __usub16
+//! - \[x\] __smlad
+//! - \[ \] __smladx
+//! - \[ \] __smlald
+//! - \[ \] __smlaldx
+//! - \[x\] __smlsd
+//! - \[ \] __smlsdx
+//! - \[ \] __smlsld
+//! - \[ \] __smlsldx
+//! - \[x\] __smuad
+//! - \[x\] __smuadx
+//! - \[x\] __smusd
+//! - \[x\] __smusdx
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::{core_arch::arm::dsp::int16x2_t, mem::transmute};
+
+types! {
+ /// ARM-specific 32-bit wide vector of four packed `i8`.
+ pub struct int8x4_t(i8, i8, i8, i8);
+ /// ARM-specific 32-bit wide vector of four packed `u8`.
+ pub struct uint8x4_t(u8, u8, u8, u8);
+}
+
+macro_rules! dsp_call {
+ ($name:expr, $a:expr, $b:expr) => {
+ transmute($name(transmute($a), transmute($b)))
+ };
+}
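+
+// For example, `dsp_call!(arm_qadd8, a, b)` with `a, b: int8x4_t` expands to
+// `transmute(arm_qadd8(transmute(a), transmute(b)))`: each packed 32-bit
+// vector is reinterpreted as the plain `i32` the LLVM intrinsic expects,
+// and the `i32` result is reinterpreted back into the vector type.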
+
+extern "unadjusted" {
+ #[link_name = "llvm.arm.qadd8"]
+ fn arm_qadd8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub8"]
+ fn arm_qsub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub16"]
+ fn arm_qsub16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qadd16"]
+ fn arm_qadd16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qasx"]
+ fn arm_qasx(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsax"]
+ fn arm_qsax(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.sadd16"]
+ fn arm_sadd16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.sadd8"]
+ fn arm_sadd8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlad"]
+ fn arm_smlad(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlsd"]
+ fn arm_smlsd(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.sasx"]
+ fn arm_sasx(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.sel"]
+ fn arm_sel(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.shadd8"]
+ fn arm_shadd8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.shadd16"]
+ fn arm_shadd16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.shsub8"]
+ fn arm_shsub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.ssub8"]
+ fn arm_ssub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.usub8"]
+ fn arm_usub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.shsub16"]
+ fn arm_shsub16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smuad"]
+ fn arm_smuad(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smuadx"]
+ fn arm_smuadx(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smusd"]
+ fn arm_smusd(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smusdx"]
+ fn arm_smusdx(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.usad8"]
+ fn arm_usad8(a: i32, b: i32) -> u32;
+}
+
+/// Saturating four 8-bit integer additions
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd8))]
+pub unsafe fn __qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_qadd8, a, b)
+}
+
+/// Saturating four 8-bit integer subtractions
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub8))]
+pub unsafe fn __qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_qsub8, a, b)
+}
+
+/// Saturating two 16-bit integer subtractions
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub16))]
+pub unsafe fn __qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_qsub16, a, b)
+}
+
+/// Saturating two 16-bit integer additions
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd16))]
+pub unsafe fn __qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_qadd16, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qasx))]
+pub unsafe fn __qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_qasx, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] + b\[1\]
+/// res\[1\] = a\[1\] - b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsax))]
+pub unsafe fn __qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_qsax, a, b)
+}
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd16))]
+pub unsafe fn __sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_sadd16, a, b)
+}
+
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd8))]
+pub unsafe fn __sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_sadd8, a, b)
+}
+
+/// Dual 16-bit Signed Multiply with Addition of products
+/// and 32-bit accumulation.
+///
+/// Returns the 32-bit signed equivalent of
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlad))]
+pub unsafe fn __smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlad(transmute(a), transmute(b), c)
+}
+
+/// Dual 16-bit Signed Multiply with Subtraction of products
+/// and 32-bit accumulation and overflow detection.
+///
+/// Returns the 32-bit signed equivalent of
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlsd))]
+pub unsafe fn __smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlsd(transmute(a), transmute(b), c)
+}
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sasx))]
+pub unsafe fn __sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_sasx, a, b)
+}
+
+/// Select bytes from each operand according to APSR GE flags
+///
+/// Returns the equivalent of
+///
+/// res\[0\] = GE\[0\] ? a\[0\] : b\[0\]
+/// res\[1\] = GE\[1\] ? a\[1\] : b\[1\]
+/// res\[2\] = GE\[2\] ? a\[2\] : b\[2\]
+/// res\[3\] = GE\[3\] ? a\[3\] : b\[3\]
+///
+/// where GE are bits of APSR
+#[inline]
+#[cfg_attr(test, assert_instr(sel))]
+pub unsafe fn __sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_sel, a, b)
+}
+
+/// Signed halving parallel byte-wise addition.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+/// res\[2\] = (a\[2\] + b\[2\]) / 2
+/// res\[3\] = (a\[3\] + b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd8))]
+pub unsafe fn __shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_shadd8, a, b)
+}
+
+/// Signed halving parallel halfword-wise addition.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd16))]
+pub unsafe fn __shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_shadd16, a, b)
+}
+
+/// Signed halving parallel byte-wise subtraction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+/// res\[2\] = (a\[2\] - b\[2\]) / 2
+/// res\[3\] = (a\[3\] - b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub8))]
+pub unsafe fn __shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_shsub8, a, b)
+}
+
+/// Inserts a `USUB8` instruction.
+///
+/// Returns the 8-bit unsigned equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+///
+/// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits.
+/// The GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(usub8))]
+pub unsafe fn __usub8(a: uint8x4_t, b: uint8x4_t) -> uint8x4_t {
+ dsp_call!(arm_usub8, a, b)
+}
+
+/// Inserts a `SSUB8` instruction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+///
+/// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits.
+/// The GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(ssub8))]
+pub unsafe fn __ssub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_ssub8, a, b)
+}
+
+/// Signed halving parallel halfword-wise subtraction.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub16))]
+pub unsafe fn __shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_shsub16, a, b)
+}
+
+/// Signed Dual Multiply Add.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuad))]
+pub unsafe fn __smuad(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smuad(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Add Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuadx))]
+pub unsafe fn __smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smuadx(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Subtract.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smusd))]
+pub unsafe fn __smusd(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smusd(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Subtract Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smusdx))]
+pub unsafe fn __smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smusdx(transmute(a), transmute(b))
+}
+
+/// Sum of 8-bit absolute differences.
+///
+/// Returns the 32-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+/// abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\])
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn __usad8(a: int8x4_t, b: int8x4_t) -> u32 {
+ arm_usad8(transmute(a), transmute(b))
+}
+
+/// Sum of 8-bit absolute differences and constant.
+///
+/// Returns the 32-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+/// abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\]) + c
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
+ __usad8(a, b) + c
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::simd::{i16x2, i8x4, u8x4};
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+
+ #[test]
+ fn qadd8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, i8::MAX);
+ let b = i8x4::new(2, -1, 0, 1);
+ let c = i8x4::new(3, 1, 3, i8::MAX);
+ let r: i8x4 = dsp_call!(super::__qadd8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qsub8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, i8::MIN);
+ let b = i8x4::new(2, -1, 0, 1);
+ let c = i8x4::new(-1, 3, 3, i8::MIN);
+ let r: i8x4 = dsp_call!(super::__qsub8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qadd16() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(2, -1);
+ let c = i16x2::new(3, 1);
+ let r: i16x2 = dsp_call!(super::__qadd16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qsub16() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(20, -10);
+ let c = i16x2::new(-10, 30);
+ let r: i16x2 = dsp_call!(super::__qsub16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qasx() {
+ unsafe {
+ let a = i16x2::new(1, i16::MAX);
+ let b = i16x2::new(2, 2);
+ let c = i16x2::new(-1, i16::MAX);
+ let r: i16x2 = dsp_call!(super::__qasx, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qsax() {
+ unsafe {
+ let a = i16x2::new(1, i16::MAX);
+ let b = i16x2::new(2, 2);
+ let c = i16x2::new(3, i16::MAX - 2);
+ let r: i16x2 = dsp_call!(super::__qsax, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn sadd16() {
+ unsafe {
+ let a = i16x2::new(1, i16::MAX);
+ let b = i16x2::new(2, 2);
+ let c = i16x2::new(3, -i16::MAX);
+ let r: i16x2 = dsp_call!(super::__sadd16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn sadd8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, i8::MAX);
+ let b = i8x4::new(4, 3, 2, 2);
+ let c = i8x4::new(5, 5, 5, -i8::MAX);
+ let r: i8x4 = dsp_call!(super::__sadd8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn sasx() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(2, 1);
+ let c = i16x2::new(0, 4);
+ let r: i16x2 = dsp_call!(super::__sasx, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn smlad() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(3, 4);
+ let r = super::__smlad(transmute(a), transmute(b), 10);
+ assert_eq!(r, (1 * 3) + (2 * 4) + 10);
+ }
+ }
+
+ #[test]
+ fn smlsd() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(3, 4);
+ let r = super::__smlsd(transmute(a), transmute(b), 10);
+ assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
+ }
+ }
+
+ #[test]
+ fn sel() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, i8::MAX);
+ let b = i8x4::new(4, 3, 2, 2);
+ // call sadd8() to set GE bits
+ super::__sadd8(transmute(a), transmute(b));
+ let c = i8x4::new(1, 2, 3, i8::MAX);
+ let r: i8x4 = dsp_call!(super::__sel, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn shadd8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(5, 4, 3, 2);
+ let c = i8x4::new(3, 3, 3, 3);
+ let r: i8x4 = dsp_call!(super::__shadd8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn shadd16() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let c = i16x2::new(3, 3);
+ let r: i16x2 = dsp_call!(super::__shadd16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn shsub8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(5, 4, 3, 2);
+ let c = i8x4::new(-2, -1, 0, 1);
+ let r: i8x4 = dsp_call!(super::__shsub8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn ssub8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(5, 4, 3, 2);
+ let c = i8x4::new(-4, -2, 0, 2);
+ let r: i8x4 = dsp_call!(super::__ssub8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn usub8() {
+ unsafe {
+ let a = u8x4::new(1, 2, 3, 4);
+ let b = u8x4::new(5, 4, 3, 2);
+ let c = u8x4::new(252, 254, 0, 2);
+ let r: u8x4 = dsp_call!(super::__usub8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn shsub16() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let c = i16x2::new(-2, -1);
+ let r: i16x2 = dsp_call!(super::__shsub16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn smuad() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let r = super::__smuad(transmute(a), transmute(b));
+ assert_eq!(r, 13);
+ }
+ }
+
+ #[test]
+ fn smuadx() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let r = super::__smuadx(transmute(a), transmute(b));
+ assert_eq!(r, 14);
+ }
+ }
+
+ #[test]
+ fn smusd() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let r = super::__smusd(transmute(a), transmute(b));
+ assert_eq!(r, -3);
+ }
+ }
+
+ #[test]
+ fn smusdx() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let r = super::__smusdx(transmute(a), transmute(b));
+ assert_eq!(r, -6);
+ }
+ }
+
+ #[test]
+ fn usad8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(4, 3, 2, 1);
+ let r = super::__usad8(transmute(a), transmute(b));
+ assert_eq!(r, 8);
+ }
+ }
+
+ #[test]
+ fn usad8a() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(4, 3, 2, 1);
+ let c = 10;
+ let r = super::__usada8(transmute(a), transmute(b), c);
+ assert_eq!(r, 8 + c);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/v6.rs b/library/stdarch/crates/core_arch/src/arm/v6.rs
new file mode 100644
index 000000000..5df30cd62
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/v6.rs
@@ -0,0 +1,49 @@
+//! ARMv6 intrinsics.
+//!
+//! The reference is [ARMv6-M Architecture Reference Manual][armv6m].
+//!
+//! [armv6m]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Reverse the order of the bytes.
+#[inline]
+#[cfg_attr(test, assert_instr(rev))]
+pub unsafe fn _rev_u16(x: u16) -> u16 {
+ x.swap_bytes() as u16
+}
+
+/// Reverse the order of the bytes.
+#[inline]
+#[cfg_attr(test, assert_instr(rev))]
+pub unsafe fn _rev_u32(x: u32) -> u32 {
+ x.swap_bytes() as u32
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arm::v6;
+
+ #[test]
+ fn _rev_u16() {
+ unsafe {
+ assert_eq!(
+ v6::_rev_u16(0b0000_0000_1111_1111_u16),
+ 0b1111_1111_0000_0000_u16
+ );
+ }
+ }
+
+ #[test]
+ fn _rev_u32() {
+ unsafe {
+ assert_eq!(
+ v6::_rev_u32(0b0000_0000_1111_1111_0000_0000_1111_1111_u32),
+ 0b1111_1111_0000_0000_1111_1111_0000_0000_u32
+ );
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/v7.rs b/library/stdarch/crates/core_arch/src/arm/v7.rs
new file mode 100644
index 000000000..e7507f9b9
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/v7.rs
@@ -0,0 +1,88 @@
+//! ARMv7 intrinsics.
+//!
+//! The reference is [ARMv7-M Architecture Reference Manual (Issue
+//! E.b)][armv7m].
+//!
+//! [armv7m]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html
+
+pub use super::v6::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Count Leading Zeros.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang/stdarch/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u8(x: u8) -> u8 {
+ x.leading_zeros() as u8
+}
+
+/// Count Leading Zeros.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang/stdarch/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u16(x: u16) -> u16 {
+ x.leading_zeros() as u16
+}
+
+/// Count Leading Zeros.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang/stdarch/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u32(x: u32) -> u32 {
+ x.leading_zeros() as u32
+}
+
+/// Reverse the bit order.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(rbit))]
+pub unsafe fn _rbit_u32(x: u32) -> u32 {
+ crate::intrinsics::bitreverse(x)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arm::v7;
+
+ #[test]
+ fn _clz_u8() {
+ unsafe {
+ assert_eq!(v7::_clz_u8(0b0000_1010u8), 4u8);
+ }
+ }
+
+ #[test]
+ fn _clz_u16() {
+ unsafe {
+ assert_eq!(v7::_clz_u16(0b0000_1010u16), 12u16);
+ }
+ }
+
+ #[test]
+ fn _clz_u32() {
+ unsafe {
+ assert_eq!(v7::_clz_u32(0b0000_1010u32), 28u32);
+ }
+ }
+
+ #[test]
+ #[cfg(dont_compile_me)] // FIXME need to add `v7` upstream in rustc
+ fn _rbit_u32() {
+ unsafe {
+ assert_eq!(
+ v7::_rbit_u32(0b0000_1010u32),
+ 0b0101_0000_0000_0000_0000_0000_0000_0000u32
+ );
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/common.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/common.rs
new file mode 100644
index 000000000..0fb35534d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/common.rs
@@ -0,0 +1,14 @@
+//! Access types available on all architectures
+
+/// Full system is the required shareability domain, reads and writes are the
+/// required access types
+pub struct SY;
+
+dmb_dsb!(SY);
+
+impl super::super::sealed::Isb for SY {
+ #[inline(always)]
+ unsafe fn __isb(&self) {
+ super::isb(super::arg::SY)
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs
new file mode 100644
index 000000000..6faae0fee
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs
@@ -0,0 +1,38 @@
+// Reference: ARM11 MPCore Processor Technical Reference Manual (ARM DDI 0360E) Section 3.5 "Summary
+// of CP15 instructions"
+
+use crate::arch::asm;
+
+/// Full system is the required shareability domain, reads and writes are the
+/// required access types
+pub struct SY;
+
+impl super::super::sealed::Dmb for SY {
+ #[inline(always)]
+ unsafe fn __dmb(&self) {
+ asm!(
+ "mcr p15, 0, r0, c7, c10, 5",
+ options(preserves_flags, nostack)
+ )
+ }
+}
+
+impl super::super::sealed::Dsb for SY {
+ #[inline(always)]
+ unsafe fn __dsb(&self) {
+ asm!(
+ "mcr p15, 0, r0, c7, c10, 4",
+ options(preserves_flags, nostack)
+ )
+ }
+}
+
+impl super::super::sealed::Isb for SY {
+ #[inline(always)]
+ unsafe fn __isb(&self) {
+ asm!(
+ "mcr p15, 0, r0, c7, c5, 4",
+ options(preserves_flags, nostack)
+ )
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/mod.rs
new file mode 100644
index 000000000..6ccced00e
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/mod.rs
@@ -0,0 +1,154 @@
+// Reference: Section 7.3 "Memory barriers" of ACLE
+
+// CP15 instruction
+#[cfg(not(any(
+ // v8
+ target_arch = "aarch64",
+ // v7
+ target_feature = "v7",
+ // v6-M
+ target_feature = "mclass"
+)))]
+mod cp15;
+
+#[cfg(not(any(
+ target_arch = "aarch64",
+ target_feature = "v7",
+ target_feature = "mclass"
+)))]
+pub use self::cp15::*;
+
+// Dedicated instructions
+#[cfg(any(
+ target_arch = "aarch64",
+ target_feature = "v7",
+ target_feature = "mclass"
+))]
+macro_rules! dmb_dsb {
+ ($A:ident) => {
+ impl super::super::sealed::Dmb for $A {
+ #[inline(always)]
+ unsafe fn __dmb(&self) {
+ super::dmb(super::arg::$A)
+ }
+ }
+
+ impl super::super::sealed::Dsb for $A {
+ #[inline(always)]
+ unsafe fn __dsb(&self) {
+ super::dsb(super::arg::$A)
+ }
+ }
+ };
+}
+
+#[cfg(any(
+ target_arch = "aarch64",
+ target_feature = "v7",
+ target_feature = "mclass"
+))]
+mod common;
+
+#[cfg(any(
+ target_arch = "aarch64",
+ target_feature = "v7",
+ target_feature = "mclass"
+))]
+pub use self::common::*;
+
+#[cfg(any(target_arch = "aarch64", target_feature = "v7",))]
+mod not_mclass;
+
+#[cfg(any(target_arch = "aarch64", target_feature = "v7",))]
+pub use self::not_mclass::*;
+
+#[cfg(target_arch = "aarch64")]
+mod v8;
+
+#[cfg(target_arch = "aarch64")]
+pub use self::v8::*;
+
+/// Generates a DMB (data memory barrier) instruction or equivalent CP15 instruction.
+///
+/// DMB ensures the observed ordering of memory accesses. Memory accesses of the specified type
+/// issued before the DMB are guaranteed to be observed (in the specified scope) before memory
+/// accesses issued after the DMB.
+///
+/// For example, DMB should be used between storing data, and updating a flag variable that makes
+/// that data available to another core.
+///
+/// The __dmb() intrinsic also acts as a compiler memory barrier of the appropriate type.
+#[inline(always)]
+pub unsafe fn __dmb<A>(arg: A)
+where
+ A: super::sealed::Dmb,
+{
+ arg.__dmb()
+}
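+
+// A hedged sketch of the store-then-flag pattern described above
+// (illustrative only; `DATA` and `READY` are hypothetical statics shared
+// with another core, which would pair this with a load of `READY`, a DMB,
+// and then a load of `DATA`):
+//
+//     core::ptr::write_volatile(&mut DATA, payload);
+//     __dmb(SY); // the data store is observed before the flag store
+//     core::ptr::write_volatile(&mut READY, 1);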
+
+/// Generates a DSB (data synchronization barrier) instruction or equivalent CP15 instruction.
+///
+/// DSB ensures the completion of memory accesses. A DSB behaves as the equivalent DMB and has
+/// additional properties. After a DSB instruction completes, all memory accesses of the specified
+/// type issued before the DSB are guaranteed to have completed.
+///
+/// The __dsb() intrinsic also acts as a compiler memory barrier of the appropriate type.
+#[inline(always)]
+pub unsafe fn __dsb<A>(arg: A)
+where
+ A: super::sealed::Dsb,
+{
+ arg.__dsb()
+}
+
+/// Generates an ISB (instruction synchronization barrier) instruction or equivalent CP15
+/// instruction.
+///
+/// This instruction flushes the processor pipeline fetch buffers, so that following instructions
+/// are fetched from cache or memory.
+///
+/// An ISB is needed after some system maintenance operations. An ISB is also needed before
+/// transferring control to code that has been loaded or modified in memory, for example by an
+/// overlay mechanism or just-in-time code generator. (Note that if instruction and data caches are
+/// separate, privileged cache maintenance operations would be needed in order to unify the caches.)
+///
+/// The only supported argument for the __isb() intrinsic is 15, corresponding to the SY (full
+/// system) scope of the ISB instruction.
+#[inline(always)]
+pub unsafe fn __isb<A>(arg: A)
+where
+ A: super::sealed::Isb,
+{
+ arg.__isb()
+}
+
+extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.dmb")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dmb")]
+ fn dmb(_: i32);
+
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.dsb")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dsb")]
+ fn dsb(_: i32);
+
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.isb")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.isb")]
+ fn isb(_: i32);
+}
+
+// we put these in a module to prevent weirdness with glob re-exports
+mod arg {
+ // See Section 7.3 Memory barriers of ACLE
+ pub const SY: i32 = 15;
+ pub const ST: i32 = 14;
+ pub const LD: i32 = 13;
+ pub const ISH: i32 = 11;
+ pub const ISHST: i32 = 10;
+ pub const ISHLD: i32 = 9;
+ pub const NSH: i32 = 7;
+ pub const NSHST: i32 = 6;
+ pub const NSHLD: i32 = 5;
+ pub const OSH: i32 = 3;
+ pub const OSHST: i32 = 2;
+ pub const OSHLD: i32 = 1;
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/not_mclass.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/not_mclass.rs
new file mode 100644
index 000000000..385e1d528
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/not_mclass.rs
@@ -0,0 +1,43 @@
+//! Access types available on v7 and v8 but not on v7(E)-M or v8-M
+
+/// Full system is the required shareability domain, writes are the required
+/// access type
+pub struct ST;
+
+dmb_dsb!(ST);
+
+/// Inner Shareable is the required shareability domain, reads and writes are
+/// the required access types
+pub struct ISH;
+
+dmb_dsb!(ISH);
+
+/// Inner Shareable is the required shareability domain, writes are the required
+/// access type
+pub struct ISHST;
+
+dmb_dsb!(ISHST);
+
+/// Non-shareable is the required shareability domain, reads and writes are the
+/// required access types
+pub struct NSH;
+
+dmb_dsb!(NSH);
+
+/// Non-shareable is the required shareability domain, writes are the required
+/// access type
+pub struct NSHST;
+
+dmb_dsb!(NSHST);
+
+/// Outer Shareable is the required shareability domain, reads and writes are
+/// the required access types
+pub struct OSH;
+
+dmb_dsb!(OSH);
+
+/// Outer Shareable is the required shareability domain, writes are the required
+/// access type
+pub struct OSHST;
+
+dmb_dsb!(OSHST);
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/v8.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/v8.rs
new file mode 100644
index 000000000..db15da805
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/v8.rs
@@ -0,0 +1,23 @@
+/// Full system is the required shareability domain, reads are the required
+/// access type
+pub struct LD;
+
+dmb_dsb!(LD);
+
+/// Inner Shareable is the required shareability domain, reads are the required
+/// access type
+pub struct ISHLD;
+
+dmb_dsb!(ISHLD);
+
+/// Non-shareable is the required shareability domain, reads are the required
+/// access type
+pub struct NSHLD;
+
+dmb_dsb!(NSHLD);
+
+/// Outer Shareable is the required shareability domain, reads are the required
+/// access type
+pub struct OSHLD;
+
+dmb_dsb!(OSHLD);
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/crc.rs b/library/stdarch/crates/core_arch/src/arm_shared/crc.rs
new file mode 100644
index 000000000..e0d0fbe35
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/crc.rs
@@ -0,0 +1,121 @@
+extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32b")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32b")]
+ fn crc32b_(crc: u32, data: u32) -> u32;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32h")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32h")]
+ fn crc32h_(crc: u32, data: u32) -> u32;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32w")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32w")]
+ fn crc32w_(crc: u32, data: u32) -> u32;
+
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32cb")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32cb")]
+ fn crc32cb_(crc: u32, data: u32) -> u32;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32ch")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32ch")]
+ fn crc32ch_(crc: u32, data: u32) -> u32;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32cw")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32cw")]
+ fn crc32cw_(crc: u32, data: u32) -> u32;
+}
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// CRC32 single round checksum for bytes (8 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(test, assert_instr(crc32b))]
+pub unsafe fn __crc32b(crc: u32, data: u8) -> u32 {
+ crc32b_(crc, data as u32)
+}
+
+/// CRC32 single round checksum for half words (16 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(test, assert_instr(crc32h))]
+pub unsafe fn __crc32h(crc: u32, data: u16) -> u32 {
+ crc32h_(crc, data as u32)
+}
+
+/// CRC32 single round checksum for words (32 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(test, assert_instr(crc32w))]
+pub unsafe fn __crc32w(crc: u32, data: u32) -> u32 {
+ crc32w_(crc, data)
+}
+
+/// CRC32-C single round checksum for bytes (8 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(test, assert_instr(crc32cb))]
+pub unsafe fn __crc32cb(crc: u32, data: u8) -> u32 {
+ crc32cb_(crc, data as u32)
+}
+
+/// CRC32-C single round checksum for half words (16 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(test, assert_instr(crc32ch))]
+pub unsafe fn __crc32ch(crc: u32, data: u16) -> u32 {
+ crc32ch_(crc, data as u32)
+}
+
+/// CRC32-C single round checksum for words (32 bits).
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(test, assert_instr(crc32cw))]
+pub unsafe fn __crc32cw(crc: u32, data: u32) -> u32 {
+ crc32cw_(crc, data)
+}
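+
+// A hedged sketch of checksumming a byte slice with the intrinsics above
+// (illustrative only; the init/finalize XORs assume the common CRC-32/IEEE
+// convention, and a real implementation would fold word-sized chunks
+// through `__crc32w` instead of going byte by byte):
+//
+//     unsafe fn crc32_ieee(bytes: &[u8]) -> u32 {
+//         let mut crc = 0xffff_ffff;
+//         for &byte in bytes {
+//             crc = __crc32b(crc, byte);
+//         }
+//         !crc
+//     }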
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::{arm_shared::*, simd::*};
+ use std::mem;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "crc")]
+ unsafe fn test_crc32b() {
+ assert_eq!(__crc32b(0, 0), 0);
+ assert_eq!(__crc32b(0, 255), 755167117);
+ }
+
+ #[simd_test(enable = "crc")]
+ unsafe fn test_crc32h() {
+ assert_eq!(__crc32h(0, 0), 0);
+ assert_eq!(__crc32h(0, 16384), 1994146192);
+ }
+
+ #[simd_test(enable = "crc")]
+ unsafe fn test_crc32w() {
+ assert_eq!(__crc32w(0, 0), 0);
+ assert_eq!(__crc32w(0, 4294967295), 3736805603);
+ }
+
+ #[simd_test(enable = "crc")]
+ unsafe fn test_crc32cb() {
+ assert_eq!(__crc32cb(0, 0), 0);
+ assert_eq!(__crc32cb(0, 255), 2910671697);
+ }
+
+ #[simd_test(enable = "crc")]
+ unsafe fn test_crc32ch() {
+ assert_eq!(__crc32ch(0, 0), 0);
+ assert_eq!(__crc32ch(0, 16384), 1098587580);
+ }
+
+ #[simd_test(enable = "crc")]
+ unsafe fn test_crc32cw() {
+ assert_eq!(__crc32cw(0, 0), 0);
+ assert_eq!(__crc32cw(0, 4294967295), 3080238136);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/crypto.rs b/library/stdarch/crates/core_arch/src/arm_shared/crypto.rs
new file mode 100644
index 000000000..3e9515e59
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/crypto.rs
@@ -0,0 +1,374 @@
+use crate::core_arch::arm_shared::{uint32x4_t, uint8x16_t};
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.aese")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aese")]
+ fn vaeseq_u8_(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.aesd")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aesd")]
+ fn vaesdq_u8_(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.aesmc")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aesmc")]
+ fn vaesmcq_u8_(data: uint8x16_t) -> uint8x16_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.aesimc")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aesimc")]
+ fn vaesimcq_u8_(data: uint8x16_t) -> uint8x16_t;
+
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha1h")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1h")]
+ fn vsha1h_u32_(hash_e: u32) -> u32;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha1su0")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1su0")]
+ fn vsha1su0q_u32_(w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t) -> uint32x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha1su1")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1su1")]
+ fn vsha1su1q_u32_(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha1c")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1c")]
+ fn vsha1cq_u32_(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha1p")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1p")]
+ fn vsha1pq_u32_(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha1m")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha1m")]
+ fn vsha1mq_u32_(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t;
+
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha256h")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha256h")]
+ fn vsha256hq_u32_(hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t) -> uint32x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha256h2")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha256h2")]
+ fn vsha256h2q_u32_(hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t) -> uint32x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha256su0")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha256su0")]
+ fn vsha256su0q_u32_(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha256su1")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sha256su1")]
+ fn vsha256su1q_u32_(tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t;
+}
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// AES single round encryption.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "aes"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(aese))]
+pub unsafe fn vaeseq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t {
+ vaeseq_u8_(data, key)
+}
+
+/// AES single round decryption.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "aes"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(aesd))]
+pub unsafe fn vaesdq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t {
+ vaesdq_u8_(data, key)
+}
+
+/// AES mix columns.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "aes"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(aesmc))]
+pub unsafe fn vaesmcq_u8(data: uint8x16_t) -> uint8x16_t {
+ vaesmcq_u8_(data)
+}
+
+/// AES inverse mix columns.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "aes"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(aesimc))]
+pub unsafe fn vaesimcq_u8(data: uint8x16_t) -> uint8x16_t {
+ vaesimcq_u8_(data)
+}
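+
+// A hedged sketch of how the four AES primitives above compose into a full
+// block encryption (illustrative only; `state` is the 16-byte block and
+// `rk` a caller-provided AES-128 key schedule of 11 round keys):
+//
+//     for i in 0..9 {
+//         // AESE = AddRoundKey + SubBytes + ShiftRows; AESMC = MixColumns
+//         state = vaesmcq_u8(vaeseq_u8(state, rk[i]));
+//     }
+//     // the final round has no MixColumns; the last key is XORed in directly
+//     state = veorq_u8(vaeseq_u8(state, rk[9]), rk[10]);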
+
+/// SHA1 fixed rotate.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha1h))]
+pub unsafe fn vsha1h_u32(hash_e: u32) -> u32 {
+ vsha1h_u32_(hash_e)
+}
+
+/// SHA1 hash update accelerator, choose.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha1c))]
+pub unsafe fn vsha1cq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t {
+ vsha1cq_u32_(hash_abcd, hash_e, wk)
+}
+
+/// SHA1 hash update accelerator, majority.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha1m))]
+pub unsafe fn vsha1mq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t {
+ vsha1mq_u32_(hash_abcd, hash_e, wk)
+}
+
+/// SHA1 hash update accelerator, parity.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha1p))]
+pub unsafe fn vsha1pq_u32(hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t) -> uint32x4_t {
+ vsha1pq_u32_(hash_abcd, hash_e, wk)
+}
+
+/// SHA1 schedule update accelerator, first part.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha1su0))]
+pub unsafe fn vsha1su0q_u32(w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t) -> uint32x4_t {
+ vsha1su0q_u32_(w0_3, w4_7, w8_11)
+}
+
+/// SHA1 schedule update accelerator, second part.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha1su1))]
+pub unsafe fn vsha1su1q_u32(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t {
+ vsha1su1q_u32_(tw0_3, w12_15)
+}
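+
+// A hedged sketch of one four-round SHA-1 step built from the intrinsics
+// above (illustrative only; `abcd`, `e`, the message words `m0..m3` and
+// the round-constant vector `k` are assumed to be maintained by the
+// caller's compression loop):
+//
+//     let wk = vaddq_u32(m0, k); // schedule words plus round constant
+//     let e_next = vsha1h_u32(vgetq_lane_u32::<0>(abcd));
+//     abcd = vsha1cq_u32(abcd, e, wk); // C for rounds 0..20, then P/M variants
+//     m0 = vsha1su1q_u32(vsha1su0q_u32(m0, m1, m2), m3);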
+
+/// SHA256 hash update accelerator.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha256h))]
+pub unsafe fn vsha256hq_u32(
+ hash_abcd: uint32x4_t,
+ hash_efgh: uint32x4_t,
+ wk: uint32x4_t,
+) -> uint32x4_t {
+ vsha256hq_u32_(hash_abcd, hash_efgh, wk)
+}
+
+/// SHA256 hash update accelerator, upper part.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha256h2))]
+pub unsafe fn vsha256h2q_u32(
+ hash_efgh: uint32x4_t,
+ hash_abcd: uint32x4_t,
+ wk: uint32x4_t,
+) -> uint32x4_t {
+ vsha256h2q_u32_(hash_efgh, hash_abcd, wk)
+}
+
+/// SHA256 schedule update accelerator, first part.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha256su0))]
+pub unsafe fn vsha256su0q_u32(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t {
+ vsha256su0q_u32_(w0_3, w4_7)
+}
+
+/// SHA256 schedule update accelerator, second part.
+#[inline]
+#[cfg_attr(not(target_arch = "arm"), target_feature(enable = "sha2"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))]
+#[cfg_attr(test, assert_instr(sha256su1))]
+pub unsafe fn vsha256su1q_u32(
+ tw0_3: uint32x4_t,
+ w8_11: uint32x4_t,
+ w12_15: uint32x4_t,
+) -> uint32x4_t {
+ vsha256su1q_u32_(tw0_3, w8_11, w12_15)
+}
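+
+// A hedged sketch of one four-round SHA-256 step (illustrative only;
+// `abcd`, `efgh`, the message words `m0..m3` and the round-constant vector
+// `k` are assumed to be maintained by the caller's compression loop):
+//
+//     let wk = vaddq_u32(m0, k);
+//     let abcd_prev = abcd;
+//     abcd = vsha256hq_u32(abcd, efgh, wk);
+//     efgh = vsha256h2q_u32(efgh, abcd_prev, wk);
+//     m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);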
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::core_arch::{arm_shared::*, simd::*};
+ use std::mem;
+ use stdarch_test::simd_test;
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "aes"))]
+ unsafe fn test_vaeseq_u8() {
+ let data = mem::transmute(u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8));
+ let key = mem::transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+ let r: u8x16 = mem::transmute(vaeseq_u8(data, key));
+ assert_eq!(
+ r,
+ u8x16::new(
+ 124, 123, 124, 118, 124, 123, 124, 197, 124, 123, 124, 118, 124, 123, 124, 197
+ )
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "aes"))]
+ unsafe fn test_vaesdq_u8() {
+ let data = mem::transmute(u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8));
+ let key = mem::transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+ let r: u8x16 = mem::transmute(vaesdq_u8(data, key));
+ assert_eq!(
+ r,
+ u8x16::new(9, 213, 9, 251, 9, 213, 9, 56, 9, 213, 9, 251, 9, 213, 9, 56)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "aes"))]
+ unsafe fn test_vaesmcq_u8() {
+ let data = mem::transmute(u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8));
+ let r: u8x16 = mem::transmute(vaesmcq_u8(data));
+ assert_eq!(
+ r,
+ u8x16::new(3, 4, 9, 10, 15, 8, 21, 30, 3, 4, 9, 10, 15, 8, 21, 30)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "aes"))]
+ unsafe fn test_vaesimcq_u8() {
+ let data = mem::transmute(u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8));
+ let r: u8x16 = mem::transmute(vaesimcq_u8(data));
+ assert_eq!(
+ r,
+ u8x16::new(43, 60, 33, 50, 103, 80, 125, 70, 43, 60, 33, 50, 103, 80, 125, 70)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha1h_u32() {
+ assert_eq!(vsha1h_u32(0x1234), 0x048d);
+ assert_eq!(vsha1h_u32(0x5678), 0x159e);
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha1su0q_u32() {
+ let r: u32x4 = mem::transmute(vsha1su0q_u32(
+ mem::transmute(u32x4::new(0x1234_u32, 0x5678_u32, 0x9abc_u32, 0xdef0_u32)),
+ mem::transmute(u32x4::new(0x1234_u32, 0x5678_u32, 0x9abc_u32, 0xdef0_u32)),
+ mem::transmute(u32x4::new(0x1234_u32, 0x5678_u32, 0x9abc_u32, 0xdef0_u32)),
+ ));
+ assert_eq!(r, u32x4::new(0x9abc, 0xdef0, 0x1234, 0x5678));
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha1su1q_u32() {
+ let r: u32x4 = mem::transmute(vsha1su1q_u32(
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ ));
+ assert_eq!(
+ r,
+ u32x4::new(0x00008898, 0x00019988, 0x00008898, 0x0000acd0)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha1cq_u32() {
+ let r: u32x4 = mem::transmute(vsha1cq_u32(
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ 0x1234,
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ ));
+ assert_eq!(
+ r,
+ u32x4::new(0x8a32cbd8, 0x0c518a96, 0x0018a081, 0x0000c168)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha1pq_u32() {
+ let r: u32x4 = mem::transmute(vsha1pq_u32(
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ 0x1234,
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ ));
+ assert_eq!(
+ r,
+ u32x4::new(0x469f0ba3, 0x0a326147, 0x80145d7f, 0x00009f47)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha1mq_u32() {
+ let r: u32x4 = mem::transmute(vsha1mq_u32(
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ 0x1234,
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ ));
+ assert_eq!(
+ r,
+ u32x4::new(0xaa39693b, 0x0d51bf84, 0x001aa109, 0x0000d278)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha256hq_u32() {
+ let r: u32x4 = mem::transmute(vsha256hq_u32(
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ ));
+ assert_eq!(
+ r,
+ u32x4::new(0x05e9aaa8, 0xec5f4c02, 0x20a1ea61, 0x28738cef)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha256h2q_u32() {
+ let r: u32x4 = mem::transmute(vsha256h2q_u32(
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ ));
+ assert_eq!(
+ r,
+ u32x4::new(0x3745362e, 0x2fb51d00, 0xbd4c529b, 0x968b8516)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha256su0q_u32() {
+ let r: u32x4 = mem::transmute(vsha256su0q_u32(
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ ));
+ assert_eq!(
+ r,
+ u32x4::new(0xe59e1c97, 0x5eaf68da, 0xd7bcb51f, 0x6c8de152)
+ );
+ }
+
+ #[cfg_attr(target_arch = "arm", simd_test(enable = "crypto"))]
+ #[cfg_attr(not(target_arch = "arm"), simd_test(enable = "sha2"))]
+ unsafe fn test_vsha256su1q_u32() {
+ let r: u32x4 = mem::transmute(vsha256su1q_u32(
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ mem::transmute(u32x4::new(0x1234, 0x5678, 0x9abc, 0xdef0)),
+ ));
+ assert_eq!(
+ r,
+ u32x4::new(0x5e09e8d2, 0x74a6f16b, 0xc966606b, 0xa686ee9f)
+ );
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/hints.rs b/library/stdarch/crates/core_arch/src/arm_shared/hints.rs
new file mode 100644
index 000000000..d7e43f551
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/hints.rs
@@ -0,0 +1,95 @@
+// # References
+//
+// - Section 7.4 "Hints" of ACLE
+// - Section 7.7 "NOP" of ACLE
+
+/// Generates a WFI (wait for interrupt) hint instruction, or nothing.
+///
+/// The WFI instruction allows (but does not require) the processor to enter a
+/// low-power state until one of a number of asynchronous events occurs.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6", target_arch = "aarch64", doc))]
+#[inline(always)]
+pub unsafe fn __wfi() {
+ hint(HINT_WFI);
+}
+
+/// Generates a WFE (wait for event) hint instruction, or nothing.
+///
+/// The WFE instruction allows (but does not require) the processor to enter a
+/// low-power state until some event occurs such as a SEV being issued by
+/// another processor.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6", target_arch = "aarch64", doc))]
+#[inline(always)]
+pub unsafe fn __wfe() {
+ hint(HINT_WFE);
+}
+
+/// Generates a SEV (send a global event) hint instruction.
+///
+/// This causes an event to be signaled to all processors in a multiprocessor
+/// system. It is a NOP on a uniprocessor system.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M, 7-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6", target_arch = "aarch64", doc))]
+#[inline(always)]
+pub unsafe fn __sev() {
+ hint(HINT_SEV);
+}
+
+/// Generates a SEVL (send a local event) hint instruction.
+///
+/// This causes an event to be signaled to only the processor executing this
+/// instruction. In a multiprocessor system, it is not required to affect the
+/// other processors.
+// LLVM says "instruction requires: armv8"
+#[cfg(any(
+ target_feature = "v8", // 32-bit ARMv8
+ target_arch = "aarch64", // AArch64
+ doc,
+))]
+#[inline(always)]
+pub unsafe fn __sevl() {
+ hint(HINT_SEVL);
+}
+
+/// Generates a YIELD hint instruction.
+///
+/// This enables multithreading software to indicate to the hardware that it is
+/// performing a task, for example a spin-lock, that could be swapped out to
+/// improve overall system performance.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6", target_arch = "aarch64", doc))]
+#[inline(always)]
+pub unsafe fn __yield() {
+ hint(HINT_YIELD);
+}
+
+/// Generates an unspecified no-op instruction.
+///
+/// Note that not all architectures provide a distinguished NOP instruction. On
+/// those that do, it is unspecified whether this intrinsic generates it or
+/// another instruction. It is not guaranteed that inserting this instruction
+/// will increase execution time.
+#[inline(always)]
+pub unsafe fn __nop() {
+ crate::arch::asm!("nop", options(nomem, nostack, preserves_flags));
+}
+
+extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.hint")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.hint")]
+ fn hint(_: i32);
+}
+
+// from LLVM 7.0.1's lib/Target/ARM/{ARMInstrThumb,ARMInstrInfo,ARMInstrThumb2}.td
+const HINT_NOP: i32 = 0;
+const HINT_YIELD: i32 = 1;
+const HINT_WFE: i32 = 2;
+const HINT_WFI: i32 = 3;
+const HINT_SEV: i32 = 4;
+const HINT_SEVL: i32 = 5;
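+
+// Editor's note: a minimal usage sketch, not part of the upstream file. It
+// shows the usual WFE/SEV pairing for a low-power spin-wait; `FLAG` is a
+// hypothetical shared atomic, and at this snapshot these hint intrinsics are
+// not stably exported, so the code is illustrative only.
+//
+// ```
+// use core::sync::atomic::{AtomicBool, Ordering};
+// static FLAG: AtomicBool = AtomicBool::new(false);
+//
+// unsafe fn wait_for_flag() {
+//     while !FLAG.load(Ordering::Acquire) {
+//         __wfe(); // may sleep until an event such as a SEV arrives
+//     }
+// }
+//
+// unsafe fn signal_flag() {
+//     FLAG.store(true, Ordering::Release);
+//     __sev(); // wake any processor parked in __wfe above
+// }
+// ```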
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/mod.rs
new file mode 100644
index 000000000..4c8d19854
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/mod.rs
@@ -0,0 +1,120 @@
+//! ARM C Language Extensions (ACLE)
+//!
+//! # Developer notes
+//!
+//! Below is a list of built-in targets that are representative of the different ARM
+//! architectures; the list includes the `target_feature`s they possess.
+//!
+//! - `armv4t-unknown-linux-gnueabi` - **ARMv4T** - `+v4t`
+//! - `armv5te-unknown-linux-gnueabi` - **ARMv5TE** - `+v4t +v5te`
+//! - `arm-unknown-linux-gnueabi` - **ARMv6** - `+v4t +v5te +v6`
+//! - `thumbv6m-none-eabi` - **ARMv6-M** - `+v4t +v5te +v6 +thumb-mode +mclass`
+//! - `armv7-unknown-linux-gnueabihf` - **ARMv7-A** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +aclass`
+//! - `armv7r-none-eabi` - **ARMv7-R** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +rclass`
+//! - `thumbv7m-none-eabi` - **ARMv7-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass`
+//! - `thumbv7em-none-eabi` - **ARMv7E-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +thumb-mode +mclass`
+//! - `thumbv8m.main-none-eabi` - **ARMv8-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass`
+//! - `armv8r-none-eabi` - **ARMv8-R** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +v8 +thumb2 +rclass`
+//! - `aarch64-unknown-linux-gnu` - **ARMv8-A (AArch64)** - `+fp +neon`
+//!
+//! Section 10.1 of ACLE says:
+//!
+//! - "In the sequence of Arm architectures { v5, v5TE, v6, v6T2, v7 } each architecture includes
+//! its predecessor instruction set."
+//!
+//! - "In the sequence of Thumb-only architectures { v6-M, v7-M, v7E-M } each architecture includes
+//! its predecessor instruction set."
+//!
+//! From that info and from looking at how LLVM features work (using custom targets) we can identify
+//! features that are subsets of others:
+//!
+//! Legend: `a < b` reads as "`a` is a subset of `b`"; this means that if `b` is enabled then `a` is
+//! enabled as well.
+//!
+//! - `v4t < v5te < v6 < v6k < v6t2 < v7 < v8`
+//! - `v6 < v8m < v6t2`
+//! - `v7 < v8m.main`
+//!
+//! *NOTE*: Section 5.4.7 of ACLE says:
+//!
+//! - "__ARM_FEATURE_DSP is defined to 1 if the DSP (v5E) instructions are supported and the
+//! intrinsics defined in Saturating intrinsics are available."
+//!
+//! This does *not* match how LLVM uses the '+dsp' feature; this feature is not set for v5te
+//! targets so we have to work around this difference.
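+//!
+//! For example, because `v6k < v7 < v8` in the lattice above, a gate such as
+//! the following (editor's sketch) is also satisfied on any target that
+//! enables `v7` or `v8`:
+//!
+//! ```text
+//! #[cfg(target_feature = "v6k")]
+//! fn requires_v6k_or_later() {}
+//! ```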
+//!
+//! # References
+//!
+//! - [ACLE Q2 2018](https://developer.arm.com/docs/101028/latest)
+
+// Only for 'neon' submodule
+#![allow(non_camel_case_types)]
+
+// ARMv8, ARMv7 and ARMv6-M are supported via dedicated instructions like DMB; all other
+// architectures are supported via CP15 instructions. See Section 10.1 of ACLE.
+mod barrier;
+
+pub use self::barrier::*;
+
+mod hints;
+pub use self::hints::*;
+
+mod registers;
+pub use self::registers::*;
+
+#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))]
+mod crc;
+#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))]
+pub use crc::*;
+
+#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))]
+mod crypto;
+#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))]
+pub use self::crypto::*;
+
+#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))]
+pub(crate) mod neon;
+#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))]
+pub use self::neon::*;
+
+#[cfg(test)]
+#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))]
+pub(crate) mod test_support;
+
+mod sealed {
+ pub trait Dmb {
+ unsafe fn __dmb(&self);
+ }
+
+ pub trait Dsb {
+ unsafe fn __dsb(&self);
+ }
+
+ pub trait Isb {
+ unsafe fn __isb(&self);
+ }
+
+ pub trait Rsr {
+ unsafe fn __rsr(&self) -> u32;
+ }
+
+ pub trait Rsr64 {
+ unsafe fn __rsr64(&self) -> u64;
+ }
+
+ pub trait Rsrp {
+ unsafe fn __rsrp(&self) -> *const u8;
+ }
+
+ pub trait Wsr {
+ unsafe fn __wsr(&self, value: u32);
+ }
+
+ pub trait Wsr64 {
+ unsafe fn __wsr64(&self, value: u64);
+ }
+
+ pub trait Wsrp {
+ unsafe fn __wsrp(&self, value: *const u8);
+ }
+}
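+
+// Editor's note: a sketch of the "sealed trait" pattern used above; the names
+// below are hypothetical mirrors of what `barrier.rs` and `registers.rs` do.
+// Because `sealed` is a private module, downstream crates can call the public
+// intrinsics but cannot implement the traits, so an intrinsic like `__dmb`
+// only accepts the argument types defined in this crate.
+//
+// ```
+// pub struct SY; // hypothetical full-system barrier option
+//
+// impl sealed::Dmb for SY {
+//     unsafe fn __dmb(&self) { /* emit `DMB SY` */ }
+// }
+//
+// pub unsafe fn __dmb<A: sealed::Dmb>(arg: A) {
+//     arg.__dmb()
+// }
+// ```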
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
new file mode 100644
index 000000000..d69fbd8e8
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -0,0 +1,40888 @@
+// This code is automatically generated. DO NOT MODIFY.
+//
+// Instead, modify `crates/stdarch-gen/neon.spec` and run the following command to re-generate this file:
+//
+// ```
+// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen -- crates/stdarch-gen/neon.spec
+// ```
+use super::*;
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vand_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vand_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vandq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vand_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vand_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vandq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vand_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vandq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vand_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vandq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vand_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_and(a, b)
+}
+
+/// Vector bitwise and
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vandq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_and(a, b)
+}
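+
+// Editor's note: a usage sketch, not generated from `neon.spec`. It shows how
+// a caller might exercise the `vand*` family; `vdup_n_u8` and `vget_lane_u8`
+// are assumed from this module. The `vorr*` (inclusive or) and `veor*`
+// (exclusive or) families below follow exactly the same shape, swapping
+// `simd_and` for `simd_or` or `simd_xor`.
+//
+// ```
+// unsafe {
+//     let a = vdup_n_u8(0b1100); // every lane holds 0b1100
+//     let b = vdup_n_u8(0b1010); // every lane holds 0b1010
+//     let r = vand_u8(a, b);     // every lane holds 0b1100 & 0b1010 = 0b1000
+//     assert_eq!(vget_lane_u8::<0>(r), 0b1000);
+// }
+// ```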
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorr_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorr_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorrq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorr_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorrq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorr_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorr_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorrq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorr_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorrq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorr_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorrq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorr_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise inclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orr))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vorrq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_or(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veor_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veorq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veor_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veorq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veor_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veorq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veor_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veorq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veor_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veorq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veor_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veorq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veor_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veorq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veor_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_xor(a, b)
+}
+
+/// Vector bitwise exclusive or (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(veor))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(eor))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn veorq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_xor(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sabd.v8i8")]
+ fn vabd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+    vabd_s8_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sabd.v16i8")]
+ fn vabdq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+    vabdq_s8_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sabd.v4i16")]
+ fn vabd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+    vabd_s16_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sabd.v8i16")]
+ fn vabdq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+    vabdq_s16_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sabd.v2i32")]
+ fn vabd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+    vabd_s32_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sabd.v4i32")]
+ fn vabdq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+    vabdq_s32_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uabd.v8i8")]
+ fn vabd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ }
+    vabd_u8_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uabd.v16i8")]
+ fn vabdq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ }
+    vabdq_u8_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uabd.v4i16")]
+ fn vabd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ }
+    vabd_u16_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uabd.v8i16")]
+ fn vabdq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ }
+    vabdq_u16_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uabd.v2i32")]
+ fn vabd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ }
+    vabd_u32_(a, b)
+}
+
+/// Absolute difference between the arguments
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabdu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uabd.v4i32")]
+ fn vabdq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+    vabdq_u32_(a, b)
+}
+
+/// Absolute difference between the arguments (floating-point)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fabd.v2f32")]
+ fn vabd_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+    vabd_f32_(a, b)
+}
+
+/// Absolute difference between the arguments (floating-point)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabd.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fabd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabds.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fabd.v4f32")]
+ fn vabdq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+    vabdq_f32_(a, b)
+}
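+
+// Editor's note: a usage sketch, not generated from `neon.spec`. The `vabd*`
+// family computes a per-lane `|a - b|`; `vdup_n_u8` and `vget_lane_u8` are
+// assumed from this module.
+//
+// ```
+// unsafe {
+//     let a = vdup_n_u8(3);
+//     let b = vdup_n_u8(10);
+//     let r = vabd_u8(a, b); // every lane holds |3 - 10| = 7
+//     assert_eq!(vget_lane_u8::<0>(r), 7);
+// }
+// ```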
+
+/// Unsigned Absolute difference Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabdl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
+ simd_cast(vabd_u8(a, b))
+}
+
+/// Unsigned Absolute difference Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabdl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
+ simd_cast(vabd_u16(a, b))
+}
+
+/// Unsigned Absolute difference Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabdl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
+ simd_cast(vabd_u32(a, b))
+}
+
+/// Signed Absolute difference Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabdl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t {
+ let c: uint8x8_t = simd_cast(vabd_s8(a, b));
+ simd_cast(c)
+}
+
+/// Signed Absolute difference Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabdl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+ let c: uint16x4_t = simd_cast(vabd_s16(a, b));
+ simd_cast(c)
+}
+
+/// Signed Absolute difference Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabdl.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabdl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabdl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+ let c: uint32x2_t = simd_cast(vabd_s32(a, b));
+ simd_cast(c)
+}
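+
+// Editor's note: a sketch of why the long (`l`) variants exist. The result is
+// widened, so the absolute difference of two extreme `i8` lanes cannot wrap;
+// this is also why the signed variants above round-trip through an unsigned
+// cast. `vdup_n_s8` and `vgetq_lane_s16` are assumed from this module.
+//
+// ```
+// unsafe {
+//     let a = vdup_n_s8(127);
+//     let b = vdup_n_s8(-128);
+//     let r: int16x8_t = vabdl_s8(a, b); // |127 - (-128)| = 255 > i8::MAX
+//     assert_eq!(vgetq_lane_s16::<0>(r), 255);
+// }
+// ```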
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceq_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceqq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceq_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceqq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceq_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceqq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceq_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceqq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceq_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceqq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceq_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceqq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceq_p8(a: poly8x8_t, b: poly8x8_t) -> uint8x8_t {
+ simd_eq(a, b)
+}
+
+/// Compare bitwise Equal (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceqq_p8(a: poly8x16_t, b: poly8x16_t) -> uint8x16_t {
+ simd_eq(a, b)
+}
+
+/// Floating-point compare equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceq_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ simd_eq(a, b)
+}
+
+/// Floating-point compare equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vceq.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmeq))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vceqq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ simd_eq(a, b)
+}
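+
+// Editor's note: a usage sketch, not generated from `neon.spec`. The compare
+// intrinsics return a mask vector whose lanes are all-ones where the
+// comparison holds and all-zeros elsewhere; helpers assumed as above.
+//
+// ```
+// unsafe {
+//     let a = vdup_n_u8(7);
+//     let b = vdup_n_u8(7);
+//     let m = vceq_u8(a, b); // lanes compare equal, so every lane is 0xFF
+//     assert_eq!(vget_lane_u8::<0>(m), 0xFF);
+// }
+// ```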
+
+/// Signed compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtst_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+ let c: int8x8_t = simd_and(a, b);
+ let d: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Signed compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+ let c: int8x16_t = simd_and(a, b);
+ let d: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Signed compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtst_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+ let c: int16x4_t = simd_and(a, b);
+ let d: i16x4 = i16x4::new(0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Signed compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtstq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+ let c: int16x8_t = simd_and(a, b);
+ let d: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Signed compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtst_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+ let c: int32x2_t = simd_and(a, b);
+ let d: i32x2 = i32x2::new(0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Signed compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtstq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+ let c: int32x4_t = simd_and(a, b);
+ let d: i32x4 = i32x4::new(0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtst_p8(a: poly8x8_t, b: poly8x8_t) -> uint8x8_t {
+ let c: poly8x8_t = simd_and(a, b);
+ let d: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtstq_p8(a: poly8x16_t, b: poly8x16_t) -> uint8x16_t {
+ let c: poly8x16_t = simd_and(a, b);
+ let d: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtst_p16(a: poly16x4_t, b: poly16x4_t) -> uint16x4_t {
+ let c: poly16x4_t = simd_and(a, b);
+ let d: i16x4 = i16x4::new(0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtstq_p16(a: poly16x8_t, b: poly16x8_t) -> uint16x8_t {
+ let c: poly16x8_t = simd_and(a, b);
+ let d: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Unsigned compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtst_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ let c: uint8x8_t = simd_and(a, b);
+ let d: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Unsigned compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ let c: uint8x16_t = simd_and(a, b);
+ let d: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Unsigned compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtst_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ let c: uint16x4_t = simd_and(a, b);
+ let d: u16x4 = u16x4::new(0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Unsigned compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtstq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ let c: uint16x8_t = simd_and(a, b);
+ let d: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Unsigned compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtst_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ let c: uint32x2_t = simd_and(a, b);
+ let d: u32x2 = u32x2::new(0, 0);
+ simd_ne(c, transmute(d))
+}
+
+/// Unsigned compare bitwise Test bits nonzero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtst))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmtst))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtstq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ let c: uint32x4_t = simd_and(a, b);
+ let d: u32x4 = u32x4::new(0, 0, 0, 0);
+ simd_ne(c, transmute(d))
+}
+
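+// Usage sketch (illustrative only, not part of the generated bindings): each
+// `vtst*` lane becomes all-ones when `a & b` has any set bit, else all-zeros.
+// Assuming a caller already compiled with the `neon` target feature:
+//
+//     let a = vdup_n_u8(0b0000_1100);
+//     let b = vdup_n_u8(0b0000_0100);
+//     let m = vtst_u8(a, b); // bit 2 is shared, so every lane is 0xFF
+//     assert_eq!(vget_lane_u8::<0>(m), 0xFF);
+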
+/// Floating-point absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fabs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabs_f32(a: float32x2_t) -> float32x2_t {
+ simd_fabs(a)
+}
+
+/// Floating-point absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fabs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabsq_f32(a: float32x4_t) -> float32x4_t {
+ simd_fabs(a)
+}
+
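+// Usage sketch (illustrative only): `vabs*_f32` clears the sign bit of each
+// lane, so negative values (and negative zero) come back positive:
+//
+//     let v = vdup_n_f32(-1.5);
+//     assert_eq!(vget_lane_f32::<0>(vabs_f32(v)), 1.5);
+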
+/// Compare signed greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+ simd_gt(a, b)
+}
+
+/// Compare signed greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgtq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+ simd_gt(a, b)
+}
+
+/// Compare signed greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+ simd_gt(a, b)
+}
+
+/// Compare signed greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgtq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+ simd_gt(a, b)
+}
+
+/// Compare signed greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+ simd_gt(a, b)
+}
+
+/// Compare signed greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgtq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+ simd_gt(a, b)
+}
+
+/// Compare unsigned higher
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_gt(a, b)
+}
+
+/// Compare unsigned higher
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgtq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_gt(a, b)
+}
+
+/// Compare unsigned higher
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_gt(a, b)
+}
+
+/// Compare unsigned higher
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgtq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_gt(a, b)
+}
+
+/// Compare unsigned higher
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_gt(a, b)
+}
+
+/// Compare unsigned higher
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgtq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_gt(a, b)
+}
+
+/// Floating-point compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ simd_gt(a, b)
+}
+
+/// Floating-point compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgtq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ simd_gt(a, b)
+}
+
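+// Usage sketch (illustrative only): every comparison above returns a mask
+// vector of the matching unsigned type, with all-ones lanes where the
+// predicate holds. The float form (`vcgt.f32`/`fcmgt`) is an ordered
+// compare, so any lane involving NaN yields all-zeros:
+//
+//     let gt = vcgt_f32(vdup_n_f32(2.0), vdup_n_f32(1.0));
+//     assert_eq!(vget_lane_u32::<0>(gt), u32::MAX);
+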
+/// Compare signed less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+ simd_lt(a, b)
+}
+
+/// Compare signed less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcltq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+ simd_lt(a, b)
+}
+
+/// Compare signed less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+ simd_lt(a, b)
+}
+
+/// Compare signed less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcltq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+ simd_lt(a, b)
+}
+
+/// Compare signed less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+ simd_lt(a, b)
+}
+
+/// Compare signed less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcltq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+ simd_lt(a, b)
+}
+
+/// Compare unsigned less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_lt(a, b)
+}
+
+/// Compare unsigned less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcltq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_lt(a, b)
+}
+
+/// Compare unsigned less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_lt(a, b)
+}
+
+/// Compare unsigned less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcltq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_lt(a, b)
+}
+
+/// Compare unsigned less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_lt(a, b)
+}
+
+/// Compare unsigned less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhi))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcltq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_lt(a, b)
+}
+
+/// Floating-point compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ simd_lt(a, b)
+}
+
+/// Floating-point compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcgt.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcltq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ simd_lt(a, b)
+}
+
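+// Note on the `assert_instr` markers above: NEON has no dedicated less-than
+// instruction, so the backend emits the greater-than forms (`vcgt.*`, `cmgt`,
+// `cmhi`, `fcmgt`) with the operands swapped; `vclt_s8(a, b)` produces the
+// same mask as `vcgt_s8(b, a)`. A minimal sketch:
+//
+//     let lt = vclt_s8(vdup_n_s8(1), vdup_n_s8(2)); // all lanes 0xFF
+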
+/// Compare signed less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcle_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+ simd_le(a, b)
+}
+
+/// Compare signed less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcleq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+ simd_le(a, b)
+}
+
+/// Compare signed less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcle_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+ simd_le(a, b)
+}
+
+/// Compare signed less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+ simd_le(a, b)
+}
+
+/// Compare signed less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcle_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+ simd_le(a, b)
+}
+
+/// Compare signed less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcleq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+ simd_le(a, b)
+}
+
+/// Compare unsigned less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcle_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_le(a, b)
+}
+
+/// Compare unsigned less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcleq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_le(a, b)
+}
+
+/// Compare unsigned less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcle_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_le(a, b)
+}
+
+/// Compare unsigned less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcleq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_le(a, b)
+}
+
+/// Compare unsigned less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcle_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_le(a, b)
+}
+
+/// Compare unsigned less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcleq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_le(a, b)
+}
+
+/// Floating-point compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcle_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ simd_le(a, b)
+}
+
+/// Floating-point compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcleq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ simd_le(a, b)
+}
+
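+// The same operand swap applies here: only the greater-than-or-equal forms
+// (`vcge.*`, `cmge`, `cmhs`, `fcmge`) exist in hardware, so `vcle_u16(a, b)`
+// compiles to the same instruction as `vcge_u16(b, a)`:
+//
+//     let le = vcle_u16(vdup_n_u16(3), vdup_n_u16(3)); // equal lanes => all-ones
+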
+/// Compare signed greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcge_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+ simd_ge(a, b)
+}
+
+/// Compare signed greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgeq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+ simd_ge(a, b)
+}
+
+/// Compare signed greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcge_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+ simd_ge(a, b)
+}
+
+/// Compare signed greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgeq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+ simd_ge(a, b)
+}
+
+/// Compare signed greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcge_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+ simd_ge(a, b)
+}
+
+/// Compare signed greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgeq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+ simd_ge(a, b)
+}
+
+/// Compare unsigned greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcge_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_ge(a, b)
+}
+
+/// Compare unsigned greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgeq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_ge(a, b)
+}
+
+/// Compare unsigned greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcge_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_ge(a, b)
+}
+
+/// Compare unsigned greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgeq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_ge(a, b)
+}
+
+/// Compare unsigned greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcge_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_ge(a, b)
+}
+
+/// Compare unsigned greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cmhs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgeq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_ge(a, b)
+}
+
+/// Floating-point compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcge_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ simd_ge(a, b)
+}
+
+/// Floating-point compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcge.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcmge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcgeq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ simd_ge(a, b)
+}
+
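+// Usage sketch (illustrative only): comparison masks are typically consumed
+// by a bitwise select. Assuming a `neon`-enabled caller, a per-lane float
+// `max(a, b)` can be phrased with `vbslq_f32` from this module:
+//
+//     let (a, b) = (vdupq_n_f32(4.0), vdupq_n_f32(1.0));
+//     let mask = vcgeq_f32(a, b);       // all-ones where a >= b
+//     let max  = vbslq_f32(mask, a, b); // picks `a` where the mask is set
+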
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcls_s8(a: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.cls.v8i8")]
+ fn vcls_s8_(a: int8x8_t) -> int8x8_t;
+ }
+ vcls_s8_(a)
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclsq_s8(a: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.cls.v16i8")]
+ fn vclsq_s8_(a: int8x16_t) -> int8x16_t;
+ }
+ vclsq_s8_(a)
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcls_s16(a: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.cls.v4i16")]
+ fn vcls_s16_(a: int16x4_t) -> int16x4_t;
+ }
+ vcls_s16_(a)
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclsq_s16(a: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.cls.v8i16")]
+ fn vclsq_s16_(a: int16x8_t) -> int16x8_t;
+ }
+ vclsq_s16_(a)
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcls_s32(a: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.cls.v2i32")]
+ fn vcls_s32_(a: int32x2_t) -> int32x2_t;
+ }
+ vcls_s32_(a)
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vcls.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclsq_s32(a: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcls.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.cls.v4i32")]
+ fn vclsq_s32_(a: int32x4_t) -> int32x4_t;
+ }
+ vclsq_s32_(a)
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcls_u8(a: uint8x8_t) -> int8x8_t {
+ transmute(vcls_s8(transmute(a)))
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclsq_u8(a: uint8x16_t) -> int8x16_t {
+ transmute(vclsq_s8(transmute(a)))
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcls_u16(a: uint16x4_t) -> int16x4_t {
+ transmute(vcls_s16(transmute(a)))
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclsq_u16(a: uint16x8_t) -> int16x8_t {
+ transmute(vclsq_s16(transmute(a)))
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcls_u32(a: uint32x2_t) -> int32x2_t {
+ transmute(vcls_s32(transmute(a)))
+}
+
+/// Count leading sign bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcls))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclsq_u32(a: uint32x4_t) -> int32x4_t {
+ transmute(vclsq_s32(transmute(a)))
+}
+
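+// Usage sketch (illustrative only): `vcls*` counts how many bits directly
+// below the sign bit match it, excluding the sign bit itself. For an i8 lane,
+// both 0 and -1 give 7, while 1 (0b0000_0001) gives 6:
+//
+//     assert_eq!(vget_lane_s8::<0>(vcls_s8(vdup_n_s8(1))), 6);
+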
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclz_s8(a: int8x8_t) -> int8x8_t {
+ vclz_s8_(a)
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclzq_s8(a: int8x16_t) -> int8x16_t {
+ vclzq_s8_(a)
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclz_s16(a: int16x4_t) -> int16x4_t {
+ vclz_s16_(a)
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclzq_s16(a: int16x8_t) -> int16x8_t {
+ vclzq_s16_(a)
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclz_s32(a: int32x2_t) -> int32x2_t {
+ vclz_s32_(a)
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclzq_s32(a: int32x4_t) -> int32x4_t {
+ vclzq_s32_(a)
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclz_u8(a: uint8x8_t) -> uint8x8_t {
+ transmute(vclz_s8_(transmute(a)))
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclzq_u8(a: uint8x16_t) -> uint8x16_t {
+ transmute(vclzq_s8_(transmute(a)))
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclz_u16(a: uint16x4_t) -> uint16x4_t {
+ transmute(vclz_s16_(transmute(a)))
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclzq_u16(a: uint16x8_t) -> uint16x8_t {
+ transmute(vclzq_s16_(transmute(a)))
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclz_u32(a: uint32x2_t) -> uint32x2_t {
+ transmute(vclz_s32_(transmute(a)))
+}
+
+/// Count leading zero bits
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vclz.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vclzq_u32(a: uint32x4_t) -> uint32x4_t {
+ transmute(vclzq_s32_(transmute(a)))
+}
+
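+// Usage sketch (illustrative only): `vclz*` is a per-lane count of leading
+// zero bits, matching `u8::leading_zeros` and friends lane by lane:
+//
+//     assert_eq!(vget_lane_u8::<0>(vclz_u8(vdup_n_u8(1))), 7);
+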
+/// Floating-point absolute compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(facgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcagt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacgt.v2i32.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facgt.v2i32.v2f32")]
+ fn vcagt_f32_(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
+ }
+ vcagt_f32_(a, b)
+}
+
+/// Floating-point absolute compare greater than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(facgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcagtq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacgt.v4i32.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facgt.v4i32.v4f32")]
+ fn vcagtq_f32_(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
+ }
+ vcagtq_f32_(a, b)
+}
+
+/// Floating-point absolute compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(facge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcage_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacge.v2i32.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facge.v2i32.v2f32")]
+ fn vcage_f32_(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
+ }
+ vcage_f32_(a, b)
+}
+
+/// Floating-point absolute compare greater than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(facge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcageq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vacge.v4i32.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.facge.v4i32.v4f32")]
+ fn vcageq_f32_(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
+ }
+ vcageq_f32_(a, b)
+}
+
+/// Floating-point absolute compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(facgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcalt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ vcagt_f32(b, a)
+}
+
+/// Floating-point absolute compare less than
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacgt.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(facgt))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcaltq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ vcagtq_f32(b, a)
+}
+
+/// Floating-point absolute compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(facge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcale_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+ vcage_f32(b, a)
+}
+
+/// Floating-point absolute compare less than or equal
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vacge.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(facge))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcaleq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t {
+ vcageq_f32(b, a)
+}
+
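+// The absolute comparisons test magnitudes: `vcagt_f32(a, b)` is the mask of
+// `|a| > |b|`, and the less-than forms above simply swap the operands before
+// delegating (`vcalt_f32(a, b)` equals `vcagt_f32(b, a)`), which is why they
+// assert the same `vacgt.f32`/`facgt` instructions. A minimal sketch:
+//
+//     let m = vcagt_f32(vdup_n_f32(-3.0), vdup_n_f32(2.0)); // |-3| > |2| => all-ones
+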
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_s8(a: u64) -> int8x8_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_s16(a: u64) -> int16x4_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_s32(a: u64) -> int32x2_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_s64(a: u64) -> int64x1_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_u8(a: u64) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_u16(a: u64) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_u32(a: u64) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_u64(a: u64) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_p8(a: u64) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_p16(a: u64) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_p64(a: u64) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcreate_f32(a: u64) -> float32x2_t {
+ transmute(a)
+}
+
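+// Usage sketch (illustrative only): `vcreate_*` reinterprets the `u64` bit
+// pattern as a 64-bit vector; on the usual little-endian targets the least
+// significant byte of the integer lands in lane 0:
+//
+//     let v = vcreate_u8(0x0706_0504_0302_0100);
+//     assert_eq!(vget_lane_u8::<0>(v), 0x00);
+//     assert_eq!(vget_lane_u8::<7>(v), 0x07);
+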
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(scvtf))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcvt_f32_s32(a: int32x2_t) -> float32x2_t {
+ simd_cast(a)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(scvtf))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcvtq_f32_s32(a: int32x4_t) -> float32x4_t {
+ simd_cast(a)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ucvtf))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcvt_f32_u32(a: uint32x2_t) -> float32x2_t {
+ simd_cast(a)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ucvtf))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcvtq_f32_u32(a: uint32x4_t) -> float32x4_t {
+ simd_cast(a)
+}
+
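+// Usage sketch (illustrative only): these are plain lane-wise integer-to-float
+// casts, equivalent to `as f32` on each element:
+//
+//     let f = vcvt_f32_s32(vdup_n_s32(-2));
+//     assert_eq!(vget_lane_f32::<0>(f), -2.0);
+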
+/// Fixed-point convert to floating-point
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vcvt, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vcvt_n_f32_s32<const N: i32>(a: int32x2_t) -> float32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32")]
+ fn vcvt_n_f32_s32_(a: int32x2_t, n: i32) -> float32x2_t;
+ }
+ vcvt_n_f32_s32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_n_f32_s32<const N: i32>(a: int32x2_t) -> float32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32")]
+ fn vcvt_n_f32_s32_(a: int32x2_t, n: i32) -> float32x2_t;
+ }
+ vcvt_n_f32_s32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vcvt, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vcvtq_n_f32_s32<const N: i32>(a: int32x4_t) -> float32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32")]
+ fn vcvtq_n_f32_s32_(a: int32x4_t, n: i32) -> float32x4_t;
+ }
+ vcvtq_n_f32_s32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(scvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_n_f32_s32<const N: i32>(a: int32x4_t) -> float32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32")]
+ fn vcvtq_n_f32_s32_(a: int32x4_t, n: i32) -> float32x4_t;
+ }
+ vcvtq_n_f32_s32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vcvt, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vcvt_n_f32_u32<const N: i32>(a: uint32x2_t) -> float32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32")]
+ fn vcvt_n_f32_u32_(a: uint32x2_t, n: i32) -> float32x2_t;
+ }
+ vcvt_n_f32_u32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_n_f32_u32<const N: i32>(a: uint32x2_t) -> float32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32")]
+ fn vcvt_n_f32_u32_(a: uint32x2_t, n: i32) -> float32x2_t;
+ }
+ vcvt_n_f32_u32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vcvt, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vcvtq_n_f32_u32<const N: i32>(a: uint32x4_t) -> float32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32")]
+ fn vcvtq_n_f32_u32_(a: uint32x4_t, n: i32) -> float32x4_t;
+ }
+ vcvtq_n_f32_u32_(a, N)
+}
+
+/// Fixed-point convert to floating-point
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ucvtf, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_n_f32_u32<const N: i32>(a: uint32x4_t) -> float32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32")]
+ fn vcvtq_n_f32_u32_(a: uint32x4_t, n: i32) -> float32x4_t;
+ }
+ vcvtq_n_f32_u32_(a, N)
+}
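
In the `_n_` variants the const generic `N` (1..=32, enforced by the `static_assert!`) is the number of fractional bits: each integer lane is read as a fixed-point value and the result is `lane / 2^N`. A sketch of that semantics (my example values, AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn fixed_to_float_demo() {
    use std::arch::aarch64::{vcvt_n_f32_s32, vget_lane_f32, vld1_s32};
    // Interpret each lane as Q.8 fixed point, i.e. value / 2^8.
    let raw: [i32; 2] = [256, -128];
    let v = vld1_s32(raw.as_ptr());
    let f = vcvt_n_f32_s32::<8>(v);
    assert_eq!(vget_lane_f32::<0>(f), 1.0);  // 256 / 256
    assert_eq!(vget_lane_f32::<1>(f), -0.5); // -128 / 256
}
```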
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vcvt, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vcvt_n_s32_f32<const N: i32>(a: float32x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32")]
+ fn vcvt_n_s32_f32_(a: float32x2_t, n: i32) -> int32x2_t;
+ }
+ vcvt_n_s32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_n_s32_f32<const N: i32>(a: float32x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32")]
+ fn vcvt_n_s32_f32_(a: float32x2_t, n: i32) -> int32x2_t;
+ }
+ vcvt_n_s32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vcvt, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vcvtq_n_s32_f32<const N: i32>(a: float32x4_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32")]
+ fn vcvtq_n_s32_f32_(a: float32x4_t, n: i32) -> int32x4_t;
+ }
+ vcvtq_n_s32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzs, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_n_s32_f32<const N: i32>(a: float32x4_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32")]
+ fn vcvtq_n_s32_f32_(a: float32x4_t, n: i32) -> int32x4_t;
+ }
+ vcvtq_n_s32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vcvt, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vcvt_n_u32_f32<const N: i32>(a: float32x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32")]
+ fn vcvt_n_u32_f32_(a: float32x2_t, n: i32) -> uint32x2_t;
+ }
+ vcvt_n_u32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvt_n_u32_f32<const N: i32>(a: float32x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32")]
+ fn vcvt_n_u32_f32_(a: float32x2_t, n: i32) -> uint32x2_t;
+ }
+ vcvt_n_u32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vcvt, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vcvtq_n_u32_f32<const N: i32>(a: float32x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32")]
+ fn vcvtq_n_u32_f32_(a: float32x4_t, n: i32) -> uint32x4_t;
+ }
+ vcvtq_n_u32_f32_(a, N)
+}
+
+/// Floating-point convert to fixed-point, rounding toward zero
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fcvtzu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vcvtq_n_u32_f32<const N: i32>(a: float32x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32")]
+ fn vcvtq_n_u32_f32_(a: float32x4_t, n: i32) -> uint32x4_t;
+ }
+ vcvtq_n_u32_f32_(a, N)
+}
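
The reverse direction scales by `2^N` before truncating toward zero, i.e. each lane becomes `trunc(lane * 2^N)`, with out-of-range results saturated. A sketch (my values, AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn float_to_fixed_demo() {
    use std::arch::aarch64::{vcvt_n_s32_f32, vget_lane_s32, vld1_f32};
    let vals: [f32; 2] = [1.5, -0.75];
    let f = vld1_f32(vals.as_ptr());
    let q = vcvt_n_s32_f32::<8>(f); // scale by 2^8, round toward zero
    assert_eq!(vget_lane_s32::<0>(q), 384);  //  1.5  * 256
    assert_eq!(vget_lane_s32::<1>(q), -192); // -0.75 * 256
}
```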
+
+/// Floating-point convert to signed fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcvtzs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcvt_s32_f32(a: float32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.fptosi.sat.v2i32.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptosi.sat.v2i32.v2f32")]
+ fn vcvt_s32_f32_(a: float32x2_t) -> int32x2_t;
+ }
+ vcvt_s32_f32_(a)
+}
+
+/// Floating-point convert to signed fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcvtzs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.fptosi.sat.v4i32.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptosi.sat.v4i32.v4f32")]
+ fn vcvtq_s32_f32_(a: float32x4_t) -> int32x4_t;
+ }
+ vcvtq_s32_f32_(a)
+}
+
+/// Floating-point convert to unsigned fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcvtzu))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcvt_u32_f32(a: float32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.fptoui.sat.v2i32.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptoui.sat.v2i32.v2f32")]
+ fn vcvt_u32_f32_(a: float32x2_t) -> uint32x2_t;
+ }
+ vcvt_u32_f32_(a)
+}
+
+/// Floating-point convert to unsigned fixed-point, rounding toward zero
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcvtzu))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.fptoui.sat.v4i32.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptoui.sat.v4i32.v4f32")]
+ fn vcvtq_u32_f32_(a: float32x4_t) -> uint32x4_t;
+ }
+ vcvtq_u32_f32_(a)
+}
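
Because these lower to the `llvm.fptosi.sat`/`llvm.fptoui.sat` intrinsics, the conversion is saturating: lanes below the target range clamp to the minimum, lanes above clamp to the maximum, and NaN becomes 0. A sketch of the unsigned case (my values, AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn saturating_cvt_demo() {
    use std::arch::aarch64::{vcvt_u32_f32, vget_lane_u32, vld1_f32};
    let vals: [f32; 2] = [-1.0, 3.9];
    let f = vld1_f32(vals.as_ptr());
    let u = vcvt_u32_f32(f);
    assert_eq!(vget_lane_u32::<0>(u), 0); // negative input saturates to 0
    assert_eq!(vget_lane_u32::<1>(u), 3); // fraction truncates toward zero
}
```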
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
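
`vdup_lane_s8::<N>` broadcasts lane `N` of the source into every lane of the result; `static_assert_imm3!` keeps `N` in 0..=7, and the shuffle mask simply repeats `N`. The many variants that follow differ only in element type and vector width. A usage sketch (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn dup_lane_demo() {
    use std::arch::aarch64::{vdup_lane_s8, vget_lane_s8, vld1_s8};
    let input: [i8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    let v = vld1_s8(input.as_ptr());
    let d = vdup_lane_s8::<3>(v); // every lane becomes old lane 3
    assert_eq!(vget_lane_s8::<0>(d), 3);
    assert_eq!(vget_lane_s8::<7>(d), 3);
}
```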
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+ static_assert_imm4!(N);
+ simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_s8<const N: i32>(a: int8x16_t) -> int8x8_t {
+ static_assert_imm4!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_s16<const N: i32>(a: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_s32<const N: i32>(a: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_s8<const N: i32>(a: int8x8_t) -> int8x16_t {
+ static_assert_imm3!(N);
+ simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_s16<const N: i32>(a: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_s32<const N: i32>(a: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+ static_assert_imm4!(N);
+ simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
+ static_assert_imm2!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
+ static_assert_imm3!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
+ static_assert_imm1!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+ static_assert_imm2!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x8_t {
+ static_assert_imm4!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x4_t {
+ static_assert_imm3!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x2_t {
+ static_assert_imm2!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x16_t {
+ static_assert_imm3!(N);
+ simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x8_t {
+ static_assert_imm2!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_u32<const N: i32>(a: uint32x2_t) -> uint32x4_t {
+ static_assert_imm1!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x8_t {
+ static_assert_imm3!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x16_t {
+ static_assert_imm4!(N);
+ simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x4_t {
+ static_assert_imm2!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x8_t {
+ static_assert_imm3!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 8))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x8_t {
+ static_assert_imm4!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x4_t {
+ static_assert_imm3!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 4))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x16_t {
+ static_assert_imm3!(N);
+ simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x8_t {
+ static_assert_imm2!(N);
+ simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_s64<const N: i32>(a: int64x1_t) -> int64x2_t {
+ static_assert!(N : i32 where N == 0);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
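
With a one-lane `int64x1_t` source there is only lane 0 to broadcast, hence the `N == 0` assertion rather than a range check. A sketch (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn dup_lane_s64_demo() {
    use std::arch::aarch64::{vdupq_lane_s64, vgetq_lane_s64, vld1_s64};
    let one: [i64; 1] = [42];
    let v = vld1_s64(one.as_ptr());
    let q = vdupq_lane_s64::<0>(v); // only N = 0 is accepted
    assert_eq!(vgetq_lane_s64::<0>(q), 42);
    assert_eq!(vgetq_lane_s64::<1>(q), 42);
}
```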
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+ static_assert_imm1!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_u64<const N: i32>(a: uint64x1_t) -> uint64x2_t {
+ static_assert!(N : i32 where N == 0);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_f32<const N: i32>(a: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_laneq_f32<const N: i32>(a: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_f32<const N: i32>(a: float32x4_t) -> float32x2_t {
+ static_assert_imm2!(N);
+ simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdupq_lane_f32<const N: i32>(a: float32x2_t) -> float32x4_t {
+ static_assert_imm1!(N);
+ simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, N = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where N == 0);
+ a
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, N = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, N = 0))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_lane_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where N == 0);
+ a
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_s64<const N: i32>(a: int64x2_t) -> int64x1_t {
+ static_assert_imm1!(N);
+ transmute::<i64, _>(simd_extract(a, N as u32))
+}
+
+/// Set all vector lanes to the same value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, N = 1))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vdup_laneq_u64<const N: i32>(a: uint64x2_t) -> uint64x1_t {
+ static_assert_imm1!(N);
+ transmute::<u64, _>(simd_extract(a, N as u32))
+}
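
The `_laneq_` forms read from a 128-bit source; for a one-lane 64-bit result the body is just `simd_extract` plus a transmute, which is why the AArch64 assertion is `nop`. A sketch (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn dup_laneq_s64_demo() {
    use std::arch::aarch64::{vdup_laneq_s64, vget_lane_s64, vld1q_s64};
    let input: [i64; 2] = [10, 20];
    let v = vld1q_s64(input.as_ptr());
    let d = vdup_laneq_s64::<1>(v); // one-lane result holding lane 1
    assert_eq!(vget_lane_s64::<0>(d), 20);
}
```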
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(N);
+ match N & 0b111 {
+ 0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+ 2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+ 3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+ 4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+ 5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+ 6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+ 7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+ _ => unreachable_unchecked(),
+ }
+}
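
`vext_s8::<N>` conceptually concatenates `a:b` and returns the 8 consecutive lanes starting at index `N`, which is what each `simd_shuffle8!` arm of the match spells out. A sketch (AArch64 assumed):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn ext_demo() {
    use std::arch::aarch64::{vext_s8, vget_lane_s8, vld1_s8};
    let lo: [i8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    let hi: [i8; 8] = [8, 9, 10, 11, 12, 13, 14, 15];
    let a = vld1_s8(lo.as_ptr());
    let b = vld1_s8(hi.as_ptr());
    let e = vext_s8::<3>(a, b); // lanes 3..=10 of the pair a:b
    assert_eq!(vget_lane_s8::<0>(e), 3);
    assert_eq!(vget_lane_s8::<7>(e), 10);
}
```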
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 8))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert_imm4!(N);
+ match N & 0b1111 {
+ 0 => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
+ 2 => simd_shuffle16!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
+ 3 => simd_shuffle16!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
+ 4 => simd_shuffle16!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
+ 5 => simd_shuffle16!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
+ 6 => simd_shuffle16!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
+ 7 => simd_shuffle16!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
+ 8 => simd_shuffle16!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
+ 9 => simd_shuffle16!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
+ 10 => simd_shuffle16!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
+ 11 => simd_shuffle16!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
+ 12 => simd_shuffle16!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
+ 13 => simd_shuffle16!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
+ 14 => simd_shuffle16!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
+ 15 => simd_shuffle16!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(N);
+ match N & 0b11 {
+ 0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+ 2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+ 3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(N);
+ match N & 0b111 {
+ 0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+ 2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+ 3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+ 4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+ 5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+ 6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+ 7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(N);
+ match N & 0b1 {
+ 0 => simd_shuffle2!(a, b, [0, 1]),
+ 1 => simd_shuffle2!(a, b, [1, 2]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ match N & 0b11 {
+ 0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+ 2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+ 3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ match N & 0b111 {
+ 0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+ 2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+ 3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+ 4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+ 5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+ 6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+ 7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 8))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert_imm4!(N);
+ match N & 0b1111 {
+ 0 => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
+ 2 => simd_shuffle16!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
+ 3 => simd_shuffle16!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
+ 4 => simd_shuffle16!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
+ 5 => simd_shuffle16!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
+ 6 => simd_shuffle16!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
+ 7 => simd_shuffle16!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
+ 8 => simd_shuffle16!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
+ 9 => simd_shuffle16!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
+ 10 => simd_shuffle16!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
+ 11 => simd_shuffle16!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
+ 12 => simd_shuffle16!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
+ 13 => simd_shuffle16!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
+ 14 => simd_shuffle16!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
+ 15 => simd_shuffle16!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert_imm2!(N);
+ match N & 0b11 {
+ 0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+ 2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+ 3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert_imm3!(N);
+ match N & 0b111 {
+ 0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+ 2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+ 3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+ 4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+ 5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+ 6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+ 7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert_imm1!(N);
+ match N & 0b1 {
+ 0 => simd_shuffle2!(a, b, [0, 1]),
+ 1 => simd_shuffle2!(a, b, [1, 2]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert_imm2!(N);
+ match N & 0b11 {
+ 0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+ 2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+ 3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ static_assert_imm3!(N);
+ match N & 0b111 {
+ 0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+ 2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+ 3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+ 4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+ 5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+ 6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+ 7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 8))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ static_assert_imm4!(N);
+ match N & 0b1111 {
+ 0 => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle16!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
+ 2 => simd_shuffle16!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
+ 3 => simd_shuffle16!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
+ 4 => simd_shuffle16!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
+ 5 => simd_shuffle16!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
+ 6 => simd_shuffle16!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
+ 7 => simd_shuffle16!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
+ 8 => simd_shuffle16!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
+ 9 => simd_shuffle16!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
+ 10 => simd_shuffle16!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
+ 11 => simd_shuffle16!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
+ 12 => simd_shuffle16!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
+ 13 => simd_shuffle16!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
+ 14 => simd_shuffle16!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
+ 15 => simd_shuffle16!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ static_assert_imm2!(N);
+ match N & 0b11 {
+ 0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+ 2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+ 3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 4))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ static_assert_imm3!(N);
+ match N & 0b111 {
+ 0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+ 1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+ 2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+ 3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+ 4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+ 5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+ 6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+ 7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ match N & 0b1 {
+ 0 => simd_shuffle2!(a, b, [0, 1]),
+ 1 => simd_shuffle2!(a, b, [1, 2]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmov, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert_imm1!(N);
+ match N & 0b1 {
+ 0 => simd_shuffle2!(a, b, [0, 1]),
+ 1 => simd_shuffle2!(a, b, [1, 2]),
+ _ => unreachable_unchecked(),
+ }
+}
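+
+// With only two 64-bit lanes per vector, `N = 1` pairs the high lane of `a`
+// with the low lane of `b`, which the 32-bit ARM backend can express as
+// plain register moves; that is why the assertion above expects `vmov`
+// rather than `vext.8`.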
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vext_f32<const N: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(N);
+ match N & 0b1 {
+ 0 => simd_shuffle2!(a, b, [0, 1]),
+ 1 => simd_shuffle2!(a, b, [1, 2]),
+ _ => unreachable_unchecked(),
+ }
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vext.8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vextq_f32<const N: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(N);
+ match N & 0b11 {
+ 0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+ 2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+ 3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
+ _ => unreachable_unchecked(),
+ }
+}
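+
+// Usage sketch for the two-lane float variant (illustrative; assumes the
+// caller has already verified NEON support, e.g. with
+// `is_arm_feature_detected!("neon")` on a std build):
+//
+//     let a = vdup_n_f32(1.0);       // [1.0, 1.0]
+//     let b = vdup_n_f32(2.0);       // [2.0, 2.0]
+//     let r = vext_f32::<1>(a, b);   // indices [1, 2] -> [1.0, 2.0]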
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Floating-point multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ simd_add(a, simd_mul(b, c))
+}
+
+/// Floating-point multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ simd_add(a, simd_mul(b, c))
+}
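+
+// Semantics sketch for the `vmla*` family: each returns `a + b * c` lane by
+// lane, e.g. `vmla_s16([1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3])` yields
+// `[7, 7, 7, 7]`. The integer forms are checked against a single `mla`
+// instruction, while the float forms are written as an unfused
+// `simd_add(a, simd_mul(b, c))`, so on AArch64 the assertions above expect a
+// separate `fmul` instead of a fused multiply-add.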
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_n_s16(a: int16x4_t, b: int16x4_t, c: i16) -> int16x4_t {
+ vmla_s16(a, b, vdup_n_s16(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_n_s16(a: int16x8_t, b: int16x8_t, c: i16) -> int16x8_t {
+ vmlaq_s16(a, b, vdupq_n_s16(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_n_s32(a: int32x2_t, b: int32x2_t, c: i32) -> int32x2_t {
+ vmla_s32(a, b, vdup_n_s32(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_n_s32(a: int32x4_t, b: int32x4_t, c: i32) -> int32x4_t {
+ vmlaq_s32(a, b, vdupq_n_s32(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_n_u16(a: uint16x4_t, b: uint16x4_t, c: u16) -> uint16x4_t {
+ vmla_u16(a, b, vdup_n_u16(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_n_u16(a: uint16x8_t, b: uint16x8_t, c: u16) -> uint16x8_t {
+ vmlaq_u16(a, b, vdupq_n_u16(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_n_u32(a: uint32x2_t, b: uint32x2_t, c: u32) -> uint32x2_t {
+ vmla_u32(a, b, vdup_n_u32(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_n_u32(a: uint32x4_t, b: uint32x4_t, c: u32) -> uint32x4_t {
+ vmlaq_u32(a, b, vdupq_n_u32(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+ vmla_f32(a, b, vdup_n_f32(c))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+ vmlaq_f32(a, b, vdupq_n_f32(c))
+}
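+
+// The `_n` variants are thin wrappers: the scalar `c` is splatted with
+// `vdup(q)_n_*` and the vector form does the work, so `vmla_n_s16(a, b, 3)`
+// is equivalent to `vmla_s16(a, b, vdup_n_s16(3))`.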
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ vmla_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(LANE);
+ vmla_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(LANE);
+ vmlaq_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ vmlaq_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ vmla_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ vmla_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ vmlaq_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vmlaq_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
+ static_assert_imm2!(LANE);
+ vmla_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t {
+ static_assert_imm3!(LANE);
+ vmla_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t {
+ static_assert_imm2!(LANE);
+ vmlaq_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+ static_assert_imm3!(LANE);
+ vmlaq_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
+ static_assert_imm1!(LANE);
+ vmla_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t {
+ static_assert_imm2!(LANE);
+ vmla_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t {
+ static_assert_imm1!(LANE);
+ vmlaq_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ vmlaq_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ vmla_f32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmla_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+ static_assert_imm2!(LANE);
+ vmla_f32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ vmlaq_f32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlaq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE);
+ vmlaq_f32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
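+
+// The `_lane`/`_laneq` variants broadcast one lane of `c` rather than a
+// scalar: the `simd_shuffle` repeats lane `LANE` across every output lane,
+// and the `static_assert_imm*` guard bounds `LANE` by the lane count of `c`
+// (`imm1` for a 2-lane `c`, `imm2` for 4 lanes, `imm3` for 8). For example,
+// `vmla_lane_s16::<1>(a, b, c)` computes `a + b * c[1]` in every lane.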
+
+/// Signed multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
+ simd_add(a, vmull_s8(b, c))
+}
+
+/// Signed multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ simd_add(a, vmull_s16(b, c))
+}
+
+/// Signed multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ simd_add(a, vmull_s32(b, c))
+}
+
+/// Unsigned multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
+ simd_add(a, vmull_u8(b, c))
+}
+
+/// Unsigned multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+ simd_add(a, vmull_u16(b, c))
+}
+
+/// Unsigned multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+ simd_add(a, vmull_u32(b, c))
+}
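+
+// Widening sketch: `vmlal_*` widens each element of `b` and `c` to twice its
+// width, multiplies, and accumulates into the wide vector `a`. For example,
+// `vmlal_s16` takes an `int32x4_t` accumulator plus two `int16x4_t` factors
+// and computes `a + widen(b) * widen(c)`, so the products cannot overflow at
+// the narrow element width.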
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+ vmlal_s16(a, b, vdup_n_s16(c))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+ vmlal_s32(a, b, vdup_n_s32(c))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_n_u16(a: uint32x4_t, b: uint16x4_t, c: u16) -> uint32x4_t {
+ vmlal_u16(a, b, vdup_n_u16(c))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t {
+ vmlal_u32(a, b, vdup_n_u32(c))
+}
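+
+// As with `vmla_n_*`, these splat the scalar and defer to the vector form:
+// `vmlal_n_s16(a, b, 3)` is `vmlal_s16(a, b, vdup_n_s16(3))`.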
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vmlal_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(LANE);
+ vmlal_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE);
+ vmlal_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(LANE);
+ vmlal_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ vmlal_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t {
+ static_assert_imm3!(LANE);
+ vmlal_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE);
+ vmlal_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlal_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t {
+ static_assert_imm2!(LANE);
+ vmlal_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
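+
+// Lane selection works exactly as in the non-widening `vmla_lane*` wrappers:
+// lane `LANE` of `c` is broadcast across a narrow vector that is then fed to
+// the corresponding `vmlal_*` call, so `vmlal_lane_s16::<1>(a, b, c)`
+// accumulates `widen(b) * widen(c[1])` into `a`.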
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Floating-point multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ simd_sub(a, simd_mul(b, c))
+}
+
+/// Floating-point multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ simd_sub(a, simd_mul(b, c))
+}
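+
+// Mirror of the `vmla*` family with subtraction: each `vmls*` returns
+// `a - b * c` lane by lane, e.g. `vmls_s16([10, ...], [2, ...], [3, ...])`
+// yields `[4, ...]`. The float forms carry the same unfused-`fmul`
+// expectation on AArch64 as their `vmla` counterparts.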
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_n_s16(a: int16x4_t, b: int16x4_t, c: i16) -> int16x4_t {
+ vmls_s16(a, b, vdup_n_s16(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_n_s16(a: int16x8_t, b: int16x8_t, c: i16) -> int16x8_t {
+ vmlsq_s16(a, b, vdupq_n_s16(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_n_s32(a: int32x2_t, b: int32x2_t, c: i32) -> int32x2_t {
+ vmls_s32(a, b, vdup_n_s32(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_n_s32(a: int32x4_t, b: int32x4_t, c: i32) -> int32x4_t {
+ vmlsq_s32(a, b, vdupq_n_s32(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_n_u16(a: uint16x4_t, b: uint16x4_t, c: u16) -> uint16x4_t {
+ vmls_u16(a, b, vdup_n_u16(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_n_u16(a: uint16x8_t, b: uint16x8_t, c: u16) -> uint16x8_t {
+ vmlsq_u16(a, b, vdupq_n_u16(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_n_u32(a: uint32x2_t, b: uint32x2_t, c: u32) -> uint32x2_t {
+ vmls_u32(a, b, vdup_n_u32(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_n_u32(a: uint32x4_t, b: uint32x4_t, c: u32) -> uint32x4_t {
+ vmlsq_u32(a, b, vdupq_n_u32(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+ vmls_f32(a, b, vdup_n_f32(c))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+ vmlsq_f32(a, b, vdupq_n_f32(c))
+}
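+
+// Same splat-and-delegate pattern as the `vmla_n_*` wrappers above:
+// `vmls_n_u32(a, b, 5)` is `vmls_u32(a, b, vdup_n_u32(5))`.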
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ vmls_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(LANE);
+ vmls_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(LANE);
+ vmlsq_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ vmlsq_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ vmls_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ vmls_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ vmlsq_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vmlsq_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
+ static_assert_imm2!(LANE);
+ vmls_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t {
+ static_assert_imm3!(LANE);
+ vmls_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t {
+ static_assert_imm2!(LANE);
+ vmlsq_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+ static_assert_imm3!(LANE);
+ vmlsq_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
+ static_assert_imm1!(LANE);
+ vmls_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t {
+ static_assert_imm2!(LANE);
+ vmls_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t {
+ static_assert_imm1!(LANE);
+ vmlsq_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ vmlsq_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ vmls_f32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmls_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+ static_assert_imm2!(LANE);
+ vmls_f32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ vmlsq_f32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE);
+ vmlsq_f32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
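+// Editor's note (illustrative sketch, not part of the upstream file): the
+// `vmls*_lane*` intrinsics above broadcast lane `LANE` of `c` and compute
+// `r[i] = a[i] - b[i] * c[LANE]`. The function name `mls_lane_sketch` is
+// invented for this example; it assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn mls_lane_sketch() {
+    let a = vdup_n_s16(100);                     // accumulator: [100; 4]
+    let b = vdup_n_s16(3);                       // multiplicand: [3; 4]
+    let c = vld1_s16([1i16, 2, 3, 4].as_ptr());  // lane source
+    let r = vmls_lane_s16::<2>(a, b, c);         // 100 - 3 * c[2] = 91 in every lane
+    assert_eq!(vget_lane_s16::<0>(r), 91);
+}
+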
+/// Signed multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
+ simd_sub(a, vmull_s8(b, c))
+}
+
+/// Signed multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ simd_sub(a, vmull_s16(b, c))
+}
+
+/// Signed multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ simd_sub(a, vmull_s32(b, c))
+}
+
+/// Unsigned multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
+ simd_sub(a, vmull_u8(b, c))
+}
+
+/// Unsigned multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+ simd_sub(a, vmull_u16(b, c))
+}
+
+/// Unsigned multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+ simd_sub(a, vmull_u32(b, c))
+}
+
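+// Editor's note (illustrative sketch, not part of the upstream file): the
+// `vmlsl_*` intrinsics widen the product before subtracting, so 8-bit inputs
+// accumulate into 16-bit lanes without overflow. `mlsl_sketch` is an invented
+// name; it assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn mlsl_sketch() {
+    let acc = vdupq_n_s16(1000);       // int16x8_t accumulator
+    let b = vdup_n_s8(100);
+    let c = vdup_n_s8(100);            // 100 * 100 = 10000 only fits once widened
+    let r = vmlsl_s8(acc, b, c);       // each lane: 1000 - 10000 = -9000
+    assert_eq!(vgetq_lane_s16::<0>(r), -9000);
+}
+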
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+ vmlsl_s16(a, b, vdup_n_s16(c))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+ vmlsl_s32(a, b, vdup_n_s32(c))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_n_u16(a: uint32x4_t, b: uint16x4_t, c: u16) -> uint32x4_t {
+ vmlsl_u16(a, b, vdup_n_u16(c))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t {
+ vmlsl_u32(a, b, vdup_n_u32(c))
+}
+
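+// Editor's note (illustrative sketch, not part of the upstream file): the `_n`
+// variants above are shorthand for duplicating the scalar across all lanes
+// first, so the two calls in this sketch are equivalent by definition.
+// `mlsl_n_sketch` is an invented name; it assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn mlsl_n_sketch() {
+    let a = vdupq_n_s32(50);
+    let b = vdup_n_s16(4);
+    let r1 = vmlsl_n_s16(a, b, 5);               // 50 - 4 * 5 = 30 in every lane
+    let r2 = vmlsl_s16(a, b, vdup_n_s16(5));     // same result via explicit dup
+    assert_eq!(vgetq_lane_s32::<0>(r1), vgetq_lane_s32::<0>(r2));
+}
+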
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vmlsl_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(LANE);
+ vmlsl_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE);
+ vmlsl_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(LANE);
+ vmlsl_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ vmlsl_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t {
+ static_assert_imm3!(LANE);
+ vmlsl_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE);
+ vmlsl_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector widening multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmlsl_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t {
+ static_assert_imm2!(LANE);
+ vmlsl_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
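+// Editor's note (illustrative sketch, not part of the upstream file): the lane
+// index of the `vmlsl_lane*` intrinsics above is a const generic validated at
+// compile time (`static_assert_imm2!` admits 0..=3 for a 4-lane source), so an
+// out-of-range index fails to build rather than misbehaving at run time.
+// `mlsl_lane_sketch` is an invented name; it assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn mlsl_lane_sketch() {
+    let a = vdupq_n_s32(0);
+    let b = vdup_n_s16(2);
+    let c = vld1_s16([10i16, 20, 30, 40].as_ptr());
+    let r = vmlsl_lane_s16::<3>(a, b, c);    // 0 - 2 * c[3] = -80 in every lane
+    assert_eq!(vgetq_lane_s32::<0>(r), -80);
+}
+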
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vneg_s8(a: int8x8_t) -> int8x8_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vneg_s16(a: int16x4_t) -> int16x4_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vnegq_s16(a: int16x8_t) -> int16x8_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vneg_s32(a: int32x2_t) -> int32x2_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vnegq_s32(a: int32x4_t) -> int32x4_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fneg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vneg_f32(a: float32x2_t) -> float32x2_t {
+ simd_neg(a)
+}
+
+/// Negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fneg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vnegq_f32(a: float32x4_t) -> float32x4_t {
+ simd_neg(a)
+}
+
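+// Editor's note (illustrative sketch, not part of the upstream file): `vneg_*`
+// is a plain two's-complement negate, so `i8::MIN` wraps back to `i8::MIN`;
+// contrast with the saturating `vqneg_*` family below. `neg_sketch` is an
+// invented name; it assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn neg_sketch() {
+    let r = vneg_s8(vdup_n_s8(i8::MIN));
+    assert_eq!(vget_lane_s8::<0>(r), i8::MIN);   // wraps; no saturation
+}
+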
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqneg_s8(a: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v8i8")]
+ fn vqneg_s8_(a: int8x8_t) -> int8x8_t;
+ }
+    vqneg_s8_(a)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqnegq_s8(a: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v16i8")]
+ fn vqnegq_s8_(a: int8x16_t) -> int8x16_t;
+ }
+    vqnegq_s8_(a)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqneg_s16(a: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v4i16")]
+ fn vqneg_s16_(a: int16x4_t) -> int16x4_t;
+ }
+    vqneg_s16_(a)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqnegq_s16(a: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v8i16")]
+ fn vqnegq_s16_(a: int16x8_t) -> int16x8_t;
+ }
+    vqnegq_s16_(a)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqneg_s32(a: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v2i32")]
+ fn vqneg_s32_(a: int32x2_t) -> int32x2_t;
+ }
+    vqneg_s32_(a)
+}
+
+/// Signed saturating negate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqnegq_s32(a: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v4i32")]
+ fn vqnegq_s32_(a: int32x4_t) -> int32x4_t;
+ }
+    vqnegq_s32_(a)
+}
+
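+// Editor's note (illustrative sketch, not part of the upstream file): the
+// saturating negate clamps the one value a plain negate cannot represent,
+// mapping `i8::MIN` to `i8::MAX`. `qneg_sketch` is an invented name; it
+// assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn qneg_sketch() {
+    let r = vqneg_s8(vdup_n_s8(i8::MIN));
+    assert_eq!(vget_lane_s8::<0>(r), i8::MAX);   // saturates instead of wrapping
+}
+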
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i8")]
+ fn vqsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ }
+    vqsub_u8_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v16i8")]
+ fn vqsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ }
+    vqsubq_u8_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v4i16")]
+ fn vqsub_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ }
+    vqsub_u16_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i16")]
+ fn vqsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ }
+    vqsubq_u16_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v2i32")]
+ fn vqsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ }
+    vqsub_u32_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v4i32")]
+ fn vqsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+    vqsubq_u32_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v1i64")]
+ fn vqsub_u64_(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
+ }
+    vqsub_u64_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v2i64")]
+ fn vqsubq_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
+ }
+    vqsubq_u64_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v8i8")]
+ fn vqsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+    vqsub_s8_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v16i8")]
+ fn vqsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+    vqsubq_s8_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i16")]
+ fn vqsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+    vqsub_s16_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v8i16")]
+ fn vqsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+    vqsubq_s16_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v2i32")]
+ fn vqsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+    vqsub_s32_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i32")]
+ fn vqsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+    vqsubq_s32_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v1i64")]
+ fn vqsub_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+ }
+    vqsub_s64_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v2i64")]
+ fn vqsubq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+ }
+    vqsubq_s64_(a, b)
+}
+
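+// Editor's note (illustrative sketch, not part of the upstream file):
+// saturating subtraction clamps at the type bounds instead of wrapping, which
+// is why the unsigned call below yields 0 rather than a large wrapped value.
+// `qsub_sketch` is an invented name; it assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn qsub_sketch() {
+    let r = vqsub_u8(vdup_n_u8(1), vdup_n_u8(2));        // 1 - 2 saturates to 0
+    assert_eq!(vget_lane_u8::<0>(r), 0);
+    let s = vqsub_s8(vdup_n_s8(i8::MIN), vdup_n_s8(1));  // clamps at i8::MIN
+    assert_eq!(vget_lane_s8::<0>(s), i8::MIN);
+}
+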
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v8i8")]
+ fn vhadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ }
+    vhadd_u8_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v16i8")]
+ fn vhaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ }
+    vhaddq_u8_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v4i16")]
+ fn vhadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ }
+    vhadd_u16_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v8i16")]
+ fn vhaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ }
+    vhaddq_u16_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v2i32")]
+ fn vhadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ }
+    vhadd_u32_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v4i32")]
+ fn vhaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+    vhaddq_u32_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v8i8")]
+ fn vhadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+    vhadd_s8_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v16i8")]
+ fn vhaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+    vhaddq_s8_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v4i16")]
+ fn vhadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+    vhadd_s16_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v8i16")]
+ fn vhaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+    vhaddq_s16_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v2i32")]
+ fn vhadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+    vhadd_s32_(a, b)
+}
+
+/// Halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v4i32")]
+ fn vhaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+    vhaddq_s32_(a, b)
+}
+
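+// Editor's note (illustrative sketch, not part of the upstream file): halving
+// add computes `(a + b) >> 1` in a widened intermediate, so it never overflows,
+// and the unrounded form truncates the midpoint. `hadd_sketch` is an invented
+// name; it assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn hadd_sketch() {
+    let r = vhadd_u8(vdup_n_u8(255), vdup_n_u8(255));  // (255 + 255) >> 1 = 255, no overflow
+    assert_eq!(vget_lane_u8::<0>(r), 255);
+    let t = vhadd_u8(vdup_n_u8(3), vdup_n_u8(4));      // (3 + 4) >> 1 = 3 (truncates)
+    assert_eq!(vget_lane_u8::<0>(t), 3);
+}
+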
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v8i8")]
+ fn vrhadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ }
+    vrhadd_u8_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v16i8")]
+ fn vrhaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ }
+    vrhaddq_u8_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v4i16")]
+ fn vrhadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ }
+    vrhadd_u16_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v8i16")]
+ fn vrhaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ }
+    vrhaddq_u16_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v2i32")]
+ fn vrhadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ }
+    vrhadd_u32_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v4i32")]
+ fn vrhaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+    vrhaddq_u32_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v8i8")]
+ fn vrhadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+    vrhadd_s8_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v16i8")]
+ fn vrhaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+    vrhaddq_s8_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v4i16")]
+ fn vrhadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+    vrhadd_s16_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v8i16")]
+ fn vrhaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+    vrhaddq_s16_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v2i32")]
+ fn vrhadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+    vrhadd_s32_(a, b)
+}
+
+/// Rounding halving add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v4i32")]
+ fn vrhaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+    vrhaddq_s32_(a, b)
+}
+
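+// Editor's note (illustrative sketch, not part of the upstream file): the
+// rounding variant computes `(a + b + 1) >> 1`, so the 3/4 midpoint rounds up
+// to 4 where plain `vhadd` truncated to 3. `rhadd_sketch` is an invented name;
+// it assumes an AArch64 NEON target.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+unsafe fn rhadd_sketch() {
+    let r = vrhadd_u8(vdup_n_u8(3), vdup_n_u8(4));   // (3 + 4 + 1) >> 1 = 4
+    assert_eq!(vget_lane_u8::<0>(r), 4);
+}
+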
+/// Floating-point round to integral, to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frintn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrndn_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v2f32")]
+ fn vrndn_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrndn_f32_(a)
+}
+
+/// Floating-point round to integral, to nearest with ties to even
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frintn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrndnq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v4f32")]
+ fn vrndnq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrndnq_f32_(a)
+}
+
+/// Saturating add
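+///
+/// # Examples
+///
+/// A minimal sketch, assuming an `aarch64` target; `vdup_n_u8` and
+/// `vget_lane_u8` are the usual NEON duplicate and extract intrinsics:
+///
+/// ```ignore
+/// let r = unsafe { vqadd_u8(vdup_n_u8(250), vdup_n_u8(10)) };
+/// // 250 + 10 saturates to u8::MAX instead of wrapping to 4
+/// assert_eq!(unsafe { vget_lane_u8::<0>(r) }, 255);
+/// ```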
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v8i8")]
+ fn vqadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ }
+ vqadd_u8_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v16i8")]
+ fn vqaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ }
+ vqaddq_u8_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v4i16")]
+ fn vqadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ }
+ vqadd_u16_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v8i16")]
+ fn vqaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ }
+ vqaddq_u16_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v2i32")]
+ fn vqadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ }
+ vqadd_u32_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v4i32")]
+ fn vqaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+ vqaddq_u32_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v1i64")]
+ fn vqadd_u64_(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
+ }
+ vqadd_u64_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v2i64")]
+ fn vqaddq_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
+ }
+ vqaddq_u64_(a, b)
+}
+
+/// Saturating add
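+///
+/// # Examples
+///
+/// A minimal sketch, assuming an `aarch64` target; unlike the unsigned
+/// variants above, signed saturation clamps to both `i8::MIN` and `i8::MAX`:
+///
+/// ```ignore
+/// let r = unsafe { vqadd_s8(vdup_n_s8(100), vdup_n_s8(100)) };
+/// // 100 + 100 saturates to 127 instead of wrapping to -56
+/// assert_eq!(unsafe { vget_lane_s8::<0>(r) }, 127);
+/// ```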
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v8i8")]
+ fn vqadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+ vqadd_s8_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v16i8")]
+ fn vqaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+ vqaddq_s8_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v4i16")]
+ fn vqadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+ vqadd_s16_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v8i16")]
+ fn vqaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+ vqaddq_s16_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v2i32")]
+ fn vqadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+ vqadd_s32_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v4i32")]
+ fn vqaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+ vqaddq_s32_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v1i64")]
+ fn vqadd_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+ }
+ vqadd_s64_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v2i64")]
+ fn vqaddq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+ }
+ vqaddq_s64_(a, b)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
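+///
+/// # Examples
+///
+/// A minimal sketch, assuming an `aarch64` target: the `_x2` form loads
+/// sixteen contiguous bytes and splits them across the two vectors of the
+/// returned pair:
+///
+/// ```ignore
+/// let data: [i8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+/// let r = unsafe { vld1_s8_x2(data.as_ptr()) };
+/// // r.0 holds bytes 0..=7 and r.1 holds bytes 8..=15
+/// ```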
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s8_x2(a: *const i8) -> int8x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v8i8.p0i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v8i8.p0i8")]
+ fn vld1_s8_x2_(a: *const i8) -> int8x8x2_t;
+ }
+ vld1_s8_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s16_x2(a: *const i16) -> int16x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v4i16.p0i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v4i16.p0i16")]
+ fn vld1_s16_x2_(a: *const i16) -> int16x4x2_t;
+ }
+ vld1_s16_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s32_x2(a: *const i32) -> int32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v2i32.p0i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v2i32.p0i32")]
+ fn vld1_s32_x2_(a: *const i32) -> int32x2x2_t;
+ }
+ vld1_s32_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s64_x2(a: *const i64) -> int64x1x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v1i64.p0i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v1i64.p0i64")]
+ fn vld1_s64_x2_(a: *const i64) -> int64x1x2_t;
+ }
+ vld1_s64_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s8_x2(a: *const i8) -> int8x16x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v16i8.p0i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v16i8.p0i8")]
+ fn vld1q_s8_x2_(a: *const i8) -> int8x16x2_t;
+ }
+ vld1q_s8_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s16_x2(a: *const i16) -> int16x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v8i16.p0i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v8i16.p0i16")]
+ fn vld1q_s16_x2_(a: *const i16) -> int16x8x2_t;
+ }
+ vld1q_s16_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s32_x2(a: *const i32) -> int32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v4i32.p0i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v4i32.p0i32")]
+ fn vld1q_s32_x2_(a: *const i32) -> int32x4x2_t;
+ }
+ vld1q_s32_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s64_x2(a: *const i64) -> int64x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v2i64.p0i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v2i64.p0i64")]
+ fn vld1q_s64_x2_(a: *const i64) -> int64x2x2_t;
+ }
+ vld1q_s64_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s8_x3(a: *const i8) -> int8x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v8i8.p0i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v8i8.p0i8")]
+ fn vld1_s8_x3_(a: *const i8) -> int8x8x3_t;
+ }
+ vld1_s8_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s16_x3(a: *const i16) -> int16x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v4i16.p0i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v4i16.p0i16")]
+ fn vld1_s16_x3_(a: *const i16) -> int16x4x3_t;
+ }
+ vld1_s16_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s32_x3(a: *const i32) -> int32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v2i32.p0i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v2i32.p0i32")]
+ fn vld1_s32_x3_(a: *const i32) -> int32x2x3_t;
+ }
+ vld1_s32_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s64_x3(a: *const i64) -> int64x1x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v1i64.p0i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v1i64.p0i64")]
+ fn vld1_s64_x3_(a: *const i64) -> int64x1x3_t;
+ }
+ vld1_s64_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s8_x3(a: *const i8) -> int8x16x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v16i8.p0i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v16i8.p0i8")]
+ fn vld1q_s8_x3_(a: *const i8) -> int8x16x3_t;
+ }
+ vld1q_s8_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s16_x3(a: *const i16) -> int16x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v8i16.p0i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v8i16.p0i16")]
+ fn vld1q_s16_x3_(a: *const i16) -> int16x8x3_t;
+ }
+ vld1q_s16_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s32_x3(a: *const i32) -> int32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v4i32.p0i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v4i32.p0i32")]
+ fn vld1q_s32_x3_(a: *const i32) -> int32x4x3_t;
+ }
+ vld1q_s32_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s64_x3(a: *const i64) -> int64x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v2i64.p0i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v2i64.p0i64")]
+ fn vld1q_s64_x3_(a: *const i64) -> int64x2x3_t;
+ }
+ vld1q_s64_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s8_x4(a: *const i8) -> int8x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v8i8.p0i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v8i8.p0i8")]
+ fn vld1_s8_x4_(a: *const i8) -> int8x8x4_t;
+ }
+ vld1_s8_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s16_x4(a: *const i16) -> int16x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v4i16.p0i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v4i16.p0i16")]
+ fn vld1_s16_x4_(a: *const i16) -> int16x4x4_t;
+ }
+ vld1_s16_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s32_x4(a: *const i32) -> int32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v2i32.p0i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v2i32.p0i32")]
+ fn vld1_s32_x4_(a: *const i32) -> int32x2x4_t;
+ }
+ vld1_s32_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_s64_x4(a: *const i64) -> int64x1x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v1i64.p0i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v1i64.p0i64")]
+ fn vld1_s64_x4_(a: *const i64) -> int64x1x4_t;
+ }
+ vld1_s64_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s8_x4(a: *const i8) -> int8x16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v16i8.p0i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v16i8.p0i8")]
+ fn vld1q_s8_x4_(a: *const i8) -> int8x16x4_t;
+ }
+ vld1q_s8_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s16_x4(a: *const i16) -> int16x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v8i16.p0i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v8i16.p0i16")]
+ fn vld1q_s16_x4_(a: *const i16) -> int16x8x4_t;
+ }
+ vld1q_s16_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s32_x4(a: *const i32) -> int32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v4i32.p0i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v4i32.p0i32")]
+ fn vld1q_s32_x4_(a: *const i32) -> int32x4x4_t;
+ }
+ vld1q_s32_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_s64_x4(a: *const i64) -> int64x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v2i64.p0i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v2i64.p0i64")]
+ fn vld1q_s64_x4_(a: *const i64) -> int64x2x4_t;
+ }
+ vld1q_s64_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
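+///
+/// The unsigned variants below delegate to the corresponding signed load and
+/// reinterpret the result with `transmute`, which is sound because the
+/// signed and unsigned vector types share the same size and layout. A
+/// minimal sketch, assuming an `aarch64` target:
+///
+/// ```ignore
+/// let data: [u8; 16] = [0; 16];
+/// let r = unsafe { vld1_u8_x2(data.as_ptr()) };
+/// // r.0 and r.1 each hold eight of the sixteen loaded bytes
+/// ```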
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u8_x2(a: *const u8) -> uint8x8x2_t {
+ transmute(vld1_s8_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t {
+ transmute(vld1_s16_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u32_x2(a: *const u32) -> uint32x2x2_t {
+ transmute(vld1_s32_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u64_x2(a: *const u64) -> uint64x1x2_t {
+ transmute(vld1_s64_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t {
+ transmute(vld1q_s8_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u16_x2(a: *const u16) -> uint16x8x2_t {
+ transmute(vld1q_s16_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u32_x2(a: *const u32) -> uint32x4x2_t {
+ transmute(vld1q_s32_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u64_x2(a: *const u64) -> uint64x2x2_t {
+ transmute(vld1q_s64_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t {
+ transmute(vld1_s8_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u16_x3(a: *const u16) -> uint16x4x3_t {
+ transmute(vld1_s16_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u32_x3(a: *const u32) -> uint32x2x3_t {
+ transmute(vld1_s32_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u64_x3(a: *const u64) -> uint64x1x3_t {
+ transmute(vld1_s64_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t {
+ transmute(vld1q_s8_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u16_x3(a: *const u16) -> uint16x8x3_t {
+ transmute(vld1q_s16_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u32_x3(a: *const u32) -> uint32x4x3_t {
+ transmute(vld1q_s32_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u64_x3(a: *const u64) -> uint64x2x3_t {
+ transmute(vld1q_s64_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t {
+ transmute(vld1_s8_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u16_x4(a: *const u16) -> uint16x4x4_t {
+ transmute(vld1_s16_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u32_x4(a: *const u32) -> uint32x2x4_t {
+ transmute(vld1_s32_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_u64_x4(a: *const u64) -> uint64x1x4_t {
+ transmute(vld1_s64_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t {
+ transmute(vld1q_s8_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u16_x4(a: *const u16) -> uint16x8x4_t {
+ transmute(vld1q_s16_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u32_x4(a: *const u32) -> uint32x4x4_t {
+ transmute(vld1q_s32_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_u64_x4(a: *const u64) -> uint64x2x4_t {
+ transmute(vld1q_s64_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p8_x2(a: *const p8) -> poly8x8x2_t {
+ transmute(vld1_s8_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t {
+ transmute(vld1_s8_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p8_x4(a: *const p8) -> poly8x8x4_t {
+ transmute(vld1_s8_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t {
+ transmute(vld1q_s8_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t {
+ transmute(vld1q_s8_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t {
+ transmute(vld1q_s8_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t {
+ transmute(vld1_s16_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t {
+ transmute(vld1_s16_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t {
+ transmute(vld1_s16_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t {
+ transmute(vld1q_s16_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p16_x3(a: *const p16) -> poly16x8x3_t {
+ transmute(vld1q_s16_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p16_x4(a: *const p16) -> poly16x8x4_t {
+ transmute(vld1q_s16_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
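+///
+/// Note that the `p64` variants are additionally gated on the `aes` target
+/// feature, as the `target_feature` attributes below show. A minimal sketch,
+/// assuming an `aarch64` target with that feature (`p64` is an alias for
+/// `u64`):
+///
+/// ```ignore
+/// let data: [p64; 2] = [0xdead_beef, 0xcafe_f00d];
+/// let r = unsafe { vld1_p64_x2(data.as_ptr()) };
+/// // r.0 and r.1 are one-element vectors holding data[0] and data[1]
+/// ```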
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p64_x2(a: *const p64) -> poly64x1x2_t {
+ transmute(vld1_s64_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p64_x3(a: *const p64) -> poly64x1x3_t {
+ transmute(vld1_s64_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_p64_x4(a: *const p64) -> poly64x1x4_t {
+ transmute(vld1_s64_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p64_x2(a: *const p64) -> poly64x2x2_t {
+ transmute(vld1q_s64_x2(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t {
+ transmute(vld1q_s64_x3(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_p64_x4(a: *const p64) -> poly64x2x4_t {
+ transmute(vld1q_s64_x4(transmute(a)))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
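+///
+/// # Examples
+///
+/// A minimal sketch, assuming an `aarch64` target:
+///
+/// ```ignore
+/// let data = [1.0f32, 2.0, 3.0, 4.0];
+/// let r = unsafe { vld1_f32_x2(data.as_ptr()) };
+/// // r.0 == [1.0, 2.0] and r.1 == [3.0, 4.0]
+/// ```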
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_f32_x2(a: *const f32) -> float32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v2f32.p0f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v2f32.p0f32")]
+ fn vld1_f32_x2_(a: *const f32) -> float32x2x2_t;
+ }
+ vld1_f32_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_f32_x2(a: *const f32) -> float32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x2.v4f32.p0f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x2.v4f32.p0f32")]
+ fn vld1q_f32_x2_(a: *const f32) -> float32x4x2_t;
+ }
+ vld1q_f32_x2_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_f32_x3(a: *const f32) -> float32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v2f32.p0f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v2f32.p0f32")]
+ fn vld1_f32_x3_(a: *const f32) -> float32x2x3_t;
+ }
+ vld1_f32_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_f32_x3(a: *const f32) -> float32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x3.v4f32.p0f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x3.v4f32.p0f32")]
+ fn vld1q_f32_x3_(a: *const f32) -> float32x4x3_t;
+ }
+ vld1q_f32_x3_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1_f32_x4(a: *const f32) -> float32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v2f32.p0f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v2f32.p0f32")]
+ fn vld1_f32_x4_(a: *const f32) -> float32x2x4_t;
+ }
+ vld1_f32_x4_(a)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld1q_f32_x4(a: *const f32) -> float32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1x4.v4f32.p0f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld1x4.v4f32.p0f32")]
+ fn vld1q_f32_x4_(a: *const f32) -> float32x4x4_t;
+ }
+ vld1q_f32_x4_(a)
+}
+
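+// The x3/x4 variants follow the same consecutive-load pattern, returning
+// three or four vectors per call; a sketch under the same assumptions:
+//
+//     let data: [f32; 8] = [0., 1., 2., 3., 4., 5., 6., 7.];
+//     let v: float32x2x4_t = vld1_f32_x4(data.as_ptr());
+//     // v.0 == [0., 1.], v.1 == [2., 3.], v.2 == [4., 5.], v.3 == [6., 7.]
+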
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2_s8(a: *const i8) -> int8x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v8i8.p0i8")]
+ fn vld2_s8_(ptr: *const i8, size: i32) -> int8x8x2_t;
+ }
+ vld2_s8_(a as *const i8, 1)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_s8(a: *const i8) -> int8x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v8i8.p0v8i8")]
+ fn vld2_s8_(ptr: *const int8x8_t) -> int8x8x2_t;
+ }
+ vld2_s8_(a as _)
+}
+
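+// Unlike `vld1`, `vld2` de-interleaves as it loads: element 2*i lands in
+// lane i of the first vector and element 2*i+1 in lane i of the second.
+// Sketch (assumes NEON and 16 readable bytes behind the pointer):
+//
+//     let data: [i8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+//     let v: int8x8x2_t = vld2_s8(data.as_ptr());
+//     // v.0 == [0, 2, 4, 6, 8, 10, 12, 14]  (even indices)
+//     // v.1 == [1, 3, 5, 7, 9, 11, 13, 15]  (odd indices)
+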
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2_s16(a: *const i16) -> int16x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4i16.p0i8")]
+ fn vld2_s16_(ptr: *const i8, size: i32) -> int16x4x2_t;
+ }
+ vld2_s16_(a as *const i8, 2)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_s16(a: *const i16) -> int16x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v4i16.p0v4i16")]
+ fn vld2_s16_(ptr: *const int16x4_t) -> int16x4x2_t;
+ }
+ vld2_s16_(a as _)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2_s32(a: *const i32) -> int32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v2i32.p0i8")]
+ fn vld2_s32_(ptr: *const i8, size: i32) -> int32x2x2_t;
+ }
+ vld2_s32_(a as *const i8, 4)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_s32(a: *const i32) -> int32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2i32.p0v2i32")]
+ fn vld2_s32_(ptr: *const int32x2_t) -> int32x2x2_t;
+ }
+ vld2_s32_(a as _)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2q_s8(a: *const i8) -> int8x16x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v16i8.p0i8")]
+ fn vld2q_s8_(ptr: *const i8, size: i32) -> int8x16x2_t;
+ }
+ vld2q_s8_(a as *const i8, 1)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_s8(a: *const i8) -> int8x16x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v16i8.p0v16i8")]
+ fn vld2q_s8_(ptr: *const int8x16_t) -> int8x16x2_t;
+ }
+ vld2q_s8_(a as _)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2q_s16(a: *const i16) -> int16x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v8i16.p0i8")]
+ fn vld2q_s16_(ptr: *const i8, size: i32) -> int16x8x2_t;
+ }
+ vld2q_s16_(a as *const i8, 2)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_s16(a: *const i16) -> int16x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v8i16.p0v8i16")]
+ fn vld2q_s16_(ptr: *const int16x8_t) -> int16x8x2_t;
+ }
+ vld2q_s16_(a as _)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2q_s32(a: *const i32) -> int32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4i32.p0i8")]
+ fn vld2q_s32_(ptr: *const i8, size: i32) -> int32x4x2_t;
+ }
+ vld2q_s32_(a as *const i8, 4)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_s32(a: *const i32) -> int32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v4i32.p0v4i32")]
+ fn vld2q_s32_(ptr: *const int32x4_t) -> int32x4x2_t;
+ }
+ vld2q_s32_(a as _)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v1i64.p0i8")]
+ fn vld2_s64_(ptr: *const i8, size: i32) -> int64x1x2_t;
+ }
+ vld2_s64_(a as *const i8, 8)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v1i64.p0v1i64")]
+ fn vld2_s64_(ptr: *const int64x1_t) -> int64x1x2_t;
+ }
+ vld2_s64_(a as _)
+}
+
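+// With one element per vector there is nothing to interleave, so the s64
+// case degenerates to two consecutive 64-bit loads; that is presumably why
+// no `vld2`/`ld2` opcode is asserted above. Sketch:
+//
+//     let data: [i64; 2] = [10, 20];
+//     let v: int64x1x2_t = vld2_s64(data.as_ptr());
+//     // v.0 == [10], v.1 == [20]
+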
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_u8(a: *const u8) -> uint8x8x2_t {
+ transmute(vld2_s8(transmute(a)))
+}
+
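+// The unsigned and polynomial variants here are thin wrappers: the load is
+// type-agnostic, so the signed intrinsic's result is reinterpreted
+// bit-for-bit through `transmute` at no runtime cost. Usage is identical:
+//
+//     let data: [u8; 16] = [0; 16];
+//     let v: uint8x8x2_t = vld2_u8(data.as_ptr());
+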
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_u16(a: *const u16) -> uint16x4x2_t {
+ transmute(vld2_s16(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_u32(a: *const u32) -> uint32x2x2_t {
+ transmute(vld2_s32(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_u8(a: *const u8) -> uint8x16x2_t {
+ transmute(vld2q_s8(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_u16(a: *const u16) -> uint16x8x2_t {
+ transmute(vld2q_s16(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_u32(a: *const u32) -> uint32x4x2_t {
+ transmute(vld2q_s32(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_p8(a: *const p8) -> poly8x8x2_t {
+ transmute(vld2_s8(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_p16(a: *const p16) -> poly16x4x2_t {
+ transmute(vld2_s16(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_p8(a: *const p8) -> poly8x16x2_t {
+ transmute(vld2q_s8(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_p16(a: *const p16) -> poly16x8x2_t {
+ transmute(vld2q_s16(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t {
+ transmute(vld2_s64(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_p64(a: *const p64) -> poly64x1x2_t {
+ transmute(vld2_s64(transmute(a)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2_f32(a: *const f32) -> float32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v2f32.p0i8")]
+ fn vld2_f32_(ptr: *const i8, size: i32) -> float32x2x2_t;
+ }
+ vld2_f32_(a as *const i8, 4)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_f32(a: *const f32) -> float32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2f32.p0v2f32")]
+ fn vld2_f32_(ptr: *const float32x2_t) -> float32x2x2_t;
+ }
+ vld2_f32_(a as _)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4f32.p0i8")]
+ fn vld2q_f32_(ptr: *const i8, size: i32) -> float32x4x2_t;
+ }
+ vld2q_f32_(a as *const i8, 4)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v4f32.p0v4f32")]
+ fn vld2q_f32_(ptr: *const float32x4_t) -> float32x4x2_t;
+ }
+ vld2q_f32_(a as _)
+}
+
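+// De-interleaving loads are the idiomatic way to split array-of-pairs data,
+// e.g. interleaved complex numbers; a sketch under the usual NEON assumptions:
+//
+//     // data = [re0, im0, re1, im1, re2, im2, re3, im3]
+//     let data: [f32; 8] = [1., -1., 2., -2., 3., -3., 4., -4.];
+//     let v: float32x4x2_t = vld2q_f32(data.as_ptr());
+//     // v.0 == [1., 2., 3., 4.]      (real parts)
+//     // v.1 == [-1., -2., -3., -4.]  (imaginary parts)
+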
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8i8.p0i8")]
+ fn vld2_dup_s8_(ptr: *const i8, size: i32) -> int8x8x2_t;
+ }
+ vld2_dup_s8_(a as *const i8, 1)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v8i8.p0i8")]
+ fn vld2_dup_s8_(ptr: *const i8) -> int8x8x2_t;
+ }
+ vld2_dup_s8_(a as _)
+}
+
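+// The `_dup` variants read a single 2-element structure and broadcast it:
+// v.0 holds copies of the first element in every lane, v.1 copies of the
+// second. Sketch (only two bytes need to be readable):
+//
+//     let data: [i8; 2] = [7, 9];
+//     let v: int8x8x2_t = vld2_dup_s8(data.as_ptr());
+//     // v.0 == [7; 8], v.1 == [9; 8]
+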
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4i16.p0i8")]
+ fn vld2_dup_s16_(ptr: *const i8, size: i32) -> int16x4x2_t;
+ }
+ vld2_dup_s16_(a as *const i8, 2)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4i16.p0i16")]
+ fn vld2_dup_s16_(ptr: *const i16) -> int16x4x2_t;
+ }
+ vld2_dup_s16_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v2i32.p0i8")]
+ fn vld2_dup_s32_(ptr: *const i8, size: i32) -> int32x2x2_t;
+ }
+ vld2_dup_s32_(a as *const i8, 4)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2i32.p0i32")]
+ fn vld2_dup_s32_(ptr: *const i32) -> int32x2x2_t;
+ }
+ vld2_dup_s32_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v16i8.p0i8")]
+ fn vld2q_dup_s8_(ptr: *const i8, size: i32) -> int8x16x2_t;
+ }
+ vld2q_dup_s8_(a as *const i8, 1)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v16i8.p0i8")]
+ fn vld2q_dup_s8_(ptr: *const i8) -> int8x16x2_t;
+ }
+ vld2q_dup_s8_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8i16.p0i8")]
+ fn vld2q_dup_s16_(ptr: *const i8, size: i32) -> int16x8x2_t;
+ }
+ vld2q_dup_s16_(a as *const i8, 2)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v8i16.p0i16")]
+ fn vld2q_dup_s16_(ptr: *const i16) -> int16x8x2_t;
+ }
+ vld2q_dup_s16_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4i32.p0i8")]
+ fn vld2q_dup_s32_(ptr: *const i8, size: i32) -> int32x4x2_t;
+ }
+ vld2q_dup_s32_(a as *const i8, 4)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4i32.p0i32")]
+ fn vld2q_dup_s32_(ptr: *const i32) -> int32x4x2_t;
+ }
+ vld2q_dup_s32_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v1i64.p0i8")]
+ fn vld2_dup_s64_(ptr: *const i8, size: i32) -> int64x1x2_t;
+ }
+ vld2_dup_s64_(a as *const i8, 8)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1i64.p0i64")]
+ fn vld2_dup_s64_(ptr: *const i64) -> int64x1x2_t;
+ }
+ vld2_dup_s64_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t {
+ transmute(vld2_dup_s8(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t {
+ transmute(vld2_dup_s16(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t {
+ transmute(vld2_dup_s32(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_dup_u8(a: *const u8) -> uint8x16x2_t {
+ transmute(vld2q_dup_s8(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_dup_u16(a: *const u16) -> uint16x8x2_t {
+ transmute(vld2q_dup_s16(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_dup_u32(a: *const u32) -> uint32x4x2_t {
+ transmute(vld2q_dup_s32(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_dup_p8(a: *const p8) -> poly8x8x2_t {
+ transmute(vld2_dup_s8(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_dup_p16(a: *const p16) -> poly16x4x2_t {
+ transmute(vld2_dup_s16(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_dup_p8(a: *const p8) -> poly8x16x2_t {
+ transmute(vld2q_dup_s8(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_dup_p16(a: *const p16) -> poly16x8x2_t {
+ transmute(vld2q_dup_s16(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_dup_u64(a: *const u64) -> uint64x1x2_t {
+ transmute(vld2_dup_s64(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_dup_p64(a: *const p64) -> poly64x1x2_t {
+ transmute(vld2_dup_s64(transmute(a)))
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v2f32.p0i8")]
+ fn vld2_dup_f32_(ptr: *const i8, size: i32) -> float32x2x2_t;
+ }
+ vld2_dup_f32_(a as *const i8, 4)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2f32.p0f32")]
+ fn vld2_dup_f32_(ptr: *const f32) -> float32x2x2_t;
+ }
+ vld2_dup_f32_(a as _)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2))]
+pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4f32.p0i8")]
+ fn vld2q_dup_f32_(ptr: *const i8, size: i32) -> float32x4x2_t;
+ }
+ vld2q_dup_f32_(a as *const i8, 4)
+}
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4f32.p0f32")]
+ fn vld2q_dup_f32_(ptr: *const f32) -> float32x4x2_t;
+ }
+ vld2q_dup_f32_(a as _)
+}
+
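+// The same broadcast semantics apply to floats; a sketch:
+//
+//     let data: [f32; 2] = [0.5, 2.0];
+//     let v: float32x4x2_t = vld2q_dup_f32(data.as_ptr());
+//     // v.0 == [0.5; 4], v.1 == [2.0; 4]
+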
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x2_t) -> int8x8x2_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8i8.p0i8")]
+ fn vld2_lane_s8_(ptr: *const i8, a: int8x8_t, b: int8x8_t, n: i32, size: i32) -> int8x8x2_t;
+ }
+ vld2_lane_s8_(a as _, b.0, b.1, LANE, 1)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x2_t) -> int8x8x2_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v8i8.p0i8")]
+ fn vld2_lane_s8_(a: int8x8_t, b: int8x8_t, n: i64, ptr: *const i8) -> int8x8x2_t;
+ }
+ vld2_lane_s8_(b.0, b.1, LANE as i64, a as _)
+}
+
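+// The `_lane` variants load one 2-element structure into lane `LANE` of the
+// two input vectors and pass all other lanes through unchanged. Sketch:
+//
+//     let data: [i8; 2] = [100, 101];
+//     let src = int8x8x2_t(vdup_n_s8(0), vdup_n_s8(1));
+//     let v = vld2_lane_s8::<3>(data.as_ptr(), src);
+//     // v.0 == [0, 0, 0, 100, 0, 0, 0, 0]
+//     // v.1 == [1, 1, 1, 101, 1, 1, 1, 1]
+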
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x2_t) -> int16x4x2_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4i16.p0i8")]
+ fn vld2_lane_s16_(ptr: *const i8, a: int16x4_t, b: int16x4_t, n: i32, size: i32) -> int16x4x2_t;
+ }
+ vld2_lane_s16_(a as _, b.0, b.1, LANE, 2)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x2_t) -> int16x4x2_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v4i16.p0i8")]
+ fn vld2_lane_s16_(a: int16x4_t, b: int16x4_t, n: i64, ptr: *const i8) -> int16x4x2_t;
+ }
+ vld2_lane_s16_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x2_t) -> int32x2x2_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v2i32.p0i8")]
+ fn vld2_lane_s32_(ptr: *const i8, a: int32x2_t, b: int32x2_t, n: i32, size: i32) -> int32x2x2_t;
+ }
+ vld2_lane_s32_(a as _, b.0, b.1, LANE, 4)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x2_t) -> int32x2x2_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2i32.p0i8")]
+ fn vld2_lane_s32_(a: int32x2_t, b: int32x2_t, n: i64, ptr: *const i8) -> int32x2x2_t;
+ }
+ vld2_lane_s32_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x2_t) -> int16x8x2_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8i16.p0i8")]
+ fn vld2q_lane_s16_(ptr: *const i8, a: int16x8_t, b: int16x8_t, n: i32, size: i32) -> int16x8x2_t;
+ }
+ vld2q_lane_s16_(a as _, b.0, b.1, LANE, 2)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x2_t) -> int16x8x2_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v8i16.p0i8")]
+ fn vld2q_lane_s16_(a: int16x8_t, b: int16x8_t, n: i64, ptr: *const i8) -> int16x8x2_t;
+ }
+ vld2q_lane_s16_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x2_t) -> int32x4x2_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4i32.p0i8")]
+ fn vld2q_lane_s32_(ptr: *const i8, a: int32x4_t, b: int32x4_t, n: i32, size: i32) -> int32x4x2_t;
+ }
+ vld2q_lane_s32_(a as _, b.0, b.1, LANE, 4)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x2_t) -> int32x4x2_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v4i32.p0i8")]
+ fn vld2q_lane_s32_(a: int32x4_t, b: int32x4_t, n: i64, ptr: *const i8) -> int32x4x2_t;
+ }
+ vld2q_lane_s32_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x2_t) -> uint8x8x2_t {
+ static_assert_imm3!(LANE);
+ transmute(vld2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
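+// `LANE` is a const generic; `rustc_legacy_const_generics(2)` should also
+// accept it as an ordinary third argument (`vld2_lane_u8(ptr, b, 3)`), and
+// `static_assert_imm3!` rejects lanes outside 0..=7 at compile time:
+//
+//     let data: [u8; 2] = [1, 2];
+//     let src = uint8x8x2_t(vdup_n_u8(0), vdup_n_u8(0));
+//     let v = vld2_lane_u8::<7>(data.as_ptr(), src);    // ok
+//     // let v = vld2_lane_u8::<8>(data.as_ptr(), src); // rejected at compile time
+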
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x2_t) -> uint16x4x2_t {
+ static_assert_imm2!(LANE);
+ transmute(vld2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x2_t) -> uint32x2x2_t {
+ static_assert_imm1!(LANE);
+ transmute(vld2_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x2_t) -> uint16x8x2_t {
+ static_assert_imm3!(LANE);
+ transmute(vld2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x2_t) -> uint32x4x2_t {
+ static_assert_imm2!(LANE);
+ transmute(vld2q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_lane_p8<const LANE: i32>(a: *const p8, b: poly8x8x2_t) -> poly8x8x2_t {
+ static_assert_imm3!(LANE);
+ transmute(vld2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2_lane_p16<const LANE: i32>(a: *const p16, b: poly16x4x2_t) -> poly16x4x2_t {
+ static_assert_imm2!(LANE);
+ transmute(vld2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld2q_lane_p16<const LANE: i32>(a: *const p16, b: poly16x8x2_t) -> poly16x8x2_t {
+ static_assert_imm3!(LANE);
+ transmute(vld2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x2_t) -> float32x2x2_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v2f32.p0i8")]
+ fn vld2_lane_f32_(ptr: *const i8, a: float32x2_t, b: float32x2_t, n: i32, size: i32) -> float32x2x2_t;
+ }
+ vld2_lane_f32_(a as _, b.0, b.1, LANE, 4)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x2_t) -> float32x2x2_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2f32.p0i8")]
+ fn vld2_lane_f32_(a: float32x2_t, b: float32x2_t, n: i64, ptr: *const i8) -> float32x2x2_t;
+ }
+ vld2_lane_f32_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x2_t) -> float32x4x2_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4f32.p0i8")]
+ fn vld2q_lane_f32_(ptr: *const i8, a: float32x4_t, b: float32x4_t, n: i32, size: i32) -> float32x4x2_t;
+ }
+ vld2q_lane_f32_(a as _, b.0, b.1, LANE, 4)
+}
+
+/// Load single 2-element structure to one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld2q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x2_t) -> float32x4x2_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v4f32.p0i8")]
+ fn vld2q_lane_f32_(a: float32x4_t, b: float32x4_t, n: i64, ptr: *const i8) -> float32x4x2_t;
+ }
+ vld2q_lane_f32_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v8i8.p0i8")]
+ fn vld3_s8_(ptr: *const i8, size: i32) -> int8x8x3_t;
+ }
+ vld3_s8_(a as *const i8, 1)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v8i8.p0v8i8")]
+ fn vld3_s8_(ptr: *const int8x8_t) -> int8x8x3_t;
+ }
+ vld3_s8_(a as _)
+}
+
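+// `vld3` extends the de-interleave to stride 3: element 3*i goes to lane i
+// of the first vector, 3*i+1 to the second, 3*i+2 to the third. Sketch
+// (assumes 24 readable bytes):
+//
+//     let data: [i8; 24] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+//                           12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23];
+//     let v: int8x8x3_t = vld3_s8(data.as_ptr());
+//     // v.0 == [0, 3, 6, 9, 12, 15, 18, 21]
+//     // v.1 == [1, 4, 7, 10, 13, 16, 19, 22]
+//     // v.2 == [2, 5, 8, 11, 14, 17, 20, 23]
+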
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4i16.p0i8")]
+ fn vld3_s16_(ptr: *const i8, size: i32) -> int16x4x3_t;
+ }
+ vld3_s16_(a as *const i8, 2)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v4i16.p0v4i16")]
+ fn vld3_s16_(ptr: *const int16x4_t) -> int16x4x3_t;
+ }
+ vld3_s16_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v2i32.p0i8")]
+ fn vld3_s32_(ptr: *const i8, size: i32) -> int32x2x3_t;
+ }
+ vld3_s32_(a as *const i8, 4)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2i32.p0v2i32")]
+ fn vld3_s32_(ptr: *const int32x2_t) -> int32x2x3_t;
+ }
+ vld3_s32_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v16i8.p0i8")]
+ fn vld3q_s8_(ptr: *const i8, size: i32) -> int8x16x3_t;
+ }
+ vld3q_s8_(a as *const i8, 1)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v16i8.p0v16i8")]
+ fn vld3q_s8_(ptr: *const int8x16_t) -> int8x16x3_t;
+ }
+ vld3q_s8_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v8i16.p0i8")]
+ fn vld3q_s16_(ptr: *const i8, size: i32) -> int16x8x3_t;
+ }
+ vld3q_s16_(a as *const i8, 2)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v8i16.p0v8i16")]
+ fn vld3q_s16_(ptr: *const int16x8_t) -> int16x8x3_t;
+ }
+ vld3q_s16_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4i32.p0i8")]
+ fn vld3q_s32_(ptr: *const i8, size: i32) -> int32x4x3_t;
+ }
+ vld3q_s32_(a as *const i8, 4)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v4i32.p0v4i32")]
+ fn vld3q_s32_(ptr: *const int32x4_t) -> int32x4x3_t;
+ }
+ vld3q_s32_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v1i64.p0i8")]
+ fn vld3_s64_(ptr: *const i8, size: i32) -> int64x1x3_t;
+ }
+ vld3_s64_(a as *const i8, 8)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v1i64.p0v1i64")]
+ fn vld3_s64_(ptr: *const int64x1_t) -> int64x1x3_t;
+ }
+ vld3_s64_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_u8(a: *const u8) -> uint8x8x3_t {
+ transmute(vld3_s8(transmute(a)))
+}
+
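+// A classic use of the unsigned wrapper is splitting packed RGB pixels into
+// per-channel vectors; a sketch for eight RGB8 pixels:
+//
+//     let rgb: [u8; 24] = [0; 24]; // r0 g0 b0 r1 g1 b1 ...
+//     let v: uint8x8x3_t = vld3_u8(rgb.as_ptr());
+//     // v.0 = red channel, v.1 = green channel, v.2 = blue channel
+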
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_u16(a: *const u16) -> uint16x4x3_t {
+ transmute(vld3_s16(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_u32(a: *const u32) -> uint32x2x3_t {
+ transmute(vld3_s32(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_u8(a: *const u8) -> uint8x16x3_t {
+ transmute(vld3q_s8(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_u16(a: *const u16) -> uint16x8x3_t {
+ transmute(vld3q_s16(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_u32(a: *const u32) -> uint32x4x3_t {
+ transmute(vld3q_s32(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_p8(a: *const p8) -> poly8x8x3_t {
+ transmute(vld3_s8(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_p16(a: *const p16) -> poly16x4x3_t {
+ transmute(vld3_s16(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_p8(a: *const p8) -> poly8x16x3_t {
+ transmute(vld3q_s8(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_p16(a: *const p16) -> poly16x8x3_t {
+ transmute(vld3q_s16(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_u64(a: *const u64) -> uint64x1x3_t {
+ transmute(vld3_s64(transmute(a)))
+}
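+
+// Editor's note (an inference, not upstream documentation): the 64-bit
+// variants assert `nop` because each destination register holds a single
+// 64-bit lane, so loading three 1-element structures needs no
+// de-interleaving and the backend may emit plain loads.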
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_p64(a: *const p64) -> poly64x1x3_t {
+ transmute(vld3_s64(transmute(a)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v2f32.p0i8")]
+ fn vld3_f32_(ptr: *const i8, size: i32) -> float32x2x3_t;
+ }
+ vld3_f32_(a as *const i8, 4)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2f32.p0v2f32")]
+ fn vld3_f32_(ptr: *const float32x2_t) -> float32x2x3_t;
+ }
+ vld3_f32_(a as _)
+}
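+
+// Editor's note: the float intrinsics are deliberately defined twice. As the
+// declarations above show, the 32-bit `llvm.arm.neon.vld3.*` builtin takes an
+// untyped `*const i8` plus a trailing element-size argument (4 for f32),
+// while the AArch64 `llvm.aarch64.neon.ld3.*` builtin takes a typed vector
+// pointer and no size, so the two bodies cannot share one extern declaration.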
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4f32.p0i8")]
+ fn vld3q_f32_(ptr: *const i8, size: i32) -> float32x4x3_t;
+ }
+ vld3q_f32_(a as *const i8, 4)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v4f32.p0v4f32")]
+ fn vld3q_f32_(ptr: *const float32x4_t) -> float32x4x3_t;
+ }
+ vld3q_f32_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3_dup_s8(a: *const i8) -> int8x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v8i8.p0i8")]
+ fn vld3_dup_s8_(ptr: *const i8, size: i32) -> int8x8x3_t;
+ }
+ vld3_dup_s8_(a as *const i8, 1)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_dup_s8(a: *const i8) -> int8x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v8i8.p0i8")]
+ fn vld3_dup_s8_(ptr: *const i8) -> int8x8x3_t;
+ }
+ vld3_dup_s8_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3_dup_s16(a: *const i16) -> int16x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4i16.p0i8")]
+ fn vld3_dup_s16_(ptr: *const i8, size: i32) -> int16x4x3_t;
+ }
+ vld3_dup_s16_(a as *const i8, 2)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_dup_s16(a: *const i16) -> int16x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v4i16.p0i16")]
+ fn vld3_dup_s16_(ptr: *const i16) -> int16x4x3_t;
+ }
+ vld3_dup_s16_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3_dup_s32(a: *const i32) -> int32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v2i32.p0i8")]
+ fn vld3_dup_s32_(ptr: *const i8, size: i32) -> int32x2x3_t;
+ }
+ vld3_dup_s32_(a as *const i8, 4)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_dup_s32(a: *const i32) -> int32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2i32.p0i32")]
+ fn vld3_dup_s32_(ptr: *const i32) -> int32x2x3_t;
+ }
+ vld3_dup_s32_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3q_dup_s8(a: *const i8) -> int8x16x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v16i8.p0i8")]
+ fn vld3q_dup_s8_(ptr: *const i8, size: i32) -> int8x16x3_t;
+ }
+ vld3q_dup_s8_(a as *const i8, 1)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_dup_s8(a: *const i8) -> int8x16x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v16i8.p0i8")]
+ fn vld3q_dup_s8_(ptr: *const i8) -> int8x16x3_t;
+ }
+ vld3q_dup_s8_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3q_dup_s16(a: *const i16) -> int16x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v8i16.p0i8")]
+ fn vld3q_dup_s16_(ptr: *const i8, size: i32) -> int16x8x3_t;
+ }
+ vld3q_dup_s16_(a as *const i8, 2)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_dup_s16(a: *const i16) -> int16x8x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v8i16.p0i16")]
+ fn vld3q_dup_s16_(ptr: *const i16) -> int16x8x3_t;
+ }
+ vld3q_dup_s16_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3q_dup_s32(a: *const i32) -> int32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4i32.p0i8")]
+ fn vld3q_dup_s32_(ptr: *const i8, size: i32) -> int32x4x3_t;
+ }
+ vld3q_dup_s32_(a as *const i8, 4)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_dup_s32(a: *const i32) -> int32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v4i32.p0i32")]
+ fn vld3q_dup_s32_(ptr: *const i32) -> int32x4x3_t;
+ }
+ vld3q_dup_s32_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vld3_dup_s64(a: *const i64) -> int64x1x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v1i64.p0i8")]
+ fn vld3_dup_s64_(ptr: *const i8, size: i32) -> int64x1x3_t;
+ }
+ vld3_dup_s64_(a as *const i8, 8)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_dup_s64(a: *const i64) -> int64x1x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v1i64.p0i64")]
+ fn vld3_dup_s64_(ptr: *const i64) -> int64x1x3_t;
+ }
+ vld3_dup_s64_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_dup_u8(a: *const u8) -> uint8x8x3_t {
+ transmute(vld3_dup_s8(transmute(a)))
+}
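+
+// Editor's sketch (hypothetical data, not from upstream): the `vld3_dup_*`
+// forms read a single 3-element structure and broadcast each element across
+// a whole register.
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn vld3_dup_u8_example() -> uint8x8x3_t {
+ let rgb = [10u8, 20, 30];
+ // Result: .0 == [10; 8], .1 == [20; 8], .2 == [30; 8].
+ vld3_dup_u8(rgb.as_ptr())
+}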
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_dup_u16(a: *const u16) -> uint16x4x3_t {
+ transmute(vld3_dup_s16(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_dup_u32(a: *const u32) -> uint32x2x3_t {
+ transmute(vld3_dup_s32(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_dup_u8(a: *const u8) -> uint8x16x3_t {
+ transmute(vld3q_dup_s8(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_dup_u16(a: *const u16) -> uint16x8x3_t {
+ transmute(vld3q_dup_s16(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_dup_u32(a: *const u32) -> uint32x4x3_t {
+ transmute(vld3q_dup_s32(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_dup_p8(a: *const p8) -> poly8x8x3_t {
+ transmute(vld3_dup_s8(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_dup_p16(a: *const p16) -> poly16x4x3_t {
+ transmute(vld3_dup_s16(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_dup_p8(a: *const p8) -> poly8x16x3_t {
+ transmute(vld3q_dup_s8(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_dup_p16(a: *const p16) -> poly16x8x3_t {
+ transmute(vld3q_dup_s16(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_dup_u64(a: *const u64) -> uint64x1x3_t {
+ transmute(vld3_dup_s64(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_dup_p64(a: *const p64) -> poly64x1x3_t {
+ transmute(vld3_dup_s64(transmute(a)))
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3_dup_f32(a: *const f32) -> float32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v2f32.p0i8")]
+ fn vld3_dup_f32_(ptr: *const i8, size: i32) -> float32x2x3_t;
+ }
+ vld3_dup_f32_(a as *const i8, 4)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_dup_f32(a: *const f32) -> float32x2x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2f32.p0f32")]
+ fn vld3_dup_f32_(ptr: *const f32) -> float32x2x3_t;
+ }
+ vld3_dup_f32_(a as _)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3))]
+pub unsafe fn vld3q_dup_f32(a: *const f32) -> float32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4f32.p0i8")]
+ fn vld3q_dup_f32_(ptr: *const i8, size: i32) -> float32x4x3_t;
+ }
+ vld3q_dup_f32_(a as *const i8, 4)
+}
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_dup_f32(a: *const f32) -> float32x4x3_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v4f32.p0f32")]
+ fn vld3q_dup_f32_(ptr: *const f32) -> float32x4x3_t;
+ }
+ vld3q_dup_f32_(a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x3_t) -> int8x8x3_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v8i8.p0i8")]
+ fn vld3_lane_s8_(ptr: *const i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i32, size: i32) -> int8x8x3_t;
+ }
+ vld3_lane_s8_(a as _, b.0, b.1, b.2, LANE, 1)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x3_t) -> int8x8x3_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v8i8.p0i8")]
+ fn vld3_lane_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i64, ptr: *const i8) -> int8x8x3_t;
+ }
+ vld3_lane_s8_(b.0, b.1, b.2, LANE as i64, a as _)
+}
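+
+// Editor's sketch (hypothetical): the `_lane` forms reload only lane `LANE`
+// of each destination vector; every other lane of `b` passes through
+// unchanged. The lane index is a const generic validated at compile time.
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn vld3_lane_s8_example(src: *const i8, b: int8x8x3_t) -> int8x8x3_t {
+ // Replace lane 2 of b.0, b.1 and b.2 with src[0], src[1] and src[2].
+ vld3_lane_s8::<2>(src, b)
+}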
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x3_t) -> int16x4x3_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4i16.p0i8")]
+ fn vld3_lane_s16_(ptr: *const i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i32, size: i32) -> int16x4x3_t;
+ }
+ vld3_lane_s16_(a as _, b.0, b.1, b.2, LANE, 2)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x3_t) -> int16x4x3_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v4i16.p0i8")]
+ fn vld3_lane_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i64, ptr: *const i8) -> int16x4x3_t;
+ }
+ vld3_lane_s16_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x3_t) -> int32x2x3_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v2i32.p0i8")]
+ fn vld3_lane_s32_(ptr: *const i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i32, size: i32) -> int32x2x3_t;
+ }
+ vld3_lane_s32_(a as _, b.0, b.1, b.2, LANE, 4)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x3_t) -> int32x2x3_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2i32.p0i8")]
+ fn vld3_lane_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i64, ptr: *const i8) -> int32x2x3_t;
+ }
+ vld3_lane_s32_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x3_t) -> int16x8x3_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v8i16.p0i8")]
+ fn vld3q_lane_s16_(ptr: *const i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i32, size: i32) -> int16x8x3_t;
+ }
+ vld3q_lane_s16_(a as _, b.0, b.1, b.2, LANE, 2)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x3_t) -> int16x8x3_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v8i16.p0i8")]
+ fn vld3q_lane_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i64, ptr: *const i8) -> int16x8x3_t;
+ }
+ vld3q_lane_s16_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x3_t) -> int32x4x3_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4i32.p0i8")]
+ fn vld3q_lane_s32_(ptr: *const i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i32, size: i32) -> int32x4x3_t;
+ }
+ vld3q_lane_s32_(a as _, b.0, b.1, b.2, LANE, 4)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x3_t) -> int32x4x3_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v4i32.p0i8")]
+ fn vld3q_lane_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i64, ptr: *const i8) -> int32x4x3_t;
+ }
+ vld3q_lane_s32_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x3_t) -> uint8x8x3_t {
+ static_assert_imm3!(LANE);
+ transmute(vld3_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
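+
+// Editor's note: `static_assert_imm3!` bounds LANE to 0..=7, the lane count
+// of a 64-bit u8 vector, so an out-of-range index such as
+// `vld3_lane_u8::<8>` is rejected at compile time.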
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x3_t) -> uint16x4x3_t {
+ static_assert_imm2!(LANE);
+ transmute(vld3_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x3_t) -> uint32x2x3_t {
+ static_assert_imm1!(LANE);
+ transmute(vld3_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x3_t) -> uint16x8x3_t {
+ static_assert_imm3!(LANE);
+ transmute(vld3q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x3_t) -> uint32x4x3_t {
+ static_assert_imm2!(LANE);
+ transmute(vld3q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_lane_p8<const LANE: i32>(a: *const p8, b: poly8x8x3_t) -> poly8x8x3_t {
+ static_assert_imm3!(LANE);
+ transmute(vld3_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3_lane_p16<const LANE: i32>(a: *const p16, b: poly16x4x3_t) -> poly16x4x3_t {
+ static_assert_imm2!(LANE);
+ transmute(vld3_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld3q_lane_p16<const LANE: i32>(a: *const p16, b: poly16x8x3_t) -> poly16x8x3_t {
+ static_assert_imm3!(LANE);
+ transmute(vld3q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x3_t) -> float32x2x3_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v2f32.p0i8")]
+ fn vld3_lane_f32_(ptr: *const i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i32, size: i32) -> float32x2x3_t;
+ }
+ vld3_lane_f32_(a as _, b.0, b.1, b.2, LANE, 4)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x3_t) -> float32x2x3_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2f32.p0i8")]
+ fn vld3_lane_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i64, ptr: *const i8) -> float32x2x3_t;
+ }
+ vld3_lane_f32_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x3_t) -> float32x4x3_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4f32.p0i8")]
+ fn vld3q_lane_f32_(ptr: *const i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i32, size: i32) -> float32x4x3_t;
+ }
+ vld3q_lane_f32_(a as _, b.0, b.1, b.2, LANE, 4)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld3q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x3_t) -> float32x4x3_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v4f32.p0i8")]
+ fn vld3q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i64, ptr: *const i8) -> float32x4x3_t;
+ }
+ vld3q_lane_f32_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v8i8.p0i8")]
+ fn vld4_s8_(ptr: *const i8, size: i32) -> int8x8x4_t;
+ }
+ vld4_s8_(a as *const i8, 1)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v8i8.p0v8i8")]
+ fn vld4_s8_(ptr: *const int8x8_t) -> int8x8x4_t;
+ }
+ vld4_s8_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4i16.p0i8")]
+ fn vld4_s16_(ptr: *const i8, size: i32) -> int16x4x4_t;
+ }
+ vld4_s16_(a as *const i8, 2)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v4i16.p0v4i16")]
+ fn vld4_s16_(ptr: *const int16x4_t) -> int16x4x4_t;
+ }
+ vld4_s16_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v2i32.p0i8")]
+ fn vld4_s32_(ptr: *const i8, size: i32) -> int32x2x4_t;
+ }
+ vld4_s32_(a as *const i8, 4)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2i32.p0v2i32")]
+ fn vld4_s32_(ptr: *const int32x2_t) -> int32x2x4_t;
+ }
+ vld4_s32_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v16i8.p0i8")]
+ fn vld4q_s8_(ptr: *const i8, size: i32) -> int8x16x4_t;
+ }
+ vld4q_s8_(a as *const i8, 1)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v16i8.p0v16i8")]
+ fn vld4q_s8_(ptr: *const int8x16_t) -> int8x16x4_t;
+ }
+ vld4q_s8_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v8i16.p0i8")]
+ fn vld4q_s16_(ptr: *const i8, size: i32) -> int16x8x4_t;
+ }
+ vld4q_s16_(a as *const i8, 2)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v8i16.p0v8i16")]
+ fn vld4q_s16_(ptr: *const int16x8_t) -> int16x8x4_t;
+ }
+ vld4q_s16_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4i32.p0i8")]
+ fn vld4q_s32_(ptr: *const i8, size: i32) -> int32x4x4_t;
+ }
+ vld4q_s32_(a as *const i8, 4)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v4i32.p0v4i32")]
+ fn vld4q_s32_(ptr: *const int32x4_t) -> int32x4x4_t;
+ }
+ vld4q_s32_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v1i64.p0i8")]
+ fn vld4_s64_(ptr: *const i8, size: i32) -> int64x1x4_t;
+ }
+ vld4_s64_(a as *const i8, 8)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1i64.p0v1i64")]
+ fn vld4_s64_(ptr: *const int64x1_t) -> int64x1x4_t;
+ }
+ vld4_s64_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_u8(a: *const u8) -> uint8x8x4_t {
+ transmute(vld4_s8(transmute(a)))
+}
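+
+// Editor's sketch (hypothetical, not from upstream): `vld4_u8`
+// de-interleaves 8 packed RGBA pixels (32 bytes) into one 8-lane vector per
+// channel.
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn vld4_u8_rgba_example(rgba: &[u8; 32]) -> uint8x8x4_t {
+ // .0 = R bytes, .1 = G bytes, .2 = B bytes, .3 = A bytes.
+ vld4_u8(rgba.as_ptr())
+}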
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_u16(a: *const u16) -> uint16x4x4_t {
+ transmute(vld4_s16(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_u32(a: *const u32) -> uint32x2x4_t {
+ transmute(vld4_s32(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_u8(a: *const u8) -> uint8x16x4_t {
+ transmute(vld4q_s8(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_u16(a: *const u16) -> uint16x8x4_t {
+ transmute(vld4q_s16(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_u32(a: *const u32) -> uint32x4x4_t {
+ transmute(vld4q_s32(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_p8(a: *const p8) -> poly8x8x4_t {
+ transmute(vld4_s8(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_p16(a: *const p16) -> poly16x4x4_t {
+ transmute(vld4_s16(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_p8(a: *const p8) -> poly8x16x4_t {
+ transmute(vld4q_s8(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_p16(a: *const p16) -> poly16x8x4_t {
+ transmute(vld4q_s16(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_u64(a: *const u64) -> uint64x1x4_t {
+ transmute(vld4_s64(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t {
+ transmute(vld4_s64(transmute(a)))
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v2f32.p0i8")]
+ fn vld4_f32_(ptr: *const i8, size: i32) -> float32x2x4_t;
+ }
+ vld4_f32_(a as *const i8, 4)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2f32.p0v2f32")]
+ fn vld4_f32_(ptr: *const float32x2_t) -> float32x2x4_t;
+ }
+ vld4_f32_(a as _)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4f32.p0i8")]
+ fn vld4q_f32_(ptr: *const i8, size: i32) -> float32x4x4_t;
+ }
+ vld4q_f32_(a as *const i8, 4)
+}
+
+/// Load multiple 4-element structures to four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v4f32.p0v4f32")]
+ fn vld4q_f32_(ptr: *const float32x4_t) -> float32x4x4_t;
+ }
+ vld4q_f32_(a as _)
+}
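+
+// Editor's sketch (hypothetical layout): loading four interleaved f32
+// streams, e.g. XYZW vertices stored as x0 y0 z0 w0 x1 y1 z1 w1 ...
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn vld4q_f32_example(vertices: &[f32; 16]) -> float32x4x4_t {
+ // .0 == [x0..x3], .1 == [y0..y3], .2 == [z0..z3], .3 == [w0..w3].
+ vld4q_f32(vertices.as_ptr())
+}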
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4_dup_s8(a: *const i8) -> int8x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v8i8.p0i8")]
+ fn vld4_dup_s8_(ptr: *const i8, size: i32) -> int8x8x4_t;
+ }
+ vld4_dup_s8_(a as *const i8, 1)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_dup_s8(a: *const i8) -> int8x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v8i8.p0i8")]
+ fn vld4_dup_s8_(ptr: *const i8) -> int8x8x4_t;
+ }
+ vld4_dup_s8_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4_dup_s16(a: *const i16) -> int16x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4i16.p0i8")]
+ fn vld4_dup_s16_(ptr: *const i8, size: i32) -> int16x4x4_t;
+ }
+ vld4_dup_s16_(a as *const i8, 2)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_dup_s16(a: *const i16) -> int16x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v4i16.p0i16")]
+ fn vld4_dup_s16_(ptr: *const i16) -> int16x4x4_t;
+ }
+ vld4_dup_s16_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4_dup_s32(a: *const i32) -> int32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v2i32.p0i8")]
+ fn vld4_dup_s32_(ptr: *const i8, size: i32) -> int32x2x4_t;
+ }
+ vld4_dup_s32_(a as *const i8, 4)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_dup_s32(a: *const i32) -> int32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2i32.p0i32")]
+ fn vld4_dup_s32_(ptr: *const i32) -> int32x2x4_t;
+ }
+ vld4_dup_s32_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4q_dup_s8(a: *const i8) -> int8x16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v16i8.p0i8")]
+ fn vld4q_dup_s8_(ptr: *const i8, size: i32) -> int8x16x4_t;
+ }
+ vld4q_dup_s8_(a as *const i8, 1)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_dup_s8(a: *const i8) -> int8x16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v16i8.p0i8")]
+ fn vld4q_dup_s8_(ptr: *const i8) -> int8x16x4_t;
+ }
+ vld4q_dup_s8_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4q_dup_s16(a: *const i16) -> int16x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v8i16.p0i8")]
+ fn vld4q_dup_s16_(ptr: *const i8, size: i32) -> int16x8x4_t;
+ }
+ vld4q_dup_s16_(a as *const i8, 2)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_dup_s16(a: *const i16) -> int16x8x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v8i16.p0i16")]
+ fn vld4q_dup_s16_(ptr: *const i16) -> int16x8x4_t;
+ }
+ vld4q_dup_s16_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4q_dup_s32(a: *const i32) -> int32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4i32.p0i8")]
+ fn vld4q_dup_s32_(ptr: *const i8, size: i32) -> int32x4x4_t;
+ }
+ vld4q_dup_s32_(a as *const i8, 4)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_dup_s32(a: *const i32) -> int32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v4i32.p0i32")]
+ fn vld4q_dup_s32_(ptr: *const i32) -> int32x4x4_t;
+ }
+ vld4q_dup_s32_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vld4_dup_s64(a: *const i64) -> int64x1x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v1i64.p0i8")]
+ fn vld4_dup_s64_(ptr: *const i8, size: i32) -> int64x1x4_t;
+ }
+ vld4_dup_s64_(a as *const i8, 8)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_dup_s64(a: *const i64) -> int64x1x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v1i64.p0i64")]
+ fn vld4_dup_s64_(ptr: *const i64) -> int64x1x4_t;
+ }
+ vld4_dup_s64_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_dup_u8(a: *const u8) -> uint8x8x4_t {
+ transmute(vld4_dup_s8(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_dup_u16(a: *const u16) -> uint16x4x4_t {
+ transmute(vld4_dup_s16(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t {
+ transmute(vld4_dup_s32(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t {
+ transmute(vld4q_dup_s8(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t {
+ transmute(vld4q_dup_s16(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t {
+ transmute(vld4q_dup_s32(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t {
+ transmute(vld4_dup_s8(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t {
+ transmute(vld4_dup_s16(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t {
+ transmute(vld4q_dup_s8(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_dup_p16(a: *const p16) -> poly16x8x4_t {
+ transmute(vld4q_dup_s16(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_dup_u64(a: *const u64) -> uint64x1x4_t {
+ transmute(vld4_dup_s64(transmute(a)))
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_dup_p64(a: *const p64) -> poly64x1x4_t {
+ transmute(vld4_dup_s64(transmute(a)))
+}
+
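+// Editor's note (illustrative, not upstream text): the unsigned and polynomial
+// dup-loads above are thin wrappers that `transmute` the result of the signed
+// load; this is sound because, e.g., `uint8x8x4_t` and `int8x8x4_t` share the
+// same size and lane layout. A hedged sketch of the observable behaviour:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vld4_dup_u8_example {
+ use super::*;
+ #[test]
+ fn unsigned_dup_load_replicates_like_the_signed_one() {
+ let data: [u8; 4] = [10, 20, 30, 40];
+ unsafe {
+ let v: uint8x8x4_t = vld4_dup_u8(data.as_ptr());
+ assert_eq!(core::mem::transmute::<uint8x8_t, [u8; 8]>(v.1), [20u8; 8]);
+ }
+ }
+}
+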
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v2f32.p0i8")]
+ fn vld4_dup_f32_(ptr: *const i8, size: i32) -> float32x2x4_t;
+ }
+ vld4_dup_f32_(a as *const i8, 4)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2f32.p0f32")]
+ fn vld4_dup_f32_(ptr: *const f32) -> float32x2x4_t;
+ }
+ vld4_dup_f32_(a as _)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4))]
+pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4f32.p0i8")]
+ fn vld4q_dup_f32_(ptr: *const i8, size: i32) -> float32x4x4_t;
+ }
+ vld4q_dup_f32_(a as *const i8, 4)
+}
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4r))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v4f32.p0f32")]
+ fn vld4q_dup_f32_(ptr: *const f32) -> float32x4x4_t;
+ }
+ vld4q_dup_f32_(a as _)
+}
+
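+// Editor's sketch (illustrative, not upstream code): the float dup-loads mirror
+// the integer ones. Under the same aarch64/NEON assumption:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vld4_dup_f32_example {
+ use super::*;
+ #[test]
+ fn replicates_floats_to_both_lanes() {
+ let data: [f32; 4] = [1.0, 2.0, 3.0, 4.0];
+ unsafe {
+ let v: float32x2x4_t = vld4_dup_f32(data.as_ptr());
+ assert_eq!(core::mem::transmute::<float32x2_t, [f32; 2]>(v.2), [3.0f32; 2]);
+ }
+ }
+}
+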
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x4_t) -> int8x8x4_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v8i8.p0i8")]
+ fn vld4_lane_s8_(ptr: *const i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i32, size: i32) -> int8x8x4_t;
+ }
+ vld4_lane_s8_(a as _, b.0, b.1, b.2, b.3, LANE, 1)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x4_t) -> int8x8x4_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v8i8.p0i8")]
+ fn vld4_lane_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i64, ptr: *const i8) -> int8x8x4_t;
+ }
+ vld4_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
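+// Editor's sketch (illustrative, not upstream code): `vld4_lane_s8` reads four
+// consecutive bytes from `a` and writes byte `i` into lane `LANE` of vector `i`
+// of `b`, leaving every other lane untouched. Assuming an aarch64/NEON host:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vld4_lane_s8_example {
+ use super::*;
+ #[test]
+ fn overwrites_only_the_selected_lane() {
+ let src: [i8; 4] = [9, 8, 7, 6];
+ unsafe {
+ let zero: int8x8_t = core::mem::transmute([0i8; 8]);
+ let v = vld4_lane_s8::<2>(src.as_ptr(), int8x8x4_t(zero, zero, zero, zero));
+ assert_eq!(core::mem::transmute::<int8x8_t, [i8; 8]>(v.0), [0, 0, 9, 0, 0, 0, 0, 0]);
+ assert_eq!(core::mem::transmute::<int8x8_t, [i8; 8]>(v.3), [0, 0, 6, 0, 0, 0, 0, 0]);
+ }
+ }
+}
+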
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x4_t) -> int16x4x4_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v4i16.p0i8")]
+ fn vld4_lane_s16_(ptr: *const i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, n: i32, size: i32) -> int16x4x4_t;
+ }
+ vld4_lane_s16_(a as _, b.0, b.1, b.2, b.3, LANE, 2)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x4_t) -> int16x4x4_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v4i16.p0i8")]
+ fn vld4_lane_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, n: i64, ptr: *const i8) -> int16x4x4_t;
+ }
+ vld4_lane_s16_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x4_t) -> int32x2x4_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v2i32.p0i8")]
+ fn vld4_lane_s32_(ptr: *const i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, n: i32, size: i32) -> int32x2x4_t;
+ }
+ vld4_lane_s32_(a as _, b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x4_t) -> int32x2x4_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2i32.p0i8")]
+ fn vld4_lane_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, n: i64, ptr: *const i8) -> int32x2x4_t;
+ }
+ vld4_lane_s32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x4_t) -> int16x8x4_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v8i16.p0i8")]
+ fn vld4q_lane_s16_(ptr: *const i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, n: i32, size: i32) -> int16x8x4_t;
+ }
+ vld4q_lane_s16_(a as _, b.0, b.1, b.2, b.3, LANE, 2)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x4_t) -> int16x8x4_t {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v8i16.p0i8")]
+ fn vld4q_lane_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, n: i64, ptr: *const i8) -> int16x8x4_t;
+ }
+ vld4q_lane_s16_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x4_t) -> int32x4x4_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v4i32.p0i8")]
+ fn vld4q_lane_s32_(ptr: *const i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, n: i32, size: i32) -> int32x4x4_t;
+ }
+ vld4q_lane_s32_(a as _, b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x4_t) -> int32x4x4_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v4i32.p0i8")]
+ fn vld4q_lane_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, n: i64, ptr: *const i8) -> int32x4x4_t;
+ }
+ vld4q_lane_s32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x4_t) -> uint8x8x4_t {
+ static_assert_imm3!(LANE);
+ transmute(vld4_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x4_t) -> uint16x4x4_t {
+ static_assert_imm2!(LANE);
+ transmute(vld4_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x4_t) -> uint32x2x4_t {
+ static_assert_imm1!(LANE);
+ transmute(vld4_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x4_t) -> uint16x8x4_t {
+ static_assert_imm3!(LANE);
+ transmute(vld4q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x4_t) -> uint32x4x4_t {
+ static_assert_imm2!(LANE);
+ transmute(vld4q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_lane_p8<const LANE: i32>(a: *const p8, b: poly8x8x4_t) -> poly8x8x4_t {
+ static_assert_imm3!(LANE);
+ transmute(vld4_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4_lane_p16<const LANE: i32>(a: *const p16, b: poly16x4x4_t) -> poly16x4x4_t {
+ static_assert_imm2!(LANE);
+ transmute(vld4_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vld4q_lane_p16<const LANE: i32>(a: *const p16, b: poly16x8x4_t) -> poly16x8x4_t {
+ static_assert_imm3!(LANE);
+ transmute(vld4q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
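+// Editor's note (illustrative, not upstream text): as with the dup-loads, the
+// unsigned and polynomial lane-loads delegate to the signed implementations via
+// `transmute`, re-checking the lane bound with `static_assert_imm*!` first so
+// an out-of-range `LANE` still fails at compile time. A hedged usage sketch:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vld4_lane_u16_example {
+ use super::*;
+ #[test]
+ fn lane_load_into_unsigned_vectors() {
+ let src: [u16; 4] = [100, 200, 300, 400];
+ unsafe {
+ let zero: uint16x4_t = core::mem::transmute([0u16; 4]);
+ let v = vld4_lane_u16::<0>(src.as_ptr(), uint16x4x4_t(zero, zero, zero, zero));
+ assert_eq!(core::mem::transmute::<uint16x4_t, [u16; 4]>(v.1), [200, 0, 0, 0]);
+ }
+ }
+}
+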
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x4_t) -> float32x2x4_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v2f32.p0i8")]
+ fn vld4_lane_f32_(ptr: *const i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, n: i32, size: i32) -> float32x2x4_t;
+ }
+ vld4_lane_f32_(a as _, b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x4_t) -> float32x2x4_t {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2f32.p0i8")]
+ fn vld4_lane_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, n: i64, ptr: *const i8) -> float32x2x4_t;
+ }
+ vld4_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x4_t) -> float32x4x4_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v4f32.p0i8")]
+ fn vld4q_lane_f32_(ptr: *const i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, n: i32, size: i32) -> float32x4x4_t;
+ }
+ vld4q_lane_f32_(a as _, b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Load single 4-element structure to one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vld4q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x4_t) -> float32x4x4_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v4f32.p0i8")]
+ fn vld4q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, n: i64, ptr: *const i8) -> float32x4x4_t;
+ }
+ vld4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
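+// Editor's sketch (illustrative, not upstream code): the float lane-loads
+// behave identically; under the same aarch64 assumption:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vld4q_lane_f32_example {
+ use super::*;
+ #[test]
+ fn float_lane_load() {
+ let src: [f32; 4] = [1.5, 2.5, 3.5, 4.5];
+ unsafe {
+ let zero: float32x4_t = core::mem::transmute([0.0f32; 4]);
+ let v = vld4q_lane_f32::<3>(src.as_ptr(), float32x4x4_t(zero, zero, zero, zero));
+ assert_eq!(core::mem::transmute::<float32x4_t, [f32; 4]>(v.2), [0.0, 0.0, 0.0, 3.5]);
+ }
+ }
+}
+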
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
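+// Editor's sketch (illustrative, not upstream code): the `vst1*_lane` stores in
+// this block are implemented as `simd_extract` plus a scalar write, so usage is
+// simply "pick a lane, store it through the pointer". Assuming aarch64:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vst1_lane_s8_example {
+ use super::*;
+ #[test]
+ fn stores_the_selected_lane() {
+ unsafe {
+ let v: int8x8_t = core::mem::transmute([10i8, 11, 12, 13, 14, 15, 16, 17]);
+ let mut out: i8 = 0;
+ vst1_lane_s8::<5>(&mut out, v);
+ assert_eq!(out, 15);
+ }
+ }
+}
+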
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16_t) {
+ static_assert_imm4!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x16_t) {
+ static_assert_imm4!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x16_t) {
+ static_assert_imm4!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x1_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store single-element structure from one lane of one register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
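+// Editor's note (illustrative, not upstream text): for the one-lane vector
+// types (`int64x1_t`, `uint64x1_t`, `poly64x1_t`) the lane bound degenerates to
+// `static_assert!(LANE : i32 where LANE == 0)`, so only lane 0 compiles. A
+// hedged sketch under the usual aarch64 assumption:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vst1_lane_u64_example {
+ use super::*;
+ #[test]
+ fn one_lane_store() {
+ unsafe {
+ let v: uint64x1_t = core::mem::transmute([u64::MAX; 1]);
+ let mut out: u64 = 0;
+ vst1_lane_u64::<0>(&mut out, v);
+ assert_eq!(out, u64::MAX);
+ }
+ }
+}
+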
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s8_x2(a: *mut i8, b: int8x8x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i8.v8i8")]
+ fn vst1_s8_x2_(ptr: *mut i8, a: int8x8_t, b: int8x8_t);
+ }
+ vst1_s8_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s8_x2(a: *mut i8, b: int8x8x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v8i8.p0i8")]
+ fn vst1_s8_x2_(a: int8x8_t, b: int8x8_t, ptr: *mut i8);
+ }
+ vst1_s8_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s16_x2(a: *mut i16, b: int16x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i16.v4i16")]
+ fn vst1_s16_x2_(ptr: *mut i16, a: int16x4_t, b: int16x4_t);
+ }
+ vst1_s16_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s16_x2(a: *mut i16, b: int16x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v4i16.p0i16")]
+ fn vst1_s16_x2_(a: int16x4_t, b: int16x4_t, ptr: *mut i16);
+ }
+ vst1_s16_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s32_x2(a: *mut i32, b: int32x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i32.v2i32")]
+ fn vst1_s32_x2_(ptr: *mut i32, a: int32x2_t, b: int32x2_t);
+ }
+ vst1_s32_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s32_x2(a: *mut i32, b: int32x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2i32.p0i32")]
+ fn vst1_s32_x2_(a: int32x2_t, b: int32x2_t, ptr: *mut i32);
+ }
+ vst1_s32_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s64_x2(a: *mut i64, b: int64x1x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i64.v1i64")]
+ fn vst1_s64_x2_(ptr: *mut i64, a: int64x1_t, b: int64x1_t);
+ }
+ vst1_s64_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s64_x2(a: *mut i64, b: int64x1x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v1i64.p0i64")]
+ fn vst1_s64_x2_(a: int64x1_t, b: int64x1_t, ptr: *mut i64);
+ }
+ vst1_s64_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s8_x2(a: *mut i8, b: int8x16x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i8.v16i8")]
+ fn vst1q_s8_x2_(ptr: *mut i8, a: int8x16_t, b: int8x16_t);
+ }
+ vst1q_s8_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s8_x2(a: *mut i8, b: int8x16x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v16i8.p0i8")]
+ fn vst1q_s8_x2_(a: int8x16_t, b: int8x16_t, ptr: *mut i8);
+ }
+ vst1q_s8_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s16_x2(a: *mut i16, b: int16x8x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i16.v8i16")]
+ fn vst1q_s16_x2_(ptr: *mut i16, a: int16x8_t, b: int16x8_t);
+ }
+ vst1q_s16_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s16_x2(a: *mut i16, b: int16x8x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v8i16.p0i16")]
+ fn vst1q_s16_x2_(a: int16x8_t, b: int16x8_t, ptr: *mut i16);
+ }
+ vst1q_s16_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s32_x2(a: *mut i32, b: int32x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i32.v4i32")]
+ fn vst1q_s32_x2_(ptr: *mut i32, a: int32x4_t, b: int32x4_t);
+ }
+ vst1q_s32_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s32_x2(a: *mut i32, b: int32x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v4i32.p0i32")]
+ fn vst1q_s32_x2_(a: int32x4_t, b: int32x4_t, ptr: *mut i32);
+ }
+ vst1q_s32_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s64_x2(a: *mut i64, b: int64x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0i64.v2i64")]
+ fn vst1q_s64_x2_(ptr: *mut i64, a: int64x2_t, b: int64x2_t);
+ }
+ vst1q_s64_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s64_x2(a: *mut i64, b: int64x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2i64.p0i64")]
+ fn vst1q_s64_x2_(a: int64x2_t, b: int64x2_t, ptr: *mut i64);
+ }
+ vst1q_s64_x2_(b.0, b.1, a)
+}
+
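+// Editor's sketch (illustrative, not upstream code): the `_x2` stores write the
+// two registers back-to-back, e.g. `vst1_s8_x2` emits 16 contiguous bytes.
+// Assuming an aarch64/NEON host:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vst1_s8_x2_example {
+ use super::*;
+ #[test]
+ fn stores_two_registers_contiguously() {
+ unsafe {
+ let lo: int8x8_t = core::mem::transmute([1i8; 8]);
+ let hi: int8x8_t = core::mem::transmute([2i8; 8]);
+ let mut out = [0i8; 16];
+ vst1_s8_x2(out.as_mut_ptr(), int8x8x2_t(lo, hi));
+ assert_eq!(out[..8], [1i8; 8]);
+ assert_eq!(out[8..], [2i8; 8]);
+ }
+ }
+}
+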
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s8_x3(a: *mut i8, b: int8x8x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i8.v8i8")]
+ fn vst1_s8_x3_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t);
+ }
+ vst1_s8_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s8_x3(a: *mut i8, b: int8x8x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v8i8.p0i8")]
+ fn vst1_s8_x3_(a: int8x8_t, b: int8x8_t, c: int8x8_t, ptr: *mut i8);
+ }
+ vst1_s8_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s16_x3(a: *mut i16, b: int16x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i16.v4i16")]
+ fn vst1_s16_x3_(ptr: *mut i16, a: int16x4_t, b: int16x4_t, c: int16x4_t);
+ }
+ vst1_s16_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s16_x3(a: *mut i16, b: int16x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v4i16.p0i16")]
+ fn vst1_s16_x3_(a: int16x4_t, b: int16x4_t, c: int16x4_t, ptr: *mut i16);
+ }
+ vst1_s16_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s32_x3(a: *mut i32, b: int32x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i32.v2i32")]
+ fn vst1_s32_x3_(ptr: *mut i32, a: int32x2_t, b: int32x2_t, c: int32x2_t);
+ }
+ vst1_s32_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s32_x3(a: *mut i32, b: int32x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2i32.p0i32")]
+ fn vst1_s32_x3_(a: int32x2_t, b: int32x2_t, c: int32x2_t, ptr: *mut i32);
+ }
+ vst1_s32_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s64_x3(a: *mut i64, b: int64x1x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i64.v1i64")]
+ fn vst1_s64_x3_(ptr: *mut i64, a: int64x1_t, b: int64x1_t, c: int64x1_t);
+ }
+ vst1_s64_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s64_x3(a: *mut i64, b: int64x1x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v1i64.p0i64")]
+ fn vst1_s64_x3_(a: int64x1_t, b: int64x1_t, c: int64x1_t, ptr: *mut i64);
+ }
+ vst1_s64_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s8_x3(a: *mut i8, b: int8x16x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i8.v16i8")]
+ fn vst1q_s8_x3_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t);
+ }
+ vst1q_s8_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s8_x3(a: *mut i8, b: int8x16x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v16i8.p0i8")]
+ fn vst1q_s8_x3_(a: int8x16_t, b: int8x16_t, c: int8x16_t, ptr: *mut i8);
+ }
+ vst1q_s8_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s16_x3(a: *mut i16, b: int16x8x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i16.v8i16")]
+ fn vst1q_s16_x3_(ptr: *mut i16, a: int16x8_t, b: int16x8_t, c: int16x8_t);
+ }
+ vst1q_s16_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s16_x3(a: *mut i16, b: int16x8x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v8i16.p0i16")]
+ fn vst1q_s16_x3_(a: int16x8_t, b: int16x8_t, c: int16x8_t, ptr: *mut i16);
+ }
+ vst1q_s16_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s32_x3(a: *mut i32, b: int32x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i32.v4i32")]
+ fn vst1q_s32_x3_(ptr: *mut i32, a: int32x4_t, b: int32x4_t, c: int32x4_t);
+ }
+ vst1q_s32_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s32_x3(a: *mut i32, b: int32x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v4i32.p0i32")]
+ fn vst1q_s32_x3_(a: int32x4_t, b: int32x4_t, c: int32x4_t, ptr: *mut i32);
+ }
+ vst1q_s32_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s64_x3(a: *mut i64, b: int64x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0i64.v2i64")]
+ fn vst1q_s64_x3_(ptr: *mut i64, a: int64x2_t, b: int64x2_t, c: int64x2_t);
+ }
+ vst1q_s64_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s64_x3(a: *mut i64, b: int64x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2i64.p0i64")]
+ fn vst1q_s64_x3_(a: int64x2_t, b: int64x2_t, c: int64x2_t, ptr: *mut i64);
+ }
+ vst1q_s64_x3_(b.0, b.1, b.2, a)
+}
+
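+// Editor's sketch (illustrative, not upstream code): the `_x3` forms extend the
+// same contiguous-store pattern to three registers (24 bytes for `vst1_s8_x3`),
+// and the `_x4` forms below to four. Under the same aarch64 assumption:
+#[cfg(all(test, target_arch = "aarch64"))]
+mod vst1_s8_x3_example {
+ use super::*;
+ #[test]
+ fn stores_three_registers_contiguously() {
+ unsafe {
+ let v: int8x8_t = core::mem::transmute([7i8; 8]);
+ let mut out = [0i8; 24];
+ vst1_s8_x3(out.as_mut_ptr(), int8x8x3_t(v, v, v));
+ assert_eq!(out, [7i8; 24]);
+ }
+ }
+}
+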
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s8_x4(a: *mut i8, b: int8x8x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i8.v8i8")]
+ fn vst1_s8_x4_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t);
+ }
+ vst1_s8_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s8_x4(a: *mut i8, b: int8x8x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v8i8.p0i8")]
+ fn vst1_s8_x4_(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, ptr: *mut i8);
+ }
+ vst1_s8_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s16_x4(a: *mut i16, b: int16x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i16.v4i16")]
+ fn vst1_s16_x4_(ptr: *mut i16, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t);
+ }
+ vst1_s16_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s16_x4(a: *mut i16, b: int16x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v4i16.p0i16")]
+ fn vst1_s16_x4_(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, ptr: *mut i16);
+ }
+ vst1_s16_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s32_x4(a: *mut i32, b: int32x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i32.v2i32")]
+ fn vst1_s32_x4_(ptr: *mut i32, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t);
+ }
+ vst1_s32_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s32_x4(a: *mut i32, b: int32x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v2i32.p0i32")]
+ fn vst1_s32_x4_(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, ptr: *mut i32);
+ }
+ vst1_s32_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_s64_x4(a: *mut i64, b: int64x1x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i64.v1i64")]
+ fn vst1_s64_x4_(ptr: *mut i64, a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t);
+ }
+ vst1_s64_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_s64_x4(a: *mut i64, b: int64x1x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v1i64.p0i64")]
+ fn vst1_s64_x4_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, ptr: *mut i64);
+ }
+ vst1_s64_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s8_x4(a: *mut i8, b: int8x16x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i8.v16i8")]
+ fn vst1q_s8_x4_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t);
+ }
+ vst1q_s8_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s8_x4(a: *mut i8, b: int8x16x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v16i8.p0i8")]
+ fn vst1q_s8_x4_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, ptr: *mut i8);
+ }
+ vst1q_s8_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s16_x4(a: *mut i16, b: int16x8x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i16.v8i16")]
+ fn vst1q_s16_x4_(ptr: *mut i16, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t);
+ }
+ vst1q_s16_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s16_x4(a: *mut i16, b: int16x8x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v8i16.p0i16")]
+ fn vst1q_s16_x4_(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, ptr: *mut i16);
+ }
+ vst1q_s16_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s32_x4(a: *mut i32, b: int32x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i32.v4i32")]
+ fn vst1q_s32_x4_(ptr: *mut i32, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t);
+ }
+ vst1q_s32_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s32_x4(a: *mut i32, b: int32x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v4i32.p0i32")]
+ fn vst1q_s32_x4_(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, ptr: *mut i32);
+ }
+ vst1q_s32_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_s64_x4(a: *mut i64, b: int64x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0i64.v2i64")]
+ fn vst1q_s64_x4_(ptr: *mut i64, a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t);
+ }
+ vst1q_s64_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_s64_x4(a: *mut i64, b: int64x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v2i64.p0i64")]
+ fn vst1q_s64_x4_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, ptr: *mut i64);
+ }
+ vst1q_s64_x4_(b.0, b.1, b.2, b.3, a)
+}
+
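+// Editorial sketch, not part of the upstream file: the `_x2`/`_x3`/`_x4`
+// intrinsics above store two, three, or four whole vectors to consecutive
+// memory. Assuming a plain `[i32; 16]` buffer, a round trip through the
+// matching load intrinsic could look like:
+//
+//     let mut buf = [0i32; 16];
+//     let v = vld1q_s32_x4(buf.as_ptr());  // four consecutive int32x4_t loads
+//     vst1q_s32_x4(buf.as_mut_ptr(), v);   // four consecutive stores
+//
+// Both calls are `unsafe` and require NEON support on the target.
+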
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u8_x2(a: *mut u8, b: uint8x8x2_t) {
+ vst1_s8_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u16_x2(a: *mut u16, b: uint16x4x2_t) {
+ vst1_s16_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u32_x2(a: *mut u32, b: uint32x2x2_t) {
+ vst1_s32_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u64_x2(a: *mut u64, b: uint64x1x2_t) {
+ vst1_s64_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u8_x2(a: *mut u8, b: uint8x16x2_t) {
+ vst1q_s8_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u16_x2(a: *mut u16, b: uint16x8x2_t) {
+ vst1q_s16_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u32_x2(a: *mut u32, b: uint32x4x2_t) {
+ vst1q_s32_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u64_x2(a: *mut u64, b: uint64x2x2_t) {
+ vst1q_s64_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u8_x3(a: *mut u8, b: uint8x8x3_t) {
+ vst1_s8_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u16_x3(a: *mut u16, b: uint16x4x3_t) {
+ vst1_s16_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u32_x3(a: *mut u32, b: uint32x2x3_t) {
+ vst1_s32_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u64_x3(a: *mut u64, b: uint64x1x3_t) {
+ vst1_s64_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u8_x3(a: *mut u8, b: uint8x16x3_t) {
+ vst1q_s8_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u16_x3(a: *mut u16, b: uint16x8x3_t) {
+ vst1q_s16_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u32_x3(a: *mut u32, b: uint32x4x3_t) {
+ vst1q_s32_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u64_x3(a: *mut u64, b: uint64x2x3_t) {
+ vst1q_s64_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u8_x4(a: *mut u8, b: uint8x8x4_t) {
+ vst1_s8_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u16_x4(a: *mut u16, b: uint16x4x4_t) {
+ vst1_s16_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u32_x4(a: *mut u32, b: uint32x2x4_t) {
+ vst1_s32_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_u64_x4(a: *mut u64, b: uint64x1x4_t) {
+ vst1_s64_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u8_x4(a: *mut u8, b: uint8x16x4_t) {
+ vst1q_s8_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u16_x4(a: *mut u16, b: uint16x8x4_t) {
+ vst1q_s16_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u32_x4(a: *mut u32, b: uint32x4x4_t) {
+ vst1q_s32_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_u64_x4(a: *mut u64, b: uint64x2x4_t) {
+ vst1q_s64_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p8_x2(a: *mut p8, b: poly8x8x2_t) {
+ vst1_s8_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p8_x3(a: *mut p8, b: poly8x8x3_t) {
+ vst1_s8_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p8_x4(a: *mut p8, b: poly8x8x4_t) {
+ vst1_s8_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p8_x2(a: *mut p8, b: poly8x16x2_t) {
+ vst1q_s8_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p8_x3(a: *mut p8, b: poly8x16x3_t) {
+ vst1q_s8_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p8_x4(a: *mut p8, b: poly8x16x4_t) {
+ vst1q_s8_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p16_x2(a: *mut p16, b: poly16x4x2_t) {
+ vst1_s16_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p16_x3(a: *mut p16, b: poly16x4x3_t) {
+ vst1_s16_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p16_x4(a: *mut p16, b: poly16x4x4_t) {
+ vst1_s16_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p16_x2(a: *mut p16, b: poly16x8x2_t) {
+ vst1q_s16_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p16_x3(a: *mut p16, b: poly16x8x3_t) {
+ vst1q_s16_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p16_x4(a: *mut p16, b: poly16x8x4_t) {
+ vst1q_s16_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p64_x2(a: *mut p64, b: poly64x1x2_t) {
+ vst1_s64_x2(transmute(a), transmute(b))
+}
+
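+// Editorial note, not in the upstream file: the `p64` wrappers enable the
+// `aes` target feature because the `poly64` types were introduced with the
+// ARMv8 cryptographic extensions; the stores themselves still lower to the
+// ordinary `st1`/`vst1` instructions asserted above.
+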
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p64_x3(a: *mut p64, b: poly64x1x3_t) {
+ vst1_s64_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1_p64_x4(a: *mut p64, b: poly64x1x4_t) {
+ vst1_s64_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p64_x2(a: *mut p64, b: poly64x2x2_t) {
+ vst1q_s64_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p64_x3(a: *mut p64, b: poly64x2x3_t) {
+ vst1q_s64_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst1q_p64_x4(a: *mut p64, b: poly64x2x4_t) {
+ vst1q_s64_x4(transmute(a), transmute(b))
+}
+
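+// Editorial sketch, not part of the upstream file: the unsigned and
+// polynomial variants above are thin shims. Types such as `uint8x8x2_t` and
+// `int8x8x2_t` share a byte layout, so each wrapper only `transmute`s the
+// pointer and the register tuple before deferring to the signed intrinsic:
+//
+//     let mut out = [0u8; 16];
+//     vst1_u8_x2(out.as_mut_ptr(), vld1_u8_x2(out.as_ptr()));
+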
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_f32_x2(a: *mut f32, b: float32x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0f32.v2f32")]
+ fn vst1_f32_x2_(ptr: *mut f32, a: float32x2_t, b: float32x2_t);
+ }
+ vst1_f32_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_f32_x2(a: *mut f32, b: float32x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2f32.p0f32")]
+ fn vst1_f32_x2_(a: float32x2_t, b: float32x2_t, ptr: *mut f32);
+ }
+ vst1_f32_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_f32_x2(a: *mut f32, b: float32x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x2.p0f32.v4f32")]
+ fn vst1q_f32_x2_(ptr: *mut f32, a: float32x4_t, b: float32x4_t);
+ }
+ vst1q_f32_x2_(a, b.0, b.1)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_f32_x2(a: *mut f32, b: float32x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v4f32.p0f32")]
+ fn vst1q_f32_x2_(a: float32x4_t, b: float32x4_t, ptr: *mut f32);
+ }
+ vst1q_f32_x2_(b.0, b.1, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_f32_x3(a: *mut f32, b: float32x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0f32.v2f32")]
+ fn vst1_f32_x3_(ptr: *mut f32, a: float32x2_t, b: float32x2_t, c: float32x2_t);
+ }
+ vst1_f32_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_f32_x3(a: *mut f32, b: float32x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2f32.p0f32")]
+ fn vst1_f32_x3_(a: float32x2_t, b: float32x2_t, c: float32x2_t, ptr: *mut f32);
+ }
+ vst1_f32_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_f32_x3(a: *mut f32, b: float32x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x3.p0f32.v4f32")]
+ fn vst1q_f32_x3_(ptr: *mut f32, a: float32x4_t, b: float32x4_t, c: float32x4_t);
+ }
+ vst1q_f32_x3_(a, b.0, b.1, b.2)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_f32_x3(a: *mut f32, b: float32x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v4f32.p0f32")]
+ fn vst1q_f32_x3_(a: float32x4_t, b: float32x4_t, c: float32x4_t, ptr: *mut f32);
+ }
+ vst1q_f32_x3_(b.0, b.1, b.2, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1_f32_x4(a: *mut f32, b: float32x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0f32.v2f32")]
+ fn vst1_f32_x4_(ptr: *mut f32, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t);
+ }
+ vst1_f32_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1_f32_x4(a: *mut f32, b: float32x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v2f32.p0f32")]
+ fn vst1_f32_x4_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, ptr: *mut f32);
+ }
+ vst1_f32_x4_(b.0, b.1, b.2, b.3, a)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst1))]
+pub unsafe fn vst1q_f32_x4(a: *mut f32, b: float32x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst1x4.p0f32.v4f32")]
+ fn vst1q_f32_x4_(ptr: *mut f32, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t);
+ }
+ vst1q_f32_x4_(a, b.0, b.1, b.2, b.3)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st1))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst1q_f32_x4(a: *mut f32, b: float32x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v4f32.p0f32")]
+ fn vst1q_f32_x4_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, ptr: *mut f32);
+ }
+ vst1q_f32_x4_(b.0, b.1, b.2, b.3, a)
+}
+
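+// Editorial note, not in the upstream file: only `f32` variants appear in
+// this shared ARM/AArch64 module because the 64-bit float NEON types are
+// AArch64-only; the corresponding `f64` stores live in the `aarch64` module.
+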
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2))]
+pub unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v8i8")]
+ fn vst2_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, size: i32);
+ }
+ vst2_s8_(a as _, b.0, b.1, 1)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v8i8.p0i8")]
+ fn vst2_s8_(a: int8x8_t, b: int8x8_t, ptr: *mut i8);
+ }
+ vst2_s8_(b.0, b.1, a as _)
+}
+
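+// Editorial note, not in the upstream file: the two ABIs above differ. The
+// ARMv7 `llvm.arm.neon.vst2.*` intrinsics take the destination pointer first
+// plus a trailing `size` i32 (the element size in bytes, which LLVM treats
+// as the guaranteed pointer alignment), while the AArch64
+// `llvm.aarch64.neon.st2.*` intrinsics take the pointer last and encode the
+// element width in the overloaded vector type instead.
+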
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2))]
+pub unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v4i16")]
+ fn vst2_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, size: i32);
+ }
+ vst2_s16_(a as _, b.0, b.1, 2)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v4i16.p0i8")]
+ fn vst2_s16_(a: int16x4_t, b: int16x4_t, ptr: *mut i8);
+ }
+ vst2_s16_(b.0, b.1, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2))]
+pub unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v2i32")]
+ fn vst2_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, size: i32);
+ }
+ vst2_s32_(a as _, b.0, b.1, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2i32.p0i8")]
+ fn vst2_s32_(a: int32x2_t, b: int32x2_t, ptr: *mut i8);
+ }
+ vst2_s32_(b.0, b.1, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2))]
+pub unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v16i8")]
+ fn vst2q_s8_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, size: i32);
+ }
+ vst2q_s8_(a as _, b.0, b.1, 1)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v16i8.p0i8")]
+ fn vst2q_s8_(a: int8x16_t, b: int8x16_t, ptr: *mut i8);
+ }
+ vst2q_s8_(b.0, b.1, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2))]
+pub unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v8i16")]
+ fn vst2q_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, size: i32);
+ }
+ vst2q_s16_(a as _, b.0, b.1, 2)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v8i16.p0i8")]
+ fn vst2q_s16_(a: int16x8_t, b: int16x8_t, ptr: *mut i8);
+ }
+ vst2q_s16_(b.0, b.1, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2))]
+pub unsafe fn vst2q_s32(a: *mut i32, b: int32x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v4i32")]
+ fn vst2q_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, size: i32);
+ }
+ vst2q_s32_(a as _, b.0, b.1, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_s32(a: *mut i32, b: int32x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v4i32.p0i8")]
+ fn vst2q_s32_(a: int32x4_t, b: int32x4_t, ptr: *mut i8);
+ }
+ vst2q_s32_(b.0, b.1, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v1i64")]
+ fn vst2_s64_(ptr: *mut i8, a: int64x1_t, b: int64x1_t, size: i32);
+ }
+ vst2_s64_(a as _, b.0, b.1, 8)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v1i64.p0i8")]
+ fn vst2_s64_(a: int64x1_t, b: int64x1_t, ptr: *mut i8);
+ }
+ vst2_s64_(b.0, b.1, a as _)
+}
+
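+// Editorial sketch, not part of the upstream file: `vst2` interleaves its two
+// source registers, writing element `i` of each register as one 2-element
+// structure. Assuming the usual `vdup` initializers:
+//
+//     let x = int8x8x2_t(vdup_n_s8(1), vdup_n_s8(2));
+//     let mut out = [0i8; 16];
+//     vst2_s8(out.as_mut_ptr(), x);  // out == [1, 2, 1, 2, ...]
+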
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_u8(a: *mut u8, b: uint8x8x2_t) {
+ vst2_s8(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_u16(a: *mut u16, b: uint16x4x2_t) {
+ vst2_s16(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_u32(a: *mut u32, b: uint32x2x2_t) {
+ vst2_s32(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2q_u8(a: *mut u8, b: uint8x16x2_t) {
+ vst2q_s8(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2q_u16(a: *mut u16, b: uint16x8x2_t) {
+ vst2q_s16(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2q_u32(a: *mut u32, b: uint32x4x2_t) {
+ vst2q_s32(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_p8(a: *mut p8, b: poly8x8x2_t) {
+ vst2_s8(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_p16(a: *mut p16, b: poly16x4x2_t) {
+ vst2_s16(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2q_p8(a: *mut p8, b: poly8x16x2_t) {
+ vst2q_s8(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2q_p16(a: *mut p16, b: poly16x8x2_t) {
+ vst2q_s16(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_u64(a: *mut u64, b: uint64x1x2_t) {
+ vst2_s64(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_p64(a: *mut p64, b: poly64x1x2_t) {
+ vst2_s64(transmute(a), transmute(b))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2))]
+pub unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v2f32")]
+ fn vst2_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, size: i32);
+ }
+ vst2_f32_(a as _, b.0, b.1, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2f32.p0i8")]
+ fn vst2_f32_(a: float32x2_t, b: float32x2_t, ptr: *mut i8);
+ }
+ vst2_f32_(b.0, b.1, a as _)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2))]
+pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v4f32")]
+ fn vst2q_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, size: i32);
+ }
+ vst2q_f32_(a as _, b.0, b.1, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v4f32.p0i8")]
+ fn vst2q_f32_(a: float32x4_t, b: float32x4_t, ptr: *mut i8);
+ }
+ vst2q_f32_(b.0, b.1, a as _)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8x2_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v8i8")]
+ fn vst2_lane_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, n: i32, size: i32);
+ }
+ vst2_lane_s8_(a as _, b.0, b.1, LANE, 1)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8x2_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v8i8.p0i8")]
+ fn vst2_lane_s8_(a: int8x8_t, b: int8x8_t, n: i64, ptr: *mut i8);
+ }
+ vst2_lane_s8_(b.0, b.1, LANE as i64, a as _)
+}
+
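+// Editorial note, not in the upstream file: `LANE` is a const generic that
+// selects which element of each register is stored; `static_assert_imm3!`
+// restricts it to 0..=7 for the 8-lane vectors (`imm2` allows 0..=3 and
+// `imm1` allows 0..=1 below). For example, `vst2_lane_s8::<3>(ptr, x)`
+// stores only `x.0[3]` and `x.1[3]`.
+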
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4x2_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v4i16")]
+ fn vst2_lane_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, n: i32, size: i32);
+ }
+ vst2_lane_s16_(a as _, b.0, b.1, LANE, 2)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4x2_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v4i16.p0i8")]
+ fn vst2_lane_s16_(a: int16x4_t, b: int16x4_t, n: i64, ptr: *mut i8);
+ }
+ vst2_lane_s16_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x2_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v2i32")]
+ fn vst2_lane_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, n: i32, size: i32);
+ }
+ vst2_lane_s32_(a as _, b.0, b.1, LANE, 4)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x2_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2i32.p0i8")]
+ fn vst2_lane_s32_(a: int32x2_t, b: int32x2_t, n: i64, ptr: *mut i8);
+ }
+ vst2_lane_s32_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x2_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v8i16")]
+ fn vst2q_lane_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, n: i32, size: i32);
+ }
+ vst2q_lane_s16_(a as _, b.0, b.1, LANE, 2)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x2_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v8i16.p0i8")]
+ fn vst2q_lane_s16_(a: int16x8_t, b: int16x8_t, n: i64, ptr: *mut i8);
+ }
+ vst2q_lane_s16_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x2_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v4i32")]
+ fn vst2q_lane_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, n: i32, size: i32);
+ }
+vst2q_lane_s32_(a as _, b.0, b.1, LANE, 4)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x2_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v4i32.p0i8")]
+ fn vst2q_lane_s32_(a: int32x4_t, b: int32x4_t, n: i64, ptr: *mut i8);
+ }
+vst2q_lane_s32_(b.0, b.1, LANE as i64, a as _)
+}
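+
+// Editorial note (not part of the upstream patch): every intrinsic above is
+// defined twice on purpose. The ARM definition binds llvm.arm.neon.vst2lane.*,
+// whose signature takes the destination pointer first plus a trailing
+// alignment hint ("size", in bytes), while the AArch64 definition binds
+// llvm.aarch64.neon.st2lane.*, which takes the pointer last and a 64-bit lane
+// index. Both lower to a single lane-wise st2/vst2 store; a hypothetical
+// scalar model of that store, written only for exposition:
+#[allow(dead_code)]
+fn st2_lane_scalar_model<T: Copy>(out: &mut [T; 2], a: &[T], b: &[T], lane: usize) {
+    // Element `lane` of each source register lands contiguously in memory.
+    out[0] = a[lane];
+    out[1] = b[lane];
+}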
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x8x2_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x4x2_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x2x2_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst2_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2q_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x8x2_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2q_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x4x2_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst2q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x8x2_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x4x2_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst2q_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x8x2_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
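+
+// Editorial sketch (not part of the upstream patch): the unsigned and
+// polynomial variants above do no work of their own -- they reinterpret the
+// pointer and the register pair and defer to the signed implementation,
+// because the store is identical at the bit level. A minimal check of that
+// equivalence, assuming an AArch64 target with NEON and the public
+// core::arch::aarch64 API:
+#[cfg(all(test, target_arch = "aarch64"))]
+fn vst2_lane_u16_matches_signed_path() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let v = vld1_u16([1u16, 2, 3, 4].as_ptr());
+        let mut out_u = [0u16; 2];
+        vst2_lane_u16::<3>(out_u.as_mut_ptr(), uint16x4x2_t(v, v));
+        // The same store through the signed path, after reinterpreting:
+        let s = vreinterpret_s16_u16(v);
+        let mut out_s = [0i16; 2];
+        vst2_lane_s16::<3>(out_s.as_mut_ptr(), int16x4x2_t(s, s));
+        assert_eq!(out_u, [4, 4]);
+        assert_eq!([out_s[0] as u16, out_s[1] as u16], out_u);
+    }
+}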
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x2_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v2f32")]
+ fn vst2_lane_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, n: i32, size: i32);
+ }
+vst2_lane_f32_(a as _, b.0, b.1, LANE, 4)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x2_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2f32.p0i8")]
+ fn vst2_lane_f32_(a: float32x2_t, b: float32x2_t, n: i64, ptr: *mut i8);
+ }
+vst2_lane_f32_(b.0, b.1, LANE as i64, a as _)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x2_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v4f32")]
+ fn vst2q_lane_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, n: i32, size: i32);
+ }
+vst2q_lane_f32_(a as _, b.0, b.1, LANE, 4)
+}
+
+/// Store single 2-element structure from one lane of two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst2q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x2_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v4f32.p0i8")]
+ fn vst2q_lane_f32_(a: float32x4_t, b: float32x4_t, n: i64, ptr: *mut i8);
+ }
+vst2q_lane_f32_(b.0, b.1, LANE as i64, a as _)
+}
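+
+// Editorial sketch (not part of the upstream patch): minimal use of the
+// vst2_lane family above, assuming an AArch64 target with NEON. Element LANE
+// of each of the two input registers is written to the destination,
+// interleaved.
+#[cfg(all(test, target_arch = "aarch64"))]
+fn vst2_lane_f32_demo() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let pair = float32x2x2_t(
+            vld1_f32([1.0f32, 2.0].as_ptr()),
+            vld1_f32([3.0f32, 4.0].as_ptr()),
+        );
+        let mut out = [0.0f32; 2];
+        vst2_lane_f32::<1>(out.as_mut_ptr(), pair);
+        assert_eq!(out, [2.0, 4.0]); // lane 1 of b.0, then lane 1 of b.1
+    }
+}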
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3))]
+pub unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v8i8")]
+ fn vst3_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, size: i32);
+ }
+vst3_s8_(a as _, b.0, b.1, b.2, 1)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v8i8.p0i8")]
+ fn vst3_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, ptr: *mut i8);
+ }
+vst3_s8_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3))]
+pub unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v4i16")]
+ fn vst3_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, size: i32);
+ }
+vst3_s16_(a as _, b.0, b.1, b.2, 2)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v4i16.p0i8")]
+ fn vst3_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, ptr: *mut i8);
+ }
+vst3_s16_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3))]
+pub unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v2i32")]
+ fn vst3_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, size: i32);
+ }
+vst3_s32_(a as _, b.0, b.1, b.2, 4)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2i32.p0i8")]
+ fn vst3_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, ptr: *mut i8);
+ }
+vst3_s32_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3))]
+pub unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v16i8")]
+ fn vst3q_s8_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, size: i32);
+ }
+vst3q_s8_(a as _, b.0, b.1, b.2, 1)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v16i8.p0i8")]
+ fn vst3q_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, ptr: *mut i8);
+ }
+vst3q_s8_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3))]
+pub unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v8i16")]
+ fn vst3q_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, size: i32);
+ }
+vst3q_s16_(a as _, b.0, b.1, b.2, 2)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v8i16.p0i8")]
+ fn vst3q_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, ptr: *mut i8);
+ }
+vst3q_s16_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3))]
+pub unsafe fn vst3q_s32(a: *mut i32, b: int32x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v4i32")]
+ fn vst3q_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, size: i32);
+ }
+vst3q_s32_(a as _, b.0, b.1, b.2, 4)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_s32(a: *mut i32, b: int32x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v4i32.p0i8")]
+ fn vst3q_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, ptr: *mut i8);
+ }
+vst3q_s32_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v1i64")]
+ fn vst3_s64_(ptr: *mut i8, a: int64x1_t, b: int64x1_t, c: int64x1_t, size: i32);
+ }
+vst3_s64_(a as _, b.0, b.1, b.2, 8)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v1i64.p0i8")]
+ fn vst3_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, ptr: *mut i8);
+ }
+vst3_s64_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_u8(a: *mut u8, b: uint8x8x3_t) {
+ transmute(vst3_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_u16(a: *mut u16, b: uint16x4x3_t) {
+ transmute(vst3_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_u32(a: *mut u32, b: uint32x2x3_t) {
+ transmute(vst3_s32(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3q_u8(a: *mut u8, b: uint8x16x3_t) {
+ transmute(vst3q_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3q_u16(a: *mut u16, b: uint16x8x3_t) {
+ transmute(vst3q_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3q_u32(a: *mut u32, b: uint32x4x3_t) {
+ transmute(vst3q_s32(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_p8(a: *mut p8, b: poly8x8x3_t) {
+ transmute(vst3_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_p16(a: *mut p16, b: poly16x4x3_t) {
+ transmute(vst3_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3q_p8(a: *mut p8, b: poly8x16x3_t) {
+ transmute(vst3q_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3q_p16(a: *mut p16, b: poly16x8x3_t) {
+ transmute(vst3q_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_u64(a: *mut u64, b: uint64x1x3_t) {
+ transmute(vst3_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_p64(a: *mut p64, b: poly64x1x3_t) {
+ transmute(vst3_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3))]
+pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v2f32")]
+ fn vst3_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, size: i32);
+ }
+vst3_f32_(a as _, b.0, b.1, b.2, 4)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2f32.p0i8")]
+ fn vst3_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, ptr: *mut i8);
+ }
+vst3_f32_(b.0, b.1, b.2, a as _)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3))]
+pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v4f32")]
+ fn vst3q_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, size: i32);
+ }
+vst3q_f32_(a as _, b.0, b.1, b.2, 4)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v4f32.p0i8")]
+ fn vst3q_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, ptr: *mut i8);
+ }
+vst3q_f32_(b.0, b.1, b.2, a as _)
+}
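+
+// Editorial sketch (not part of the upstream patch): the plain vst3 stores
+// above interleave three registers element by element -- the classic use is
+// writing planar channel data back out as packed RGB. Assumes an AArch64
+// target with NEON:
+#[cfg(all(test, target_arch = "aarch64"))]
+fn vst3_u8_interleave_demo() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let r = [10u8; 8];
+        let g = [20u8; 8];
+        let b = [30u8; 8];
+        let planes = uint8x8x3_t(vld1_u8(r.as_ptr()), vld1_u8(g.as_ptr()), vld1_u8(b.as_ptr()));
+        let mut rgb = [0u8; 24];
+        vst3_u8(rgb.as_mut_ptr(), planes);
+        assert_eq!(rgb[..6], [10, 20, 30, 10, 20, 30]);
+    }
+}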
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8x3_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v8i8")]
+ fn vst3_lane_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i32, size: i32);
+ }
+vst3_lane_s8_(a as _, b.0, b.1, b.2, LANE, 1)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8x3_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v8i8.p0i8")]
+ fn vst3_lane_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i64, ptr: *mut i8);
+ }
+vst3_lane_s8_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4x3_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v4i16")]
+ fn vst3_lane_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i32, size: i32);
+ }
+vst3_lane_s16_(a as _, b.0, b.1, b.2, LANE, 2)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4x3_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v4i16.p0i8")]
+ fn vst3_lane_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i64, ptr: *mut i8);
+ }
+vst3_lane_s16_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x3_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v2i32")]
+ fn vst3_lane_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i32, size: i32);
+ }
+vst3_lane_s32_(a as _, b.0, b.1, b.2, LANE, 4)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x3_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2i32.p0i8")]
+ fn vst3_lane_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i64, ptr: *mut i8);
+ }
+vst3_lane_s32_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x3_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v8i16")]
+ fn vst3q_lane_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i32, size: i32);
+ }
+vst3q_lane_s16_(a as _, b.0, b.1, b.2, LANE, 2)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x3_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v8i16.p0i8")]
+ fn vst3q_lane_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i64, ptr: *mut i8);
+ }
+vst3q_lane_s16_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x3_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v4i32")]
+ fn vst3q_lane_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i32, size: i32);
+ }
+vst3q_lane_s32_(a as _, b.0, b.1, b.2, LANE, 4)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x3_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v4i32.p0i8")]
+ fn vst3q_lane_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i64, ptr: *mut i8);
+ }
+vst3q_lane_s32_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x8x3_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst3_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x4x3_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst3_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x2x3_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst3_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3q_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x8x3_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst3q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3q_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x4x3_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst3q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x8x3_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst3_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x4x3_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst3_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst3q_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x8x3_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst3q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x3_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v2f32")]
+ fn vst3_lane_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i32, size: i32);
+ }
+vst3_lane_f32_(a as _, b.0, b.1, b.2, LANE, 4)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x3_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2f32.p0i8")]
+ fn vst3_lane_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i64, ptr: *mut i8);
+ }
+vst3_lane_f32_(b.0, b.1, b.2, LANE as i64, a as _)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x3_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v4f32")]
+ fn vst3q_lane_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i32, size: i32);
+ }
+vst3q_lane_f32_(a as _, b.0, b.1, b.2, LANE, 4)
+}
+
+/// Store single 3-element structure from one lane of three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst3q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x3_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v4f32.p0i8")]
+ fn vst3q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i64, ptr: *mut i8);
+ }
+vst3q_lane_f32_(b.0, b.1, b.2, LANE as i64, a as _)
+}
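+
+// Editorial sketch (not part of the upstream patch): the vst3_lane family
+// stores only the selected element of each of the three registers,
+// contiguously. Assumes an AArch64 target with NEON:
+#[cfg(all(test, target_arch = "aarch64"))]
+fn vst3_lane_s16_demo() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let v = vld1_s16([1i16, 2, 3, 4].as_ptr());
+        let mut out = [0i16; 3];
+        vst3_lane_s16::<2>(out.as_mut_ptr(), int16x4x3_t(v, v, v));
+        assert_eq!(out, [3, 3, 3]); // element 2 of each register
+    }
+}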
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4))]
+pub unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v8i8")]
+ fn vst4_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, size: i32);
+ }
+vst4_s8_(a as _, b.0, b.1, b.2, b.3, 1)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v8i8.p0i8")]
+ fn vst4_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, ptr: *mut i8);
+ }
+vst4_s8_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4))]
+pub unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v4i16")]
+ fn vst4_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, size: i32);
+ }
+vst4_s16_(a as _, b.0, b.1, b.2, b.3, 2)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v4i16.p0i8")]
+ fn vst4_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, ptr: *mut i8);
+ }
+vst4_s16_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4))]
+pub unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v2i32")]
+ fn vst4_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, size: i32);
+ }
+vst4_s32_(a as _, b.0, b.1, b.2, b.3, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2i32.p0i8")]
+ fn vst4_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, ptr: *mut i8);
+ }
+vst4_s32_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4))]
+pub unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v16i8")]
+ fn vst4q_s8_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, size: i32);
+ }
+vst4q_s8_(a as _, b.0, b.1, b.2, b.3, 1)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v16i8.p0i8")]
+ fn vst4q_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, ptr: *mut i8);
+ }
+vst4q_s8_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4))]
+pub unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v8i16")]
+ fn vst4q_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, size: i32);
+ }
+vst4q_s16_(a as _, b.0, b.1, b.2, b.3, 2)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v8i16.p0i8")]
+ fn vst4q_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, ptr: *mut i8);
+ }
+vst4q_s16_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4))]
+pub unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v4i32")]
+ fn vst4q_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, size: i32);
+ }
+vst4q_s32_(a as _, b.0, b.1, b.2, b.3, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v4i32.p0i8")]
+ fn vst4q_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, ptr: *mut i8);
+ }
+vst4q_s32_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(nop))]
+pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v1i64")]
+ fn vst4_s64_(ptr: *mut i8, a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, size: i32);
+ }
+vst4_s64_(a as _, b.0, b.1, b.2, b.3, 8)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v1i64.p0i8")]
+ fn vst4_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, ptr: *mut i8);
+ }
+vst4_s64_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_u8(a: *mut u8, b: uint8x8x4_t) {
+ transmute(vst4_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_u16(a: *mut u16, b: uint16x4x4_t) {
+ transmute(vst4_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_u32(a: *mut u32, b: uint32x2x4_t) {
+ transmute(vst4_s32(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4q_u8(a: *mut u8, b: uint8x16x4_t) {
+ transmute(vst4q_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4q_u16(a: *mut u16, b: uint16x8x4_t) {
+ transmute(vst4q_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4q_u32(a: *mut u32, b: uint32x4x4_t) {
+ transmute(vst4q_s32(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_p8(a: *mut p8, b: poly8x8x4_t) {
+ transmute(vst4_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_p16(a: *mut p16, b: poly16x4x4_t) {
+ transmute(vst4_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4q_p8(a: *mut p8, b: poly8x16x4_t) {
+ transmute(vst4q_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4q_p16(a: *mut p16, b: poly16x8x4_t) {
+ transmute(vst4q_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_u64(a: *mut u64, b: uint64x1x4_t) {
+ transmute(vst4_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_p64(a: *mut p64, b: poly64x1x4_t) {
+ transmute(vst4_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4))]
+pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v2f32")]
+ fn vst4_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, size: i32);
+ }
+vst4_f32_(a as _, b.0, b.1, b.2, b.3, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2f32.p0i8")]
+ fn vst4_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, ptr: *mut i8);
+ }
+vst4_f32_(b.0, b.1, b.2, b.3, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4))]
+pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v4f32")]
+ fn vst4q_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, size: i32);
+ }
+vst4q_f32_(a as _, b.0, b.1, b.2, b.3, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v4f32.p0i8")]
+ fn vst4q_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, ptr: *mut i8);
+ }
+vst4q_f32_(b.0, b.1, b.2, b.3, a as _)
+}
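+
+// Editorial sketch (not part of the upstream patch): vst4 writes four
+// registers fully interleaved, so each consecutive group of four elements in
+// memory takes one element from each register. Assumes an AArch64 target
+// with NEON:
+#[cfg(all(test, target_arch = "aarch64"))]
+fn vst4_f32_demo() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let quad = float32x2x4_t(vdup_n_f32(1.0), vdup_n_f32(2.0), vdup_n_f32(3.0), vdup_n_f32(4.0));
+        let mut out = [0.0f32; 8];
+        vst4_f32(out.as_mut_ptr(), quad);
+        assert_eq!(out, [1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]);
+    }
+}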
+
+/// Store single 4-element structure from one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8x4_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v8i8")]
+ fn vst4_lane_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i32, size: i32);
+ }
+vst4_lane_s8_(a as _, b.0, b.1, b.2, b.3, LANE, 1)
+}
+
+/// Store single 4-element structure from one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8x4_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v8i8.p0i8")]
+ fn vst4_lane_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i64, ptr: *mut i8);
+ }
+vst4_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store single 4-element structure from one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4x4_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v4i16")]
+ fn vst4_lane_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, n: i32, size: i32);
+ }
+vst4_lane_s16_(a as _, b.0, b.1, b.2, b.3, LANE, 2)
+}
+
+/// Store single 4-element structure from one lane of four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4x4_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v4i16.p0i8")]
+ fn vst4_lane_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, n: i64, ptr: *mut i8);
+ }
+vst4_lane_s16_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store single 4-element structure from one lane of four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x4_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v2i32")]
+ fn vst4_lane_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, n: i32, size: i32);
+ }
+ vst4_lane_s32_(a as _, b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x4_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2i32.p0i8")]
+ fn vst4_lane_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, n: i64, ptr: *mut i8);
+ }
+ vst4_lane_s32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x4_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v8i16")]
+ fn vst4q_lane_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, n: i32, size: i32);
+ }
+ vst4q_lane_s16_(a as _, b.0, b.1, b.2, b.3, LANE, 2)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x4_t) {
+ static_assert_imm3!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v8i16.p0i8")]
+ fn vst4q_lane_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, n: i64, ptr: *mut i8);
+ }
+ vst4q_lane_s16_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x4_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v4i32")]
+ fn vst4q_lane_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, n: i32, size: i32);
+ }
+ vst4q_lane_s32_(a as _, b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x4_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v4i32.p0i8")]
+ fn vst4q_lane_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, n: i64, ptr: *mut i8);
+ }
+ vst4q_lane_s32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x8x4_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst4_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x4x4_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst4_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x2x4_t) {
+ static_assert_imm1!(LANE);
+ transmute(vst4_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4q_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x8x4_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst4q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4q_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x4x4_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst4q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x8x4_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst4_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x4x4_t) {
+ static_assert_imm2!(LANE);
+ transmute(vst4_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vst4q_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x8x4_t) {
+ static_assert_imm3!(LANE);
+ transmute(vst4q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x4_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v2f32")]
+ fn vst4_lane_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, n: i32, size: i32);
+ }
+ vst4_lane_f32_(a as _, b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x4_t) {
+ static_assert_imm1!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2f32.p0i8")]
+ fn vst4_lane_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, n: i64, ptr: *mut i8);
+ }
+ vst4_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x4_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v4f32")]
+ fn vst4q_lane_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, n: i32, size: i32);
+ }
+ vst4q_lane_f32_(a as _, b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vst4q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x4_t) {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v4f32.p0i8")]
+ fn vst4q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, n: i64, ptr: *mut i8);
+ }
+ vst4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
+}
+
+/// Multiply
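+///
+/// # Examples
+///
+/// An illustrative sketch, assuming an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vdup_n_s8(3);
+///     let b = vdup_n_s8(-2);
+///     // Lane-wise wrapping multiply: every lane is 3 * -2 = -6.
+///     let r = vmul_s8(a, b);
+///     assert_eq!(vget_lane_s8::<0>(r), -6);
+/// }
+/// ```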
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_mul(a, b)
+}
+
+/// Polynomial multiply
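+///
+/// # Examples
+///
+/// An illustrative sketch of the carry-less (polynomial) multiply, assuming
+/// an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // In GF(2)[x]: (x + 1) * (x + 1) = x^2 + 1, i.e. 0b11 * 0b11 = 0b101,
+///     // because the cross terms cancel modulo 2.
+///     let a = vdup_n_p8(0b11);
+///     let r = vmul_p8(a, a);
+///     assert_eq!(vget_lane_p8::<0>(r), 0b101);
+/// }
+/// ```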
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v8i8")]
+ fn vmul_p8_(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
+ }
+ vmul_p8_(a, b)
+}
+
+/// Polynomial multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v16i8")]
+ fn vmulq_p8_(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
+ }
+ vmulq_p8_(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_mul(a, b)
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_mul(a, b)
+}
+
+/// Vector multiply by scalar
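+///
+/// # Examples
+///
+/// An illustrative sketch, assuming an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vdup_n_s16(7);
+///     // The scalar is broadcast and multiplied into every lane.
+///     let r = vmul_n_s16(a, 3);
+///     assert_eq!(vget_lane_s16::<0>(r), 21);
+/// }
+/// ```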
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_n_s16(a: int16x4_t, b: i16) -> int16x4_t {
+ simd_mul(a, vdup_n_s16(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_n_s16(a: int16x8_t, b: i16) -> int16x8_t {
+ simd_mul(a, vdupq_n_s16(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_n_s32(a: int32x2_t, b: i32) -> int32x2_t {
+ simd_mul(a, vdup_n_s32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_n_s32(a: int32x4_t, b: i32) -> int32x4_t {
+ simd_mul(a, vdupq_n_s32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_n_u16(a: uint16x4_t, b: u16) -> uint16x4_t {
+ simd_mul(a, vdup_n_u16(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_n_u16(a: uint16x8_t, b: u16) -> uint16x8_t {
+ simd_mul(a, vdupq_n_u16(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_n_u32(a: uint32x2_t, b: u32) -> uint32x2_t {
+ simd_mul(a, vdup_n_u32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_n_u32(a: uint32x4_t, b: u32) -> uint32x4_t {
+ simd_mul(a, vdupq_n_u32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_n_f32(a: float32x2_t, b: f32) -> float32x2_t {
+ simd_mul(a, vdup_n_f32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_n_f32(a: float32x4_t, b: f32) -> float32x4_t {
+ simd_mul(a, vdupq_n_f32(b))
+}
+
+/// Multiply
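+///
+/// # Examples
+///
+/// An illustrative sketch, assuming an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vdup_n_s16(2);
+///     let b = vld1_s16([10i16, 20, 30, 40].as_ptr());
+///     // Lane 2 of b (the value 30) is broadcast before the multiply.
+///     let r = vmul_lane_s16::<2>(a, b);
+///     assert_eq!(vget_lane_s16::<0>(r), 60);
+/// }
+/// ```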
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ simd_mul(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t {
+ static_assert_imm3!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert_imm3!(LANE);
+ simd_mul(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert_imm1!(LANE);
+ simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t {
+ static_assert_imm1!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmul_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
+ static_assert_imm1!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmulq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE);
+ simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Signed multiply long
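+///
+/// # Examples
+///
+/// An illustrative sketch of the widening multiply, assuming an AArch64
+/// target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // i8 * i8 -> i16, so 100 * 100 cannot wrap.
+///     let a = vdup_n_s8(100);
+///     let r: int16x8_t = vmull_s8(a, a);
+///     assert_eq!(vgetq_lane_s16::<0>(r), 10_000);
+/// }
+/// ```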
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v8i8")]
+ fn vmull_s8_(a: int8x8_t, b: int8x8_t) -> int16x8_t;
+ }
+ vmull_s8_(a, b)
+}
+
+/// Signed multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v4i16")]
+ fn vmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t;
+ }
+ vmull_s16_(a, b)
+}
+
+/// Signed multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v2i32")]
+ fn vmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t;
+ }
+ vmull_s32_(a, b)
+}
+
+/// Unsigned multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v8i8")]
+ fn vmull_u8_(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t;
+ }
+ vmull_u8_(a, b)
+}
+
+/// Unsigned multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v4i16")]
+ fn vmull_u16_(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t;
+ }
+ vmull_u16_(a, b)
+}
+
+/// Unsigned multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v2i32")]
+ fn vmull_u32_(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
+ }
+ vmull_u32_(a, b)
+}
+
+/// Polynomial multiply long
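+///
+/// # Examples
+///
+/// An illustrative sketch of the widening carry-less multiply, assuming an
+/// AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     // Squaring in GF(2)[x] interleaves zeros between the input bits:
+///     // 0xff (*) 0xff = 0x5555, widened to 16 bits per lane.
+///     let a = vdup_n_p8(0xff);
+///     let r: poly16x8_t = vmull_p8(a, a);
+///     assert_eq!(vgetq_lane_p16::<0>(r), 0x5555);
+/// }
+/// ```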
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.p8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullp.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmull.v8i8")]
+ fn vmull_p8_(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t;
+ }
+ vmull_p8_(a, b)
+}
+
+/// Vector long multiply with scalar
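+///
+/// # Examples
+///
+/// An illustrative sketch, assuming an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vdup_n_s16(1_000);
+///     // Widening multiply by a broadcast scalar: the i32 result cannot wrap.
+///     let r = vmull_n_s16(a, 1_000);
+///     assert_eq!(vgetq_lane_s32::<0>(r), 1_000_000);
+/// }
+/// ```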
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t {
+ vmull_s16(a, vdup_n_s16(b))
+}
+
+/// Vector long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t {
+ vmull_s32(a, vdup_n_s32(b))
+}
+
+/// Vector long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t {
+ vmull_u16(a, vdup_n_u16(b))
+}
+
+/// Vector long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t {
+ vmull_u32(a, vdup_n_u32(b))
+}
+
+/// Vector long multiply by scalar
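+///
+/// # Examples
+///
+/// An illustrative sketch, assuming an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vdup_n_s16(500);
+///     let b = vld1_s16([1i16, 2, 3, 4].as_ptr());
+///     // Widening multiply by lane 3 of b (the value 4).
+///     let r = vmull_lane_s16::<3>(a, b);
+///     assert_eq!(vgetq_lane_s32::<0>(r), 2_000);
+/// }
+/// ```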
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vmull_s16(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int32x4_t {
+ static_assert_imm3!(LANE);
+ vmull_s16(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE);
+ vmull_s32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
+ static_assert_imm2!(LANE);
+ vmull_s32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ vmull_u16(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t {
+ static_assert_imm3!(LANE);
+ vmull_u16(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE);
+ vmull_u32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmull_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t {
+ static_assert_imm2!(LANE);
+ vmull_u32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point fused multiply-add to accumulator (vector)
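+///
+/// # Examples
+///
+/// An illustrative sketch, assuming an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let acc = vdup_n_f32(1.0);
+///     let b = vdup_n_f32(2.0);
+///     let c = vdup_n_f32(3.0);
+///     // Computes acc + b * c per lane with a single rounding step.
+///     let r = vfma_f32(acc, b, c);
+///     assert_eq!(vget_lane_f32::<0>(r), 7.0);
+/// }
+/// ```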
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f32")]
+ fn vfma_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
+ }
+ vfma_f32_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v4f32")]
+ fn vfmaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
+ }
+ vfmaq_f32_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+ vfma_f32(a, b, vdup_n_f32_vfp4(c))
+}
+
+/// Floating-point fused multiply-add to accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+ vfmaq_f32(a, b, vdupq_n_f32_vfp4(c))
+}
+
+/// Floating-point fused multiply-subtract from accumulator
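+///
+/// # Examples
+///
+/// An illustrative sketch, assuming an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let acc = vdup_n_f32(10.0);
+///     let b = vdup_n_f32(2.0);
+///     let c = vdup_n_f32(3.0);
+///     // Computes acc - b * c per lane: 10 - 2 * 3 = 4.
+///     let r = vfms_f32(acc, b, c);
+///     assert_eq!(vget_lane_f32::<0>(r), 4.0);
+/// }
+/// ```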
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vfms_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ let b: float32x2_t = simd_neg(b);
+ vfma_f32(a, b, c)
+}
+
+/// Floating-point fused multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ let b: float32x4_t = simd_neg(b);
+ vfmaq_f32(a, b, c)
+}
+
+/// Floating-point fused multiply-subtract from accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+ vfms_f32(a, b, vdup_n_f32_vfp4(c))
+}
+
+/// Floating-point fused multiply-subtract from accumulator (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+ vfmsq_f32(a, b, vdupq_n_f32_vfp4(c))
+}
+
+/// Subtract
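+///
+/// # Examples
+///
+/// An illustrative sketch, assuming an AArch64 target with NEON:
+///
+/// ```ignore
+/// use core::arch::aarch64::*;
+///
+/// unsafe {
+///     let a = vdup_n_s8(5);
+///     let b = vdup_n_s8(9);
+///     // Lane-wise wrapping subtract: 5 - 9 = -4.
+///     let r = vsub_s8(a, b);
+///     assert_eq!(vget_lane_s8::<0>(r), -4);
+/// }
+/// ```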
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsub_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_sub(a, b)
+}
+
+/// Subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_sub(a, b)
+}
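+
+// Editor's note: the plain `vsub*` family is a lane-wise wrapping subtraction
+// with no widening or saturation. Illustrative sketch (hypothetical helper,
+// not emitted by the stdarch generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vsub_s8() {
+ let a = vdup_n_s8(10);
+ let b = vdup_n_s8(3);
+ // every lane holds 10 - 3 == 7
+ let r: int8x8_t = vsub_s8(a, b);
+ assert_eq!(transmute::<int8x8_t, [i8; 8]>(r), [7; 8]);
+}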
+
+/// Bitwise exclusive OR
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vadd_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ simd_xor(a, b)
+}
+
+/// Bitwise exclusive OR
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vadd_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ simd_xor(a, b)
+}
+
+/// Bitwise exclusive OR
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vaddq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ simd_xor(a, b)
+}
+
+/// Bitwise exclusive OR
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vaddq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ simd_xor(a, b)
+}
+
+/// Bitwise exclusive OR
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vadd_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t {
+ simd_xor(a, b)
+}
+
+/// Bitwise exclusive OR
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vaddq_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ simd_xor(a, b)
+}
+
+/// Bitwise exclusive OR
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vaddq_p128(a: p128, b: p128) -> p128 {
+ a ^ b
+}
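+
+// Editor's note: polynomial "addition" is carry-less: the coefficients live
+// in GF(2), so adding two polynomials is a bitwise XOR of their bit
+// representations, which is why the `vadd_p*` intrinsics need no dedicated
+// instruction. Illustrative sketch (hypothetical helper, not emitted by the
+// generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vaddq_p128() {
+ // (x + 1) + x == 1, because the two x terms cancel mod 2
+ let r: p128 = vaddq_p128(0b11, 0b10);
+ assert_eq!(r, 0b01);
+}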
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t {
+ let c: i16x8 = i16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
+ simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t {
+ let c: i32x4 = i32x4::new(16, 16, 16, 16);
+ simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t {
+ let c: i64x2 = i64x2::new(32, 32);
+ simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
+ let c: u16x8 = u16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
+ simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t {
+ let c: u32x4 = u32x4::new(16, 16, 16, 16);
+ simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
+ let c: u64x2 = u64x2::new(32, 32);
+ simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+}
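+
+// Editor's note: `vsubhn` narrows by keeping the high half of each
+// difference, i.e. each result lane is (a - b) >> (lane_bits / 2), truncated.
+// Illustrative sketch (hypothetical helper, not emitted by the generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vsubhn_s16() {
+ let a = vdupq_n_s16(0x1234);
+ let b = vdupq_n_s16(0x0034);
+ // every lane holds (0x1234 - 0x0034) >> 8 == 0x12
+ let r: int8x8_t = vsubhn_s16(a, b);
+ assert_eq!(transmute::<int8x8_t, [i8; 8]>(r), [0x12; 8]);
+}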
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
+ let d: int8x8_t = vsubhn_s16(b, c);
+ simd_shuffle16!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
+ let d: int16x4_t = vsubhn_s32(b, c);
+ simd_shuffle8!(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
+ let d: int32x2_t = vsubhn_s64(b, c);
+ simd_shuffle4!(a, d, [0, 1, 2, 3])
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
+ let d: uint8x8_t = vsubhn_u16(b, c);
+ simd_shuffle16!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
+ let d: uint16x4_t = vsubhn_u32(b, c);
+ simd_shuffle8!(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
+ let d: uint32x2_t = vsubhn_u64(b, c);
+ simd_shuffle4!(a, d, [0, 1, 2, 3])
+}
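+
+// Editor's note: the `_high` variants write the narrowed difference into the
+// upper half of a wider vector: `a` supplies lanes 0..8 and `(b - c) >> 8`
+// fills lanes 8..16, matching the aarch64 `subhn2` instruction. Sketch
+// (hypothetical helper, not emitted by the generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vsubhn_high_s16() {
+ let low = vdup_n_s8(1);
+ let b = vdupq_n_s16(0x0200);
+ let c = vdupq_n_s16(0x0100);
+ // lanes 0..8 come from `low`; lanes 8..16 hold (0x0200 - 0x0100) >> 8 == 1
+ let r: int8x16_t = vsubhn_high_s16(low, b, c);
+ assert_eq!(transmute::<int8x16_t, [i8; 16]>(r), [1; 16]);
+}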
+
+/// Unsigned halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i8")]
+ fn vhsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ }
+ vhsub_u8_(a, b)
+}
+
+/// Unsigned halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v16i8")]
+ fn vhsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ }
+ vhsubq_u8_(a, b)
+}
+
+/// Unsigned halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i16")]
+ fn vhsub_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ }
+ vhsub_u16_(a, b)
+}
+
+/// Unsigned halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i16")]
+ fn vhsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ }
+ vhsubq_u16_(a, b)
+}
+
+/// Unsigned halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v2i32")]
+ fn vhsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ }
+ vhsub_u32_(a, b)
+}
+
+/// Unsigned halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i32")]
+ fn vhsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+ vhsubq_u32_(a, b)
+}
+
+/// Signed halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i8")]
+ fn vhsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+ vhsub_s8_(a, b)
+}
+
+/// Signed halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v16i8")]
+ fn vhsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+ vhsubq_s8_(a, b)
+}
+
+/// Signed halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i16")]
+ fn vhsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+ vhsub_s16_(a, b)
+}
+
+/// Signed halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i16")]
+ fn vhsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+ vhsubq_s16_(a, b)
+}
+
+/// Signed halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v2i32")]
+ fn vhsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+ vhsub_s32_(a, b)
+}
+
+/// Signed halving subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vhsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i32")]
+ fn vhsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+ vhsubq_s32_(a, b)
+}
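+
+// Editor's note: halving subtract computes (a - b) >> 1 per lane, with the
+// intermediate difference held at full precision so it cannot overflow.
+// Illustrative sketch (hypothetical helper, not emitted by the generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vhsub_u8() {
+ let a = vdup_n_u8(7);
+ let b = vdup_n_u8(2);
+ // every lane holds (7 - 2) >> 1 == 2
+ let r: uint8x8_t = vhsub_u8(a, b);
+ assert_eq!(transmute::<uint8x8_t, [u8; 8]>(r), [2; 8]);
+}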
+
+/// Signed Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t {
+ simd_sub(a, simd_cast(b))
+}
+
+/// Signed Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t {
+ simd_sub(a, simd_cast(b))
+}
+
+/// Signed Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t {
+ simd_sub(a, simd_cast(b))
+}
+
+/// Unsigned Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t {
+ simd_sub(a, simd_cast(b))
+}
+
+/// Unsigned Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t {
+ simd_sub(a, simd_cast(b))
+}
+
+/// Unsigned Subtract Wide
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t {
+ simd_sub(a, simd_cast(b))
+}
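+
+// Editor's note: `vsubw` widens only the second operand, letting a wide
+// accumulator consume narrow data without a separate conversion step.
+// Illustrative sketch (hypothetical helper, not emitted by the generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vsubw_s8() {
+ let a = vdupq_n_s16(1000);
+ let b = vdup_n_s8(100);
+ // every 16-bit lane holds 1000 - 100 == 900
+ let r: int16x8_t = vsubw_s8(a, b);
+ assert_eq!(transmute::<int16x8_t, [i16; 8]>(r), [900; 8]);
+}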
+
+/// Signed Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t {
+ let c: int16x8_t = simd_cast(a);
+ let d: int16x8_t = simd_cast(b);
+ simd_sub(c, d)
+}
+
+/// Signed Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+ let c: int32x4_t = simd_cast(a);
+ let d: int32x4_t = simd_cast(b);
+ simd_sub(c, d)
+}
+
+/// Signed Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+ let c: int64x2_t = simd_cast(a);
+ let d: int64x2_t = simd_cast(b);
+ simd_sub(c, d)
+}
+
+/// Unsigned Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
+ let c: uint16x8_t = simd_cast(a);
+ let d: uint16x8_t = simd_cast(b);
+ simd_sub(c, d)
+}
+
+/// Unsigned Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
+ let c: uint32x4_t = simd_cast(a);
+ let d: uint32x4_t = simd_cast(b);
+ simd_sub(c, d)
+}
+
+/// Unsigned Subtract Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
+ let c: uint64x2_t = simd_cast(a);
+ let d: uint64x2_t = simd_cast(b);
+ simd_sub(c, d)
+}
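+
+// Editor's note: `vsubl` widens both operands before subtracting, so the full
+// difference of two narrow vectors is always representable. Illustrative
+// sketch (hypothetical helper, not emitted by the generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vsubl_s8() {
+ let a = vdup_n_s8(-128);
+ let b = vdup_n_s8(127);
+ // -128 - 127 == -255, which only fits after widening to 16 bits
+ let r: int16x8_t = vsubl_s8(a, b);
+ assert_eq!(transmute::<int16x8_t, [i16; 8]>(r), [-255; 8]);
+}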
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i8")]
+ fn vmax_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+ vmax_s8_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v16i8")]
+ fn vmaxq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+ vmaxq_s8_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i16")]
+ fn vmax_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+ vmax_s16_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i16")]
+ fn vmaxq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+ vmaxq_s16_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v2i32")]
+ fn vmax_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+ vmax_s32_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i32")]
+ fn vmaxq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+ vmaxq_s32_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i8")]
+ fn vmax_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ }
+ vmax_u8_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v16i8")]
+ fn vmaxq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ }
+ vmaxq_u8_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i16")]
+ fn vmax_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ }
+ vmax_u16_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i16")]
+ fn vmaxq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ }
+ vmaxq_u16_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v2i32")]
+ fn vmax_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ }
+ vmax_u32_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i32")]
+ fn vmaxq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+ vmaxq_u32_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v2f32")]
+ fn vmax_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vmax_f32_(a, b)
+}
+
+/// Maximum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v4f32")]
+ fn vmaxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vmaxq_f32_(a, b)
+}
+
+/// Floating-point Maximum Number (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v2f32")]
+ fn vmaxnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vmaxnm_f32_(a, b)
+}
+
+/// Floating-point Maximum Number (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v4f32")]
+ fn vmaxnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vmaxnmq_f32_(a, b)
+}
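+
+// Editor's note: unlike plain `vmax*`, the "Maximum Number" form implements
+// IEEE 754-2008 maxNum: when exactly one operand is a quiet NaN, the numeric
+// operand wins. Illustrative sketch (hypothetical helper, not emitted by the
+// generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vmaxnm_f32() {
+ let a = vdup_n_f32(f32::NAN);
+ let b = vdup_n_f32(3.0);
+ // every lane holds maxNum(NaN, 3.0) == 3.0
+ let r: float32x2_t = vmaxnm_f32(a, b);
+ assert_eq!(transmute::<float32x2_t, [f32; 2]>(r), [3.0; 2]);
+}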
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i8")]
+ fn vmin_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+ vmin_s8_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v16i8")]
+ fn vminq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+ vminq_s8_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i16")]
+ fn vmin_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+ vmin_s16_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i16")]
+ fn vminq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+ vminq_s16_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v2i32")]
+ fn vmin_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+ vmin_s32_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i32")]
+ fn vminq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+ vminq_s32_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i8")]
+ fn vmin_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ }
+ vmin_u8_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v16i8")]
+ fn vminq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+ }
+ vminq_u8_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i16")]
+ fn vmin_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ }
+ vmin_u16_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i16")]
+ fn vminq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+ }
+ vminq_u16_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v2i32")]
+ fn vmin_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ }
+ vmin_u32_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i32")]
+ fn vminq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+ }
+ vminq_u32_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v2f32")]
+ fn vmin_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vmin_f32_(a, b)
+}
+
+/// Minimum (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v4f32")]
+ fn vminq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vminq_f32_(a, b)
+}
+
+/// Floating-point Minimum Number (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v2f32")]
+ fn vminnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vminnm_f32_(a, b)
+}
+
+/// Floating-point Minimum Number (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v4f32")]
+ fn vminnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vminnmq_f32_(a, b)
+}
+
+/// Floating-point add pairwise
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(faddp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vpadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.faddp.v2f32")]
+ fn vpadd_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vpadd_f32_(a, b)
+}
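+
+// Editor's note: pairwise add sums adjacent lanes across the concatenation of
+// `a` and `b`, producing [a0 + a1, b0 + b1]. Illustrative sketch
+// (hypothetical helper, not emitted by the generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vpadd_f32() {
+ let a = transmute::<[f32; 2], float32x2_t>([1.0, 2.0]);
+ let b = transmute::<[f32; 2], float32x2_t>([10.0, 20.0]);
+ let r: float32x2_t = vpadd_f32(a, b);
+ assert_eq!(transmute::<float32x2_t, [f32; 2]>(r), [3.0, 30.0]);
+}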
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v4i32")]
+ fn vqdmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t;
+ }
+ vqdmull_s16_(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v2i64")]
+ fn vqdmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t;
+ }
+ vqdmull_s32_(a, b)
+}
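+
+// Editor's note: `vqdmull` computes 2 * a * b in widened lanes; the doubling
+// means the single case i16::MIN * i16::MIN still overflows and saturates.
+// Illustrative sketch (hypothetical helper, not emitted by the generator):
+#[cfg(test)]
+#[allow(dead_code)]
+unsafe fn demo_vqdmull_s16() {
+ let a = vdup_n_s16(i16::MIN);
+ let b = vdup_n_s16(i16::MIN);
+ // 2 * (-32768) * (-32768) == 2^31 saturates to i32::MAX
+ let r: int32x4_t = vqdmull_s16(a, b);
+ assert_eq!(transmute::<int32x4_t, [i32; 4]>(r), [i32::MAX; 4]);
+}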
+
+/// Vector saturating doubling long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t {
+ vqdmull_s16(a, vdup_n_s16(b))
+}
+
+/// Vector saturating doubling long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t {
+ vqdmull_s32(a, vdup_n_s32(b))
+}
+
+/// Vector saturating doubling long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ let b: int16x4_t = simd_shuffle4!(b, b, <const N: i32> [N as u32, N as u32, N as u32, N as u32]);
+ vqdmull_s16(a, b)
+}
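+
+// Illustrative sketch of the lane selection (values are not from the source;
+// assumes a little-endian target for the transmute lane order):
+//
+//     let a = vdup_n_s16(100);
+//     let b: int16x4_t = core::mem::transmute([1i16, 2, 3, 4]);
+//     // N = 2 broadcasts b[2] = 3, so every lane is 2 * 100 * 3 = 600.
+//     let r = vqdmull_lane_s16::<2>(a, b);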
+
+/// Vector saturating doubling long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmull_lane_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ let b: int32x2_t = simd_shuffle2!(b, b, <const N: i32> [N as u32, N as u32]);
+ vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ vqaddq_s32(a, vqdmull_s16(b, c))
+}
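+
+// Illustrative sketch: the widened doubling product is added to the
+// accumulator with saturation (via vqaddq_s32), so overflow clamps:
+//
+//     let acc = vdupq_n_s32(i32::MAX);
+//     let r = vqdmlal_s16(acc, vdup_n_s16(1), vdup_n_s16(1));
+//     // Each lane stays at i32::MAX rather than wrapping past it.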
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ vqaddq_s64(a, vqdmull_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+ vqaddq_s32(a, vqdmull_n_s16(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+ vqaddq_s64(a, vqdmull_n_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 2))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlal_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ vqaddq_s32(a, vqdmull_lane_s16::<N>(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlal_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ vqaddq_s64(a, vqdmull_lane_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ vqsubq_s32(a, vqdmull_s16(b, c))
+}
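+
+// Illustrative sketch: the mirror image of vqdmlal_s16, subtracting the
+// doubling product with saturation (via vqsubq_s32):
+//
+//     let acc = vdupq_n_s32(10);
+//     // Each lane: 10 - 2 * 2 * 3 = -2.
+//     let r = vqdmlsl_s16(acc, vdup_n_s16(2), vdup_n_s16(3));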
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ vqsubq_s64(a, vqdmull_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+ vqsubq_s32(a, vqdmull_n_s16(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+ vqsubq_s64(a, vqdmull_n_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 2))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlsl_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ static_assert_imm2!(N);
+ vqsubq_s32(a, vqdmull_lane_s16::<N>(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 1))]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmlsl_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ static_assert_imm1!(N);
+ vqsubq_s64(a, vqdmull_lane_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i16")]
+ fn vqdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+    vqdmulh_s16_(a, b)
+}
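+
+// Illustrative sketch: this is the classic Q15 fixed-point multiply, keeping
+// the high half of the doubled product:
+//
+//     let a = vdup_n_s16(16384); // 0.5 in Q15
+//     let b = vdup_n_s16(16384); // 0.5 in Q15
+//     // (2 * 16384 * 16384) >> 16 = 8192, i.e. 0.25 in Q15.
+//     let r = vqdmulh_s16(a, b);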
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v8i16")]
+ fn vqdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+    vqdmulhq_s16_(a, b)
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v2i32")]
+ fn vqdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+    vqdmulh_s32_(a, b)
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i32")]
+ fn vqdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+    vqdmulhq_s32_(a, b)
+}
+
+/// Vector saturating doubling multiply high with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t {
+ let b: int16x4_t = vdup_n_s16(b);
+ vqdmulh_s16(a, b)
+}
+
+/// Vector saturating doubling multiply high with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t {
+ let b: int32x2_t = vdup_n_s32(b);
+ vqdmulh_s32(a, b)
+}
+
+/// Vector saturating doubling multiply high with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t {
+ let b: int16x8_t = vdupq_n_s16(b);
+ vqdmulhq_s16(a, b)
+}
+
+/// Vector saturating doubling multiply high with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t {
+ let b: int32x4_t = vdupq_n_s32(b);
+ vqdmulhq_s32(a, b)
+}
+
+/// Vector saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulhq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ vqdmulhq_s16(a, vdupq_n_s16(simd_extract(b, LANE as u32)))
+}
+
+/// Vector saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulh_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(LANE);
+ vqdmulh_s16(a, vdup_n_s16(simd_extract(b, LANE as u32)))
+}
+
+/// Vector saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulhq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ vqdmulhq_s32(a, vdupq_n_s32(simd_extract(b, LANE as u32)))
+}
+
+/// Vector saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqdmulh_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ vqdmulh_s32(a, vdup_n_s32(simd_extract(b, LANE as u32)))
+}
+
+/// Signed saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovn_s16(a: int16x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtn.v8i8")]
+ fn vqmovn_s16_(a: int16x8_t) -> int8x8_t;
+ }
+    vqmovn_s16_(a)
+}
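+
+// Illustrative sketch: each i16 lane is narrowed to i8 with saturation
+// rather than truncation:
+//
+//     let a = vdupq_n_s16(300);
+//     // 300 does not fit in i8, so every narrowed lane clamps to 127.
+//     let r: int8x8_t = vqmovn_s16(a);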
+
+/// Signed saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovn_s32(a: int32x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtn.v4i16")]
+ fn vqmovn_s32_(a: int32x4_t) -> int16x4_t;
+ }
+    vqmovn_s32_(a)
+}
+
+/// Signed saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovn_s64(a: int64x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovns.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtn.v2i32")]
+ fn vqmovn_s64_(a: int64x2_t) -> int32x2_t;
+ }
+    vqmovn_s64_(a)
+}
+
+/// Unsigned saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovn_u16(a: uint16x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v8i8")]
+ fn vqmovn_u16_(a: uint16x8_t) -> uint8x8_t;
+ }
+    vqmovn_u16_(a)
+}
+
+/// Unsigned saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovn_u32(a: uint32x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v4i16")]
+ fn vqmovn_u32_(a: uint32x4_t) -> uint16x4_t;
+ }
+    vqmovn_u32_(a)
+}
+
+/// Unsigned saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqxtn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v2i32")]
+ fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t;
+ }
+    vqmovn_u64_(a)
+}
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovun_s16(a: int16x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v8i8")]
+ fn vqmovun_s16_(a: int16x8_t) -> uint8x8_t;
+ }
+    vqmovun_s16_(a)
+}
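+
+// Illustrative sketch: the signed-to-unsigned narrow clamps at both ends of
+// the u8 range:
+//
+//     let a = vdupq_n_s16(-5);
+//     // Negative lanes clamp to 0; lanes above 255 would clamp to 255.
+//     let r: uint8x8_t = vqmovun_s16(a);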
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovun_s32(a: int32x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v4i16")]
+ fn vqmovun_s32_(a: int32x4_t) -> uint16x4_t;
+ }
+    vqmovun_s32_(a)
+}
+
+/// Signed saturating extract unsigned narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqmovun))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqxtun))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqmovun_s64(a: int64x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnsu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqxtun.v2i32")]
+ fn vqmovun_s64_(a: int64x2_t) -> uint32x2_t;
+ }
+    vqmovun_s64_(a)
+}
+
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v4i16")]
+ fn vqrdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+    vqrdmulh_s16_(a, b)
+}
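+
+// Illustrative sketch: like vqdmulh_s16 but with round-to-nearest, i.e. each
+// lane is saturate((2 * a[i] * b[i] + (1 << 15)) >> 16):
+//
+//     let a = vdup_n_s16(16384);
+//     let b = vdup_n_s16(3);
+//     // (2 * 16384 * 3 + 32768) >> 16 = 2, where the non-rounding
+//     // vqdmulh_s16 would give 1.
+//     let r = vqrdmulh_s16(a, b);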
+
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v8i16")]
+ fn vqrdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+    vqrdmulhq_s16_(a, b)
+}
+
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v2i32")]
+ fn vqrdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+    vqrdmulh_s32_(a, b)
+}
+
+/// Signed saturating rounding doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrdmulh.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmulh.v4i32")]
+ fn vqrdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+    vqrdmulhq_s32_(a, b)
+}
+
+/// Vector saturating rounding doubling multiply high with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t {
+ vqrdmulh_s16(a, vdup_n_s16(b))
+}
+
+/// Vector saturating rounding doubling multiply high with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t {
+ vqrdmulhq_s16(a, vdupq_n_s16(b))
+}
+
+/// Vector saturating rounding doubling multiply high with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t {
+ vqrdmulh_s32(a, vdup_n_s32(b))
+}
+
+/// Vector saturating rounding doubling multiply high with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t {
+ vqrdmulhq_s32(a, vdupq_n_s32(b))
+}
+
+/// Vector rounding saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulh_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ let b: int16x4_t = simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulh_s16(a, b)
+}
+
+/// Vector rounding saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulh_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
+ static_assert_imm3!(LANE);
+ let b: int16x4_t = simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulh_s16(a, b)
+}
+
+/// Vector rounding saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulhq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
+ static_assert_imm2!(LANE);
+ let b: int16x8_t = simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulhq_s16(a, b)
+}
+
+/// Vector rounding saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulhq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ let b: int16x8_t = simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulhq_s16(a, b)
+}
+
+/// Vector rounding saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulh_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ let b: int32x2_t = simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]);
+ vqrdmulh_s32(a, b)
+}
+
+/// Vector rounding saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulh_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
+ static_assert_imm2!(LANE);
+ let b: int32x2_t = simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]);
+ vqrdmulh_s32(a, b)
+}
+
+/// Vector rounding saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulhq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
+ static_assert_imm1!(LANE);
+ let b: int32x4_t = simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulhq_s32(a, b)
+}
+
+/// Vector rounding saturating doubling multiply high by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrdmulhq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ let b: int32x4_t = simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+ vqrdmulhq_s32(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i8")]
+ fn vqrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+    vqrshl_s8_(a, b)
+}
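+
+// Illustrative sketch: the shift count is per-lane and signed; a positive
+// count shifts left with saturation, a negative count becomes a rounding
+// right shift:
+//
+//     let a = vdup_n_s8(100);
+//     // 100 << 1 = 200 overflows i8, so each lane saturates to 127.
+//     let r = vqrshl_s8(a, vdup_n_s8(1));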
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v16i8")]
+ fn vqrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+    vqrshlq_s8_(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i16")]
+ fn vqrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+    vqrshl_s16_(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i16")]
+ fn vqrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+    vqrshlq_s16_(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i32")]
+ fn vqrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+    vqrshl_s32_(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i32")]
+ fn vqrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+    vqrshlq_s32_(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v1i64")]
+ fn vqrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+ }
+    vqrshl_s64_(a, b)
+}
+
+/// Signed saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i64")]
+ fn vqrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+ }
+    vqrshlq_s64_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i8")]
+ fn vqrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+ }
+    vqrshl_u8_(a, b)
+}
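+
+// Illustrative sketch ("unsigned" refers to the data; the per-lane shift
+// counts are still signed):
+//
+//     let a = vdup_n_u8(200);
+//     // 200 << 1 = 400 exceeds u8::MAX, so each lane saturates to 255.
+//     let r = vqrshl_u8(a, vdup_n_s8(1));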
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v16i8")]
+ fn vqrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+ }
+    vqrshlq_u8_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i16")]
+ fn vqrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+ }
+    vqrshl_u16_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i16")]
+ fn vqrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+ }
+    vqrshlq_u16_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i32")]
+ fn vqrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+ }
+    vqrshl_u32_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i32")]
+ fn vqrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
+ }
+    vqrshlq_u32_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v1i64")]
+ fn vqrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+ }
+    vqrshl_u64_(a, b)
+}
+
+/// Unsigned saturating rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i64")]
+ fn vqrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
+ }
+    vqrshlq_u64_(a, b)
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v8i8")]
+ fn vqrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
+ }
+    vqrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v8i8")]
+ fn vqrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
+ }
+    vqrshrn_n_s16_(a, N)
+}
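+
+// Illustrative sketch: shift right by N with round-to-nearest (add
+// 1 << (N - 1) before shifting), then narrow with saturation:
+//
+//     let a = vdupq_n_s16(7);
+//     // (7 + 2) >> 2 = 2, where a plain truncating shift would give 1.
+//     let r: int8x8_t = vqrshrn_n_s16::<2>(a);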
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v4i16")]
+ fn vqrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
+ }
+    vqrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v4i16")]
+ fn vqrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t;
+ }
+    vqrshrn_n_s32_(a, N)
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v2i32")]
+ fn vqrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
+ }
+    vqrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+}
+
+/// Signed saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v2i32")]
+ fn vqrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t;
+ }
+    vqrshrn_n_s64_(a, N)
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v8i8")]
+ fn vqrshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t;
+ }
+    vqrshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16))
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v8i8")]
+ fn vqrshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t;
+ }
+    vqrshrn_n_u16_(a, N)
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v4i16")]
+ fn vqrshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t;
+ }
+    vqrshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32))
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v4i16")]
+ fn vqrshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t;
+ }
+    vqrshrn_n_u32_(a, N)
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v2i32")]
+ fn vqrshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t;
+ }
+    vqrshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64))
+}
+
+/// Unsigned saturating rounded shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v2i32")]
+ fn vqrshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t;
+ }
+    vqrshrn_n_u64_(a, N)
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v8i8")]
+ fn vqrshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t;
+ }
+    vqrshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v8i8")]
+ fn vqrshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t;
+ }
+    vqrshrun_n_s16_(a, N)
+}
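+
+// Illustrative sketch: the same rounding shift as vqrshrn_n_s16, but the
+// result is narrowed to unsigned, so negative inputs clamp to 0:
+//
+//     let a = vdupq_n_s16(-100);
+//     let r: uint8x8_t = vqrshrun_n_s16::<2>(a); // every lane is 0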
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v4i16")]
+ fn vqrshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t;
+ }
+    vqrshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v4i16")]
+ fn vqrshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t;
+ }
+    vqrshrun_n_s32_(a, N)
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v2i32")]
+ fn vqrshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t;
+ }
+    vqrshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+}
+
+/// Signed saturating rounded shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqrshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v2i32")]
+ fn vqrshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t;
+ }
+ vqrshrun_n_s64_(a, N)
+}
+
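+// A minimal usage sketch of the rounded narrowing behaviour: each lane is
+// shifted right by `N` with round-to-nearest, then saturated into the
+// narrower unsigned range. It assumes the crate-internal tuple constructors
+// used above; the `vqrshrun_sketch` name is illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqrshrun_sketch() {
+    let a = int16x8_t(-1, 6, 1030, 32767, 0, 1, 2, 3);
+    // With N = 2 each lane becomes saturate_u8((x + 2) >> 2):
+    // -1 -> 0 (negative saturates to zero), 6 -> 2 (1.5 rounds up),
+    // 1030 -> 258 -> 255 (saturates), 32767 -> 255.
+    let r: [u8; 8] = transmute(vqrshrun_n_s16::<2>(a));
+    assert_eq!(r, [0, 2, 255, 255, 0, 0, 1, 1]);
+}
+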
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i8")]
+ fn vqshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+ vqshl_s8_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v16i8")]
+ fn vqshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+ vqshlq_s8_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i16")]
+ fn vqshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+ vqshl_s16_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i16")]
+ fn vqshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+ vqshlq_s16_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i32")]
+ fn vqshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+ vqshl_s32_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i32")]
+ fn vqshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+ vqshlq_s32_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v1i64")]
+ fn vqshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+ }
+ vqshl_s64_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i64")]
+ fn vqshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+ }
+ vqshlq_s64_(a, b)
+}
+
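+// A small illustrative sketch: `vqshl` takes a per-lane *signed* shift
+// count, so negative lanes in `b` shift right, and overflow saturates
+// instead of wrapping. The `vqshl_sketch` name is illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqshl_sketch() {
+    let a = int8x8_t(1, 100, -100, 16, 1, 1, 1, 1);
+    let b = int8x8_t(2, 2, 2, -2, 0, 0, 0, 0);
+    // 1 << 2 = 4; 100 << 2 saturates to 127; -100 << 2 saturates to -128;
+    // 16 >> 2 = 4 (a negative count shifts right).
+    let r: [i8; 8] = transmute(vqshl_s8(a, b));
+    assert_eq!(r, [4, 127, -128, 4, 1, 1, 1, 1]);
+}
+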
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i8")]
+ fn vqshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+ }
+ vqshl_u8_(a, b)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v16i8")]
+ fn vqshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+ }
+ vqshlq_u8_(a, b)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i16")]
+ fn vqshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+ }
+ vqshl_u16_(a, b)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i16")]
+ fn vqshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+ }
+ vqshlq_u16_(a, b)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i32")]
+ fn vqshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+ }
+ vqshl_u32_(a, b)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i32")]
+ fn vqshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
+ }
+ vqshlq_u32_(a, b)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v1i64")]
+ fn vqshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+ }
+ vqshl_u64_(a, b)
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i64")]
+ fn vqshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
+ }
+ vqshlq_u64_(a, b)
+}
+
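+// Illustrative sketch: the shift-count vector stays signed even for
+// unsigned data, which is why `vqshl_u8` takes `int8x8_t` for `b`, and
+// saturation clamps to the unsigned maximum. Names are illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqshl_u_sketch() {
+    let a = uint8x8_t(1, 200, 255, 8, 0, 0, 0, 0);
+    let b = int8x8_t(2, 2, 1, -3, 0, 0, 0, 0);
+    // 1 << 2 = 4; 200 << 2 and 255 << 1 saturate to 255; 8 >> 3 = 1.
+    let r: [u8; 8] = transmute(vqshl_u8(a, b));
+    assert_eq!(r, [4, 255, 255, 1, 0, 0, 0, 0]);
+}
+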
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(N);
+ vqshl_s8(a, vdup_n_s8(N as _))
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+ static_assert_imm3!(N);
+ vqshlq_s8(a, vdupq_n_s8(N as _))
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+ static_assert_imm4!(N);
+ vqshl_s16(a, vdup_n_s16(N as _))
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+ static_assert_imm4!(N);
+ vqshlq_s16(a, vdupq_n_s16(N as _))
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+ static_assert_imm5!(N);
+ vqshl_s32(a, vdup_n_s32(N as _))
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+ static_assert_imm5!(N);
+ vqshlq_s32(a, vdupq_n_s32(N as _))
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+ static_assert_imm6!(N);
+ vqshl_s64(a, vdup_n_s64(N as _))
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+ static_assert_imm6!(N);
+ vqshlq_s64(a, vdupq_n_s64(N as _))
+}
+
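+// Illustrative sketch: the `_n` forms splat the immediate into a vector and
+// reuse the register-shift intrinsics above, so the same saturation rules
+// apply with a compile-time count. The name below is illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqshl_n_sketch() {
+    let a = int8x8_t(1, 50, -50, 0, 0, 0, 0, 0);
+    // 1 << 2 = 4; 50 << 2 = 200 saturates to 127; -50 << 2 saturates to -128.
+    let r: [i8; 8] = transmute(vqshl_n_s8::<2>(a));
+    assert_eq!(r, [4, 127, -128, 0, 0, 0, 0, 0]);
+}
+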
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ vqshl_u8(a, vdup_n_s8(N as _))
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+ static_assert_imm3!(N);
+ vqshlq_u8(a, vdupq_n_s8(N as _))
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
+ static_assert_imm4!(N);
+ vqshl_u16(a, vdup_n_s16(N as _))
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
+ static_assert_imm4!(N);
+ vqshlq_u16(a, vdupq_n_s16(N as _))
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
+ static_assert_imm5!(N);
+ vqshl_u32(a, vdup_n_s32(N as _))
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+ static_assert_imm5!(N);
+ vqshlq_u32(a, vdupq_n_s32(N as _))
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshl_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
+ static_assert_imm6!(N);
+ vqshl_u64(a, vdup_n_s64(N as _))
+}
+
+/// Unsigned saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqshlq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+ static_assert_imm6!(N);
+ vqshlq_u64(a, vdupq_n_s64(N as _))
+}
+
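+// Illustrative sketch: `N` is checked at compile time by the
+// `static_assert_imm*` macros, so an out-of-range count is rejected before
+// codegen; only the data saturates at run time. Name is illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqshl_n_u_sketch() {
+    let a = uint8x8_t(3, 128, 0, 0, 0, 0, 0, 0);
+    // 3 << 1 = 6; 128 << 1 = 256 saturates to 255.
+    let r: [u8; 8] = transmute(vqshl_n_u8::<1>(a));
+    assert_eq!(r, [6, 255, 0, 0, 0, 0, 0, 0]);
+}
+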
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlu_n_s8<const N: i32>(a: int8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v8i8")]
+ fn vqshlu_n_s8_(a: int8x8_t, n: int8x8_t) -> uint8x8_t;
+ }
+ vqshlu_n_s8_(a, int8x8_t(N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlu_n_s8<const N: i32>(a: int8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshlu.v8i8")]
+ fn vqshlu_n_s8_(a: int8x8_t, n: int8x8_t) -> uint8x8_t;
+ }
+ vqshlu_n_s8_(a, int8x8_t(N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlu_n_s16<const N: i32>(a: int16x4_t) -> uint16x4_t {
+ static_assert_imm4!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v4i16")]
+ fn vqshlu_n_s16_(a: int16x4_t, n: int16x4_t) -> uint16x4_t;
+ }
+ vqshlu_n_s16_(a, int16x4_t(N as i16, N as i16, N as i16, N as i16))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlu_n_s16<const N: i32>(a: int16x4_t) -> uint16x4_t {
+ static_assert_imm4!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshlu.v4i16")]
+ fn vqshlu_n_s16_(a: int16x4_t, n: int16x4_t) -> uint16x4_t;
+ }
+ vqshlu_n_s16_(a, int16x4_t(N as i16, N as i16, N as i16, N as i16))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlu_n_s32<const N: i32>(a: int32x2_t) -> uint32x2_t {
+ static_assert_imm5!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v2i32")]
+ fn vqshlu_n_s32_(a: int32x2_t, n: int32x2_t) -> uint32x2_t;
+ }
+ vqshlu_n_s32_(a, int32x2_t(N as i32, N as i32))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlu_n_s32<const N: i32>(a: int32x2_t) -> uint32x2_t {
+ static_assert_imm5!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshlu.v2i32")]
+ fn vqshlu_n_s32_(a: int32x2_t, n: int32x2_t) -> uint32x2_t;
+ }
+ vqshlu_n_s32_(a, int32x2_t(N as i32, N as i32))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlu_n_s64<const N: i32>(a: int64x1_t) -> uint64x1_t {
+ static_assert_imm6!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v1i64")]
+ fn vqshlu_n_s64_(a: int64x1_t, n: int64x1_t) -> uint64x1_t;
+ }
+ vqshlu_n_s64_(a, int64x1_t(N as i64))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshlu_n_s64<const N: i32>(a: int64x1_t) -> uint64x1_t {
+ static_assert_imm6!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshlu.v1i64")]
+ fn vqshlu_n_s64_(a: int64x1_t, n: int64x1_t) -> uint64x1_t;
+ }
+ vqshlu_n_s64_(a, int64x1_t(N as i64))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshluq_n_s8<const N: i32>(a: int8x16_t) -> uint8x16_t {
+ static_assert_imm3!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v16i8")]
+ fn vqshluq_n_s8_(a: int8x16_t, n: int8x16_t) -> uint8x16_t;
+ }
+ vqshluq_n_s8_(a, int8x16_t(N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshluq_n_s8<const N: i32>(a: int8x16_t) -> uint8x16_t {
+ static_assert_imm3!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshlu.v16i8")]
+ fn vqshluq_n_s8_(a: int8x16_t, n: int8x16_t) -> uint8x16_t;
+ }
+ vqshluq_n_s8_(a, int8x16_t(N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N as i8))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshluq_n_s16<const N: i32>(a: int16x8_t) -> uint16x8_t {
+ static_assert_imm4!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v8i16")]
+ fn vqshluq_n_s16_(a: int16x8_t, n: int16x8_t) -> uint16x8_t;
+ }
+ vqshluq_n_s16_(a, int16x8_t(N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshluq_n_s16<const N: i32>(a: int16x8_t) -> uint16x8_t {
+ static_assert_imm4!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshlu.v8i16")]
+ fn vqshluq_n_s16_(a: int16x8_t, n: int16x8_t) -> uint16x8_t;
+ }
+ vqshluq_n_s16_(a, int16x8_t(N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16, N as i16))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshluq_n_s32<const N: i32>(a: int32x4_t) -> uint32x4_t {
+ static_assert_imm5!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v4i32")]
+ fn vqshluq_n_s32_(a: int32x4_t, n: int32x4_t) -> uint32x4_t;
+ }
+ vqshluq_n_s32_(a, int32x4_t(N as i32, N as i32, N as i32, N as i32))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshluq_n_s32<const N: i32>(a: int32x4_t) -> uint32x4_t {
+ static_assert_imm5!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshlu.v4i32")]
+ fn vqshluq_n_s32_(a: int32x4_t, n: int32x4_t) -> uint32x4_t;
+ }
+ vqshluq_n_s32_(a, int32x4_t(N as i32, N as i32, N as i32, N as i32))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshluq_n_s64<const N: i32>(a: int64x2_t) -> uint64x2_t {
+ static_assert_imm6!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftsu.v2i64")]
+ fn vqshluq_n_s64_(a: int64x2_t, n: int64x2_t) -> uint64x2_t;
+ }
+ vqshluq_n_s64_(a, int64x2_t(N as i64, N as i64))
+}
+
+/// Signed saturating shift left unsigned
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshlu, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshluq_n_s64<const N: i32>(a: int64x2_t) -> uint64x2_t {
+ static_assert_imm6!(N);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshlu.v2i64")]
+ fn vqshluq_n_s64_(a: int64x2_t, n: int64x2_t) -> uint64x2_t;
+ }
+ vqshluq_n_s64_(a, int64x2_t(N as i64, N as i64))
+}
+
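+// Illustrative sketch: `vqshlu` reads signed lanes but produces an unsigned
+// result, so negative inputs clamp to zero and overflow clamps to the
+// unsigned maximum. The `vqshlu_sketch` name is illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqshlu_sketch() {
+    let a = int8x8_t(-5, 1, 100, 0, 0, 0, 0, 0);
+    // -5 -> 0 (clamped); 1 << 2 = 4; 100 << 2 = 400 saturates to 255.
+    let r: [u8; 8] = transmute(vqshlu_n_s8::<2>(a));
+    assert_eq!(r, [0, 4, 255, 0, 0, 0, 0, 0]);
+}
+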
+/// Signed saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v8i8")]
+ fn vqshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
+ }
+ vqshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v8i8")]
+ fn vqshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
+ }
+ vqshrn_n_s16_(a, N)
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v4i16")]
+ fn vqshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
+ }
+ vqshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v4i16")]
+ fn vqshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t;
+ }
+ vqshrn_n_s32_(a, N)
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v2i32")]
+ fn vqshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
+ }
+ vqshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+}
+
+/// Signed saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v2i32")]
+ fn vqshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t;
+ }
+ vqshrn_n_s64_(a, N)
+}
+
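+// Illustrative sketch: unlike the rounded `vqrshrn` family, `vqshrn`
+// performs a plain arithmetic right shift before narrowing, so 7 >> 2
+// yields 1 rather than the rounded 2. Name is illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqshrn_sketch() {
+    let a = int16x8_t(7, -7, 2000, -2000, 0, 0, 0, 0);
+    // 7 >> 2 = 1; -7 >> 2 = -2 (arithmetic shift rounds toward -inf);
+    // 2000 >> 2 = 500 saturates to 127; -2000 >> 2 saturates to -128.
+    let r: [i8; 8] = transmute(vqshrn_n_s16::<2>(a));
+    assert_eq!(r, [1, -2, 127, -128, 0, 0, 0, 0]);
+}
+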
+/// Unsigned saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v8i8")]
+ fn vqshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t;
+ }
+ vqshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16))
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v8i8")]
+ fn vqshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t;
+ }
+ vqshrn_n_u16_(a, N)
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v4i16")]
+ fn vqshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t;
+ }
+ vqshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32))
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v4i16")]
+ fn vqshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t;
+ }
+ vqshrn_n_u32_(a, N)
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v2i32")]
+ fn vqshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t;
+ }
+ vqshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64))
+}
+
+/// Unsigned saturating shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v2i32")]
+ fn vqshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t;
+ }
+ vqshrn_n_u64_(a, N)
+}
+
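+// Illustrative sketch: the unsigned variant shifts in zeros and saturates
+// against the narrower unsigned maximum. Name is illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqshrn_u_sketch() {
+    let a = uint16x8_t(300, 1024, 3, 0, 0, 0, 0, 0);
+    // 300 >> 2 = 75; 1024 >> 2 = 256 saturates to 255; 3 >> 2 = 0.
+    let r: [u8; 8] = transmute(vqshrn_n_u16::<2>(a));
+    assert_eq!(r, [75, 255, 0, 0, 0, 0, 0, 0]);
+}
+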
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v8i8")]
+ fn vqshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t;
+ }
+ vqshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v8i8")]
+ fn vqshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t;
+ }
+ vqshrun_n_s16_(a, N)
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v4i16")]
+ fn vqshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t;
+ }
+ vqshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v4i16")]
+ fn vqshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t;
+ }
+ vqshrun_n_s32_(a, N)
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v2i32")]
+ fn vqshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t;
+ }
+ vqshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+}
+
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v2i32")]
+ fn vqshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t;
+ }
+ vqshrun_n_s64_(a, N)
+}
+
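+// Illustrative sketch: `vqshrun` narrows *signed* input to an *unsigned*
+// result, so negative lanes clamp to zero on the way down. Name is
+// illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vqshrun_sketch() {
+    let a = int16x8_t(-40, 40, 4000, 0, 0, 0, 0, 0);
+    // -40 -> 0 (clamped); 40 >> 2 = 10; 4000 >> 2 = 1000 saturates to 255.
+    let r: [u8; 8] = transmute(vqshrun_n_s16::<2>(a));
+    assert_eq!(r, [0, 10, 255, 0, 0, 0, 0, 0]);
+}
+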
+/// Reciprocal square root estimate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
+ fn vrsqrte_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrsqrte_f32_(a)
+}
+
+/// Reciprocal square root estimate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v4f32")]
+ fn vrsqrteq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrsqrteq_f32_(a)
+}
+
+/// Unsigned reciprocal square root estimate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursqrte))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsqrte_u32(a: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ursqrte.v2i32")]
+ fn vrsqrte_u32_(a: uint32x2_t) -> uint32x2_t;
+ }
+ vrsqrte_u32_(a)
+}
+
+/// Unsigned reciprocal square root estimate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursqrte))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsqrteq_u32(a: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ursqrte.v4i32")]
+ fn vrsqrteq_u32_(a: uint32x4_t) -> uint32x4_t;
+ }
+ vrsqrteq_u32_(a)
+}
+
+/// Floating-point reciprocal square root step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrts))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrts))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsqrts_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrts.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrts.v2f32")]
+ fn vrsqrts_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vrsqrts_f32_(a, b)
+}
+
+/// Floating-point reciprocal square root step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrts))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrts))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsqrtsq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrts.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrts.v4f32")]
+ fn vrsqrtsq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vrsqrtsq_f32_(a, b)
+}
+
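+// Illustrative sketch: the estimate alone is only accurate to roughly 8
+// bits, so it is typically refined with `vrsqrts`, which computes
+// (3 - a*b)/2 per lane. One Newton-Raphson step, using `vmul_f32` from this
+// module, looks like this (the function name is illustrative only):
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn rsqrt_refine_sketch(x: float32x2_t) -> float32x2_t {
+    let e = vrsqrte_f32(x);
+    // e' = e * (3 - x*e*e)/2 converges toward 1/sqrt(x).
+    vmul_f32(e, vrsqrts_f32(vmul_f32(x, e), e))
+}
+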
+/// Reciprocal estimate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrecpe_f32(a: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v2f32")]
+ fn vrecpe_f32_(a: float32x2_t) -> float32x2_t;
+ }
+ vrecpe_f32_(a)
+}
+
+/// Reciprocal estimate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrecpeq_f32(a: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v4f32")]
+ fn vrecpeq_f32_(a: float32x4_t) -> float32x4_t;
+ }
+ vrecpeq_f32_(a)
+}
+
+/// Unsigned reciprocal estimate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urecpe))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrecpe_u32(a: uint32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urecpe.v2i32")]
+ fn vrecpe_u32_(a: uint32x2_t) -> uint32x2_t;
+ }
+ vrecpe_u32_(a)
+}
+
+/// Unsigned reciprocal estimate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urecpe))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrecpeq_u32(a: uint32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urecpe.v4i32")]
+ fn vrecpeq_u32_(a: uint32x4_t) -> uint32x4_t;
+ }
+ vrecpeq_u32_(a)
+}
+
+/// Floating-point reciprocal step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecps))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecps))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrecps_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecps.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecps.v2f32")]
+ fn vrecps_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+ }
+ vrecps_f32_(a, b)
+}
+
+/// Floating-point reciprocal step
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecps))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecps))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrecpsq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecps.v4f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecps.v4f32")]
+ fn vrecpsq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+ }
+ vrecpsq_f32_(a, b)
+}
+
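+// Illustrative sketch: `vrecps` computes 2 - a*b per lane, so one
+// Newton-Raphson refinement of the reciprocal estimate is a single
+// multiply-and-step (the function name is illustrative only):
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn recip_refine_sketch(x: float32x2_t) -> float32x2_t {
+    let e = vrecpe_f32(x);
+    // e' = e * (2 - x*e) converges toward 1/x.
+    vmul_f32(e, vrecps_f32(x, e))
+}
+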
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_u8(a: uint8x8_t) -> int8x8_t {
+ transmute(a)
+}
+
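+// Illustrative sketch: every `vreinterpret*` is a pure bit-level cast; no
+// instruction is emitted (hence the `nop` assertion above), so the byte
+// pattern is preserved exactly. Name is illustrative only.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[target_feature(enable = "neon")]
+unsafe fn vreinterpret_sketch() {
+    let a = uint8x8_t(0, 1, 2, 255, 4, 5, 6, 7);
+    // 255u8 reinterprets as -1i8; all other lanes keep their values.
+    let r: [i8; 8] = transmute(vreinterpret_s8_u8(a));
+    assert_eq!(r, [0, 1, 2, -1, 4, 5, 6, 7]);
+}
+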
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_p8(a: poly8x8_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_p16(a: poly16x4_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_u16(a: uint16x4_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_u32(a: uint32x2_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_u64(a: uint64x1_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_u8(a: uint8x16_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_p8(a: poly8x16_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_p16(a: poly16x8_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_u16(a: uint16x8_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_u32(a: uint32x4_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_u64(a: uint64x2_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_p8(a: poly8x8_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_s8(a: int8x8_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_p16(a: poly16x4_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_s16(a: int16x4_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_s32(a: int32x2_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_s64(a: int64x1_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_p8(a: poly8x16_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_s8(a: int8x16_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_p16(a: poly16x8_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_s16(a: int16x8_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_s64(a: int64x2_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_s8(a: int8x8_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_u8(a: uint8x8_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_u16(a: uint16x4_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_s8(a: int8x16_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_u8(a: uint8x16_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_s16(a: int16x8_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_u16(a: uint16x8_t) -> poly16x8_t {
+ transmute(a)
+}
+
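+// The polynomial-typed reinterprets above are how ordinary integer data is
+// handed to the carry-less (GF(2)) arithmetic intrinsics. A sketch
+// (assumptions: AArch64 with NEON; `vdup_n_u8` and `vmul_p8` are the usual
+// broadcast and polynomial-multiply intrinsics):
+//
+//     let a: poly8x8_t = vreinterpret_p8_u8(vdup_n_u8(0b0011));
+//     let b: poly8x8_t = vreinterpret_p8_u8(vdup_n_u8(0b0101));
+//     // Carry-less multiply: (x + 1) * (x^2 + 1) = x^3 + x^2 + x + 1,
+//     // so every lane of the product is 0b1111.
+//     let c: poly8x8_t = vmul_p8(a, b);
+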
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_s16(a: int16x4_t) -> int8x8_t {
+ transmute(a)
+}
+
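+// Casts like `vreinterpret_s8_s16` above change the lane width. The vector's
+// bit pattern is preserved wholesale, so the lane values that come out depend
+// on byte order. A sketch (assumptions: a little-endian target, the common
+// case for Rust's ARM and AArch64 targets; `vdup_n_s16` is the usual
+// broadcast intrinsic):
+//
+//     let wide: int16x4_t = vdup_n_s16(0x0102);
+//     let narrow: int8x8_t = vreinterpret_s8_s16(wide);
+//     // Little-endian: each 16-bit lane 0x0102 is stored as the byte pair
+//     // [0x02, 0x01], so `narrow` holds [2, 1, 2, 1, 2, 1, 2, 1].
+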
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_u16(a: uint16x4_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_p16(a: poly16x4_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_s32(a: int32x2_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_u32(a: uint32x2_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_s64(a: int64x1_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_s16(a: int16x8_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_u16(a: uint16x8_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_p16(a: poly16x8_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_s32(a: int32x4_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_u32(a: uint32x4_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_s64(a: int64x2_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_u64(a: uint64x2_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_p16(a: poly16x4_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_s16(a: int16x4_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_u16(a: uint16x4_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_s32(a: int32x2_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_u32(a: uint32x2_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_s64(a: int64x1_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_u64(a: uint64x1_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_p16(a: poly16x8_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_u16(a: uint16x8_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_s32(a: int32x4_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_u32(a: uint32x4_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_s64(a: int64x2_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_u64(a: uint64x2_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_p16(a: poly16x4_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_s16(a: int16x4_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_u16(a: uint16x4_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_s32(a: int32x2_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_u32(a: uint32x2_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_p16(a: poly16x8_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_s16(a: int16x8_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_u16(a: uint16x8_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_s32(a: int32x4_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_u32(a: uint32x4_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_p64(a: poly64x1_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_p64(a: poly64x1_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_p64(a: poly64x2_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_p64(a: poly64x2_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_p128(a: p128) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_p128(a: p128) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_p128(a: p128) -> poly64x2_t {
+ transmute(a)
+}
+
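+// The poly64/poly128 reinterprets above additionally require the `aes`
+// feature (on ARM, `aes,v8`), since those types arrive with the PMULL
+// carry-less-multiply extension. A sketch (assumptions: AArch64 with NEON and
+// AES; `p128` is this crate's 128-bit polynomial scalar, an alias of `u128`):
+//
+//     let wide: p128 = 1u128 << 64;
+//     let pair: poly64x2_t = vreinterpretq_p64_p128(wide);
+//     // Little-endian: lane 0 holds 0, lane 1 holds 1.
+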
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_p8(a: poly8x8_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_s8(a: int8x8_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_u8(a: uint8x8_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_p16(a: poly16x4_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_s16(a: int16x4_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_u16(a: uint16x4_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_s32(a: int32x2_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_u32(a: uint32x2_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_p8(a: poly8x16_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_s8(a: int8x16_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_u8(a: uint8x16_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_p16(a: poly16x8_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_s16(a: int16x8_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_u16(a: uint16x8_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_s32(a: int32x4_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_u32(a: uint32x4_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_p8(a: poly8x8_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_s8(a: int8x8_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_u8(a: uint8x8_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_p16(a: poly16x4_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p64_s32(a: int32x2_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p64_u32(a: uint32x2_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_s32(a: int32x4_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_u32(a: uint32x4_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_s64(a: int64x2_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_u64(a: uint64x2_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_p64(a: poly64x2_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_p64(a: poly64x1_t) -> int16x4_t {
+ transmute(a)
+}
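+
+// NOTE (editorial): casts involving `poly64x*_t` and `p128`, such as the one
+// above, are gated on `neon,aes` (and `aes,v8` on 32-bit ARM) instead of
+// plain `neon`, presumably because the 64-bit and 128-bit polynomial types
+// are tied to the AES/crypto extension in this crate. The operation itself is
+// still the same no-op `transmute`.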
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_p64(a: poly64x1_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_p64(a: poly64x1_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_p64(a: poly64x2_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_p64(a: poly64x2_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_p64(a: poly64x2_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_p128(a: p128) -> int32x4_t {
+ transmute(a)
+}
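+
+// NOTE (editorial): `p128` is the scalar 128-bit polynomial type (a `u128`
+// alias in this crate), so the `*_p128` casts move between a scalar bit
+// pattern and a 128-bit vector view of the same bits. A minimal sketch,
+// assuming `neon` and `aes` are available:
+//
+//     let bits: p128 = 0x0001_0002_0003_0004_0005_0006_0007_0008;
+//     let v: int32x4_t = vreinterpretq_s32_p128(bits); // same bits as 4 lanes
+//                                                      // (lane order follows
+//                                                      // target endianness)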
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_p128(a: p128) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p64_p16(a: poly16x4_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p64_s16(a: int16x4_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p64_u16(a: uint16x4_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_p16(a: poly16x8_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_s16(a: int16x8_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_u16(a: uint16x8_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_s32(a: int32x4_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_u32(a: uint32x4_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_p64(a: poly64x1_t) -> int8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_p64(a: poly64x1_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_p64(a: poly64x1_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_p64(a: poly64x2_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_p64(a: poly64x2_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_p64(a: poly64x2_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_p128(a: p128) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_p128(a: p128) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_p128(a: p128) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p64_p8(a: poly8x8_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p64_s8(a: int8x8_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p64_u8(a: uint8x8_t) -> poly64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_p8(a: poly8x16_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_s8(a: int8x16_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_s16(a: int16x8_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_u16(a: uint16x8_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_p16(a: poly16x8_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_s8(a: int8x16_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_u8(a: uint8x16_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_p8(a: poly8x16_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_p128(a: p128) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_p128(a: p128) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_p128(a: p128) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t {
+ transmute(a)
+}
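+
+// NOTE (editorial): the `f32` variants reinterpret raw IEEE-754 bit patterns;
+// they do not convert values numerically (the `vcvt*` intrinsics do that).
+// A minimal sketch, assuming NEON is enabled (`vdup_n_f32` is from this
+// module):
+//
+//     let f: float32x2_t = vdup_n_f32(1.0);
+//     let u: uint32x2_t = vreinterpret_u32_f32(f); // each lane == 0x3F80_0000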
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_p128_f32(a: float32x4_t) -> p128 {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_p16(a: poly16x8_t) -> float32x4_t {
+ transmute(a)
+}
+
+/// Vector reinterpret cast operation
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vreinterpretq_f32_p128(a: p128) -> float32x4_t {
+ transmute(a)
+}
+
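+// Illustrative usage: a minimal sketch added for exposition (`example_reinterpret`
+// is hypothetical, not part of the generated source). A reinterpret cast only
+// relabels the bits in a register: the u32 encoding 0x3F80_0000 of 1.0f32 comes
+// back out as 1.0 in every lane, with no data movement or conversion.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_reinterpret() {
+    let bits = vdupq_n_u32(0x3F80_0000); // IEEE-754 bit pattern of 1.0f32
+    let floats = vreinterpretq_f32_u32(bits); // type change only, zero-cost
+    assert_eq!(transmute::<_, [f32; 4]>(floats), [1.0f32; 4]);
+}
+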
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i8")]
+ fn vrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+ vrshl_s8_(a, b)
+}
+
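+// Usage sketch (`example_vrshl` is illustrative, not upstream code): the per-lane
+// shift count in `b` is signed, so positive lanes shift left and negative lanes
+// perform a rounding shift right, e.g. 5 >> 1 rounds as (5 + 1) >> 1 = 3.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vrshl() {
+    let a = vdup_n_s8(5);
+    let r = vrshl_s8(a, vdup_n_s8(-1)); // negative count: rounding right shift by 1
+    assert_eq!(transmute::<_, [i8; 8]>(r), [3i8; 8]);
+}
+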
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v16i8")]
+ fn vrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+ vrshlq_s8_(a, b)
+}
+
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i16")]
+ fn vrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+ vrshl_s16_(a, b)
+}
+
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i16")]
+ fn vrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+ vrshlq_s16_(a, b)
+}
+
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i32")]
+ fn vrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+ vrshl_s32_(a, b)
+}
+
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i32")]
+ fn vrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+ vrshlq_s32_(a, b)
+}
+
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v1i64")]
+ fn vrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+ }
+ vrshl_s64_(a, b)
+}
+
+/// Signed rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i64")]
+ fn vrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+ }
+ vrshlq_s64_(a, b)
+}
+
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i8")]
+ fn vrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+ }
+ vrshl_u8_(a, b)
+}
+
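+// Usage sketch (`example_vrshl_u8` is illustrative): even for unsigned data the
+// shift-count vector stays signed, so right shifts are still requested with
+// negative lanes; the wider intermediate keeps the rounding from wrapping.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vrshl_u8() {
+    let a = vdup_n_u8(255);
+    let r = vrshl_u8(a, vdup_n_s8(-1)); // (255 + 1) >> 1 = 128, no lane overflow
+    assert_eq!(transmute::<_, [u8; 8]>(r), [128u8; 8]);
+}
+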
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v16i8")]
+ fn vrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+ }
+ vrshlq_u8_(a, b)
+}
+
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i16")]
+ fn vrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+ }
+ vrshl_u16_(a, b)
+}
+
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i16")]
+ fn vrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+ }
+ vrshlq_u16_(a, b)
+}
+
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i32")]
+ fn vrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+ }
+ vrshl_u32_(a, b)
+}
+
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i32")]
+ fn vrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
+ }
+ vrshlq_u32_(a, b)
+}
+
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v1i64")]
+ fn vrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+ }
+ vrshl_u64_(a, b)
+}
+
+/// Unsigned rounding shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i64")]
+ fn vrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
+ }
+ vrshlq_u64_(a, b)
+}
+
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ vrshl_s8(a, vdup_n_s8((-N) as _))
+}
+
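+// Usage sketch (`example_vrshr_n` is illustrative): `vrshr_n_s8::<N>` computes the
+// rounding right shift (x + (1 << (N - 1))) >> N with a compile-time `N`, and is
+// implemented above as a rounding left shift by -N.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vrshr_n() {
+    let a = vdup_n_s8(7);
+    let r = vrshr_n_s8::<2>(a); // (7 + 2) >> 2 = 2
+    assert_eq!(transmute::<_, [i8; 8]>(r), [2i8; 8]);
+}
+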
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ vrshlq_s8(a, vdupq_n_s8((-N) as _))
+}
+
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ vrshl_s16(a, vdup_n_s16((-N) as _))
+}
+
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ vrshlq_s16(a, vdupq_n_s16((-N) as _))
+}
+
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshr_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ vrshl_s32(a, vdup_n_s32((-N) as _))
+}
+
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ vrshlq_s32(a, vdupq_n_s32((-N) as _))
+}
+
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ vrshl_s64(a, vdup_n_s64((-N) as _))
+}
+
+/// Signed rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ vrshlq_s64(a, vdupq_n_s64((-N) as _))
+}
+
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ vrshl_u8(a, vdup_n_s8((-N) as _))
+}
+
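+// Usage sketch (`example_vrshr_n_u8` is illustrative): shifting by the full lane
+// width is legal here, and the rounding increment can survive the shift.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vrshr_n_u8() {
+    let a = vdup_n_u8(255);
+    let r = vrshr_n_u8::<8>(a); // (255 + 128) >> 8 = 1: only the rounding carry remains
+    assert_eq!(transmute::<_, [u8; 8]>(r), [1u8; 8]);
+}
+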
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ vrshlq_u8(a, vdupq_n_s8((-N) as _))
+}
+
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshr_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ vrshl_u16(a, vdup_n_s16((-N) as _))
+}
+
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ vrshlq_u16(a, vdupq_n_s16((-N) as _))
+}
+
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ vrshl_u32(a, vdup_n_s32((-N) as _))
+}
+
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ vrshlq_u32(a, vdupq_n_s32((-N) as _))
+}
+
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshr_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ vrshl_u64(a, vdup_n_s64((-N) as _))
+}
+
+/// Unsigned rounding shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ vrshlq_u64(a, vdupq_n_s64((-N) as _))
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v8i8")]
+ fn vrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
+ }
+ vrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v8i8")]
+ fn vrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
+ }
+ vrshrn_n_s16_(a, N)
+}
+
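+// Usage sketch (`example_vrshrn_n` is illustrative): the narrowing variant rounds,
+// shifts, then truncates each 16-bit lane to 8 bits, halving the vector width from
+// int16x8_t to int8x8_t.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vrshrn_n() {
+    let a = vdupq_n_s16(258);
+    let r: int8x8_t = vrshrn_n_s16::<2>(a); // (258 + 2) >> 2 = 65, then truncate to i8
+    assert_eq!(transmute::<_, [i8; 8]>(r), [65i8; 8]);
+}
+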
+/// Rounding shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v4i16")]
+ fn vrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
+ }
+ vrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v4i16")]
+ fn vrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t;
+ }
+ vrshrn_n_s32_(a, N)
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v2i32")]
+ fn vrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
+ }
+ vrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub unsafe fn vrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v2i32")]
+ fn vrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t;
+ }
+ vrshrn_n_s64_(a, N)
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ transmute(vrshrn_n_s16::<N>(transmute(a)))
+}
+
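+// Usage sketch (`example_vrshrn_n_u16` is illustrative): the unsigned narrows reuse
+// the signed implementation through bit-level transmutes; for the valid range of `N`
+// the retained low bits come out the same either way.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vrshrn_n_u16() {
+    let a = vdupq_n_u16(0xFF00);
+    let r = vrshrn_n_u16::<8>(a); // (0xFF00 + 0x80) >> 8 = 0xFF
+    assert_eq!(transmute::<_, [u8; 8]>(r), [0xFFu8; 8]);
+}
+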
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ transmute(vrshrn_n_s32::<N>(transmute(a)))
+}
+
+/// Rounding shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ transmute(vrshrn_n_s64::<N>(transmute(a)))
+}
+
+/// Signed rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsra_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_add(a, vrshr_n_s8::<N>(b))
+}
+
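+// Usage sketch (`example_vrsra_n` is illustrative): the accumulate form adds the
+// rounding right shift of `b` onto `a`, i.e. a + vrshr_n_s8::<N>(b), as the body
+// above shows directly.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vrsra_n() {
+    let acc = vdup_n_s8(10);
+    let r = vrsra_n_s8::<2>(acc, vdup_n_s8(7)); // 10 + ((7 + 2) >> 2) = 12
+    assert_eq!(transmute::<_, [i8; 8]>(r), [12i8; 8]);
+}
+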
+/// Signed rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsraq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_add(a, vrshrq_n_s8::<N>(b))
+}
+
+/// Signed rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsra_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_add(a, vrshr_n_s16::<N>(b))
+}
+
+/// Signed rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsraq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_add(a, vrshrq_n_s16::<N>(b))
+}
+
+/// Signed rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsra_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_add(a, vrshr_n_s32::<N>(b))
+}
+
+/// Signed rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsraq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_add(a, vrshrq_n_s32::<N>(b))
+}
+
+/// Signed rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsra_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ simd_add(a, vrshr_n_s64::<N>(b))
+}
+
+/// Signed rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsraq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ simd_add(a, vrshrq_n_s64::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsra_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_add(a, vrshr_n_u8::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsraq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_add(a, vrshrq_n_u8::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsra_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_add(a, vrshr_n_u16::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsraq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_add(a, vrshrq_n_u16::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsra_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_add(a, vrshr_n_u32::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsraq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_add(a, vrshrq_n_u32::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsra_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ simd_add(a, vrshr_n_u64::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsraq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ simd_add(a, vrshrq_n_u64::<N>(b))
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rsubhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsubhn.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rsubhn.v8i8")]
+ fn vrsubhn_s16_(a: int16x8_t, b: int16x8_t) -> int8x8_t;
+ }
+ vrsubhn_s16_(a, b)
+}
+
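+// Usage sketch (`example_vrsubhn` is illustrative): for 16-bit lanes the rounding
+// subtract-high-narrow keeps the high byte of (a - b) after adding the rounding
+// constant 1 << 7.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vrsubhn() {
+    let a = vdupq_n_s16(0x0200);
+    let b = vdupq_n_s16(0x0080);
+    let r = vrsubhn_s16(a, b); // (0x0200 - 0x0080 + 0x80) >> 8 = 2
+    assert_eq!(transmute::<_, [i8; 8]>(r), [2i8; 8]);
+}
+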
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rsubhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsubhn.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rsubhn.v4i16")]
+ fn vrsubhn_s32_(a: int32x4_t, b: int32x4_t) -> int16x4_t;
+ }
+ vrsubhn_s32_(a, b)
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rsubhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsubhn.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rsubhn.v2i32")]
+ fn vrsubhn_s64_(a: int64x2_t, b: int64x2_t) -> int32x2_t;
+ }
+ vrsubhn_s64_(a, b)
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rsubhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
+ transmute(vrsubhn_s16(transmute(a), transmute(b)))
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rsubhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t {
+ transmute(vrsubhn_s32(transmute(a), transmute(b)))
+}
+
+/// Rounding subtract returning high narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rsubhn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vrsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
+ transmute(vrsubhn_s64(transmute(a), transmute(b)))
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_s8<const LANE: i32>(a: i8, b: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
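+// Usage sketch (`example_vset_lane` is illustrative): `vset_lane_s8::<LANE>` returns
+// a copy of `b` with lane `LANE` replaced by the scalar `a`; the lane index is
+// checked at compile time by the static assertion above.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn example_vset_lane() {
+    let v = vdup_n_s8(0);
+    let r = vset_lane_s8::<3>(42, v); // only lane 3 changes
+    assert_eq!(transmute::<_, [i8; 8]>(r), [0i8, 0, 0, 42, 0, 0, 0, 0]);
+}
+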
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_s16<const LANE: i32>(a: i16, b: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_s32<const LANE: i32>(a: i32, b: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_s64<const LANE: i32>(a: i64, b: int64x1_t) -> int64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_u8<const LANE: i32>(a: u8, b: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_u16<const LANE: i32>(a: u16, b: uint16x4_t) -> uint16x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_u32<const LANE: i32>(a: u32, b: uint32x2_t) -> uint32x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_u64<const LANE: i32>(a: u64, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_p8<const LANE: i32>(a: p8, b: poly8x8_t) -> poly8x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_p16<const LANE: i32>(a: p16, b: poly16x4_t) -> poly16x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_p64<const LANE: i32>(a: p64, b: poly64x1_t) -> poly64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_s8<const LANE: i32>(a: i8, b: int8x16_t) -> int8x16_t {
+ static_assert_imm4!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_s16<const LANE: i32>(a: i16, b: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_s32<const LANE: i32>(a: i32, b: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_s64<const LANE: i32>(a: i64, b: int64x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_u8<const LANE: i32>(a: u8, b: uint8x16_t) -> uint8x16_t {
+ static_assert_imm4!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_u16<const LANE: i32>(a: u16, b: uint16x8_t) -> uint16x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_u32<const LANE: i32>(a: u32, b: uint32x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_u64<const LANE: i32>(a: u64, b: uint64x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_p8<const LANE: i32>(a: p8, b: poly8x16_t) -> poly8x16_t {
+ static_assert_imm4!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_p16<const LANE: i32>(a: p16, b: poly16x8_t) -> poly16x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_p64<const LANE: i32>(a: p64, b: poly64x2_t) -> poly64x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vset_lane_f32<const LANE: i32>(a: f32, b: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
+/// Insert vector element from another vector element
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsetq_lane_f32<const LANE: i32>(a: f32, b: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(b, LANE as u32, a)
+}
+
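+// Editor's sketch, not part of the upstream diff: a minimal use of the lane
+// insertion intrinsics above. It assumes an AArch64 test build and uses only
+// this module's own vdupq_n_u8/vgetq_lane_u8; the example_* name is made up.
+#[cfg(all(test, target_arch = "aarch64"))]
+unsafe fn example_vsetq_lane() {
+    // Broadcast 1 into all sixteen lanes, then overwrite lane 3 with 42.
+    let v: uint8x16_t = vdupq_n_u8(1);
+    let v: uint8x16_t = vsetq_lane_u8::<3>(42, v);
+    assert_eq!(vgetq_lane_u8::<3>(v), 42);
+    assert_eq!(vgetq_lane_u8::<0>(v), 1);
+}
+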
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i8")]
+ fn vshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ }
+    vshl_s8_(a, b)
+}
+
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v16i8")]
+ fn vshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+ }
+    vshlq_s8_(a, b)
+}
+
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i16")]
+ fn vshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ }
+    vshl_s16_(a, b)
+}
+
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i16")]
+ fn vshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+ }
+    vshlq_s16_(a, b)
+}
+
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i32")]
+ fn vshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ }
+    vshl_s32_(a, b)
+}
+
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i32")]
+ fn vshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+ }
+    vshlq_s32_(a, b)
+}
+
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v1i64")]
+ fn vshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+ }
+    vshl_s64_(a, b)
+}
+
+/// Signed shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i64")]
+ fn vshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+ }
+    vshlq_s64_(a, b)
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i8")]
+ fn vshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+ }
+    vshl_u8_(a, b)
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v16i8")]
+ fn vshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+ }
+    vshlq_u8_(a, b)
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i16")]
+ fn vshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+ }
+    vshl_u16_(a, b)
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i16")]
+ fn vshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+ }
+    vshlq_u16_(a, b)
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i32")]
+ fn vshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+ }
+    vshl_u32_(a, b)
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i32")]
+ fn vshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
+ }
+    vshlq_u32_(a, b)
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v1i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v1i64")]
+ fn vshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+ }
+    vshl_u64_(a, b)
+}
+
+/// Unsigned shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i64")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i64")]
+ fn vshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
+ }
+    vshlq_u64_(a, b)
+}
+
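+// Editor's sketch, not part of the upstream diff: vshl_* takes a per-lane
+// *signed* shift count, so negative counts shift right. AArch64 test build
+// assumed; the example_* name is made up.
+#[cfg(all(test, target_arch = "aarch64"))]
+unsafe fn example_vshl() {
+    let a: int8x8_t = vdup_n_s8(16);
+    let left: int8x8_t = vshl_s8(a, vdup_n_s8(2)); // 16 << 2
+    let right: int8x8_t = vshl_s8(a, vdup_n_s8(-2)); // 16 >> 2
+    assert_eq!(vget_lane_s8::<0>(left), 64);
+    assert_eq!(vget_lane_s8::<0>(right), 4);
+}
+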
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(N);
+ simd_shl(a, vdup_n_s8(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+ static_assert_imm3!(N);
+ simd_shl(a, vdupq_n_s8(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+ static_assert_imm4!(N);
+ simd_shl(a, vdup_n_s16(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+ static_assert_imm4!(N);
+ simd_shl(a, vdupq_n_s16(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+ static_assert_imm5!(N);
+ simd_shl(a, vdup_n_s32(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+ static_assert_imm5!(N);
+ simd_shl(a, vdupq_n_s32(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ simd_shl(a, vdup_n_u8(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+ static_assert_imm3!(N);
+ simd_shl(a, vdupq_n_u8(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
+ static_assert_imm4!(N);
+ simd_shl(a, vdup_n_u16(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
+ static_assert_imm4!(N);
+ simd_shl(a, vdupq_n_u16(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
+ static_assert_imm5!(N);
+ simd_shl(a, vdup_n_u32(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+ static_assert_imm5!(N);
+ simd_shl(a, vdupq_n_u32(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+ static_assert_imm6!(N);
+ simd_shl(a, vdup_n_s64(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+ static_assert_imm6!(N);
+ simd_shl(a, vdupq_n_s64(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshl_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
+ static_assert_imm6!(N);
+ simd_shl(a, vdup_n_u64(N as _))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshlq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+ static_assert_imm6!(N);
+ simd_shl(a, vdupq_n_u64(N as _))
+}
+
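+// Editor's sketch, not part of the upstream diff: vshl_n_* takes the shift
+// amount as a const generic, so static_assert_imm*! rejects out-of-range
+// values at compile time. AArch64 test build assumed; example_* is made up.
+#[cfg(all(test, target_arch = "aarch64"))]
+unsafe fn example_vshl_n() {
+    let a: uint32x2_t = vdup_n_u32(3);
+    let r: uint32x2_t = vshl_n_u32::<4>(a); // 3 << 4 == 48 in both lanes
+    assert_eq!(vget_lane_u32::<0>(r), 48);
+}
+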
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshll_n_s8<const N: i32>(a: int8x8_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 0 && N <= 8);
+ simd_shl(simd_cast(a), vdupq_n_s16(N as _))
+}
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshll_n_s16<const N: i32>(a: int16x4_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 0 && N <= 16);
+ simd_shl(simd_cast(a), vdupq_n_s32(N as _))
+}
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshll_n_s32<const N: i32>(a: int32x2_t) -> int64x2_t {
+ static_assert!(N : i32 where N >= 0 && N <= 32);
+ simd_shl(simd_cast(a), vdupq_n_s64(N as _))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshll_n_u8<const N: i32>(a: uint8x8_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 0 && N <= 8);
+ simd_shl(simd_cast(a), vdupq_n_u16(N as _))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshll_n_u16<const N: i32>(a: uint16x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 0 && N <= 16);
+ simd_shl(simd_cast(a), vdupq_n_u32(N as _))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshll_n_u32<const N: i32>(a: uint32x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where N >= 0 && N <= 32);
+ simd_shl(simd_cast(a), vdupq_n_u64(N as _))
+}
+
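+// Editor's sketch, not part of the upstream diff: vshll_n_* widens each lane
+// before shifting, which is why N may equal the full *source* element width
+// without overflowing. AArch64 test build assumed; example_* is made up.
+#[cfg(all(test, target_arch = "aarch64"))]
+unsafe fn example_vshll_n() {
+    let a: uint8x8_t = vdup_n_u8(0xFF);
+    let r: uint16x8_t = vshll_n_u8::<8>(a); // each u16 lane is 0xFF << 8
+    assert_eq!(vgetq_lane_u16::<0>(r), 0xFF00);
+}
+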
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
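+    // Editor's note: a shift by the full element width is not representable
+    // in simd_shr, so it is clamped to width - 1, which produces the same
+    // all-sign-bits result an arithmetic shift by 8 would.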
+ let n: i32 = if N == 8 { 7 } else { N };
+ simd_shr(a, vdup_n_s8(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ let n: i32 = if N == 8 { 7 } else { N };
+ simd_shr(a, vdupq_n_s8(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ let n: i32 = if N == 16 { 15 } else { N };
+ simd_shr(a, vdup_n_s16(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ let n: i32 = if N == 16 { 15 } else { N };
+ simd_shr(a, vdupq_n_s16(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshr_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ let n: i32 = if N == 32 { 31 } else { N };
+ simd_shr(a, vdup_n_s32(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ let n: i32 = if N == 32 { 31 } else { N };
+ simd_shr(a, vdupq_n_s32(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ let n: i32 = if N == 64 { 63 } else { N };
+ simd_shr(a, vdup_n_s64(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ let n: i32 = if N == 64 { 63 } else { N };
+ simd_shr(a, vdupq_n_s64(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
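+    // Editor's note: a logical shift by the full element width always yields
+    // zero, and simd_shr cannot express it, so that case returns a zeroed
+    // vector directly.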
+ let n: i32 = if N == 8 { return vdup_n_u8(0); } else { N };
+ simd_shr(a, vdup_n_u8(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ let n: i32 = if N == 8 { return vdupq_n_u8(0); } else { N };
+ simd_shr(a, vdupq_n_u8(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshr_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ let n: i32 = if N == 16 { return vdup_n_u16(0); } else { N };
+ simd_shr(a, vdup_n_u16(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ let n: i32 = if N == 16 { return vdupq_n_u16(0); } else { N };
+ simd_shr(a, vdupq_n_u16(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ let n: i32 = if N == 32 { return vdup_n_u32(0); } else { N };
+ simd_shr(a, vdup_n_u32(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ let n: i32 = if N == 32 { return vdupq_n_u32(0); } else { N };
+ simd_shr(a, vdupq_n_u32(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshr_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ let n: i32 = if N == 64 { return vdup_n_u64(0); } else { N };
+ simd_shr(a, vdup_n_u64(n as _))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ let n: i32 = if N == 64 { return vdupq_n_u64(0); } else { N };
+ simd_shr(a, vdupq_n_u64(n as _))
+}
+
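+// Editor's sketch, not part of the upstream diff: the two N == width paths
+// above differ on purpose; arithmetic shifts fill with sign bits while
+// logical shifts produce zero. AArch64 test build assumed; example_* is made up.
+#[cfg(all(test, target_arch = "aarch64"))]
+unsafe fn example_vshr_n() {
+    let s: int8x8_t = vdup_n_s8(-128);
+    assert_eq!(vget_lane_s8::<0>(vshr_n_s8::<8>(s)), -1); // sign fill
+    let u: uint8x8_t = vdup_n_u8(0x80);
+    assert_eq!(vget_lane_u8::<0>(vshr_n_u8::<8>(u)), 0); // zero fill
+}
+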
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_cast(simd_shr(a, vdupq_n_s16(N as _)))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_cast(simd_shr(a, vdupq_n_s32(N as _)))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_cast(simd_shr(a, vdupq_n_s64(N as _)))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_cast(simd_shr(a, vdupq_n_u16(N as _)))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_cast(simd_shr(a, vdupq_n_u32(N as _)))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_cast(simd_shr(a, vdupq_n_u64(N as _)))
+}
+
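+// Editor's sketch, not part of the upstream diff: vshrn_n_* shifts in the
+// wide type and then truncates each lane to half width, so bits above the
+// narrow width are dropped. AArch64 test build assumed; example_* is made up.
+#[cfg(all(test, target_arch = "aarch64"))]
+unsafe fn example_vshrn_n() {
+    let a: uint16x8_t = vdupq_n_u16(0x1234);
+    // 0x1234 >> 4 == 0x0123, truncated to the low byte 0x23.
+    let r: uint8x8_t = vshrn_n_u16::<4>(a);
+    assert_eq!(vget_lane_u8::<0>(r), 0x23);
+}
+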
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsra_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_add(a, vshr_n_s8::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsraq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_add(a, vshrq_n_s8::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsra_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_add(a, vshr_n_s16::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsraq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_add(a, vshrq_n_s16::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsra_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_add(a, vshr_n_s32::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsraq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_add(a, vshrq_n_s32::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsra_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ simd_add(a, vshr_n_s64::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsraq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ simd_add(a, vshrq_n_s64::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsra_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_add(a, vshr_n_u8::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsraq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert!(N : i32 where N >= 1 && N <= 8);
+ simd_add(a, vshrq_n_u8::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsra_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_add(a, vshr_n_u16::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsraq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert!(N : i32 where N >= 1 && N <= 16);
+ simd_add(a, vshrq_n_u16::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsra_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_add(a, vshr_n_u32::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsraq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where N >= 1 && N <= 32);
+ simd_add(a, vshrq_n_u32::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsra_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ simd_add(a, vshr_n_u64::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vsraq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where N >= 1 && N <= 64);
+ simd_add(a, vshrq_n_u64::<N>(b))
+}
+
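+// Editor's sketch, not part of the upstream diff: vsra_n_* is exactly the
+// composition used above, shift `b` right by N and add the result to `a`.
+// AArch64 test build assumed; example_* is made up.
+#[cfg(all(test, target_arch = "aarch64"))]
+unsafe fn example_vsra_n() {
+    let acc: uint8x8_t = vdup_n_u8(100);
+    let b: uint8x8_t = vdup_n_u8(64);
+    let r: uint8x8_t = vsra_n_u8::<2>(acc, b); // 100 + (64 >> 2) == 116
+    assert_eq!(vget_lane_u8::<0>(r), 116);
+}
+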
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t {
+ let a1: int8x8_t = simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]);
+ let b1: int8x8_t = simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t {
+ let a1: int16x4_t = simd_shuffle4!(a, b, [0, 4, 2, 6]);
+ let b1: int16x4_t = simd_shuffle4!(a, b, [1, 5, 3, 7]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t {
+ let a1: int8x16_t = simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]);
+ let b1: int8x16_t = simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t {
+ let a1: int16x8_t = simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]);
+ let b1: int16x8_t = simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t {
+ let a1: int32x4_t = simd_shuffle4!(a, b, [0, 4, 2, 6]);
+ let b1: int32x4_t = simd_shuffle4!(a, b, [1, 5, 3, 7]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t {
+ let a1: uint8x8_t = simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]);
+ let b1: uint8x8_t = simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t {
+ let a1: uint16x4_t = simd_shuffle4!(a, b, [0, 4, 2, 6]);
+ let b1: uint16x4_t = simd_shuffle4!(a, b, [1, 5, 3, 7]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t {
+ let a1: uint8x16_t = simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]);
+ let b1: uint8x16_t = simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t {
+ let a1: uint16x8_t = simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]);
+ let b1: uint16x8_t = simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t {
+ let a1: uint32x4_t = simd_shuffle4!(a, b, [0, 4, 2, 6]);
+ let b1: uint32x4_t = simd_shuffle4!(a, b, [1, 5, 3, 7]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t {
+ let a1: poly8x8_t = simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]);
+ let b1: poly8x8_t = simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t {
+ let a1: poly16x4_t = simd_shuffle4!(a, b, [0, 4, 2, 6]);
+ let b1: poly16x4_t = simd_shuffle4!(a, b, [1, 5, 3, 7]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t {
+ let a1: poly8x16_t = simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]);
+ let b1: poly8x16_t = simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t {
+ let a1: poly16x8_t = simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14]);
+ let b1: poly16x8_t = simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t {
+ let a1: int32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b1: int32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t {
+ let a1: uint32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b1: uint32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a1, b1))
+}
+
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrn_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t {
+ let a1: float32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b1: float32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a1, b1))
+}
+
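+// For two-lane vectors the transpose patterns `[0, 2]` / `[1, 3]` coincide
+// with the zip patterns, which is why the `vtrn_{s32,u32,f32}` variants above
+// assert `zip` on AArch64 rather than `trn`.
+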
+/// Transpose elements
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(trn))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vtrnq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t {
+ let a1: float32x4_t = simd_shuffle4!(a, b, [0, 4, 2, 6]);
+ let b1: float32x4_t = simd_shuffle4!(a, b, [1, 5, 3, 7]);
+ transmute((a1, b1))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t {
+ let a0: int8x8_t = simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]);
+ let b0: int8x8_t = simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t {
+ let a0: int16x4_t = simd_shuffle4!(a, b, [0, 4, 1, 5]);
+ let b0: int16x4_t = simd_shuffle4!(a, b, [2, 6, 3, 7]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t {
+ let a0: uint8x8_t = simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]);
+ let b0: uint8x8_t = simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t {
+ let a0: uint16x4_t = simd_shuffle4!(a, b, [0, 4, 1, 5]);
+ let b0: uint16x4_t = simd_shuffle4!(a, b, [2, 6, 3, 7]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t {
+ let a0: poly8x8_t = simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]);
+ let b0: poly8x8_t = simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vzip))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t {
+ let a0: poly16x4_t = simd_shuffle4!(a, b, [0, 4, 1, 5]);
+ let b0: poly16x4_t = simd_shuffle4!(a, b, [2, 6, 3, 7]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t {
+ let a0: int32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b0: int32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t {
+ let a0: uint32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b0: uint32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t {
+ let a0: int8x16_t = simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]);
+ let b0: int8x16_t = simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t {
+ let a0: int16x8_t = simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]);
+ let b0: int16x8_t = simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t {
+ let a0: int32x4_t = simd_shuffle4!(a, b, [0, 4, 1, 5]);
+ let b0: int32x4_t = simd_shuffle4!(a, b, [2, 6, 3, 7]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t {
+ let a0: uint8x16_t = simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]);
+ let b0: uint8x16_t = simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t {
+ let a0: uint16x8_t = simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]);
+ let b0: uint16x8_t = simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t {
+ let a0: uint32x4_t = simd_shuffle4!(a, b, [0, 4, 1, 5]);
+ let b0: uint32x4_t = simd_shuffle4!(a, b, [2, 6, 3, 7]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t {
+ let a0: poly8x16_t = simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]);
+ let b0: poly8x16_t = simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t {
+ let a0: poly16x8_t = simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]);
+ let b0: poly16x8_t = simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzip_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t {
+ let a0: float32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b0: float32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a0, b0))
+}
+
+/// Zip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vzipq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t {
+ let a0: float32x4_t = simd_shuffle4!(a, b, [0, 4, 1, 5]);
+ let b0: float32x4_t = simd_shuffle4!(a, b, [2, 6, 3, 7]);
+ transmute((a0, b0))
+}
+
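+// A minimal sketch of the zip semantics, using the `i16x4`/`transmute`
+// conventions of the test module below (values assumed for illustration):
+//
+// let a: int16x4_t = transmute(i16x4::new(0, 2, 4, 6));
+// let b: int16x4_t = transmute(i16x4::new(1, 3, 5, 7));
+// let r = vzip_s16(a, b);
+// // r.0 holds [0, 1, 2, 3] (low halves interleaved)
+// // r.1 holds [4, 5, 6, 7] (high halves interleaved)
+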
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t {
+ let a0: int8x8_t = simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let b0: int8x8_t = simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t {
+ let a0: int16x4_t = simd_shuffle4!(a, b, [0, 2, 4, 6]);
+ let b0: int16x4_t = simd_shuffle4!(a, b, [1, 3, 5, 7]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t {
+ let a0: int8x16_t = simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]);
+ let b0: int8x16_t = simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t {
+ let a0: int16x8_t = simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let b0: int16x8_t = simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t {
+ let a0: int32x4_t = simd_shuffle4!(a, b, [0, 2, 4, 6]);
+ let b0: int32x4_t = simd_shuffle4!(a, b, [1, 3, 5, 7]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t {
+ let a0: uint8x8_t = simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let b0: uint8x8_t = simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t {
+ let a0: uint16x4_t = simd_shuffle4!(a, b, [0, 2, 4, 6]);
+ let b0: uint16x4_t = simd_shuffle4!(a, b, [1, 3, 5, 7]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t {
+ let a0: uint8x16_t = simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]);
+ let b0: uint8x16_t = simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t {
+ let a0: uint16x8_t = simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let b0: uint16x8_t = simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t {
+ let a0: uint32x4_t = simd_shuffle4!(a, b, [0, 2, 4, 6]);
+ let b0: uint32x4_t = simd_shuffle4!(a, b, [1, 3, 5, 7]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t {
+ let a0: poly8x8_t = simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let b0: poly8x8_t = simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t {
+ let a0: poly16x4_t = simd_shuffle4!(a, b, [0, 2, 4, 6]);
+ let b0: poly16x4_t = simd_shuffle4!(a, b, [1, 3, 5, 7]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t {
+ let a0: poly8x16_t = simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]);
+ let b0: poly8x16_t = simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t {
+ let a0: poly16x8_t = simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let b0: poly16x8_t = simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t {
+ let a0: int32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b0: int32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t {
+ let a0: uint32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b0: uint32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vtrn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzp_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t {
+ let a0: float32x2_t = simd_shuffle2!(a, b, [0, 2]);
+ let b0: float32x2_t = simd_shuffle2!(a, b, [1, 3]);
+ transmute((a0, b0))
+}
+
+/// Unzip vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vuzp))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uzp))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vuzpq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t {
+ let a0: float32x4_t = simd_shuffle4!(a, b, [0, 2, 4, 6]);
+ let b0: float32x4_t = simd_shuffle4!(a, b, [1, 3, 5, 7]);
+ transmute((a0, b0))
+}
+
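+// A minimal sketch of the unzip semantics, the inverse of the zip family
+// above (values assumed for illustration):
+//
+// let a: int16x4_t = transmute(i16x4::new(0, 1, 2, 3));
+// let b: int16x4_t = transmute(i16x4::new(4, 5, 6, 7));
+// let r = vuzp_s16(a, b);
+// // r.0 holds [0, 2, 4, 6] (even-indexed lanes of the concatenation)
+// // r.1 holds [1, 3, 5, 7] (odd-indexed lanes)
+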
+/// Unsigned Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
+ let d: uint8x8_t = vabd_u8(b, c);
+ simd_add(a, simd_cast(d))
+}
+
+/// Unsigned Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+ let d: uint16x4_t = vabd_u16(b, c);
+ simd_add(a, simd_cast(d))
+}
+
+/// Unsigned Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+ let d: uint32x2_t = vabd_u32(b, c);
+ simd_add(a, simd_cast(d))
+}
+
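+// Sketch of the accumulate-long semantics (values assumed for illustration):
+// each result lane is `a + |b - c|`, with the absolute difference widened
+// before the add.
+//
+// let a: uint16x8_t = transmute(u16x8::new(1, 1, 1, 1, 1, 1, 1, 1));
+// let b: uint8x8_t = transmute(u8x8::new(1, 2, 3, 4, 5, 6, 7, 8));
+// let c: uint8x8_t = transmute(u8x8::new(16, 15, 14, 13, 12, 11, 10, 9));
+// let r = vabal_u8(a, b, c);   // lanes: [16, 14, 12, 10, 8, 6, 4, 2]
+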
+/// Signed Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
+ let d: int8x8_t = vabd_s8(b, c);
+ let e: uint8x8_t = simd_cast(d);
+ simd_add(a, simd_cast(e))
+}
+
+/// Signed Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+ let d: int16x4_t = vabd_s16(b, c);
+ let e: uint16x4_t = simd_cast(d);
+ simd_add(a, simd_cast(e))
+}
+
+/// Signed Absolute difference and Accumulate Long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vabal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+ let d: int32x2_t = vabd_s32(b, c);
+ let e: uint32x2_t = simd_cast(d);
+ simd_add(a, simd_cast(e))
+}
+
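+// The double cast in the signed variants above is deliberate: the absolute
+// difference of two i8 lanes can reach 200 (e.g. |(-100) - 100|), which does
+// not fit in i8. `simd_cast` to the unsigned type converts the lane
+// numerically (-56i8 as u8 == 200), and the widening cast then zero-extends,
+// so the value accumulated into `a` is the true magnitude.
+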
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqabs_s8(a: int8x8_t) -> int8x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i8")]
+ fn vqabs_s8_(a: int8x8_t) -> int8x8_t;
+ }
+    vqabs_s8_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqabsq_s8(a: int8x16_t) -> int8x16_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v16i8")]
+ fn vqabsq_s8_(a: int8x16_t) -> int8x16_t;
+ }
+    vqabsq_s8_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqabs_s16(a: int16x4_t) -> int16x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i16")]
+ fn vqabs_s16_(a: int16x4_t) -> int16x4_t;
+ }
+    vqabs_s16_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqabsq_s16(a: int16x8_t) -> int16x8_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i16")]
+ fn vqabsq_s16_(a: int16x8_t) -> int16x8_t;
+ }
+    vqabsq_s16_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqabs_s32(a: int32x2_t) -> int32x2_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v2i32")]
+ fn vqabs_s32_(a: int32x2_t) -> int32x2_t;
+ }
+    vqabs_s32_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+#[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+pub unsafe fn vqabsq_s32(a: int32x4_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i32")]
+ fn vqabsq_s32_(a: int32x4_t) -> int32x4_t;
+ }
+    vqabsq_s32_(a)
+}
+
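+// Unlike a plain absolute value, the saturating forms clamp at the type
+// maximum instead of wrapping on the most negative input. A minimal sketch
+// (values assumed for illustration):
+//
+// let a: int8x8_t = transmute(i8x8::new(-128, -1, 0, 1, 127, -127, 64, -64));
+// let r: i8x8 = transmute(vqabs_s8(a));
+// // r == [127, 1, 0, 1, 127, 127, 64, 64]   (-128 saturates to 127)
+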
+#[cfg(test)]
+#[allow(overflowing_literals)]
+mod test {
+ use super::*;
+ use crate::core_arch::simd::*;
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_s8() {
+ let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i8x8 = i8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+ let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_s8() {
+ let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+ let b: i8x16 = i8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+ let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+ let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+ let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_s16() {
+ let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i16x4 = i16x4::new(0x0F, 0x0F, 0x0F, 0x0F);
+ let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
+ let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_s16() {
+ let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i16x8 = i16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+ let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_s32() {
+ let a: i32x2 = i32x2::new(0x00, 0x01);
+ let b: i32x2 = i32x2::new(0x0F, 0x0F);
+ let e: i32x2 = i32x2::new(0x00, 0x01);
+ let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i32x2 = i32x2::new(0x00, 0x01);
+ let b: i32x2 = i32x2::new(0x00, 0x00);
+ let e: i32x2 = i32x2::new(0x00, 0x00);
+ let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_s32() {
+ let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i32x4 = i32x4::new(0x0F, 0x0F, 0x0F, 0x0F);
+ let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+ let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_u8() {
+ let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u8x8 = u8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+ let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_u8() {
+ let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+ let b: u8x16 = u8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+ let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+ let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+ let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_u16() {
+ let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u16x4 = u16x4::new(0x0F, 0x0F, 0x0F, 0x0F);
+ let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
+ let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_u16() {
+ let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u16x8 = u16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+ let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_u32() {
+ let a: u32x2 = u32x2::new(0x00, 0x01);
+ let b: u32x2 = u32x2::new(0x0F, 0x0F);
+ let e: u32x2 = u32x2::new(0x00, 0x01);
+ let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u32x2 = u32x2::new(0x00, 0x01);
+ let b: u32x2 = u32x2::new(0x00, 0x00);
+ let e: u32x2 = u32x2::new(0x00, 0x00);
+ let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_u32() {
+ let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u32x4 = u32x4::new(0x0F, 0x0F, 0x0F, 0x0F);
+ let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
+ let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_s64() {
+ let a: i64x1 = i64x1::new(0x00);
+ let b: i64x1 = i64x1::new(0x0F);
+ let e: i64x1 = i64x1::new(0x00);
+ let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i64x1 = i64x1::new(0x00);
+ let b: i64x1 = i64x1::new(0x00);
+ let e: i64x1 = i64x1::new(0x00);
+ let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_s64() {
+ let a: i64x2 = i64x2::new(0x00, 0x01);
+ let b: i64x2 = i64x2::new(0x0F, 0x0F);
+ let e: i64x2 = i64x2::new(0x00, 0x01);
+ let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i64x2 = i64x2::new(0x00, 0x01);
+ let b: i64x2 = i64x2::new(0x00, 0x00);
+ let e: i64x2 = i64x2::new(0x00, 0x00);
+ let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_u64() {
+ let a: u64x1 = u64x1::new(0x00);
+ let b: u64x1 = u64x1::new(0x0F);
+ let e: u64x1 = u64x1::new(0x00);
+ let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u64x1 = u64x1::new(0x00);
+ let b: u64x1 = u64x1::new(0x00);
+ let e: u64x1 = u64x1::new(0x00);
+ let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_u64() {
+ let a: u64x2 = u64x2::new(0x00, 0x01);
+ let b: u64x2 = u64x2::new(0x0F, 0x0F);
+ let e: u64x2 = u64x2::new(0x00, 0x01);
+ let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u64x2 = u64x2::new(0x00, 0x01);
+ let b: u64x2 = u64x2::new(0x00, 0x00);
+ let e: u64x2 = u64x2::new(0x00, 0x00);
+ let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_s8() {
+ let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: i8x8 = transmute(vorr_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_s8() {
+ let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let r: i8x16 = transmute(vorrq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_s16() {
+ let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: i16x4 = transmute(vorr_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_s16() {
+ let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: i16x8 = transmute(vorrq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_s32() {
+ let a: i32x2 = i32x2::new(0x00, 0x01);
+ let b: i32x2 = i32x2::new(0x00, 0x00);
+ let e: i32x2 = i32x2::new(0x00, 0x01);
+ let r: i32x2 = transmute(vorr_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_s32() {
+ let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: i32x4 = transmute(vorrq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_u8() {
+ let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: u8x8 = transmute(vorr_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_u8() {
+ let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let r: u8x16 = transmute(vorrq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_u16() {
+ let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: u16x4 = transmute(vorr_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_u16() {
+ let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: u16x8 = transmute(vorrq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_u32() {
+ let a: u32x2 = u32x2::new(0x00, 0x01);
+ let b: u32x2 = u32x2::new(0x00, 0x00);
+ let e: u32x2 = u32x2::new(0x00, 0x01);
+ let r: u32x2 = transmute(vorr_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_u32() {
+ let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: u32x4 = transmute(vorrq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_s64() {
+ let a: i64x1 = i64x1::new(0x00);
+ let b: i64x1 = i64x1::new(0x00);
+ let e: i64x1 = i64x1::new(0x00);
+ let r: i64x1 = transmute(vorr_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_s64() {
+ let a: i64x2 = i64x2::new(0x00, 0x01);
+ let b: i64x2 = i64x2::new(0x00, 0x00);
+ let e: i64x2 = i64x2::new(0x00, 0x01);
+ let r: i64x2 = transmute(vorrq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_u64() {
+ let a: u64x1 = u64x1::new(0x00);
+ let b: u64x1 = u64x1::new(0x00);
+ let e: u64x1 = u64x1::new(0x00);
+ let r: u64x1 = transmute(vorr_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_u64() {
+ let a: u64x2 = u64x2::new(0x00, 0x01);
+ let b: u64x2 = u64x2::new(0x00, 0x00);
+ let e: u64x2 = u64x2::new(0x00, 0x01);
+ let r: u64x2 = transmute(vorrq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_s8() {
+ let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: i8x8 = transmute(veor_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_s8() {
+ let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let r: i8x16 = transmute(veorq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_s16() {
+ let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: i16x4 = transmute(veor_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_s16() {
+ let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: i16x8 = transmute(veorq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_s32() {
+ let a: i32x2 = i32x2::new(0x00, 0x01);
+ let b: i32x2 = i32x2::new(0x00, 0x00);
+ let e: i32x2 = i32x2::new(0x00, 0x01);
+ let r: i32x2 = transmute(veor_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_s32() {
+ let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: i32x4 = transmute(veorq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_u8() {
+ let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: u8x8 = transmute(veor_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_u8() {
+ let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+ let r: u8x16 = transmute(veorq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_u16() {
+ let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: u16x4 = transmute(veor_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_u16() {
+ let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let r: u16x8 = transmute(veorq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_u32() {
+ let a: u32x2 = u32x2::new(0x00, 0x01);
+ let b: u32x2 = u32x2::new(0x00, 0x00);
+ let e: u32x2 = u32x2::new(0x00, 0x01);
+ let r: u32x2 = transmute(veor_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_u32() {
+ let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00);
+ let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03);
+ let r: u32x4 = transmute(veorq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_s64() {
+ let a: i64x1 = i64x1::new(0x00);
+ let b: i64x1 = i64x1::new(0x00);
+ let e: i64x1 = i64x1::new(0x00);
+ let r: i64x1 = transmute(veor_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_s64() {
+ let a: i64x2 = i64x2::new(0x00, 0x01);
+ let b: i64x2 = i64x2::new(0x00, 0x00);
+ let e: i64x2 = i64x2::new(0x00, 0x01);
+ let r: i64x2 = transmute(veorq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_u64() {
+ let a: u64x1 = u64x1::new(0x00);
+ let b: u64x1 = u64x1::new(0x00);
+ let e: u64x1 = u64x1::new(0x00);
+ let r: u64x1 = transmute(veor_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_u64() {
+ let a: u64x2 = u64x2::new(0x00, 0x01);
+ let b: u64x2 = u64x2::new(0x00, 0x00);
+ let e: u64x2 = u64x2::new(0x00, 0x01);
+ let r: u64x2 = transmute(veorq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
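+    // vabd: lane-wise absolute difference, |a - b|, for integer and float lanes.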
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabd_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: i8x8 = i8x8::new(15, 13, 11, 9, 7, 5, 3, 1);
+ let r: i8x8 = transmute(vabd_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let e: i8x16 = i8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15);
+ let r: i8x16 = transmute(vabdq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabd_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(16, 15, 14, 13);
+ let e: i16x4 = i16x4::new(15, 13, 11, 9);
+ let r: i16x4 = transmute(vabd_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: i16x8 = i16x8::new(15, 13, 11, 9, 7, 5, 3, 1);
+ let r: i16x8 = transmute(vabdq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabd_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(16, 15);
+ let e: i32x2 = i32x2::new(15, 13);
+ let r: i32x2 = transmute(vabd_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(16, 15, 14, 13);
+ let e: i32x4 = i32x4::new(15, 13, 11, 9);
+ let r: i32x4 = transmute(vabdq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabd_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: u8x8 = u8x8::new(15, 13, 11, 9, 7, 5, 3, 1);
+ let r: u8x8 = transmute(vabd_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let e: u8x16 = u8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15);
+ let r: u8x16 = transmute(vabdq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabd_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(16, 15, 14, 13);
+ let e: u16x4 = u16x4::new(15, 13, 11, 9);
+ let r: u16x4 = transmute(vabd_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: u16x8 = u16x8::new(15, 13, 11, 9, 7, 5, 3, 1);
+ let r: u16x8 = transmute(vabdq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabd_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(16, 15);
+ let e: u32x2 = u32x2::new(15, 13);
+ let r: u32x2 = transmute(vabd_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(16, 15, 14, 13);
+ let e: u32x4 = u32x4::new(15, 13, 11, 9);
+ let r: u32x4 = transmute(vabdq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabd_f32() {
+ let a: f32x2 = f32x2::new(1.0, 2.0);
+ let b: f32x2 = f32x2::new(9.0, 3.0);
+ let e: f32x2 = f32x2::new(8.0, 1.0);
+ let r: f32x2 = transmute(vabd_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 2.0, 5.0, -4.0);
+ let b: f32x4 = f32x4::new(9.0, 3.0, 2.0, 8.0);
+ let e: f32x4 = f32x4::new(8.0, 1.0, 3.0, 12.0);
+ let r: f32x4 = transmute(vabdq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
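+    // vabdl: widening ("long") absolute difference; result lanes are twice the input width, e.g. u8x8 -> u16x8.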
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 4, 3, 2, 1);
+ let b: u8x8 = u8x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ let e: u16x8 = u16x8::new(9, 8, 7, 6, 6, 7, 8, 9);
+ let r: u16x8 = transmute(vabdl_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(10, 10, 10, 10);
+ let e: u32x4 = u32x4::new(9, 8, 7, 6);
+ let r: u32x4 = transmute(vabdl_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(10, 10);
+ let e: u64x2 = u64x2::new(9, 8);
+ let r: u64x2 = transmute(vabdl_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 4, 3, 2, 1);
+ let b: i8x8 = i8x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ let e: i16x8 = i16x8::new(9, 8, 7, 6, 6, 7, 8, 9);
+ let r: i16x8 = transmute(vabdl_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 11, 12);
+ let b: i16x4 = i16x4::new(10, 10, 10, 10);
+ let e: i32x4 = i32x4::new(9, 8, 1, 2);
+ let r: i32x4 = transmute(vabdl_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabdl_s32() {
+ let a: i32x2 = i32x2::new(1, 11);
+ let b: i32x2 = i32x2::new(10, 10);
+ let e: i64x2 = i64x2::new(9, 1);
+ let r: i64x2 = transmute(vabdl_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
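+    // vceq: lane-wise equality compare; equal lanes yield an all-ones mask, unequal lanes all zeros. The polynomial (p8) variants compare the raw bit patterns.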
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_u8() {
+ let a: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u8x8 = u8x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u8x8 = u8x8::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+ let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+ let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_u8() {
+ let a: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF);
+ let b: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u8x16 = u8x16::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0xFF);
+ let b: u8x16 = u8x16::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, 0);
+ let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+ let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_u16() {
+ let a: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03);
+ let b: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u16x4 = u16x4::new(0, 0, 0x02, 0x03);
+ let b: u16x4 = u16x4::new(0, 0xFF_FF, 0x02, 0x04);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0);
+ let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_u16() {
+ let a: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u16x8 = u16x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: u16x8 = u16x8::new(0, 0xFF_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0);
+ let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_u32() {
+ let a: u32x2 = u32x2::new(0, 0x01);
+ let b: u32x2 = u32x2::new(0, 0x01);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u32x2 = u32x2::new(0, 0);
+ let b: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_u32() {
+ let a: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03);
+ let b: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: u32x4 = u32x4::new(0, 0, 0x02, 0x03);
+ let b: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0x02, 0x04);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
+ let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_s8() {
+ let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+ let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+ let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_s8() {
+ let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+ let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F);
+ let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128);
+ let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+ let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_s16() {
+ let a: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03);
+ let b: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i16x4 = i16x4::new(-32768, -32768, 0x02, 0x03);
+ let b: i16x4 = i16x4::new(-32768, 0x7F_FF, 0x02, 0x04);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0);
+ let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_s16() {
+ let a: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i16x8 = i16x8::new(-32768, -32768, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i16x8 = i16x8::new(-32768, 0x7F_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0);
+ let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, 0x01);
+ let b: i32x2 = i32x2::new(-2147483648, 0x01);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i32x2 = i32x2::new(-2147483648, -2147483648);
+ let b: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03);
+ let b: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i32x4 = i32x4::new(-2147483648, -2147483648, 0x02, 0x03);
+ let b: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, 0x02, 0x04);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
+ let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_p8() {
+ let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+ let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08);
+ let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+ let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_p8() {
+ let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+ let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+
+ let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F);
+ let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128);
+ let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+ let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_f32() {
+ let a: f32x2 = f32x2::new(1.2, 3.4);
+ let b: f32x2 = f32x2::new(1.2, 3.4);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vceq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_f32() {
+ let a: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8);
+ let b: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vceqq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
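+    // vtst: lane-wise bit test; a result lane is all ones when (a & b) != 0 and all zeros when the lanes share no set bits.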
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_s8() {
+ let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vtst_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_s8() {
+ let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+ let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+ let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vtstq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_s16() {
+ let a: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
+ let b: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vtst_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_s16() {
+ let a: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let b: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vtstq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, 0x00);
+ let b: i32x2 = i32x2::new(-2147483648, 0x00);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let r: u32x2 = transmute(vtst_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02);
+ let b: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vtstq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_p8() {
+ let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vtst_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_p8() {
+ let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+ let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F);
+ let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vtstq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_p16() {
+ let a: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
+ let b: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vtst_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_p16() {
+ let a: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let b: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vtstq_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_u8() {
+ let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let b: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u8x8 = u8x8::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vtst_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_u8() {
+ let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF);
+ let b: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF);
+ let e: u8x16 = u8x16::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vtstq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_u16() {
+ let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02);
+ let b: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02);
+ let e: u16x4 = u16x4::new(0, 0, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vtst_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_u16() {
+ let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let b: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
+ let e: u16x8 = u16x8::new(0, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vtstq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtst_u32() {
+ let a: u32x2 = u32x2::new(0, 0x00);
+ let b: u32x2 = u32x2::new(0, 0x00);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vtst_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtstq_u32() {
+ let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02);
+ let b: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02);
+ let e: u32x4 = u32x4::new(0, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vtstq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
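+    // vabs: lane-wise floating-point absolute value.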
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabs_f32() {
+ let a: f32x2 = f32x2::new(-0.1, -2.2);
+ let e: f32x2 = f32x2::new(0.1, 2.2);
+ let r: f32x2 = transmute(vabs_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabsq_f32() {
+ let a: f32x4 = f32x4::new(-0.1, -2.2, -3.3, -6.6);
+ let e: f32x4 = f32x4::new(0.1, 2.2, 3.3, 6.6);
+ let r: f32x4 = transmute(vabsq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
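+    // vcgt: lane-wise "greater than" compare returning all-ones/all-zeros masks.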
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vcgt_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcgtq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vcgt_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcgtq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(0, 1);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcgt_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgtq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vcgt_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcgtq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vcgt_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcgtq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(0, 1);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcgt_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgtq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_f32() {
+ let a: f32x2 = f32x2::new(1.2, 2.3);
+ let b: f32x2 = f32x2::new(0.1, 1.2);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcgt_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_f32() {
+ let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
+ let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgtq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
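+    // vclt: lane-wise "less than" compare.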
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_s8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vclt_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcltq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vclt_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcltq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vclt_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcltq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_u8() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vclt_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_u8() {
+ let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcltq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vclt_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcltq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vclt_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcltq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_f32() {
+ let a: f32x2 = f32x2::new(0.1, 1.2);
+ let b: f32x2 = f32x2::new(1.2, 2.3);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vclt_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_f32() {
+ let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
+ let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcltq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
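+    // vcle: lane-wise "less than or equal" compare.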
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_s8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vcle_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcleq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vcle_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcleq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcle_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcleq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_u8() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vcle_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_u8() {
+ let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcleq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vcle_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcleq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcle_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcleq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_f32() {
+ let a: f32x2 = f32x2::new(0.1, 1.2);
+ let b: f32x2 = f32x2::new(1.2, 2.3);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcle_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_f32() {
+ let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
+ let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcleq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
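+    // vcge: lane-wise "greater than or equal" compare.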
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vcge_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcgeq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vcge_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcgeq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(0, 1);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcge_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgeq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vcge_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x16 = transmute(vcgeq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vcge_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x8 = transmute(vcgeq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(0, 1);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcge_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgeq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_f32() {
+ let a: f32x2 = f32x2::new(1.2, 2.3);
+ let b: f32x2 = f32x2::new(0.1, 1.2);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcge_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_f32() {
+ let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
+ let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcgeq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
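+    // vcls: count leading sign bits, i.e. how many bits below the sign bit equal it; the _u variants reinterpret the input as signed, so both 0x00 and 0xFF count 7 in 8-bit lanes.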
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcls_s8() {
+ let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x8 = i8x8::new(0, 7, 7, 7, 7, 7, 7, 7);
+ let r: i8x8 = transmute(vcls_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclsq_s8() {
+ let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F);
+ let e: i8x16 = i8x16::new(0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
+ let r: i8x16 = transmute(vclsq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcls_s16() {
+ let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x00);
+ let e: i16x4 = i16x4::new(0, 15, 15, 15);
+ let r: i16x4 = transmute(vcls_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclsq_s16() {
+ let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i16x8 = i16x8::new(0, 15, 15, 15, 15, 15, 15, 15);
+ let r: i16x8 = transmute(vclsq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcls_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, -1);
+ let e: i32x2 = i32x2::new(0, 31);
+ let r: i32x2 = transmute(vcls_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclsq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x00);
+ let e: i32x4 = i32x4::new(0, 31, 31, 31);
+ let r: i32x4 = transmute(vclsq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcls_u8() {
+ let a: u8x8 = u8x8::new(0, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i8x8 = i8x8::new(7, 7, 7, 7, 7, 7, 7, 7);
+ let r: i8x8 = transmute(vcls_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclsq_u8() {
+ let a: u8x16 = u8x16::new(0, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF);
+ let e: i8x16 = i8x16::new(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7);
+ let r: i8x16 = transmute(vclsq_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcls_u16() {
+ let a: u16x4 = u16x4::new(0, 0xFF_FF, 0x00, 0x00);
+ let e: i16x4 = i16x4::new(15, 15, 15, 15);
+ let r: i16x4 = transmute(vcls_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclsq_u16() {
+ let a: u16x8 = u16x8::new(0, 0xFF_FF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ let e: i16x8 = i16x8::new(15, 15, 15, 15, 15, 15, 15, 15);
+ let r: i16x8 = transmute(vclsq_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcls_u32() {
+ let a: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+ let e: i32x2 = i32x2::new(31, 31);
+ let r: i32x2 = transmute(vcls_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclsq_u32() {
+ let a: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0x00, 0x00);
+ let e: i32x4 = i32x4::new(31, 31, 31, 31);
+ let r: i32x4 = transmute(vclsq_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
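+    // vclz: count leading zero bits in each lane (an all-zero lane counts the full lane width).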
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclz_s8() {
+ let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01);
+ let e: i8x8 = i8x8::new(0, 0, 8, 7, 7, 7, 7, 7);
+ let r: i8x8 = transmute(vclz_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclzq_s8() {
+ let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x7F);
+ let e: i8x16 = i8x16::new(0, 0, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1);
+ let r: i8x16 = transmute(vclzq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclz_s16() {
+ let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01);
+ let e: i16x4 = i16x4::new(0, 0, 16, 15);
+ let r: i16x4 = transmute(vclz_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclzq_s16() {
+ let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01);
+ let e: i16x8 = i16x8::new(0, 0, 16, 15, 15, 15, 15, 15);
+ let r: i16x8 = transmute(vclzq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclz_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, -1);
+ let e: i32x2 = i32x2::new(0, 0);
+ let r: i32x2 = transmute(vclz_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclzq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01);
+ let e: i32x4 = i32x4::new(0, 0, 32, 31);
+ let r: i32x4 = transmute(vclzq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclz_u8() {
+ let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01);
+ let e: u8x8 = u8x8::new(8, 8, 7, 7, 7, 7, 7, 7);
+ let r: u8x8 = transmute(vclz_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclzq_u8() {
+ let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xFF);
+ let e: u8x16 = u8x16::new(8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
+ let r: u8x16 = transmute(vclzq_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclz_u16() {
+ let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x01);
+ let e: u16x4 = u16x4::new(16, 16, 15, 15);
+ let r: u16x4 = transmute(vclz_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclzq_u16() {
+ let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01);
+ let e: u16x8 = u16x8::new(16, 16, 15, 15, 15, 15, 15, 15);
+ let r: u16x8 = transmute(vclzq_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclz_u32() {
+ let a: u32x2 = u32x2::new(0, 0x00);
+ let e: u32x2 = u32x2::new(32, 32);
+ let r: u32x2 = transmute(vclz_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclzq_u32() {
+ let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x01);
+ let e: u32x4 = u32x4::new(32, 32, 31, 31);
+ let r: u32x4 = transmute(vclzq_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
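+    // vcagt/vcage/vcalt/vcale: floating-point compares on absolute values, e.g. vcagt sets a lane to all ones when |a| > |b|.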
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcagt_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 0.0);
+ let b: f32x2 = f32x2::new(-1.1, 0.0);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let r: u32x2 = transmute(vcagt_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcagtq_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+ let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
+ let r: u32x4 = transmute(vcagtq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcage_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 0.0);
+ let b: f32x2 = f32x2::new(-1.1, 0.0);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcage_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcageq_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+ let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0);
+ let r: u32x4 = transmute(vcageq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcalt_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 0.0);
+ let b: f32x2 = f32x2::new(-1.1, 0.0);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vcalt_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaltq_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+ let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+ let e: u32x4 = u32x4::new(0, 0, 0, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcaltq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcale_f32() {
+ let a: f32x2 = f32x2::new(-1.2, 0.0);
+ let b: f32x2 = f32x2::new(-1.1, 0.0);
+ let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vcale_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcaleq_f32() {
+ let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+ let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+ let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF);
+ let r: u32x4 = transmute(vcaleq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
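+    // vcreate: build a 64-bit vector from a u64 bit pattern; lane 0 comes from the least significant bits, so 1u64 sets only lane 0.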
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_s8() {
+ let a: u64 = 1;
+ let e: i8x8 = i8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vcreate_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_s16() {
+ let a: u64 = 1;
+ let e: i16x4 = i16x4::new(1, 0, 0, 0);
+ let r: i16x4 = transmute(vcreate_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_s32() {
+ let a: u64 = 1;
+ let e: i32x2 = i32x2::new(1, 0);
+ let r: i32x2 = transmute(vcreate_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_s64() {
+ let a: u64 = 1;
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vcreate_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_u8() {
+ let a: u64 = 1;
+ let e: u8x8 = u8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vcreate_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_u16() {
+ let a: u64 = 1;
+ let e: u16x4 = u16x4::new(1, 0, 0, 0);
+ let r: u16x4 = transmute(vcreate_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_u32() {
+ let a: u64 = 1;
+ let e: u32x2 = u32x2::new(1, 0);
+ let r: u32x2 = transmute(vcreate_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_u64() {
+ let a: u64 = 1;
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vcreate_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_p8() {
+ let a: u64 = 1;
+ let e: i8x8 = i8x8::new(1, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vcreate_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_p16() {
+ let a: u64 = 1;
+ let e: i16x4 = i16x4::new(1, 0, 0, 0);
+ let r: i16x4 = transmute(vcreate_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_p64() {
+ let a: u64 = 1;
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vcreate_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcreate_f32() {
+ let a: u64 = 0;
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vcreate_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
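+    // vcvt: convert between integer and floating-point lanes. Float-to-int conversions round toward zero, and the _n variants scale by 2^N (int-to-float divides, float-to-int multiplies).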
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_f32_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let e: f32x2 = f32x2::new(1., 2.);
+ let r: f32x2 = transmute(vcvt_f32_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_f32_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let r: f32x4 = transmute(vcvtq_f32_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_f32_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let e: f32x2 = f32x2::new(1., 2.);
+ let r: f32x2 = transmute(vcvt_f32_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_f32_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let r: f32x4 = transmute(vcvtq_f32_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_n_f32_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let e: f32x2 = f32x2::new(0.25, 0.5);
+ let r: f32x2 = transmute(vcvt_n_f32_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_n_f32_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.);
+ let r: f32x4 = transmute(vcvtq_n_f32_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_n_f32_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let e: f32x2 = f32x2::new(0.25, 0.5);
+ let r: f32x2 = transmute(vcvt_n_f32_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_n_f32_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.);
+ let r: f32x4 = transmute(vcvtq_n_f32_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_n_s32_f32() {
+ let a: f32x2 = f32x2::new(0.25, 0.5);
+ let e: i32x2 = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vcvt_n_s32_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_n_s32_f32() {
+ let a: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.);
+ let e: i32x4 = i32x4::new(1, 2, 3, 4);
+ let r: i32x4 = transmute(vcvtq_n_s32_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_n_u32_f32() {
+ let a: f32x2 = f32x2::new(0.25, 0.5);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vcvt_n_u32_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_n_u32_f32() {
+ let a: f32x4 = f32x4::new(0.25, 0.5, 0.75, 1.);
+ let e: u32x4 = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vcvtq_n_u32_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_s32_f32() {
+ let a: f32x2 = f32x2::new(-1.1, 2.1);
+ let e: i32x2 = i32x2::new(-1, 2);
+ let r: i32x2 = transmute(vcvt_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_s32_f32() {
+ let a: f32x4 = f32x4::new(-1.1, 2.1, -2.9, 3.9);
+ let e: i32x4 = i32x4::new(-1, 2, -2, 3);
+ let r: i32x4 = transmute(vcvtq_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvt_u32_f32() {
+ let a: f32x2 = f32x2::new(1.1, 2.1);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vcvt_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_u32_f32() {
+ let a: f32x4 = f32x4::new(1.1, 2.1, 2.9, 3.9);
+ let e: u32x4 = u32x4::new(1, 2, 2, 3);
+ let r: u32x4 = transmute(vcvtq_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
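+ // The `vdup*_lane*` intrinsics broadcast the lane selected by the const
+ // generic to every lane of the result; each input below holds 1 in the
+ // selected lane and other values elsewhere, so a wrong lane index would
+ // show up in the expected all-ones result.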
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_s8() {
+ let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i8x8 = transmute(vdup_lane_s8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_s8() {
+ let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i8x16 = transmute(vdupq_laneq_s8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 4);
+ let e: i16x4 = i16x4::new(1, 1, 1, 1);
+ let r: i16x4 = transmute(vdup_lane_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i16x8 = transmute(vdupq_laneq_s16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let e: i32x2 = i32x2::new(1, 1);
+ let r: i32x2 = transmute(vdup_lane_s32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 4);
+ let e: i32x4 = i32x4::new(1, 1, 1, 1);
+ let r: i32x4 = transmute(vdupq_laneq_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_s8() {
+ let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i8x8 = transmute(vdup_laneq_s8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i16x4 = i16x4::new(1, 1, 1, 1);
+ let r: i16x4 = transmute(vdup_laneq_s16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 4);
+ let e: i32x2 = i32x2::new(1, 1);
+ let r: i32x2 = transmute(vdup_laneq_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_s8() {
+ let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i8x16 = transmute(vdupq_lane_s8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 4);
+ let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i16x8 = transmute(vdupq_lane_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let e: i32x4 = i32x4::new(1, 1, 1, 1);
+ let r: i32x4 = transmute(vdupq_lane_s32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_u8() {
+ let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: u8x8 = transmute(vdup_lane_u8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_u8() {
+ let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r: u8x16 = transmute(vdupq_laneq_u8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_u16() {
+ let a: u16x4 = u16x4::new(1, 1, 1, 4);
+ let e: u16x4 = u16x4::new(1, 1, 1, 1);
+ let r: u16x4 = transmute(vdup_lane_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_u16() {
+ let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: u16x8 = transmute(vdupq_laneq_u16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_u32() {
+ let a: u32x2 = u32x2::new(1, 1);
+ let e: u32x2 = u32x2::new(1, 1);
+ let r: u32x2 = transmute(vdup_lane_u32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_u32() {
+ let a: u32x4 = u32x4::new(1, 1, 1, 4);
+ let e: u32x4 = u32x4::new(1, 1, 1, 1);
+ let r: u32x4 = transmute(vdupq_laneq_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_u8() {
+ let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: u8x8 = transmute(vdup_laneq_u8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_u16() {
+ let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: u16x4 = u16x4::new(1, 1, 1, 1);
+ let r: u16x4 = transmute(vdup_laneq_u16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_u32() {
+ let a: u32x4 = u32x4::new(1, 1, 1, 4);
+ let e: u32x2 = u32x2::new(1, 1);
+ let r: u32x2 = transmute(vdup_laneq_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_u8() {
+ let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r: u8x16 = transmute(vdupq_lane_u8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_u16() {
+ let a: u16x4 = u16x4::new(1, 1, 1, 4);
+ let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: u16x8 = transmute(vdupq_lane_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_u32() {
+ let a: u32x2 = u32x2::new(1, 1);
+ let e: u32x4 = u32x4::new(1, 1, 1, 1);
+ let r: u32x4 = transmute(vdupq_lane_u32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_p8() {
+ let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i8x8 = transmute(vdup_lane_p8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_p8() {
+ let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i8x16 = transmute(vdupq_laneq_p8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_p16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 4);
+ let e: i16x4 = i16x4::new(1, 1, 1, 1);
+ let r: i16x4 = transmute(vdup_lane_p16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_p16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i16x8 = transmute(vdupq_laneq_p16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_p8() {
+ let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i8x8 = transmute(vdup_laneq_p8::<8>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_p16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i16x4 = i16x4::new(1, 1, 1, 1);
+ let r: i16x4 = transmute(vdup_laneq_p16::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_p8() {
+ let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+ let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i8x16 = transmute(vdupq_lane_p8::<4>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_p16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 4);
+ let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i16x8 = transmute(vdupq_lane_p16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_s64() {
+ let a: i64x2 = i64x2::new(1, 1);
+ let e: i64x2 = i64x2::new(1, 1);
+ let r: i64x2 = transmute(vdupq_laneq_s64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let e: i64x2 = i64x2::new(1, 1);
+ let r: i64x2 = transmute(vdupq_lane_s64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_u64() {
+ let a: u64x2 = u64x2::new(1, 1);
+ let e: u64x2 = u64x2::new(1, 1);
+ let r: u64x2 = transmute(vdupq_laneq_u64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let e: u64x2 = u64x2::new(1, 1);
+ let r: u64x2 = transmute(vdupq_lane_u64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_f32() {
+ let a: f32x2 = f32x2::new(1., 1.);
+ let e: f32x2 = f32x2::new(1., 1.);
+ let r: f32x2 = transmute(vdup_lane_f32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., 1., 1., 4.);
+ let e: f32x4 = f32x4::new(1., 1., 1., 1.);
+ let r: f32x4 = transmute(vdupq_laneq_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., 1., 1., 4.);
+ let e: f32x2 = f32x2::new(1., 1.);
+ let r: f32x2 = transmute(vdup_laneq_f32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_lane_f32() {
+ let a: f32x2 = f32x2::new(1., 1.);
+ let e: f32x4 = f32x4::new(1., 1., 1., 1.);
+ let r: f32x4 = transmute(vdupq_lane_f32::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vdup_lane_s64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_lane_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vdup_lane_u64::<0>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vdup_laneq_s64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_laneq_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vdup_laneq_u64::<1>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
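+ // The `vext*` intrinsics concatenate `a` and `b` and extract consecutive
+ // lanes starting at the const index N, i.e. the result is lanes N..N+LEN
+ // of `a:b`.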
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_s8() {
+ let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+ let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+ let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+ let r: i8x8 = transmute(vext_s8::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_s8() {
+ let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15);
+ let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11);
+ let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19);
+ let r: i8x16 = transmute(vextq_s8::<8>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_s16() {
+ let a: i16x4 = i16x4::new(0, 8, 8, 9);
+ let b: i16x4 = i16x4::new(9, 11, 14, 15);
+ let e: i16x4 = i16x4::new(8, 9, 9, 11);
+ let r: i16x4 = transmute(vext_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_s16() {
+ let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+ let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+ let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+ let r: i16x8 = transmute(vextq_s16::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_s32() {
+ let a: i32x2 = i32x2::new(0, 8);
+ let b: i32x2 = i32x2::new(9, 11);
+ let e: i32x2 = i32x2::new(8, 9);
+ let r: i32x2 = transmute(vext_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_s32() {
+ let a: i32x4 = i32x4::new(0, 8, 8, 9);
+ let b: i32x4 = i32x4::new(9, 11, 14, 15);
+ let e: i32x4 = i32x4::new(8, 9, 9, 11);
+ let r: i32x4 = transmute(vextq_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_u8() {
+ let a: u8x8 = u8x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+ let b: u8x8 = u8x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+ let e: u8x8 = u8x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+ let r: u8x8 = transmute(vext_u8::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_u8() {
+ let a: u8x16 = u8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15);
+ let b: u8x16 = u8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11);
+ let e: u8x16 = u8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19);
+ let r: u8x16 = transmute(vextq_u8::<8>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_u16() {
+ let a: u16x4 = u16x4::new(0, 8, 8, 9);
+ let b: u16x4 = u16x4::new(9, 11, 14, 15);
+ let e: u16x4 = u16x4::new(8, 9, 9, 11);
+ let r: u16x4 = transmute(vext_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_u16() {
+ let a: u16x8 = u16x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+ let b: u16x8 = u16x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+ let e: u16x8 = u16x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+ let r: u16x8 = transmute(vextq_u16::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_u32() {
+ let a: u32x2 = u32x2::new(0, 8);
+ let b: u32x2 = u32x2::new(9, 11);
+ let e: u32x2 = u32x2::new(8, 9);
+ let r: u32x2 = transmute(vext_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_u32() {
+ let a: u32x4 = u32x4::new(0, 8, 8, 9);
+ let b: u32x4 = u32x4::new(9, 11, 14, 15);
+ let e: u32x4 = u32x4::new(8, 9, 9, 11);
+ let r: u32x4 = transmute(vextq_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_p8() {
+ let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+ let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+ let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+ let r: i8x8 = transmute(vext_p8::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_p8() {
+ let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15);
+ let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11);
+ let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19);
+ let r: i8x16 = transmute(vextq_p8::<8>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_p16() {
+ let a: i16x4 = i16x4::new(0, 8, 8, 9);
+ let b: i16x4 = i16x4::new(9, 11, 14, 15);
+ let e: i16x4 = i16x4::new(8, 9, 9, 11);
+ let r: i16x4 = transmute(vext_p16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_p16() {
+ let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+ let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+ let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+ let r: i16x8 = transmute(vextq_p16::<4>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_s64() {
+ let a: i64x2 = i64x2::new(0, 8);
+ let b: i64x2 = i64x2::new(9, 11);
+ let e: i64x2 = i64x2::new(8, 9);
+ let r: i64x2 = transmute(vextq_s64::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_u64() {
+ let a: u64x2 = u64x2::new(0, 8);
+ let b: u64x2 = u64x2::new(9, 11);
+ let e: u64x2 = u64x2::new(8, 9);
+ let r: u64x2 = transmute(vextq_u64::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_f32() {
+ let a: f32x2 = f32x2::new(0., 2.);
+ let b: f32x2 = f32x2::new(3., 4.);
+ let e: f32x2 = f32x2::new(2., 3.);
+ let r: f32x2 = transmute(vext_f32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vextq_f32() {
+ let a: f32x4 = f32x4::new(0., 2., 2., 3.);
+ let b: f32x4 = f32x4::new(3., 4., 5., 6.);
+ let e: f32x4 = f32x4::new(2., 3., 3., 4.);
+ let r: f32x4 = transmute(vextq_f32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
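+ // `vmla*` is element-wise multiply-accumulate: each result lane is
+ // `a + b * c` (e.g. 0 + 2 * 3 = 6 below).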
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_s8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: i8x8 = transmute(vmla_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i8x16 = i8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
+ let e: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21);
+ let r: i8x16 = transmute(vmlaq_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(3, 3, 3, 3);
+ let e: i16x4 = i16x4::new(6, 7, 8, 9);
+ let r: i16x4 = transmute(vmla_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: i16x8 = transmute(vmlaq_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x2 = i32x2::new(3, 3);
+ let e: i32x2 = i32x2::new(6, 7);
+ let r: i32x2 = transmute(vmla_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32x4 = i32x4::new(3, 3, 3, 3);
+ let e: i32x4 = i32x4::new(6, 7, 8, 9);
+ let r: i32x4 = transmute(vmlaq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_u8() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: u8x8 = transmute(vmla_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_u8() {
+ let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u8x16 = u8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
+ let e: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21);
+ let r: u8x16 = transmute(vmlaq_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(3, 3, 3, 3);
+ let e: u16x4 = u16x4::new(6, 7, 8, 9);
+ let r: u16x4 = transmute(vmla_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: u16x8 = transmute(vmlaq_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x2 = u32x2::new(3, 3);
+ let e: u32x2 = u32x2::new(6, 7);
+ let r: u32x2 = transmute(vmla_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32x4 = u32x4::new(3, 3, 3, 3);
+ let e: u32x4 = u32x4::new(6, 7, 8, 9);
+ let r: u32x4 = transmute(vmlaq_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_f32() {
+ let a: f32x2 = f32x2::new(0., 1.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let c: f32x2 = f32x2::new(3., 3.);
+ let e: f32x2 = f32x2::new(6., 7.);
+ let r: f32x2 = transmute(vmla_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_f32() {
+ let a: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let c: f32x4 = f32x4::new(3., 3., 3., 3.);
+ let e: f32x4 = f32x4::new(6., 7., 8., 9.);
+ let r: f32x4 = transmute(vmlaq_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
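+ // The `_n_` multiply-accumulate forms take a scalar for `c` and broadcast
+ // it across all lanes before computing `a + b * c`.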
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_n_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16 = 3;
+ let e: i16x4 = i16x4::new(6, 7, 8, 9);
+ let r: i16x4 = transmute(vmla_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_n_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16 = 3;
+ let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: i16x8 = transmute(vmlaq_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_n_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32 = 3;
+ let e: i32x2 = i32x2::new(6, 7);
+ let r: i32x2 = transmute(vmla_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_n_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32 = 3;
+ let e: i32x4 = i32x4::new(6, 7, 8, 9);
+ let r: i32x4 = transmute(vmlaq_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_n_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16 = 3;
+ let e: u16x4 = u16x4::new(6, 7, 8, 9);
+ let r: u16x4 = transmute(vmla_n_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_n_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16 = 3;
+ let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: u16x8 = transmute(vmlaq_n_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_n_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32 = 3;
+ let e: u32x2 = u32x2::new(6, 7);
+ let r: u32x2 = transmute(vmla_n_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_n_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32 = 3;
+ let e: u32x4 = u32x4::new(6, 7, 8, 9);
+ let r: u32x4 = transmute(vmlaq_n_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_n_f32() {
+ let a: f32x2 = f32x2::new(0., 1.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let c: f32 = 3.;
+ let e: f32x2 = f32x2::new(6., 7.);
+ let r: f32x2 = transmute(vmla_n_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_n_f32() {
+ let a: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let c: f32 = 3.;
+ let e: f32x4 = f32x4::new(6., 7., 8., 9.);
+ let r: f32x4 = transmute(vmlaq_n_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
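+ // The `_lane_`/`_laneq_` forms multiply by a single lane of `c`, selected
+ // by the const generic (lane 1 = 3 in the tests below).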
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_lane_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(0, 3, 0, 0);
+ let e: i16x4 = i16x4::new(6, 7, 8, 9);
+ let r: i16x4 = transmute(vmla_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_laneq_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: i16x4 = i16x4::new(6, 7, 8, 9);
+ let r: i16x4 = transmute(vmla_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_lane_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(0, 3, 0, 0);
+ let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: i16x8 = transmute(vmlaq_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_laneq_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: i16x8 = transmute(vmlaq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_lane_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x2 = i32x2::new(0, 3);
+ let e: i32x2 = i32x2::new(6, 7);
+ let r: i32x2 = transmute(vmla_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_laneq_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x4 = i32x4::new(0, 3, 0, 0);
+ let e: i32x2 = i32x2::new(6, 7);
+ let r: i32x2 = transmute(vmla_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_lane_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32x2 = i32x2::new(0, 3);
+ let e: i32x4 = i32x4::new(6, 7, 8, 9);
+ let r: i32x4 = transmute(vmlaq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_laneq_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32x4 = i32x4::new(0, 3, 0, 0);
+ let e: i32x4 = i32x4::new(6, 7, 8, 9);
+ let r: i32x4 = transmute(vmlaq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_lane_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(0, 3, 0, 0);
+ let e: u16x4 = u16x4::new(6, 7, 8, 9);
+ let r: u16x4 = transmute(vmla_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_laneq_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: u16x4 = u16x4::new(6, 7, 8, 9);
+ let r: u16x4 = transmute(vmla_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_lane_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(0, 3, 0, 0);
+ let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: u16x8 = transmute(vmlaq_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_laneq_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: u16x8 = transmute(vmlaq_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_lane_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x2 = u32x2::new(0, 3);
+ let e: u32x2 = u32x2::new(6, 7);
+ let r: u32x2 = transmute(vmla_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_laneq_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x4 = u32x4::new(0, 3, 0, 0);
+ let e: u32x2 = u32x2::new(6, 7);
+ let r: u32x2 = transmute(vmla_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_lane_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32x2 = u32x2::new(0, 3);
+ let e: u32x4 = u32x4::new(6, 7, 8, 9);
+ let r: u32x4 = transmute(vmlaq_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_laneq_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32x4 = u32x4::new(0, 3, 0, 0);
+ let e: u32x4 = u32x4::new(6, 7, 8, 9);
+ let r: u32x4 = transmute(vmlaq_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_lane_f32() {
+ let a: f32x2 = f32x2::new(0., 1.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let c: f32x2 = f32x2::new(0., 3.);
+ let e: f32x2 = f32x2::new(6., 7.);
+ let r: f32x2 = transmute(vmla_lane_f32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmla_laneq_f32() {
+ let a: f32x2 = f32x2::new(0., 1.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let c: f32x4 = f32x4::new(0., 3., 0., 0.);
+ let e: f32x2 = f32x2::new(6., 7.);
+ let r: f32x2 = transmute(vmla_laneq_f32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_lane_f32() {
+ let a: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let c: f32x2 = f32x2::new(0., 3.);
+ let e: f32x4 = f32x4::new(6., 7., 8., 9.);
+ let r: f32x4 = transmute(vmlaq_lane_f32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlaq_laneq_f32() {
+ let a: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let c: f32x4 = f32x4::new(0., 3., 0., 0.);
+ let e: f32x4 = f32x4::new(6., 7., 8., 9.);
+ let r: f32x4 = transmute(vmlaq_laneq_f32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
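+ // `vmlal*` is widening multiply-accumulate: `b` and `c` are multiplied in a
+ // wider type and accumulated into `a`, whose lanes are twice their width.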
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_s8() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: i16x8 = transmute(vmlal_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_s16() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(3, 3, 3, 3);
+ let e: i32x4 = i32x4::new(6, 7, 8, 9);
+ let r: i32x4 = transmute(vmlal_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_s32() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x2 = i32x2::new(3, 3);
+ let e: i64x2 = i64x2::new(6, 7);
+ let r: i64x2 = transmute(vmlal_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_u8() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let r: u16x8 = transmute(vmlal_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_u16() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(3, 3, 3, 3);
+ let e: u32x4 = u32x4::new(6, 7, 8, 9);
+ let r: u32x4 = transmute(vmlal_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_u32() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x2 = u32x2::new(3, 3);
+ let e: u64x2 = u64x2::new(6, 7);
+ let r: u64x2 = transmute(vmlal_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_n_s16() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16 = 3;
+ let e: i32x4 = i32x4::new(6, 7, 8, 9);
+ let r: i32x4 = transmute(vmlal_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_n_s32() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32 = 3;
+ let e: i64x2 = i64x2::new(6, 7);
+ let r: i64x2 = transmute(vmlal_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_n_u16() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16 = 3;
+ let e: u32x4 = u32x4::new(6, 7, 8, 9);
+ let r: u32x4 = transmute(vmlal_n_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_n_u32() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32 = 3;
+ let e: u64x2 = u64x2::new(6, 7);
+ let r: u64x2 = transmute(vmlal_n_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_lane_s16() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(0, 3, 0, 0);
+ let e: i32x4 = i32x4::new(6, 7, 8, 9);
+ let r: i32x4 = transmute(vmlal_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_laneq_s16() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: i32x4 = i32x4::new(6, 7, 8, 9);
+ let r: i32x4 = transmute(vmlal_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_lane_s32() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x2 = i32x2::new(0, 3);
+ let e: i64x2 = i64x2::new(6, 7);
+ let r: i64x2 = transmute(vmlal_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_laneq_s32() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x4 = i32x4::new(0, 3, 0, 0);
+ let e: i64x2 = i64x2::new(6, 7);
+ let r: i64x2 = transmute(vmlal_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_lane_u16() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(0, 3, 0, 0);
+ let e: u32x4 = u32x4::new(6, 7, 8, 9);
+ let r: u32x4 = transmute(vmlal_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_laneq_u16() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: u32x4 = u32x4::new(6, 7, 8, 9);
+ let r: u32x4 = transmute(vmlal_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_lane_u32() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x2 = u32x2::new(0, 3);
+ let e: u64x2 = u64x2::new(6, 7);
+ let r: u64x2 = transmute(vmlal_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlal_laneq_u32() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x4 = u32x4::new(0, 3, 0, 0);
+ let e: u64x2 = u64x2::new(6, 7);
+ let r: u64x2 = transmute(vmlal_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
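+ // `vmls*` is element-wise multiply-subtract: each result lane is
+ // `a - b * c`; the `_n_` and `_lane_` variants below mirror the `vmla`
+ // ones.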
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_s8() {
+ let a: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vmls_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_s8() {
+ let a: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i8x16 = i8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vmlsq_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_s16() {
+ let a: i16x4 = i16x4::new(6, 7, 8, 9);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(3, 3, 3, 3);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vmls_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_s16() {
+ let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vmlsq_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_s32() {
+ let a: i32x2 = i32x2::new(6, 7);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x2 = i32x2::new(3, 3);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vmls_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_s32() {
+ let a: i32x4 = i32x4::new(6, 7, 8, 9);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32x4 = i32x4::new(3, 3, 3, 3);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmlsq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_u8() {
+ let a: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u8x8 = transmute(vmls_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_u8() {
+ let a: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21);
+ let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u8x16 = u8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
+ let e: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vmlsq_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_u16() {
+ let a: u16x4 = u16x4::new(6, 7, 8, 9);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(3, 3, 3, 3);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vmls_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_u16() {
+ let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vmlsq_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_u32() {
+ let a: u32x2 = u32x2::new(6, 7);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x2 = u32x2::new(3, 3);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vmls_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_u32() {
+ let a: u32x4 = u32x4::new(6, 7, 8, 9);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32x4 = u32x4::new(3, 3, 3, 3);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmlsq_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_f32() {
+ let a: f32x2 = f32x2::new(6., 7.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let c: f32x2 = f32x2::new(3., 3.);
+ let e: f32x2 = f32x2::new(0., 1.);
+ let r: f32x2 = transmute(vmls_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_f32() {
+ let a: f32x4 = f32x4::new(6., 7., 8., 9.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let c: f32x4 = f32x4::new(3., 3., 3., 3.);
+ let e: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let r: f32x4 = transmute(vmlsq_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_n_s16() {
+ let a: i16x4 = i16x4::new(6, 7, 8, 9);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16 = 3;
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vmls_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_n_s16() {
+ let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16 = 3;
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vmlsq_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_n_s32() {
+ let a: i32x2 = i32x2::new(6, 7);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32 = 3;
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vmls_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_n_s32() {
+ let a: i32x4 = i32x4::new(6, 7, 8, 9);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32 = 3;
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmlsq_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_n_u16() {
+ let a: u16x4 = u16x4::new(6, 7, 8, 9);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16 = 3;
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vmls_n_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_n_u16() {
+ let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16 = 3;
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vmlsq_n_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_n_u32() {
+ let a: u32x2 = u32x2::new(6, 7);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32 = 3;
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vmls_n_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_n_u32() {
+ let a: u32x4 = u32x4::new(6, 7, 8, 9);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32 = 3;
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmlsq_n_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_n_f32() {
+ let a: f32x2 = f32x2::new(6., 7.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let c: f32 = 3.;
+ let e: f32x2 = f32x2::new(0., 1.);
+ let r: f32x2 = transmute(vmls_n_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_n_f32() {
+ let a: f32x4 = f32x4::new(6., 7., 8., 9.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let c: f32 = 3.;
+ let e: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let r: f32x4 = transmute(vmlsq_n_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_lane_s16() {
+ let a: i16x4 = i16x4::new(6, 7, 8, 9);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(0, 3, 0, 0);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vmls_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_laneq_s16() {
+ let a: i16x4 = i16x4::new(6, 7, 8, 9);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vmls_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_lane_s16() {
+ let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(0, 3, 0, 0);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vmlsq_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_laneq_s16() {
+ let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vmlsq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_lane_s32() {
+ let a: i32x2 = i32x2::new(6, 7);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x2 = i32x2::new(0, 3);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vmls_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_laneq_s32() {
+ let a: i32x2 = i32x2::new(6, 7);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x4 = i32x4::new(0, 3, 0, 0);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vmls_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_lane_s32() {
+ let a: i32x4 = i32x4::new(6, 7, 8, 9);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32x2 = i32x2::new(0, 3);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmlsq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_laneq_s32() {
+ let a: i32x4 = i32x4::new(6, 7, 8, 9);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let c: i32x4 = i32x4::new(0, 3, 0, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmlsq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_lane_u16() {
+ let a: u16x4 = u16x4::new(6, 7, 8, 9);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(0, 3, 0, 0);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vmls_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_laneq_u16() {
+ let a: u16x4 = u16x4::new(6, 7, 8, 9);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vmls_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_lane_u16() {
+ let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(0, 3, 0, 0);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vmlsq_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_laneq_u16() {
+ let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vmlsq_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_lane_u32() {
+ let a: u32x2 = u32x2::new(6, 7);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x2 = u32x2::new(0, 3);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vmls_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_laneq_u32() {
+ let a: u32x2 = u32x2::new(6, 7);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x4 = u32x4::new(0, 3, 0, 0);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vmls_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_lane_u32() {
+ let a: u32x4 = u32x4::new(6, 7, 8, 9);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32x2 = u32x2::new(0, 3);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmlsq_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_laneq_u32() {
+ let a: u32x4 = u32x4::new(6, 7, 8, 9);
+ let b: u32x4 = u32x4::new(2, 2, 2, 2);
+ let c: u32x4 = u32x4::new(0, 3, 0, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmlsq_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_lane_f32() {
+ let a: f32x2 = f32x2::new(6., 7.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let c: f32x2 = f32x2::new(0., 3.);
+ let e: f32x2 = f32x2::new(0., 1.);
+ let r: f32x2 = transmute(vmls_lane_f32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmls_laneq_f32() {
+ let a: f32x2 = f32x2::new(6., 7.);
+ let b: f32x2 = f32x2::new(2., 2.);
+ let c: f32x4 = f32x4::new(0., 3., 0., 0.);
+ let e: f32x2 = f32x2::new(0., 1.);
+ let r: f32x2 = transmute(vmls_laneq_f32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_lane_f32() {
+ let a: f32x4 = f32x4::new(6., 7., 8., 9.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let c: f32x2 = f32x2::new(0., 3.);
+ let e: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let r: f32x4 = transmute(vmlsq_lane_f32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsq_laneq_f32() {
+ let a: f32x4 = f32x4::new(6., 7., 8., 9.);
+ let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+ let c: f32x4 = f32x4::new(0., 3., 0., 0.);
+ let e: f32x4 = f32x4::new(0., 1., 2., 3.);
+ let r: f32x4 = transmute(vmlsq_laneq_f32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
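+ // `vmlsl*` is the widening counterpart of `vmls`: `a - b * c` with `b` and
+ // `c` multiplied in a type twice as wide.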
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_s8() {
+ let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vmlsl_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_s16() {
+ let a: i32x4 = i32x4::new(6, 7, 8, 9);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(3, 3, 3, 3);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmlsl_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_s32() {
+ let a: i64x2 = i64x2::new(6, 7);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x2 = i32x2::new(3, 3);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vmlsl_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_u8() {
+ let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+ let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vmlsl_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_u16() {
+ let a: u32x4 = u32x4::new(6, 7, 8, 9);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(3, 3, 3, 3);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmlsl_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_u32() {
+ let a: u64x2 = u64x2::new(6, 7);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x2 = u32x2::new(3, 3);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vmlsl_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_n_s16() {
+ let a: i32x4 = i32x4::new(6, 7, 8, 9);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16 = 3;
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmlsl_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_n_s32() {
+ let a: i64x2 = i64x2::new(6, 7);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32 = 3;
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vmlsl_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_n_u16() {
+ let a: u32x4 = u32x4::new(6, 7, 8, 9);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16 = 3;
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmlsl_n_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_n_u32() {
+ let a: u64x2 = u64x2::new(6, 7);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32 = 3;
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vmlsl_n_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
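+ // Note: the _lane/_laneq variants multiply every lane of b by one lane of
+ // the third vector, selected by const generic index (lane 1 in these
+ // tests); _laneq selects from a quad-width (128-bit) lane source.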
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_lane_s16() {
+ let a: i32x4 = i32x4::new(6, 7, 8, 9);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x4 = i16x4::new(0, 3, 0, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmlsl_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_laneq_s16() {
+ let a: i32x4 = i32x4::new(6, 7, 8, 9);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vmlsl_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_lane_s32() {
+ let a: i64x2 = i64x2::new(6, 7);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x2 = i32x2::new(0, 3);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vmlsl_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_laneq_s32() {
+ let a: i64x2 = i64x2::new(6, 7);
+ let b: i32x2 = i32x2::new(2, 2);
+ let c: i32x4 = i32x4::new(0, 3, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vmlsl_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_lane_u16() {
+ let a: u32x4 = u32x4::new(6, 7, 8, 9);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x4 = u16x4::new(0, 3, 0, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmlsl_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_laneq_u16() {
+ let a: u32x4 = u32x4::new(6, 7, 8, 9);
+ let b: u16x4 = u16x4::new(2, 2, 2, 2);
+ let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vmlsl_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_lane_u32() {
+ let a: u64x2 = u64x2::new(6, 7);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x2 = u32x2::new(0, 3);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vmlsl_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmlsl_laneq_u32() {
+ let a: u64x2 = u64x2::new(6, 7);
+ let b: u32x2 = u32x2::new(2, 2);
+ let c: u32x4 = u32x4::new(0, 3, 0, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vmlsl_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
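+ // Note: vneg_*/vnegq_* negate each lane.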
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vneg_s8() {
+ let a: i8x8 = i8x8::new(0, 1, -1, 2, -2, 3, -3, 4);
+ let e: i8x8 = i8x8::new(0, -1, 1, -2, 2, -3, 3, -4);
+ let r: i8x8 = transmute(vneg_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vnegq_s8() {
+ let a: i8x16 = i8x16::new(0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8);
+ let e: i8x16 = i8x16::new(0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8);
+ let r: i8x16 = transmute(vnegq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vneg_s16() {
+ let a: i16x4 = i16x4::new(0, 1, -1, 2);
+ let e: i16x4 = i16x4::new(0, -1, 1, -2);
+ let r: i16x4 = transmute(vneg_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vnegq_s16() {
+ let a: i16x8 = i16x8::new(0, 1, -1, 2, -2, 3, -3, 4);
+ let e: i16x8 = i16x8::new(0, -1, 1, -2, 2, -3, 3, -4);
+ let r: i16x8 = transmute(vnegq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vneg_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: i32x2 = i32x2::new(0, -1);
+ let r: i32x2 = transmute(vneg_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vnegq_s32() {
+ let a: i32x4 = i32x4::new(0, 1, -1, 2);
+ let e: i32x4 = i32x4::new(0, -1, 1, -2);
+ let r: i32x4 = transmute(vnegq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vneg_f32() {
+ let a: f32x2 = f32x2::new(0., 1.);
+ let e: f32x2 = f32x2::new(0., -1.);
+ let r: f32x2 = transmute(vneg_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vnegq_f32() {
+ let a: f32x4 = f32x4::new(0., 1., -1., 2.);
+ let e: f32x4 = f32x4::new(0., -1., 1., -2.);
+ let r: f32x4 = transmute(vnegq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
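+ // Note: vqneg_*/vqnegq_* negate each lane with saturation, so the minimum
+ // value maps to the maximum instead of wrapping (e.g. i8: -128 -> 127
+ // in the first lane below).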
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqneg_s8() {
+ let a: i8x8 = i8x8::new(-128, 0, 1, -1, 2, -2, 3, -3);
+ let e: i8x8 = i8x8::new(0x7F, 0, -1, 1, -2, 2, -3, 3);
+ let r: i8x8 = transmute(vqneg_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqnegq_s8() {
+ let a: i8x16 = i8x16::new(-128, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7);
+ let e: i8x16 = i8x16::new(0x7F, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
+ let r: i8x16 = transmute(vqnegq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqneg_s16() {
+ let a: i16x4 = i16x4::new(-32768, 0, 1, -1);
+ let e: i16x4 = i16x4::new(0x7F_FF, 0, -1, 1);
+ let r: i16x4 = transmute(vqneg_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqnegq_s16() {
+ let a: i16x8 = i16x8::new(-32768, 0, 1, -1, 2, -2, 3, -3);
+ let e: i16x8 = i16x8::new(0x7F_FF, 0, -1, 1, -2, 2, -3, 3);
+ let r: i16x8 = transmute(vqnegq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqneg_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, 0);
+ let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0);
+ let r: i32x2 = transmute(vqneg_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqnegq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, 0, 1, -1);
+ let e: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0, -1, 1);
+ let r: i32x4 = transmute(vqnegq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
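+ // Note: vqsub_*/vqsubq_* subtract with saturation: results are clamped to
+ // the representable range of the element type instead of wrapping.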
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_u8() {
+ let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(41, 40, 39, 38, 37, 36, 35, 34);
+ let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_u8() {
+ let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26);
+ let r: u8x16 = transmute(vqsubq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_u16() {
+ let a: u16x4 = u16x4::new(42, 42, 42, 42);
+ let b: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(41, 40, 39, 38);
+ let r: u16x4 = transmute(vqsub_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_u16() {
+ let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(41, 40, 39, 38, 37, 36, 35, 34);
+ let r: u16x8 = transmute(vqsubq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_u32() {
+ let a: u32x2 = u32x2::new(42, 42);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(41, 40);
+ let r: u32x2 = transmute(vqsub_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_u32() {
+ let a: u32x4 = u32x4::new(42, 42, 42, 42);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(41, 40, 39, 38);
+ let r: u32x4 = transmute(vqsubq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_u64() {
+ let a: u64x1 = u64x1::new(42);
+ let b: u64x1 = u64x1::new(1);
+ let e: u64x1 = u64x1::new(41);
+ let r: u64x1 = transmute(vqsub_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_u64() {
+ let a: u64x2 = u64x2::new(42, 42);
+ let b: u64x2 = u64x2::new(1, 2);
+ let e: u64x2 = u64x2::new(41, 40);
+ let r: u64x2 = transmute(vqsubq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_s8() {
+ let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(41, 40, 39, 38, 37, 36, 35, 34);
+ let r: i8x8 = transmute(vqsub_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_s8() {
+ let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26);
+ let r: i8x16 = transmute(vqsubq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_s16() {
+ let a: i16x4 = i16x4::new(42, 42, 42, 42);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i16x4 = i16x4::new(41, 40, 39, 38);
+ let r: i16x4 = transmute(vqsub_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_s16() {
+ let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(41, 40, 39, 38, 37, 36, 35, 34);
+ let r: i16x8 = transmute(vqsubq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_s32() {
+ let a: i32x2 = i32x2::new(42, 42);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i32x2 = i32x2::new(41, 40);
+ let r: i32x2 = transmute(vqsub_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_s32() {
+ let a: i32x4 = i32x4::new(42, 42, 42, 42);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(41, 40, 39, 38);
+ let r: i32x4 = transmute(vqsubq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_s64() {
+ let a: i64x1 = i64x1::new(42);
+ let b: i64x1 = i64x1::new(1);
+ let e: i64x1 = i64x1::new(41);
+ let r: i64x1 = transmute(vqsub_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_s64() {
+ let a: i64x2 = i64x2::new(42, 42);
+ let b: i64x2 = i64x2::new(1, 2);
+ let e: i64x2 = i64x2::new(41, 40);
+ let r: i64x2 = transmute(vqsubq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
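+ // Note: vhadd_*/vhaddq_* compute the halving add (a + b) >> 1 without
+ // intermediate overflow, dropping the low bit of the sum
+ // (e.g. (42 + 1) >> 1 == 21 below).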
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_u8() {
+ let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(21, 22, 22, 23, 23, 24, 24, 25);
+ let r: u8x8 = transmute(vhadd_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_u8() {
+ let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29);
+ let r: u8x16 = transmute(vhaddq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_u16() {
+ let a: u16x4 = u16x4::new(42, 42, 42, 42);
+ let b: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(21, 22, 22, 23);
+ let r: u16x4 = transmute(vhadd_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_u16() {
+ let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(21, 22, 22, 23, 23, 24, 24, 25);
+ let r: u16x8 = transmute(vhaddq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_u32() {
+ let a: u32x2 = u32x2::new(42, 42);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(21, 22);
+ let r: u32x2 = transmute(vhadd_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_u32() {
+ let a: u32x4 = u32x4::new(42, 42, 42, 42);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(21, 22, 22, 23);
+ let r: u32x4 = transmute(vhaddq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_s8() {
+ let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(21, 22, 22, 23, 23, 24, 24, 25);
+ let r: i8x8 = transmute(vhadd_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_s8() {
+ let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29);
+ let r: i8x16 = transmute(vhaddq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_s16() {
+ let a: i16x4 = i16x4::new(42, 42, 42, 42);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i16x4 = i16x4::new(21, 22, 22, 23);
+ let r: i16x4 = transmute(vhadd_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_s16() {
+ let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(21, 22, 22, 23, 23, 24, 24, 25);
+ let r: i16x8 = transmute(vhaddq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_s32() {
+ let a: i32x2 = i32x2::new(42, 42);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i32x2 = i32x2::new(21, 22);
+ let r: i32x2 = transmute(vhadd_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_s32() {
+ let a: i32x4 = i32x4::new(42, 42, 42, 42);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(21, 22, 22, 23);
+ let r: i32x4 = transmute(vhaddq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
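+ // Note: vrhadd_*/vrhaddq_* are the rounding halving add (a + b + 1) >> 1,
+ // so (42 + 1 + 1) >> 1 == 22 below, versus 21 for vhadd above.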
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_u8() {
+ let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(22, 22, 23, 23, 24, 24, 25, 25);
+ let r: u8x8 = transmute(vrhadd_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_u8() {
+ let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29);
+ let r: u8x16 = transmute(vrhaddq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_u16() {
+ let a: u16x4 = u16x4::new(42, 42, 42, 42);
+ let b: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(22, 22, 23, 23);
+ let r: u16x4 = transmute(vrhadd_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_u16() {
+ let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(22, 22, 23, 23, 24, 24, 25, 25);
+ let r: u16x8 = transmute(vrhaddq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_u32() {
+ let a: u32x2 = u32x2::new(42, 42);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(22, 22);
+ let r: u32x2 = transmute(vrhadd_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_u32() {
+ let a: u32x4 = u32x4::new(42, 42, 42, 42);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(22, 22, 23, 23);
+ let r: u32x4 = transmute(vrhaddq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_s8() {
+ let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(22, 22, 23, 23, 24, 24, 25, 25);
+ let r: i8x8 = transmute(vrhadd_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_s8() {
+ let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29);
+ let r: i8x16 = transmute(vrhaddq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_s16() {
+ let a: i16x4 = i16x4::new(42, 42, 42, 42);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i16x4 = i16x4::new(22, 22, 23, 23);
+ let r: i16x4 = transmute(vrhadd_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_s16() {
+ let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(22, 22, 23, 23, 24, 24, 25, 25);
+ let r: i16x8 = transmute(vrhaddq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_s32() {
+ let a: i32x2 = i32x2::new(42, 42);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i32x2 = i32x2::new(22, 22);
+ let r: i32x2 = transmute(vrhadd_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_s32() {
+ let a: i32x4 = i32x4::new(42, 42, 42, 42);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(22, 22, 23, 23);
+ let r: i32x4 = transmute(vrhaddq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
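+ // Note: vrndn_*/vrndnq_* round to the nearest integral value with ties to
+ // even, as the 2.5 -> 2.0 case below demonstrates.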
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndn_f32() {
+ let a: f32x2 = f32x2::new(-1.5, 0.5);
+ let e: f32x2 = f32x2::new(-2.0, 0.0);
+ let r: f32x2 = transmute(vrndn_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrndnq_f32() {
+ let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
+ let e: f32x4 = f32x4::new(-2.0, 0.0, 2.0, 2.0);
+ let r: f32x4 = transmute(vrndnq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
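+ // Note: vqadd_*/vqaddq_* add with saturation, the counterpart of the
+ // vqsub tests above.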
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_u8() {
+ let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(43, 44, 45, 46, 47, 48, 49, 50);
+ let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_u8() {
+ let a: u8x16 = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58);
+ let r: u8x16 = transmute(vqaddq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_u16() {
+ let a: u16x4 = u16x4::new(42, 42, 42, 42);
+ let b: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(43, 44, 45, 46);
+ let r: u16x4 = transmute(vqadd_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_u16() {
+ let a: u16x8 = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(43, 44, 45, 46, 47, 48, 49, 50);
+ let r: u16x8 = transmute(vqaddq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_u32() {
+ let a: u32x2 = u32x2::new(42, 42);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(43, 44);
+ let r: u32x2 = transmute(vqadd_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_u32() {
+ let a: u32x4 = u32x4::new(42, 42, 42, 42);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(43, 44, 45, 46);
+ let r: u32x4 = transmute(vqaddq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_u64() {
+ let a: u64x1 = u64x1::new(42);
+ let b: u64x1 = u64x1::new(1);
+ let e: u64x1 = u64x1::new(43);
+ let r: u64x1 = transmute(vqadd_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_u64() {
+ let a: u64x2 = u64x2::new(42, 42);
+ let b: u64x2 = u64x2::new(1, 2);
+ let e: u64x2 = u64x2::new(43, 44);
+ let r: u64x2 = transmute(vqaddq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_s8() {
+ let a: i8x8 = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(43, 44, 45, 46, 47, 48, 49, 50);
+ let r: i8x8 = transmute(vqadd_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_s8() {
+ let a: i8x16 = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58);
+ let r: i8x16 = transmute(vqaddq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_s16() {
+ let a: i16x4 = i16x4::new(42, 42, 42, 42);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i16x4 = i16x4::new(43, 44, 45, 46);
+ let r: i16x4 = transmute(vqadd_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_s16() {
+ let a: i16x8 = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(43, 44, 45, 46, 47, 48, 49, 50);
+ let r: i16x8 = transmute(vqaddq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_s32() {
+ let a: i32x2 = i32x2::new(42, 42);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i32x2 = i32x2::new(43, 44);
+ let r: i32x2 = transmute(vqadd_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_s32() {
+ let a: i32x4 = i32x4::new(42, 42, 42, 42);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(43, 44, 45, 46);
+ let r: i32x4 = transmute(vqaddq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_s64() {
+ let a: i64x1 = i64x1::new(42);
+ let b: i64x1 = i64x1::new(1);
+ let e: i64x1 = i64x1::new(43);
+ let r: i64x1 = transmute(vqadd_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_s64() {
+ let a: i64x2 = i64x2::new(42, 42);
+ let b: i64x2 = i64x2::new(1, 2);
+ let e: i64x2 = i64x2::new(43, 44);
+ let r: i64x2 = transmute(vqaddq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
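+ // Note: vld1*_x2/_x3/_x4 load two, three, or four consecutive vectors
+ // from one contiguous memory block. Each test reads through a[1..],
+ // an element-aligned rather than vector-aligned pointer, which appears
+ // intended to exercise the unaligned-load path.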
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s8_x2() {
+ let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i8x8; 2] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [i8x8; 2] = transmute(vld1_s8_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s16_x2() {
+ let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i16x4; 2] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8)];
+ let r: [i16x4; 2] = transmute(vld1_s16_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s32_x2() {
+ let a: [i32; 5] = [0, 1, 2, 3, 4];
+ let e: [i32x2; 2] = [i32x2::new(1, 2), i32x2::new(3, 4)];
+ let r: [i32x2; 2] = transmute(vld1_s32_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s64_x2() {
+ let a: [i64; 3] = [0, 1, 2];
+ let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)];
+ let r: [i64x1; 2] = transmute(vld1_s64_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s8_x2() {
+ let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8x16; 2] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [i8x16; 2] = transmute(vld1q_s8_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s16_x2() {
+ let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i16x8; 2] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [i16x8; 2] = transmute(vld1q_s16_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s32_x2() {
+ let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i32x4; 2] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 7, 8)];
+ let r: [i32x4; 2] = transmute(vld1q_s32_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s64_x2() {
+ let a: [i64; 5] = [0, 1, 2, 3, 4];
+ let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(3, 4)];
+ let r: [i64x2; 2] = transmute(vld1q_s64_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s8_x3() {
+ let a: [i8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [i8x8; 3] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
+ let r: [i8x8; 3] = transmute(vld1_s8_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s16_x3() {
+ let a: [i16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [i16x4; 3] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12)];
+ let r: [i16x4; 3] = transmute(vld1_s16_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s32_x3() {
+ let a: [i32; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [i32x2; 3] = [i32x2::new(1, 2), i32x2::new(3, 4), i32x2::new(5, 6)];
+ let r: [i32x2; 3] = transmute(vld1_s32_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s64_x3() {
+ let a: [i64; 4] = [0, 1, 2, 3];
+ let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(3)];
+ let r: [i64x1; 3] = transmute(vld1_s64_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s8_x3() {
+ let a: [i8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i8x16; 3] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [i8x16; 3] = transmute(vld1q_s8_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s16_x3() {
+ let a: [i16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [i16x8; 3] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
+ let r: [i16x8; 3] = transmute(vld1q_s16_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s32_x3() {
+ let a: [i32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [i32x4; 3] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 7, 8), i32x4::new(9, 10, 11, 12)];
+ let r: [i32x4; 3] = transmute(vld1q_s32_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s64_x3() {
+ let a: [i64; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6)];
+ let r: [i64x2; 3] = transmute(vld1q_s64_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s8_x4() {
+ let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8x8; 4] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24), i8x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [i8x8; 4] = transmute(vld1_s8_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s16_x4() {
+ let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i16x4; 4] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12), i16x4::new(13, 14, 15, 16)];
+ let r: [i16x4; 4] = transmute(vld1_s16_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s32_x4() {
+ let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i32x2; 4] = [i32x2::new(1, 2), i32x2::new(3, 4), i32x2::new(5, 6), i32x2::new(7, 8)];
+ let r: [i32x2; 4] = transmute(vld1_s32_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_s64_x4() {
+ let a: [i64; 5] = [0, 1, 2, 3, 4];
+ let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(3), i64x1::new(4)];
+ let r: [i64x1; 4] = transmute(vld1_s64_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s8_x4() {
+ let a: [i8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8x16; 4] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [i8x16; 4] = transmute(vld1q_s8_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s16_x4() {
+ let a: [i16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i16x8; 4] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24), i16x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [i16x8; 4] = transmute(vld1q_s16_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s32_x4() {
+ let a: [i32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i32x4; 4] = [i32x4::new(1, 2, 3, 4), i32x4::new(5, 6, 7, 8), i32x4::new(9, 10, 11, 12), i32x4::new(13, 14, 15, 16)];
+ let r: [i32x4; 4] = transmute(vld1q_s32_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_s64_x4() {
+ let a: [i64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6), i64x2::new(7, 8)];
+ let r: [i64x2; 4] = transmute(vld1q_s64_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u8_x2() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u8x8; 2] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [u8x8; 2] = transmute(vld1_u8_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u16_x2() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u16x4; 2] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8)];
+ let r: [u16x4; 2] = transmute(vld1_u16_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u32_x2() {
+ let a: [u32; 5] = [0, 1, 2, 3, 4];
+ let e: [u32x2; 2] = [u32x2::new(1, 2), u32x2::new(3, 4)];
+ let r: [u32x2; 2] = transmute(vld1_u32_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u64_x2() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)];
+ let r: [u64x1; 2] = transmute(vld1_u64_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u8_x2() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8x16; 2] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [u8x16; 2] = transmute(vld1q_u8_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u16_x2() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u16x8; 2] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [u16x8; 2] = transmute(vld1q_u16_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u32_x2() {
+ let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u32x4; 2] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8)];
+ let r: [u32x4; 2] = transmute(vld1q_u32_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u64_x2() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let e: [u64x2; 2] = [u64x2::new(1, 2), u64x2::new(3, 4)];
+ let r: [u64x2; 2] = transmute(vld1q_u64_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u8_x3() {
+ let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [u8x8; 3] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16), u8x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
+ let r: [u8x8; 3] = transmute(vld1_u8_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u16_x3() {
+ let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [u16x4; 3] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8), u16x4::new(9, 10, 11, 12)];
+ let r: [u16x4; 3] = transmute(vld1_u16_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u32_x3() {
+ let a: [u32; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [u32x2; 3] = [u32x2::new(1, 2), u32x2::new(3, 4), u32x2::new(5, 6)];
+ let r: [u32x2; 3] = transmute(vld1_u32_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u64_x3() {
+ let a: [u64; 4] = [0, 1, 2, 3];
+ let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(3)];
+ let r: [u64x1; 3] = transmute(vld1_u64_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u8_x3() {
+ let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u8x16; 3] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [u8x16; 3] = transmute(vld1q_u8_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u16_x3() {
+ let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [u16x8; 3] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16), u16x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
+ let r: [u16x8; 3] = transmute(vld1q_u16_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u32_x3() {
+ let a: [u32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [u32x4; 3] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8), u32x4::new(9, 10, 11, 12)];
+ let r: [u32x4; 3] = transmute(vld1q_u32_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u64_x3() {
+ let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [u64x2; 3] = [u64x2::new(1, 2), u64x2::new(3, 4), u64x2::new(5, 6)];
+ let r: [u64x2; 3] = transmute(vld1q_u64_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u8_x4() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8x8; 4] = [u8x8::new(1, 2, 3, 4, 5, 6, 7, 8), u8x8::new(9, 10, 11, 12, 13, 14, 15, 16), u8x8::new(17, 18, 19, 20, 21, 22, 23, 24), u8x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [u8x8; 4] = transmute(vld1_u8_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u16_x4() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u16x4; 4] = [u16x4::new(1, 2, 3, 4), u16x4::new(5, 6, 7, 8), u16x4::new(9, 10, 11, 12), u16x4::new(13, 14, 15, 16)];
+ let r: [u16x4; 4] = transmute(vld1_u16_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u32_x4() {
+ let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u32x2; 4] = [u32x2::new(1, 2), u32x2::new(3, 4), u32x2::new(5, 6), u32x2::new(7, 8)];
+ let r: [u32x2; 4] = transmute(vld1_u32_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_u64_x4() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(3), u64x1::new(4)];
+ let r: [u64x1; 4] = transmute(vld1_u64_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u8_x4() {
+ let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8x16; 4] = [u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), u8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [u8x16; 4] = transmute(vld1q_u8_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u16_x4() {
+ let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u16x8; 4] = [u16x8::new(1, 2, 3, 4, 5, 6, 7, 8), u16x8::new(9, 10, 11, 12, 13, 14, 15, 16), u16x8::new(17, 18, 19, 20, 21, 22, 23, 24), u16x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [u16x8; 4] = transmute(vld1q_u16_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u32_x4() {
+ let a: [u32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u32x4; 4] = [u32x4::new(1, 2, 3, 4), u32x4::new(5, 6, 7, 8), u32x4::new(9, 10, 11, 12), u32x4::new(13, 14, 15, 16)];
+ let r: [u32x4; 4] = transmute(vld1q_u32_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_u64_x4() {
+ let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u64x2; 4] = [u64x2::new(1, 2), u64x2::new(3, 4), u64x2::new(5, 6), u64x2::new(7, 8)];
+ let r: [u64x2; 4] = transmute(vld1q_u64_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p8_x2() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i8x8; 2] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [i8x8; 2] = transmute(vld1_p8_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p8_x3() {
+ let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [i8x8; 3] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
+ let r: [i8x8; 3] = transmute(vld1_p8_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p8_x4() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8x8; 4] = [i8x8::new(1, 2, 3, 4, 5, 6, 7, 8), i8x8::new(9, 10, 11, 12, 13, 14, 15, 16), i8x8::new(17, 18, 19, 20, 21, 22, 23, 24), i8x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [i8x8; 4] = transmute(vld1_p8_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p8_x2() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8x16; 2] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [i8x16; 2] = transmute(vld1q_p8_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p8_x3() {
+ let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i8x16; 3] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [i8x16; 3] = transmute(vld1q_p8_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p8_x4() {
+ let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8x16; 4] = [i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [i8x16; 4] = transmute(vld1q_p8_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p16_x2() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i16x4; 2] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8)];
+ let r: [i16x4; 2] = transmute(vld1_p16_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p16_x3() {
+ let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [i16x4; 3] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12)];
+ let r: [i16x4; 3] = transmute(vld1_p16_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p16_x4() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i16x4; 4] = [i16x4::new(1, 2, 3, 4), i16x4::new(5, 6, 7, 8), i16x4::new(9, 10, 11, 12), i16x4::new(13, 14, 15, 16)];
+ let r: [i16x4; 4] = transmute(vld1_p16_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p16_x2() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i16x8; 2] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16)];
+ let r: [i16x8; 2] = transmute(vld1q_p16_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p16_x3() {
+ let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [i16x8; 3] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24)];
+ let r: [i16x8; 3] = transmute(vld1q_p16_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p16_x4() {
+ let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i16x8; 4] = [i16x8::new(1, 2, 3, 4, 5, 6, 7, 8), i16x8::new(9, 10, 11, 12, 13, 14, 15, 16), i16x8::new(17, 18, 19, 20, 21, 22, 23, 24), i16x8::new(25, 26, 27, 28, 29, 30, 31, 32)];
+ let r: [i16x8; 4] = transmute(vld1q_p16_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p64_x2() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)];
+ let r: [i64x1; 2] = transmute(vld1_p64_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p64_x3() {
+ let a: [u64; 4] = [0, 1, 2, 3];
+ let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(3)];
+ let r: [i64x1; 3] = transmute(vld1_p64_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_p64_x4() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(3), i64x1::new(4)];
+ let r: [i64x1; 4] = transmute(vld1_p64_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p64_x2() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(3, 4)];
+ let r: [i64x2; 2] = transmute(vld1q_p64_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p64_x3() {
+ let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6)];
+ let r: [i64x2; 3] = transmute(vld1q_p64_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_p64_x4() {
+ let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6), i64x2::new(7, 8)];
+ let r: [i64x2; 4] = transmute(vld1q_p64_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_f32_x2() {
+ let a: [f32; 5] = [0., 1., 2., 3., 4.];
+ let e: [f32x2; 2] = [f32x2::new(1., 2.), f32x2::new(3., 4.)];
+ let r: [f32x2; 2] = transmute(vld1_f32_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_f32_x2() {
+ let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
+ let e: [f32x4; 2] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.)];
+ let r: [f32x4; 2] = transmute(vld1q_f32_x2(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_f32_x3() {
+ let a: [f32; 7] = [0., 1., 2., 3., 4., 5., 6.];
+ let e: [f32x2; 3] = [f32x2::new(1., 2.), f32x2::new(3., 4.), f32x2::new(5., 6.)];
+ let r: [f32x2; 3] = transmute(vld1_f32_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_f32_x3() {
+ let a: [f32; 13] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.];
+ let e: [f32x4; 3] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.), f32x4::new(9., 10., 11., 12.)];
+ let r: [f32x4; 3] = transmute(vld1q_f32_x3(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_f32_x4() {
+ let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
+ let e: [f32x2; 4] = [f32x2::new(1., 2.), f32x2::new(3., 4.), f32x2::new(5., 6.), f32x2::new(7., 8.)];
+ let r: [f32x2; 4] = transmute(vld1_f32_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_f32_x4() {
+ let a: [f32; 17] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.];
+ let e: [f32x4; 4] = [f32x4::new(1., 2., 3., 4.), f32x4::new(5., 6., 7., 8.), f32x4::new(9., 10., 11., 12.), f32x4::new(13., 14., 15., 16.)];
+ let r: [f32x4; 4] = transmute(vld1q_f32_x4(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
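+ // Note: vld2* load 2-element structures and de-interleave them:
+ // even-indexed source elements go to the first result vector and
+ // odd-indexed elements to the second.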
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_s8() {
+ let a: [i8; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i8x8; 2] = [i8x8::new(1, 2, 2, 3, 2, 3, 4, 5), i8x8::new(2, 3, 4, 5, 6, 7, 8, 9)];
+ let r: [i8x8; 2] = transmute(vld2_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_s16() {
+ let a: [i16; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5];
+ let e: [i16x4; 2] = [i16x4::new(1, 2, 2, 3), i16x4::new(2, 3, 4, 5)];
+ let r: [i16x4; 2] = transmute(vld2_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_s32() {
+ let a: [i32; 5] = [0, 1, 2, 2, 3];
+ let e: [i32x2; 2] = [i32x2::new(1, 2), i32x2::new(2, 3)];
+ let r: [i32x2; 2] = transmute(vld2_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_s8() {
+ let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [i8x16; 2] = [i8x16::new(1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9), i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)];
+ let r: [i8x16; 2] = transmute(vld2q_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_s16() {
+ let a: [i16; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i16x8; 2] = [i16x8::new(1, 2, 2, 3, 2, 3, 4, 5), i16x8::new(2, 3, 4, 5, 6, 7, 8, 9)];
+ let r: [i16x8; 2] = transmute(vld2q_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_s32() {
+ let a: [i32; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5];
+ let e: [i32x4; 2] = [i32x4::new(1, 2, 2, 3), i32x4::new(2, 3, 4, 5)];
+ let r: [i32x4; 2] = transmute(vld2q_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_s64() {
+ let a: [i64; 3] = [0, 1, 2];
+ let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)];
+ let r: [i64x1; 2] = transmute(vld2_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_u8() {
+ let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u8x8; 2] = [u8x8::new(1, 2, 2, 3, 2, 3, 4, 5), u8x8::new(2, 3, 4, 5, 6, 7, 8, 9)];
+ let r: [u8x8; 2] = transmute(vld2_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_u16() {
+ let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5];
+ let e: [u16x4; 2] = [u16x4::new(1, 2, 2, 3), u16x4::new(2, 3, 4, 5)];
+ let r: [u16x4; 2] = transmute(vld2_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_u32() {
+ let a: [u32; 5] = [0, 1, 2, 2, 3];
+ let e: [u32x2; 2] = [u32x2::new(1, 2), u32x2::new(2, 3)];
+ let r: [u32x2; 2] = transmute(vld2_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_u8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [u8x16; 2] = [u8x16::new(1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9), u8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)];
+ let r: [u8x16; 2] = transmute(vld2q_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_u16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u16x8; 2] = [u16x8::new(1, 2, 2, 3, 2, 3, 4, 5), u16x8::new(2, 3, 4, 5, 6, 7, 8, 9)];
+ let r: [u16x8; 2] = transmute(vld2q_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_u32() {
+ let a: [u32; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5];
+ let e: [u32x4; 2] = [u32x4::new(1, 2, 2, 3), u32x4::new(2, 3, 4, 5)];
+ let r: [u32x4; 2] = transmute(vld2q_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_p8() {
+ let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i8x8; 2] = [i8x8::new(1, 2, 2, 3, 2, 3, 4, 5), i8x8::new(2, 3, 4, 5, 6, 7, 8, 9)];
+ let r: [i8x8; 2] = transmute(vld2_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_p16() {
+ let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5];
+ let e: [i16x4; 2] = [i16x4::new(1, 2, 2, 3), i16x4::new(2, 3, 4, 5)];
+ let r: [i16x4; 2] = transmute(vld2_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_p8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [i8x16; 2] = [i8x16::new(1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9), i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)];
+ let r: [i8x16; 2] = transmute(vld2q_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_p16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i16x8; 2] = [i16x8::new(1, 2, 2, 3, 2, 3, 4, 5), i16x8::new(2, 3, 4, 5, 6, 7, 8, 9)];
+ let r: [i16x8; 2] = transmute(vld2q_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_u64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)];
+ let r: [u64x1; 2] = transmute(vld2_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_p64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)];
+ let r: [i64x1; 2] = transmute(vld2_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_f32() {
+ let a: [f32; 5] = [0., 1., 2., 2., 3.];
+ let e: [f32x2; 2] = [f32x2::new(1., 2.), f32x2::new(2., 3.)];
+ let r: [f32x2; 2] = transmute(vld2_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_f32() {
+ let a: [f32; 9] = [0., 1., 2., 2., 3., 2., 4., 3., 5.];
+ let e: [f32x4; 2] = [f32x4::new(1., 2., 2., 3.), f32x4::new(2., 3., 4., 5.)];
+ let r: [f32x4; 2] = transmute(vld2q_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
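+ // The `vld2_dup_*` tests below cover the duplicate variant of the
+ // de-interleaving loads: one 2-element structure is read from memory,
+ // element 0 is broadcast across every lane of the first result vector and
+ // element 1 across every lane of the second. As in the other tests, the
+ // pointer is taken from `a[1..]`, presumably so the load is also exercised
+ // at an address that is not aligned to the full vector width, and the
+ // opaque `*x2_t` result is `transmute`d to an array of portable simd types
+ // so `assert_eq!` can compare individual lanes.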
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_s8() {
+ let a: [i8; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i8x8; 2] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x8; 2] = transmute(vld2_dup_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_s16() {
+ let a: [i16; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5];
+ let e: [i16x4; 2] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)];
+ let r: [i16x4; 2] = transmute(vld2_dup_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_s32() {
+ let a: [i32; 5] = [0, 1, 1, 2, 3];
+ let e: [i32x2; 2] = [i32x2::new(1, 1), i32x2::new(1, 1)];
+ let r: [i32x2; 2] = transmute(vld2_dup_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_s8() {
+ let a: [i8; 33] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [i8x16; 2] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x16; 2] = transmute(vld2q_dup_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_s16() {
+ let a: [i16; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i16x8; 2] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i16x8; 2] = transmute(vld2q_dup_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_s32() {
+ let a: [i32; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5];
+ let e: [i32x4; 2] = [i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1)];
+ let r: [i32x4; 2] = transmute(vld2q_dup_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_s64() {
+ let a: [i64; 3] = [0, 1, 1];
+ let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(1)];
+ let r: [i64x1; 2] = transmute(vld2_dup_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_u8() {
+ let a: [u8; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u8x8; 2] = [u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u8x8; 2] = transmute(vld2_dup_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_u16() {
+ let a: [u16; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5];
+ let e: [u16x4; 2] = [u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1)];
+ let r: [u16x4; 2] = transmute(vld2_dup_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_u32() {
+ let a: [u32; 5] = [0, 1, 1, 2, 3];
+ let e: [u32x2; 2] = [u32x2::new(1, 1), u32x2::new(1, 1)];
+ let r: [u32x2; 2] = transmute(vld2_dup_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_u8() {
+ let a: [u8; 33] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [u8x16; 2] = [u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u8x16; 2] = transmute(vld2q_dup_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_u16() {
+ let a: [u16; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u16x8; 2] = [u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u16x8; 2] = transmute(vld2q_dup_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_u32() {
+ let a: [u32; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5];
+ let e: [u32x4; 2] = [u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1)];
+ let r: [u32x4; 2] = transmute(vld2q_dup_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_p8() {
+ let a: [u8; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i8x8; 2] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x8; 2] = transmute(vld2_dup_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_p16() {
+ let a: [u16; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5];
+ let e: [i16x4; 2] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)];
+ let r: [i16x4; 2] = transmute(vld2_dup_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_p8() {
+ let a: [u8; 33] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [i8x16; 2] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x16; 2] = transmute(vld2q_dup_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_p16() {
+ let a: [u16; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i16x8; 2] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i16x8; 2] = transmute(vld2q_dup_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_u64() {
+ let a: [u64; 3] = [0, 1, 1];
+ let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(1)];
+ let r: [u64x1; 2] = transmute(vld2_dup_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_p64() {
+ let a: [u64; 3] = [0, 1, 1];
+ let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(1)];
+ let r: [i64x1; 2] = transmute(vld2_dup_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_dup_f32() {
+ let a: [f32; 5] = [0., 1., 1., 2., 3.];
+ let e: [f32x2; 2] = [f32x2::new(1., 1.), f32x2::new(1., 1.)];
+ let r: [f32x2; 2] = transmute(vld2_dup_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_dup_f32() {
+ let a: [f32; 9] = [0., 1., 1., 2., 3., 1., 4., 3., 5.];
+ let e: [f32x4; 2] = [f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.)];
+ let r: [f32x4; 2] = transmute(vld2q_dup_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
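+ // The `vld2_lane_*` tests cover the single-lane loads: one 2-element
+ // structure is read from memory and its two values are inserted into lane
+ // `LANE` (0 in these tests) of the two input vectors `b`; every other lane
+ // is passed through unchanged, so `e` differs from `b` only in lane 0.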
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_s8() {
+ let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x8; 2] = [i8x8::new(0, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i8x8; 2] = [i8x8::new(1, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i8x8; 2] = transmute(vld2_lane_s8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_s16() {
+ let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x4; 2] = [i16x4::new(0, 2, 2, 14), i16x4::new(2, 16, 17, 18)];
+ let e: [i16x4; 2] = [i16x4::new(1, 2, 2, 14), i16x4::new(2, 16, 17, 18)];
+ let r: [i16x4; 2] = transmute(vld2_lane_s16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_s32() {
+ let a: [i32; 5] = [0, 1, 2, 3, 4];
+ let b: [i32x2; 2] = [i32x2::new(0, 2), i32x2::new(2, 14)];
+ let e: [i32x2; 2] = [i32x2::new(1, 2), i32x2::new(2, 14)];
+ let r: [i32x2; 2] = transmute(vld2_lane_s32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_s16() {
+ let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x8; 2] = [i16x8::new(0, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i16x8; 2] = [i16x8::new(1, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i16x8; 2] = transmute(vld2q_lane_s16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_s32() {
+ let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i32x4; 2] = [i32x4::new(0, 2, 2, 14), i32x4::new(2, 16, 17, 18)];
+ let e: [i32x4; 2] = [i32x4::new(1, 2, 2, 14), i32x4::new(2, 16, 17, 18)];
+ let r: [i32x4; 2] = transmute(vld2q_lane_s32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_u8() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u8x8; 2] = [u8x8::new(0, 2, 2, 14, 2, 16, 17, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [u8x8; 2] = [u8x8::new(1, 2, 2, 14, 2, 16, 17, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [u8x8; 2] = transmute(vld2_lane_u8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_u16() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u16x4; 2] = [u16x4::new(0, 2, 2, 14), u16x4::new(2, 16, 17, 18)];
+ let e: [u16x4; 2] = [u16x4::new(1, 2, 2, 14), u16x4::new(2, 16, 17, 18)];
+ let r: [u16x4; 2] = transmute(vld2_lane_u16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_u32() {
+ let a: [u32; 5] = [0, 1, 2, 3, 4];
+ let b: [u32x2; 2] = [u32x2::new(0, 2), u32x2::new(2, 14)];
+ let e: [u32x2; 2] = [u32x2::new(1, 2), u32x2::new(2, 14)];
+ let r: [u32x2; 2] = transmute(vld2_lane_u32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_u16() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u16x8; 2] = [u16x8::new(0, 2, 2, 14, 2, 16, 17, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [u16x8; 2] = [u16x8::new(1, 2, 2, 14, 2, 16, 17, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [u16x8; 2] = transmute(vld2q_lane_u16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_u32() {
+ let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u32x4; 2] = [u32x4::new(0, 2, 2, 14), u32x4::new(2, 16, 17, 18)];
+ let e: [u32x4; 2] = [u32x4::new(1, 2, 2, 14), u32x4::new(2, 16, 17, 18)];
+ let r: [u32x4; 2] = transmute(vld2q_lane_u32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_p8() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x8; 2] = [i8x8::new(0, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i8x8; 2] = [i8x8::new(1, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i8x8; 2] = transmute(vld2_lane_p8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_p16() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x4; 2] = [i16x4::new(0, 2, 2, 14), i16x4::new(2, 16, 17, 18)];
+ let e: [i16x4; 2] = [i16x4::new(1, 2, 2, 14), i16x4::new(2, 16, 17, 18)];
+ let r: [i16x4; 2] = transmute(vld2_lane_p16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_p16() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x8; 2] = [i16x8::new(0, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i16x8; 2] = [i16x8::new(1, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i16x8; 2] = transmute(vld2q_lane_p16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2_lane_f32() {
+ let a: [f32; 5] = [0., 1., 2., 3., 4.];
+ let b: [f32x2; 2] = [f32x2::new(0., 2.), f32x2::new(2., 14.)];
+ let e: [f32x2; 2] = [f32x2::new(1., 2.), f32x2::new(2., 14.)];
+ let r: [f32x2; 2] = transmute(vld2_lane_f32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld2q_lane_f32() {
+ let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
+ let b: [f32x4; 2] = [f32x4::new(0., 2., 2., 14.), f32x4::new(2., 16., 17., 18.)];
+ let e: [f32x4; 2] = [f32x4::new(1., 2., 2., 14.), f32x4::new(2., 16., 17., 18.)];
+ let r: [f32x4; 2] = transmute(vld2q_lane_f32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
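+ // The `vld3*` tests move on to the 3-way de-interleaving loads: `vld3`
+ // reads 3*N consecutive elements as N structures of three and splits them
+ // into three result vectors, so lane i of result j holds memory element
+ // 3*i + j (counting from the `a[1..]` base pointer).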
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_s8() {
+ let a: [i8; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let e: [i8x8; 3] = [i8x8::new(1, 2, 2, 4, 2, 4, 7, 8), i8x8::new(2, 4, 7, 8, 13, 14, 15, 16), i8x8::new(2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [i8x8; 3] = transmute(vld3_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_s16() {
+ let a: [i16; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let e: [i16x4; 3] = [i16x4::new(1, 2, 2, 4), i16x4::new(2, 4, 7, 8), i16x4::new(2, 4, 7, 8)];
+ let r: [i16x4; 3] = transmute(vld3_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_s32() {
+ let a: [i32; 7] = [0, 1, 2, 2, 2, 4, 4];
+ let e: [i32x2; 3] = [i32x2::new(1, 2), i32x2::new(2, 4), i32x2::new(2, 4)];
+ let r: [i32x2; 3] = transmute(vld3_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_s8() {
+ let a: [i8; 49] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48];
+ let e: [i8x16; 3] = [i8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16), i8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48)];
+ let r: [i8x16; 3] = transmute(vld3q_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_s16() {
+ let a: [i16; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let e: [i16x8; 3] = [i16x8::new(1, 2, 2, 4, 2, 4, 7, 8), i16x8::new(2, 4, 7, 8, 13, 14, 15, 16), i16x8::new(2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [i16x8; 3] = transmute(vld3q_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_s32() {
+ let a: [i32; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let e: [i32x4; 3] = [i32x4::new(1, 2, 2, 4), i32x4::new(2, 4, 7, 8), i32x4::new(2, 4, 7, 8)];
+ let r: [i32x4; 3] = transmute(vld3q_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_s64() {
+ let a: [i64; 4] = [0, 1, 2, 2];
+ let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)];
+ let r: [i64x1; 3] = transmute(vld3_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_u8() {
+ let a: [u8; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let e: [u8x8; 3] = [u8x8::new(1, 2, 2, 4, 2, 4, 7, 8), u8x8::new(2, 4, 7, 8, 13, 14, 15, 16), u8x8::new(2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [u8x8; 3] = transmute(vld3_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_u16() {
+ let a: [u16; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let e: [u16x4; 3] = [u16x4::new(1, 2, 2, 4), u16x4::new(2, 4, 7, 8), u16x4::new(2, 4, 7, 8)];
+ let r: [u16x4; 3] = transmute(vld3_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_u32() {
+ let a: [u32; 7] = [0, 1, 2, 2, 2, 4, 4];
+ let e: [u32x2; 3] = [u32x2::new(1, 2), u32x2::new(2, 4), u32x2::new(2, 4)];
+ let r: [u32x2; 3] = transmute(vld3_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_u8() {
+ let a: [u8; 49] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48];
+ let e: [u8x16; 3] = [u8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16), u8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32), u8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48)];
+ let r: [u8x16; 3] = transmute(vld3q_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_u16() {
+ let a: [u16; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let e: [u16x8; 3] = [u16x8::new(1, 2, 2, 4, 2, 4, 7, 8), u16x8::new(2, 4, 7, 8, 13, 14, 15, 16), u16x8::new(2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [u16x8; 3] = transmute(vld3q_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_u32() {
+ let a: [u32; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let e: [u32x4; 3] = [u32x4::new(1, 2, 2, 4), u32x4::new(2, 4, 7, 8), u32x4::new(2, 4, 7, 8)];
+ let r: [u32x4; 3] = transmute(vld3q_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_p8() {
+ let a: [u8; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let e: [i8x8; 3] = [i8x8::new(1, 2, 2, 4, 2, 4, 7, 8), i8x8::new(2, 4, 7, 8, 13, 14, 15, 16), i8x8::new(2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [i8x8; 3] = transmute(vld3_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_p16() {
+ let a: [u16; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let e: [i16x4; 3] = [i16x4::new(1, 2, 2, 4), i16x4::new(2, 4, 7, 8), i16x4::new(2, 4, 7, 8)];
+ let r: [i16x4; 3] = transmute(vld3_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_p8() {
+ let a: [u8; 49] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48];
+ let e: [i8x16; 3] = [i8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16), i8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48)];
+ let r: [i8x16; 3] = transmute(vld3q_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_p16() {
+ let a: [u16; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let e: [i16x8; 3] = [i16x8::new(1, 2, 2, 4, 2, 4, 7, 8), i16x8::new(2, 4, 7, 8, 13, 14, 15, 16), i16x8::new(2, 4, 7, 8, 13, 14, 15, 16)];
+ let r: [i16x8; 3] = transmute(vld3q_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_u64() {
+ let a: [u64; 4] = [0, 1, 2, 2];
+ let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(2)];
+ let r: [u64x1; 3] = transmute(vld3_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_p64() {
+ let a: [u64; 4] = [0, 1, 2, 2];
+ let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)];
+ let r: [i64x1; 3] = transmute(vld3_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_f32() {
+ let a: [f32; 7] = [0., 1., 2., 2., 2., 4., 4.];
+ let e: [f32x2; 3] = [f32x2::new(1., 2.), f32x2::new(2., 4.), f32x2::new(2., 4.)];
+ let r: [f32x2; 3] = transmute(vld3_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_f32() {
+ let a: [f32; 13] = [0., 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8.];
+ let e: [f32x4; 3] = [f32x4::new(1., 2., 2., 4.), f32x4::new(2., 4., 7., 8.), f32x4::new(2., 4., 7., 8.)];
+ let r: [f32x4; 3] = transmute(vld3q_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
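+ // The `vld3_dup_*` tests cover the duplicate variant: a single 3-element
+ // structure is loaded and each of its three values is broadcast across all
+ // lanes of the corresponding result vector. The first three elements of
+ // `a[1..]` are all 1 here, so every expected lane is 1.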
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_s8() {
+ let a: [i8; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13];
+ let e: [i8x8; 3] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x8; 3] = transmute(vld3_dup_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_s16() {
+ let a: [i16; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7];
+ let e: [i16x4; 3] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)];
+ let r: [i16x4; 3] = transmute(vld3_dup_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_s32() {
+ let a: [i32; 7] = [0, 1, 1, 1, 3, 1, 4];
+ let e: [i32x2; 3] = [i32x2::new(1, 1), i32x2::new(1, 1), i32x2::new(1, 1)];
+ let r: [i32x2; 3] = transmute(vld3_dup_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_s8() {
+ let a: [i8; 49] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [i8x16; 3] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x16; 3] = transmute(vld3q_dup_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_s16() {
+ let a: [i16; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13];
+ let e: [i16x8; 3] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i16x8; 3] = transmute(vld3q_dup_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_s32() {
+ let a: [i32; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7];
+ let e: [i32x4; 3] = [i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1)];
+ let r: [i32x4; 3] = transmute(vld3q_dup_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_s64() {
+ let a: [i64; 4] = [0, 1, 1, 1];
+ let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(1), i64x1::new(1)];
+ let r: [i64x1; 3] = transmute(vld3_dup_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_u8() {
+ let a: [u8; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13];
+ let e: [u8x8; 3] = [u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u8x8; 3] = transmute(vld3_dup_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_u16() {
+ let a: [u16; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7];
+ let e: [u16x4; 3] = [u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1)];
+ let r: [u16x4; 3] = transmute(vld3_dup_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_u32() {
+ let a: [u32; 7] = [0, 1, 1, 1, 3, 1, 4];
+ let e: [u32x2; 3] = [u32x2::new(1, 1), u32x2::new(1, 1), u32x2::new(1, 1)];
+ let r: [u32x2; 3] = transmute(vld3_dup_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_u8() {
+ let a: [u8; 49] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [u8x16; 3] = [u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u8x16; 3] = transmute(vld3q_dup_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_u16() {
+ let a: [u16; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13];
+ let e: [u16x8; 3] = [u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u16x8; 3] = transmute(vld3q_dup_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_u32() {
+ let a: [u32; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7];
+ let e: [u32x4; 3] = [u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1)];
+ let r: [u32x4; 3] = transmute(vld3q_dup_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_p8() {
+ let a: [u8; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13];
+ let e: [i8x8; 3] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x8; 3] = transmute(vld3_dup_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_p16() {
+ let a: [u16; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7];
+ let e: [i16x4; 3] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)];
+ let r: [i16x4; 3] = transmute(vld3_dup_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_p8() {
+ let a: [u8; 49] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17];
+ let e: [i8x16; 3] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x16; 3] = transmute(vld3q_dup_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_p16() {
+ let a: [u16; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13];
+ let e: [i16x8; 3] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i16x8; 3] = transmute(vld3q_dup_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_u64() {
+ let a: [u64; 4] = [0, 1, 1, 1];
+ let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(1), u64x1::new(1)];
+ let r: [u64x1; 3] = transmute(vld3_dup_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_p64() {
+ let a: [u64; 4] = [0, 1, 1, 1];
+ let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(1), i64x1::new(1)];
+ let r: [i64x1; 3] = transmute(vld3_dup_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_dup_f32() {
+ let a: [f32; 7] = [0., 1., 1., 1., 3., 1., 4.];
+ let e: [f32x2; 3] = [f32x2::new(1., 1.), f32x2::new(1., 1.), f32x2::new(1., 1.)];
+ let r: [f32x2; 3] = transmute(vld3_dup_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_dup_f32() {
+ let a: [f32; 13] = [0., 1., 1., 1., 3., 1., 4., 3., 5., 1., 4., 3., 5.];
+ let e: [f32x4; 3] = [f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.)];
+ let r: [f32x4; 3] = transmute(vld3q_dup_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
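+ // The `vld3_lane_*` tests load one 3-element structure and insert its
+ // values into lane `LANE` (0 here) of the three input vectors, leaving the
+ // remaining lanes of `b` untouched in the expected output `e`.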
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_s8() {
+ let a: [i8; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x8; 3] = [i8x8::new(0, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26), i8x8::new(11, 12, 13, 14, 15, 16, 17, 18)];
+ let e: [i8x8; 3] = [i8x8::new(1, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26), i8x8::new(2, 12, 13, 14, 15, 16, 17, 18)];
+ let r: [i8x8; 3] = transmute(vld3_lane_s8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_s16() {
+ let a: [i16; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4];
+ let b: [i16x4; 3] = [i16x4::new(0, 2, 2, 14), i16x4::new(2, 16, 17, 18), i16x4::new(2, 20, 21, 22)];
+ let e: [i16x4; 3] = [i16x4::new(1, 2, 2, 14), i16x4::new(2, 16, 17, 18), i16x4::new(2, 20, 21, 22)];
+ let r: [i16x4; 3] = transmute(vld3_lane_s16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_s32() {
+ let a: [i32; 7] = [0, 1, 2, 2, 4, 5, 6];
+ let b: [i32x2; 3] = [i32x2::new(0, 2), i32x2::new(2, 14), i32x2::new(2, 16)];
+ let e: [i32x2; 3] = [i32x2::new(1, 2), i32x2::new(2, 14), i32x2::new(2, 16)];
+ let r: [i32x2; 3] = transmute(vld3_lane_s32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_s16() {
+ let a: [i16; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x8; 3] = [i16x8::new(0, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26), i16x8::new(11, 12, 13, 14, 15, 16, 17, 18)];
+ let e: [i16x8; 3] = [i16x8::new(1, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26), i16x8::new(2, 12, 13, 14, 15, 16, 17, 18)];
+ let r: [i16x8; 3] = transmute(vld3q_lane_s16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_s32() {
+ let a: [i32; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4];
+ let b: [i32x4; 3] = [i32x4::new(0, 2, 2, 14), i32x4::new(2, 16, 17, 18), i32x4::new(2, 20, 21, 22)];
+ let e: [i32x4; 3] = [i32x4::new(1, 2, 2, 14), i32x4::new(2, 16, 17, 18), i32x4::new(2, 20, 21, 22)];
+ let r: [i32x4; 3] = transmute(vld3q_lane_s32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_u8() {
+ let a: [u8; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u8x8; 3] = [u8x8::new(0, 2, 2, 14, 2, 16, 17, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26), u8x8::new(11, 12, 13, 14, 15, 16, 17, 18)];
+ let e: [u8x8; 3] = [u8x8::new(1, 2, 2, 14, 2, 16, 17, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26), u8x8::new(2, 12, 13, 14, 15, 16, 17, 18)];
+ let r: [u8x8; 3] = transmute(vld3_lane_u8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_u16() {
+ let a: [u16; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4];
+ let b: [u16x4; 3] = [u16x4::new(0, 2, 2, 14), u16x4::new(2, 16, 17, 18), u16x4::new(2, 20, 21, 22)];
+ let e: [u16x4; 3] = [u16x4::new(1, 2, 2, 14), u16x4::new(2, 16, 17, 18), u16x4::new(2, 20, 21, 22)];
+ let r: [u16x4; 3] = transmute(vld3_lane_u16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_u32() {
+ let a: [u32; 7] = [0, 1, 2, 2, 4, 5, 6];
+ let b: [u32x2; 3] = [u32x2::new(0, 2), u32x2::new(2, 14), u32x2::new(2, 16)];
+ let e: [u32x2; 3] = [u32x2::new(1, 2), u32x2::new(2, 14), u32x2::new(2, 16)];
+ let r: [u32x2; 3] = transmute(vld3_lane_u32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_u16() {
+ let a: [u16; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u16x8; 3] = [u16x8::new(0, 2, 2, 14, 2, 16, 17, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26), u16x8::new(11, 12, 13, 14, 15, 16, 17, 18)];
+ let e: [u16x8; 3] = [u16x8::new(1, 2, 2, 14, 2, 16, 17, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26), u16x8::new(2, 12, 13, 14, 15, 16, 17, 18)];
+ let r: [u16x8; 3] = transmute(vld3q_lane_u16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_u32() {
+ let a: [u32; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4];
+ let b: [u32x4; 3] = [u32x4::new(0, 2, 2, 14), u32x4::new(2, 16, 17, 18), u32x4::new(2, 20, 21, 22)];
+ let e: [u32x4; 3] = [u32x4::new(1, 2, 2, 14), u32x4::new(2, 16, 17, 18), u32x4::new(2, 20, 21, 22)];
+ let r: [u32x4; 3] = transmute(vld3q_lane_u32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_p8() {
+ let a: [u8; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x8; 3] = [i8x8::new(0, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26), i8x8::new(11, 12, 13, 14, 15, 16, 17, 18)];
+ let e: [i8x8; 3] = [i8x8::new(1, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26), i8x8::new(2, 12, 13, 14, 15, 16, 17, 18)];
+ let r: [i8x8; 3] = transmute(vld3_lane_p8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_p16() {
+ let a: [u16; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4];
+ let b: [i16x4; 3] = [i16x4::new(0, 2, 2, 14), i16x4::new(2, 16, 17, 18), i16x4::new(2, 20, 21, 22)];
+ let e: [i16x4; 3] = [i16x4::new(1, 2, 2, 14), i16x4::new(2, 16, 17, 18), i16x4::new(2, 20, 21, 22)];
+ let r: [i16x4; 3] = transmute(vld3_lane_p16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_p16() {
+ let a: [u16; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x8; 3] = [i16x8::new(0, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26), i16x8::new(11, 12, 13, 14, 15, 16, 17, 18)];
+ let e: [i16x8; 3] = [i16x8::new(1, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26), i16x8::new(2, 12, 13, 14, 15, 16, 17, 18)];
+ let r: [i16x8; 3] = transmute(vld3q_lane_p16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_f32() {
+ let a: [f32; 7] = [0., 1., 2., 2., 4., 5., 6.];
+ let b: [f32x2; 3] = [f32x2::new(0., 2.), f32x2::new(2., 14.), f32x2::new(9., 16.)];
+ let e: [f32x2; 3] = [f32x2::new(1., 2.), f32x2::new(2., 14.), f32x2::new(2., 16.)];
+ let r: [f32x2; 3] = transmute(vld3_lane_f32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3q_lane_f32() {
+ let a: [f32; 13] = [0., 1., 2., 2., 4., 5., 6., 7., 8., 5., 6., 7., 8.];
+ let b: [f32x4; 3] = [f32x4::new(0., 2., 2., 14.), f32x4::new(9., 16., 17., 18.), f32x4::new(5., 6., 7., 8.)];
+ let e: [f32x4; 3] = [f32x4::new(1., 2., 2., 14.), f32x4::new(2., 16., 17., 18.), f32x4::new(2., 6., 7., 8.)];
+ let r: [f32x4; 3] = transmute(vld3q_lane_f32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
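+ // The `vld4*` tests cover the 4-way de-interleaving loads: 4*N consecutive
+ // elements are read as N structures of four and split into four result
+ // vectors, so lane i of result j holds memory element 4*i + j.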
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_s8() {
+ let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [i8x8; 4] = [i8x8::new(1, 2, 2, 6, 2, 6, 6, 8), i8x8::new(2, 6, 6, 8, 6, 8, 8, 16), i8x8::new(2, 6, 6, 8, 6, 8, 8, 16), i8x8::new(6, 8, 8, 16, 8, 16, 16, 32)];
+ let r: [i8x8; 4] = transmute(vld4_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_s16() {
+ let a: [i16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [i16x4; 4] = [i16x4::new(1, 2, 2, 6), i16x4::new(2, 6, 6, 8), i16x4::new(2, 6, 6, 8), i16x4::new(6, 8, 8, 16)];
+ let r: [i16x4; 4] = transmute(vld4_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_s32() {
+ let a: [i32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [i32x2; 4] = [i32x2::new(1, 2), i32x2::new(2, 6), i32x2::new(2, 6), i32x2::new(6, 8)];
+ let r: [i32x2; 4] = transmute(vld4_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_s8() {
+ let a: [i8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [i8x16; 4] = [i8x16::new(1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16), i8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32), i8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48), i8x16::new(6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64)];
+ let r: [i8x16; 4] = transmute(vld4q_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_s16() {
+ let a: [i16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [i16x8; 4] = [i16x8::new(1, 2, 2, 6, 2, 6, 6, 8), i16x8::new(2, 6, 6, 8, 6, 8, 8, 16), i16x8::new(2, 6, 6, 8, 6, 8, 8, 16), i16x8::new(6, 8, 8, 16, 8, 16, 16, 32)];
+ let r: [i16x8; 4] = transmute(vld4q_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_s32() {
+ let a: [i32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [i32x4; 4] = [i32x4::new(1, 2, 2, 6), i32x4::new(2, 6, 6, 8), i32x4::new(2, 6, 6, 8), i32x4::new(6, 8, 8, 16)];
+ let r: [i32x4; 4] = transmute(vld4q_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_s64() {
+ let a: [i64; 5] = [0, 1, 2, 2, 6];
+ let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(6)];
+ let r: [i64x1; 4] = transmute(vld4_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_u8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u8x8; 4] = [u8x8::new(1, 2, 2, 6, 2, 6, 6, 8), u8x8::new(2, 6, 6, 8, 6, 8, 8, 16), u8x8::new(2, 6, 6, 8, 6, 8, 8, 16), u8x8::new(6, 8, 8, 16, 8, 16, 16, 32)];
+ let r: [u8x8; 4] = transmute(vld4_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_u16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [u16x4; 4] = [u16x4::new(1, 2, 2, 6), u16x4::new(2, 6, 6, 8), u16x4::new(2, 6, 6, 8), u16x4::new(6, 8, 8, 16)];
+ let r: [u16x4; 4] = transmute(vld4_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_u32() {
+ let a: [u32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [u32x2; 4] = [u32x2::new(1, 2), u32x2::new(2, 6), u32x2::new(2, 6), u32x2::new(6, 8)];
+ let r: [u32x2; 4] = transmute(vld4_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_u8() {
+ let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [u8x16; 4] = [u8x16::new(1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16), u8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32), u8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48), u8x16::new(6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64)];
+ let r: [u8x16; 4] = transmute(vld4q_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_u16() {
+ let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u16x8; 4] = [u16x8::new(1, 2, 2, 6, 2, 6, 6, 8), u16x8::new(2, 6, 6, 8, 6, 8, 8, 16), u16x8::new(2, 6, 6, 8, 6, 8, 8, 16), u16x8::new(6, 8, 8, 16, 8, 16, 16, 32)];
+ let r: [u16x8; 4] = transmute(vld4q_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_u32() {
+ let a: [u32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [u32x4; 4] = [u32x4::new(1, 2, 2, 6), u32x4::new(2, 6, 6, 8), u32x4::new(2, 6, 6, 8), u32x4::new(6, 8, 8, 16)];
+ let r: [u32x4; 4] = transmute(vld4q_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_p8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [i8x8; 4] = [i8x8::new(1, 2, 2, 6, 2, 6, 6, 8), i8x8::new(2, 6, 6, 8, 6, 8, 8, 16), i8x8::new(2, 6, 6, 8, 6, 8, 8, 16), i8x8::new(6, 8, 8, 16, 8, 16, 16, 32)];
+ let r: [i8x8; 4] = transmute(vld4_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_p16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [i16x4; 4] = [i16x4::new(1, 2, 2, 6), i16x4::new(2, 6, 6, 8), i16x4::new(2, 6, 6, 8), i16x4::new(6, 8, 8, 16)];
+ let r: [i16x4; 4] = transmute(vld4_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_p8() {
+ let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [i8x16; 4] = [i8x16::new(1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16), i8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32), i8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48), i8x16::new(6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64)];
+ let r: [i8x16; 4] = transmute(vld4q_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_p16() {
+ let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [i16x8; 4] = [i16x8::new(1, 2, 2, 6, 2, 6, 6, 8), i16x8::new(2, 6, 6, 8, 6, 8, 8, 16), i16x8::new(2, 6, 6, 8, 6, 8, 8, 16), i16x8::new(6, 8, 8, 16, 8, 16, 16, 32)];
+ let r: [i16x8; 4] = transmute(vld4q_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_u64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 6];
+ let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(2), u64x1::new(6)];
+ let r: [u64x1; 4] = transmute(vld4_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_p64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 6];
+ let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(6)];
+ let r: [i64x1; 4] = transmute(vld4_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_f32() {
+ let a: [f32; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
+ let e: [f32x2; 4] = [f32x2::new(1., 2.), f32x2::new(2., 6.), f32x2::new(2., 6.), f32x2::new(6., 8.)];
+ let r: [f32x2; 4] = transmute(vld4_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_f32() {
+ let a: [f32; 17] = [0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 15., 16.];
+ let e: [f32x4; 4] = [f32x4::new(1., 2., 2., 6.), f32x4::new(2., 6., 6., 8.), f32x4::new(2., 6., 6., 15.), f32x4::new(6., 8., 8., 16.)];
+ let r: [f32x4; 4] = transmute(vld4q_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
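+ // The `vld4_dup_*` tests cover the duplicate variant: one 4-element
+ // structure is loaded and each value is broadcast across all lanes of the
+ // corresponding result vector (the first four elements of `a[1..]` are all
+ // 1, so every expected lane is 1).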
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_s8() {
+ let a: [i8; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i8x8; 4] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x8; 4] = transmute(vld4_dup_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_s16() {
+ let a: [i16; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i16x4; 4] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)];
+ let r: [i16x4; 4] = transmute(vld4_dup_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_s32() {
+ let a: [i32; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5];
+ let e: [i32x2; 4] = [i32x2::new(1, 1), i32x2::new(1, 1), i32x2::new(1, 1), i32x2::new(1, 1)];
+ let r: [i32x2; 4] = transmute(vld4_dup_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_s8() {
+ let a: [i8; 65] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i8x16; 4] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x16; 4] = transmute(vld4q_dup_s8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_s16() {
+ let a: [i16; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i16x8; 4] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i16x8; 4] = transmute(vld4q_dup_s16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_s32() {
+ let a: [i32; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i32x4; 4] = [i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1)];
+ let r: [i32x4; 4] = transmute(vld4q_dup_s32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_s64() {
+ let a: [i64; 5] = [0, 1, 1, 1, 1];
+ let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(1), i64x1::new(1), i64x1::new(1)];
+ let r: [i64x1; 4] = transmute(vld4_dup_s64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_u8() {
+ let a: [u8; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u8x8; 4] = [u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u8x8; 4] = transmute(vld4_dup_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_u16() {
+ let a: [u16; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u16x4; 4] = [u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1)];
+ let r: [u16x4; 4] = transmute(vld4_dup_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_u32() {
+ let a: [u32; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5];
+ let e: [u32x2; 4] = [u32x2::new(1, 1), u32x2::new(1, 1), u32x2::new(1, 1), u32x2::new(1, 1)];
+ let r: [u32x2; 4] = transmute(vld4_dup_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_u8() {
+ let a: [u8; 65] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u8x16; 4] = [u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u8x16; 4] = transmute(vld4q_dup_u8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_u16() {
+ let a: [u16; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u16x8; 4] = [u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [u16x8; 4] = transmute(vld4q_dup_u16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_u32() {
+ let a: [u32; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [u32x4; 4] = [u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1)];
+ let r: [u32x4; 4] = transmute(vld4q_dup_u32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_p8() {
+ let a: [u8; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i8x8; 4] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x8; 4] = transmute(vld4_dup_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_p16() {
+ let a: [u16; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i16x4; 4] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)];
+ let r: [i16x4; 4] = transmute(vld4_dup_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_p8() {
+ let a: [u8; 65] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i8x16; 4] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i8x16; 4] = transmute(vld4q_dup_p8(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_p16() {
+ let a: [u16; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9];
+ let e: [i16x8; 4] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)];
+ let r: [i16x8; 4] = transmute(vld4q_dup_p16(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_u64() {
+ let a: [u64; 5] = [0, 1, 1, 1, 1];
+ let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(1), u64x1::new(1), u64x1::new(1)];
+ let r: [u64x1; 4] = transmute(vld4_dup_u64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_p64() {
+ let a: [u64; 5] = [0, 1, 1, 1, 1];
+ let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(1), i64x1::new(1), i64x1::new(1)];
+ let r: [i64x1; 4] = transmute(vld4_dup_p64(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_dup_f32() {
+ let a: [f32; 9] = [0., 1., 1., 1., 1., 6., 4., 3., 5.];
+ let e: [f32x2; 4] = [f32x2::new(1., 1.), f32x2::new(1., 1.), f32x2::new(1., 1.), f32x2::new(1., 1.)];
+ let r: [f32x2; 4] = transmute(vld4_dup_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_dup_f32() {
+ let a: [f32; 17] = [0., 1., 1., 1., 1., 6., 4., 3., 5., 7., 4., 3., 5., 8., 4., 3., 5.];
+ let e: [f32x4; 4] = [f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.)];
+ let r: [f32x4; 4] = transmute(vld4q_dup_f32(a[1..].as_ptr()));
+ assert_eq!(r, e);
+ }
+
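+ // The `vld4_lane_*` tests load one 4-element structure and insert its
+ // values into lane `LANE` (0 here) of the four input vectors; all other
+ // lanes of `b` carry through to the expected output unchanged.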
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_s8() {
+ let a: [i8; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x8; 4] = [i8x8::new(0, 2, 2, 2, 2, 16, 2, 18), i8x8::new(2, 20, 21, 22, 2, 24, 25, 26), i8x8::new(11, 12, 13, 14, 15, 16, 2, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i8x8; 4] = [i8x8::new(1, 2, 2, 2, 2, 16, 2, 18), i8x8::new(2, 20, 21, 22, 2, 24, 25, 26), i8x8::new(2, 12, 13, 14, 15, 16, 2, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i8x8; 4] = transmute(vld4_lane_s8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_s16() {
+ let a: [i16; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x4; 4] = [i16x4::new(0, 2, 2, 2), i16x4::new(2, 16, 2, 18), i16x4::new(2, 20, 21, 22), i16x4::new(2, 24, 25, 26)];
+ let e: [i16x4; 4] = [i16x4::new(1, 2, 2, 2), i16x4::new(2, 16, 2, 18), i16x4::new(2, 20, 21, 22), i16x4::new(2, 24, 25, 26)];
+ let r: [i16x4; 4] = transmute(vld4_lane_s16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_s32() {
+ let a: [i32; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8];
+ let b: [i32x2; 4] = [i32x2::new(0, 2), i32x2::new(2, 2), i32x2::new(2, 16), i32x2::new(2, 18)];
+ let e: [i32x2; 4] = [i32x2::new(1, 2), i32x2::new(2, 2), i32x2::new(2, 16), i32x2::new(2, 18)];
+ let r: [i32x2; 4] = transmute(vld4_lane_s32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_s16() {
+ let a: [i16; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x8; 4] = [i16x8::new(0, 2, 2, 2, 2, 16, 2, 18), i16x8::new(2, 20, 21, 22, 2, 24, 25, 26), i16x8::new(11, 12, 13, 14, 15, 16, 2, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i16x8; 4] = [i16x8::new(1, 2, 2, 2, 2, 16, 2, 18), i16x8::new(2, 20, 21, 22, 2, 24, 25, 26), i16x8::new(2, 12, 13, 14, 15, 16, 2, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i16x8; 4] = transmute(vld4q_lane_s16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_s32() {
+ let a: [i32; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i32x4; 4] = [i32x4::new(0, 2, 2, 2), i32x4::new(2, 16, 2, 18), i32x4::new(2, 20, 21, 22), i32x4::new(2, 24, 25, 26)];
+ let e: [i32x4; 4] = [i32x4::new(1, 2, 2, 2), i32x4::new(2, 16, 2, 18), i32x4::new(2, 20, 21, 22), i32x4::new(2, 24, 25, 26)];
+ let r: [i32x4; 4] = transmute(vld4q_lane_s32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_u8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u8x8; 4] = [u8x8::new(0, 2, 2, 2, 2, 16, 2, 18), u8x8::new(2, 20, 21, 22, 2, 24, 25, 26), u8x8::new(11, 12, 13, 14, 15, 16, 2, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [u8x8; 4] = [u8x8::new(1, 2, 2, 2, 2, 16, 2, 18), u8x8::new(2, 20, 21, 22, 2, 24, 25, 26), u8x8::new(2, 12, 13, 14, 15, 16, 2, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [u8x8; 4] = transmute(vld4_lane_u8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_u16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u16x4; 4] = [u16x4::new(0, 2, 2, 2), u16x4::new(2, 16, 2, 18), u16x4::new(2, 20, 21, 22), u16x4::new(2, 24, 25, 26)];
+ let e: [u16x4; 4] = [u16x4::new(1, 2, 2, 2), u16x4::new(2, 16, 2, 18), u16x4::new(2, 20, 21, 22), u16x4::new(2, 24, 25, 26)];
+ let r: [u16x4; 4] = transmute(vld4_lane_u16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_u32() {
+ let a: [u32; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8];
+ let b: [u32x2; 4] = [u32x2::new(0, 2), u32x2::new(2, 2), u32x2::new(2, 16), u32x2::new(2, 18)];
+ let e: [u32x2; 4] = [u32x2::new(1, 2), u32x2::new(2, 2), u32x2::new(2, 16), u32x2::new(2, 18)];
+ let r: [u32x2; 4] = transmute(vld4_lane_u32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_u16() {
+ let a: [u16; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u16x8; 4] = [u16x8::new(0, 2, 2, 2, 2, 16, 2, 18), u16x8::new(2, 20, 21, 22, 2, 24, 25, 26), u16x8::new(11, 12, 13, 14, 15, 16, 2, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [u16x8; 4] = [u16x8::new(1, 2, 2, 2, 2, 16, 2, 18), u16x8::new(2, 20, 21, 22, 2, 24, 25, 26), u16x8::new(2, 12, 13, 14, 15, 16, 2, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [u16x8; 4] = transmute(vld4q_lane_u16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_u32() {
+ let a: [u32; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [u32x4; 4] = [u32x4::new(0, 2, 2, 2), u32x4::new(2, 16, 2, 18), u32x4::new(2, 20, 21, 22), u32x4::new(2, 24, 25, 26)];
+ let e: [u32x4; 4] = [u32x4::new(1, 2, 2, 2), u32x4::new(2, 16, 2, 18), u32x4::new(2, 20, 21, 22), u32x4::new(2, 24, 25, 26)];
+ let r: [u32x4; 4] = transmute(vld4q_lane_u32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_p8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x8; 4] = [i8x8::new(0, 2, 2, 2, 2, 16, 2, 18), i8x8::new(2, 20, 21, 22, 2, 24, 25, 26), i8x8::new(11, 12, 13, 14, 15, 16, 2, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i8x8; 4] = [i8x8::new(1, 2, 2, 2, 2, 16, 2, 18), i8x8::new(2, 20, 21, 22, 2, 24, 25, 26), i8x8::new(2, 12, 13, 14, 15, 16, 2, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i8x8; 4] = transmute(vld4_lane_p8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_p16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x4; 4] = [i16x4::new(0, 2, 2, 2), i16x4::new(2, 16, 2, 18), i16x4::new(2, 20, 21, 22), i16x4::new(2, 24, 25, 26)];
+ let e: [i16x4; 4] = [i16x4::new(1, 2, 2, 2), i16x4::new(2, 16, 2, 18), i16x4::new(2, 20, 21, 22), i16x4::new(2, 24, 25, 26)];
+ let r: [i16x4; 4] = transmute(vld4_lane_p16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_p16() {
+ let a: [u16; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i16x8; 4] = [i16x8::new(0, 2, 2, 2, 2, 16, 2, 18), i16x8::new(2, 20, 21, 22, 2, 24, 25, 26), i16x8::new(11, 12, 13, 14, 15, 16, 2, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let e: [i16x8; 4] = [i16x8::new(1, 2, 2, 2, 2, 16, 2, 18), i16x8::new(2, 20, 21, 22, 2, 24, 25, 26), i16x8::new(2, 12, 13, 14, 15, 16, 2, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)];
+ let r: [i16x8; 4] = transmute(vld4q_lane_p16::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4_lane_f32() {
+ let a: [f32; 9] = [0., 1., 2., 2., 2., 5., 6., 7., 8.];
+ let b: [f32x2; 4] = [f32x2::new(0., 2.), f32x2::new(2., 2.), f32x2::new(2., 16.), f32x2::new(2., 18.)];
+ let e: [f32x2; 4] = [f32x2::new(1., 2.), f32x2::new(2., 2.), f32x2::new(2., 16.), f32x2::new(2., 18.)];
+ let r: [f32x2; 4] = transmute(vld4_lane_f32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld4q_lane_f32() {
+ let a: [f32; 17] = [0., 1., 2., 2., 2., 5., 6., 7., 8., 5., 6., 7., 8., 1., 4., 3., 5.];
+ let b: [f32x4; 4] = [f32x4::new(0., 2., 2., 2.), f32x4::new(2., 16., 2., 18.), f32x4::new(5., 6., 7., 8.), f32x4::new(1., 4., 3., 5.)];
+ let e: [f32x4; 4] = [f32x4::new(1., 2., 2., 2.), f32x4::new(2., 16., 2., 18.), f32x4::new(2., 6., 7., 8.), f32x4::new(2., 4., 3., 5.)];
+ let r: [f32x4; 4] = transmute(vld4q_lane_f32::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
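+ // vst1*_lane: store a single lane (selected by the const generic index) to
+ // memory; every other destination element is expected to stay zero. Inputs
+ // are read from `a[1..]`, a deliberately misaligned offset, via
+ // `read_unaligned`.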
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_s8() {
+ let a: [i8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i8; 8] = [1, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i8; 8] = [0i8; 8];
+ vst1_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_s16() {
+ let a: [i16; 5] = [0, 1, 2, 3, 4];
+ let e: [i16; 4] = [1, 0, 0, 0];
+ let mut r: [i16; 4] = [0i16; 4];
+ vst1_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_s32() {
+ let a: [i32; 3] = [0, 1, 2];
+ let e: [i32; 2] = [1, 0];
+ let mut r: [i32; 2] = [0i32; 2];
+ vst1_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_s64() {
+ let a: [i64; 2] = [0, 1];
+ let e: [i64; 1] = [1];
+ let mut r: [i64; 1] = [0i64; 1];
+ vst1_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_s8() {
+ let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i8; 16] = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i8; 16] = [0i8; 16];
+ vst1q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_s16() {
+ let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i16; 8] = [1, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i16; 8] = [0i16; 8];
+ vst1q_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_s32() {
+ let a: [i32; 5] = [0, 1, 2, 3, 4];
+ let e: [i32; 4] = [1, 0, 0, 0];
+ let mut r: [i32; 4] = [0i32; 4];
+ vst1q_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_s64() {
+ let a: [i64; 3] = [0, 1, 2];
+ let e: [i64; 2] = [1, 0];
+ let mut r: [i64; 2] = [0i64; 2];
+ vst1q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_u8() {
+ let a: [u8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u8; 8] = [1, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 8] = [0u8; 8];
+ vst1_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_u16() {
+ let a: [u16; 5] = [0, 1, 2, 3, 4];
+ let e: [u16; 4] = [1, 0, 0, 0];
+ let mut r: [u16; 4] = [0u16; 4];
+ vst1_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_u32() {
+ let a: [u32; 3] = [0, 1, 2];
+ let e: [u32; 2] = [1, 0];
+ let mut r: [u32; 2] = [0u32; 2];
+ vst1_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_u64() {
+ let a: [u64; 2] = [0, 1];
+ let e: [u64; 1] = [1];
+ let mut r: [u64; 1] = [0u64; 1];
+ vst1_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_u8() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u8; 16] = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 16] = [0u8; 16];
+ vst1q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_u16() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u16; 8] = [1, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 8] = [0u16; 8];
+ vst1q_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_u32() {
+ let a: [u32; 5] = [0, 1, 2, 3, 4];
+ let e: [u32; 4] = [1, 0, 0, 0];
+ let mut r: [u32; 4] = [0u32; 4];
+ vst1q_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_u64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64; 2] = [1, 0];
+ let mut r: [u64; 2] = [0u64; 2];
+ vst1q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_p8() {
+ let a: [u8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u8; 8] = [1, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 8] = [0u8; 8];
+ vst1_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_p16() {
+ let a: [u16; 5] = [0, 1, 2, 3, 4];
+ let e: [u16; 4] = [1, 0, 0, 0];
+ let mut r: [u16; 4] = [0u16; 4];
+ vst1_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_p8() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u8; 16] = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 16] = [0u8; 16];
+ vst1q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_p16() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u16; 8] = [1, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 8] = [0u16; 8];
+ vst1q_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_p64() {
+ let a: [u64; 2] = [0, 1];
+ let e: [u64; 1] = [1];
+ let mut r: [u64; 1] = [0u64; 1];
+ vst1_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_p64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64; 2] = [1, 0];
+ let mut r: [u64; 2] = [0u64; 2];
+ vst1q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_f32() {
+ let a: [f32; 3] = [0., 1., 2.];
+ let e: [f32; 2] = [1., 0.];
+ let mut r: [f32; 2] = [0f32; 2];
+ vst1_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_f32() {
+ let a: [f32; 5] = [0., 1., 2., 3., 4.];
+ let e: [f32; 4] = [1., 0., 0., 0.];
+ let mut r: [f32; 4] = [0f32; 4];
+ vst1q_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
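+ // vst1*_x2/_x3/_x4: store two, three, or four whole vectors to consecutive
+ // memory, so the expected output is just the input sequence shifted by the
+ // one-element offset used when reading `a`.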
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s8_x2() {
+ let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [i8; 16] = [0i8; 16];
+ vst1_s8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s16_x2() {
+ let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [i16; 8] = [0i16; 8];
+ vst1_s16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s32_x2() {
+ let a: [i32; 5] = [0, 1, 2, 3, 4];
+ let e: [i32; 4] = [1, 2, 3, 4];
+ let mut r: [i32; 4] = [0i32; 4];
+ vst1_s32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s64_x2() {
+ let a: [i64; 3] = [0, 1, 2];
+ let e: [i64; 2] = [1, 2];
+ let mut r: [i64; 2] = [0i64; 2];
+ vst1_s64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s8_x2() {
+ let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [i8; 32] = [0i8; 32];
+ vst1q_s8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s16_x2() {
+ let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [i16; 16] = [0i16; 16];
+ vst1q_s16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s32_x2() {
+ let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [i32; 8] = [0i32; 8];
+ vst1q_s32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s64_x2() {
+ let a: [i64; 5] = [0, 1, 2, 3, 4];
+ let e: [i64; 4] = [1, 2, 3, 4];
+ let mut r: [i64; 4] = [0i64; 4];
+ vst1q_s64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s8_x3() {
+ let a: [i8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [i8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let mut r: [i8; 24] = [0i8; 24];
+ vst1_s8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s16_x3() {
+ let a: [i16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [i16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let mut r: [i16; 12] = [0i16; 12];
+ vst1_s16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s32_x3() {
+ let a: [i32; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [i32; 6] = [1, 2, 3, 4, 5, 6];
+ let mut r: [i32; 6] = [0i32; 6];
+ vst1_s32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s64_x3() {
+ let a: [i64; 4] = [0, 1, 2, 3];
+ let e: [i64; 3] = [1, 2, 3];
+ let mut r: [i64; 3] = [0i64; 3];
+ vst1_s64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s8_x3() {
+ let a: [i8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [i8; 48] = [0i8; 48];
+ vst1q_s8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s16_x3() {
+ let a: [i16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [i16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let mut r: [i16; 24] = [0i16; 24];
+ vst1q_s16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s32_x3() {
+ let a: [i32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [i32; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let mut r: [i32; 12] = [0i32; 12];
+ vst1q_s32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s64_x3() {
+ let a: [i64; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [i64; 6] = [1, 2, 3, 4, 5, 6];
+ let mut r: [i64; 6] = [0i64; 6];
+ vst1q_s64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s8_x4() {
+ let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [i8; 32] = [0i8; 32];
+ vst1_s8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s16_x4() {
+ let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [i16; 16] = [0i16; 16];
+ vst1_s16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s32_x4() {
+ let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [i32; 8] = [0i32; 8];
+ vst1_s32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_s64_x4() {
+ let a: [i64; 5] = [0, 1, 2, 3, 4];
+ let e: [i64; 4] = [1, 2, 3, 4];
+ let mut r: [i64; 4] = [0i64; 4];
+ vst1_s64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s8_x4() {
+ let a: [i8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [i8; 64] = [0i8; 64];
+ vst1q_s8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s16_x4() {
+ let a: [i16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [i16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [i16; 32] = [0i16; 32];
+ vst1q_s16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s32_x4() {
+ let a: [i32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [i32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [i32; 16] = [0i32; 16];
+ vst1q_s32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_s64_x4() {
+ let a: [i64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [i64; 8] = [0i64; 8];
+ vst1q_s64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u8_x2() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u8; 16] = [0u8; 16];
+ vst1_u8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u16_x2() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [u16; 8] = [0u16; 8];
+ vst1_u16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u32_x2() {
+ let a: [u32; 5] = [0, 1, 2, 3, 4];
+ let e: [u32; 4] = [1, 2, 3, 4];
+ let mut r: [u32; 4] = [0u32; 4];
+ vst1_u32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u64_x2() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64; 2] = [1, 2];
+ let mut r: [u64; 2] = [0u64; 2];
+ vst1_u64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u8_x2() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst1q_u8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u16_x2() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst1q_u16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u32_x2() {
+ let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [u32; 8] = [0u32; 8];
+ vst1q_u32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u64_x2() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let e: [u64; 4] = [1, 2, 3, 4];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst1q_u64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u8_x3() {
+ let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [u8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let mut r: [u8; 24] = [0u8; 24];
+ vst1_u8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u16_x3() {
+ let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [u16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let mut r: [u16; 12] = [0u16; 12];
+ vst1_u16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u32_x3() {
+ let a: [u32; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [u32; 6] = [1, 2, 3, 4, 5, 6];
+ let mut r: [u32; 6] = [0u32; 6];
+ vst1_u32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u64_x3() {
+ let a: [u64; 4] = [0, 1, 2, 3];
+ let e: [u64; 3] = [1, 2, 3];
+ let mut r: [u64; 3] = [0u64; 3];
+ vst1_u64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u8_x3() {
+ let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u8; 48] = [0u8; 48];
+ vst1q_u8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u16_x3() {
+ let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [u16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let mut r: [u16; 24] = [0u16; 24];
+ vst1q_u16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u32_x3() {
+ let a: [u32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [u32; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let mut r: [u32; 12] = [0u32; 12];
+ vst1q_u32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u64_x3() {
+ let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [u64; 6] = [1, 2, 3, 4, 5, 6];
+ let mut r: [u64; 6] = [0u64; 6];
+ vst1q_u64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u8_x4() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst1_u8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u16_x4() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst1_u16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u32_x4() {
+ let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [u32; 8] = [0u32; 8];
+ vst1_u32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_u64_x4() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let e: [u64; 4] = [1, 2, 3, 4];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst1_u64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u8_x4() {
+ let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [u8; 64] = [0u8; 64];
+ vst1q_u8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u16_x4() {
+ let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [u16; 32] = [0u16; 32];
+ vst1q_u16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u32_x4() {
+ let a: [u32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u32; 16] = [0u32; 16];
+ vst1q_u32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_u64_x4() {
+ let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [u64; 8] = [0u64; 8];
+ vst1q_u64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p8_x2() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u8; 16] = [0u8; 16];
+ vst1_p8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p8_x3() {
+ let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [u8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let mut r: [u8; 24] = [0u8; 24];
+ vst1_p8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p8_x4() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst1_p8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p8_x2() {
+ let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst1q_p8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p8_x3() {
+ let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u8; 48] = [0u8; 48];
+ vst1q_p8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p8_x4() {
+ let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [u8; 64] = [0u8; 64];
+ vst1q_p8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p16_x2() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [u16; 8] = [0u16; 8];
+ vst1_p16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p16_x3() {
+ let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let e: [u16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+ let mut r: [u16; 12] = [0u16; 12];
+ vst1_p16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p16_x4() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst1_p16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p16_x2() {
+ let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst1q_p16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p16_x3() {
+ let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let e: [u16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+ let mut r: [u16; 24] = [0u16; 24];
+ vst1q_p16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p16_x4() {
+ let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let e: [u16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let mut r: [u16; 32] = [0u16; 32];
+ vst1q_p16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p64_x2() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64; 2] = [1, 2];
+ let mut r: [u64; 2] = [0u64; 2];
+ vst1_p64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p64_x3() {
+ let a: [u64; 4] = [0, 1, 2, 3];
+ let e: [u64; 3] = [1, 2, 3];
+ let mut r: [u64; 3] = [0u64; 3];
+ vst1_p64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_p64_x4() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let e: [u64; 4] = [1, 2, 3, 4];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst1_p64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p64_x2() {
+ let a: [u64; 5] = [0, 1, 2, 3, 4];
+ let e: [u64; 4] = [1, 2, 3, 4];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst1q_p64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p64_x3() {
+ let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6];
+ let e: [u64; 6] = [1, 2, 3, 4, 5, 6];
+ let mut r: [u64; 6] = [0u64; 6];
+ vst1q_p64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_p64_x4() {
+ let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e: [u64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let mut r: [u64; 8] = [0u64; 8];
+ vst1q_p64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_f32_x2() {
+ let a: [f32; 5] = [0., 1., 2., 3., 4.];
+ let e: [f32; 4] = [1., 2., 3., 4.];
+ let mut r: [f32; 4] = [0f32; 4];
+ vst1_f32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_f32_x2() {
+ let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
+ let e: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let mut r: [f32; 8] = [0f32; 8];
+ vst1q_f32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_f32_x3() {
+ let a: [f32; 7] = [0., 1., 2., 3., 4., 5., 6.];
+ let e: [f32; 6] = [1., 2., 3., 4., 5., 6.];
+ let mut r: [f32; 6] = [0f32; 6];
+ vst1_f32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_f32_x3() {
+ let a: [f32; 13] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.];
+ let e: [f32; 12] = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.];
+ let mut r: [f32; 12] = [0f32; 12];
+ vst1q_f32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_f32_x4() {
+ let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
+ let e: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let mut r: [f32; 8] = [0f32; 8];
+ vst1_f32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_f32_x4() {
+ let a: [f32; 17] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.];
+ let e: [f32; 16] = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.];
+ let mut r: [f32; 16] = [0f32; 16];
+ vst1q_f32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
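+ // vst2*: store two vectors with 2-way interleaving; element i of each
+ // vector lands at output positions 2*i and 2*i + 1.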
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_s8() {
+ let a: [i8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [i8; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let mut r: [i8; 16] = [0i8; 16];
+ vst2_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_s16() {
+ let a: [i16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [i16; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+ let mut r: [i16; 8] = [0i16; 8];
+ vst2_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_s32() {
+ let a: [i32; 5] = [0, 1, 2, 2, 3];
+ let e: [i32; 4] = [1, 2, 2, 3];
+ let mut r: [i32; 4] = [0i32; 4];
+ vst2_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_s8() {
+ let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+ let e: [i8; 32] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let mut r: [i8; 32] = [0i8; 32];
+ vst2q_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_s16() {
+ let a: [i16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [i16; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let mut r: [i16; 16] = [0i16; 16];
+ vst2q_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_s32() {
+ let a: [i32; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [i32; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+ let mut r: [i32; 8] = [0i32; 8];
+ vst2q_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_s64() {
+ let a: [i64; 3] = [0, 1, 2];
+ let e: [i64; 2] = [1, 2];
+ let mut r: [i64; 2] = [0i64; 2];
+ vst2_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_u8() {
+ let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [u8; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let mut r: [u8; 16] = [0u8; 16];
+ vst2_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_u16() {
+ let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [u16; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+ let mut r: [u16; 8] = [0u16; 8];
+ vst2_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_u32() {
+ let a: [u32; 5] = [0, 1, 2, 2, 3];
+ let e: [u32; 4] = [1, 2, 2, 3];
+ let mut r: [u32; 4] = [0u32; 4];
+ vst2_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_u8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+ let e: [u8; 32] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst2q_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_u16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [u16; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst2q_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_u32() {
+ let a: [u32; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [u32; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+ let mut r: [u32; 8] = [0u32; 8];
+ vst2q_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_p8() {
+ let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [u8; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let mut r: [u8; 16] = [0u8; 16];
+ vst2_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_p16() {
+ let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [u16; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+ let mut r: [u16; 8] = [0u16; 8];
+ vst2_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_p8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+ let e: [u8; 32] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst2q_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_p16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [u16; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst2q_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_u64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64; 2] = [1, 2];
+ let mut r: [u64; 2] = [0u64; 2];
+ vst2_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_p64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e: [u64; 2] = [1, 2];
+ let mut r: [u64; 2] = [0u64; 2];
+ vst2_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_f32() {
+ let a: [f32; 5] = [0., 1., 2., 2., 3.];
+ let e: [f32; 4] = [1., 2., 2., 3.];
+ let mut r: [f32; 4] = [0f32; 4];
+ vst2_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_f32() {
+ let a: [f32; 9] = [0., 1., 2., 2., 3., 2., 3., 4., 5.];
+ let e: [f32; 8] = [1., 2., 2., 3., 2., 4., 3., 5.];
+ let mut r: [f32; 8] = [0f32; 8];
+ vst2q_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
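+ // vst2*_lane: store only the selected lane from each of the two vectors
+ // (two elements total); the rest of the destination stays zero.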
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_s8() {
+ let a: [i8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [i8; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i8; 16] = [0i8; 16];
+ vst2_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_s16() {
+ let a: [i16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [i16; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+ let mut r: [i16; 8] = [0i16; 8];
+ vst2_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_s32() {
+ let a: [i32; 5] = [0, 1, 2, 2, 3];
+ let e: [i32; 4] = [1, 2, 0, 0];
+ let mut r: [i32; 4] = [0i32; 4];
+ vst2_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_s16() {
+ let a: [i16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [i16; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i16; 16] = [0i16; 16];
+ vst2q_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_s32() {
+ let a: [i32; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [i32; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+ let mut r: [i32; 8] = [0i32; 8];
+ vst2q_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_u8() {
+ let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [u8; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 16] = [0u8; 16];
+ vst2_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_u16() {
+ let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [u16; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 8] = [0u16; 8];
+ vst2_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_u32() {
+ let a: [u32; 5] = [0, 1, 2, 2, 3];
+ let e: [u32; 4] = [1, 2, 0, 0];
+ let mut r: [u32; 4] = [0u32; 4];
+ vst2_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_u16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [u16; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst2q_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_u32() {
+ let a: [u32; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [u32; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+ let mut r: [u32; 8] = [0u32; 8];
+ vst2q_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_p8() {
+ let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [u8; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 16] = [0u8; 16];
+ vst2_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_p16() {
+ let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+ let e: [u16; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 8] = [0u16; 8];
+ vst2_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_p16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+ let e: [u16; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst2q_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2_lane_f32() {
+ let a: [f32; 5] = [0., 1., 2., 2., 3.];
+ let e: [f32; 4] = [1., 2., 0., 0.];
+ let mut r: [f32; 4] = [0f32; 4];
+ vst2_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst2q_lane_f32() {
+ let a: [f32; 9] = [0., 1., 2., 2., 3., 2., 3., 4., 5.];
+ let e: [f32; 8] = [1., 2., 0., 0., 0., 0., 0., 0.];
+ let mut r: [f32; 8] = [0f32; 8];
+ vst2q_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
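+ // vst3*: store three vectors with 3-way interleaving; element i of each
+ // vector lands at output positions 3*i, 3*i + 1, and 3*i + 2.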
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_s8() {
+ let a: [i8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [i8; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let mut r: [i8; 24] = [0i8; 24];
+ vst3_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_s16() {
+ let a: [i16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [i16; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let mut r: [i16; 12] = [0i16; 12];
+ vst3_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_s32() {
+ let a: [i32; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [i32; 6] = [1, 2, 2, 2, 4, 4];
+ let mut r: [i32; 6] = [0i32; 6];
+ vst3_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_s8() {
+ let a: [i8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+ let e: [i8; 48] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48];
+ let mut r: [i8; 48] = [0i8; 48];
+ vst3q_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_s16() {
+ let a: [i16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [i16; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let mut r: [i16; 24] = [0i16; 24];
+ vst3q_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_s32() {
+ let a: [i32; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [i32; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let mut r: [i32; 12] = [0i32; 12];
+ vst3q_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_s64() {
+ let a: [i64; 4] = [0, 1, 2, 2];
+ let e: [i64; 3] = [1, 2, 2];
+ let mut r: [i64; 3] = [0i64; 3];
+ vst3_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_u8() {
+ let a: [u8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [u8; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let mut r: [u8; 24] = [0u8; 24];
+ vst3_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_u16() {
+ let a: [u16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [u16; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let mut r: [u16; 12] = [0u16; 12];
+ vst3_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_u32() {
+ let a: [u32; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [u32; 6] = [1, 2, 2, 2, 4, 4];
+ let mut r: [u32; 6] = [0u32; 6];
+ vst3_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_u8() {
+ let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+ let e: [u8; 48] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48];
+ let mut r: [u8; 48] = [0u8; 48];
+ vst3q_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_u16() {
+ let a: [u16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [u16; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let mut r: [u16; 24] = [0u16; 24];
+ vst3q_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_u32() {
+ let a: [u32; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [u32; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let mut r: [u32; 12] = [0u32; 12];
+ vst3q_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_p8() {
+ let a: [u8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [u8; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let mut r: [u8; 24] = [0u8; 24];
+ vst3_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_p16() {
+ let a: [u16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [u16; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8];
+ let mut r: [u16; 12] = [0u16; 12];
+ vst3_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_p8() {
+ let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+ let e: [u8; 48] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48];
+ let mut r: [u8; 48] = [0u8; 48];
+ vst3q_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_p16() {
+ let a: [u16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [u16; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16];
+ let mut r: [u16; 24] = [0u16; 24];
+ vst3q_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_u64() {
+ let a: [u64; 4] = [0, 1, 2, 2];
+ let e: [u64; 3] = [1, 2, 2];
+ let mut r: [u64; 3] = [0u64; 3];
+ vst3_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_p64() {
+ let a: [u64; 4] = [0, 1, 2, 2];
+ let e: [u64; 3] = [1, 2, 2];
+ let mut r: [u64; 3] = [0u64; 3];
+ vst3_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_f32() {
+ let a: [f32; 7] = [0., 1., 2., 2., 4., 2., 4.];
+ let e: [f32; 6] = [1., 2., 2., 2., 4., 4.];
+ let mut r: [f32; 6] = [0f32; 6];
+ vst3_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_f32() {
+ let a: [f32; 13] = [0., 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8.];
+ let e: [f32; 12] = [1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8.];
+ let mut r: [f32; 12] = [0f32; 12];
+ vst3q_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
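+ // The `vst3*_lane` tests store only the lane selected by the const generic
+ // (lane 0 here) from each of the three input vectors, so just the first
+ // three elements of `r` are written and the rest must remain zero.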
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_s8() {
+ let a: [i8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [i8; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i8; 24] = [0i8; 24];
+ vst3_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_s16() {
+ let a: [i16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [i16; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i16; 12] = [0i16; 12];
+ vst3_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_s32() {
+ let a: [i32; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [i32; 6] = [1, 2, 2, 0, 0, 0];
+ let mut r: [i32; 6] = [0i32; 6];
+ vst3_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_s16() {
+ let a: [i16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [i16; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i16; 24] = [0i16; 24];
+ vst3q_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_s32() {
+ let a: [i32; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [i32; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i32; 12] = [0i32; 12];
+ vst3q_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_u8() {
+ let a: [u8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [u8; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 24] = [0u8; 24];
+ vst3_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_u16() {
+ let a: [u16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [u16; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 12] = [0u16; 12];
+ vst3_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_u32() {
+ let a: [u32; 7] = [0, 1, 2, 2, 4, 2, 4];
+ let e: [u32; 6] = [1, 2, 2, 0, 0, 0];
+ let mut r: [u32; 6] = [0u32; 6];
+ vst3_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_u16() {
+ let a: [u16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [u16; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 24] = [0u16; 24];
+ vst3q_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_u32() {
+ let a: [u32; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [u32; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u32; 12] = [0u32; 12];
+ vst3q_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_p8() {
+ let a: [u8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [u8; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 24] = [0u8; 24];
+ vst3_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_p16() {
+ let a: [u16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+ let e: [u16; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 12] = [0u16; 12];
+ vst3_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_p16() {
+ let a: [u16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+ let e: [u16; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 24] = [0u16; 24];
+ vst3q_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3_lane_f32() {
+ let a: [f32; 7] = [0., 1., 2., 2., 3., 2., 3.];
+ let e: [f32; 6] = [1., 2., 2., 0., 0., 0.];
+ let mut r: [f32; 6] = [0f32; 6];
+ vst3_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst3q_lane_f32() {
+ let a: [f32; 13] = [0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5.];
+ let e: [f32; 12] = [1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0.];
+ let mut r: [f32; 12] = [0f32; 12];
+ vst3q_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
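+ // The `vst4*` tests interleave and store four vectors. As in the vst3 tests,
+ // the input structure is read from `a[1..]` with `read_unaligned`, presumably
+ // to also exercise the unaligned-load path.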
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_s8() {
+ let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [i8; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let mut r: [i8; 32] = [0i8; 32];
+ vst4_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_s16() {
+ let a: [i16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [i16; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let mut r: [i16; 16] = [0i16; 16];
+ vst4_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_s32() {
+ let a: [i32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [i32; 8] = [1, 2, 2, 6, 2, 6, 6, 8];
+ let mut r: [i32; 8] = [0i32; 8];
+ vst4_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_s8() {
+ let a: [i8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [i8; 64] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let mut r: [i8; 64] = [0i8; 64];
+ vst4q_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_s16() {
+ let a: [i16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [i16; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let mut r: [i16; 32] = [0i16; 32];
+ vst4q_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_s32() {
+ let a: [i32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [i32; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let mut r: [i32; 16] = [0i32; 16];
+ vst4q_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_s64() {
+ let a: [i64; 5] = [0, 1, 2, 2, 6];
+ let e: [i64; 4] = [1, 2, 2, 6];
+ let mut r: [i64; 4] = [0i64; 4];
+ vst4_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_u8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u8; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst4_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_u16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [u16; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst4_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_u32() {
+ let a: [u32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [u32; 8] = [1, 2, 2, 6, 2, 6, 6, 8];
+ let mut r: [u32; 8] = [0u32; 8];
+ vst4_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_u8() {
+ let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [u8; 64] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let mut r: [u8; 64] = [0u8; 64];
+ vst4q_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_u16() {
+ let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u16; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let mut r: [u16; 32] = [0u16; 32];
+ vst4q_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_u32() {
+ let a: [u32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [u32; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let mut r: [u32; 16] = [0u32; 16];
+ vst4q_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_p8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u8; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst4_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_p16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [u16; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst4_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_p8() {
+ let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let e: [u8; 64] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+ let mut r: [u8; 64] = [0u8; 64];
+ vst4q_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_p16() {
+ let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u16; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let mut r: [u16; 32] = [0u16; 32];
+ vst4q_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_u64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 6];
+ let e: [u64; 4] = [1, 2, 2, 6];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst4_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_p64() {
+ let a: [u64; 5] = [0, 1, 2, 2, 6];
+ let e: [u64; 4] = [1, 2, 2, 6];
+ let mut r: [u64; 4] = [0u64; 4];
+ vst4_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_f32() {
+ let a: [f32; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
+ let e: [f32; 8] = [1., 2., 2., 6., 2., 6., 6., 8.];
+ let mut r: [f32; 8] = [0f32; 8];
+ vst4_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_f32() {
+ let a: [f32; 17] = [0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.];
+ let e: [f32; 16] = [1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.];
+ let mut r: [f32; 16] = [0f32; 16];
+ vst4q_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
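+ // Lane variants of vst4: only lane 0 of each of the four input vectors is
+ // stored, i.e. the first four elements of `r`; the remainder stays zeroed.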
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_s8() {
+ let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [i8; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i8; 32] = [0i8; 32];
+ vst4_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_s16() {
+ let a: [i16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [i16; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i16; 16] = [0i16; 16];
+ vst4_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_s32() {
+ let a: [i32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [i32; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+ let mut r: [i32; 8] = [0i32; 8];
+ vst4_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_s16() {
+ let a: [i16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [i16; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i16; 32] = [0i16; 32];
+ vst4q_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_s32() {
+ let a: [i32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [i32; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [i32; 16] = [0i32; 16];
+ vst4q_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_u8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u8; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst4_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_u16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [u16; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst4_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_u32() {
+ let a: [u32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+ let e: [u32; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+ let mut r: [u32; 8] = [0u32; 8];
+ vst4_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_u16() {
+ let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u16; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 32] = [0u16; 32];
+ vst4q_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_u32() {
+ let a: [u32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [u32; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u32; 16] = [0u32; 16];
+ vst4q_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_p8() {
+ let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u8; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u8; 32] = [0u8; 32];
+ vst4_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_p16() {
+ let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+ let e: [u16; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 16] = [0u16; 16];
+ vst4_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_p16() {
+ let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+ let e: [u16; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let mut r: [u16; 32] = [0u16; 32];
+ vst4q_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4_lane_f32() {
+ let a: [f32; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
+ let e: [f32; 8] = [1., 2., 2., 6., 0., 0., 0., 0.];
+ let mut r: [f32; 8] = [0f32; 8];
+ vst4_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst4q_lane_f32() {
+ let a: [f32; 17] = [0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.];
+ let e: [f32; 16] = [1., 2., 2., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.];
+ let mut r: [f32; 16] = [0f32; 16];
+ vst4q_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+ }
+
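+ // Element-wise multiplication. Note that the polynomial variants
+ // (`vmul_p8`/`vmulq_p8`) perform carry-less multiplication over GF(2):
+ // e.g. 3 * 6 = 0b11 * 0b110 = 0b1010 = 10, not 18.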
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(1, 4, 3, 8, 5, 12, 7, 16);
+ let r: i8x8 = transmute(vmul_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32);
+ let r: i8x16 = transmute(vmulq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 1, 2);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i16x4 = i16x4::new(1, 4, 3, 8);
+ let r: i16x4 = transmute(vmul_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(1, 4, 3, 8, 5, 12, 7, 16);
+ let r: i16x8 = transmute(vmulq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i32x2 = i32x2::new(1, 4);
+ let r: i32x2 = transmute(vmul_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 1, 2);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(1, 4, 3, 8);
+ let r: i32x4 = transmute(vmulq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(1, 4, 3, 8, 5, 12, 7, 16);
+ let r: u8x8 = transmute(vmul_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+ let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32);
+ let r: u8x16 = transmute(vmulq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 1, 2);
+ let b: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(1, 4, 3, 8);
+ let r: u16x4 = transmute(vmul_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(1, 4, 3, 8, 5, 12, 7, 16);
+ let r: u16x8 = transmute(vmulq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(1, 4);
+ let r: u32x2 = transmute(vmul_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 1, 2);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(1, 4, 3, 8);
+ let r: u32x4 = transmute(vmulq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_p8() {
+ let a: i8x8 = i8x8::new(1, 3, 1, 3, 1, 3, 1, 3);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(1, 6, 3, 12, 5, 10, 7, 24);
+ let r: i8x8 = transmute(vmul_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_p8() {
+ let a: i8x16 = i8x16::new(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3);
+ let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48);
+ let r: i8x16 = transmute(vmulq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_f32() {
+ let a: f32x2 = f32x2::new(1.0, 2.0);
+ let b: f32x2 = f32x2::new(2.0, 3.0);
+ let e: f32x2 = f32x2::new(2.0, 6.0);
+ let r: f32x2 = transmute(vmul_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 2.0, 1.0, 2.0);
+ let b: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+ let e: f32x4 = f32x4::new(2.0, 6.0, 4.0, 10.0);
+ let r: f32x4 = transmute(vmulq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
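+ // `vmul_n_*` multiplies every element of the vector by one scalar.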
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_n_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16 = 2;
+ let e: i16x4 = i16x4::new(2, 4, 6, 8);
+ let r: i16x4 = transmute(vmul_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_n_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16 = 2;
+ let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+ let r: i16x8 = transmute(vmulq_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_n_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32 = 2;
+ let e: i32x2 = i32x2::new(2, 4);
+ let r: i32x2 = transmute(vmul_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_n_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32 = 2;
+ let e: i32x4 = i32x4::new(2, 4, 6, 8);
+ let r: i32x4 = transmute(vmulq_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_n_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16 = 2;
+ let e: u16x4 = u16x4::new(2, 4, 6, 8);
+ let r: u16x4 = transmute(vmul_n_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_n_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16 = 2;
+ let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+ let r: u16x8 = transmute(vmulq_n_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_n_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32 = 2;
+ let e: u32x2 = u32x2::new(2, 4);
+ let r: u32x2 = transmute(vmul_n_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_n_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32 = 2;
+ let e: u32x4 = u32x4::new(2, 4, 6, 8);
+ let r: u32x4 = transmute(vmulq_n_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_n_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32 = 2.;
+ let e: f32x2 = f32x2::new(2., 4.);
+ let r: f32x2 = transmute(vmul_n_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_n_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32 = 2.;
+ let e: f32x4 = f32x4::new(2., 4., 6., 8.);
+ let r: f32x4 = transmute(vmulq_n_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
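+ // `vmul_lane*`/`vmul_laneq*` multiply every element of `a` by a single lane
+ // of `b`, selected by the const generic parameter.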
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16x4 = i16x4::new(2, 4, 6, 8);
+ let r: i16x4 = transmute(vmul_lane_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_laneq_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16x4 = i16x4::new(2, 4, 6, 8);
+ let r: i16x4 = transmute(vmul_laneq_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_lane_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+ let r: i16x8 = transmute(vmulq_lane_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_laneq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+ let r: i16x8 = transmute(vmulq_laneq_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i32x2 = i32x2::new(2, 4);
+ let r: i32x2 = transmute(vmul_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_laneq_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32x2 = i32x2::new(2, 4);
+ let r: i32x2 = transmute(vmul_laneq_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_lane_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i32x4 = i32x4::new(2, 4, 6, 8);
+ let r: i32x4 = transmute(vmulq_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_laneq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(2, 4, 6, 8);
+ let r: i32x4 = transmute(vmulq_laneq_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_lane_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(0, 2, 0, 0);
+ let e: u16x4 = u16x4::new(2, 4, 6, 8);
+ let r: u16x4 = transmute(vmul_lane_u16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_laneq_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: u16x4 = u16x4::new(2, 4, 6, 8);
+ let r: u16x4 = transmute(vmul_laneq_u16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_lane_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x4 = u16x4::new(0, 2, 0, 0);
+ let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+ let r: u16x8 = transmute(vmulq_lane_u16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_laneq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+ let r: u16x8 = transmute(vmulq_laneq_u16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_lane_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(0, 2);
+ let e: u32x2 = u32x2::new(2, 4);
+ let r: u32x2 = transmute(vmul_lane_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_laneq_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x4 = u32x4::new(0, 2, 0, 0);
+ let e: u32x2 = u32x2::new(2, 4);
+ let r: u32x2 = transmute(vmul_laneq_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_lane_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x2 = u32x2::new(0, 2);
+ let e: u32x4 = u32x4::new(2, 4, 6, 8);
+ let r: u32x4 = transmute(vmulq_lane_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_laneq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(0, 2, 0, 0);
+ let e: u32x4 = u32x4::new(2, 4, 6, 8);
+ let r: u32x4 = transmute(vmulq_laneq_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_lane_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x2 = f32x2::new(2., 0.);
+ let e: f32x2 = f32x2::new(2., 4.);
+ let r: f32x2 = transmute(vmul_lane_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_laneq_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32x2 = f32x2::new(2., 4.);
+ let r: f32x2 = transmute(vmul_laneq_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_lane_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x2 = f32x2::new(2., 0.);
+ let e: f32x4 = f32x4::new(2., 4., 6., 8.);
+ let r: f32x4 = transmute(vmulq_lane_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_laneq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x4 = f32x4::new(2., 0., 0., 0.);
+ let e: f32x4 = f32x4::new(2., 4., 6., 8.);
+ let r: f32x4 = transmute(vmulq_laneq_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
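+ // `vmull_*` is a widening multiply: result elements are twice the width of
+ // the inputs (e.g. i8x8 * i8x8 -> i16x8), so the products cannot overflow.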
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i16x8 = i16x8::new(1, 4, 3, 8, 5, 12, 7, 16);
+ let r: i16x8 = transmute(vmull_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(1, 2, 1, 2);
+ let e: i32x4 = i32x4::new(1, 4, 3, 8);
+ let r: i32x4 = transmute(vmull_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i64x2 = i64x2::new(1, 4);
+ let r: i64x2 = transmute(vmull_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u16x8 = u16x8::new(1, 4, 3, 8, 5, 12, 7, 16);
+ let r: u16x8 = transmute(vmull_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(1, 2, 1, 2);
+ let e: u32x4 = u32x4::new(1, 4, 3, 8);
+ let r: u32x4 = transmute(vmull_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u64x2 = u64x2::new(1, 4);
+ let r: u64x2 = transmute(vmull_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_p8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(1, 3, 1, 3, 1, 3, 1, 3);
+ let e: i16x8 = i16x8::new(1, 6, 3, 12, 5, 10, 7, 24);
+ let r: i16x8 = transmute(vmull_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
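+ // Widening multiply by a scalar (`vmull_n_*`) and by a selected lane
+ // (`vmull_lane*`/`vmull_laneq*`).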
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_n_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16 = 2;
+ let e: i32x4 = i32x4::new(2, 4, 6, 8);
+ let r: i32x4 = transmute(vmull_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_n_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32 = 2;
+ let e: i64x2 = i64x2::new(2, 4);
+ let r: i64x2 = transmute(vmull_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_n_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16 = 2;
+ let e: u32x4 = u32x4::new(2, 4, 6, 8);
+ let r: u32x4 = transmute(vmull_n_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_n_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32 = 2;
+ let e: u64x2 = u64x2::new(2, 4);
+ let r: u64x2 = transmute(vmull_n_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(2, 4, 6, 8);
+ let r: i32x4 = transmute(vmull_lane_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_laneq_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i32x4 = i32x4::new(2, 4, 6, 8);
+ let r: i32x4 = transmute(vmull_laneq_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(2, 4);
+ let r: i64x2 = transmute(vmull_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_laneq_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i64x2 = i64x2::new(2, 4);
+ let r: i64x2 = transmute(vmull_laneq_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_lane_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(0, 2, 0, 0);
+ let e: u32x4 = u32x4::new(2, 4, 6, 8);
+ let r: u32x4 = transmute(vmull_lane_u16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_laneq_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: u32x4 = u32x4::new(2, 4, 6, 8);
+ let r: u32x4 = transmute(vmull_laneq_u16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_lane_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(0, 2);
+ let e: u64x2 = u64x2::new(2, 4);
+ let r: u64x2 = transmute(vmull_lane_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmull_laneq_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x4 = u32x4::new(0, 2, 0, 0);
+ let e: u64x2 = u64x2::new(2, 4);
+ let r: u64x2 = transmute(vmull_laneq_u32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
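+ // Fused multiply-accumulate: `vfma` computes `a + b * c` and `vfms` computes
+ // `a - b * c`, each with a single rounding; the `_n` forms broadcast a
+ // scalar `c`.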
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfma_f32() {
+ let a: f32x2 = f32x2::new(8.0, 18.0);
+ let b: f32x2 = f32x2::new(6.0, 4.0);
+ let c: f32x2 = f32x2::new(2.0, 3.0);
+ let e: f32x2 = f32x2::new(20.0, 30.0);
+ let r: f32x2 = transmute(vfma_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmaq_f32() {
+ let a: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
+ let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
+ let c: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+ let e: f32x4 = f32x4::new(20.0, 30.0, 40.0, 50.0);
+ let r: f32x4 = transmute(vfmaq_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfma_n_f32() {
+ let a: f32x2 = f32x2::new(2.0, 3.0);
+ let b: f32x2 = f32x2::new(6.0, 4.0);
+ let c: f32 = 8.0;
+ let e: f32x2 = f32x2::new(50.0, 35.0);
+ let r: f32x2 = transmute(vfma_n_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmaq_n_f32() {
+ let a: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+ let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
+ let c: f32 = 8.0;
+ let e: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
+ let r: f32x4 = transmute(vfmaq_n_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfms_f32() {
+ let a: f32x2 = f32x2::new(20.0, 30.0);
+ let b: f32x2 = f32x2::new(6.0, 4.0);
+ let c: f32x2 = f32x2::new(2.0, 3.0);
+ let e: f32x2 = f32x2::new(8.0, 18.0);
+ let r: f32x2 = transmute(vfms_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsq_f32() {
+ let a: f32x4 = f32x4::new(20.0, 30.0, 40.0, 50.0);
+ let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
+ let c: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+ let e: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
+ let r: f32x4 = transmute(vfmsq_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfms_n_f32() {
+ let a: f32x2 = f32x2::new(50.0, 35.0);
+ let b: f32x2 = f32x2::new(6.0, 4.0);
+ let c: f32 = 8.0;
+ let e: f32x2 = f32x2::new(2.0, 3.0);
+ let r: f32x2 = transmute(vfms_n_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vfmsq_n_f32() {
+ let a: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
+ let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
+ let c: f32 = 8.0;
+ let e: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+ let r: f32x4 = transmute(vfmsq_n_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
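+ // Plain element-wise subtraction across all supported element widths.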
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i8x8 = i8x8::new(0, 0, 2, 2, 4, 4, 6, 6);
+ let r: i8x8 = transmute(vsub_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i8x16 = i8x16::new(0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+ let r: i8x16 = transmute(vsubq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(1, 2, 1, 2);
+ let e: i16x4 = i16x4::new(0, 0, 2, 2);
+ let r: i16x4 = transmute(vsub_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i16x8 = i16x8::new(0, 0, 2, 2, 4, 4, 6, 6);
+ let r: i16x8 = transmute(vsubq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i32x2 = i32x2::new(0, 0);
+ let r: i32x2 = transmute(vsub_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(1, 2, 1, 2);
+ let e: i32x4 = i32x4::new(0, 0, 2, 2);
+ let r: i32x4 = transmute(vsubq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u8x8 = u8x8::new(0, 0, 2, 2, 4, 4, 6, 6);
+ let r: u8x8 = transmute(vsub_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u8x16 = u8x16::new(0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+ let r: u8x16 = transmute(vsubq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(1, 2, 1, 2);
+ let e: u16x4 = u16x4::new(0, 0, 2, 2);
+ let r: u16x4 = transmute(vsub_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u16x8 = u16x8::new(0, 0, 2, 2, 4, 4, 6, 6);
+ let r: u16x8 = transmute(vsubq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vsub_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 1, 2);
+ let e: u32x4 = u32x4::new(0, 0, 2, 2);
+ let r: u32x4 = transmute(vsubq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(1);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vsub_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(1, 2);
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vsubq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: u64x1 = u64x1::new(1);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vsub_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u64x2 = u64x2::new(1, 2);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vsubq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_f32() {
+ let a: f32x2 = f32x2::new(1.0, 4.0);
+ let b: f32x2 = f32x2::new(1.0, 2.0);
+ let e: f32x2 = f32x2::new(0.0, 2.0);
+ let r: f32x2 = transmute(vsub_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 4.0, 3.0, 8.0);
+ let b: f32x4 = f32x4::new(1.0, 2.0, 3.0, 4.0);
+ let e: f32x4 = f32x4::new(0.0, 2.0, 0.0, 4.0);
+ let r: f32x4 = transmute(vsubq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
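+ // Polynomial addition is XOR of the bit patterns, hence 1 + 1 = 0 and
+ // 2 + 1 = 3 below.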
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_p8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let e: i8x8 = i8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let r: i8x8 = transmute(vadd_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_p16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(1, 1, 1, 1);
+ let e: i16x4 = i16x4::new(0, 3, 2, 5);
+ let r: i16x4 = transmute(vadd_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_p8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let e: i8x16 = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17);
+ let r: i8x16 = transmute(vaddq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_p16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let e: i16x8 = i16x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let r: i16x8 = transmute(vaddq_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_p64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(1);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vadd_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_p64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(1, 1);
+ let e: i64x2 = i64x2::new(0, 3);
+ let r: i64x2 = transmute(vaddq_p64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_p128() {
+ let a: p128 = 16;
+ let b: p128 = 1;
+ let e: p128 = 17;
+ let r: p128 = transmute(vaddq_p128(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
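+ // `vsubhn_*` ("subtract, high narrow") keeps only the high half of each
+ // difference, e.g. `(a - b) >> 8` truncated to i8 for i16 inputs.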
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, -32768, 1, 1, 0x7F_FF, -32768, 1, 1);
+ let b: i16x8 = i16x8::new(1, 0, 0, 0, 1, 0, 0, 0);
+ let e: i8x8 = i8x8::new(0x7F, -128, 0, 0, 0x7F, -128, 0, 0);
+ let r: i8x8 = transmute(vsubhn_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, -2147483648, 1, 1);
+ let b: i32x4 = i32x4::new(1, 0, 0, 0);
+ let e: i16x4 = i16x4::new(0x7F_FF, -32768, 0, 0);
+ let r: i16x4 = transmute(vsubhn_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_s64() {
+ let a: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, -9223372036854775808);
+ let b: i64x2 = i64x2::new(1, 0);
+ let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, -2147483648);
+ let r: i32x2 = transmute(vsubhn_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_u16() {
+ let a: u16x8 = u16x8::new(0xFF_FF, 0, 1, 1, 0xFF_FF, 0, 1, 1);
+ let b: u16x8 = u16x8::new(1, 0, 0, 0, 1, 0, 0, 0);
+ let e: u8x8 = u8x8::new(0xFF, 0, 0, 0, 0xFF, 0, 0, 0);
+ let r: u8x8 = transmute(vsubhn_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_u32() {
+ let a: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 1, 1);
+ let b: u32x4 = u32x4::new(1, 0, 0, 0);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0, 0, 0);
+ let r: u16x4 = transmute(vsubhn_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_u64() {
+ let a: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let b: u64x2 = u64x2::new(1, 0);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let r: u32x2 = transmute(vsubhn_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
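+ // The `_high` variants pass `a` through as the lower half of the result and
+ // place the narrowed difference of `b` and `c` in the upper half.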
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_high_s16() {
+ let a: i8x8 = i8x8::new(0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0);
+ let b: i16x8 = i16x8::new(0x7F_FF, 1, 0x7F_FF, 1, 0x7F_FF, 1, 0x7F_FF, 1);
+ let c: i16x8 = i16x8::new(1, 0, 1, 0, 1, 0, 1, 0);
+ let e: i8x16 = i8x16::new(0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0);
+ let r: i8x16 = transmute(vsubhn_high_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_high_s32() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0, 0x7F_FF, 0);
+ let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 1, 0x7F_FF_FF_FF, 1);
+ let c: i32x4 = i32x4::new(1, 0, 1, 0);
+ let e: i16x8 = i16x8::new(0x7F_FF, 0, 0x7F_FF, 0, 0x7F_FF, 0, 0x7F_FF, 0);
+ let r: i16x8 = transmute(vsubhn_high_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_high_s64() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0);
+ let b: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 1);
+ let c: i64x2 = i64x2::new(1, 0);
+ let e: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0, 0x7F_FF_FF_FF, 0);
+ let r: i32x4 = transmute(vsubhn_high_s64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_high_u16() {
+ let a: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+ let b: u16x8 = u16x8::new(0xFF_FF, 1, 0xFF_FF, 1, 0xFF_FF, 1, 0xFF_FF, 1);
+ let c: u16x8 = u16x8::new(1, 0, 1, 0, 1, 0, 1, 0);
+ let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
+ let r: u8x16 = transmute(vsubhn_high_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_high_u32() {
+ let a: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0);
+ let b: u32x4 = u32x4::new(0xFF_FF_FF_FF, 1, 0xFF_FF_FF_FF, 1);
+ let c: u32x4 = u32x4::new(1, 0, 1, 0);
+ let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0);
+ let r: u16x8 = transmute(vsubhn_high_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubhn_high_u64() {
+ let a: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let b: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 1);
+ let c: u64x2 = u64x2::new(1, 0);
+ let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
+ let r: u32x4 = transmute(vsubhn_high_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
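+ // Halving subtract: each result element is `(a - b) >> 1`, with the
+ // intermediate difference computed without overflow.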
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u8x8 = u8x8::new(0, 0, 1, 1, 2, 2, 3, 3);
+ let r: u8x8 = transmute(vhsub_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u8x16 = u8x16::new(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
+ let r: u8x16 = transmute(vhsubq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(1, 2, 1, 2);
+ let e: u16x4 = u16x4::new(0, 0, 1, 1);
+ let r: u16x4 = transmute(vhsub_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: u16x8 = u16x8::new(0, 0, 1, 1, 2, 2, 3, 3);
+ let r: u16x8 = transmute(vhsubq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vhsub_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 1, 2);
+ let e: u32x4 = u32x4::new(0, 0, 1, 1);
+ let r: u32x4 = transmute(vhsubq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i8x8 = i8x8::new(0, 0, 1, 1, 2, 2, 3, 3);
+ let r: i8x8 = transmute(vhsub_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i8x16 = i8x16::new(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
+ let r: i8x16 = transmute(vhsubq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(1, 2, 1, 2);
+ let e: i16x4 = i16x4::new(0, 0, 1, 1);
+ let r: i16x4 = transmute(vhsub_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
+ let e: i16x8 = i16x8::new(0, 0, 1, 1, 2, 2, 3, 3);
+ let r: i16x8 = transmute(vhsubq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i32x2 = i32x2::new(0, 0);
+ let r: i32x2 = transmute(vhsub_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(1, 2, 1, 2);
+ let e: i32x4 = i32x4::new(0, 0, 1, 1);
+ let r: i32x4 = transmute(vhsubq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
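+    // vsubw: widening subtract; the narrow operand b is widened to the lane
+    // width of a before subtracting, so the result keeps a's element type.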
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_s8() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vsubw_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_s16() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: i32x4 = i32x4::new(0, 0, 0, 0);
+ let r: i32x4 = transmute(vsubw_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_s32() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let b: i32x2 = i32x2::new(0, 1);
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vsubw_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_u8() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vsubw_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_u16() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(0, 0, 0, 0);
+ let r: u32x4 = transmute(vsubw_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubw_u32() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let b: u32x2 = u32x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vsubw_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
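+    // vsubl: long subtract; both operands are widened before subtracting, so
+    // even extreme inputs (MIN, MAX, 0xFF..) subtract without wrapping.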
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_s8() {
+ let a: i8x8 = i8x8::new(0x7F, -128, 2, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(0x7F, -128, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vsubl_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, -32768, 2, 3);
+ let b: i16x4 = i16x4::new(0x7F_FF, -32768, 2, 3);
+ let e: i32x4 = i32x4::new(0, 0, 0, 0);
+ let r: i32x4 = transmute(vsubl_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, -2147483648);
+ let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, -2147483648);
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vsubl_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_u8() {
+ let a: u8x8 = u8x8::new(0xFF, 0, 2, 3, 4, 5, 6, 7);
+ let b: u8x8 = u8x8::new(0xFF, 0, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vsubl_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_u16() {
+ let a: u16x4 = u16x4::new(0xFF_FF, 0, 2, 3);
+ let b: u16x4 = u16x4::new(0xFF_FF, 0, 2, 3);
+ let e: u32x4 = u32x4::new(0, 0, 0, 0);
+ let r: u32x4 = transmute(vsubl_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubl_u32() {
+ let a: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let b: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vsubl_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
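+    // vmax/vmin (and the q-suffixed 128-bit forms) return the per-lane
+    // maximum/minimum of the two inputs.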
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmax_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: i8x8 = i8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let r: i8x8 = transmute(vmax_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let e: i8x16 = i8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vmaxq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmax_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(16, 15, 14, 13);
+ let e: i16x4 = i16x4::new(16, 15, 14, 13);
+ let r: i16x4 = transmute(vmax_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: i16x8 = i16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let r: i16x8 = transmute(vmaxq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmax_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(16, 15);
+ let e: i32x2 = i32x2::new(16, 15);
+ let r: i32x2 = transmute(vmax_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(16, 15, 14, 13);
+ let e: i32x4 = i32x4::new(16, 15, 14, 13);
+ let r: i32x4 = transmute(vmaxq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmax_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: u8x8 = u8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let r: u8x8 = transmute(vmax_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let e: u8x16 = u8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vmaxq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmax_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(16, 15, 14, 13);
+ let e: u16x4 = u16x4::new(16, 15, 14, 13);
+ let r: u16x4 = transmute(vmax_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: u16x8 = u16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let r: u16x8 = transmute(vmaxq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmax_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(16, 15);
+ let e: u32x2 = u32x2::new(16, 15);
+ let r: u32x2 = transmute(vmax_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(16, 15, 14, 13);
+ let e: u32x4 = u32x4::new(16, 15, 14, 13);
+ let r: u32x4 = transmute(vmaxq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmax_f32() {
+ let a: f32x2 = f32x2::new(1.0, -2.0);
+ let b: f32x2 = f32x2::new(0.0, 3.0);
+ let e: f32x2 = f32x2::new(1.0, 3.0);
+ let r: f32x2 = transmute(vmax_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxq_f32() {
+ let a: f32x4 = f32x4::new(1.0, -2.0, 3.0, -4.0);
+ let b: f32x4 = f32x4::new(0.0, 3.0, 2.0, 8.0);
+ let e: f32x4 = f32x4::new(1.0, 3.0, 3.0, 8.0);
+ let r: f32x4 = transmute(vmaxq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
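+    // vmaxnm/vminnm follow IEEE 754-2008 maxNum/minNum semantics: if exactly
+    // one operand is a quiet NaN, the other (numeric) operand is returned.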
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxnm_f32() {
+ let a: f32x2 = f32x2::new(1.0, 2.0);
+ let b: f32x2 = f32x2::new(8.0, 16.0);
+ let e: f32x2 = f32x2::new(8.0, 16.0);
+ let r: f32x2 = transmute(vmaxnm_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmaxnmq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 2.0, 3.0, -4.0);
+ let b: f32x4 = f32x4::new(8.0, 16.0, -1.0, 6.0);
+ let e: f32x4 = f32x4::new(8.0, 16.0, 3.0, 6.0);
+ let r: f32x4 = transmute(vmaxnmq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmin_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vmin_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let e: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1);
+ let r: i8x16 = transmute(vminq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmin_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(16, 15, 14, 13);
+ let e: i16x4 = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vmin_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vminq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmin_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(16, 15);
+ let e: i32x2 = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vmin_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(16, 15, 14, 13);
+ let e: i32x4 = i32x4::new(1, 2, 3, 4);
+ let r: i32x4 = transmute(vminq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmin_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vmin_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: u8x16 = u8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let e: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1);
+ let r: u8x16 = transmute(vminq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmin_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(16, 15, 14, 13);
+ let e: u16x4 = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vmin_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(16, 15, 14, 13, 12, 11, 10, 9);
+ let e: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vminq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmin_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(16, 15);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vmin_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u32x4 = u32x4::new(16, 15, 14, 13);
+ let e: u32x4 = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vminq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmin_f32() {
+ let a: f32x2 = f32x2::new(1.0, -2.0);
+ let b: f32x2 = f32x2::new(0.0, 3.0);
+ let e: f32x2 = f32x2::new(0.0, -2.0);
+ let r: f32x2 = transmute(vmin_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminq_f32() {
+ let a: f32x4 = f32x4::new(1.0, -2.0, 3.0, -4.0);
+ let b: f32x4 = f32x4::new(0.0, 3.0, 2.0, 8.0);
+ let e: f32x4 = f32x4::new(0.0, -2.0, 2.0, -4.0);
+ let r: f32x4 = transmute(vminq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminnm_f32() {
+ let a: f32x2 = f32x2::new(1.0, 2.0);
+ let b: f32x2 = f32x2::new(8.0, 16.0);
+ let e: f32x2 = f32x2::new(1.0, 2.0);
+ let r: f32x2 = transmute(vminnm_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vminnmq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 2.0, 3.0, -4.0);
+ let b: f32x4 = f32x4::new(8.0, 16.0, -1.0, 6.0);
+ let e: f32x4 = f32x4::new(1.0, 2.0, -1.0, -4.0);
+ let r: f32x4 = transmute(vminnmq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
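+    // vpadd: pairwise add; adjacent lanes of a, then of b, are summed,
+    // giving (a0+a1, b0+b1) here.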
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadd_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x2 = f32x2::new(3., 4.);
+ let e: f32x2 = f32x2::new(3., 7.);
+ let r: f32x2 = transmute(vpadd_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
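+    // vqdmull: saturating doubling multiply long; each result lane is
+    // saturate(2 * a * b) at double the input lane width. The _n and _lane
+    // forms multiply by a scalar or by one selected lane of b.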
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(0, 4, 12, 24);
+ let r: i32x4 = transmute(vqdmull_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(1, 2);
+ let e: i64x2 = i64x2::new(0, 4);
+ let r: i64x2 = transmute(vqdmull_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_n_s16() {
+ let a: i16x4 = i16x4::new(2, 4, 6, 8);
+ let b: i16 = 2;
+ let e: i32x4 = i32x4::new(8, 16, 24, 32);
+ let r: i32x4 = transmute(vqdmull_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_n_s32() {
+ let a: i32x2 = i32x2::new(2, 4);
+ let b: i32 = 2;
+ let e: i64x2 = i64x2::new(8, 16);
+ let r: i64x2 = transmute(vqdmull_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_lane_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(0, 2, 2, 0);
+ let e: i32x4 = i32x4::new(4, 8, 12, 16);
+ let r: i32x4 = transmute(vqdmull_lane_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmull_lane_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(4, 8);
+ let r: i64x2 = transmute(vqdmull_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
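+    // vqdmlal: saturating doubling multiply-accumulate long,
+    // a + saturate(2 * b * c), e.g. 1 + 2*1*2 == 5 below.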
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_s16() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(5, 9, 13, 17);
+ let r: i32x4 = transmute(vqdmlal_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_s32() {
+ let a: i64x2 = i64x2::new(1, 1);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32x2 = i32x2::new(2, 2);
+ let e: i64x2 = i64x2::new(5, 9);
+ let r: i64x2 = transmute(vqdmlal_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_n_s16() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16 = 2;
+ let e: i32x4 = i32x4::new(5, 9, 13, 17);
+ let r: i32x4 = transmute(vqdmlal_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_n_s32() {
+ let a: i64x2 = i64x2::new(1, 1);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32 = 2;
+ let e: i64x2 = i64x2::new(5, 9);
+ let r: i64x2 = transmute(vqdmlal_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_lane_s16() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16x4 = i16x4::new(0, 2, 2, 0);
+ let e: i32x4 = i32x4::new(5, 10, 15, 20);
+ let r: i32x4 = transmute(vqdmlal_lane_s16::<2>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlal_lane_s32() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(5, 10);
+ let r: i64x2 = transmute(vqdmlal_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
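+    // vqdmlsl: the subtracting counterpart, a - saturate(2 * b * c).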
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_s16() {
+ let a: i32x4 = i32x4::new(3, 7, 11, 15);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(-1, -1, -1, -1);
+ let r: i32x4 = transmute(vqdmlsl_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_s32() {
+ let a: i64x2 = i64x2::new(3, 7);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32x2 = i32x2::new(2, 2);
+ let e: i64x2 = i64x2::new(-1, -1);
+ let r: i64x2 = transmute(vqdmlsl_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_n_s16() {
+ let a: i32x4 = i32x4::new(3, 7, 11, 15);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16 = 2;
+ let e: i32x4 = i32x4::new(-1, -1, -1, -1);
+ let r: i32x4 = transmute(vqdmlsl_n_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_n_s32() {
+ let a: i64x2 = i64x2::new(3, 7);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32 = 2;
+ let e: i64x2 = i64x2::new(-1, -1);
+ let r: i64x2 = transmute(vqdmlsl_n_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_lane_s16() {
+ let a: i32x4 = i32x4::new(3, 6, 9, 12);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16x4 = i16x4::new(0, 2, 2, 0);
+ let e: i32x4 = i32x4::new(-1, -2, -3, -4);
+ let r: i32x4 = transmute(vqdmlsl_lane_s16::<2>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmlsl_lane_s32() {
+ let a: i64x2 = i64x2::new(3, 6);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32x2 = i32x2::new(0, 2);
+ let e: i64x2 = i64x2::new(-1, -2);
+ let r: i64x2 = transmute(vqdmlsl_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
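+    // vqdmulh: saturating doubling multiply returning the high half,
+    // (2 * a * b) >> 16 for s16 lanes (>> 32 for s32), hence
+    // (2 * 0x7F_FF * 2) >> 16 == 1.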
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulh_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i16x4 = i16x4::new(1, 1, 1, 1);
+ let r: i16x4 = transmute(vqdmulh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhq_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i16x8 = transmute(vqdmulhq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulh_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: i32x2 = i32x2::new(1, 1);
+ let r: i32x2 = transmute(vqdmulh_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhq_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(1, 1, 1, 1);
+ let r: i32x4 = transmute(vqdmulhq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulh_n_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16 = 2;
+ let e: i16x4 = i16x4::new(1, 1, 1, 1);
+ let r: i16x4 = transmute(vqdmulh_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulh_n_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32 = 2;
+ let e: i32x2 = i32x2::new(1, 1);
+ let r: i32x2 = transmute(vqdmulh_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhq_n_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16 = 2;
+ let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i16x8 = transmute(vqdmulhq_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhq_n_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32 = 2;
+ let e: i32x4 = i32x4::new(1, 1, 1, 1);
+ let r: i32x4 = transmute(vqdmulhq_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhq_laneq_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x8 = i16x8::new(2, 1, 1, 1, 1, 1, 1, 1);
+ let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let r: i16x8 = transmute(vqdmulhq_laneq_s16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulh_laneq_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x8 = i16x8::new(2, 1, 1, 1, 1, 1, 1, 1);
+ let e: i16x4 = i16x4::new(1, 1, 1, 1);
+ let r: i16x4 = transmute(vqdmulh_laneq_s16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulhq_laneq_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x4 = i32x4::new(2, 1, 1, 1);
+ let e: i32x4 = i32x4::new(1, 1, 1, 1);
+ let r: i32x4 = transmute(vqdmulhq_laneq_s32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqdmulh_laneq_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x4 = i32x4::new(2, 1, 1, 1);
+ let e: i32x2 = i32x2::new(1, 1);
+ let r: i32x2 = transmute(vqdmulh_laneq_s32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
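+    // vqmovn: saturating narrow to half-width lanes; values outside the
+    // narrow range clamp, so 0x7F_FF narrows to 0x7F.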
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let e: i8x8 = i8x8::new(0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
+ let r: i8x8 = transmute(vqmovn_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let e: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let r: i16x4 = transmute(vqmovn_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_s64() {
+ let a: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+ let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let r: i32x2 = transmute(vqmovn_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_u16() {
+ let a: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ let r: u8x8 = transmute(vqmovn_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_u32() {
+ let a: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
+ let r: u16x4 = transmute(vqmovn_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovn_u64() {
+ let a: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+ let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+ let r: u32x2 = transmute(vqmovn_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
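+    // vqmovun: saturating narrow from signed to unsigned lanes; negative
+    // inputs clamp to 0.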
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovun_s16() {
+ let a: i16x8 = i16x8::new(-1, -1, -1, -1, -1, -1, -1, -1);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vqmovun_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovun_s32() {
+ let a: i32x4 = i32x4::new(-1, -1, -1, -1);
+ let e: u16x4 = u16x4::new(0, 0, 0, 0);
+ let r: u16x4 = transmute(vqmovun_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqmovun_s64() {
+ let a: i64x2 = i64x2::new(-1, -1);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vqmovun_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
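+    // vqrdmulh: like vqdmulh but rounding, (2 * a * b + (1 << 15)) >> 16 for
+    // s16 lanes, which is why these expected values are 2 where the vqdmulh
+    // tests above produced 1.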
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulh_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i16x4 = i16x4::new(2, 2, 2, 2);
+ let r: i16x4 = transmute(vqrdmulh_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhq_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let r: i16x8 = transmute(vqrdmulhq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulh_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: i32x2 = i32x2::new(2, 2);
+ let r: i32x2 = transmute(vqrdmulh_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhq_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(2, 2, 2, 2);
+ let r: i32x4 = transmute(vqrdmulhq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulh_n_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16 = 2;
+ let e: i16x4 = i16x4::new(2, 2, 2, 2);
+ let r: i16x4 = transmute(vqrdmulh_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhq_n_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16 = 2;
+ let e: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let r: i16x8 = transmute(vqrdmulhq_n_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulh_n_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32 = 2;
+ let e: i32x2 = i32x2::new(2, 2);
+ let r: i32x2 = transmute(vqrdmulh_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhq_n_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32 = 2;
+ let e: i32x4 = i32x4::new(2, 2, 2, 2);
+ let r: i32x4 = transmute(vqrdmulhq_n_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulh_lane_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16x4 = i16x4::new(2, 2, 2, 2);
+ let r: i16x4 = transmute(vqrdmulh_lane_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulh_laneq_s16() {
+ let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16x4 = i16x4::new(2, 2, 2, 2);
+ let r: i16x4 = transmute(vqrdmulh_laneq_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhq_lane_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x4 = i16x4::new(0, 2, 0, 0);
+ let e: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let r: i16x8 = transmute(vqrdmulhq_lane_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhq_laneq_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF);
+ let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+ let e: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let r: i16x8 = transmute(vqrdmulhq_laneq_s16::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulh_lane_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i32x2 = i32x2::new(2, 2);
+ let r: i32x2 = transmute(vqrdmulh_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulh_laneq_s32() {
+ let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32x2 = i32x2::new(2, 2);
+ let r: i32x2 = transmute(vqrdmulh_laneq_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhq_lane_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i32x4 = i32x4::new(2, 2, 2, 2);
+ let r: i32x4 = transmute(vqrdmulhq_lane_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrdmulhq_laneq_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let b: i32x4 = i32x4::new(0, 2, 0, 0);
+ let e: i32x4 = i32x4::new(2, 2, 2, 2);
+ let r: i32x4 = transmute(vqrdmulhq_laneq_s32::<1>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
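+    // vqrshl: saturating rounding shift left by a signed, per-lane shift
+    // count (negative counts shift right); out-of-range results clamp to the
+    // lane's MIN/MAX, as the -128 and 0x7F lanes show.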
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshl_s8() {
+ let a: i8x8 = i8x8::new(2, -128, 0x7F, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i8x8 = i8x8::new(8, -128, 0x7F, 12, 16, 20, 24, 28);
+ let r: i8x8 = transmute(vqrshl_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlq_s8() {
+ let a: i8x16 = i8x16::new(2, -128, 0x7F, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i8x16 = i8x16::new(8, -128, 0x7F, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+ let r: i8x16 = transmute(vqrshlq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshl_s16() {
+ let a: i16x4 = i16x4::new(2, -32768, 0x7F_FF, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i16x4 = i16x4::new(8, -32768, 0x7F_FF, 12);
+ let r: i16x4 = transmute(vqrshl_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlq_s16() {
+ let a: i16x8 = i16x8::new(2, -32768, 0x7F_FF, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(8, -32768, 0x7F_FF, 12, 16, 20, 24, 28);
+ let r: i16x8 = transmute(vqrshlq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshl_s32() {
+ let a: i32x2 = i32x2::new(2, -2147483648);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: i32x2 = i32x2::new(8, -2147483648);
+ let r: i32x2 = transmute(vqrshl_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlq_s32() {
+ let a: i32x4 = i32x4::new(2, -2147483648, 0x7F_FF_FF_FF, 3);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(8, -2147483648, 0x7F_FF_FF_FF, 12);
+ let r: i32x4 = transmute(vqrshlq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshl_s64() {
+ let a: i64x1 = i64x1::new(2);
+ let b: i64x1 = i64x1::new(2);
+ let e: i64x1 = i64x1::new(8);
+ let r: i64x1 = transmute(vqrshl_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlq_s64() {
+ let a: i64x2 = i64x2::new(2, -9223372036854775808);
+ let b: i64x2 = i64x2::new(2, 2);
+ let e: i64x2 = i64x2::new(8, -9223372036854775808);
+ let r: i64x2 = transmute(vqrshlq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshl_u8() {
+ let a: u8x8 = u8x8::new(2, 0, 0xFF, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u8x8 = u8x8::new(8, 0, 0xFF, 12, 16, 20, 24, 28);
+ let r: u8x8 = transmute(vqrshl_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlq_u8() {
+ let a: u8x16 = u8x16::new(2, 0, 0xFF, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u8x16 = u8x16::new(8, 0, 0xFF, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+ let r: u8x16 = transmute(vqrshlq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshl_u16() {
+ let a: u16x4 = u16x4::new(2, 0, 0xFF_FF, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: u16x4 = u16x4::new(8, 0, 0xFF_FF, 12);
+ let r: u16x4 = transmute(vqrshl_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlq_u16() {
+ let a: u16x8 = u16x8::new(2, 0, 0xFF_FF, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u16x8 = u16x8::new(8, 0, 0xFF_FF, 12, 16, 20, 24, 28);
+ let r: u16x8 = transmute(vqrshlq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshl_u32() {
+ let a: u32x2 = u32x2::new(2, 0);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: u32x2 = u32x2::new(8, 0);
+ let r: u32x2 = transmute(vqrshl_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlq_u32() {
+ let a: u32x4 = u32x4::new(2, 0, 0xFF_FF_FF_FF, 3);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: u32x4 = u32x4::new(8, 0, 0xFF_FF_FF_FF, 12);
+ let r: u32x4 = transmute(vqrshlq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshl_u64() {
+ let a: u64x1 = u64x1::new(2);
+ let b: i64x1 = i64x1::new(2);
+ let e: u64x1 = u64x1::new(8);
+ let r: u64x1 = transmute(vqrshl_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshlq_u64() {
+ let a: u64x2 = u64x2::new(2, 0);
+ let b: i64x2 = i64x2::new(2, 2);
+ let e: u64x2 = u64x2::new(8, 0);
+ let r: u64x2 = transmute(vqrshlq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
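+    // vqrshrn_n: saturating rounding shift right by an immediate, narrowing
+    // to half-width lanes; vqrshrun_n below is the signed-to-unsigned form.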
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_n_s16() {
+ let a: i16x8 = i16x8::new(-32768, 4, 8, 12, 16, 20, 24, 28);
+ let e: i8x8 = i8x8::new(-128, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vqrshrn_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_n_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, 4, 8, 12);
+ let e: i16x4 = i16x4::new(-32768, 1, 2, 3);
+ let r: i16x4 = transmute(vqrshrn_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_n_s64() {
+ let a: i64x2 = i64x2::new(-9223372036854775808, 4);
+ let e: i32x2 = i32x2::new(-2147483648, 1);
+ let r: i32x2 = transmute(vqrshrn_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_n_u16() {
+ let a: u16x8 = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u8x8 = transmute(vqrshrn_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_n_u32() {
+ let a: u32x4 = u32x4::new(0, 4, 8, 12);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vqrshrn_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrn_n_u64() {
+ let a: u64x2 = u64x2::new(0, 4);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vqrshrn_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrun_n_s16() {
+ let a: i16x8 = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u8x8 = transmute(vqrshrun_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrun_n_s32() {
+ let a: i32x4 = i32x4::new(0, 4, 8, 12);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vqrshrun_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqrshrun_n_s64() {
+ let a: i64x2 = i64x2::new(0, 4);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vqrshrun_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
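+    // vqshl: saturating shift left by a signed, per-lane shift count
+    // (truncating, unlike the rounding vqrshl above).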
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_s8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i8x8 = i8x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: i8x8 = transmute(vqshl_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i8x16 = i8x16::new(0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+ let r: i8x16 = transmute(vqshlq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i16x4 = i16x4::new(0, 4, 8, 12);
+ let r: i16x4 = transmute(vqshl_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: i16x8 = transmute(vqshlq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: i32x2 = i32x2::new(0, 4);
+ let r: i32x2 = transmute(vqshl_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(0, 4, 8, 12);
+ let r: i32x4 = transmute(vqshlq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let b: i64x1 = i64x1::new(2);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vqshl_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let b: i64x2 = i64x2::new(2, 2);
+ let e: i64x2 = i64x2::new(0, 4);
+ let r: i64x2 = transmute(vqshlq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_u8() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u8x8 = u8x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: u8x8 = transmute(vqshl_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_u8() {
+ let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u8x16 = u8x16::new(0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+ let r: u8x16 = transmute(vqshlq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: u16x4 = u16x4::new(0, 4, 8, 12);
+ let r: u16x4 = transmute(vqshl_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u16x8 = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: u16x8 = transmute(vqshlq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: u32x2 = u32x2::new(0, 4);
+ let r: u32x2 = transmute(vqshl_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: u32x4 = u32x4::new(0, 4, 8, 12);
+ let r: u32x4 = transmute(vqshlq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let b: i64x1 = i64x1::new(2);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vqshl_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let b: i64x2 = i64x2::new(2, 2);
+ let e: u64x2 = u64x2::new(0, 4);
+ let r: u64x2 = transmute(vqshlq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
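+    // vqshl_n: saturating shift left by a const-generic immediate.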
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_n_s8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x8 = i8x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: i8x8 = transmute(vqshl_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_n_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i8x16 = i8x16::new(0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+ let r: i8x16 = transmute(vqshlq_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_n_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: i16x4 = i16x4::new(0, 4, 8, 12);
+ let r: i16x4 = transmute(vqshl_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_n_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: i16x8 = transmute(vqshlq_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_n_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: i32x2 = i32x2::new(0, 4);
+ let r: i32x2 = transmute(vqshl_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_n_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: i32x4 = i32x4::new(0, 4, 8, 12);
+ let r: i32x4 = transmute(vqshlq_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_n_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vqshl_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_n_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i64x2 = i64x2::new(0, 4);
+ let r: i64x2 = transmute(vqshlq_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_n_u8() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x8 = u8x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: u8x8 = transmute(vqshl_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_n_u8() {
+ let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+ let r: u8x16 = transmute(vqshlq_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_n_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: u16x4 = u16x4::new(0, 4, 8, 12);
+ let r: u16x4 = transmute(vqshl_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_n_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: u16x8 = transmute(vqshlq_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_n_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let e: u32x2 = u32x2::new(0, 4);
+ let r: u32x2 = transmute(vqshl_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_n_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(0, 4, 8, 12);
+ let r: u32x4 = transmute(vqshlq_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshl_n_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vqshl_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlq_n_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0, 4);
+ let r: u64x2 = transmute(vqshlq_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
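+    // vqshlu_n: saturating shift left of signed input producing an unsigned
+    // result; negative inputs would clamp to 0.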
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlu_n_s8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x8 = u8x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: u8x8 = transmute(vqshlu_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlu_n_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: u16x4 = u16x4::new(0, 4, 8, 12);
+ let r: u16x4 = transmute(vqshlu_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlu_n_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: u32x2 = u32x2::new(0, 4);
+ let r: u32x2 = transmute(vqshlu_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshlu_n_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vqshlu_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshluq_n_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+ let r: u8x16 = transmute(vqshluq_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshluq_n_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let r: u16x8 = transmute(vqshluq_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshluq_n_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(0, 4, 8, 12);
+ let r: u32x4 = transmute(vqshluq_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshluq_n_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0, 4);
+ let r: u64x2 = transmute(vqshluq_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
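+    // vqshrn_n: saturating (truncating) shift right narrow by an immediate;
+    // vqshrun_n below narrows signed input to unsigned lanes.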
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_n_s16() {
+ let a: i16x8 = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vqshrn_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_n_s32() {
+ let a: i32x4 = i32x4::new(0, 4, 8, 12);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vqshrn_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_n_s64() {
+ let a: i64x2 = i64x2::new(0, 4);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vqshrn_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_n_u16() {
+ let a: u16x8 = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u8x8 = transmute(vqshrn_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_n_u32() {
+ let a: u32x4 = u32x4::new(0, 4, 8, 12);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vqshrn_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrn_n_u64() {
+ let a: u64x2 = u64x2::new(0, 4);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vqshrn_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrun_n_s16() {
+ let a: i16x8 = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+ let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u8x8 = transmute(vqshrun_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrun_n_s32() {
+ let a: i32x4 = i32x4::new(0, 4, 8, 12);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vqshrun_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqshrun_n_s64() {
+ let a: i64x2 = i64x2::new(0, 4);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vqshrun_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
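+    // vrsqrte: reciprocal square-root estimate, accurate to roughly 8-9
+    // bits, hence 0.998046875 rather than exactly 1.0. The u32 form works on
+    // fixed-point inputs; inputs this small saturate to 0xFF_FF_FF_FF.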
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrte_f32() {
+ let a: f32x2 = f32x2::new(1.0, 2.0);
+ let e: f32x2 = f32x2::new(0.998046875, 0.705078125);
+ let r: f32x2 = transmute(vrsqrte_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrteq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 2.0, 3.0, 4.0);
+ let e: f32x4 = f32x4::new(0.998046875, 0.705078125, 0.576171875, 0.4990234375);
+ let r: f32x4 = transmute(vrsqrteq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrte_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(4294967295, 4294967295);
+ let r: u32x2 = transmute(vrsqrte_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrteq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(4294967295, 4294967295, 4294967295, 4294967295);
+ let r: u32x4 = transmute(vrsqrteq_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
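+    // vrsqrts: Newton-Raphson step for 1/sqrt(x), (3 - a * b) / 2, e.g.
+    // (3 - 1*1)/2 == 1.0 and (3 - 2*2)/2 == -0.5.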
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrts_f32() {
+ let a: f32x2 = f32x2::new(1.0, 2.0);
+ let b: f32x2 = f32x2::new(1.0, 2.0);
+ let e: f32x2 = f32x2::new(1., -0.5);
+ let r: f32x2 = transmute(vrsqrts_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsqrtsq_f32() {
+ let a: f32x4 = f32x4::new(1.0, 2.0, 3.0, 4.0);
+ let b: f32x4 = f32x4::new(1.0, 2.0, 3.0, 4.0);
+ let e: f32x4 = f32x4::new(1., -0.5, -3.0, -6.5);
+ let r: f32x4 = transmute(vrsqrtsq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
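+    // vrecpe: reciprocal estimate, with the same ~8-9 bit precision as
+    // vrsqrte, so 1/4 comes back as 0.24951171875.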
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpe_f32() {
+ let a: f32x2 = f32x2::new(4.0, 3.0);
+ let e: f32x2 = f32x2::new(0.24951171875, 0.3330078125);
+ let r: f32x2 = transmute(vrecpe_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpeq_f32() {
+ let a: f32x4 = f32x4::new(4.0, 3.0, 2.0, 1.0);
+ let e: f32x4 = f32x4::new(0.24951171875, 0.3330078125, 0.4990234375, 0.998046875);
+ let r: f32x4 = transmute(vrecpeq_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpe_u32() {
+ let a: u32x2 = u32x2::new(4, 3);
+ let e: u32x2 = u32x2::new(4294967295, 4294967295);
+ let r: u32x2 = transmute(vrecpe_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpeq_u32() {
+ let a: u32x4 = u32x4::new(4, 3, 2, 1);
+ let e: u32x4 = u32x4::new(4294967295, 4294967295, 4294967295, 4294967295);
+ let r: u32x4 = transmute(vrecpeq_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
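+    // vrecps: Newton-Raphson step for 1/x, 2 - a * b, e.g. 2 - 4*4 == -14.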
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecps_f32() {
+ let a: f32x2 = f32x2::new(4.0, 3.0);
+ let b: f32x2 = f32x2::new(4.0, 3.0);
+ let e: f32x2 = f32x2::new(-14., -7.);
+ let r: f32x2 = transmute(vrecps_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrecpsq_f32() {
+ let a: f32x4 = f32x4::new(4.0, 3.0, 2.0, 1.0);
+ let b: f32x4 = f32x4::new(4.0, 3.0, 2.0, 1.0);
+ let e: f32x4 = f32x4::new(-14., -7., -2., 1.);
+ let r: f32x4 = transmute(vrecpsq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
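+    // vreinterpret: bitwise reinterpretation between equal-sized vector
+    // types; lane values pass through unchanged when the lane widths match.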
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_u8() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vreinterpret_s8_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_p8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vreinterpret_s8_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_p16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_s16_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_s16_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vreinterpret_s32_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_u8() {
+ let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vreinterpretq_s8_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_p8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vreinterpretq_s8_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_p16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_s16_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_s16_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vreinterpretq_s32_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_p8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u8x8 = transmute(vreinterpret_u8_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_s8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u8x8 = transmute(vreinterpret_u8_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_p16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vreinterpret_u16_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vreinterpret_u16_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vreinterpret_u32_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_p8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vreinterpretq_u8_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: u8x16 = transmute(vreinterpretq_u8_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_p16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vreinterpretq_u16_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vreinterpretq_u16_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vreinterpretq_u32_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_s8() {
+ let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vreinterpret_p8_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_u8() {
+ let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i8x8 = transmute(vreinterpret_p8_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_p16_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_p16_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_s8() {
+ let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vreinterpretq_p8_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_u8() {
+ let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r: i8x16 = transmute(vreinterpretq_p8_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_p16_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_p16_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
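+ // Reinterpreting to narrower lanes splits each element into its component
+ // bytes/halfwords. The expectations here assume the little-endian lane
+ // layout of the test targets, so a 16-bit lane k becomes the byte pair
+ // (k, 0) with the low byte first.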
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_p16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: i16x4 = i16x4::new(0, 0, 1, 0);
+ let r: i16x4 = transmute(vreinterpret_s16_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let e: i16x4 = i16x4::new(0, 0, 1, 0);
+ let r: i16x4 = transmute(vreinterpret_s16_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i32x2 = i32x2::new(0, 0);
+ let r: i32x2 = transmute(vreinterpret_s32_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: i32x2 = i32x2::new(0, 0);
+ let r: i32x2 = transmute(vreinterpret_s32_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_p16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: i16x8 = i16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i16x8 = transmute(vreinterpretq_s16_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: i16x8 = i16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i16x8 = transmute(vreinterpretq_s16_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i32x4 = i32x4::new(0, 0, 1, 0);
+ let r: i32x4 = transmute(vreinterpretq_s32_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: i32x4 = i32x4::new(0, 0, 1, 0);
+ let r: i32x4 = transmute(vreinterpretq_s32_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_p16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: u8x8 = u8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: u8x8 = u8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: u8x8 = u8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: u16x4 = u16x4::new(0, 0, 1, 0);
+ let r: u16x4 = transmute(vreinterpret_u16_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let e: u16x4 = u16x4::new(0, 0, 1, 0);
+ let r: u16x4 = transmute(vreinterpret_u16_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vreinterpret_u32_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vreinterpret_u32_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_p16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x16 = u8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x16 = u8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u8x16 = u8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: u16x8 = u16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: u16x8 = transmute(vreinterpretq_u16_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: u16x8 = u16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: u16x8 = transmute(vreinterpretq_u16_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u32x4 = u32x4::new(0, 0, 1, 0);
+ let r: u32x4 = transmute(vreinterpretq_u32_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: u32x4 = u32x4::new(0, 0, 1, 0);
+ let r: u32x4 = transmute(vreinterpretq_u32_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_p16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_s16() {
+ let a: i16x4 = i16x4::new(0, 1, 2, 3);
+ let e: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_u16() {
+ let a: u16x4 = u16x4::new(0, 1, 2, 3);
+ let e: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: i16x4 = i16x4::new(0, 0, 1, 0);
+ let r: i16x4 = transmute(vreinterpret_p16_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let e: i16x4 = i16x4::new(0, 0, 1, 0);
+ let r: i16x4 = transmute(vreinterpret_p16_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_p16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_s16() {
+ let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_u16() {
+ let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: i16x8 = i16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i16x8 = transmute(vreinterpretq_p16_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: i16x8 = i16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let r: i16x8 = transmute(vreinterpretq_p16_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i32x2 = i32x2::new(0, 0);
+ let r: i32x2 = transmute(vreinterpret_s32_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vreinterpret_u32_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i32x4 = i32x4::new(0, 0, 1, 0);
+ let r: i32x4 = transmute(vreinterpretq_s32_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u32x4 = u32x4::new(0, 0, 1, 0);
+ let r: u32x4 = transmute(vreinterpretq_u32_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
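+ // p128 is the 128-bit polynomial type, represented as a plain 128-bit
+ // integer rather than a lane vector, so its values are written as integer
+ // literals instead of a `::new` lane constructor.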
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_p128() {
+ let a: p128 = 0;
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vreinterpretq_s64_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_p128() {
+ let a: p128 = 0;
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vreinterpretq_u64_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_p128() {
+ let a: p128 = 0;
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vreinterpretq_p64_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
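+ // The opposite direction: reinterpreting to wider lanes reassembles
+ // adjacent narrow lanes, so (assuming little-endian order) the byte pairs
+ // (k, 0) used below collapse back into 16-bit lanes with value k.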
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_s16_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_s16_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_s16_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_p16() {
+ let a: i16x4 = i16x4::new(0, 0, 1, 0);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vreinterpret_s32_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_s16() {
+ let a: i16x4 = i16x4::new(0, 0, 1, 0);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vreinterpret_s32_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_u16() {
+ let a: u16x4 = u16x4::new(0, 0, 1, 0);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vreinterpret_s32_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_s32() {
+ let a: i32x2 = i32x2::new(0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_u32() {
+ let a: u32x2 = u32x2::new(0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_s16_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_s16_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_s16_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_p16() {
+ let a: i16x8 = i16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vreinterpretq_s32_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vreinterpretq_s32_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vreinterpretq_s32_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_s32() {
+ let a: i32x4 = i32x4::new(0, 0, 1, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_u32() {
+ let a: u32x4 = u32x4::new(0, 0, 1, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vreinterpret_u16_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vreinterpret_u16_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: u16x4 = u16x4::new(0, 1, 2, 3);
+ let r: u16x4 = transmute(vreinterpret_u16_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_p16() {
+ let a: i16x4 = i16x4::new(0, 0, 1, 0);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vreinterpret_u32_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_s16() {
+ let a: i16x4 = i16x4::new(0, 0, 1, 0);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vreinterpret_u32_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_u16() {
+ let a: u16x4 = u16x4::new(0, 0, 1, 0);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vreinterpret_u32_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_s32() {
+ let a: i32x2 = i32x2::new(0, 0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_u32() {
+ let a: u32x2 = u32x2::new(0, 0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vreinterpretq_u16_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vreinterpretq_u16_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: u16x8 = transmute(vreinterpretq_u16_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_p16() {
+ let a: i16x8 = i16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vreinterpretq_u32_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vreinterpretq_u32_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vreinterpretq_u32_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_s32() {
+ let a: i32x4 = i32x4::new(0, 0, 1, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_u32() {
+ let a: u32x4 = u32x4::new(0, 0, 1, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_p16_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_p16_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 1, 0, 2, 0, 3, 0);
+ let e: i16x4 = i16x4::new(0, 1, 2, 3);
+ let r: i16x4 = transmute(vreinterpret_p16_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_p16_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_p16_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0);
+ let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vreinterpretq_p16_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_s32() {
+ let a: i32x2 = i32x2::new(0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_u32() {
+ let a: u32x2 = u32x2::new(0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_s32() {
+ let a: i32x4 = i32x4::new(0, 0, 1, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_u32() {
+ let a: u32x4 = u32x4::new(0, 0, 1, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_s64() {
+ let a: i64x2 = i64x2::new(0, 0);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_u64() {
+ let a: u64x2 = u64x2::new(0, 0);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_p64() {
+ let a: i64x2 = i64x2::new(0, 0);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_s16_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_s16_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_s16_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_s16_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u16x4 = u16x4::new(0, 0, 0, 0);
+ let r: u16x4 = transmute(vreinterpret_u16_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: u16x4 = u16x4::new(0, 0, 0, 0);
+ let r: u16x4 = transmute(vreinterpret_u16_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: u16x8 = transmute(vreinterpretq_u16_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: u16x8 = transmute(vreinterpretq_u16_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_s32() {
+ let a: i32x2 = i32x2::new(0, 1);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_u32() {
+ let a: u32x2 = u32x2::new(0, 1);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_p16_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_p16_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_s32() {
+ let a: i32x4 = i32x4::new(0, 1, 2, 3);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_u32() {
+ let a: u32x4 = u32x4::new(0, 1, 2, 3);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_p16_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_p16_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_s16_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u16x4 = u16x4::new(0, 0, 0, 0);
+ let r: u16x4 = transmute(vreinterpret_u16_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_p16_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_s16_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: u16x8 = transmute(vreinterpretq_u16_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_p16_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_p128() {
+ let a: p128 = 0;
+ let e: i32x4 = i32x4::new(0, 0, 0, 0);
+ let r: i32x4 = transmute(vreinterpretq_s32_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_p128() {
+ let a: p128 = 0;
+ let e: u32x4 = u32x4::new(0, 0, 0, 0);
+ let r: u32x4 = transmute(vreinterpretq_u32_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vreinterpret_s32_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vreinterpret_s32_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i32x2 = i32x2::new(0, 1);
+ let r: i32x2 = transmute(vreinterpret_s32_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_p16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_s16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_u16() {
+ let a: u16x4 = u16x4::new(0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vreinterpretq_s32_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vreinterpretq_s32_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let e: i32x4 = i32x4::new(0, 1, 2, 3);
+ let r: i32x4 = transmute(vreinterpretq_s32_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_p16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vreinterpret_u32_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vreinterpret_u32_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: u32x2 = u32x2::new(0, 1);
+ let r: u32x2 = transmute(vreinterpret_u32_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_p16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_s16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_u16() {
+ let a: u16x4 = u16x4::new(0, 0, 0, 0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vreinterpretq_u32_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vreinterpretq_u32_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0);
+ let e: u32x4 = u32x4::new(0, 1, 2, 3);
+ let r: u32x4 = transmute(vreinterpretq_u32_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_p16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_p16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_s16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_u16() {
+ let a: u16x4 = u16x4::new(0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_p16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 0, 0, 1, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_s32() {
+ let a: i32x4 = i32x4::new(0, 0, 0, 0);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_u32() {
+ let a: u32x4 = u32x4::new(0, 0, 0, 0);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_s64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_u64() {
+ let a: u64x2 = u64x2::new(0, 1);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_p64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_p64() {
+ let a: i64x2 = i64x2::new(0, 1);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_p64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_p128() {
+ let a: p128 = 0;
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_s16_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_p128() {
+ let a: p128 = 0;
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vreinterpretq_u16_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_p128() {
+ let a: p128 = 0;
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_p16_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_s64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: u64x2 = u64x2::new(0, 1);
+ let r: u64x2 = transmute(vreinterpretq_u64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p64_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_p64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p64_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0);
+ let e: i64x2 = i64x2::new(0, 1);
+ let r: i64x2 = transmute(vreinterpretq_p64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_p16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
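+ // On a little-endian target the least-significant byte of a p128 value
+ // corresponds to lane 0, so a byte vector (1, 0, ..., 0) is expected to
+ // reinterpret to the scalar p128 value 1, and vice versa below.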
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_s8() {
+ let a: i8x16 = i8x16::new(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: p128 = 1;
+ let r: p128 = transmute(vreinterpretq_p128_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_u8() {
+ let a: u8x16 = u8x16::new(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: p128 = 1;
+ let r: p128 = transmute(vreinterpretq_p128_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_p8() {
+ let a: i8x16 = i8x16::new(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: p128 = 1;
+ let r: p128 = transmute(vreinterpretq_p128_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_p128() {
+ let a: p128 = 1;
+ let e: i8x16 = i8x16::new(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_p128() {
+ let a: p128 = 1;
+ let e: u8x16 = u8x16::new(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_p128() {
+ let a: p128 = 1;
+ let e: i8x16 = i8x16::new(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
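+ // Reinterpreting between float and integer vectors is likewise a pure bit
+ // cast; the tests use 0.0 throughout since IEEE-754 +0.0 is the all-zeros
+ // pattern, making the expected integer lanes trivially zero.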
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s8_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_s8_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s16_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_s16_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s32_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: i32x2 = i32x2::new(0, 0);
+ let r: i32x2 = transmute(vreinterpret_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_s64_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vreinterpret_s64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s8_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_s8_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s16_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_s16_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s32_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: i32x4 = i32x4::new(0, 0, 0, 0);
+ let r: i32x4 = transmute(vreinterpretq_s32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_s64_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: i64x2 = i64x2::new(0, 0);
+ let r: i64x2 = transmute(vreinterpretq_s64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u8_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vreinterpret_u8_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u16_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: u16x4 = u16x4::new(0, 0, 0, 0);
+ let r: u16x4 = transmute(vreinterpret_u16_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u32_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vreinterpret_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_u64_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vreinterpret_u64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u8_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x16 = transmute(vreinterpretq_u8_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u16_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u16x8 = transmute(vreinterpretq_u16_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u32_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: u32x4 = u32x4::new(0, 0, 0, 0);
+ let r: u32x4 = transmute(vreinterpretq_u32_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_u64_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: u64x2 = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vreinterpretq_u64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p8_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vreinterpret_p8_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_p16_f32() {
+ let a: f32x2 = f32x2::new(0., 0.);
+ let e: i16x4 = i16x4::new(0, 0, 0, 0);
+ let r: i16x4 = transmute(vreinterpret_p16_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p8_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i8x16 = transmute(vreinterpretq_p8_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p16_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: i16x8 = transmute(vreinterpretq_p16_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_p128_f32() {
+ let a: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let e: p128 = 0;
+ let r: p128 = transmute(vreinterpretq_p128_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_s8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_s16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_s32() {
+ let a: i32x2 = i32x2::new(0, 0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_s8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_s16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_s32() {
+ let a: i32x4 = i32x4::new(0, 0, 0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_s64() {
+ let a: i64x2 = i64x2::new(0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_u8() {
+ let a: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_u16() {
+ let a: u16x4 = u16x4::new(0, 0, 0, 0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_u32() {
+ let a: u32x2 = u32x2::new(0, 0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_u8() {
+ let a: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_u16() {
+ let a: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_u32() {
+ let a: u32x4 = u32x4::new(0, 0, 0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_u64() {
+ let a: u64x2 = u64x2::new(0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_p8() {
+ let a: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpret_f32_p16() {
+ let a: i16x4 = i16x4::new(0, 0, 0, 0);
+ let e: f32x2 = f32x2::new(0., 0.);
+ let r: f32x2 = transmute(vreinterpret_f32_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_p8() {
+ let a: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_p16() {
+ let a: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vreinterpretq_f32_p128() {
+ let a: p128 = 0;
+ let e: f32x4 = f32x4::new(0., 0., 0., 0.);
+ let r: f32x4 = transmute(vreinterpretq_f32_p128(transmute(a)));
+ assert_eq!(r, e);
+ }
+
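+ // vrshl: rounding shift left by a signed, per-lane count taken from `b`;
+ // a negative count performs a rounding shift right instead. With every
+ // count set to 2, each lane is simply multiplied by 4.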
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshl_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i8x8 = i8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: i8x8 = transmute(vrshl_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshlq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i8x16 = i8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let r: i8x16 = transmute(vrshlq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshl_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i16x4 = i16x4::new(4, 8, 12, 16);
+ let r: i16x4 = transmute(vrshl_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshlq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: i16x8 = transmute(vrshlq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshl_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: i32x2 = i32x2::new(4, 8);
+ let r: i32x2 = transmute(vrshl_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshlq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(4, 8, 12, 16);
+ let r: i32x4 = transmute(vrshlq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshl_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(2);
+ let e: i64x1 = i64x1::new(4);
+ let r: i64x1 = transmute(vrshl_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshlq_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(2, 2);
+ let e: i64x2 = i64x2::new(4, 8);
+ let r: i64x2 = transmute(vrshlq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshl_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u8x8 = u8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: u8x8 = transmute(vrshl_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshlq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u8x16 = u8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let r: u8x16 = transmute(vrshlq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshl_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: u16x4 = u16x4::new(4, 8, 12, 16);
+ let r: u16x4 = transmute(vrshl_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshlq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: u16x8 = transmute(vrshlq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshl_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: u32x2 = u32x2::new(4, 8);
+ let r: u32x2 = transmute(vrshl_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshlq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: u32x4 = u32x4::new(4, 8, 12, 16);
+ let r: u32x4 = transmute(vrshlq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshl_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: i64x1 = i64x1::new(2);
+ let e: u64x1 = u64x1::new(4);
+ let r: u64x1 = transmute(vrshl_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshlq_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(2, 2);
+ let e: u64x2 = u64x2::new(4, 8);
+ let r: u64x2 = transmute(vrshlq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
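+ // vrshr_n: rounding shift right by the const generic N, i.e. roughly
+ // (a + (1 << (N - 1))) >> N per lane, so the inputs here (multiples of 4
+ // with N = 2) divide exactly and the rounding constant never carries.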
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshr_n_s8() {
+ let a: i8x8 = i8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vrshr_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrq_n_s8() {
+ let a: i8x16 = i8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let e: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vrshrq_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshr_n_s16() {
+ let a: i16x4 = i16x4::new(4, 8, 12, 16);
+ let e: i16x4 = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vrshr_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrq_n_s16() {
+ let a: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vrshrq_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshr_n_s32() {
+ let a: i32x2 = i32x2::new(4, 8);
+ let e: i32x2 = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vrshr_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrq_n_s32() {
+ let a: i32x4 = i32x4::new(4, 8, 12, 16);
+ let e: i32x4 = i32x4::new(1, 2, 3, 4);
+ let r: i32x4 = transmute(vrshrq_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshr_n_s64() {
+ let a: i64x1 = i64x1::new(4);
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vrshr_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrq_n_s64() {
+ let a: i64x2 = i64x2::new(4, 8);
+ let e: i64x2 = i64x2::new(1, 2);
+ let r: i64x2 = transmute(vrshrq_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshr_n_u8() {
+ let a: u8x8 = u8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vrshr_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrq_n_u8() {
+ let a: u8x16 = u8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let e: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vrshrq_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshr_n_u16() {
+ let a: u16x4 = u16x4::new(4, 8, 12, 16);
+ let e: u16x4 = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vrshr_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrq_n_u16() {
+ let a: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vrshrq_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshr_n_u32() {
+ let a: u32x2 = u32x2::new(4, 8);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vrshr_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrq_n_u32() {
+ let a: u32x4 = u32x4::new(4, 8, 12, 16);
+ let e: u32x4 = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vrshrq_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshr_n_u64() {
+ let a: u64x1 = u64x1::new(4);
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vrshr_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrq_n_u64() {
+ let a: u64x2 = u64x2::new(4, 8);
+ let e: u64x2 = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vrshrq_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
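+ // vrshrn_n: rounding shift right by N, then narrow each lane to half its
+ // width (s16 -> s8, s32 -> s16, s64 -> s32), keeping the low half of the
+ // shifted result.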
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_n_s16() {
+ let a: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vrshrn_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_n_s32() {
+ let a: i32x4 = i32x4::new(4, 8, 12, 16);
+ let e: i16x4 = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vrshrn_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_n_s64() {
+ let a: i64x2 = i64x2::new(4, 8);
+ let e: i32x2 = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vrshrn_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_n_u16() {
+ let a: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vrshrn_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_n_u32() {
+ let a: u32x4 = u32x4::new(4, 8, 12, 16);
+ let e: u16x4 = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vrshrn_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrshrn_n_u64() {
+ let a: u64x2 = u64x2::new(4, 8);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vrshrn_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
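+ // vrsra_n: rounding shift right and accumulate, i.e. a + vrshr_n::<N>(b)
+ // per lane; with a = 1 and b a multiple of 4, the expected lanes are
+ // b / 4 + 1.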
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsra_n_s8() {
+ let a: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i8x8 = i8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i8x8 = i8x8::new(2, 3, 4, 5, 6, 7, 8, 9);
+ let r: i8x8 = transmute(vrsra_n_s8::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsraq_n_s8() {
+ let a: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i8x16 = i8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let e: i8x16 = i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+ let r: i8x16 = transmute(vrsraq_n_s8::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsra_n_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(4, 8, 12, 16);
+ let e: i16x4 = i16x4::new(2, 3, 4, 5);
+ let r: i16x4 = transmute(vrsra_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsraq_n_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i16x8 = i16x8::new(2, 3, 4, 5, 6, 7, 8, 9);
+ let r: i16x8 = transmute(vrsraq_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsra_n_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let b: i32x2 = i32x2::new(4, 8);
+ let e: i32x2 = i32x2::new(2, 3);
+ let r: i32x2 = transmute(vrsra_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsraq_n_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i32x4 = i32x4::new(4, 8, 12, 16);
+ let e: i32x4 = i32x4::new(2, 3, 4, 5);
+ let r: i32x4 = transmute(vrsraq_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsra_n_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(4);
+ let e: i64x1 = i64x1::new(2);
+ let r: i64x1 = transmute(vrsra_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsraq_n_s64() {
+ let a: i64x2 = i64x2::new(1, 1);
+ let b: i64x2 = i64x2::new(4, 8);
+ let e: i64x2 = i64x2::new(2, 3);
+ let r: i64x2 = transmute(vrsraq_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsra_n_u8() {
+ let a: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: u8x8 = u8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u8x8 = u8x8::new(2, 3, 4, 5, 6, 7, 8, 9);
+ let r: u8x8 = transmute(vrsra_n_u8::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsraq_n_u8() {
+ let a: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let b: u8x16 = u8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let e: u8x16 = u8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+ let r: u8x16 = transmute(vrsraq_n_u8::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsra_n_u16() {
+ let a: u16x4 = u16x4::new(1, 1, 1, 1);
+ let b: u16x4 = u16x4::new(4, 8, 12, 16);
+ let e: u16x4 = u16x4::new(2, 3, 4, 5);
+ let r: u16x4 = transmute(vrsra_n_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsraq_n_u16() {
+ let a: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u16x8 = u16x8::new(2, 3, 4, 5, 6, 7, 8, 9);
+ let r: u16x8 = transmute(vrsraq_n_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsra_n_u32() {
+ let a: u32x2 = u32x2::new(1, 1);
+ let b: u32x2 = u32x2::new(4, 8);
+ let e: u32x2 = u32x2::new(2, 3);
+ let r: u32x2 = transmute(vrsra_n_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsraq_n_u32() {
+ let a: u32x4 = u32x4::new(1, 1, 1, 1);
+ let b: u32x4 = u32x4::new(4, 8, 12, 16);
+ let e: u32x4 = u32x4::new(2, 3, 4, 5);
+ let r: u32x4 = transmute(vrsraq_n_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsra_n_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: u64x1 = u64x1::new(4);
+ let e: u64x1 = u64x1::new(2);
+ let r: u64x1 = transmute(vrsra_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsraq_n_u64() {
+ let a: u64x2 = u64x2::new(1, 1);
+ let b: u64x2 = u64x2::new(4, 8);
+ let e: u64x2 = u64x2::new(2, 3);
+ let r: u64x2 = transmute(vrsraq_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
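+ // vrsubhn: rounding subtract returning the high narrow half. Each lane is
+ // (a - b + (1 << (H - 1))) >> H truncated to half width, where H is half
+ // the lane size; e.g. for s16 lanes, (0x7FFF - 1 + 0x80) >> 8 = -128 as i8.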
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_s16() {
+ let a: i16x8 = i16x8::new(0x7F_FF, -32768, 0, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(-128, -128, 0, 0, 0, 0, 0, 0);
+ let r: i8x8 = transmute(vrsubhn_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_s32() {
+ let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, -2147483648, 0, 4);
+ let b: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i16x4 = i16x4::new(-32768, -32768, 0, 0);
+ let r: i16x4 = transmute(vrsubhn_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_s64() {
+ let a: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, -9223372036854775808);
+ let b: i64x2 = i64x2::new(1, 2);
+ let e: i32x2 = i32x2::new(-2147483648, -2147483648);
+ let r: i32x2 = transmute(vrsubhn_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_u16() {
+ let a: u16x8 = u16x8::new(0xFF_FF, 0, 3, 4, 5, 6, 7, 8);
+ let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r: u8x8 = transmute(vrsubhn_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_u32() {
+ let a: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 3, 4);
+ let b: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(0, 0, 0, 0);
+ let r: u16x4 = transmute(vrsubhn_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrsubhn_u64() {
+ let a: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
+ let b: u64x2 = u64x2::new(1, 2);
+ let e: u32x2 = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vrsubhn_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
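+ // vset_lane / vsetq_lane: return `b` with lane LANE replaced by the scalar
+ // `a`; every test below overwrites lane 0 (initially 0) with 1.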
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_s8() {
+ let a: i8 = 1;
+ let b: i8x8 = i8x8::new(0, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vset_lane_s8::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_s16() {
+ let a: i16 = 1;
+ let b: i16x4 = i16x4::new(0, 2, 3, 4);
+ let e: i16x4 = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vset_lane_s16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_s32() {
+ let a: i32 = 1;
+ let b: i32x2 = i32x2::new(0, 2);
+ let e: i32x2 = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vset_lane_s32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_s64() {
+ let a: i64 = 1;
+ let b: i64x1 = i64x1::new(0);
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vset_lane_s64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_u8() {
+ let a: u8 = 1;
+ let b: u8x8 = u8x8::new(0, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vset_lane_u8::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_u16() {
+ let a: u16 = 1;
+ let b: u16x4 = u16x4::new(0, 2, 3, 4);
+ let e: u16x4 = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vset_lane_u16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_u32() {
+ let a: u32 = 1;
+ let b: u32x2 = u32x2::new(0, 2);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vset_lane_u32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_u64() {
+ let a: u64 = 1;
+ let b: u64x1 = u64x1::new(0);
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vset_lane_u64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_p8() {
+ let a: p8 = 1;
+ let b: i8x8 = i8x8::new(0, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vset_lane_p8::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_p16() {
+ let a: p16 = 1;
+ let b: i16x4 = i16x4::new(0, 2, 3, 4);
+ let e: i16x4 = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vset_lane_p16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_p64() {
+ let a: p64 = 1;
+ let b: i64x1 = i64x1::new(0);
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vset_lane_p64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_s8() {
+ let a: i8 = 1;
+ let b: i8x16 = i8x16::new(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vsetq_lane_s8::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_s16() {
+ let a: i16 = 1;
+ let b: i16x8 = i16x8::new(0, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vsetq_lane_s16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_s32() {
+ let a: i32 = 1;
+ let b: i32x4 = i32x4::new(0, 2, 3, 4);
+ let e: i32x4 = i32x4::new(1, 2, 3, 4);
+ let r: i32x4 = transmute(vsetq_lane_s32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_s64() {
+ let a: i64 = 1;
+ let b: i64x2 = i64x2::new(0, 2);
+ let e: i64x2 = i64x2::new(1, 2);
+ let r: i64x2 = transmute(vsetq_lane_s64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_u8() {
+ let a: u8 = 1;
+ let b: u8x16 = u8x16::new(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vsetq_lane_u8::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_u16() {
+ let a: u16 = 1;
+ let b: u16x8 = u16x8::new(0, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vsetq_lane_u16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_u32() {
+ let a: u32 = 1;
+ let b: u32x4 = u32x4::new(0, 2, 3, 4);
+ let e: u32x4 = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vsetq_lane_u32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_u64() {
+ let a: u64 = 1;
+ let b: u64x2 = u64x2::new(0, 2);
+ let e: u64x2 = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vsetq_lane_u64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_p8() {
+ let a: p8 = 1;
+ let b: i8x16 = i8x16::new(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vsetq_lane_p8::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_p16() {
+ let a: p16 = 1;
+ let b: i16x8 = i16x8::new(0, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vsetq_lane_p16::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_p64() {
+ let a: p64 = 1;
+ let b: i64x2 = i64x2::new(0, 2);
+ let e: i64x2 = i64x2::new(1, 2);
+ let r: i64x2 = transmute(vsetq_lane_p64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vset_lane_f32() {
+ let a: f32 = 1.;
+ let b: f32x2 = f32x2::new(0., 2.);
+ let e: f32x2 = f32x2::new(1., 2.);
+ let r: f32x2 = transmute(vset_lane_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsetq_lane_f32() {
+ let a: f32 = 1.;
+ let b: f32x4 = f32x4::new(0., 2., 3., 4.);
+ let e: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let r: f32x4 = transmute(vsetq_lane_f32::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
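+ // vshl: plain (non-rounding) shift left by a signed, per-lane count from
+ // `b`; a negative count shifts right with truncation. The expected values
+ // match vrshl above because rounding never applies to a left shift.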
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i8x8 = i8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: i8x8 = transmute(vshl_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i8x16 = i8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let r: i8x16 = transmute(vshlq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: i16x4 = i16x4::new(4, 8, 12, 16);
+ let r: i16x4 = transmute(vshl_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: i16x8 = transmute(vshlq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: i32x2 = i32x2::new(4, 8);
+ let r: i32x2 = transmute(vshl_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: i32x4 = i32x4::new(4, 8, 12, 16);
+ let r: i32x4 = transmute(vshlq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(2);
+ let e: i64x1 = i64x1::new(4);
+ let r: i64x1 = transmute(vshl_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(2, 2);
+ let e: i64x2 = i64x2::new(4, 8);
+ let r: i64x2 = transmute(vshlq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u8x8 = u8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: u8x8 = transmute(vshl_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u8x16 = u8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let r: u8x16 = transmute(vshlq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(2, 2, 2, 2);
+ let e: u16x4 = u16x4::new(4, 8, 12, 16);
+ let r: u16x4 = transmute(vshl_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+ let e: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: u16x8 = transmute(vshlq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(2, 2);
+ let e: u32x2 = u32x2::new(4, 8);
+ let r: u32x2 = transmute(vshl_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: i32x4 = i32x4::new(2, 2, 2, 2);
+ let e: u32x4 = u32x4::new(4, 8, 12, 16);
+ let r: u32x4 = transmute(vshlq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: i64x1 = i64x1::new(2);
+ let e: u64x1 = u64x1::new(4);
+ let r: u64x1 = transmute(vshl_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: i64x2 = i64x2::new(2, 2);
+ let e: u64x2 = u64x2::new(4, 8);
+ let r: u64x2 = transmute(vshlq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
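+ // vshl_n: shift left by the const generic N, equivalent to multiplying
+ // each lane by 1 << N (here 4).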
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_n_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i8x8 = i8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: i8x8 = transmute(vshl_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_n_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: i8x16 = i8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let r: i8x16 = transmute(vshlq_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_n_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i16x4 = i16x4::new(4, 8, 12, 16);
+ let r: i16x4 = transmute(vshl_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_n_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: i16x8 = transmute(vshlq_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_n_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let e: i32x2 = i32x2::new(4, 8);
+ let r: i32x2 = transmute(vshl_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_n_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(4, 8, 12, 16);
+ let r: i32x4 = transmute(vshlq_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_n_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u8x8 = u8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: u8x8 = transmute(vshl_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_n_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e: u8x16 = u8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let r: u8x16 = transmute(vshlq_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_n_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u16x4 = u16x4::new(4, 8, 12, 16);
+ let r: u16x4 = transmute(vshl_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_n_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: u16x8 = transmute(vshlq_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_n_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let e: u32x2 = u32x2::new(4, 8);
+ let r: u32x2 = transmute(vshl_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_n_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(4, 8, 12, 16);
+ let r: u32x4 = transmute(vshlq_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_n_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let e: i64x1 = i64x1::new(4);
+ let r: i64x1 = transmute(vshl_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_n_s64() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let e: i64x2 = i64x2::new(4, 8);
+ let r: i64x2 = transmute(vshlq_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshl_n_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let e: u64x1 = u64x1::new(4);
+ let r: u64x1 = transmute(vshl_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshlq_n_u64() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let e: u64x2 = u64x2::new(4, 8);
+ let r: u64x2 = transmute(vshlq_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
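+ // vshll_n: shift left long. Each lane is first widened to twice its width
+ // (s8 -> s16, u16 -> u32, ...) and then shifted left by N; since N is
+ // bounded by the source lane width, no bits are lost to overflow.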
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_n_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: i16x8 = transmute(vshll_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_n_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 3, 4);
+ let e: i32x4 = i32x4::new(4, 8, 12, 16);
+ let r: i32x4 = transmute(vshll_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_n_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let e: i64x2 = i64x2::new(4, 8);
+ let r: i64x2 = transmute(vshll_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_n_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let r: u16x8 = transmute(vshll_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_n_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 3, 4);
+ let e: u32x4 = u32x4::new(4, 8, 12, 16);
+ let r: u32x4 = transmute(vshll_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshll_n_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let e: u64x2 = u64x2::new(4, 8);
+ let r: u64x2 = transmute(vshll_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
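+ // vshr_n: truncating shift right by N (arithmetic for signed lanes,
+ // logical for unsigned); unlike vrshr_n above, no rounding constant is
+ // added before the shift.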
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshr_n_s8() {
+ let a: i8x8 = i8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vshr_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrq_n_s8() {
+ let a: i8x16 = i8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let e: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vshrq_n_s8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshr_n_s16() {
+ let a: i16x4 = i16x4::new(4, 8, 12, 16);
+ let e: i16x4 = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vshr_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrq_n_s16() {
+ let a: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vshrq_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshr_n_s32() {
+ let a: i32x2 = i32x2::new(4, 8);
+ let e: i32x2 = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vshr_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrq_n_s32() {
+ let a: i32x4 = i32x4::new(4, 8, 12, 16);
+ let e: i32x4 = i32x4::new(1, 2, 3, 4);
+ let r: i32x4 = transmute(vshrq_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshr_n_s64() {
+ let a: i64x1 = i64x1::new(4);
+ let e: i64x1 = i64x1::new(1);
+ let r: i64x1 = transmute(vshr_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrq_n_s64() {
+ let a: i64x2 = i64x2::new(4, 8);
+ let e: i64x2 = i64x2::new(1, 2);
+ let r: i64x2 = transmute(vshrq_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshr_n_u8() {
+ let a: u8x8 = u8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vshr_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrq_n_u8() {
+ let a: u8x16 = u8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let e: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vshrq_n_u8::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshr_n_u16() {
+ let a: u16x4 = u16x4::new(4, 8, 12, 16);
+ let e: u16x4 = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vshr_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrq_n_u16() {
+ let a: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vshrq_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshr_n_u32() {
+ let a: u32x2 = u32x2::new(4, 8);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vshr_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrq_n_u32() {
+ let a: u32x4 = u32x4::new(4, 8, 12, 16);
+ let e: u32x4 = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vshrq_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshr_n_u64() {
+ let a: u64x1 = u64x1::new(4);
+ let e: u64x1 = u64x1::new(1);
+ let r: u64x1 = transmute(vshr_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrq_n_u64() {
+ let a: u64x2 = u64x2::new(4, 8);
+ let e: u64x2 = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vshrq_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
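+ // vshrn_n: truncating shift right by N followed by narrowing each lane to
+ // half its width; the non-rounding counterpart of vrshrn_n.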
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_n_s16() {
+ let a: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vshrn_n_s16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_n_s32() {
+ let a: i32x4 = i32x4::new(4, 8, 12, 16);
+ let e: i16x4 = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vshrn_n_s32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_n_s64() {
+ let a: i64x2 = i64x2::new(4, 8);
+ let e: i32x2 = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vshrn_n_s64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_n_u16() {
+ let a: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vshrn_n_u16::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_n_u32() {
+ let a: u32x4 = u32x4::new(4, 8, 12, 16);
+ let e: u16x4 = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vshrn_n_u32::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vshrn_n_u64() {
+ let a: u64x2 = u64x2::new(4, 8);
+ let e: u32x2 = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vshrn_n_u64::<2>(transmute(a)));
+ assert_eq!(r, e);
+ }
+
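+ // vsra_n: shift right and accumulate, i.e. a + (b >> N) per lane with a
+ // truncating shift; the non-rounding counterpart of vrsra_n.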
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsra_n_s8() {
+ let a: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i8x8 = i8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i8x8 = i8x8::new(2, 3, 4, 5, 6, 7, 8, 9);
+ let r: i8x8 = transmute(vsra_n_s8::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsraq_n_s8() {
+ let a: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i8x16 = i8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let e: i8x16 = i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+ let r: i8x16 = transmute(vsraq_n_s8::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsra_n_s16() {
+ let a: i16x4 = i16x4::new(1, 1, 1, 1);
+ let b: i16x4 = i16x4::new(4, 8, 12, 16);
+ let e: i16x4 = i16x4::new(2, 3, 4, 5);
+ let r: i16x4 = transmute(vsra_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsraq_n_s16() {
+ let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: i16x8 = i16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: i16x8 = i16x8::new(2, 3, 4, 5, 6, 7, 8, 9);
+ let r: i16x8 = transmute(vsraq_n_s16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsra_n_s32() {
+ let a: i32x2 = i32x2::new(1, 1);
+ let b: i32x2 = i32x2::new(4, 8);
+ let e: i32x2 = i32x2::new(2, 3);
+ let r: i32x2 = transmute(vsra_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsraq_n_s32() {
+ let a: i32x4 = i32x4::new(1, 1, 1, 1);
+ let b: i32x4 = i32x4::new(4, 8, 12, 16);
+ let e: i32x4 = i32x4::new(2, 3, 4, 5);
+ let r: i32x4 = transmute(vsraq_n_s32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsra_n_s64() {
+ let a: i64x1 = i64x1::new(1);
+ let b: i64x1 = i64x1::new(4);
+ let e: i64x1 = i64x1::new(2);
+ let r: i64x1 = transmute(vsra_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsraq_n_s64() {
+ let a: i64x2 = i64x2::new(1, 1);
+ let b: i64x2 = i64x2::new(4, 8);
+ let e: i64x2 = i64x2::new(2, 3);
+ let r: i64x2 = transmute(vsraq_n_s64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsra_n_u8() {
+ let a: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: u8x8 = u8x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u8x8 = u8x8::new(2, 3, 4, 5, 6, 7, 8, 9);
+ let r: u8x8 = transmute(vsra_n_u8::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsraq_n_u8() {
+ let a: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let b: u8x16 = u8x16::new(4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64);
+ let e: u8x16 = u8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+ let r: u8x16 = transmute(vsraq_n_u8::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsra_n_u16() {
+ let a: u16x4 = u16x4::new(1, 1, 1, 1);
+ let b: u16x4 = u16x4::new(4, 8, 12, 16);
+ let e: u16x4 = u16x4::new(2, 3, 4, 5);
+ let r: u16x4 = transmute(vsra_n_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsraq_n_u16() {
+ let a: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let b: u16x8 = u16x8::new(4, 8, 12, 16, 20, 24, 28, 32);
+ let e: u16x8 = u16x8::new(2, 3, 4, 5, 6, 7, 8, 9);
+ let r: u16x8 = transmute(vsraq_n_u16::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsra_n_u32() {
+ let a: u32x2 = u32x2::new(1, 1);
+ let b: u32x2 = u32x2::new(4, 8);
+ let e: u32x2 = u32x2::new(2, 3);
+ let r: u32x2 = transmute(vsra_n_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsraq_n_u32() {
+ let a: u32x4 = u32x4::new(1, 1, 1, 1);
+ let b: u32x4 = u32x4::new(4, 8, 12, 16);
+ let e: u32x4 = u32x4::new(2, 3, 4, 5);
+ let r: u32x4 = transmute(vsraq_n_u32::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsra_n_u64() {
+ let a: u64x1 = u64x1::new(1);
+ let b: u64x1 = u64x1::new(4);
+ let e: u64x1 = u64x1::new(2);
+ let r: u64x1 = transmute(vsra_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsraq_n_u64() {
+ let a: u64x2 = u64x2::new(1, 1);
+ let b: u64x2 = u64x2::new(4, 8);
+ let e: u64x2 = u64x2::new(2, 3);
+ let r: u64x2 = transmute(vsraq_n_u64::<2>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
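+ // vtrn: 2x2 transpose of lane pairs. The first output vector interleaves
+ // the even-indexed lanes of `a` and `b`, the second the odd-indexed lanes,
+ // which is why the expected values are written as flat [T; 2*N] arrays
+ // covering both halves of the returned pair.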
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_s8() {
+ let a: i8x8 = i8x8::new(0, 2, 2, 6, 2, 10, 6, 14);
+ let b: i8x8 = i8x8::new(1, 3, 3, 7, 3, 1, 7, 15);
+ let e: [i8; 16] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15];
+ let r: [i8; 16] = transmute(vtrn_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_s16() {
+ let a: i16x4 = i16x4::new(0, 2, 2, 6);
+ let b: i16x4 = i16x4::new(1, 3, 3, 7);
+ let e: [i16; 8] = [0, 1, 2, 3, 2, 3, 6, 7];
+ let r: [i16; 8] = transmute(vtrn_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_s8() {
+ let a: i8x16 = i8x16::new(0, 2, 2, 6, 2, 10, 6, 14, 2, 18, 6, 22, 10, 26, 14, 30);
+ let b: i8x16 = i8x16::new(1, 3, 3, 7, 3, 1, 7, 15, 3, 19, 7, 23, 1, 27, 15, 31);
+ let e: [i8; 32] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15, 2, 3, 6, 7, 10, 1, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31];
+ let r: [i8; 32] = transmute(vtrnq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_s16() {
+ let a: i16x8 = i16x8::new(0, 2, 2, 6, 2, 10, 6, 14);
+ let b: i16x8 = i16x8::new(1, 3, 3, 7, 3, 1, 7, 15);
+ let e: [i16; 16] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15];
+ let r: [i16; 16] = transmute(vtrnq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_s32() {
+ let a: i32x4 = i32x4::new(0, 2, 2, 6);
+ let b: i32x4 = i32x4::new(1, 3, 3, 7);
+ let e: [i32; 8] = [0, 1, 2, 3, 2, 3, 6, 7];
+ let r: [i32; 8] = transmute(vtrnq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_u8() {
+ let a: u8x8 = u8x8::new(0, 2, 2, 6, 2, 10, 6, 14);
+ let b: u8x8 = u8x8::new(1, 3, 3, 7, 3, 1, 7, 15);
+ let e: [u8; 16] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15];
+ let r: [u8; 16] = transmute(vtrn_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_u16() {
+ let a: u16x4 = u16x4::new(0, 2, 2, 6);
+ let b: u16x4 = u16x4::new(1, 3, 3, 7);
+ let e: [u16; 8] = [0, 1, 2, 3, 2, 3, 6, 7];
+ let r: [u16; 8] = transmute(vtrn_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_u8() {
+ let a: u8x16 = u8x16::new(0, 2, 2, 6, 2, 10, 6, 14, 2, 18, 6, 22, 10, 26, 14, 30);
+ let b: u8x16 = u8x16::new(1, 3, 3, 7, 3, 1, 7, 15, 3, 19, 7, 23, 1, 27, 15, 31);
+ let e: [u8; 32] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15, 2, 3, 6, 7, 10, 1, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31];
+ let r: [u8; 32] = transmute(vtrnq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_u16() {
+ let a: u16x8 = u16x8::new(0, 2, 2, 6, 2, 10, 6, 14);
+ let b: u16x8 = u16x8::new(1, 3, 3, 7, 3, 1, 7, 15);
+ let e: [u16; 16] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15];
+ let r: [u16; 16] = transmute(vtrnq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_u32() {
+ let a: u32x4 = u32x4::new(0, 2, 2, 6);
+ let b: u32x4 = u32x4::new(1, 3, 3, 7);
+ let e: [u32; 8] = [0, 1, 2, 3, 2, 3, 6, 7];
+ let r: [u32; 8] = transmute(vtrnq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_p8() {
+ let a: i8x8 = i8x8::new(0, 2, 2, 6, 2, 10, 6, 14);
+ let b: i8x8 = i8x8::new(1, 3, 3, 7, 3, 1, 7, 15);
+ let e: [u8; 16] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15];
+ let r: [u8; 16] = transmute(vtrn_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_p16() {
+ let a: i16x4 = i16x4::new(0, 2, 2, 6);
+ let b: i16x4 = i16x4::new(1, 3, 3, 7);
+ let e: [u16; 8] = [0, 1, 2, 3, 2, 3, 6, 7];
+ let r: [u16; 8] = transmute(vtrn_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_p8() {
+ let a: i8x16 = i8x16::new(0, 2, 2, 6, 2, 10, 6, 14, 2, 18, 6, 22, 10, 26, 14, 30);
+ let b: i8x16 = i8x16::new(1, 3, 3, 7, 3, 1, 7, 15, 3, 19, 7, 23, 1, 27, 15, 31);
+ let e: [u8; 32] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15, 2, 3, 6, 7, 10, 1, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31];
+ let r: [u8; 32] = transmute(vtrnq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_p16() {
+ let a: i16x8 = i16x8::new(0, 2, 2, 6, 2, 10, 6, 14);
+ let b: i16x8 = i16x8::new(1, 3, 3, 7, 3, 1, 7, 15);
+ let e: [u16; 16] = [0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15];
+ let r: [u16; 16] = transmute(vtrnq_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_s32() {
+ let a: i32x2 = i32x2::new(0, 2);
+ let b: i32x2 = i32x2::new(1, 3);
+ let e: [i32; 4] = [0, 1, 2, 3];
+ let r: [i32; 4] = transmute(vtrn_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_u32() {
+ let a: u32x2 = u32x2::new(0, 2);
+ let b: u32x2 = u32x2::new(1, 3);
+ let e: [u32; 4] = [0, 1, 2, 3];
+ let r: [u32; 4] = transmute(vtrn_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrn_f32() {
+ let a: f32x2 = f32x2::new(0., 2.);
+ let b: f32x2 = f32x2::new(1., 3.);
+ let e: [f32; 4] = [0., 1., 2., 3.];
+ let r: [f32; 4] = transmute(vtrn_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vtrnq_f32() {
+ let a: f32x4 = f32x4::new(0., 2., 2., 6.);
+ let b: f32x4 = f32x4::new(1., 3., 3., 7.);
+ let e: [f32; 8] = [0., 1., 2., 3., 2., 3., 6., 7.];
+ let r: [f32; 8] = transmute(vtrnq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
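+ // `vzip` interleaves the lanes of `a` and `b`: together the two results
+ // hold `a[0], b[0], a[1], b[1], ...`, so zipping even- and odd-valued
+ // inputs reproduces the consecutive sequences checked below.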
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_s8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: [i8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+ let r: [i8; 16] = transmute(vzip_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_s16() {
+ let a: i16x4 = i16x4::new(0, 2, 4, 6);
+ let b: i16x4 = i16x4::new(1, 3, 5, 7);
+ let e: [i16; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+ let r: [i16; 8] = transmute(vzip_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_u8() {
+ let a: u8x8 = u8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: u8x8 = u8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+ let r: [u8; 16] = transmute(vzip_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_u16() {
+ let a: u16x4 = u16x4::new(0, 2, 4, 6);
+ let b: u16x4 = u16x4::new(1, 3, 5, 7);
+ let e: [u16; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+ let r: [u16; 8] = transmute(vzip_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_p8() {
+ let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i8x8 = i8x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+ let r: [u8; 16] = transmute(vzip_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_p16() {
+ let a: i16x4 = i16x4::new(0, 2, 4, 6);
+ let b: i16x4 = i16x4::new(1, 3, 5, 7);
+ let e: [u16; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+ let r: [u16; 8] = transmute(vzip_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_s32() {
+ let a: i32x2 = i32x2::new(0, 2);
+ let b: i32x2 = i32x2::new(1, 3);
+ let e: [i32; 4] = [0, 1, 2, 3];
+ let r: [i32; 4] = transmute(vzip_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_u32() {
+ let a: u32x2 = u32x2::new(0, 2);
+ let b: u32x2 = u32x2::new(1, 3);
+ let e: [u32; 4] = [0, 1, 2, 3];
+ let r: [u32; 4] = transmute(vzip_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_s8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: [i8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31];
+ let r: [i8; 32] = transmute(vzipq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_s16() {
+ let a: i16x8 = i16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i16x8 = i16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: [i16; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+ let r: [i16; 16] = transmute(vzipq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_s32() {
+ let a: i32x4 = i32x4::new(0, 2, 4, 6);
+ let b: i32x4 = i32x4::new(1, 3, 5, 7);
+ let e: [i32; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+ let r: [i32; 8] = transmute(vzipq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_u8() {
+ let a: u8x16 = u8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: u8x16 = u8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: [u8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31];
+ let r: [u8; 32] = transmute(vzipq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_u16() {
+ let a: u16x8 = u16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: u16x8 = u16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: [u16; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+ let r: [u16; 16] = transmute(vzipq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_u32() {
+ let a: u32x4 = u32x4::new(0, 2, 4, 6);
+ let b: u32x4 = u32x4::new(1, 3, 5, 7);
+ let e: [u32; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+ let r: [u32; 8] = transmute(vzipq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_p8() {
+ let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ let b: i8x16 = i8x16::new(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ let e: [u8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31];
+ let r: [u8; 32] = transmute(vzipq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_p16() {
+ let a: i16x8 = i16x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let b: i16x8 = i16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+ let e: [u16; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+ let r: [u16; 16] = transmute(vzipq_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzip_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x2 = f32x2::new(5., 6.);
+ let e: [f32; 4] = [1., 5., 2., 6.];
+ let r: [f32; 4] = transmute(vzip_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vzipq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 3., 4.);
+ let b: f32x4 = f32x4::new(5., 6., 7., 8.);
+ let e: [f32; 8] = [1., 5., 2., 6., 3., 7., 4., 8.];
+ let r: [f32; 8] = transmute(vzipq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
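+ // `vuzp` is the inverse of `vzip`: the first result gathers the
+ // even-indexed lanes of `a` then of `b`, the second the odd-indexed lanes.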
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_s8() {
+ let a: i8x8 = i8x8::new(1, 2, 2, 3, 2, 3, 3, 8);
+ let b: i8x8 = i8x8::new(2, 3, 3, 8, 3, 15, 8, 16);
+ let e: [i8; 16] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16];
+ let r: [i8; 16] = transmute(vuzp_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_s16() {
+ let a: i16x4 = i16x4::new(1, 2, 2, 3);
+ let b: i16x4 = i16x4::new(2, 3, 3, 8);
+ let e: [i16; 8] = [1, 2, 2, 3, 2, 3, 3, 8];
+ let r: [i16; 8] = transmute(vuzp_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_s8() {
+ let a: i8x16 = i8x16::new(1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 15, 8, 16);
+ let b: i8x16 = i8x16::new(2, 3, 3, 8, 3, 15, 8, 16, 3, 29, 8, 30, 15, 31, 16, 32);
+ let e: [i8; 32] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16, 2, 3, 3, 8, 3, 8, 15, 16, 3, 8, 15, 16, 29, 30, 31, 32];
+ let r: [i8; 32] = transmute(vuzpq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_s16() {
+ let a: i16x8 = i16x8::new(1, 2, 2, 3, 2, 3, 3, 8);
+ let b: i16x8 = i16x8::new(2, 3, 3, 8, 3, 15, 8, 16);
+ let e: [i16; 16] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16];
+ let r: [i16; 16] = transmute(vuzpq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_s32() {
+ let a: i32x4 = i32x4::new(1, 2, 2, 3);
+ let b: i32x4 = i32x4::new(2, 3, 3, 8);
+ let e: [i32; 8] = [1, 2, 2, 3, 2, 3, 3, 8];
+ let r: [i32; 8] = transmute(vuzpq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_u8() {
+ let a: u8x8 = u8x8::new(1, 2, 2, 3, 2, 3, 3, 8);
+ let b: u8x8 = u8x8::new(2, 3, 3, 8, 3, 15, 8, 16);
+ let e: [u8; 16] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16];
+ let r: [u8; 16] = transmute(vuzp_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_u16() {
+ let a: u16x4 = u16x4::new(1, 2, 2, 3);
+ let b: u16x4 = u16x4::new(2, 3, 3, 8);
+ let e: [u16; 8] = [1, 2, 2, 3, 2, 3, 3, 8];
+ let r: [u16; 8] = transmute(vuzp_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_u8() {
+ let a: u8x16 = u8x16::new(1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 15, 8, 16);
+ let b: u8x16 = u8x16::new(2, 3, 3, 8, 3, 15, 8, 16, 3, 29, 8, 30, 15, 31, 16, 32);
+ let e: [u8; 32] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16, 2, 3, 3, 8, 3, 8, 15, 16, 3, 8, 15, 16, 29, 30, 31, 32];
+ let r: [u8; 32] = transmute(vuzpq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_u16() {
+ let a: u16x8 = u16x8::new(1, 2, 2, 3, 2, 3, 3, 8);
+ let b: u16x8 = u16x8::new(2, 3, 3, 8, 3, 15, 8, 16);
+ let e: [u16; 16] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16];
+ let r: [u16; 16] = transmute(vuzpq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_u32() {
+ let a: u32x4 = u32x4::new(1, 2, 2, 3);
+ let b: u32x4 = u32x4::new(2, 3, 3, 8);
+ let e: [u32; 8] = [1, 2, 2, 3, 2, 3, 3, 8];
+ let r: [u32; 8] = transmute(vuzpq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_p8() {
+ let a: i8x8 = i8x8::new(1, 2, 2, 3, 2, 3, 3, 8);
+ let b: i8x8 = i8x8::new(2, 3, 3, 8, 3, 15, 8, 16);
+ let e: [u8; 16] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16];
+ let r: [u8; 16] = transmute(vuzp_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_p16() {
+ let a: i16x4 = i16x4::new(1, 2, 2, 3);
+ let b: i16x4 = i16x4::new(2, 3, 3, 8);
+ let e: [u16; 8] = [1, 2, 2, 3, 2, 3, 3, 8];
+ let r: [u16; 8] = transmute(vuzp_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_p8() {
+ let a: i8x16 = i8x16::new(1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 15, 8, 16);
+ let b: i8x16 = i8x16::new(2, 3, 3, 8, 3, 15, 8, 16, 3, 29, 8, 30, 15, 31, 16, 32);
+ let e: [u8; 32] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16, 2, 3, 3, 8, 3, 8, 15, 16, 3, 8, 15, 16, 29, 30, 31, 32];
+ let r: [u8; 32] = transmute(vuzpq_p8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_p16() {
+ let a: i16x8 = i16x8::new(1, 2, 2, 3, 2, 3, 3, 8);
+ let b: i16x8 = i16x8::new(2, 3, 3, 8, 3, 15, 8, 16);
+ let e: [u16; 16] = [1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16];
+ let r: [u16; 16] = transmute(vuzpq_p16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_s32() {
+ let a: i32x2 = i32x2::new(1, 2);
+ let b: i32x2 = i32x2::new(2, 3);
+ let e: [i32; 4] = [1, 2, 2, 3];
+ let r: [i32; 4] = transmute(vuzp_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_u32() {
+ let a: u32x2 = u32x2::new(1, 2);
+ let b: u32x2 = u32x2::new(2, 3);
+ let e: [u32; 4] = [1, 2, 2, 3];
+ let r: [u32; 4] = transmute(vuzp_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzp_f32() {
+ let a: f32x2 = f32x2::new(1., 2.);
+ let b: f32x2 = f32x2::new(2., 6.);
+ let e: [f32; 4] = [1., 2., 2., 6.];
+ let r: [f32; 4] = transmute(vuzp_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vuzpq_f32() {
+ let a: f32x4 = f32x4::new(1., 2., 2., 4.);
+ let b: f32x4 = f32x4::new(2., 6., 6., 8.);
+ let e: [f32; 8] = [1., 2., 2., 6., 2., 4., 6., 8.];
+ let r: [f32; 8] = transmute(vuzpq_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
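+ // `vabal` accumulates widened absolute differences:
+ // `r[i] = a[i] + |b[i] - c[i]|`, where `b` and `c` have half the lane
+ // width of `a` and `r`.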
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_u8() {
+ let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: u8x8 = u8x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ let e: u16x8 = u16x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ let r: u16x8 = transmute(vabal_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_u16() {
+ let a: u32x4 = u32x4::new(1, 2, 3, 4);
+ let b: u16x4 = u16x4::new(1, 2, 3, 4);
+ let c: u16x4 = u16x4::new(10, 10, 10, 10);
+ let e: u32x4 = u32x4::new(10, 10, 10, 10);
+ let r: u32x4 = transmute(vabal_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_u32() {
+ let a: u64x2 = u64x2::new(1, 2);
+ let b: u32x2 = u32x2::new(1, 2);
+ let c: u32x2 = u32x2::new(10, 10);
+ let e: u64x2 = u64x2::new(10, 10);
+ let r: u64x2 = transmute(vabal_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_s8() {
+ let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let c: i8x8 = i8x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ let e: i16x8 = i16x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ let r: i16x8 = transmute(vabal_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_s16() {
+ let a: i32x4 = i32x4::new(1, 2, 3, 4);
+ let b: i16x4 = i16x4::new(1, 2, 3, 4);
+ let c: i16x4 = i16x4::new(10, 10, 10, 10);
+ let e: i32x4 = i32x4::new(10, 10, 10, 10);
+ let r: i32x4 = transmute(vabal_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabal_s32() {
+ let a: i64x2 = i64x2::new(1, 2);
+ let b: i32x2 = i32x2::new(1, 2);
+ let c: i32x2 = i32x2::new(10, 10);
+ let e: i64x2 = i64x2::new(10, 10);
+ let r: i64x2 = transmute(vabal_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
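+ // `vqabs` is a saturating absolute value: `i8::MIN` maps to `i8::MAX`
+ // (and likewise for wider lanes) instead of wrapping back to the minimum.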
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabs_s8() {
+ let a: i8x8 = i8x8::new(-128, 0x7F, -6, -5, -4, -3, -2, -1);
+ let e: i8x8 = i8x8::new(0x7F, 0x7F, 6, 5, 4, 3, 2, 1);
+ let r: i8x8 = transmute(vqabs_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabsq_s8() {
+ let a: i8x16 = i8x16::new(-128, 0x7F, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5);
+ let e: i8x16 = i8x16::new(0x7F, 0x7F, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5);
+ let r: i8x16 = transmute(vqabsq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabs_s16() {
+ let a: i16x4 = i16x4::new(-32768, 0x7F_FF, -6, -5);
+ let e: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 6, 5);
+ let r: i16x4 = transmute(vqabs_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabsq_s16() {
+ let a: i16x8 = i16x8::new(-32768, 0x7F_FF, -6, -5, -4, -3, -2, -1);
+ let e: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 6, 5, 4, 3, 2, 1);
+ let r: i16x8 = transmute(vqabsq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabs_s32() {
+ let a: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF);
+ let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF);
+ let r: i32x2 = transmute(vqabs_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqabsq_s32() {
+ let a: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, -6, -5);
+ let e: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 6, 5);
+ let r: i32x4 = transmute(vqabsq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/load_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/load_tests.rs
new file mode 100644
index 000000000..bbee29ae7
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/load_tests.rs
@@ -0,0 +1,206 @@
+//! Tests for the ARMv7 NEON load (vld1) intrinsics.
+//!
+//! These are included in `{arm, aarch64}::neon`.
+
+use super::*;
+
+#[cfg(target_arch = "arm")]
+use crate::core_arch::arm::*;
+
+#[cfg(target_arch = "aarch64")]
+use crate::core_arch::aarch64::*;
+
+use crate::core_arch::simd::*;
+use std::mem;
+use stdarch_test::simd_test;
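+
+// Each test loads from `a[1..]`, one element past the start of the backing
+// array, so the pointer passed to `vld1*` is only element-aligned, not
+// vector-aligned; `vld1` must accept such addresses.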
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_s8() {
+ let a: [i8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vld1_s8(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_s8() {
+ let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x16 = transmute(vld1q_s8(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_s16() {
+ let a: [i16; 5] = [0, 1, 2, 3, 4];
+ let e = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vld1_s16(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_s16() {
+ let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vld1q_s16(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_s32() {
+ let a: [i32; 3] = [0, 1, 2];
+ let e = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vld1_s32(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_s32() {
+ let a: [i32; 5] = [0, 1, 2, 3, 4];
+ let e = i32x4::new(1, 2, 3, 4);
+ let r: i32x4 = transmute(vld1q_s32(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_s64() {
+ let a: [i64; 2] = [0, 1];
+ let e = i64x1::new(1);
+ let r: i64x1 = transmute(vld1_s64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_s64() {
+ let a: [i64; 3] = [0, 1, 2];
+ let e = i64x2::new(1, 2);
+ let r: i64x2 = transmute(vld1q_s64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_u8() {
+ let a: [u8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vld1_u8(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_u8() {
+ let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vld1q_u8(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_u16() {
+ let a: [u16; 5] = [0, 1, 2, 3, 4];
+ let e = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vld1_u16(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_u16() {
+ let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vld1q_u16(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_u32() {
+ let a: [u32; 3] = [0, 1, 2];
+ let e = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vld1_u32(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_u32() {
+ let a: [u32; 5] = [0, 1, 2, 3, 4];
+ let e = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vld1q_u32(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_u64() {
+ let a: [u64; 2] = [0, 1];
+ let e = u64x1::new(1);
+ let r: u64x1 = transmute(vld1_u64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_u64() {
+ let a: [u64; 3] = [0, 1, 2];
+ let e = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vld1q_u64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_p8() {
+ let a: [p8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vld1_p8(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_p8() {
+ let a: [p8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let e = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x16 = transmute(vld1q_p8(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_p16() {
+ let a: [p16; 5] = [0, 1, 2, 3, 4];
+ let e = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vld1_p16(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_p16() {
+ let a: [p16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+ let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vld1q_p16(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon,aes")]
+unsafe fn test_vld1_p64() {
+ let a: [p64; 2] = [0, 1];
+ let e = u64x1::new(1);
+ let r: u64x1 = transmute(vld1_p64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon,aes")]
+unsafe fn test_vld1q_p64() {
+ let a: [p64; 3] = [0, 1, 2];
+ let e = u64x2::new(1, 2);
+ let r: u64x2 = transmute(vld1q_p64(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1_f32() {
+ let a: [f32; 3] = [0., 1., 2.];
+ let e = f32x2::new(1., 2.);
+ let r: f32x2 = transmute(vld1_f32(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vld1q_f32() {
+ let a: [f32; 5] = [0., 1., 2., 3., 4.];
+ let e = f32x4::new(1., 2., 3., 4.);
+ let r: f32x4 = transmute(vld1q_f32(a[1..].as_ptr()));
+ assert_eq!(r, e)
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs
new file mode 100644
index 000000000..952d1ca2e
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs
@@ -0,0 +1,12347 @@
+//! ARMv7 NEON intrinsics
+
+#[rustfmt::skip]
+mod generated;
+#[rustfmt::skip]
+pub use self::generated::*;
+
+use crate::{
+ core_arch::simd::*, core_arch::simd_llvm::*, hint::unreachable_unchecked, mem::transmute,
+};
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+pub(crate) type p8 = u8;
+pub(crate) type p16 = u16;
+pub(crate) type p64 = u64;
+pub(crate) type p128 = u128;
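+
+// Polynomial lanes (`p8`, `p16`, `p64`, `p128`) are carried as plain
+// unsigned integers of the same width; the polynomial intrinsics interpret
+// their bits as coefficients of polynomials over GF(2).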
+
+types! {
+ /// ARM-specific 64-bit wide vector of eight packed `i8`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct int8x8_t(pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8);
+ /// ARM-specific 64-bit wide vector of eight packed `u8`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct uint8x8_t(pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8);
+ /// ARM-specific 64-bit wide polynomial vector of eight packed `p8`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct poly8x8_t(pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8);
+ /// ARM-specific 64-bit wide vector of four packed `i16`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct int16x4_t(pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16);
+ /// ARM-specific 64-bit wide vector of four packed `u16`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct uint16x4_t(pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16);
+ // FIXME: ARM-specific 64-bit wide vector of four packed `f16`.
+ // pub struct float16x4_t(f16, f16, f16, f16);
+ /// ARM-specific 64-bit wide vector of four packed `p16`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct poly16x4_t(pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16);
+ /// ARM-specific 64-bit wide vector of two packed `i32`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct int32x2_t(pub(crate) i32, pub(crate) i32);
+ /// ARM-specific 64-bit wide vector of two packed `u32`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct uint32x2_t(pub(crate) u32, pub(crate) u32);
+ /// ARM-specific 64-bit wide vector of two packed `f32`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct float32x2_t(pub(crate) f32, pub(crate) f32);
+ /// ARM-specific 64-bit wide vector of one packed `i64`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct int64x1_t(pub(crate) i64);
+ /// ARM-specific 64-bit wide vector of one packed `u64`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct uint64x1_t(pub(crate) u64);
+ /// ARM-specific 64-bit wide vector of one packed `p64`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct poly64x1_t(pub(crate) p64);
+
+ /// ARM-specific 128-bit wide vector of sixteen packed `i8`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct int8x16_t(
+ pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8,
+ pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8,
+ );
+ /// ARM-specific 128-bit wide vector of sixteen packed `u8`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct uint8x16_t(
+ pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8,
+ pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8,
+ );
+ /// ARM-specific 128-bit wide vector of sixteen packed `p8`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct poly8x16_t(
+ pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8,
+ pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8,
+ );
+ /// ARM-specific 128-bit wide vector of eight packed `i16`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct int16x8_t(pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16);
+ /// ARM-specific 128-bit wide vector of eight packed `u16`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct uint16x8_t(pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16);
+ // FIXME: ARM-specific 128-bit wide vector of eight packed `f16`.
+ // pub struct float16x8_t(f16, f16, f16, f16, f16, f16, f16, f16);
+ /// ARM-specific 128-bit wide vector of eight packed `p16`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct poly16x8_t(pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16);
+ /// ARM-specific 128-bit wide vector of four packed `i32`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct int32x4_t(pub(crate) i32, pub(crate) i32, pub(crate) i32, pub(crate) i32);
+ /// ARM-specific 128-bit wide vector of four packed `u32`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct uint32x4_t(pub(crate) u32, pub(crate) u32, pub(crate) u32, pub(crate) u32);
+ /// ARM-specific 128-bit wide vector of four packed `f32`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct float32x4_t(pub(crate) f32, pub(crate) f32, pub(crate) f32, pub(crate) f32);
+ /// ARM-specific 128-bit wide vector of two packed `i64`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct int64x2_t(pub(crate) i64, pub(crate) i64);
+ /// ARM-specific 128-bit wide vector of two packed `u64`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct uint64x2_t(pub(crate) u64, pub(crate) u64);
+ /// ARM-specific 128-bit wide vector of two packed `p64`.
+ #[cfg_attr(target_arch = "aarch64", stable(feature = "neon_intrinsics", since = "1.59.0"))]
+ pub struct poly64x2_t(pub(crate) p64, pub(crate) p64);
+}
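+
+// The `..x2_t`/`..x3_t`/`..x4_t` structs below bundle two, three, or four
+// vectors of one type; they serve as the return and argument types of the
+// structure load/store intrinsics (`vld2`/`vld3`/`vld4`, `vst2`/...) and of
+// permute intrinsics such as `vtrn`/`vzip`/`vuzp`.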
+
+/// ARM-specific type containing two `int8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int8x8x2_t(pub int8x8_t, pub int8x8_t);
+/// ARM-specific type containing three `int8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int8x8x3_t(pub int8x8_t, pub int8x8_t, pub int8x8_t);
+/// ARM-specific type containing four `int8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int8x8x4_t(pub int8x8_t, pub int8x8_t, pub int8x8_t, pub int8x8_t);
+
+/// ARM-specific type containing two `int8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int8x16x2_t(pub int8x16_t, pub int8x16_t);
+/// ARM-specific type containing three `int8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int8x16x3_t(pub int8x16_t, pub int8x16_t, pub int8x16_t);
+/// ARM-specific type containing four `int8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int8x16x4_t(pub int8x16_t, pub int8x16_t, pub int8x16_t, pub int8x16_t);
+
+/// ARM-specific type containing two `uint8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint8x8x2_t(pub uint8x8_t, pub uint8x8_t);
+/// ARM-specific type containing three `uint8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint8x8x3_t(pub uint8x8_t, pub uint8x8_t, pub uint8x8_t);
+/// ARM-specific type containing four `uint8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint8x8x4_t(pub uint8x8_t, pub uint8x8_t, pub uint8x8_t, pub uint8x8_t);
+
+/// ARM-specific type containing two `uint8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint8x16x2_t(pub uint8x16_t, pub uint8x16_t);
+/// ARM-specific type containing three `uint8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint8x16x3_t(pub uint8x16_t, pub uint8x16_t, pub uint8x16_t);
+/// ARM-specific type containing four `uint8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint8x16x4_t(
+ pub uint8x16_t,
+ pub uint8x16_t,
+ pub uint8x16_t,
+ pub uint8x16_t,
+);
+
+/// ARM-specific type containing two `poly8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly8x8x2_t(pub poly8x8_t, pub poly8x8_t);
+/// ARM-specific type containing three `poly8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly8x8x3_t(pub poly8x8_t, pub poly8x8_t, pub poly8x8_t);
+/// ARM-specific type containing four `poly8x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly8x8x4_t(pub poly8x8_t, pub poly8x8_t, pub poly8x8_t, pub poly8x8_t);
+
+/// ARM-specific type containing two `poly8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly8x16x2_t(pub poly8x16_t, pub poly8x16_t);
+/// ARM-specific type containing three `poly8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly8x16x3_t(pub poly8x16_t, pub poly8x16_t, pub poly8x16_t);
+/// ARM-specific type containing four `poly8x16_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly8x16x4_t(
+ pub poly8x16_t,
+ pub poly8x16_t,
+ pub poly8x16_t,
+ pub poly8x16_t,
+);
+
+/// ARM-specific type containing two `int16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int16x4x2_t(pub int16x4_t, pub int16x4_t);
+/// ARM-specific type containing three `int16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int16x4x3_t(pub int16x4_t, pub int16x4_t, pub int16x4_t);
+/// ARM-specific type containing four `int16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int16x4x4_t(pub int16x4_t, pub int16x4_t, pub int16x4_t, pub int16x4_t);
+
+/// ARM-specific type containing two `int16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int16x8x2_t(pub int16x8_t, pub int16x8_t);
+/// ARM-specific type containing three `int16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int16x8x3_t(pub int16x8_t, pub int16x8_t, pub int16x8_t);
+/// ARM-specific type containing four `int16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int16x8x4_t(pub int16x8_t, pub int16x8_t, pub int16x8_t, pub int16x8_t);
+
+/// ARM-specific type containing two `uint16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint16x4x2_t(pub uint16x4_t, pub uint16x4_t);
+/// ARM-specific type containing three `uint16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint16x4x3_t(pub uint16x4_t, pub uint16x4_t, pub uint16x4_t);
+/// ARM-specific type containing four `uint16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint16x4x4_t(
+ pub uint16x4_t,
+ pub uint16x4_t,
+ pub uint16x4_t,
+ pub uint16x4_t,
+);
+
+/// ARM-specific type containing two `uint16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint16x8x2_t(pub uint16x8_t, pub uint16x8_t);
+/// ARM-specific type containing three `uint16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint16x8x3_t(pub uint16x8_t, pub uint16x8_t, pub uint16x8_t);
+/// ARM-specific type containing four `uint16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint16x8x4_t(
+ pub uint16x8_t,
+ pub uint16x8_t,
+ pub uint16x8_t,
+ pub uint16x8_t,
+);
+
+/// ARM-specific type containing two `poly16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly16x4x2_t(pub poly16x4_t, pub poly16x4_t);
+/// ARM-specific type containing three `poly16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly16x4x3_t(pub poly16x4_t, pub poly16x4_t, pub poly16x4_t);
+/// ARM-specific type containing four `poly16x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly16x4x4_t(
+ pub poly16x4_t,
+ pub poly16x4_t,
+ pub poly16x4_t,
+ pub poly16x4_t,
+);
+
+/// ARM-specific type containing two `poly16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly16x8x2_t(pub poly16x8_t, pub poly16x8_t);
+/// ARM-specific type containing three `poly16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly16x8x3_t(pub poly16x8_t, pub poly16x8_t, pub poly16x8_t);
+/// ARM-specific type containing four `poly16x8_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly16x8x4_t(
+ pub poly16x8_t,
+ pub poly16x8_t,
+ pub poly16x8_t,
+ pub poly16x8_t,
+);
+
+/// ARM-specific type containing two `int32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int32x2x2_t(pub int32x2_t, pub int32x2_t);
+/// ARM-specific type containing three `int32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int32x2x3_t(pub int32x2_t, pub int32x2_t, pub int32x2_t);
+/// ARM-specific type containing four `int32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int32x2x4_t(pub int32x2_t, pub int32x2_t, pub int32x2_t, pub int32x2_t);
+
+/// ARM-specific type containing two `int32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int32x4x2_t(pub int32x4_t, pub int32x4_t);
+/// ARM-specific type containing three `int32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int32x4x3_t(pub int32x4_t, pub int32x4_t, pub int32x4_t);
+/// ARM-specific type containing four `int32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int32x4x4_t(pub int32x4_t, pub int32x4_t, pub int32x4_t, pub int32x4_t);
+
+/// ARM-specific type containing two `uint32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint32x2x2_t(pub uint32x2_t, pub uint32x2_t);
+/// ARM-specific type containing three `uint32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint32x2x3_t(pub uint32x2_t, pub uint32x2_t, pub uint32x2_t);
+/// ARM-specific type containing four `uint32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint32x2x4_t(
+ pub uint32x2_t,
+ pub uint32x2_t,
+ pub uint32x2_t,
+ pub uint32x2_t,
+);
+
+/// ARM-specific type containing two `uint32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint32x4x2_t(pub uint32x4_t, pub uint32x4_t);
+/// ARM-specific type containing three `uint32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint32x4x3_t(pub uint32x4_t, pub uint32x4_t, pub uint32x4_t);
+/// ARM-specific type containing four `uint32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint32x4x4_t(
+ pub uint32x4_t,
+ pub uint32x4_t,
+ pub uint32x4_t,
+ pub uint32x4_t,
+);
+
+/// ARM-specific type containing two `float32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct float32x2x2_t(pub float32x2_t, pub float32x2_t);
+/// ARM-specific type containing three `float32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct float32x2x3_t(pub float32x2_t, pub float32x2_t, pub float32x2_t);
+/// ARM-specific type containing four `float32x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct float32x2x4_t(
+ pub float32x2_t,
+ pub float32x2_t,
+ pub float32x2_t,
+ pub float32x2_t,
+);
+
+/// ARM-specific type containing two `float32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct float32x4x2_t(pub float32x4_t, pub float32x4_t);
+/// ARM-specific type containing three `float32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct float32x4x3_t(pub float32x4_t, pub float32x4_t, pub float32x4_t);
+/// ARM-specific type containing four `float32x4_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct float32x4x4_t(
+ pub float32x4_t,
+ pub float32x4_t,
+ pub float32x4_t,
+ pub float32x4_t,
+);
+
+/// ARM-specific type containing two `int64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int64x1x2_t(pub int64x1_t, pub int64x1_t);
+/// ARM-specific type containing three `int64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int64x1x3_t(pub int64x1_t, pub int64x1_t, pub int64x1_t);
+/// ARM-specific type containing four `int64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int64x1x4_t(pub int64x1_t, pub int64x1_t, pub int64x1_t, pub int64x1_t);
+
+/// ARM-specific type containing two `int64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int64x2x2_t(pub int64x2_t, pub int64x2_t);
+/// ARM-specific type containing three `int64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int64x2x3_t(pub int64x2_t, pub int64x2_t, pub int64x2_t);
+/// ARM-specific type containing four `int64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct int64x2x4_t(pub int64x2_t, pub int64x2_t, pub int64x2_t, pub int64x2_t);
+
+/// ARM-specific type containing two `uint64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint64x1x2_t(pub uint64x1_t, pub uint64x1_t);
+/// ARM-specific type containing three `uint64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint64x1x3_t(pub uint64x1_t, pub uint64x1_t, pub uint64x1_t);
+/// ARM-specific type containing four `uint64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint64x1x4_t(
+ pub uint64x1_t,
+ pub uint64x1_t,
+ pub uint64x1_t,
+ pub uint64x1_t,
+);
+
+/// ARM-specific type containing two `uint64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint64x2x2_t(pub uint64x2_t, pub uint64x2_t);
+/// ARM-specific type containing three `uint64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint64x2x3_t(pub uint64x2_t, pub uint64x2_t, pub uint64x2_t);
+/// ARM-specific type containing four `uint64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct uint64x2x4_t(
+ pub uint64x2_t,
+ pub uint64x2_t,
+ pub uint64x2_t,
+ pub uint64x2_t,
+);
+
+/// ARM-specific type containing two `poly64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly64x1x2_t(pub poly64x1_t, pub poly64x1_t);
+/// ARM-specific type containing three `poly64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly64x1x3_t(pub poly64x1_t, pub poly64x1_t, pub poly64x1_t);
+/// ARM-specific type containing four `poly64x1_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly64x1x4_t(
+ pub poly64x1_t,
+ pub poly64x1_t,
+ pub poly64x1_t,
+ pub poly64x1_t,
+);
+
+/// ARM-specific type containing two `poly64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly64x2x2_t(pub poly64x2_t, pub poly64x2_t);
+/// ARM-specific type containing three `poly64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly64x2x3_t(pub poly64x2_t, pub poly64x2_t, pub poly64x2_t);
+/// ARM-specific type containing four `poly64x2_t` vectors.
+#[derive(Copy, Clone, Debug)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub struct poly64x2x4_t(
+ pub poly64x2_t,
+ pub poly64x2_t,
+ pub poly64x2_t,
+ pub poly64x2_t,
+);
+
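+// The `extern` block below binds each operation to the LLVM intrinsic that
+// implements it; the `link_name` attributes select the 32-bit ARM or the
+// AArch64 spelling of the same intrinsic.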
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ // absolute value (64-bit)
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v8i8")]
+ fn vabs_s8_(a: int8x8_t) -> int8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v4i16")]
+ fn vabs_s16_(a: int16x4_t) -> int16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v2i32")]
+ fn vabs_s32_(a: int32x2_t) -> int32x2_t;
+ // absolute value (128-bit)
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v16i8")]
+ fn vabsq_s8_(a: int8x16_t) -> int8x16_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v8i16")]
+ fn vabsq_s16_(a: int16x8_t) -> int16x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v4i32")]
+ fn vabsq_s32_(a: int32x4_t) -> int32x4_t;
+
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8")]
+ fn vpmins_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16")]
+ fn vpmins_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32")]
+ fn vpmins_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8")]
+ fn vpminu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16")]
+ fn vpminu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32")]
+ fn vpminu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32")]
+ fn vpminf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8")]
+ fn vpmaxs_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16")]
+ fn vpmaxs_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32")]
+ fn vpmaxs_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8")]
+ fn vpmaxu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16")]
+ fn vpmaxu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32")]
+ fn vpmaxu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2f32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32")]
+ fn vpmaxf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.raddhn.v8i8")]
+ fn vraddhn_s16_(a: int16x8_t, b: int16x8_t) -> int8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.raddhn.v4i16")]
+ fn vraddhn_s32_(a: int32x4_t, b: int32x4_t) -> int16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.raddhn.v2i32")]
+ fn vraddhn_s64_(a: int64x2_t, b: int64x2_t) -> int32x2_t;
+
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.addp.v4i16")]
+ fn vpadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.addp.v2i32")]
+ fn vpadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.addp.v8i8")]
+ fn vpadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v4i16.v8i8")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.saddlp.v4i16.v8i8"
+ )]
+ pub(crate) fn vpaddl_s8_(a: int8x8_t) -> int16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v2i32.v4i16")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.saddlp.v2i32.v4i16"
+ )]
+ pub(crate) fn vpaddl_s16_(a: int16x4_t) -> int32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v1i64.v2i32")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.saddlp.v1i64.v2i32"
+ )]
+ pub(crate) fn vpaddl_s32_(a: int32x2_t) -> int64x1_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v8i16.v16i8")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.saddlp.v8i16.v16i8"
+ )]
+ pub(crate) fn vpaddlq_s8_(a: int8x16_t) -> int16x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v4i32.v8i16")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.saddlp.v4i32.v8i16"
+ )]
+ pub(crate) fn vpaddlq_s16_(a: int16x8_t) -> int32x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v2i64.v4i32")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.saddlp.v2i64.v4i32"
+ )]
+ pub(crate) fn vpaddlq_s32_(a: int32x4_t) -> int64x2_t;
+
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v4i16.v8i8")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.uaddlp.v4i16.v8i8"
+ )]
+ pub(crate) fn vpaddl_u8_(a: uint8x8_t) -> uint16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v2i32.v4i16")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.uaddlp.v2i32.v4i16"
+ )]
+ pub(crate) fn vpaddl_u16_(a: uint16x4_t) -> uint32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v1i64.v2i32")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.uaddlp.v1i64.v2i32"
+ )]
+ pub(crate) fn vpaddl_u32_(a: uint32x2_t) -> uint64x1_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v8i16.v16i8")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.uaddlp.v8i16.v16i8"
+ )]
+ pub(crate) fn vpaddlq_u8_(a: uint8x16_t) -> uint16x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v4i32.v8i16")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.uaddlp.v4i32.v8i16"
+ )]
+ pub(crate) fn vpaddlq_u16_(a: uint16x8_t) -> uint32x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v2i64.v4i32")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.uaddlp.v2i64.v4i32"
+ )]
+ pub(crate) fn vpaddlq_u32_(a: uint32x4_t) -> uint64x2_t;
+
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ctpop.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctpop.v8i8")]
+ fn vcnt_s8_(a: int8x8_t) -> int8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ctpop.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctpop.v16i8")]
+ fn vcntq_s8_(a: int8x16_t) -> int8x16_t;
+
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ctlz.v8i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctlz.v8i8")]
+ fn vclz_s8_(a: int8x8_t) -> int8x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ctlz.v16i8")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctlz.v16i8")]
+ fn vclzq_s8_(a: int8x16_t) -> int8x16_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ctlz.v4i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctlz.v4i16")]
+ fn vclz_s16_(a: int16x4_t) -> int16x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ctlz.v8i16")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctlz.v8i16")]
+ fn vclzq_s16_(a: int16x8_t) -> int16x8_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ctlz.v2i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctlz.v2i32")]
+ fn vclz_s32_(a: int32x2_t) -> int32x2_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.ctlz.v4i32")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctlz.v4i32")]
+ fn vclzq_s32_(a: int32x4_t) -> int32x4_t;
+}
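+
+// The `extern` block above binds the target-specific LLVM intrinsic symbols
+// (`llvm.arm.neon.*` on ARM, `llvm.aarch64.neon.*` on AArch64); the public
+// wrappers below forward to whichever symbol the `cfg_attr` selected.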
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 7))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 7))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_s8<const LANE: i32>(ptr: *const i8, src: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
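+
+// A minimal usage sketch (editor's illustration, not upstream code): a lane
+// load reads one element from `ptr` into lane `LANE`, leaving the other
+// lanes of `src` unchanged. Assumes `vdup_n_s8` and `vget_lane_s8` from this
+// module are in scope.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vld1_lane_s8_sketch() {
+    let x: i8 = 42;
+    let v = vdup_n_s8(0); // all eight lanes start at 0
+    let r = vld1_lane_s8::<3>(&x, v); // lane 3 becomes 42
+    assert_eq!(vget_lane_s8::<3>(r), 42);
+    assert_eq!(vget_lane_s8::<0>(r), 0); // other lanes untouched
+}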
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 15))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 15))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_s8<const LANE: i32>(ptr: *const i8, src: int8x16_t) -> int8x16_t {
+ static_assert_imm4!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 3))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_s16<const LANE: i32>(ptr: *const i16, src: int16x4_t) -> int16x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 7))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 7))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_s16<const LANE: i32>(ptr: *const i16, src: int16x8_t) -> int16x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_s32<const LANE: i32>(ptr: *const i32, src: int32x2_t) -> int32x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 3))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_s32<const LANE: i32>(ptr: *const i32, src: int32x4_t) -> int32x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr, LANE = 0))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_s64<const LANE: i32>(ptr: *const i64, src: int64x1_t) -> int64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_insert(src, LANE as u32, *ptr)
+}
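+
+// `int64x1_t` has a single lane, so the only valid index is `LANE == 0`, and
+// the load lowers to a plain `vldr`/`ldr` rather than a lane-indexed
+// `vld1`/`ld1`.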
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_s64<const LANE: i32>(ptr: *const i64, src: int64x2_t) -> int64x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 7))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 7))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_u8<const LANE: i32>(ptr: *const u8, src: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 15))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 15))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_u8<const LANE: i32>(ptr: *const u8, src: uint8x16_t) -> uint8x16_t {
+ static_assert_imm4!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 3))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_u16<const LANE: i32>(ptr: *const u16, src: uint16x4_t) -> uint16x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 7))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 7))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_u16<const LANE: i32>(ptr: *const u16, src: uint16x8_t) -> uint16x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_u32<const LANE: i32>(ptr: *const u32, src: uint32x2_t) -> uint32x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 3))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_u32<const LANE: i32>(ptr: *const u32, src: uint32x4_t) -> uint32x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr, LANE = 0))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_u64<const LANE: i32>(ptr: *const u64, src: uint64x1_t) -> uint64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_u64<const LANE: i32>(ptr: *const u64, src: uint64x2_t) -> uint64x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 7))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 7))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_p8<const LANE: i32>(ptr: *const p8, src: poly8x8_t) -> poly8x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", LANE = 15))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 15))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_p8<const LANE: i32>(ptr: *const p8, src: poly8x16_t) -> poly8x16_t {
+ static_assert_imm4!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 3))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_p16<const LANE: i32>(ptr: *const p16, src: poly16x4_t) -> poly16x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", LANE = 7))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 7))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_p16<const LANE: i32>(ptr: *const p16, src: poly16x8_t) -> poly16x8_t {
+ static_assert_imm3!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr, LANE = 0))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_p64<const LANE: i32>(ptr: *const p64, src: poly64x1_t) -> poly64x1_t {
+ static_assert!(LANE : i32 where LANE == 0);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_p64<const LANE: i32>(ptr: *const p64, src: poly64x2_t) -> poly64x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_lane_f32<const LANE: i32>(ptr: *const f32, src: float32x2_t) -> float32x2_t {
+ static_assert_imm1!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", LANE = 3))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 3))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_lane_f32<const LANE: i32>(ptr: *const f32, src: float32x4_t) -> float32x4_t {
+ static_assert_imm2!(LANE);
+ simd_insert(src, LANE as u32, *ptr)
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
+ let x = vld1_lane_s8::<0>(ptr, transmute(i8x8::splat(0)));
+ simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+}
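+
+// A minimal usage sketch (editor's illustration, not upstream code): the dup
+// loads read one element and broadcast it to every lane, implemented above
+// as a lane-0 load followed by an all-zero-index shuffle. Assumes
+// `vget_lane_s8` from this module is in scope.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vld1_dup_s8_sketch() {
+    let x: i8 = 7;
+    let r = vld1_dup_s8(&x);
+    assert_eq!(vget_lane_s8::<0>(r), 7);
+    assert_eq!(vget_lane_s8::<7>(r), 7); // every lane holds the same value
+}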
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
+ let x = vld1q_lane_s8::<0>(ptr, transmute(i8x16::splat(0)));
+ simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
+ let x = vld1_lane_s16::<0>(ptr, transmute(i16x4::splat(0)));
+ simd_shuffle4!(x, x, [0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
+ let x = vld1q_lane_s16::<0>(ptr, transmute(i16x8::splat(0)));
+ simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
+ let x = vld1_lane_s32::<0>(ptr, transmute(i32x2::splat(0)));
+ simd_shuffle2!(x, x, [0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t {
+ let x = vld1q_lane_s32::<0>(ptr, transmute(i32x4::splat(0)));
+ simd_shuffle4!(x, x, [0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_s64(ptr: *const i64) -> int64x1_t {
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::core_arch::aarch64::vld1_s64(ptr)
+ }
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::vld1_s64(ptr)
+ }
+}
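+
+// For the single-lane 64-bit vectors there is nothing to replicate, so the
+// dup form delegates to the plain `vld1` load of the matching architecture
+// module.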
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
+ let x = vld1q_lane_s64::<0>(ptr, transmute(i64x2::splat(0)));
+ simd_shuffle2!(x, x, [0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
+ let x = vld1_lane_u8::<0>(ptr, transmute(u8x8::splat(0)));
+ simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t {
+ let x = vld1q_lane_u8::<0>(ptr, transmute(u8x16::splat(0)));
+ simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
+ let x = vld1_lane_u16::<0>(ptr, transmute(u16x4::splat(0)));
+ simd_shuffle4!(x, x, [0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
+ let x = vld1q_lane_u16::<0>(ptr, transmute(u16x8::splat(0)));
+ simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
+ let x = vld1_lane_u32::<0>(ptr, transmute(u32x2::splat(0)));
+ simd_shuffle2!(x, x, [0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t {
+ let x = vld1q_lane_u32::<0>(ptr, transmute(u32x4::splat(0)));
+ simd_shuffle4!(x, x, [0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_u64(ptr: *const u64) -> uint64x1_t {
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::core_arch::aarch64::vld1_u64(ptr)
+ }
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::vld1_u64(ptr)
+ }
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
+ let x = vld1q_lane_u64::<0>(ptr, transmute(u64x2::splat(0)));
+ simd_shuffle2!(x, x, [0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
+ let x = vld1_lane_p8::<0>(ptr, transmute(u8x8::splat(0)));
+ simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
+ let x = vld1q_lane_p8::<0>(ptr, transmute(u8x16::splat(0)));
+ simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
+ let x = vld1_lane_p16::<0>(ptr, transmute(u16x4::splat(0)));
+ simd_shuffle4!(x, x, [0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
+ let x = vld1q_lane_p16::<0>(ptr, transmute(u16x8::splat(0)));
+ simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
+ let x = vld1_lane_f32::<0>(ptr, transmute(f32x2::splat(0.)));
+ simd_shuffle2!(x, x, [0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1_dup_p64(ptr: *const p64) -> poly64x1_t {
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::core_arch::aarch64::vld1_p64(ptr)
+ }
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::vld1_p64(ptr)
+ }
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_p64(ptr: *const p64) -> poly64x2_t {
+ let x = vld1q_lane_p64::<0>(ptr, transmute(u64x2::splat(0)));
+ simd_shuffle2!(x, x, [0, 0])
+}
+
+/// Load one single-element structure and replicate it to all lanes of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t {
+ let x = vld1q_lane_f32::<0>(ptr, transmute(f32x4::splat(0.)));
+ simd_shuffle4!(x, x, [0, 0, 0, 0])
+}
+
+// Absolute difference and accumulate (64-bit vectors), signed and unsigned.
+
+/// Signed absolute difference and accumulate (`a + |b - c|` per lane).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("saba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+ simd_add(a, vabd_s8(b, c))
+}
+/// Signed absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("saba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ simd_add(a, vabd_s16(b, c))
+}
+/// Signed absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("saba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ simd_add(a, vabd_s32(b, c))
+}
+/// Unsigned absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("uaba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+ simd_add(a, vabd_u8(b, c))
+}
+/// Unsigned absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("uaba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
+ simd_add(a, vabd_u16(b, c))
+}
+/// Unsigned absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("uaba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
+ simd_add(a, vabd_u32(b, c))
+}
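+
+// A minimal usage sketch (editor's illustration, not upstream code): vaba
+// computes `a + |b - c|` per lane, i.e. an absolute difference followed by
+// an accumulate. Assumes `vdup_n_s8` and `vget_lane_s8` are in scope.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vaba_s8_sketch() {
+    let a = vdup_n_s8(1);
+    let b = vdup_n_s8(-3);
+    let c = vdup_n_s8(4);
+    let r = vaba_s8(a, b, c); // 1 + |-3 - 4| = 8 in every lane
+    assert_eq!(vget_lane_s8::<0>(r), 8);
+}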
+// Absolute difference and accumulate (128-bit vectors), signed and unsigned.
+
+/// Signed absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("saba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t {
+ simd_add(a, vabdq_s8(b, c))
+}
+/// Signed absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("saba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ simd_add(a, vabdq_s16(b, c))
+}
+/// Signed absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("saba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ simd_add(a, vabdq_s32(b, c))
+}
+/// Unsigned absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("uaba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
+ simd_add(a, vabdq_u8(b, c))
+}
+/// Unsigned absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("uaba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+ simd_add(a, vabdq_u16(b, c))
+}
+/// Unsigned absolute difference and accumulate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vaba.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("uaba"))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ simd_add(a, vabdq_u32(b, c))
+}
+
+/// Absolute value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(abs))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabs_s8(a: int8x8_t) -> int8x8_t {
+ vabs_s8_(a)
+}
+/// Absolute value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(abs))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabs_s16(a: int16x4_t) -> int16x4_t {
+ vabs_s16_(a)
+}
+/// Absolute value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(abs))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabs_s32(a: int32x2_t) -> int32x2_t {
+ vabs_s32_(a)
+}
+/// Absolute value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(abs))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabsq_s8(a: int8x16_t) -> int8x16_t {
+ vabsq_s8_(a)
+}
+/// Absolute value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(abs))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabsq_s16(a: int16x8_t) -> int16x8_t {
+ vabsq_s16_(a)
+}
+/// Absolute value (wrapping).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vabs))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(abs))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vabsq_s32(a: int32x4_t) -> int32x4_t {
+ vabsq_s32_(a)
+}
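+
+// A minimal usage sketch (editor's illustration, not upstream code):
+// "wrapping" means the absolute value of the most negative integer wraps
+// around instead of panicking, e.g. `|i8::MIN|` stays `i8::MIN`.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vabs_s8_sketch() {
+    let v = vdup_n_s8(i8::MIN);
+    let r = vabs_s8(v);
+    assert_eq!(vget_lane_s8::<0>(r), i8::MIN); // wraps, no overflow check
+}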
+
+/// Add pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ vpadd_s16_(a, b)
+}
+/// Add pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ vpadd_s32_(a, b)
+}
+/// Add pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ vpadd_s8_(a, b)
+}
+/// Add pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ transmute(vpadd_s16_(transmute(a), transmute(b)))
+}
+/// Add pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ transmute(vpadd_s32_(transmute(a), transmute(b)))
+}
+/// Add pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vpadd_s8_(transmute(a), transmute(b)))
+}
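+
+// A minimal usage sketch (editor's illustration, not upstream code):
+// pairwise add sums adjacent lane pairs, placing the sums from `a` in the
+// low lanes of the result and the sums from `b` in the high lanes. Assumes
+// `i16x4`, `transmute`, and `vget_lane_s16` are in scope, as elsewhere in
+// this module.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vpadd_s16_sketch() {
+    let a: int16x4_t = transmute(i16x4::new(1, 2, 3, 4));
+    let b: int16x4_t = transmute(i16x4::new(10, 20, 30, 40));
+    let r = vpadd_s16(a, b); // [1+2, 3+4, 10+20, 30+40]
+    assert_eq!(vget_lane_s16::<0>(r), 3);
+    assert_eq!(vget_lane_s16::<3>(r), 70);
+}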
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ simd_add(a, b)
+}
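+
+// `simd_add` is a wrapping (two's-complement) addition: lane overflow does
+// not panic, even in debug builds.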
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fadd))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ simd_add(a, b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fadd))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+ simd_add(a, b)
+}
+
+/// Signed Add Long (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t {
+ let a: int16x8_t = simd_cast(a);
+ let b: int16x8_t = simd_cast(b);
+ simd_add(a, b)
+}
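+
+// A minimal usage sketch (editor's illustration, not upstream code): the
+// long form widens both inputs before adding, so the sum cannot overflow the
+// narrow element type. Assumes `vdup_n_s8` and `vgetq_lane_s16` are in
+// scope.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[allow(dead_code)]
+#[target_feature(enable = "neon")]
+unsafe fn vaddl_s8_sketch() {
+    let a = vdup_n_s8(i8::MAX);
+    let b = vdup_n_s8(1);
+    let r = vaddl_s8(a, b); // widened lanes: 127 + 1 = 128, no wrap
+    assert_eq!(vgetq_lane_s16::<0>(r), 128);
+}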
+
+/// Signed Add Long (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+ let a: int32x4_t = simd_cast(a);
+ let b: int32x4_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Long (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+ let a: int64x2_t = simd_cast(a);
+ let b: int64x2_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Long (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
+ let a: uint16x8_t = simd_cast(a);
+ let b: uint16x8_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Long (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
+ let a: uint32x4_t = simd_cast(a);
+ let b: uint32x4_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Long (vector).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
+ let a: uint64x2_t = simd_cast(a);
+ let b: uint64x2_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Long (vector, high half).
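+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdupq_n_s8(100);
+/// let b = vdupq_n_s8(100);
+/// let r: int16x8_t = vaddl_high_s8(a, b); // upper 8 lanes widen and add: 200
+/// ```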
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
+ let a: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let a: int16x8_t = simd_cast(a);
+ let b: int16x8_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Long (vector, high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+ let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let a: int32x4_t = simd_cast(a);
+ let b: int32x4_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Long (vector, high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+ let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let a: int64x2_t = simd_cast(a);
+ let b: int64x2_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Long (vector, high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
+ let a: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let a: uint16x8_t = simd_cast(a);
+ let b: uint16x8_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Long (vector, high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
+ let a: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+ let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let a: uint32x4_t = simd_cast(a);
+ let b: uint32x4_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Long (vector, high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
+ let a: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+ let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let a: uint64x2_t = simd_cast(a);
+ let b: uint64x2_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Wide.
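+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdupq_n_s16(1000);
+/// let b = vdup_n_s8(20);
+/// let r = vaddw_s8(a, b); // `b` is widened to i16, so each lane is 1020
+/// ```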
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t {
+ let b: int16x8_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Wide.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t {
+ let b: int32x4_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Wide.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t {
+ let b: int64x2_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Wide.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t {
+ let b: uint16x8_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Wide.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t {
+ let b: uint32x4_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Wide.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t {
+ let b: uint64x2_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Wide (high half).
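+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdupq_n_s16(1000);
+/// let b = vdupq_n_s8(20);
+/// let r = vaddw_high_s8(a, b); // the widened upper 8 lanes of `b` are added: 1020
+/// ```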
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
+ let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let b: int16x8_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Wide (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
+ let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let b: int32x4_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Signed Add Wide (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
+ let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let b: int64x2_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Wide (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
+ let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+ let b: uint16x8_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Wide (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
+ let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+ let b: uint32x4_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Unsigned Add Wide (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
+ let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+ let b: uint64x2_t = simd_cast(b);
+ simd_add(a, b)
+}
+
+/// Add returning High Narrow.
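+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdupq_n_s16(0x1234);
+/// let b = vdupq_n_s16(0x1111);
+/// let r = vaddhn_s16(a, b); // (0x1234 + 0x1111) >> 8 == 0x23 in every i8 lane
+/// ```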
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t {
+ simd_cast(simd_shr(simd_add(a, b), int16x8_t(8, 8, 8, 8, 8, 8, 8, 8)))
+}
+
+/// Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t {
+ simd_cast(simd_shr(simd_add(a, b), int32x4_t(16, 16, 16, 16)))
+}
+
+/// Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t {
+ simd_cast(simd_shr(simd_add(a, b), int64x2_t(32, 32)))
+}
+
+/// Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
+ simd_cast(simd_shr(simd_add(a, b), uint16x8_t(8, 8, 8, 8, 8, 8, 8, 8)))
+}
+
+/// Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t {
+ simd_cast(simd_shr(simd_add(a, b), uint32x4_t(16, 16, 16, 16)))
+}
+
+/// Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
+ simd_cast(simd_shr(simd_add(a, b), uint64x2_t(32, 32)))
+}
+
+/// Add returning High Narrow (high half).
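+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let r = vdup_n_s8(0x11);
+/// let a = vdupq_n_s16(0x1234);
+/// let b = vdupq_n_s16(0x1111);
+/// let x = vaddhn_high_s16(r, a, b); // low 8 lanes are 0x11 from `r`, high 8 lanes 0x23
+/// ```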
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t {
+ let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t(8, 8, 8, 8, 8, 8, 8, 8)));
+ simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t {
+ let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t(16, 16, 16, 16)));
+ simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t {
+ let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t(32, 32)));
+ simd_shuffle4!(r, x, [0, 1, 2, 3])
+}
+
+/// Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t {
+ let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t(8, 8, 8, 8, 8, 8, 8, 8)));
+ simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t {
+ let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t(16, 16, 16, 16)));
+ simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t {
+ let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t(32, 32)));
+ simd_shuffle4!(r, x, [0, 1, 2, 3])
+}
+
+/// Rounding Add returning High Narrow.
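+///
+/// A minimal sketch showing the rounding step (illustrative, not a doc-test;
+/// assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdupq_n_s16(0x0070);
+/// let b = vdupq_n_s16(0x0010);
+/// let r = vraddhn_s16(a, b); // (0x80 + 0x80) >> 8 == 1; plain vaddhn_s16 would give 0
+/// ```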
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t {
+ vraddhn_s16_(a, b)
+}
+
+/// Rounding Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t {
+ vraddhn_s32_(a, b)
+}
+
+/// Rounding Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i64))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t {
+ vraddhn_s64_(a, b)
+}
+
+/// Rounding Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
+ transmute(vraddhn_s16_(transmute(a), transmute(b)))
+}
+
+/// Rounding Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t {
+ transmute(vraddhn_s32_(transmute(a), transmute(b)))
+}
+
+/// Rounding Add returning High Narrow.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i64))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
+ transmute(vraddhn_s64_(transmute(a), transmute(b)))
+}
+
+/// Rounding Add returning High Narrow (high half).
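+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let r = vdup_n_s8(7);
+/// let a = vdupq_n_s16(0x0070);
+/// let b = vdupq_n_s16(0x0010);
+/// let x = vraddhn_high_s16(r, a, b); // low half from `r`, each high lane rounds to 1
+/// ```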
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t {
+ let x = vraddhn_s16_(a, b);
+ simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Rounding Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t {
+ let x = vraddhn_s32_(a, b);
+ simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Rounding Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i64))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t {
+ let x = vraddhn_s64_(a, b);
+ simd_shuffle4!(r, x, [0, 1, 2, 3])
+}
+
+/// Rounding Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t {
+ let x: uint8x8_t = transmute(vraddhn_s16_(transmute(a), transmute(b)));
+ simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Rounding Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t {
+ let x: uint16x4_t = transmute(vraddhn_s32_(transmute(a), transmute(b)));
+ simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Rounding Add returning High Narrow (high half).
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i64))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vraddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t {
+ let x: uint32x2_t = transmute(vraddhn_s64_(transmute(a), transmute(b)));
+ simd_shuffle4!(r, x, [0, 1, 2, 3])
+}
+
+/// Signed Add Long Pairwise.
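+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdup_n_s8(100);
+/// let r = vpaddl_s8(a); // adjacent i8 pairs sum into i16 lanes: 200 in all 4 lanes
+/// ```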
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddl_s8(a: int8x8_t) -> int16x4_t {
+ vpaddl_s8_(a)
+}
+
+/// Signed Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddl_s16(a: int16x4_t) -> int32x2_t {
+ vpaddl_s16_(a)
+}
+
+/// Signed Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddl_s32(a: int32x2_t) -> int64x1_t {
+ vpaddl_s32_(a)
+}
+
+/// Signed Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddlq_s8(a: int8x16_t) -> int16x8_t {
+ vpaddlq_s8_(a)
+}
+
+/// Signed Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddlq_s16(a: int16x8_t) -> int32x4_t {
+ vpaddlq_s16_(a)
+}
+
+/// Signed Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddlq_s32(a: int32x4_t) -> int64x2_t {
+ vpaddlq_s32_(a)
+}
+
+/// Unsigned Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddl_u8(a: uint8x8_t) -> uint16x4_t {
+ vpaddl_u8_(a)
+}
+
+/// Unsigned Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddl_u16(a: uint16x4_t) -> uint32x2_t {
+ vpaddl_u16_(a)
+}
+
+/// Unsigned Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddl_u32(a: uint32x2_t) -> uint64x1_t {
+ vpaddl_u32_(a)
+}
+
+/// Unsigned Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddlq_u8(a: uint8x16_t) -> uint16x8_t {
+ vpaddlq_u8_(a)
+}
+
+/// Unsigned Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddlq_u16(a: uint16x8_t) -> uint32x4_t {
+ vpaddlq_u16_(a)
+}
+
+/// Unsigned Add Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpaddlq_u32(a: uint32x4_t) -> uint64x2_t {
+ vpaddlq_u32_(a)
+}
+
+/// Vector narrow integer.
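+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdupq_n_s16(0x1234);
+/// let r = vmovn_s16(a); // each lane truncates to its low byte: 0x34
+/// ```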
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovn_s16(a: int16x8_t) -> int8x8_t {
+ simd_cast(a)
+}
+
+/// Vector narrow integer.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovn_s32(a: int32x4_t) -> int16x4_t {
+ simd_cast(a)
+}
+
+/// Vector narrow integer.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovn_s64(a: int64x2_t) -> int32x2_t {
+ simd_cast(a)
+}
+
+/// Vector narrow integer.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovn_u16(a: uint16x8_t) -> uint8x8_t {
+ simd_cast(a)
+}
+
+/// Vector narrow integer.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovn_u32(a: uint32x4_t) -> uint16x4_t {
+ simd_cast(a)
+}
+
+/// Vector narrow integer.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovn_u64(a: uint64x2_t) -> uint32x2_t {
+ simd_cast(a)
+}
+
+/// Vector long move.
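+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdup_n_s8(-1);
+/// let r = vmovl_s8(a); // sign-extends each lane to i16: -1
+/// ```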
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovl_s8(a: int8x8_t) -> int16x8_t {
+ simd_cast(a)
+}
+
+/// Vector long move.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovl_s16(a: int16x4_t) -> int32x4_t {
+ simd_cast(a)
+}
+
+/// Vector long move.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovl_s32(a: int32x2_t) -> int64x2_t {
+ simd_cast(a)
+}
+
+/// Vector long move.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovl_u8(a: uint8x8_t) -> uint16x8_t {
+ simd_cast(a)
+}
+
+/// Vector long move.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovl_u16(a: uint16x4_t) -> uint32x4_t {
+ simd_cast(a)
+}
+
+/// Vector long move.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovl_u32(a: uint32x2_t) -> uint64x2_t {
+ simd_cast(a)
+}
+
+/// Vector bitwise not.
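+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdup_n_s8(0);
+/// let r = vmvn_s8(a); // all bits flip: -1 (0xFF) in every lane
+/// ```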
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvn_s8(a: int8x8_t) -> int8x8_t {
+ let b = int8x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvnq_s8(a: int8x16_t) -> int8x16_t {
+ let b = int8x16_t(
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ );
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvn_s16(a: int16x4_t) -> int16x4_t {
+ let b = int16x4_t(-1, -1, -1, -1);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvnq_s16(a: int16x8_t) -> int16x8_t {
+ let b = int16x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvn_s32(a: int32x2_t) -> int32x2_t {
+ let b = int32x2_t(-1, -1);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvnq_s32(a: int32x4_t) -> int32x4_t {
+ let b = int32x4_t(-1, -1, -1, -1);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvn_u8(a: uint8x8_t) -> uint8x8_t {
+ let b = uint8x8_t(255, 255, 255, 255, 255, 255, 255, 255);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvnq_u8(a: uint8x16_t) -> uint8x16_t {
+ let b = uint8x16_t(
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ );
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvn_u16(a: uint16x4_t) -> uint16x4_t {
+ let b = uint16x4_t(65_535, 65_535, 65_535, 65_535);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvnq_u16(a: uint16x8_t) -> uint16x8_t {
+ let b = uint16x8_t(
+ 65_535, 65_535, 65_535, 65_535, 65_535, 65_535, 65_535, 65_535,
+ );
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvn_u32(a: uint32x2_t) -> uint32x2_t {
+ let b = uint32x2_t(4_294_967_295, 4_294_967_295);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvnq_u32(a: uint32x4_t) -> uint32x4_t {
+ let b = uint32x4_t(4_294_967_295, 4_294_967_295, 4_294_967_295, 4_294_967_295);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvn_p8(a: poly8x8_t) -> poly8x8_t {
+ let b = poly8x8_t(255, 255, 255, 255, 255, 255, 255, 255);
+ simd_xor(a, b)
+}
+
+/// Vector bitwise not.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmvn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mvn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmvnq_p8(a: poly8x16_t) -> poly8x16_t {
+ let b = poly8x16_t(
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ );
+ simd_xor(a, b)
+}
+
+/// Vector bitwise bit clear.
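+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available):
+///
+/// ```ignore
+/// let a = vdup_n_s8(0b0110);
+/// let b = vdup_n_s8(0b0011);
+/// let r = vbic_s8(a, b); // a & !b == 0b0100 in every lane
+/// ```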
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbic_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ let c = int8x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_and(simd_xor(b, c), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbicq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ let c = int8x16_t(
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ );
+ simd_and(simd_xor(b, c), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbic_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ let c = int16x4_t(-1, -1, -1, -1);
+ simd_and(simd_xor(b, c), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbicq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ let c = int16x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_and(simd_xor(b, c), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbic_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ let c = int32x2_t(-1, -1);
+ simd_and(simd_xor(b, c), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbicq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ let c = int32x4_t(-1, -1, -1, -1);
+ simd_and(simd_xor(b, c), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbic_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ let c = int64x1_t(-1);
+ simd_and(simd_xor(b, c), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbicq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ let c = int64x2_t(-1, -1);
+ simd_and(simd_xor(b, c), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbic_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ let c = int8x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_and(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbicq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ let c = int8x16_t(
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ );
+ simd_and(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbic_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ let c = int16x4_t(-1, -1, -1, -1);
+ simd_and(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbicq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ let c = int16x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_and(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbic_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ let c = int32x2_t(-1, -1);
+ simd_and(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbicq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ let c = int32x4_t(-1, -1, -1, -1);
+ simd_and(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbic_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ let c = int64x1_t(-1);
+ simd_and(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise bit clear.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbic))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bic))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbicq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ let c = int64x2_t(-1, -1);
+ simd_and(simd_xor(b, transmute(c)), a)
+}
+
+/// Bitwise Select.
+///
+/// This instruction sets each bit in the destination SIMD&FP register to the
+/// corresponding bit from the first source SIMD&FP register when the original
+/// destination bit was 1, otherwise from the second source SIMD&FP register.
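+///
+/// A minimal sketch (illustrative, not a doc-test; assumes NEON is available).
+/// Since this implementation routes the mask through `simd_select`, each mask
+/// lane is expected to be all-zeros or all-ones, as produced by comparison
+/// intrinsics such as `vceq_s8`:
+///
+/// ```ignore
+/// let mask = vdup_n_u8(0xFF); // an all-ones lane mask, e.g. from vceq_s8
+/// let b = vdup_n_s8(1);
+/// let c = vdup_n_s8(2);
+/// let r = vbsl_s8(mask, b, c); // every lane selected from `b`: 1
+/// ```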
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+ simd_select(transmute::<_, int8x8_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+ simd_select(transmute::<_, int16x4_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+ simd_select(transmute::<_, int32x2_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t {
+ simd_select(transmute::<_, int64x1_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+ simd_select(transmute::<_, int8x8_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
+ simd_select(transmute::<_, int16x4_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
+ simd_select(transmute::<_, int32x2_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t) -> uint64x1_t {
+ simd_select(transmute::<_, int64x1_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_f32(a: uint32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+ simd_select(transmute::<_, int32x2_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_p8(a: uint8x8_t, b: poly8x8_t, c: poly8x8_t) -> poly8x8_t {
+ simd_select(transmute::<_, int8x8_t>(a), b, c)
+}
+
+/// Bitwise Select.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbsl_p16(a: uint16x4_t, b: poly16x4_t, c: poly16x4_t) -> poly16x4_t {
+ simd_select(transmute::<_, int16x4_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t {
+ simd_select(transmute::<_, int8x16_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+ simd_select(transmute::<_, int16x8_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+ simd_select(transmute::<_, int32x4_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t {
+ simd_select(transmute::<_, int64x2_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
+ simd_select(transmute::<_, int8x16_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+ simd_select(transmute::<_, int16x8_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+ simd_select(transmute::<_, int32x4_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+ simd_select(transmute::<_, int64x2_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_p8(a: uint8x16_t, b: poly8x16_t, c: poly8x16_t) -> poly8x16_t {
+ simd_select(transmute::<_, int8x16_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_p16(a: uint16x8_t, b: poly16x8_t, c: poly16x8_t) -> poly16x8_t {
+ simd_select(transmute::<_, int16x8_t>(a), b, c)
+}
+
+/// Bitwise Select. (128-bit)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(bsl))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vbslq_f32(a: uint32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+ simd_select(transmute::<_, int32x4_t>(a), b, c)
+}
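+
+// Editorial sketch, not upstream code: the `vbsl*` family implements bitwise
+// select, `(mask & b) | (!mask & c)`. The hardware BSL instruction works per
+// bit, but the `simd_select` lowering above expects boolean lane masks (each
+// lane all-ones or all-zeros), which is what comparison intrinsics such as
+// `vcgtq_u32` produce. The test name below is our own.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vbsl_semantics_sketch() {
+    unsafe {
+        let mask = vcgtq_u32(vdupq_n_u32(5), vdupq_n_u32(3)); // all-ones lanes
+        let r = vbslq_u32(mask, vdupq_n_u32(1), vdupq_n_u32(2));
+        assert_eq!(vgetq_lane_u32::<0>(r), 1); // mask set, so lanes come from `b`
+    }
+}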
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vorn_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ let c = int8x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_or(simd_xor(b, c), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vornq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ let c = int8x16_t(
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ );
+ simd_or(simd_xor(b, c), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vorn_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ let c = int16x4_t(-1, -1, -1, -1);
+ simd_or(simd_xor(b, c), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vornq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ let c = int16x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_or(simd_xor(b, c), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vorn_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ let c = int32x2_t(-1, -1);
+ simd_or(simd_xor(b, c), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vornq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ let c = int32x4_t(-1, -1, -1, -1);
+ simd_or(simd_xor(b, c), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vorn_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ let c = int64x1_t(-1);
+ simd_or(simd_xor(b, c), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vornq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ let c = int64x2_t(-1, -1);
+ simd_or(simd_xor(b, c), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vorn_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ let c = int8x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_or(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vornq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ let c = int8x16_t(
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ );
+ simd_or(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vorn_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ let c = int16x4_t(-1, -1, -1, -1);
+ simd_or(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vornq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ let c = int16x8_t(-1, -1, -1, -1, -1, -1, -1, -1);
+ simd_or(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vorn_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ let c = int32x2_t(-1, -1);
+ simd_or(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vornq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ let c = int32x4_t(-1, -1, -1, -1);
+ simd_or(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vorn_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ let c = int64x1_t(-1);
+ simd_or(simd_xor(b, transmute(c)), a)
+}
+
+/// Vector bitwise inclusive OR NOT
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(orn))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vornq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ let c = int64x2_t(-1, -1);
+ simd_or(simd_xor(b, transmute(c)), a)
+}
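+
+// Editorial sketch, not upstream code: `vorn*` computes `a | !b` lane-wise,
+// matching the constant-splat implementation above (`b ^ -1` is `!b`).
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vorn_semantics_sketch() {
+    unsafe {
+        let a = vdup_n_u8(0b0000_1111);
+        let b = vdup_n_u8(0b0011_0011);
+        // !b == 0b1100_1100, so a | !b == 0b1100_1111.
+        assert_eq!(vget_lane_u8::<0>(vorn_u8(a, b)), 0b1100_1111);
+    }
+}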
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ vpmins_v8i8(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ vpmins_v4i16(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ vpmins_v2i32(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ vpminu_v8i8(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ vpminu_v4i16(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ vpminu_v2i32(a, b)
+}
+
+/// Folding minimum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ vpminf_v2f32(a, b)
+}
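+
+// Editorial sketch, not upstream code: pairwise minimum concatenates the two
+// inputs and takes the minimum of each adjacent pair, so `vpmin_s8(a, b)`
+// yields `[min(a0,a1), .., min(a6,a7), min(b0,b1), .., min(b6,b7)]`.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vpmin_semantics_sketch() {
+    unsafe {
+        let a: [i8; 8] = [1, 9, 2, 8, 3, 7, 4, 6];
+        let v = vld1_s8(a.as_ptr());
+        let r = vpmin_s8(v, v); // [1, 2, 3, 4, 1, 2, 3, 4]
+        assert_eq!(vget_lane_s8::<0>(r), 1);
+        assert_eq!(vget_lane_s8::<4>(r), 1);
+    }
+}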
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ vpmaxs_v8i8(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ vpmaxs_v4i16(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ vpmaxs_v2i32(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ vpmaxu_v8i8(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ vpmaxu_v4i16(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ vpmaxu_v2i32(a, b)
+}
+
+/// Folding maximum of adjacent pairs
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+ vpmaxf_v2f32(a, b)
+}
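+
+// Editorial sketch, not upstream code: `vpmax*` mirrors `vpmin*` above but
+// keeps the larger element of each adjacent pair.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vpmax_semantics_sketch() {
+    unsafe {
+        let a: [i8; 8] = [1, 9, 2, 8, 3, 7, 4, 6];
+        let r = vpmax_s8(vld1_s8(a.as_ptr()), vld1_s8(a.as_ptr()));
+        assert_eq!(vget_lane_s8::<0>(r), 9); // max(1, 9)
+    }
+}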
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_u64<const IMM5: i32>(v: uint64x2_t) -> u64 {
+ static_assert_imm1!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_u64<const IMM5: i32>(v: uint64x1_t) -> u64 {
+ static_assert!(IMM5 : i32 where IMM5 == 0);
+ simd_extract(v, 0)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_u16<const IMM5: i32>(v: uint16x4_t) -> u16 {
+ static_assert_imm2!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_s16<const IMM5: i32>(v: int16x4_t) -> i16 {
+ static_assert_imm2!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_p16<const IMM5: i32>(v: poly16x4_t) -> p16 {
+ static_assert_imm2!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_u32<const IMM5: i32>(v: uint32x2_t) -> u32 {
+ static_assert_imm1!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_s32<const IMM5: i32>(v: int32x2_t) -> i32 {
+ static_assert_imm1!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_f32<const IMM5: i32>(v: float32x2_t) -> f32 {
+ static_assert_imm1!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_f32<const IMM5: i32>(v: float32x4_t) -> f32 {
+ static_assert_imm2!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_p64<const IMM5: i32>(v: poly64x1_t) -> p64 {
+ static_assert!(IMM5 : i32 where IMM5 == 0);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_p64<const IMM5: i32>(v: poly64x2_t) -> p64 {
+ static_assert_imm1!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_s64<const IMM5: i32>(v: int64x1_t) -> i64 {
+ static_assert!(IMM5 : i32 where IMM5 == 0);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_s64<const IMM5: i32>(v: int64x2_t) -> i64 {
+ static_assert_imm1!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_u16<const IMM5: i32>(v: uint16x8_t) -> u16 {
+ static_assert_imm3!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_u32<const IMM5: i32>(v: uint32x4_t) -> u32 {
+ static_assert_imm2!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_s16<const IMM5: i32>(v: int16x8_t) -> i16 {
+ static_assert_imm3!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_p16<const IMM5: i32>(v: poly16x8_t) -> p16 {
+ static_assert_imm3!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_s32<const IMM5: i32>(v: int32x4_t) -> i32 {
+ static_assert_imm2!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_u8<const IMM5: i32>(v: uint8x8_t) -> u8 {
+ static_assert_imm3!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_s8<const IMM5: i32>(v: int8x8_t) -> i8 {
+ static_assert_imm3!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_lane_p8<const IMM5: i32>(v: poly8x8_t) -> p8 {
+ static_assert_imm3!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_u8<const IMM5: i32>(v: uint8x16_t) -> u8 {
+ static_assert_imm4!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_s8<const IMM5: i32>(v: int8x16_t) -> i8 {
+ static_assert_imm4!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
+
+/// Move vector element to general-purpose register
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_legacy_const_generics(1)]
+#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vgetq_lane_p8<const IMM5: i32>(v: poly8x16_t) -> p8 {
+ static_assert_imm4!(IMM5);
+ simd_extract(v, IMM5 as u32)
+}
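+
+// Editorial sketch, not upstream code: the lane index of `vget_lane_*` and
+// `vgetq_lane_*` is a const generic, so an out-of-range index is rejected at
+// compile time by the `static_assert*` macros above rather than at run time.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vget_lane_sketch() {
+    unsafe {
+        let v = vdupq_n_u16(7);
+        assert_eq!(vgetq_lane_u16::<3>(v), 7); // index must be 0..=7 here
+        // vgetq_lane_u16::<8>(v) would fail to compile.
+    }
+}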
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_s8(a: int8x16_t) -> int8x8_t {
+ simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_s16(a: int16x8_t) -> int16x4_t {
+ simd_shuffle4!(a, a, [4, 5, 6, 7])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_s32(a: int32x4_t) -> int32x2_t {
+ simd_shuffle2!(a, a, [2, 3])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_s64(a: int64x2_t) -> int64x1_t {
+ int64x1_t(simd_extract(a, 1))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_u8(a: uint8x16_t) -> uint8x8_t {
+ simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_u16(a: uint16x8_t) -> uint16x4_t {
+ simd_shuffle4!(a, a, [4, 5, 6, 7])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_u32(a: uint32x4_t) -> uint32x2_t {
+ simd_shuffle2!(a, a, [2, 3])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_u64(a: uint64x2_t) -> uint64x1_t {
+ uint64x1_t(simd_extract(a, 1))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_p8(a: poly8x16_t) -> poly8x8_t {
+ simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_p16(a: poly16x8_t) -> poly16x4_t {
+ simd_shuffle4!(a, a, [4, 5, 6, 7])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
+ simd_shuffle2!(a, a, [2, 3])
+}
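+
+// Editorial sketch, not upstream code: `vget_high_*` returns the upper half
+// of a 128-bit vector as a 64-bit vector (lanes n/2..n of the input).
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vget_high_sketch() {
+    unsafe {
+        let a: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+        let hi = vget_high_u8(vld1q_u8(a.as_ptr()));
+        assert_eq!(vget_lane_u8::<0>(hi), 8);
+    }
+}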
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "vget_low_s8", since = "1.60.0")
+)]
+pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
+ simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
+ simd_shuffle4!(a, a, [0, 1, 2, 3])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_s32(a: int32x4_t) -> int32x2_t {
+ simd_shuffle2!(a, a, [0, 1])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_s64(a: int64x2_t) -> int64x1_t {
+ int64x1_t(simd_extract(a, 0))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
+ simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
+ simd_shuffle4!(a, a, [0, 1, 2, 3])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_u32(a: uint32x4_t) -> uint32x2_t {
+ simd_shuffle2!(a, a, [0, 1])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_u64(a: uint64x2_t) -> uint64x1_t {
+ uint64x1_t(simd_extract(a, 0))
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
+ simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
+ simd_shuffle4!(a, a, [0, 1, 2, 3])
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vget_low_f32(a: float32x4_t) -> float32x2_t {
+ simd_shuffle2!(a, a, [0, 1])
+}
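+
+// Editorial sketch, not upstream code: `vget_low_*` is the counterpart of
+// `vget_high_*`, keeping lanes 0..n/2; on AArch64 the low half is the same
+// register, hence the `nop` in the assert_instr attributes above.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vget_low_sketch() {
+    unsafe {
+        let a: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+        let lo = vget_low_u8(vld1q_u8(a.as_ptr()));
+        assert_eq!(vget_lane_u8::<7>(lo), 7);
+    }
+}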
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_s8(value: i8) -> int8x16_t {
+ int8x16_t(
+ value, value, value, value, value, value, value, value, value, value, value, value, value,
+ value, value, value,
+ )
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_s16(value: i16) -> int16x8_t {
+ int16x8_t(value, value, value, value, value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_s32(value: i32) -> int32x4_t {
+ int32x4_t(value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_s64(value: i64) -> int64x2_t {
+ int64x2_t(value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_u8(value: u8) -> uint8x16_t {
+ uint8x16_t(
+ value, value, value, value, value, value, value, value, value, value, value, value, value,
+ value, value, value,
+ )
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_u16(value: u16) -> uint16x8_t {
+ uint16x8_t(value, value, value, value, value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_u32(value: u32) -> uint32x4_t {
+ uint32x4_t(value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_u64(value: u64) -> uint64x2_t {
+ uint64x2_t(value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_p8(value: p8) -> poly8x16_t {
+ poly8x16_t(
+ value, value, value, value, value, value, value, value, value, value, value, value, value,
+ value, value, value,
+ )
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_p16(value: p16) -> poly16x8_t {
+ poly16x8_t(value, value, value, value, value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdupq_n_f32(value: f32) -> float32x4_t {
+ float32x4_t(value, value, value, value)
+}
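+
+// Editorial sketch, not upstream code: `vdupq_n_*` broadcasts one scalar to
+// every lane of a 128-bit vector.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vdupq_n_sketch() {
+    unsafe {
+        let v = vdupq_n_f32(1.5);
+        assert_eq!(vgetq_lane_f32::<2>(v), 1.5);
+    }
+}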
+
+/// Duplicate vector element to vector or scalar
+///
+/// Private vfp4 version used by FMA intrinsics because LLVM does
+/// not inline the non-vfp4 version into vfp4 functions.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+unsafe fn vdupq_n_f32_vfp4(value: f32) -> float32x4_t {
+ float32x4_t(value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_s8(value: i8) -> int8x8_t {
+ int8x8_t(value, value, value, value, value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_s16(value: i16) -> int16x4_t {
+ int16x4_t(value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_s32(value: i32) -> int32x2_t {
+ int32x2_t(value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_s64(value: i64) -> int64x1_t {
+ int64x1_t(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_u8(value: u8) -> uint8x8_t {
+ uint8x8_t(value, value, value, value, value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_u16(value: u16) -> uint16x4_t {
+ uint16x4_t(value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_u32(value: u32) -> uint32x2_t {
+ uint32x2_t(value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_u64(value: u64) -> uint64x1_t {
+ uint64x1_t(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_p8(value: p8) -> poly8x8_t {
+ poly8x8_t(value, value, value, value, value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_p16(value: p16) -> poly16x4_t {
+ poly16x4_t(value, value, value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vdup_n_f32(value: f32) -> float32x2_t {
+ float32x2_t(value, value)
+}
+
+/// Duplicate vector element to vector or scalar
+///
+/// Private vfp4 version used by FMA intrinsics because LLVM does
+/// not inline the non-vfp4 version into vfp4 functions.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "vfp4"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+unsafe fn vdup_n_f32_vfp4(value: f32) -> float32x2_t {
+ float32x2_t(value, value)
+}
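+
+// Editorial sketch, not upstream code: the `vdup_n_*` variants behave the
+// same way as `vdupq_n_*` but over 64-bit vectors.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn vdup_n_sketch() {
+    unsafe {
+        let v = vdup_n_u32(42);
+        assert_eq!(vget_lane_u32::<1>(v), 42);
+    }
+}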
+
+/// Load SIMD&FP register (immediate offset)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vldrq_p128(a: *const p128) -> p128 {
+ *a
+}
+
+/// Store SIMD&FP register (immediate offset)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vstrq_p128(a: *mut p128, b: p128) {
+ *a = b;
+}
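+
+// Editorial sketch, not upstream code: `vldrq_p128`/`vstrq_p128` round-trip
+// a `p128` through memory; the caller must supply a valid, suitably aligned
+// pointer.
+#[cfg(all(test, target_arch = "aarch64"))]
+#[test]
+fn p128_roundtrip_sketch() {
+    unsafe {
+        let mut slot: p128 = 0;
+        vstrq_p128(&mut slot, 0x0123_4567_89ab_cdef_0011_2233_4455_6677);
+        assert_eq!(vldrq_p128(&slot), 0x0123_4567_89ab_cdef_0011_2233_4455_6677);
+    }
+}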
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_s8(value: i8) -> int8x8_t {
+ vdup_n_s8(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_s16(value: i16) -> int16x4_t {
+ vdup_n_s16(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_s32(value: i32) -> int32x2_t {
+ vdup_n_s32(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_s64(value: i64) -> int64x1_t {
+ vdup_n_s64(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_u8(value: u8) -> uint8x8_t {
+ vdup_n_u8(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_u16(value: u16) -> uint16x4_t {
+ vdup_n_u16(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_u32(value: u32) -> uint32x2_t {
+ vdup_n_u32(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_u64(value: u64) -> uint64x1_t {
+ vdup_n_u64(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_p8(value: p8) -> poly8x8_t {
+ vdup_n_p8(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_p16(value: p16) -> poly16x4_t {
+ vdup_n_p16(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmov_n_f32(value: f32) -> float32x2_t {
+ vdup_n_f32(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_s8(value: i8) -> int8x16_t {
+ vdupq_n_s8(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_s16(value: i16) -> int16x8_t {
+ vdupq_n_s16(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_s32(value: i32) -> int32x4_t {
+ vdupq_n_s32(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_s64(value: i64) -> int64x2_t {
+ vdupq_n_s64(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_u8(value: u8) -> uint8x16_t {
+ vdupq_n_u8(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_u16(value: u16) -> uint16x8_t {
+ vdupq_n_u16(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_u32(value: u32) -> uint32x4_t {
+ vdupq_n_u32(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_u64(value: u64) -> uint64x2_t {
+ vdupq_n_u64(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_p8(value: p8) -> poly8x16_t {
+ vdupq_n_p8(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_p16(value: p16) -> poly16x8_t {
+ vdupq_n_p16(value)
+}
+
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vmovq_n_f32(value: f32) -> float32x4_t {
+ vdupq_n_f32(value)
+}
+
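+// The `vmov_n_*`/`vmovq_n_*` intrinsics above are thin aliases for the
+// corresponding `vdup_n_*`/`vdupq_n_*` splats: both families broadcast a
+// single scalar into every lane. A minimal usage sketch (hypothetical values;
+// assumes a NEON-capable target and an `unsafe` context):
+//
+//     let v: uint8x16_t = vmovq_n_u8(0xff);      // all 16 lanes == 0xff
+//     assert_eq!(vgetq_lane_u8::<0>(v), 0xff);
+//     assert_eq!(vgetq_lane_u8::<15>(v), 0xff);
+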
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("nop", N = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("nop", N = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vext_s64<const N: i32>(a: int64x1_t, _b: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where N == 0);
+ a
+}
+
+/// Extract vector from pair of vectors
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("nop", N = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr("nop", N = 0))]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vext_u64<const N: i32>(a: uint64x1_t, _b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where N == 0);
+ a
+}
+
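+// For these single-lane 64-bit vectors the only valid extract index is N == 0,
+// so `vext_s64`/`vext_u64` degenerate to returning `a` unchanged; the second
+// operand only participates in the wider `vext` variants. Sketch (hypothetical
+// values, `unsafe` context assumed):
+//
+//     let a = vdup_n_u64(1);
+//     let b = vdup_n_u64(2);
+//     let r = vext_u64::<0>(a, b); // r == a; any N != 0 fails the static assert
+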
+/// Population count per byte.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cnt))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vcnt_s8(a: int8x8_t) -> int8x8_t {
+ vcnt_s8_(a)
+}
+/// Population count per byte.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cnt))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vcntq_s8(a: int8x16_t) -> int8x16_t {
+ vcntq_s8_(a)
+}
+/// Population count per byte.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cnt))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vcnt_u8(a: uint8x8_t) -> uint8x8_t {
+ transmute(vcnt_s8_(transmute(a)))
+}
+/// Population count per byte.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cnt))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vcntq_u8(a: uint8x16_t) -> uint8x16_t {
+ transmute(vcntq_s8_(transmute(a)))
+}
+/// Population count per byte.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cnt))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vcnt_p8(a: poly8x8_t) -> poly8x8_t {
+ transmute(vcnt_s8_(transmute(a)))
+}
+/// Population count per byte.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcnt))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(cnt))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vcntq_p8(a: poly8x16_t) -> poly8x16_t {
+ transmute(vcntq_s8_(transmute(a)))
+}
+
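+// `vcnt*` computes a per-byte population count: each output lane holds the
+// number of set bits in the corresponding input byte. Sketch (hypothetical
+// values, `unsafe` context assumed):
+//
+//     let v = vdup_n_u8(0b1011_0001); // four set bits in every byte
+//     let c = vcnt_u8(v);
+//     assert_eq!(vget_lane_u8::<0>(c), 4);
+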
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev16_s8(a: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev16q_s8(a: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev16_u8(a: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev16_p8(a: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32_s8(a: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32q_s8(a: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32_u8(a: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32_s16(a: int16x4_t) -> int16x4_t {
+ simd_shuffle4!(a, a, [1, 0, 3, 2])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32q_s16(a: int16x8_t) -> int16x8_t {
+ simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32_p16(a: poly16x4_t) -> poly16x4_t {
+ simd_shuffle4!(a, a, [1, 0, 3, 2])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t {
+ simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32_u16(a: uint16x4_t) -> uint16x4_t {
+ simd_shuffle4!(a, a, [1, 0, 3, 2])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t {
+ simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32_p8(a: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_s8(a: int8x8_t) -> int8x8_t {
+ simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_s8(a: int8x16_t) -> int8x16_t {
+ simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_s16(a: int16x4_t) -> int16x4_t {
+ simd_shuffle4!(a, a, [3, 2, 1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_s16(a: int16x8_t) -> int16x8_t {
+ simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_s32(a: int32x2_t) -> int32x2_t {
+ simd_shuffle2!(a, a, [1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_s32(a: int32x4_t) -> int32x4_t {
+ simd_shuffle4!(a, a, [1, 0, 3, 2])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_u8(a: uint8x8_t) -> uint8x8_t {
+ simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t {
+ simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_u16(a: uint16x4_t) -> uint16x4_t {
+ simd_shuffle4!(a, a, [3, 2, 1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t {
+ simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_u32(a: uint32x2_t) -> uint32x2_t {
+ simd_shuffle2!(a, a, [1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t {
+ simd_shuffle4!(a, a, [1, 0, 3, 2])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_f32(a: float32x2_t) -> float32x2_t {
+ simd_shuffle2!(a, a, [1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_f32(a: float32x4_t) -> float32x4_t {
+ simd_shuffle4!(a, a, [1, 0, 3, 2])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_p8(a: poly8x8_t) -> poly8x8_t {
+ simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t {
+ simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64_p16(a: poly16x4_t) -> poly16x4_t {
+ simd_shuffle4!(a, a, [3, 2, 1, 0])
+}
+
+/// Reversing vector elements (swap endianness)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vrev64q_p16(a: poly16x8_t) -> poly16x8_t {
+ simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+}
+
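+// The `vrev16`/`vrev32`/`vrev64` families reverse the element order inside
+// each 16-, 32-, or 64-bit chunk of the vector, which is how sub-word
+// endianness swaps are expressed in NEON. Per the shuffles above, for the
+// bytes [0, 1, 2, 3, 4, 5, 6, 7]:
+//
+//     vrev16_u8 -> [1, 0, 3, 2, 5, 4, 7, 6]   // swap adjacent byte pairs
+//     vrev32_u8 -> [3, 2, 1, 0, 7, 6, 5, 4]   // reverse bytes per 32-bit word
+//     vrev64_u8 -> [7, 6, 5, 4, 3, 2, 1, 0]   // reverse bytes per 64-bit word
+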
+/// Signed Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadal_s8(a: int16x4_t, b: int8x8_t) -> int16x4_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadal_s8_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddl_s8_(b), a)
+ }
+}
+
+/// Signed Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadal_s16(a: int32x2_t, b: int16x4_t) -> int32x2_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadal_s16_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddl_s16_(b), a)
+ }
+}
+
+/// Signed Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadal_s32(a: int64x1_t, b: int32x2_t) -> int64x1_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadal_s32_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddl_s32_(b), a)
+ }
+}
+
+/// Signed Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadalq_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadalq_s8_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddlq_s8_(b), a)
+ }
+}
+
+/// Signed Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadalq_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadalq_s16_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddlq_s16_(b), a)
+ }
+}
+
+/// Signed Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadalq_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadalq_s32_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddlq_s32_(b), a)
+ }
+}
+
+/// Unsigned Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadal_u8(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadal_u8_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddl_u8_(b), a)
+ }
+}
+
+/// Unsigned Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadal_u16(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadal_u16_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddl_u16_(b), a)
+ }
+}
+
+/// Unsigned Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadal_u32(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadal_u32_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddl_u32_(b), a)
+ }
+}
+
+/// Unsigned Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u8))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadalq_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadalq_u8_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddlq_u8_(b), a)
+ }
+}
+
+/// Unsigned Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u16))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadalq_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadalq_u16_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddlq_u16_(b), a)
+ }
+}
+
+/// Unsigned Add and Accumulate Long Pairwise.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u32))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))]
+#[cfg_attr(
+ target_arch = "aarch64",
+ stable(feature = "neon_intrinsics", since = "1.59.0")
+)]
+pub unsafe fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
+ #[cfg(target_arch = "arm")]
+ {
+ crate::core_arch::arm::neon::vpadalq_u32_(a, b)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ simd_add(vpaddlq_u32_(b), a)
+ }
+}
+
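+// `vpadal*` widens each adjacent pair of lanes in `b`, adds the pair, and
+// accumulates the sum into the corresponding (twice-as-wide) lane of `a`.
+// Worked sketch for `vpadal_u8` (hypothetical values):
+//
+//     a = [100, 200, 300, 400]                   // uint16x4_t accumulator
+//     b = [1, 2, 3, 4, 5, 6, 7, 8]               // uint8x8_t input
+//     r = [100+1+2, 200+3+4, 300+5+6, 400+7+8]   // = [103, 207, 311, 415]
+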
+/// 8-bit integer matrix multiply-accumulate
+#[inline]
+#[cfg_attr(not(bootstrap), target_feature(enable = "i8mm"))]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smmla))]
+pub unsafe fn vmmlaq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.smmla.v4i32.v16i8")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.smmla.v4i32.v16i8"
+ )]
+ fn vmmlaq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t;
+ }
+ vmmlaq_s32_(a, b, c)
+}
+
+/// 8-bit integer matrix multiply-accumulate
+#[inline]
+#[cfg_attr(not(bootstrap), target_feature(enable = "i8mm"))]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ummla))]
+pub unsafe fn vmmlaq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.ummla.v4i32.v16i8")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.ummla.v4i32.v16i8"
+ )]
+ fn vmmlaq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t;
+ }
+ vmmlaq_u32_(a, b, c)
+}
+
+/// Unsigned and signed 8-bit integer matrix multiply-accumulate
+#[inline]
+#[cfg_attr(not(bootstrap), target_feature(enable = "i8mm"))]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usmmla))]
+pub unsafe fn vusmmlaq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t {
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usmmla.v4i32.v16i8")]
+ #[cfg_attr(
+ target_arch = "aarch64",
+ link_name = "llvm.aarch64.neon.usmmla.v4i32.v16i8"
+ )]
+ fn vusmmlaq_s32_(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t;
+ }
+ vusmmlaq_s32_(a, b, c)
+}
+
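+// The i8mm intrinsics above perform a widening 2x8 by 8x2 matrix
+// multiply-accumulate: `a` holds a 2x2 matrix of 32-bit accumulators, while
+// `b` and `c` hold 2x8 matrices of 8-bit elements in row-major order, with `c`
+// used transposed. Conceptually (a sketch of the semantics, not normative):
+//
+//     a[i][j] += sum over k in 0..8 of b[i][k] * c[j][k]
+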
+#[cfg(test)]
+mod tests {
+ use super::*;
+ #[cfg(target_arch = "aarch64")]
+ use crate::core_arch::aarch64::*;
+ #[cfg(target_arch = "arm")]
+ use crate::core_arch::arm::*;
+ use crate::core_arch::arm_shared::test_support::*;
+ use crate::core_arch::simd::*;
+ use std::{i16, i32, i8, mem::transmute, u16, u32, u8, vec::Vec};
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_s8() {
+ let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let elem: i8 = 42;
+ let e = i8x8::new(0, 1, 2, 3, 4, 5, 6, 42);
+ let r: i8x8 = transmute(vld1_lane_s8::<7>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_s8() {
+ let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let elem: i8 = 42;
+ let e = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42);
+ let r: i8x16 = transmute(vld1q_lane_s8::<15>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_s16() {
+ let a = i16x4::new(0, 1, 2, 3);
+ let elem: i16 = 42;
+ let e = i16x4::new(0, 1, 2, 42);
+ let r: i16x4 = transmute(vld1_lane_s16::<3>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_s16() {
+ let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let elem: i16 = 42;
+ let e = i16x8::new(0, 1, 2, 3, 4, 5, 6, 42);
+ let r: i16x8 = transmute(vld1q_lane_s16::<7>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_s32() {
+ let a = i32x2::new(0, 1);
+ let elem: i32 = 42;
+ let e = i32x2::new(0, 42);
+ let r: i32x2 = transmute(vld1_lane_s32::<1>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_s32() {
+ let a = i32x4::new(0, 1, 2, 3);
+ let elem: i32 = 42;
+ let e = i32x4::new(0, 1, 2, 42);
+ let r: i32x4 = transmute(vld1q_lane_s32::<3>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_s64() {
+ let a = i64x1::new(0);
+ let elem: i64 = 42;
+ let e = i64x1::new(42);
+ let r: i64x1 = transmute(vld1_lane_s64::<0>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_s64() {
+ let a = i64x2::new(0, 1);
+ let elem: i64 = 42;
+ let e = i64x2::new(0, 42);
+ let r: i64x2 = transmute(vld1q_lane_s64::<1>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let elem: u8 = 42;
+ let e = u8x8::new(0, 1, 2, 3, 4, 5, 6, 42);
+ let r: u8x8 = transmute(vld1_lane_u8::<7>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let elem: u8 = 42;
+ let e = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42);
+ let r: u8x16 = transmute(vld1q_lane_u8::<15>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_u16() {
+ let a = u16x4::new(0, 1, 2, 3);
+ let elem: u16 = 42;
+ let e = u16x4::new(0, 1, 2, 42);
+ let r: u16x4 = transmute(vld1_lane_u16::<3>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let elem: u16 = 42;
+ let e = u16x8::new(0, 1, 2, 3, 4, 5, 6, 42);
+ let r: u16x8 = transmute(vld1q_lane_u16::<7>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_u32() {
+ let a = u32x2::new(0, 1);
+ let elem: u32 = 42;
+ let e = u32x2::new(0, 42);
+ let r: u32x2 = transmute(vld1_lane_u32::<1>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_u32() {
+ let a = u32x4::new(0, 1, 2, 3);
+ let elem: u32 = 42;
+ let e = u32x4::new(0, 1, 2, 42);
+ let r: u32x4 = transmute(vld1q_lane_u32::<3>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_u64() {
+ let a = u64x1::new(0);
+ let elem: u64 = 42;
+ let e = u64x1::new(42);
+ let r: u64x1 = transmute(vld1_lane_u64::<0>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_u64() {
+ let a = u64x2::new(0, 1);
+ let elem: u64 = 42;
+ let e = u64x2::new(0, 42);
+ let r: u64x2 = transmute(vld1q_lane_u64::<1>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_p8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let elem: p8 = 42;
+ let e = u8x8::new(0, 1, 2, 3, 4, 5, 6, 42);
+ let r: u8x8 = transmute(vld1_lane_p8::<7>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_p8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let elem: p8 = 42;
+ let e = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42);
+ let r: u8x16 = transmute(vld1q_lane_p8::<15>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_p16() {
+ let a = u16x4::new(0, 1, 2, 3);
+ let elem: p16 = 42;
+ let e = u16x4::new(0, 1, 2, 42);
+ let r: u16x4 = transmute(vld1_lane_p16::<3>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_p16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let elem: p16 = 42;
+ let e = u16x8::new(0, 1, 2, 3, 4, 5, 6, 42);
+ let r: u16x8 = transmute(vld1q_lane_p16::<7>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon,aes")]
+ unsafe fn test_vld1_lane_p64() {
+ let a = u64x1::new(0);
+ let elem: u64 = 42;
+ let e = u64x1::new(42);
+ let r: u64x1 = transmute(vld1_lane_p64::<0>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon,aes")]
+ unsafe fn test_vld1q_lane_p64() {
+ let a = u64x2::new(0, 1);
+ let elem: u64 = 42;
+ let e = u64x2::new(0, 42);
+ let r: u64x2 = transmute(vld1q_lane_p64::<1>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_lane_f32() {
+ let a = f32x2::new(0., 1.);
+ let elem: f32 = 42.;
+ let e = f32x2::new(0., 42.);
+ let r: f32x2 = transmute(vld1_lane_f32::<1>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_lane_f32() {
+ let a = f32x4::new(0., 1., 2., 3.);
+ let elem: f32 = 42.;
+ let e = f32x4::new(0., 1., 2., 42.);
+ let r: f32x4 = transmute(vld1q_lane_f32::<3>(&elem, transmute(a)));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_s8() {
+ let elem: i8 = 42;
+ let e = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let r: i8x8 = transmute(vld1_dup_s8(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_s8() {
+ let elem: i8 = 42;
+ let e = i8x16::new(
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ );
+ let r: i8x16 = transmute(vld1q_dup_s8(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_s16() {
+ let elem: i16 = 42;
+ let e = i16x4::new(42, 42, 42, 42);
+ let r: i16x4 = transmute(vld1_dup_s16(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_s16() {
+ let elem: i16 = 42;
+ let e = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let r: i16x8 = transmute(vld1q_dup_s16(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_s32() {
+ let elem: i32 = 42;
+ let e = i32x2::new(42, 42);
+ let r: i32x2 = transmute(vld1_dup_s32(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_s32() {
+ let elem: i32 = 42;
+ let e = i32x4::new(42, 42, 42, 42);
+ let r: i32x4 = transmute(vld1q_dup_s32(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_s64() {
+ let elem: i64 = 42;
+ let e = i64x1::new(42);
+ let r: i64x1 = transmute(vld1_dup_s64(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_s64() {
+ let elem: i64 = 42;
+ let e = i64x2::new(42, 42);
+ let r: i64x2 = transmute(vld1q_dup_s64(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_u8() {
+ let elem: u8 = 42;
+ let e = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let r: u8x8 = transmute(vld1_dup_u8(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_u8() {
+ let elem: u8 = 42;
+ let e = u8x16::new(
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ );
+ let r: u8x16 = transmute(vld1q_dup_u8(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_u16() {
+ let elem: u16 = 42;
+ let e = u16x4::new(42, 42, 42, 42);
+ let r: u16x4 = transmute(vld1_dup_u16(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_u16() {
+ let elem: u16 = 42;
+ let e = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let r: u16x8 = transmute(vld1q_dup_u16(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_u32() {
+ let elem: u32 = 42;
+ let e = u32x2::new(42, 42);
+ let r: u32x2 = transmute(vld1_dup_u32(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_u32() {
+ let elem: u32 = 42;
+ let e = u32x4::new(42, 42, 42, 42);
+ let r: u32x4 = transmute(vld1q_dup_u32(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_u64() {
+ let elem: u64 = 42;
+ let e = u64x1::new(42);
+ let r: u64x1 = transmute(vld1_dup_u64(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_u64() {
+ let elem: u64 = 42;
+ let e = u64x2::new(42, 42);
+ let r: u64x2 = transmute(vld1q_dup_u64(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_p8() {
+ let elem: p8 = 42;
+ let e = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let r: u8x8 = transmute(vld1_dup_p8(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_p8() {
+ let elem: p8 = 42;
+ let e = u8x16::new(
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ );
+ let r: u8x16 = transmute(vld1q_dup_p8(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_p16() {
+ let elem: p16 = 42;
+ let e = u16x4::new(42, 42, 42, 42);
+ let r: u16x4 = transmute(vld1_dup_p16(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_p16() {
+ let elem: p16 = 42;
+ let e = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let r: u16x8 = transmute(vld1q_dup_p16(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon,aes")]
+ unsafe fn test_vld1_dup_p64() {
+ let elem: u64 = 42;
+ let e = u64x1::new(42);
+ let r: u64x1 = transmute(vld1_dup_p64(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon,aes")]
+ unsafe fn test_vld1q_dup_p64() {
+ let elem: u64 = 42;
+ let e = u64x2::new(42, 42);
+ let r: u64x2 = transmute(vld1q_dup_p64(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1_dup_f32() {
+ let elem: f32 = 42.;
+ let e = f32x2::new(42., 42.);
+ let r: f32x2 = transmute(vld1_dup_f32(&elem));
+ assert_eq!(r, e)
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld1q_dup_f32() {
+ let elem: f32 = 42.;
+ let e = f32x4::new(42., 42., 42., 42.);
+ let r: f32x4 = transmute(vld1q_dup_f32(&elem));
+ assert_eq!(r, e)
+ }
+
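+    // A hedged sketch of a test for the per-byte population count intrinsic,
+    // in the style of the surrounding tests (illustrative addition, not part
+    // of the upstream suite):
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vcnt_u8_sketch() {
+        // popcounts: 0 -> 0, 1 -> 1, 3 -> 2, 255 -> 8
+        let a = u8x8::new(0, 1, 3, 255, 0, 1, 3, 255);
+        let e = u8x8::new(0, 1, 2, 8, 0, 1, 2, 8);
+        let r: u8x8 = transmute(vcnt_u8(transmute(a)));
+        assert_eq!(r, e);
+    }
+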
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_u8() {
+ let v = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = vget_lane_u8::<1>(transmute(v));
+ assert_eq!(r, 2);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_u32() {
+ let v = i32x4::new(1, 2, 3, 4);
+ let r = vgetq_lane_u32::<1>(transmute(v));
+ assert_eq!(r, 2);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_s32() {
+ let v = i32x4::new(1, 2, 3, 4);
+ let r = vgetq_lane_s32::<1>(transmute(v));
+ assert_eq!(r, 2);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_u64() {
+ let v: u64 = 1;
+ let r = vget_lane_u64::<0>(transmute(v));
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_u16() {
+ let v = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = vgetq_lane_u16::<1>(transmute(v));
+ assert_eq!(r, 2);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_s8() {
+ let v = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = vget_lane_s8::<2>(transmute(v));
+ assert_eq!(r, 2);
+ let r = vget_lane_s8::<4>(transmute(v));
+ assert_eq!(r, 4);
+ let r = vget_lane_s8::<5>(transmute(v));
+ assert_eq!(r, 5);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_p8() {
+ let v = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = vget_lane_p8::<2>(transmute(v));
+ assert_eq!(r, 2);
+ let r = vget_lane_p8::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vget_lane_p8::<5>(transmute(v));
+ assert_eq!(r, 5);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_p16() {
+ let v = u16x4::new(0, 1, 2, 3);
+ let r = vget_lane_p16::<2>(transmute(v));
+ assert_eq!(r, 2);
+ let r = vget_lane_p16::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vget_lane_p16::<0>(transmute(v));
+ assert_eq!(r, 0);
+ let r = vget_lane_p16::<1>(transmute(v));
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_s16() {
+ let v = i16x4::new(0, 1, 2, 3);
+ let r = vget_lane_s16::<2>(transmute(v));
+ assert_eq!(r, 2);
+ let r = vget_lane_s16::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vget_lane_s16::<0>(transmute(v));
+ assert_eq!(r, 0);
+ let r = vget_lane_s16::<1>(transmute(v));
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_u16() {
+ let v = u16x4::new(0, 1, 2, 3);
+ let r = vget_lane_u16::<2>(transmute(v));
+ assert_eq!(r, 2);
+ let r = vget_lane_u16::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vget_lane_u16::<0>(transmute(v));
+ assert_eq!(r, 0);
+ let r = vget_lane_u16::<1>(transmute(v));
+ assert_eq!(r, 1);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_f32() {
+ let v = f32x2::new(0.0, 1.0);
+ let r = vget_lane_f32::<1>(transmute(v));
+ assert_eq!(r, 1.0);
+ let r = vget_lane_f32::<0>(transmute(v));
+ assert_eq!(r, 0.0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_s32() {
+ let v = i32x2::new(0, 1);
+ let r = vget_lane_s32::<1>(transmute(v));
+ assert_eq!(r, 1);
+ let r = vget_lane_s32::<0>(transmute(v));
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_u32() {
+ let v = u32x2::new(0, 1);
+ let r = vget_lane_u32::<1>(transmute(v));
+ assert_eq!(r, 1);
+ let r = vget_lane_u32::<0>(transmute(v));
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_s64() {
+ let v = i64x1::new(1);
+ let r = vget_lane_s64::<0>(transmute(v));
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_lane_p64() {
+ let v = u64x1::new(1);
+ let r = vget_lane_p64::<0>(transmute(v));
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_s8() {
+ let v = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = vgetq_lane_s8::<7>(transmute(v));
+ assert_eq!(r, 7);
+ let r = vgetq_lane_s8::<13>(transmute(v));
+ assert_eq!(r, 13);
+ let r = vgetq_lane_s8::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vgetq_lane_s8::<0>(transmute(v));
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_p8() {
+ let v = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = vgetq_lane_p8::<7>(transmute(v));
+ assert_eq!(r, 7);
+ let r = vgetq_lane_p8::<13>(transmute(v));
+ assert_eq!(r, 13);
+ let r = vgetq_lane_p8::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vgetq_lane_p8::<0>(transmute(v));
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_u8() {
+ let v = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = vgetq_lane_u8::<7>(transmute(v));
+ assert_eq!(r, 7);
+ let r = vgetq_lane_u8::<13>(transmute(v));
+ assert_eq!(r, 13);
+ let r = vgetq_lane_u8::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vgetq_lane_u8::<0>(transmute(v));
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_s16() {
+ let v = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = vgetq_lane_s16::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vgetq_lane_s16::<6>(transmute(v));
+ assert_eq!(r, 6);
+ let r = vgetq_lane_s16::<0>(transmute(v));
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_p16() {
+ let v = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = vgetq_lane_p16::<3>(transmute(v));
+ assert_eq!(r, 3);
+ let r = vgetq_lane_p16::<7>(transmute(v));
+ assert_eq!(r, 7);
+ let r = vgetq_lane_p16::<1>(transmute(v));
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_f32() {
+ let v = f32x4::new(0.0, 1.0, 2.0, 3.0);
+ let r = vgetq_lane_f32::<3>(transmute(v));
+ assert_eq!(r, 3.0);
+ let r = vgetq_lane_f32::<0>(transmute(v));
+ assert_eq!(r, 0.0);
+ let r = vgetq_lane_f32::<2>(transmute(v));
+ assert_eq!(r, 2.0);
+ let r = vgetq_lane_f32::<1>(transmute(v));
+ assert_eq!(r, 1.0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_s64() {
+ let v = i64x2::new(0, 1);
+ let r = vgetq_lane_s64::<1>(transmute(v));
+ assert_eq!(r, 1);
+ let r = vgetq_lane_s64::<0>(transmute(v));
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_p64() {
+ let v = u64x2::new(0, 1);
+ let r = vgetq_lane_p64::<1>(transmute(v));
+ assert_eq!(r, 1);
+ let r = vgetq_lane_p64::<0>(transmute(v));
+ assert_eq!(r, 0);
+ }
+
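+ // For single-lane vectors, vext_* only accepts index 0, which returns `a`.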
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_s64() {
+ let a: i64x1 = i64x1::new(0);
+ let b: i64x1 = i64x1::new(1);
+ let e: i64x1 = i64x1::new(0);
+ let r: i64x1 = transmute(vext_s64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vext_u64() {
+ let a: u64x1 = u64x1::new(0);
+ let b: u64x1 = u64x1::new(1);
+ let e: u64x1 = u64x1::new(0);
+ let r: u64x1 = transmute(vext_u64::<0>(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
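+ // vget_high_*/vget_low_* return the upper/lower half of a 128-bit vector.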
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_s8() {
+ let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e = i8x8::new(9, 10, 11, 12, 13, 14, 15, 16);
+ let r: i8x8 = transmute(vget_high_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_s16() {
+ let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e = i16x4::new(5, 6, 7, 8);
+ let r: i16x4 = transmute(vget_high_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_s32() {
+ let a = i32x4::new(1, 2, 3, 4);
+ let e = i32x2::new(3, 4);
+ let r: i32x2 = transmute(vget_high_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_s64() {
+ let a = i64x2::new(1, 2);
+ let e = i64x1::new(2);
+ let r: i64x1 = transmute(vget_high_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_u8() {
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e = u8x8::new(9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x8 = transmute(vget_high_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_u16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e = u16x4::new(5, 6, 7, 8);
+ let r: u16x4 = transmute(vget_high_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_u32() {
+ let a = u32x4::new(1, 2, 3, 4);
+ let e = u32x2::new(3, 4);
+ let r: u32x2 = transmute(vget_high_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_u64() {
+ let a = u64x2::new(1, 2);
+ let e = u64x1::new(2);
+ let r: u64x1 = transmute(vget_high_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_p8() {
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e = u8x8::new(9, 10, 11, 12, 13, 14, 15, 16);
+ let r: u8x8 = transmute(vget_high_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_p16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e = u16x4::new(5, 6, 7, 8);
+ let r: u16x4 = transmute(vget_high_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_high_f32() {
+ let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+ let e = f32x2::new(3.0, 4.0);
+ let r: f32x2 = transmute(vget_high_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_s8() {
+ let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vget_low_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_s16() {
+ let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vget_low_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_s32() {
+ let a = i32x4::new(1, 2, 3, 4);
+ let e = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vget_low_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_s64() {
+ let a = i64x2::new(1, 2);
+ let e = i64x1::new(1);
+ let r: i64x1 = transmute(vget_low_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_u8() {
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vget_low_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_u16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vget_low_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_u32() {
+ let a = u32x4::new(1, 2, 3, 4);
+ let e = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vget_low_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_u64() {
+ let a = u64x2::new(1, 2);
+ let e = u64x1::new(1);
+ let r: u64x1 = transmute(vget_low_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_p8() {
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vget_low_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_p16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vget_low_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vget_low_f32() {
+ let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+ let e = f32x2::new(1.0, 2.0);
+ let r: f32x2 = transmute(vget_low_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
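+ // vdup_n_*/vdupq_n_* broadcast a single scalar into every lane of the result.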
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_s8() {
+ let v: i8 = 42;
+ let e = i8x16::new(
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ );
+ let r: i8x16 = transmute(vdupq_n_s8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_s16() {
+ let v: i16 = 64;
+ let e = i16x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: i16x8 = transmute(vdupq_n_s16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_s32() {
+ let v: i32 = 64;
+ let e = i32x4::new(64, 64, 64, 64);
+ let r: i32x4 = transmute(vdupq_n_s32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_s64() {
+ let v: i64 = 64;
+ let e = i64x2::new(64, 64);
+ let r: i64x2 = transmute(vdupq_n_s64(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_u8() {
+ let v: u8 = 64;
+ let e = u8x16::new(
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ );
+ let r: u8x16 = transmute(vdupq_n_u8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_u16() {
+ let v: u16 = 64;
+ let e = u16x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: u16x8 = transmute(vdupq_n_u16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_u32() {
+ let v: u32 = 64;
+ let e = u32x4::new(64, 64, 64, 64);
+ let r: u32x4 = transmute(vdupq_n_u32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_u64() {
+ let v: u64 = 64;
+ let e = u64x2::new(64, 64);
+ let r: u64x2 = transmute(vdupq_n_u64(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_p8() {
+ let v: p8 = 64;
+ let e = u8x16::new(
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ );
+ let r: u8x16 = transmute(vdupq_n_p8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_p16() {
+ let v: p16 = 64;
+ let e = u16x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: u16x8 = transmute(vdupq_n_p16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdupq_n_f32() {
+ let v: f32 = 64.0;
+ let e = f32x4::new(64.0, 64.0, 64.0, 64.0);
+ let r: f32x4 = transmute(vdupq_n_f32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_s8() {
+ let v: i8 = 64;
+ let e = i8x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: i8x8 = transmute(vdup_n_s8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_s16() {
+ let v: i16 = 64;
+ let e = i16x4::new(64, 64, 64, 64);
+ let r: i16x4 = transmute(vdup_n_s16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_s32() {
+ let v: i32 = 64;
+ let e = i32x2::new(64, 64);
+ let r: i32x2 = transmute(vdup_n_s32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_s64() {
+ let v: i64 = 64;
+ let e = i64x1::new(64);
+ let r: i64x1 = transmute(vdup_n_s64(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_u8() {
+ let v: u8 = 64;
+ let e = u8x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: u8x8 = transmute(vdup_n_u8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_u16() {
+ let v: u16 = 64;
+ let e = u16x4::new(64, 64, 64, 64);
+ let r: u16x4 = transmute(vdup_n_u16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_u32() {
+ let v: u32 = 64;
+ let e = u32x2::new(64, 64);
+ let r: u32x2 = transmute(vdup_n_u32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_u64() {
+ let v: u64 = 64;
+ let e = u64x1::new(64);
+ let r: u64x1 = transmute(vdup_n_u64(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_p8() {
+ let v: p8 = 64;
+ let e = u8x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: u8x8 = transmute(vdup_n_p8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_p16() {
+ let v: p16 = 64;
+ let e = u16x4::new(64, 64, 64, 64);
+ let r: u16x4 = transmute(vdup_n_p16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vdup_n_f32() {
+ let v: f32 = 64.0;
+ let e = f32x2::new(64.0, 64.0);
+ let r: f32x2 = transmute(vdup_n_f32(v));
+ assert_eq!(r, e);
+ }
+
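+ // vldrq_p128/vstrq_p128 load and store a 128-bit polynomial via a raw pointer.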
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vldrq_p128() {
+ let v: [p128; 2] = [1, 2];
+ let e: p128 = 2;
+ let r: p128 = vldrq_p128(v[1..].as_ptr());
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vstrq_p128() {
+ let v: [p128; 2] = [1, 2];
+ let e: p128 = 2;
+ let mut r: p128 = 1;
+ vstrq_p128(&mut r, v[1]);
+ assert_eq!(r, e);
+ }
+
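+ // vmov_n_*/vmovq_n_* behave like vdup_n_*/vdupq_n_*, splatting their scalar
+ // argument across all lanes.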
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_s8() {
+ let v: i8 = 64;
+ let e = i8x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: i8x8 = transmute(vmov_n_s8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_s16() {
+ let v: i16 = 64;
+ let e = i16x4::new(64, 64, 64, 64);
+ let r: i16x4 = transmute(vmov_n_s16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_s32() {
+ let v: i32 = 64;
+ let e = i32x2::new(64, 64);
+ let r: i32x2 = transmute(vmov_n_s32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_s64() {
+ let v: i64 = 64;
+ let e = i64x1::new(64);
+ let r: i64x1 = transmute(vmov_n_s64(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_u8() {
+ let v: u8 = 64;
+ let e = u8x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: u8x8 = transmute(vmov_n_u8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_u16() {
+ let v: u16 = 64;
+ let e = u16x4::new(64, 64, 64, 64);
+ let r: u16x4 = transmute(vmov_n_u16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_u32() {
+ let v: u32 = 64;
+ let e = u32x2::new(64, 64);
+ let r: u32x2 = transmute(vmov_n_u32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_u64() {
+ let v: u64 = 64;
+ let e = u64x1::new(64);
+ let r: u64x1 = transmute(vmov_n_u64(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_p8() {
+ let v: p8 = 64;
+ let e = u8x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: u8x8 = transmute(vmov_n_p8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_p16() {
+ let v: p16 = 64;
+ let e = u16x4::new(64, 64, 64, 64);
+ let r: u16x4 = transmute(vmov_n_p16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmov_n_f32() {
+ let v: f32 = 64.0;
+ let e = f32x2::new(64.0, 64.0);
+ let r: f32x2 = transmute(vmov_n_f32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_s8() {
+ let v: i8 = 64;
+ let e = i8x16::new(
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ );
+ let r: i8x16 = transmute(vmovq_n_s8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_s16() {
+ let v: i16 = 64;
+ let e = i16x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: i16x8 = transmute(vmovq_n_s16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_s32() {
+ let v: i32 = 64;
+ let e = i32x4::new(64, 64, 64, 64);
+ let r: i32x4 = transmute(vmovq_n_s32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_s64() {
+ let v: i64 = 64;
+ let e = i64x2::new(64, 64);
+ let r: i64x2 = transmute(vmovq_n_s64(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_u8() {
+ let v: u8 = 64;
+ let e = u8x16::new(
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ );
+ let r: u8x16 = transmute(vmovq_n_u8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_u16() {
+ let v: u16 = 64;
+ let e = u16x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: u16x8 = transmute(vmovq_n_u16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_u32() {
+ let v: u32 = 64;
+ let e = u32x4::new(64, 64, 64, 64);
+ let r: u32x4 = transmute(vmovq_n_u32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_u64() {
+ let v: u64 = 64;
+ let e = u64x2::new(64, 64);
+ let r: u64x2 = transmute(vmovq_n_u64(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_p8() {
+ let v: p8 = 64;
+ let e = u8x16::new(
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ );
+ let r: u8x16 = transmute(vmovq_n_p8(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_p16() {
+ let v: p16 = 64;
+ let e = u16x8::new(64, 64, 64, 64, 64, 64, 64, 64);
+ let r: u16x8 = transmute(vmovq_n_p16(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovq_n_f32() {
+ let v: f32 = 64.0;
+ let e = f32x4::new(64.0, 64.0, 64.0, 64.0);
+ let r: f32x4 = transmute(vmovq_n_f32(v));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vgetq_lane_u64() {
+ let v = u64x2::new(1, 2);
+ let r = vgetq_lane_u64::<1>(transmute(v));
+ assert_eq!(r, 2);
+ }
+
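+ // The vadd tests run through the test_ari_*/testq_ari_* helpers from this
+ // module, comparing every lane of the SIMD sum against a scalar wrapping add.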
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_s8() {
+ test_ari_s8(
+ |i, j| vadd_s8(i, j),
+ |a: i8, b: i8| -> i8 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_s8() {
+ testq_ari_s8(
+ |i, j| vaddq_s8(i, j),
+ |a: i8, b: i8| -> i8 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_s16() {
+ test_ari_s16(
+ |i, j| vadd_s16(i, j),
+ |a: i16, b: i16| -> i16 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_s16() {
+ testq_ari_s16(
+ |i, j| vaddq_s16(i, j),
+ |a: i16, b: i16| -> i16 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_s32() {
+ test_ari_s32(
+ |i, j| vadd_s32(i, j),
+ |a: i32, b: i32| -> i32 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_s32() {
+ testq_ari_s32(
+ |i, j| vaddq_s32(i, j),
+ |a: i32, b: i32| -> i32 { a.overflowing_add(b).0 },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_u8() {
+ test_ari_u8(
+ |i, j| vadd_u8(i, j),
+ |a: u8, b: u8| -> u8 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_u8() {
+ testq_ari_u8(
+ |i, j| vaddq_u8(i, j),
+ |a: u8, b: u8| -> u8 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_u16() {
+ test_ari_u16(
+ |i, j| vadd_u16(i, j),
+ |a: u16, b: u16| -> u16 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_u16() {
+ testq_ari_u16(
+ |i, j| vaddq_u16(i, j),
+ |a: u16, b: u16| -> u16 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_u32() {
+ test_ari_u32(
+ |i, j| vadd_u32(i, j),
+ |a: u32, b: u32| -> u32 { a.overflowing_add(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_u32() {
+ testq_ari_u32(
+ |i, j| vaddq_u32(i, j),
+ |a: u32, b: u32| -> u32 { a.overflowing_add(b).0 },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vadd_f32() {
+ test_ari_f32(|i, j| vadd_f32(i, j), |a: f32, b: f32| -> f32 { a + b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddq_f32() {
+ testq_ari_f32(|i, j| vaddq_f32(i, j), |a: f32, b: f32| -> f32 { a + b });
+ }
+
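+ // vaddl_* widens both operands to twice the lane width before adding, so even
+ // MAX + MAX cannot wrap.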
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_s8() {
+ let v = i8::MAX;
+ let a = i8x8::new(v, v, v, v, v, v, v, v);
+ let v = 2 * (v as i16);
+ let e = i16x8::new(v, v, v, v, v, v, v, v);
+ let r: i16x8 = transmute(vaddl_s8(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_s16() {
+ let v = i16::MAX;
+ let a = i16x4::new(v, v, v, v);
+ let v = 2 * (v as i32);
+ let e = i32x4::new(v, v, v, v);
+ let r: i32x4 = transmute(vaddl_s16(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_s32() {
+ let v = i32::MAX;
+ let a = i32x2::new(v, v);
+ let v = 2 * (v as i64);
+ let e = i64x2::new(v, v);
+ let r: i64x2 = transmute(vaddl_s32(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_u8() {
+ let v = u8::MAX;
+ let a = u8x8::new(v, v, v, v, v, v, v, v);
+ let v = 2 * (v as u16);
+ let e = u16x8::new(v, v, v, v, v, v, v, v);
+ let r: u16x8 = transmute(vaddl_u8(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_u16() {
+ let v = u16::MAX;
+ let a = u16x4::new(v, v, v, v);
+ let v = 2 * (v as u32);
+ let e = u32x4::new(v, v, v, v);
+ let r: u32x4 = transmute(vaddl_u16(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_u32() {
+ let v = u32::MAX;
+ let a = u32x2::new(v, v);
+ let v = 2 * (v as u64);
+ let e = u64x2::new(v, v);
+ let r: u64x2 = transmute(vaddl_u32(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
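+ // The vaddl_high_* variants widen-add only the upper halves of their 128-bit
+ // operands.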
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_high_s8() {
+ let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let x = i8::MAX;
+ let b = i8x16::new(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
+ let x = x as i16;
+ let e = i16x8::new(x + 8, x + 9, x + 10, x + 11, x + 12, x + 13, x + 14, x + 15);
+ let r: i16x8 = transmute(vaddl_high_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_high_s16() {
+ let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let x = i16::MAX;
+ let b = i16x8::new(x, x, x, x, x, x, x, x);
+ let x = x as i32;
+ let e = i32x4::new(x + 4, x + 5, x + 6, x + 7);
+ let r: i32x4 = transmute(vaddl_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_high_s32() {
+ let a = i32x4::new(0, 1, 2, 3);
+ let x = i32::MAX;
+ let b = i32x4::new(x, x, x, x);
+ let x = x as i64;
+ let e = i64x2::new(x + 2, x + 3);
+ let r: i64x2 = transmute(vaddl_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_high_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let x = u8::MAX;
+ let b = u8x16::new(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
+ let x = x as u16;
+ let e = u16x8::new(x + 8, x + 9, x + 10, x + 11, x + 12, x + 13, x + 14, x + 15);
+ let r: u16x8 = transmute(vaddl_high_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_high_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let x = u16::MAX;
+ let b = u16x8::new(x, x, x, x, x, x, x, x);
+ let x = x as u32;
+ let e = u32x4::new(x + 4, x + 5, x + 6, x + 7);
+ let r: u32x4 = transmute(vaddl_high_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddl_high_u32() {
+ let a = u32x4::new(0, 1, 2, 3);
+ let x = u32::MAX;
+ let b = u32x4::new(x, x, x, x);
+ let x = x as u64;
+ let e = u64x2::new(x + 2, x + 3);
+ let r: u64x2 = transmute(vaddl_high_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
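+ // vaddw_* widens only the second (narrow) operand; lane 0 is seeded with MAX,
+ // so the expected value there needs a wrapping_add.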
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_s8() {
+ let x = i16::MAX;
+ let a = i16x8::new(x, 1, 2, 3, 4, 5, 6, 7);
+ let y = i8::MAX;
+ let b = i8x8::new(y, y, y, y, y, y, y, y);
+ let y = y as i16;
+ let e = i16x8::new(
+ x.wrapping_add(y),
+ 1 + y,
+ 2 + y,
+ 3 + y,
+ 4 + y,
+ 5 + y,
+ 6 + y,
+ 7 + y,
+ );
+ let r: i16x8 = transmute(vaddw_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_s16() {
+ let x = i32::MAX;
+ let a = i32x4::new(x, 1, 2, 3);
+ let y = i16::MAX;
+ let b = i16x4::new(y, y, y, y);
+ let y = y as i32;
+ let e = i32x4::new(x.wrapping_add(y), 1 + y, 2 + y, 3 + y);
+ let r: i32x4 = transmute(vaddw_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_s32() {
+ let x = i64::MAX;
+ let a = i64x2::new(x, 1);
+ let y = i32::MAX;
+ let b = i32x2::new(y, y);
+ let y = y as i64;
+ let e = i64x2::new(x.wrapping_add(y), 1 + y);
+ let r: i64x2 = transmute(vaddw_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_u8() {
+ let x = u16::MAX;
+ let a = u16x8::new(x, 1, 2, 3, 4, 5, 6, 7);
+ let y = u8::MAX;
+ let b = u8x8::new(y, y, y, y, y, y, y, y);
+ let y = y as u16;
+ let e = u16x8::new(
+ x.wrapping_add(y),
+ 1 + y,
+ 2 + y,
+ 3 + y,
+ 4 + y,
+ 5 + y,
+ 6 + y,
+ 7 + y,
+ );
+ let r: u16x8 = transmute(vaddw_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_u16() {
+ let x = u32::MAX;
+ let a = u32x4::new(x, 1, 2, 3);
+ let y = u16::MAX;
+ let b = u16x4::new(y, y, y, y);
+ let y = y as u32;
+ let e = u32x4::new(x.wrapping_add(y), 1 + y, 2 + y, 3 + y);
+ let r: u32x4 = transmute(vaddw_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_u32() {
+ let x = u64::MAX;
+ let a = u64x2::new(x, 1);
+ let y = u32::MAX;
+ let b = u32x2::new(y, y);
+ let y = y as u64;
+ let e = u64x2::new(x.wrapping_add(y), 1 + y);
+ let r: u64x2 = transmute(vaddw_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
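+ // vaddw_high_* widens the upper half of the second, 128-bit operand; its lower
+ // half (zeros here) is ignored.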
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_high_s8() {
+ let x = i16::MAX;
+ let a = i16x8::new(x, 1, 2, 3, 4, 5, 6, 7);
+ let y = i8::MAX;
+ let b = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, y, y, y, y, y, y, y, y);
+ let y = y as i16;
+ let e = i16x8::new(
+ x.wrapping_add(y),
+ 1 + y,
+ 2 + y,
+ 3 + y,
+ 4 + y,
+ 5 + y,
+ 6 + y,
+ 7 + y,
+ );
+ let r: i16x8 = transmute(vaddw_high_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_high_s16() {
+ let x = i32::MAX;
+ let a = i32x4::new(x, 1, 2, 3);
+ let y = i16::MAX;
+ let b = i16x8::new(0, 0, 0, 0, y, y, y, y);
+ let y = y as i32;
+ let e = i32x4::new(x.wrapping_add(y), 1 + y, 2 + y, 3 + y);
+ let r: i32x4 = transmute(vaddw_high_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_high_s32() {
+ let x = i64::MAX;
+ let a = i64x2::new(x, 1);
+ let y = i32::MAX;
+ let b = i32x4::new(0, 0, y, y);
+ let y = y as i64;
+ let e = i64x2::new(x.wrapping_add(y), 1 + y);
+ let r: i64x2 = transmute(vaddw_high_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_high_u8() {
+ let x = u16::MAX;
+ let a = u16x8::new(x, 1, 2, 3, 4, 5, 6, 7);
+ let y = u8::MAX;
+ let b = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, y, y, y, y, y, y, y, y);
+ let y = y as u16;
+ let e = u16x8::new(
+ x.wrapping_add(y),
+ 1 + y,
+ 2 + y,
+ 3 + y,
+ 4 + y,
+ 5 + y,
+ 6 + y,
+ 7 + y,
+ );
+ let r: u16x8 = transmute(vaddw_high_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_high_u16() {
+ let x = u32::MAX;
+ let a = u32x4::new(x, 1, 2, 3);
+ let y = u16::MAX;
+ let b = u16x8::new(0, 0, 0, 0, y, y, y, y);
+ let y = y as u32;
+ let e = u32x4::new(x.wrapping_add(y), 1 + y, 2 + y, 3 + y);
+ let r: u32x4 = transmute(vaddw_high_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddw_high_u32() {
+ let x = u64::MAX;
+ let a = u64x2::new(x, 1);
+ let y = u32::MAX;
+ let b = u32x4::new(0, 0, y, y);
+ let y = y as u64;
+ let e = u64x2::new(x.wrapping_add(y), 1 + y);
+ let r: u64x2 = transmute(vaddw_high_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
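+ // vaddhn_* adds the operands and keeps only the high half of each lane, so a
+ // lane of (i << 8) + 1 added to itself narrows to 2 * i.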
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_s16() {
+ let a = i16x8::new(
+ (0 << 8) + 1,
+ (1 << 8) + 1,
+ (2 << 8) + 1,
+ (3 << 8) + 1,
+ (4 << 8) + 1,
+ (5 << 8) + 1,
+ (6 << 8) + 1,
+ (7 << 8) + 1,
+ );
+ let e = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let r: i8x8 = transmute(vaddhn_s16(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_s32() {
+ let a = i32x4::new((0 << 16) + 1, (1 << 16) + 1, (2 << 16) + 1, (3 << 16) + 1);
+ let e = i16x4::new(0, 2, 4, 6);
+ let r: i16x4 = transmute(vaddhn_s32(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_s64() {
+ let a = i64x2::new((0 << 32) + 1, (1 << 32) + 1);
+ let e = i32x2::new(0, 2);
+ let r: i32x2 = transmute(vaddhn_s64(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_u16() {
+ let a = u16x8::new(
+ (0 << 8) + 1,
+ (1 << 8) + 1,
+ (2 << 8) + 1,
+ (3 << 8) + 1,
+ (4 << 8) + 1,
+ (5 << 8) + 1,
+ (6 << 8) + 1,
+ (7 << 8) + 1,
+ );
+ let e = u8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
+ let r: u8x8 = transmute(vaddhn_u16(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_u32() {
+ let a = u32x4::new((0 << 16) + 1, (1 << 16) + 1, (2 << 16) + 1, (3 << 16) + 1);
+ let e = u16x4::new(0, 2, 4, 6);
+ let r: u16x4 = transmute(vaddhn_u32(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_u64() {
+ let a = u64x2::new((0 << 32) + 1, (1 << 32) + 1);
+ let e = u32x2::new(0, 2);
+ let r: u32x2 = transmute(vaddhn_u64(transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_high_s16() {
+ let r = i8x8::splat(42);
+ let a = i16x8::new(
+ (0 << 8) + 1,
+ (1 << 8) + 1,
+ (2 << 8) + 1,
+ (3 << 8) + 1,
+ (4 << 8) + 1,
+ (5 << 8) + 1,
+ (6 << 8) + 1,
+ (7 << 8) + 1,
+ );
+ let e = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 0, 2, 4, 6, 8, 10, 12, 14);
+ let r: i8x16 = transmute(vaddhn_high_s16(transmute(r), transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_high_s32() {
+ let r = i16x4::splat(42);
+ let a = i32x4::new((0 << 16) + 1, (1 << 16) + 1, (2 << 16) + 1, (3 << 16) + 1);
+ let e = i16x8::new(42, 42, 42, 42, 0, 2, 4, 6);
+ let r: i16x8 = transmute(vaddhn_high_s32(transmute(r), transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_high_s64() {
+ let r = i32x2::splat(42);
+ let a = i64x2::new((0 << 32) + 1, (1 << 32) + 1);
+ let e = i32x4::new(42, 42, 0, 2);
+ let r: i32x4 = transmute(vaddhn_high_s64(transmute(r), transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_high_u16() {
+ let r = u8x8::splat(42);
+ let a = u16x8::new(
+ (0 << 8) + 1,
+ (1 << 8) + 1,
+ (2 << 8) + 1,
+ (3 << 8) + 1,
+ (4 << 8) + 1,
+ (5 << 8) + 1,
+ (6 << 8) + 1,
+ (7 << 8) + 1,
+ );
+ let e = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 0, 2, 4, 6, 8, 10, 12, 14);
+ let r: u8x16 = transmute(vaddhn_high_u16(transmute(r), transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_high_u32() {
+ let r = u16x4::splat(42);
+ let a = u32x4::new((0 << 16) + 1, (1 << 16) + 1, (2 << 16) + 1, (3 << 16) + 1);
+ let e = u16x8::new(42, 42, 42, 42, 0, 2, 4, 6);
+ let r: u16x8 = transmute(vaddhn_high_u32(transmute(r), transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaddhn_high_u64() {
+ let r = u32x2::splat(42);
+ let a = u64x2::new((0 << 32) + 1, (1 << 32) + 1);
+ let e = u32x4::new(42, 42, 0, 2);
+ let r: u32x4 = transmute(vaddhn_high_u64(transmute(r), transmute(a), transmute(a)));
+ assert_eq!(r, e);
+ }
+
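+ // vraddhn_* is the rounding variant of vaddhn_*: 1 << (half_width - 1) is
+ // added to each sum before taking the high half, so the lanes that also carry
+ // round_constant come out one higher than their vaddhn_* counterparts.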
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_s16() {
+ let round_constant: i16 = (1 << 8) - 1;
+ let a = i16x8::new(
+ 0 << 8,
+ 1 << 8,
+ 2 << 8,
+ 3 << 8,
+ 4 << 8,
+ 5 << 8,
+ 6 << 8,
+ 7 << 8,
+ );
+ let b = i16x8::new(
+ 0 << 8,
+ (1 << 8) + round_constant,
+ 2 << 8,
+ (3 << 8) + round_constant,
+ 4 << 8,
+ (5 << 8) + round_constant,
+ 6 << 8,
+ (7 << 8) + round_constant,
+ );
+ let e = i8x8::new(0, 3, 4, 7, 8, 11, 12, 15);
+ let r: i8x8 = transmute(vraddhn_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_s32() {
+ let round_constant: i32 = (1 << 16) - 1;
+ let a = i32x4::new(0 << 16, 1 << 16, 2 << 16, 3 << 16);
+ let b = i32x4::new(
+ 0 << 16,
+ (1 << 16) + round_constant,
+ 2 << 16,
+ (3 << 16) + round_constant,
+ );
+ let e = i16x4::new(0, 3, 4, 7);
+ let r: i16x4 = transmute(vraddhn_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_s64() {
+ let round_constant: i64 = (1 << 32) - 1;
+ let a = i64x2::new(0 << 32, 1 << 32);
+ let b = i64x2::new(0 << 32, (1 << 32) + round_constant);
+ let e = i32x2::new(0, 3);
+ let r: i32x2 = transmute(vraddhn_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_u16() {
+ let round_constant: u16 = (1 << 8) - 1;
+ let a = u16x8::new(
+ 0 << 8,
+ 1 << 8,
+ 2 << 8,
+ 3 << 8,
+ 4 << 8,
+ 5 << 8,
+ 6 << 8,
+ 7 << 8,
+ );
+ let b = u16x8::new(
+ 0 << 8,
+ (1 << 8) + round_constant,
+ 2 << 8,
+ (3 << 8) + round_constant,
+ 4 << 8,
+ (5 << 8) + round_constant,
+ 6 << 8,
+ (7 << 8) + round_constant,
+ );
+ let e = u8x8::new(0, 3, 4, 7, 8, 11, 12, 15);
+ let r: u8x8 = transmute(vraddhn_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_u32() {
+ let round_constant: u32 = (1 << 16) - 1;
+ let a = u32x4::new(0 << 16, 1 << 16, 2 << 16, 3 << 16);
+ let b = u32x4::new(
+ 0 << 16,
+ (1 << 16) + round_constant,
+ 2 << 16,
+ (3 << 16) + round_constant,
+ );
+ let e = u16x4::new(0, 3, 4, 7);
+ let r: u16x4 = transmute(vraddhn_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_u64() {
+ let round_constant: u64 = (1 << 32) - 1;
+ let a = u64x2::new(0 << 32, 1 << 32);
+ let b = u64x2::new(0 << 32, (1 << 32) + round_constant);
+ let e = u32x2::new(0, 3);
+ let r: u32x2 = transmute(vraddhn_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_high_s16() {
+ let r = i8x8::splat(42);
+ let round_constant: i16 = (1 << 8) - 1;
+ let a = i16x8::new(
+ 0 << 8,
+ 1 << 8,
+ 2 << 8,
+ 3 << 8,
+ 4 << 8,
+ 5 << 8,
+ 6 << 8,
+ 7 << 8,
+ );
+ let b = i16x8::new(
+ 0 << 8,
+ (1 << 8) + round_constant,
+ 2 << 8,
+ (3 << 8) + round_constant,
+ 4 << 8,
+ (5 << 8) + round_constant,
+ 6 << 8,
+ (7 << 8) + round_constant,
+ );
+ let e = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 0, 3, 4, 7, 8, 11, 12, 15);
+ let r: i8x16 = transmute(vraddhn_high_s16(transmute(r), transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_high_s32() {
+ let r = i16x4::splat(42);
+ let round_constant: i32 = (1 << 16) - 1;
+ let a = i32x4::new(0 << 16, 1 << 16, 2 << 16, 3 << 16);
+ let b = i32x4::new(
+ 0 << 16,
+ (1 << 16) + round_constant,
+ 2 << 16,
+ (3 << 16) + round_constant,
+ );
+ let e = i16x8::new(42, 42, 42, 42, 0, 3, 4, 7);
+ let r: i16x8 = transmute(vraddhn_high_s32(transmute(r), transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_high_s64() {
+ let r = i32x2::splat(42);
+ let round_constant: i64 = (1 << 32) - 1;
+ let a = i64x2::new(0 << 32, 1 << 32);
+ let b = i64x2::new(0 << 32, (1 << 32) + round_constant);
+ let e = i32x4::new(42, 42, 0, 3);
+ let r: i32x4 = transmute(vraddhn_high_s64(transmute(r), transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_high_u16() {
+ let r = u8x8::splat(42);
+ let round_constant: u16 = (1 << 8) - 1;
+ let a = u16x8::new(
+ 0 << 8,
+ 1 << 8,
+ 2 << 8,
+ 3 << 8,
+ 4 << 8,
+ 5 << 8,
+ 6 << 8,
+ 7 << 8,
+ );
+ let b = u16x8::new(
+ 0 << 8,
+ (1 << 8) + round_constant,
+ 2 << 8,
+ (3 << 8) + round_constant,
+ 4 << 8,
+ (5 << 8) + round_constant,
+ 6 << 8,
+ (7 << 8) + round_constant,
+ );
+ let e = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 0, 3, 4, 7, 8, 11, 12, 15);
+ let r: u8x16 = transmute(vraddhn_high_u16(transmute(r), transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_high_u32() {
+ let r = u16x4::splat(42);
+ let round_constant: u32 = (1 << 16) - 1;
+ let a = u32x4::new(0 << 16, 1 << 16, 2 << 16, 3 << 16);
+ let b = u32x4::new(
+ 0 << 16,
+ (1 << 16) + round_constant,
+ 2 << 16,
+ (3 << 16) + round_constant,
+ );
+ let e = u16x8::new(42, 42, 42, 42, 0, 3, 4, 7);
+ let r: u16x8 = transmute(vraddhn_high_u32(transmute(r), transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vraddhn_high_u64() {
+ let r = u32x2::splat(42);
+ let round_constant: u64 = (1 << 32) - 1;
+ let a = u64x2::new(0 << 32, 1 << 32);
+ let b = u64x2::new(0 << 32, (1 << 32) + round_constant);
+ let e = u32x4::new(42, 42, 0, 3);
+ let r: u32x4 = transmute(vraddhn_high_u64(transmute(r), transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
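+ // vpaddl_*/vpaddlq_* add adjacent pairs of lanes, widening the result and
+ // halving the lane count.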
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddl_s8() {
+ let a = i8x8::new(-4, -3, -2, -1, 0, 1, 2, 3);
+ let r: i16x4 = transmute(vpaddl_s8(transmute(a)));
+ let e = i16x4::new(-7, -3, 1, 5);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddl_s16() {
+ let a = i16x4::new(-2, -1, 0, 1);
+ let r: i32x2 = transmute(vpaddl_s16(transmute(a)));
+ let e = i32x2::new(-3, 1);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddl_s32() {
+ let a = i32x2::new(-1, 0);
+ let r: i64x1 = transmute(vpaddl_s32(transmute(a)));
+ let e = i64x1::new(-1);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddlq_s8() {
+ let a = i8x16::new(-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vpaddlq_s8(transmute(a)));
+ let e = i16x8::new(-15, -11, -7, -3, 1, 5, 9, 13);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddlq_s16() {
+ let a = i16x8::new(-4, -3, -2, -1, 0, 1, 2, 3);
+ let r: i32x4 = transmute(vpaddlq_s16(transmute(a)));
+ let e = i32x4::new(-7, -3, 1, 5);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddlq_s32() {
+ let a = i32x4::new(-2, -1, 0, 1);
+ let r: i64x2 = transmute(vpaddlq_s32(transmute(a)));
+ let e = i64x2::new(-3, 1);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddl_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, u8::MAX);
+ let r: u16x4 = transmute(vpaddl_u8(transmute(a)));
+ let e = u16x4::new(1, 5, 9, u8::MAX as u16 + 6);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddl_u16() {
+ let a = u16x4::new(0, 1, 2, u16::MAX);
+ let r: u32x2 = transmute(vpaddl_u16(transmute(a)));
+ let e = u32x2::new(1, u16::MAX as u32 + 2);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddl_u32() {
+ let a = u32x2::new(1, u32::MAX);
+ let r: u64x1 = transmute(vpaddl_u32(transmute(a)));
+ let e = u64x1::new(u32::MAX as u64 + 1);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddlq_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, u8::MAX);
+ let r: u16x8 = transmute(vpaddlq_u8(transmute(a)));
+ let e = u16x8::new(1, 5, 9, 13, 17, 21, 25, u8::MAX as u16 + 14);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddlq_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, u16::MAX);
+ let r: u32x4 = transmute(vpaddlq_u16(transmute(a)));
+ let e = u32x4::new(1, 5, 9, u16::MAX as u32 + 6);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpaddlq_u32() {
+ let a = u32x4::new(0, 1, 2, u32::MAX);
+ let r: u64x2 = transmute(vpaddlq_u32(transmute(a)));
+ let e = u64x2::new(1, u32::MAX as u64 + 2);
+ assert_eq!(r, e);
+ }
+
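+ // vpadal_* pairwise-adds the second operand (widening, as vpaddl_* does) and
+ // accumulates the pair sums into the first operand.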
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadal_s8() {
+ let a = i16x4::new(42, 42, 42, 42);
+ let b = i8x8::new(-4, -3, -2, -1, 0, 1, 2, 3);
+ let r: i16x4 = transmute(vpadal_s8(transmute(a), transmute(b)));
+ let e = i16x4::new(35, 39, 43, 47);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadal_s16() {
+ let a = i32x2::new(42, 42);
+ let b = i16x4::new(-2, -1, 0, 1);
+ let r: i32x2 = transmute(vpadal_s16(transmute(a), transmute(b)));
+ let e = i32x2::new(39, 43);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadal_s32() {
+ let a = i64x1::new(42);
+ let b = i32x2::new(-1, 0);
+ let r: i64x1 = transmute(vpadal_s32(transmute(a), transmute(b)));
+ let e = i64x1::new(41);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadalq_s8() {
+ let a = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b = i8x16::new(-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7);
+ let r: i16x8 = transmute(vpadalq_s8(transmute(a), transmute(b)));
+ let e = i16x8::new(27, 31, 35, 39, 43, 47, 51, 55);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadalq_s16() {
+ let a = i32x4::new(42, 42, 42, 42);
+ let b = i16x8::new(-4, -3, -2, -1, 0, 1, 2, 3);
+ let r: i32x4 = transmute(vpadalq_s16(transmute(a), transmute(b)));
+ let e = i32x4::new(35, 39, 43, 47);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadalq_s32() {
+ let a = i64x2::new(42, 42);
+ let b = i32x4::new(-2, -1, 0, 1);
+ let r: i64x2 = transmute(vpadalq_s32(transmute(a), transmute(b)));
+ let e = i64x2::new(39, 43);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadal_u8() {
+ let a = u16x4::new(42, 42, 42, 42);
+ let b = u8x8::new(0, 1, 2, 3, 4, 5, 6, u8::MAX);
+ let r: u16x4 = transmute(vpadal_u8(transmute(a), transmute(b)));
+ let e = u16x4::new(43, 47, 51, u8::MAX as u16 + 48);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadal_u16() {
+ let a = u32x2::new(42, 42);
+ let b = u16x4::new(0, 1, 2, u16::MAX);
+ let r: u32x2 = transmute(vpadal_u16(transmute(a), transmute(b)));
+ let e = u32x2::new(43, u16::MAX as u32 + 44);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadal_u32() {
+ let a = u64x1::new(42);
+ let b = u32x2::new(1, u32::MAX);
+ let r: u64x1 = transmute(vpadal_u32(transmute(a), transmute(b)));
+ let e = u64x1::new(u32::MAX as u64 + 43);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadalq_u8() {
+ let a = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42);
+ let b = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, u8::MAX);
+ let r: u16x8 = transmute(vpadalq_u8(transmute(a), transmute(b)));
+ let e = u16x8::new(43, 47, 51, 55, 59, 63, 67, u8::MAX as u16 + 56);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadalq_u16() {
+ let a = u32x4::new(42, 42, 42, 42);
+ let b = u16x8::new(0, 1, 2, 3, 4, 5, 6, u16::MAX);
+ let r: u32x4 = transmute(vpadalq_u16(transmute(a), transmute(b)));
+ let e = u32x4::new(43, 47, 51, u16::MAX as u32 + 48);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadalq_u32() {
+ let a = u64x2::new(42, 42);
+ let b = u32x4::new(0, 1, 2, u32::MAX);
+ let r: u64x2 = transmute(vpadalq_u32(transmute(a), transmute(b)));
+ let e = u64x2::new(43, u32::MAX as u64 + 44);
+ assert_eq!(r, e);
+ }
+
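+ // vmvn_*/vmvnq_* are bitwise NOT: !x == -x - 1 for signed lanes and MAX - x
+ // for unsigned lanes.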
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvn_s8() {
+ let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e = i8x8::new(-1, -2, -3, -4, -5, -6, -7, -8);
+ let r: i8x8 = transmute(vmvn_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvnq_s8() {
+ let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e = i8x16::new(
+ -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16,
+ );
+ let r: i8x16 = transmute(vmvnq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvn_s16() {
+ let a = i16x4::new(0, 1, 2, 3);
+ let e = i16x4::new(-1, -2, -3, -4);
+ let r: i16x4 = transmute(vmvn_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvnq_s16() {
+ let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e = i16x8::new(-1, -2, -3, -4, -5, -6, -7, -8);
+ let r: i16x8 = transmute(vmvnq_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvn_s32() {
+ let a = i32x2::new(0, 1);
+ let e = i32x2::new(-1, -2);
+ let r: i32x2 = transmute(vmvn_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvnq_s32() {
+ let a = i32x4::new(0, 1, 2, 3);
+ let e = i32x4::new(-1, -2, -3, -4);
+ let r: i32x4 = transmute(vmvnq_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvn_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e = u8x8::new(255, 254, 253, 252, 251, 250, 249, 248);
+ let r: u8x8 = transmute(vmvn_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvnq_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e = u8x16::new(
+ 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240,
+ );
+ let r: u8x16 = transmute(vmvnq_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvn_u16() {
+ let a = u16x4::new(0, 1, 2, 3);
+ let e = u16x4::new(65_535, 65_534, 65_533, 65_532);
+ let r: u16x4 = transmute(vmvn_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvnq_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e = u16x8::new(
+ 65_535, 65_534, 65_533, 65_532, 65_531, 65_530, 65_529, 65_528,
+ );
+ let r: u16x8 = transmute(vmvnq_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvn_u32() {
+ let a = u32x2::new(0, 1);
+ let e = u32x2::new(4_294_967_295, 4_294_967_294);
+ let r: u32x2 = transmute(vmvn_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvnq_u32() {
+ let a = u32x4::new(0, 1, 2, 3);
+ let e = u32x4::new(4_294_967_295, 4_294_967_294, 4_294_967_293, 4_294_967_292);
+ let r: u32x4 = transmute(vmvnq_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvn_p8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let e = u8x8::new(255, 254, 253, 252, 251, 250, 249, 248);
+ let r: u8x8 = transmute(vmvn_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmvnq_p8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e = u8x16::new(
+ 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240,
+ );
+ let r: u8x16 = transmute(vmvnq_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
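+ // vbic_*/vbicq_* compute a & !b ("bit clear"); with b == 1, the lowest bit of
+ // every lane of a is cleared.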
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbic_s8() {
+ let a = i8x8::new(0, -1, -2, -3, -4, -5, -6, -7);
+ let b = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let e = i8x8::new(0, -2, -2, -4, -4, -6, -6, -8);
+ let r: i8x8 = transmute(vbic_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbicq_s8() {
+ let a = i8x16::new(
+ 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+ );
+ let b = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let e = i8x16::new(
+ 0, -2, -2, -4, -4, -6, -6, -8, -8, -10, -10, -12, -12, -14, -14, -16,
+ );
+ let r: i8x16 = transmute(vbicq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbic_s16() {
+ let a = i16x4::new(0, -1, -2, -3);
+ let b = i16x4::new(1, 1, 1, 1);
+ let e = i16x4::new(0, -2, -2, -4);
+ let r: i16x4 = transmute(vbic_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbicq_s16() {
+ let a = i16x8::new(0, -1, -2, -3, -4, -5, -6, -7);
+ let b = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let e = i16x8::new(0, -2, -2, -4, -4, -6, -6, -8);
+ let r: i16x8 = transmute(vbicq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbic_s32() {
+ let a = i32x2::new(0, -1);
+ let b = i32x2::new(1, 1);
+ let e = i32x2::new(0, -2);
+ let r: i32x2 = transmute(vbic_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbicq_s32() {
+ let a = i32x4::new(0, -1, -2, -3);
+ let b = i32x4::new(1, 1, 1, 1);
+ let e = i32x4::new(0, -2, -2, -4);
+ let r: i32x4 = transmute(vbicq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbic_s64() {
+ let a = i64x1::new(-1);
+ let b = i64x1::new(1);
+ let e = i64x1::new(-2);
+ let r: i64x1 = transmute(vbic_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbicq_s64() {
+ let a = i64x2::new(0, -1);
+ let b = i64x2::new(1, 1);
+ let e = i64x2::new(0, -2);
+ let r: i64x2 = transmute(vbicq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbic_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let e = u8x8::new(0, 0, 2, 2, 4, 4, 6, 6);
+ let r: u8x8 = transmute(vbic_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbicq_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let e = u8x16::new(0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+ let r: u8x16 = transmute(vbicq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbic_u16() {
+ let a = u16x4::new(0, 1, 2, 3);
+ let b = u16x4::new(1, 1, 1, 1);
+ let e = u16x4::new(0, 0, 2, 2);
+ let r: u16x4 = transmute(vbic_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbicq_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let e = u16x8::new(0, 0, 2, 2, 4, 4, 6, 6);
+ let r: u16x8 = transmute(vbicq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbic_u32() {
+ let a = u32x2::new(0, 1);
+ let b = u32x2::new(1, 1);
+ let e = u32x2::new(0, 0);
+ let r: u32x2 = transmute(vbic_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbicq_u32() {
+ let a = u32x4::new(0, 1, 2, 3);
+ let b = u32x4::new(1, 1, 1, 1);
+ let e = u32x4::new(0, 0, 2, 2);
+ let r: u32x4 = transmute(vbicq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbic_u64() {
+ let a = u64x1::new(1);
+ let b = u64x1::new(1);
+ let e = u64x1::new(0);
+ let r: u64x1 = transmute(vbic_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbicq_u64() {
+ let a = u64x2::new(0, 1);
+ let b = u64x2::new(1, 1);
+ let e = u64x2::new(0, 0);
+ let r: u64x2 = transmute(vbicq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
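+ // vbsl_*/vbslq_* are bitwise select: (mask & b) | (!mask & c). All-ones mask
+ // lanes pick b (MAX) and all-zeros lanes pick c (MIN).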
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_s8() {
+ let a = u8x8::new(u8::MAX, 0, u8::MAX, 0, u8::MAX, 0, u8::MAX, 0);
+ let b = i8x8::new(
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ );
+ let c = i8x8::new(
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ );
+ let e = i8x8::new(
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ );
+ let r: i8x8 = transmute(vbsl_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_s16() {
+ let a = u16x4::new(u16::MAX, 0, u16::MAX, 0);
+ let b = i16x4::new(i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ let c = i16x4::new(i16::MIN, i16::MIN, i16::MIN, i16::MIN);
+ let e = i16x4::new(i16::MAX, i16::MIN, i16::MAX, i16::MIN);
+ let r: i16x4 = transmute(vbsl_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_s32() {
+ let a = u32x2::new(u32::MAX, u32::MIN);
+ let b = i32x2::new(i32::MAX, i32::MAX);
+ let c = i32x2::new(i32::MIN, i32::MIN);
+ let e = i32x2::new(i32::MAX, i32::MIN);
+ let r: i32x2 = transmute(vbsl_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_s64() {
+ let a = u64x1::new(u64::MAX);
+ let b = i64x1::new(i64::MAX);
+ let c = i64x1::new(i64::MIN);
+ let e = i64x1::new(i64::MAX);
+ let r: i64x1 = transmute(vbsl_s64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_u8() {
+ let a = u8x8::new(u8::MAX, 0, u8::MAX, 0, u8::MAX, 0, u8::MAX, 0);
+ let b = u8x8::new(
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ );
+ let c = u8x8::new(
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ );
+ let e = u8x8::new(
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ );
+ let r: u8x8 = transmute(vbsl_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_u16() {
+ let a = u16x4::new(u16::MAX, 0, u16::MAX, 0);
+ let b = u16x4::new(u16::MAX, u16::MAX, u16::MAX, u16::MAX);
+ let c = u16x4::new(u16::MIN, u16::MIN, u16::MIN, u16::MIN);
+ let e = u16x4::new(u16::MAX, u16::MIN, u16::MAX, u16::MIN);
+ let r: u16x4 = transmute(vbsl_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_u32() {
+ let a = u32x2::new(u32::MAX, 0);
+ let b = u32x2::new(u32::MAX, u32::MAX);
+ let c = u32x2::new(u32::MIN, u32::MIN);
+ let e = u32x2::new(u32::MAX, u32::MIN);
+ let r: u32x2 = transmute(vbsl_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_u64() {
+ let a = u64x1::new(u64::MAX);
+ let b = u64x1::new(u64::MAX);
+ let c = u64x1::new(u64::MIN);
+ let e = u64x1::new(u64::MAX);
+ let r: u64x1 = transmute(vbsl_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_f32() {
+ let a = u32x2::new(u32::MAX, 0);
+ let b = f32x2::new(f32::MAX, f32::MAX);
+ let c = f32x2::new(f32::MIN, f32::MIN);
+ let e = f32x2::new(f32::MAX, f32::MIN);
+ let r: f32x2 = transmute(vbsl_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_p8() {
+ let a = u8x8::new(u8::MAX, 0, u8::MAX, 0, u8::MAX, 0, u8::MAX, 0);
+ let b = u8x8::new(
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ );
+ let c = u8x8::new(
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ );
+ let e = u8x8::new(
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ );
+ let r: u8x8 = transmute(vbsl_p8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbsl_p16() {
+ let a = u16x4::new(u16::MAX, 0, u16::MAX, 0);
+ let b = u16x4::new(u16::MAX, u16::MAX, u16::MAX, u16::MAX);
+ let c = u16x4::new(u16::MIN, u16::MIN, u16::MIN, u16::MIN);
+ let e = u16x4::new(u16::MAX, u16::MIN, u16::MAX, u16::MIN);
+ let r: u16x4 = transmute(vbsl_p16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_s8() {
+ let a = u8x16::new(
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ );
+ let b = i8x16::new(
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ i8::MAX,
+ );
+ let c = i8x16::new(
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ i8::MIN,
+ );
+ let e = i8x16::new(
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ i8::MAX,
+ i8::MIN,
+ );
+ let r: i8x16 = transmute(vbslq_s8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_s16() {
+ let a = u16x8::new(u16::MAX, 0, u16::MAX, 0, u16::MAX, 0, u16::MAX, 0);
+ let b = i16x8::new(
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ );
+ let c = i16x8::new(
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ );
+ let e = i16x8::new(
+ i16::MAX,
+ i16::MIN,
+ i16::MAX,
+ i16::MIN,
+ i16::MAX,
+ i16::MIN,
+ i16::MAX,
+ i16::MIN,
+ );
+ let r: i16x8 = transmute(vbslq_s16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_s32() {
+ let a = u32x4::new(u32::MAX, 0, u32::MAX, 0);
+ let b = i32x4::new(i32::MAX, i32::MAX, i32::MAX, i32::MAX);
+ let c = i32x4::new(i32::MIN, i32::MIN, i32::MIN, i32::MIN);
+ let e = i32x4::new(i32::MAX, i32::MIN, i32::MAX, i32::MIN);
+ let r: i32x4 = transmute(vbslq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_s64() {
+ let a = u64x2::new(u64::MAX, 0);
+ let b = i64x2::new(i64::MAX, i64::MAX);
+ let c = i64x2::new(i64::MIN, i64::MIN);
+ let e = i64x2::new(i64::MAX, i64::MIN);
+ let r: i64x2 = transmute(vbslq_s64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_u8() {
+ let a = u8x16::new(
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ );
+ let b = u8x16::new(
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ );
+ let c = u8x16::new(
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ );
+ let e = u8x16::new(
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ );
+ let r: u8x16 = transmute(vbslq_u8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_u16() {
+ let a = u16x8::new(u16::MAX, 0, u16::MAX, 0, u16::MAX, 0, u16::MAX, 0);
+ let b = u16x8::new(
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ );
+ let c = u16x8::new(
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ );
+ let e = u16x8::new(
+ u16::MAX,
+ u16::MIN,
+ u16::MAX,
+ u16::MIN,
+ u16::MAX,
+ u16::MIN,
+ u16::MAX,
+ u16::MIN,
+ );
+ let r: u16x8 = transmute(vbslq_u16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_u32() {
+ let a = u32x4::new(u32::MAX, 0, u32::MAX, 0);
+ let b = u32x4::new(u32::MAX, u32::MAX, u32::MAX, u32::MAX);
+ let c = u32x4::new(u32::MIN, u32::MIN, u32::MIN, u32::MIN);
+ let e = u32x4::new(u32::MAX, u32::MIN, u32::MAX, u32::MIN);
+ let r: u32x4 = transmute(vbslq_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_u64() {
+ let a = u64x2::new(u64::MAX, 0);
+ let b = u64x2::new(u64::MAX, u64::MAX);
+ let c = u64x2::new(u64::MIN, u64::MIN);
+ let e = u64x2::new(u64::MAX, u64::MIN);
+ let r: u64x2 = transmute(vbslq_u64(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_f32() {
+ let a = u32x4::new(u32::MAX, 0, u32::MAX, 0);
+ let b = f32x4::new(f32::MAX, f32::MAX, f32::MAX, f32::MAX);
+ let c = f32x4::new(f32::MIN, f32::MIN, f32::MIN, f32::MIN);
+ let e = f32x4::new(f32::MAX, f32::MIN, f32::MAX, f32::MIN);
+ let r: f32x4 = transmute(vbslq_f32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_p8() {
+ let a = u8x16::new(
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ u8::MAX,
+ 0,
+ );
+ let b = u8x16::new(
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ u8::MAX,
+ );
+ let c = u8x16::new(
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ u8::MIN,
+ );
+ let e = u8x16::new(
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ u8::MAX,
+ u8::MIN,
+ );
+ let r: u8x16 = transmute(vbslq_p8(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vbslq_p16() {
+ let a = u16x8::new(u16::MAX, 0, u16::MAX, 0, u16::MAX, 0, u16::MAX, 0);
+ let b = u16x8::new(
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ u16::MAX,
+ );
+ let c = u16x8::new(
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ u16::MIN,
+ );
+ let e = u16x8::new(
+ u16::MAX,
+ u16::MIN,
+ u16::MAX,
+ u16::MIN,
+ u16::MAX,
+ u16::MIN,
+ u16::MAX,
+ u16::MIN,
+ );
+ let r: u16x8 = transmute(vbslq_p16(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
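+    // VORN (bitwise OR complement) computes `a | !b` for each lane; the
+    // expected vectors below are the hand-computed results.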
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorn_s8() {
+ let a = i8x8::new(0, -1, -2, -3, -4, -5, -6, -7);
+ let b = i8x8::new(-2, -2, -2, -2, -2, -2, -2, -2);
+ let e = i8x8::new(1, -1, -1, -3, -3, -5, -5, -7);
+ let r: i8x8 = transmute(vorn_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vornq_s8() {
+ let a = i8x16::new(
+ 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+ );
+ let b = i8x16::new(
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ );
+ let e = i8x16::new(
+ 1, -1, -1, -3, -3, -5, -5, -7, -7, -9, -9, -11, -11, -13, -13, -15,
+ );
+ let r: i8x16 = transmute(vornq_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorn_s16() {
+ let a = i16x4::new(0, -1, -2, -3);
+ let b = i16x4::new(-2, -2, -2, -2);
+ let e = i16x4::new(1, -1, -1, -3);
+ let r: i16x4 = transmute(vorn_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vornq_s16() {
+ let a = i16x8::new(0, -1, -2, -3, -4, -5, -6, -7);
+ let b = i16x8::new(-2, -2, -2, -2, -2, -2, -2, -2);
+ let e = i16x8::new(1, -1, -1, -3, -3, -5, -5, -7);
+ let r: i16x8 = transmute(vornq_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorn_s32() {
+ let a = i32x2::new(0, -1);
+ let b = i32x2::new(-2, -2);
+ let e = i32x2::new(1, -1);
+ let r: i32x2 = transmute(vorn_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vornq_s32() {
+ let a = i32x4::new(0, -1, -2, -3);
+ let b = i32x4::new(-2, -2, -2, -2);
+ let e = i32x4::new(1, -1, -1, -3);
+ let r: i32x4 = transmute(vornq_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorn_s64() {
+ let a = i64x1::new(0);
+ let b = i64x1::new(-2);
+ let e = i64x1::new(1);
+ let r: i64x1 = transmute(vorn_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vornq_s64() {
+ let a = i64x2::new(0, -1);
+ let b = i64x2::new(-2, -2);
+ let e = i64x2::new(1, -1);
+ let r: i64x2 = transmute(vornq_s64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorn_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let t = u8::MAX - 1;
+ let b = u8x8::new(t, t, t, t, t, t, t, t);
+ let e = u8x8::new(1, 1, 3, 3, 5, 5, 7, 7);
+ let r: u8x8 = transmute(vorn_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vornq_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let t = u8::MAX - 1;
+ let b = u8x16::new(t, t, t, t, t, t, t, t, t, t, t, t, t, t, t, t);
+ let e = u8x16::new(1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
+ let r: u8x16 = transmute(vornq_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorn_u16() {
+ let a = u16x4::new(0, 1, 2, 3);
+ let t = u16::MAX - 1;
+ let b = u16x4::new(t, t, t, t);
+ let e = u16x4::new(1, 1, 3, 3);
+ let r: u16x4 = transmute(vorn_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vornq_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let t = u16::MAX - 1;
+ let b = u16x8::new(t, t, t, t, t, t, t, t);
+ let e = u16x8::new(1, 1, 3, 3, 5, 5, 7, 7);
+ let r: u16x8 = transmute(vornq_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorn_u32() {
+ let a = u32x2::new(0, 1);
+ let t = u32::MAX - 1;
+ let b = u32x2::new(t, t);
+ let e = u32x2::new(1, 1);
+ let r: u32x2 = transmute(vorn_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vornq_u32() {
+ let a = u32x4::new(0, 1, 2, 3);
+ let t = u32::MAX - 1;
+ let b = u32x4::new(t, t, t, t);
+ let e = u32x4::new(1, 1, 3, 3);
+ let r: u32x4 = transmute(vornq_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorn_u64() {
+ let a = u64x1::new(0);
+ let t = u64::MAX - 1;
+ let b = u64x1::new(t);
+ let e = u64x1::new(1);
+ let r: u64x1 = transmute(vorn_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vornq_u64() {
+ let a = u64x2::new(0, 1);
+ let t = u64::MAX - 1;
+ let b = u64x2::new(t, t);
+ let e = u64x2::new(1, 1);
+ let r: u64x2 = transmute(vornq_u64(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
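+    // VMOVN narrows each lane to half its width, keeping only the low bits.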
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_s16() {
+ let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i8x8 = transmute(vmovn_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_s32() {
+ let a = i32x4::new(1, 2, 3, 4);
+ let e = i16x4::new(1, 2, 3, 4);
+ let r: i16x4 = transmute(vmovn_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_s64() {
+ let a = i64x2::new(1, 2);
+ let e = i32x2::new(1, 2);
+ let r: i32x2 = transmute(vmovn_s64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_u16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u8x8 = transmute(vmovn_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_u32() {
+ let a = u32x4::new(1, 2, 3, 4);
+ let e = u16x4::new(1, 2, 3, 4);
+ let r: u16x4 = transmute(vmovn_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovn_u64() {
+ let a = u64x2::new(1, 2);
+ let e = u32x2::new(1, 2);
+ let r: u32x2 = transmute(vmovn_u64(transmute(a)));
+ assert_eq!(r, e);
+ }
+
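+    // VMOVL widens each lane to twice its width, sign- or zero-extending
+    // according to the element type.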
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_s8() {
+ let e = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: i16x8 = transmute(vmovl_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_s16() {
+ let e = i32x4::new(1, 2, 3, 4);
+ let a = i16x4::new(1, 2, 3, 4);
+ let r: i32x4 = transmute(vmovl_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_s32() {
+ let e = i64x2::new(1, 2);
+ let a = i32x2::new(1, 2);
+ let r: i64x2 = transmute(vmovl_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_u8() {
+ let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let r: u16x8 = transmute(vmovl_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_u16() {
+ let e = u32x4::new(1, 2, 3, 4);
+ let a = u16x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vmovl_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmovl_u32() {
+ let e = u64x2::new(1, 2);
+ let a = u32x2::new(1, 2);
+ let r: u64x2 = transmute(vmovl_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+
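+    // VPMIN reduces adjacent pairs: the low half of the result holds the
+    // pairwise minima from `a`, the high half those from `b`.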
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmin_s8() {
+ let a = i8x8::new(1, -2, 3, -4, 5, 6, 7, 8);
+ let b = i8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let e = i8x8::new(-2, -4, 5, 7, 0, 2, 4, 6);
+ let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmin_s16() {
+ let a = i16x4::new(1, 2, 3, -4);
+ let b = i16x4::new(0, 3, 2, 5);
+ let e = i16x4::new(1, -4, 0, 2);
+ let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmin_s32() {
+ let a = i32x2::new(1, -2);
+ let b = i32x2::new(0, 3);
+ let e = i32x2::new(-2, 0);
+ let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmin_u8() {
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = u8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let e = u8x8::new(1, 3, 5, 7, 0, 2, 4, 6);
+ let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmin_u16() {
+ let a = u16x4::new(1, 2, 3, 4);
+ let b = u16x4::new(0, 3, 2, 5);
+ let e = u16x4::new(1, 3, 0, 2);
+ let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmin_u32() {
+ let a = u32x2::new(1, 2);
+ let b = u32x2::new(0, 3);
+ let e = u32x2::new(1, 0);
+ let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmin_f32() {
+ let a = f32x2::new(1., -2.);
+ let b = f32x2::new(0., 3.);
+ let e = f32x2::new(-2., 0.);
+ let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
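+    // VPMAX is the pairwise-maximum counterpart of VPMIN.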
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmax_s8() {
+ let a = i8x8::new(1, -2, 3, -4, 5, 6, 7, 8);
+ let b = i8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let e = i8x8::new(1, 3, 6, 8, 3, 5, 7, 9);
+ let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmax_s16() {
+ let a = i16x4::new(1, 2, 3, -4);
+ let b = i16x4::new(0, 3, 2, 5);
+ let e = i16x4::new(2, 3, 3, 5);
+ let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmax_s32() {
+ let a = i32x2::new(1, -2);
+ let b = i32x2::new(0, 3);
+ let e = i32x2::new(1, 3);
+ let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmax_u8() {
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = u8x8::new(0, 3, 2, 5, 4, 7, 6, 9);
+ let e = u8x8::new(2, 4, 6, 8, 3, 5, 7, 9);
+ let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmax_u16() {
+ let a = u16x4::new(1, 2, 3, 4);
+ let b = u16x4::new(0, 3, 2, 5);
+ let e = u16x4::new(2, 4, 3, 5);
+ let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmax_u32() {
+ let a = u32x2::new(1, 2);
+ let b = u32x2::new(0, 3);
+ let e = u32x2::new(2, 3);
+ let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpmax_f32() {
+ let a = f32x2::new(1., -2.);
+ let b = f32x2::new(0., 3.);
+ let e = f32x2::new(1., 3.);
+ let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+ }
+
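+    // The bitwise AND/OR/XOR intrinsics are exercised through the shared
+    // `test_bit_*`/`testq_bit_*` helpers against a scalar reference closure.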
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_s8() {
+ test_bit_s8(|i, j| vand_s8(i, j), |a: i8, b: i8| -> i8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_s8() {
+ testq_bit_s8(|i, j| vandq_s8(i, j), |a: i8, b: i8| -> i8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_s16() {
+ test_bit_s16(|i, j| vand_s16(i, j), |a: i16, b: i16| -> i16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_s16() {
+ testq_bit_s16(|i, j| vandq_s16(i, j), |a: i16, b: i16| -> i16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_s32() {
+ test_bit_s32(|i, j| vand_s32(i, j), |a: i32, b: i32| -> i32 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_s32() {
+ testq_bit_s32(|i, j| vandq_s32(i, j), |a: i32, b: i32| -> i32 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_s64() {
+ test_bit_s64(|i, j| vand_s64(i, j), |a: i64, b: i64| -> i64 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_s64() {
+ testq_bit_s64(|i, j| vandq_s64(i, j), |a: i64, b: i64| -> i64 { a & b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_u8() {
+ test_bit_u8(|i, j| vand_u8(i, j), |a: u8, b: u8| -> u8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_u8() {
+ testq_bit_u8(|i, j| vandq_u8(i, j), |a: u8, b: u8| -> u8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_u16() {
+ test_bit_u16(|i, j| vand_u16(i, j), |a: u16, b: u16| -> u16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_u16() {
+ testq_bit_u16(|i, j| vandq_u16(i, j), |a: u16, b: u16| -> u16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_u32() {
+ test_bit_u32(|i, j| vand_u32(i, j), |a: u32, b: u32| -> u32 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_u32() {
+ testq_bit_u32(|i, j| vandq_u32(i, j), |a: u32, b: u32| -> u32 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vand_u64() {
+ test_bit_u64(|i, j| vand_u64(i, j), |a: u64, b: u64| -> u64 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vandq_u64() {
+ testq_bit_u64(|i, j| vandq_u64(i, j), |a: u64, b: u64| -> u64 { a & b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_s8() {
+ test_bit_s8(|i, j| vorr_s8(i, j), |a: i8, b: i8| -> i8 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_s8() {
+ testq_bit_s8(|i, j| vorrq_s8(i, j), |a: i8, b: i8| -> i8 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_s16() {
+ test_bit_s16(|i, j| vorr_s16(i, j), |a: i16, b: i16| -> i16 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_s16() {
+ testq_bit_s16(|i, j| vorrq_s16(i, j), |a: i16, b: i16| -> i16 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_s32() {
+ test_bit_s32(|i, j| vorr_s32(i, j), |a: i32, b: i32| -> i32 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_s32() {
+ testq_bit_s32(|i, j| vorrq_s32(i, j), |a: i32, b: i32| -> i32 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_s64() {
+ test_bit_s64(|i, j| vorr_s64(i, j), |a: i64, b: i64| -> i64 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_s64() {
+ testq_bit_s64(|i, j| vorrq_s64(i, j), |a: i64, b: i64| -> i64 { a | b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_u8() {
+ test_bit_u8(|i, j| vorr_u8(i, j), |a: u8, b: u8| -> u8 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_u8() {
+ testq_bit_u8(|i, j| vorrq_u8(i, j), |a: u8, b: u8| -> u8 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_u16() {
+ test_bit_u16(|i, j| vorr_u16(i, j), |a: u16, b: u16| -> u16 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_u16() {
+ testq_bit_u16(|i, j| vorrq_u16(i, j), |a: u16, b: u16| -> u16 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_u32() {
+ test_bit_u32(|i, j| vorr_u32(i, j), |a: u32, b: u32| -> u32 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_u32() {
+ testq_bit_u32(|i, j| vorrq_u32(i, j), |a: u32, b: u32| -> u32 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorr_u64() {
+ test_bit_u64(|i, j| vorr_u64(i, j), |a: u64, b: u64| -> u64 { a | b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vorrq_u64() {
+ testq_bit_u64(|i, j| vorrq_u64(i, j), |a: u64, b: u64| -> u64 { a | b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_s8() {
+ test_bit_s8(|i, j| veor_s8(i, j), |a: i8, b: i8| -> i8 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_s8() {
+ testq_bit_s8(|i, j| veorq_s8(i, j), |a: i8, b: i8| -> i8 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_s16() {
+ test_bit_s16(|i, j| veor_s16(i, j), |a: i16, b: i16| -> i16 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_s16() {
+ testq_bit_s16(|i, j| veorq_s16(i, j), |a: i16, b: i16| -> i16 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_s32() {
+ test_bit_s32(|i, j| veor_s32(i, j), |a: i32, b: i32| -> i32 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_s32() {
+ testq_bit_s32(|i, j| veorq_s32(i, j), |a: i32, b: i32| -> i32 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_s64() {
+ test_bit_s64(|i, j| veor_s64(i, j), |a: i64, b: i64| -> i64 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_s64() {
+ testq_bit_s64(|i, j| veorq_s64(i, j), |a: i64, b: i64| -> i64 { a ^ b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_u8() {
+ test_bit_u8(|i, j| veor_u8(i, j), |a: u8, b: u8| -> u8 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_u8() {
+ testq_bit_u8(|i, j| veorq_u8(i, j), |a: u8, b: u8| -> u8 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_u16() {
+ test_bit_u16(|i, j| veor_u16(i, j), |a: u16, b: u16| -> u16 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_u16() {
+ testq_bit_u16(|i, j| veorq_u16(i, j), |a: u16, b: u16| -> u16 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_u32() {
+ test_bit_u32(|i, j| veor_u32(i, j), |a: u32, b: u32| -> u32 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_u32() {
+ testq_bit_u32(|i, j| veorq_u32(i, j), |a: u32, b: u32| -> u32 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veor_u64() {
+ test_bit_u64(|i, j| veor_u64(i, j), |a: u64, b: u64| -> u64 { a ^ b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_veorq_u64() {
+ testq_bit_u64(|i, j| veorq_u64(i, j), |a: u64, b: u64| -> u64 { a ^ b });
+ }
+
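+    // For the comparison intrinsics, a lane that compares true is set to all
+    // ones (0xFF, 0xFFFF, or 0xFFFFFFFF depending on lane width) and a lane
+    // that compares false is set to zero.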
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_s8() {
+ test_cmp_s8(
+ |i, j| vceq_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a == b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_s8() {
+ testq_cmp_s8(
+ |i, j| vceqq_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a == b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_s16() {
+ test_cmp_s16(
+ |i, j| vceq_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a == b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_s16() {
+ testq_cmp_s16(
+ |i, j| vceqq_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a == b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_s32() {
+ test_cmp_s32(
+ |i, j| vceq_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a == b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_s32() {
+ testq_cmp_s32(
+ |i, j| vceqq_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a == b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_u8() {
+ test_cmp_u8(
+ |i, j| vceq_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a == b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_u8() {
+ testq_cmp_u8(
+ |i, j| vceqq_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a == b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_u16() {
+ test_cmp_u16(
+ |i, j| vceq_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a == b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_u16() {
+ testq_cmp_u16(
+ |i, j| vceqq_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a == b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_u32() {
+ test_cmp_u32(
+ |i, j| vceq_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a == b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_u32() {
+ testq_cmp_u32(
+ |i, j| vceqq_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a == b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceq_f32() {
+ test_cmp_f32(
+            |i, j| vceq_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a == b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vceqq_f32() {
+ testq_cmp_f32(
+            |i, j| vceqq_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a == b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_s8() {
+ test_cmp_s8(
+ |i, j| vcgt_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a > b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_s8() {
+ testq_cmp_s8(
+ |i, j| vcgtq_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a > b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_s16() {
+ test_cmp_s16(
+ |i, j| vcgt_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a > b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_s16() {
+ testq_cmp_s16(
+ |i, j| vcgtq_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a > b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_s32() {
+ test_cmp_s32(
+ |i, j| vcgt_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a > b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_s32() {
+ testq_cmp_s32(
+ |i, j| vcgtq_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a > b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_u8() {
+ test_cmp_u8(
+ |i, j| vcgt_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a > b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_u8() {
+ testq_cmp_u8(
+ |i, j| vcgtq_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a > b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_u16() {
+ test_cmp_u16(
+ |i, j| vcgt_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a > b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_u16() {
+ testq_cmp_u16(
+ |i, j| vcgtq_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a > b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_u32() {
+ test_cmp_u32(
+ |i, j| vcgt_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a > b {
+                0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_u32() {
+ testq_cmp_u32(
+ |i, j| vcgtq_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a > b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgt_f32() {
+ test_cmp_f32(
+ |i, j| vcgt_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a > b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgtq_f32() {
+ testq_cmp_f32(
+ |i, j| vcgtq_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a > b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_s8() {
+ test_cmp_s8(
+ |i, j| vclt_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a < b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_s8() {
+ testq_cmp_s8(
+ |i, j| vcltq_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a < b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_s16() {
+ test_cmp_s16(
+ |i, j| vclt_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a < b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_s16() {
+ testq_cmp_s16(
+ |i, j| vcltq_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a < b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_s32() {
+ test_cmp_s32(
+ |i, j| vclt_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a < b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_s32() {
+ testq_cmp_s32(
+ |i, j| vcltq_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a < b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_u8() {
+ test_cmp_u8(
+ |i, j| vclt_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a < b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_u8() {
+ testq_cmp_u8(
+ |i, j| vcltq_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a < b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_u16() {
+ test_cmp_u16(
+ |i, j| vclt_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a < b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_u16() {
+ testq_cmp_u16(
+ |i, j| vcltq_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a < b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_u32() {
+ test_cmp_u32(
+ |i, j| vclt_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a < b {
+                0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_u32() {
+ testq_cmp_u32(
+ |i, j| vcltq_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a < b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vclt_f32() {
+ test_cmp_f32(
+ |i, j| vclt_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a < b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcltq_f32() {
+ testq_cmp_f32(
+ |i, j| vcltq_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a < b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_s8() {
+ test_cmp_s8(
+ |i, j| vcle_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a <= b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_s8() {
+ testq_cmp_s8(
+ |i, j| vcleq_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a <= b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_s16() {
+ test_cmp_s16(
+ |i, j| vcle_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a <= b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_s16() {
+ testq_cmp_s16(
+ |i, j| vcleq_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a <= b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_s32() {
+ test_cmp_s32(
+ |i, j| vcle_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a <= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_s32() {
+ testq_cmp_s32(
+ |i, j| vcleq_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a <= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_u8() {
+ test_cmp_u8(
+ |i, j| vcle_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a <= b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_u8() {
+ testq_cmp_u8(
+ |i, j| vcleq_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a <= b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_u16() {
+ test_cmp_u16(
+ |i, j| vcle_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a <= b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_u16() {
+ testq_cmp_u16(
+ |i, j| vcleq_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a <= b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_u32() {
+ test_cmp_u32(
+ |i, j| vcle_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a <= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_u32() {
+ testq_cmp_u32(
+ |i, j| vcleq_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a <= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcle_f32() {
+ test_cmp_f32(
+ |i, j| vcle_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a <= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcleq_f32() {
+ testq_cmp_f32(
+ |i, j| vcleq_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a <= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_s8() {
+ test_cmp_s8(
+ |i, j| vcge_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a >= b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_s8() {
+ testq_cmp_s8(
+ |i, j| vcgeq_s8(i, j),
+ |a: i8, b: i8| -> u8 {
+ if a >= b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_s16() {
+ test_cmp_s16(
+ |i, j| vcge_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a >= b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_s16() {
+ testq_cmp_s16(
+ |i, j| vcgeq_s16(i, j),
+ |a: i16, b: i16| -> u16 {
+ if a >= b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_s32() {
+ test_cmp_s32(
+ |i, j| vcge_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a >= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_s32() {
+ testq_cmp_s32(
+ |i, j| vcgeq_s32(i, j),
+ |a: i32, b: i32| -> u32 {
+ if a >= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_u8() {
+ test_cmp_u8(
+ |i, j| vcge_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a >= b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_u8() {
+ testq_cmp_u8(
+ |i, j| vcgeq_u8(i, j),
+ |a: u8, b: u8| -> u8 {
+ if a >= b {
+ 0xFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_u16() {
+ test_cmp_u16(
+ |i, j| vcge_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a >= b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_u16() {
+ testq_cmp_u16(
+ |i, j| vcgeq_u16(i, j),
+ |a: u16, b: u16| -> u16 {
+ if a >= b {
+ 0xFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_u32() {
+ test_cmp_u32(
+ |i, j| vcge_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a >= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_u32() {
+ testq_cmp_u32(
+ |i, j| vcgeq_u32(i, j),
+ |a: u32, b: u32| -> u32 {
+ if a >= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcge_f32() {
+ test_cmp_f32(
+ |i, j| vcge_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a >= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcgeq_f32() {
+ testq_cmp_f32(
+ |i, j| vcgeq_f32(i, j),
+ |a: f32, b: f32| -> u32 {
+ if a >= b {
+ 0xFFFFFFFF
+ } else {
+ 0
+ }
+ },
+ );
+ }
+
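+    // VQSUB saturates: results clamp to the lane type's range instead of
+    // wrapping.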
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_s8() {
+ test_ari_s8(
+ |i, j| vqsub_s8(i, j),
+ |a: i8, b: i8| -> i8 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_s8() {
+ testq_ari_s8(
+ |i, j| vqsubq_s8(i, j),
+ |a: i8, b: i8| -> i8 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_s16() {
+ test_ari_s16(
+ |i, j| vqsub_s16(i, j),
+ |a: i16, b: i16| -> i16 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_s16() {
+ testq_ari_s16(
+ |i, j| vqsubq_s16(i, j),
+ |a: i16, b: i16| -> i16 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_s32() {
+ test_ari_s32(
+ |i, j| vqsub_s32(i, j),
+ |a: i32, b: i32| -> i32 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_s32() {
+ testq_ari_s32(
+ |i, j| vqsubq_s32(i, j),
+ |a: i32, b: i32| -> i32 { a.saturating_sub(b) },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_u8() {
+ test_ari_u8(
+ |i, j| vqsub_u8(i, j),
+ |a: u8, b: u8| -> u8 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_u8() {
+ testq_ari_u8(
+ |i, j| vqsubq_u8(i, j),
+ |a: u8, b: u8| -> u8 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_u16() {
+ test_ari_u16(
+ |i, j| vqsub_u16(i, j),
+ |a: u16, b: u16| -> u16 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_u16() {
+ testq_ari_u16(
+ |i, j| vqsubq_u16(i, j),
+ |a: u16, b: u16| -> u16 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsub_u32() {
+ test_ari_u32(
+ |i, j| vqsub_u32(i, j),
+ |a: u32, b: u32| -> u32 { a.saturating_sub(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqsubq_u32() {
+ testq_ari_u32(
+ |i, j| vqsubq_u32(i, j),
+ |a: u32, b: u32| -> u32 { a.saturating_sub(b) },
+ );
+ }
+
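+    // VHADD is a halving addition: each lane computes `(a + b) >> 1`, with
+    // the sum formed in a wider type so it cannot overflow.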
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_s8() {
+ test_ari_s8(|i, j| vhadd_s8(i, j), |a: i8, b: i8| -> i8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_s8() {
+ testq_ari_s8(|i, j| vhaddq_s8(i, j), |a: i8, b: i8| -> i8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_s16() {
+ test_ari_s16(|i, j| vhadd_s16(i, j), |a: i16, b: i16| -> i16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_s16() {
+ testq_ari_s16(|i, j| vhaddq_s16(i, j), |a: i16, b: i16| -> i16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_s32() {
+ test_ari_s32(|i, j| vhadd_s32(i, j), |a: i32, b: i32| -> i32 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_s32() {
+ testq_ari_s32(|i, j| vhaddq_s32(i, j), |a: i32, b: i32| -> i32 { a & b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_u8() {
+ test_ari_u8(|i, j| vhadd_u8(i, j), |a: u8, b: u8| -> u8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_u8() {
+ testq_ari_u8(|i, j| vhaddq_u8(i, j), |a: u8, b: u8| -> u8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_u16() {
+ test_ari_u16(|i, j| vhadd_u16(i, j), |a: u16, b: u16| -> u16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_u16() {
+ testq_ari_u16(|i, j| vhaddq_u16(i, j), |a: u16, b: u16| -> u16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhadd_u32() {
+ test_ari_u32(|i, j| vhadd_u32(i, j), |a: u32, b: u32| -> u32 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhaddq_u32() {
+ testq_ari_u32(|i, j| vhaddq_u32(i, j), |a: u32, b: u32| -> u32 { a & b });
+ }
+
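+    // VRHADD is the rounding variant of VHADD: `(a + b + 1) >> 1`.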
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_s8() {
+ test_ari_s8(|i, j| vrhadd_s8(i, j), |a: i8, b: i8| -> i8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_s8() {
+ testq_ari_s8(|i, j| vrhaddq_s8(i, j), |a: i8, b: i8| -> i8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_s16() {
+ test_ari_s16(|i, j| vrhadd_s16(i, j), |a: i16, b: i16| -> i16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_s16() {
+ testq_ari_s16(|i, j| vrhaddq_s16(i, j), |a: i16, b: i16| -> i16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_s32() {
+ test_ari_s32(|i, j| vrhadd_s32(i, j), |a: i32, b: i32| -> i32 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_s32() {
+ testq_ari_s32(|i, j| vrhaddq_s32(i, j), |a: i32, b: i32| -> i32 { a & b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_u8() {
+ test_ari_u8(|i, j| vrhadd_u8(i, j), |a: u8, b: u8| -> u8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_u8() {
+ testq_ari_u8(|i, j| vrhaddq_u8(i, j), |a: u8, b: u8| -> u8 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_u16() {
+ test_ari_u16(|i, j| vrhadd_u16(i, j), |a: u16, b: u16| -> u16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_u16() {
+ testq_ari_u16(|i, j| vrhaddq_u16(i, j), |a: u16, b: u16| -> u16 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhadd_u32() {
+ test_ari_u32(|i, j| vrhadd_u32(i, j), |a: u32, b: u32| -> u32 { a & b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrhaddq_u32() {
+ testq_ari_u32(|i, j| vrhaddq_u32(i, j), |a: u32, b: u32| -> u32 { a & b });
+ }
+
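+    // VQADD saturates on overflow, mirroring VQSUB above.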
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_s8() {
+ test_ari_s8(
+ |i, j| vqadd_s8(i, j),
+ |a: i8, b: i8| -> i8 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_s8() {
+ testq_ari_s8(
+ |i, j| vqaddq_s8(i, j),
+ |a: i8, b: i8| -> i8 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_s16() {
+ test_ari_s16(
+ |i, j| vqadd_s16(i, j),
+ |a: i16, b: i16| -> i16 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_s16() {
+ testq_ari_s16(
+ |i, j| vqaddq_s16(i, j),
+ |a: i16, b: i16| -> i16 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_s32() {
+ test_ari_s32(
+ |i, j| vqadd_s32(i, j),
+ |a: i32, b: i32| -> i32 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_s32() {
+ testq_ari_s32(
+ |i, j| vqaddq_s32(i, j),
+ |a: i32, b: i32| -> i32 { a.saturating_add(b) },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_u8() {
+ test_ari_u8(
+ |i, j| vqadd_u8(i, j),
+ |a: u8, b: u8| -> u8 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_u8() {
+ testq_ari_u8(
+ |i, j| vqaddq_u8(i, j),
+ |a: u8, b: u8| -> u8 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_u16() {
+ test_ari_u16(
+ |i, j| vqadd_u16(i, j),
+ |a: u16, b: u16| -> u16 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_u16() {
+ testq_ari_u16(
+ |i, j| vqaddq_u16(i, j),
+ |a: u16, b: u16| -> u16 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqadd_u32() {
+ test_ari_u32(
+ |i, j| vqadd_u32(i, j),
+ |a: u32, b: u32| -> u32 { a.saturating_add(b) },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vqaddq_u32() {
+ testq_ari_u32(
+ |i, j| vqaddq_u32(i, j),
+ |a: u32, b: u32| -> u32 { a.saturating_add(b) },
+ );
+ }
+
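+    // VMUL wraps on overflow, hence `overflowing_mul` in the reference.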
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_s8() {
+ test_ari_s8(
+ |i, j| vmul_s8(i, j),
+ |a: i8, b: i8| -> i8 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_s8() {
+ testq_ari_s8(
+ |i, j| vmulq_s8(i, j),
+ |a: i8, b: i8| -> i8 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_s16() {
+ test_ari_s16(
+ |i, j| vmul_s16(i, j),
+ |a: i16, b: i16| -> i16 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_s16() {
+ testq_ari_s16(
+ |i, j| vmulq_s16(i, j),
+ |a: i16, b: i16| -> i16 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_s32() {
+ test_ari_s32(
+ |i, j| vmul_s32(i, j),
+ |a: i32, b: i32| -> i32 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_s32() {
+ testq_ari_s32(
+ |i, j| vmulq_s32(i, j),
+ |a: i32, b: i32| -> i32 { a.overflowing_mul(b).0 },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_u8() {
+ test_ari_u8(
+ |i, j| vmul_u8(i, j),
+ |a: u8, b: u8| -> u8 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_u8() {
+ testq_ari_u8(
+ |i, j| vmulq_u8(i, j),
+ |a: u8, b: u8| -> u8 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_u16() {
+ test_ari_u16(
+ |i, j| vmul_u16(i, j),
+ |a: u16, b: u16| -> u16 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_u16() {
+ testq_ari_u16(
+ |i, j| vmulq_u16(i, j),
+ |a: u16, b: u16| -> u16 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_u32() {
+ test_ari_u32(
+ |i, j| vmul_u32(i, j),
+ |a: u32, b: u32| -> u32 { a.overflowing_mul(b).0 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_u32() {
+ testq_ari_u32(
+ |i, j| vmulq_u32(i, j),
+ |a: u32, b: u32| -> u32 { a.overflowing_mul(b).0 },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmul_f32() {
+ test_ari_f32(|i, j| vmul_f32(i, j), |a: f32, b: f32| -> f32 { a * b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vmulq_f32() {
+ testq_ari_f32(|i, j| vmulq_f32(i, j), |a: f32, b: f32| -> f32 { a * b });
+ }
+
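+    // The VSUB references use plain `-`, so the shared test inputs must not
+    // underflow or overflow (that would panic in debug builds).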
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_s8() {
+ test_ari_s8(|i, j| vsub_s8(i, j), |a: i8, b: i8| -> i8 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_s8() {
+ testq_ari_s8(|i, j| vsubq_s8(i, j), |a: i8, b: i8| -> i8 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_s16() {
+ test_ari_s16(|i, j| vsub_s16(i, j), |a: i16, b: i16| -> i16 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_s16() {
+ testq_ari_s16(|i, j| vsubq_s16(i, j), |a: i16, b: i16| -> i16 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_s32() {
+ test_ari_s32(|i, j| vsub_s32(i, j), |a: i32, b: i32| -> i32 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_s32() {
+ testq_ari_s32(|i, j| vsubq_s32(i, j), |a: i32, b: i32| -> i32 { a - b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_u8() {
+ test_ari_u8(|i, j| vsub_u8(i, j), |a: u8, b: u8| -> u8 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_u8() {
+ testq_ari_u8(|i, j| vsubq_u8(i, j), |a: u8, b: u8| -> u8 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_u16() {
+ test_ari_u16(|i, j| vsub_u16(i, j), |a: u16, b: u16| -> u16 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_u16() {
+ testq_ari_u16(|i, j| vsubq_u16(i, j), |a: u16, b: u16| -> u16 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_u32() {
+ test_ari_u32(|i, j| vsub_u32(i, j), |a: u32, b: u32| -> u32 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_u32() {
+ testq_ari_u32(|i, j| vsubq_u32(i, j), |a: u32, b: u32| -> u32 { a - b });
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsub_f32() {
+ test_ari_f32(|i, j| vsub_f32(i, j), |a: f32, b: f32| -> f32 { a - b });
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vsubq_f32() {
+ testq_ari_f32(|i, j| vsubq_f32(i, j), |a: f32, b: f32| -> f32 { a - b });
+ }
+
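+    // VHSUB is a halving subtraction: `(a - b) >> 1`, with the difference
+    // formed in a wider type for the signed variants.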
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_s8() {
+ test_ari_s8(
+ |i, j| vhsub_s8(i, j),
+            |a: i8, b: i8| -> i8 { (((a as i16) - (b as i16)) >> 1) as i8 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_s8() {
+ testq_ari_s8(
+ |i, j| vhsubq_s8(i, j),
+            |a: i8, b: i8| -> i8 { (((a as i16) - (b as i16)) >> 1) as i8 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_s16() {
+ test_ari_s16(
+ |i, j| vhsub_s16(i, j),
+            |a: i16, b: i16| -> i16 { (((a as i32) - (b as i32)) >> 1) as i16 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_s16() {
+ testq_ari_s16(
+ |i, j| vhsubq_s16(i, j),
+            |a: i16, b: i16| -> i16 { (((a as i32) - (b as i32)) >> 1) as i16 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_s32() {
+ test_ari_s32(
+ |i, j| vhsub_s32(i, j),
+            |a: i32, b: i32| -> i32 { (((a as i64) - (b as i64)) >> 1) as i32 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_s32() {
+ testq_ari_s32(
+ |i, j| vhsubq_s32(i, j),
+            |a: i32, b: i32| -> i32 { (((a as i64) - (b as i64)) >> 1) as i32 },
+ );
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_u8() {
+ test_ari_u8(
+ |i, j| vhsub_u8(i, j),
+ |a: u8, b: u8| -> u8 { (((a as u16) - (b as u16)) / 2) as u8 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_u8() {
+ testq_ari_u8(
+ |i, j| vhsubq_u8(i, j),
+ |a: u8, b: u8| -> u8 { (((a as u16) - (b as u16)) / 2) as u8 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_u16() {
+ test_ari_u16(
+ |i, j| vhsub_u16(i, j),
+            |a: u16, b: u16| -> u16 { (((a as u32) - (b as u32)) / 2) as u16 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_u16() {
+ testq_ari_u16(
+ |i, j| vhsubq_u16(i, j),
+            |a: u16, b: u16| -> u16 { (((a as u32) - (b as u32)) / 2) as u16 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsub_u32() {
+ test_ari_u32(
+ |i, j| vhsub_u32(i, j),
+ |a: u32, b: u32| -> u32 { (((a as u64) - (b as u64)) / 2) as u32 },
+ );
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vhsubq_u32() {
+ testq_ari_u32(
+ |i, j| vhsubq_u32(i, j),
+ |a: u32, b: u32| -> u32 { (((a as u64) - (b as u64)) / 2) as u32 },
+ );
+ }
+
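+    // VABS wraps on the minimum value: `abs(MIN)` yields `MIN` again, as the
+    // expected vectors below encode.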
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabs_s8() {
+ let a = i8x8::new(-1, 0, 1, -2, 0, 2, -128, 127);
+ let r: i8x8 = transmute(vabs_s8(transmute(a)));
+ let e = i8x8::new(1, 0, 1, 2, 0, 2, -128, 127);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabsq_s8() {
+ let a = i8x16::new(-1, 0, 1, -2, 0, 2, -128, 127, -1, 0, 1, -2, 0, 2, -128, 127);
+ let r: i8x16 = transmute(vabsq_s8(transmute(a)));
+ let e = i8x16::new(1, 0, 1, 2, 0, 2, -128, 127, 1, 0, 1, 2, 0, 2, -128, 127);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabs_s16() {
+ let a = i16x4::new(-1, 0, i16::MIN, i16::MAX);
+ let r: i16x4 = transmute(vabs_s16(transmute(a)));
+ let e = i16x4::new(1, 0, i16::MIN, i16::MAX);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabsq_s16() {
+ let a = i16x8::new(-1, 0, i16::MIN, i16::MAX, -1, 0, i16::MIN, i16::MAX);
+ let r: i16x8 = transmute(vabsq_s16(transmute(a)));
+ let e = i16x8::new(1, 0, i16::MIN, i16::MAX, 1, 0, i16::MIN, i16::MAX);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabs_s32() {
+ let a = i32x2::new(i32::MIN, i32::MIN + 1);
+ let r: i32x2 = transmute(vabs_s32(transmute(a)));
+ let e = i32x2::new(i32::MIN, i32::MAX);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabsq_s32() {
+ let a = i32x4::new(i32::MIN, i32::MIN + 1, 0, -1);
+ let r: i32x4 = transmute(vabsq_s32(transmute(a)));
+ let e = i32x4::new(i32::MIN, i32::MAX, 0, 1);
+ assert_eq!(r, e);
+ }
+
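+    // VABA (absolute difference and accumulate) computes `a + |b - c|` per
+    // lane.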
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaba_s8() {
+ let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let c = i8x8::new(10, 9, 8, 7, 6, 5, 4, 3);
+ let r: i8x8 = transmute(vaba_s8(transmute(a), transmute(b), transmute(c)));
+ let e = i8x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaba_s16() {
+ let a = i16x4::new(1, 2, 3, 4);
+ let b = i16x4::new(1, 1, 1, 1);
+ let c = i16x4::new(10, 9, 8, 7);
+ let r: i16x4 = transmute(vaba_s16(transmute(a), transmute(b), transmute(c)));
+ let e = i16x4::new(10, 10, 10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaba_s32() {
+ let a = i32x2::new(1, 2);
+ let b = i32x2::new(1, 1);
+ let c = i32x2::new(10, 9);
+ let r: i32x2 = transmute(vaba_s32(transmute(a), transmute(b), transmute(c)));
+ let e = i32x2::new(10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaba_u8() {
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let c = u8x8::new(10, 9, 8, 7, 6, 5, 4, 3);
+ let r: u8x8 = transmute(vaba_u8(transmute(a), transmute(b), transmute(c)));
+ let e = u8x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaba_u16() {
+ let a = u16x4::new(1, 2, 3, 4);
+ let b = u16x4::new(1, 1, 1, 1);
+ let c = u16x4::new(10, 9, 8, 7);
+ let r: u16x4 = transmute(vaba_u16(transmute(a), transmute(b), transmute(c)));
+ let e = u16x4::new(10, 10, 10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vaba_u32() {
+ let a = u32x2::new(1, 2);
+ let b = u32x2::new(1, 1);
+ let c = u32x2::new(10, 9);
+ let r: u32x2 = transmute(vaba_u32(transmute(a), transmute(b), transmute(c)));
+ let e = u32x2::new(10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabaq_s8() {
+ let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3, 2);
+ let b = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let c = i8x16::new(10, 9, 8, 7, 6, 5, 4, 3, 12, 13, 14, 15, 16, 17, 18, 19);
+ let r: i8x16 = transmute(vabaq_s8(transmute(a), transmute(b), transmute(c)));
+ let e = i8x16::new(
+ 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20,
+ );
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabaq_s16() {
+ let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let c = i16x8::new(10, 9, 8, 7, 6, 5, 4, 3);
+ let r: i16x8 = transmute(vabaq_s16(transmute(a), transmute(b), transmute(c)));
+ let e = i16x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabaq_s32() {
+ let a = i32x4::new(1, 2, 3, 4);
+ let b = i32x4::new(1, 1, 1, 1);
+ let c = i32x4::new(10, 9, 8, 7);
+ let r: i32x4 = transmute(vabaq_s32(transmute(a), transmute(b), transmute(c)));
+ let e = i32x4::new(10, 10, 10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabaq_u8() {
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3, 2);
+ let b = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let c = u8x16::new(10, 9, 8, 7, 6, 5, 4, 3, 12, 13, 14, 15, 16, 17, 18, 19);
+ let r: u8x16 = transmute(vabaq_u8(transmute(a), transmute(b), transmute(c)));
+ let e = u8x16::new(
+ 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20,
+ );
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabaq_u16() {
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+ let c = u16x8::new(10, 9, 8, 7, 6, 5, 4, 3);
+ let r: u16x8 = transmute(vabaq_u16(transmute(a), transmute(b), transmute(c)));
+ let e = u16x8::new(10, 10, 10, 10, 10, 10, 10, 10);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vabaq_u32() {
+ let a = u32x4::new(1, 2, 3, 4);
+ let b = u32x4::new(1, 1, 1, 1);
+ let c = u32x4::new(10, 9, 8, 7);
+ let r: u32x4 = transmute(vabaq_u32(transmute(a), transmute(b), transmute(c)));
+ let e = u32x4::new(10, 10, 10, 10);
+ assert_eq!(r, e);
+ }
+
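+    // VPADD adds adjacent pairs: the low half of the result comes from `a`,
+    // the high half from `b`.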
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadd_s16() {
+ let a = i16x4::new(1, 2, 3, 4);
+ let b = i16x4::new(0, -1, -2, -3);
+ let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b)));
+ let e = i16x4::new(3, 7, -1, -5);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadd_s32() {
+ let a = i32x2::new(1, 2);
+ let b = i32x2::new(0, -1);
+ let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b)));
+ let e = i32x2::new(3, -1);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadd_s8() {
+ let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = i8x8::new(0, -1, -2, -3, -4, -5, -6, -7);
+ let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b)));
+ let e = i8x8::new(3, 7, 11, 15, -1, -5, -9, -13);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadd_u16() {
+ let a = u16x4::new(1, 2, 3, 4);
+ let b = u16x4::new(30, 31, 32, 33);
+ let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b)));
+ let e = u16x4::new(3, 7, 61, 65);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadd_u32() {
+ let a = u32x2::new(1, 2);
+ let b = u32x2::new(30, 31);
+ let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b)));
+ let e = u32x2::new(3, 61);
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vpadd_u8() {
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = u8x8::new(30, 31, 32, 33, 34, 35, 36, 37);
+ let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b)));
+ let e = u8x8::new(3, 7, 11, 15, 61, 65, 69, 73);
+ assert_eq!(r, e);
+ }
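+    // vcnt (below) is a per-byte population count, e.g. 0b11001000 has 3
+    // bits set, so that lane becomes 3.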
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcnt_s8() {
+ let a: i8x8 = transmute(u8x8::new(
+ 0b11001000, 0b11111111, 0b00000000, 0b11011111, 0b10000001, 0b10101001, 0b00001000,
+ 0b00111111,
+ ));
+ let e = i8x8::new(3, 8, 0, 7, 2, 4, 1, 6);
+ let r: i8x8 = transmute(vcnt_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcntq_s8() {
+ let a: i8x16 = transmute(u8x16::new(
+ 0b11001000, 0b11111111, 0b00000000, 0b11011111, 0b10000001, 0b10101001, 0b00001000,
+ 0b00111111, 0b11101110, 0b00000000, 0b11111111, 0b00100001, 0b11111111, 0b10010111,
+ 0b11100000, 0b00010000,
+ ));
+ let e = i8x16::new(3, 8, 0, 7, 2, 4, 1, 6, 6, 0, 8, 2, 8, 5, 3, 1);
+ let r: i8x16 = transmute(vcntq_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcnt_u8() {
+ let a = u8x8::new(
+ 0b11001000, 0b11111111, 0b00000000, 0b11011111, 0b10000001, 0b10101001, 0b00001000,
+ 0b00111111,
+ );
+ let e = u8x8::new(3, 8, 0, 7, 2, 4, 1, 6);
+ let r: u8x8 = transmute(vcnt_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcntq_u8() {
+ let a = u8x16::new(
+ 0b11001000, 0b11111111, 0b00000000, 0b11011111, 0b10000001, 0b10101001, 0b00001000,
+ 0b00111111, 0b11101110, 0b00000000, 0b11111111, 0b00100001, 0b11111111, 0b10010111,
+ 0b11100000, 0b00010000,
+ );
+ let e = u8x16::new(3, 8, 0, 7, 2, 4, 1, 6, 6, 0, 8, 2, 8, 5, 3, 1);
+ let r: u8x16 = transmute(vcntq_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcnt_p8() {
+ let a = u8x8::new(
+ 0b11001000, 0b11111111, 0b00000000, 0b11011111, 0b10000001, 0b10101001, 0b00001000,
+ 0b00111111,
+ );
+ let e = u8x8::new(3, 8, 0, 7, 2, 4, 1, 6);
+ let r: u8x8 = transmute(vcnt_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcntq_p8() {
+ let a = u8x16::new(
+ 0b11001000, 0b11111111, 0b00000000, 0b11011111, 0b10000001, 0b10101001, 0b00001000,
+ 0b00111111, 0b11101110, 0b00000000, 0b11111111, 0b00100001, 0b11111111, 0b10010111,
+ 0b11100000, 0b00010000,
+ );
+ let e = u8x16::new(3, 8, 0, 7, 2, 4, 1, 6, 6, 0, 8, 2, 8, 5, 3, 1);
+ let r: u8x16 = transmute(vcntq_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
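+    // The vrev16/32/64 tests below reverse the elements inside each 16-,
+    // 32-, or 64-bit chunk of the vector. Note that in these tests `r`
+    // holds the expected pattern and `e` receives the intrinsic's result,
+    // so the roles in `assert_eq!(r, e)` are swapped relative to the tests
+    // above.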
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev16_s8() {
+ let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = i8x8::new(1, 0, 3, 2, 5, 4, 7, 6);
+ let e: i8x8 = transmute(vrev16_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev16q_s8() {
+ let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = i8x16::new(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ let e: i8x16 = transmute(vrev16q_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev16_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = u8x8::new(1, 0, 3, 2, 5, 4, 7, 6);
+ let e: u8x8 = transmute(vrev16_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev16q_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = u8x16::new(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ let e: u8x16 = transmute(vrev16q_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev16_p8() {
+ let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = i8x8::new(1, 0, 3, 2, 5, 4, 7, 6);
+ let e: i8x8 = transmute(vrev16_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev16q_p8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = u8x16::new(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ let e: u8x16 = transmute(vrev16q_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32_s8() {
+ let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = i8x8::new(3, 2, 1, 0, 7, 6, 5, 4);
+ let e: i8x8 = transmute(vrev32_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32q_s8() {
+ let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = i8x16::new(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ let e: i8x16 = transmute(vrev32q_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = u8x8::new(3, 2, 1, 0, 7, 6, 5, 4);
+ let e: u8x8 = transmute(vrev32_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32q_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = u8x16::new(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ let e: u8x16 = transmute(vrev32q_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32_s16() {
+ let a = i16x4::new(0, 1, 2, 3);
+ let r = i16x4::new(1, 0, 3, 2);
+ let e: i16x4 = transmute(vrev32_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32q_s16() {
+ let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = i16x8::new(1, 0, 3, 2, 5, 4, 7, 6);
+ let e: i16x8 = transmute(vrev32q_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32_p16() {
+ let a = i16x4::new(0, 1, 2, 3);
+ let r = i16x4::new(1, 0, 3, 2);
+ let e: i16x4 = transmute(vrev32_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32q_p16() {
+ let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = i16x8::new(1, 0, 3, 2, 5, 4, 7, 6);
+ let e: i16x8 = transmute(vrev32q_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32_u16() {
+ let a = u16x4::new(0, 1, 2, 3);
+ let r = u16x4::new(1, 0, 3, 2);
+ let e: u16x4 = transmute(vrev32_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32q_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = u16x8::new(1, 0, 3, 2, 5, 4, 7, 6);
+ let e: u16x8 = transmute(vrev32q_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32_p8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = u8x8::new(3, 2, 1, 0, 7, 6, 5, 4);
+ let e: u8x8 = transmute(vrev32_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev32q_p8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = u8x16::new(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ let e: u8x16 = transmute(vrev32q_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_s8() {
+ let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = i8x8::new(7, 6, 5, 4, 3, 2, 1, 0);
+ let e: i8x8 = transmute(vrev64_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_s8() {
+ let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = i8x16::new(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ let e: i8x16 = transmute(vrev64q_s8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_s16() {
+ let a = i16x4::new(0, 1, 2, 3);
+ let r = i16x4::new(3, 2, 1, 0);
+ let e: i16x4 = transmute(vrev64_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_s16() {
+ let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = i16x8::new(3, 2, 1, 0, 7, 6, 5, 4);
+ let e: i16x8 = transmute(vrev64q_s16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_s32() {
+ let a = i32x2::new(0, 1);
+ let r = i32x2::new(1, 0);
+ let e: i32x2 = transmute(vrev64_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_s32() {
+ let a = i32x4::new(0, 1, 2, 3);
+ let r = i32x4::new(1, 0, 3, 2);
+ let e: i32x4 = transmute(vrev64q_s32(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_u8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = u8x8::new(7, 6, 5, 4, 3, 2, 1, 0);
+ let e: u8x8 = transmute(vrev64_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_u8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = u8x16::new(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ let e: u8x16 = transmute(vrev64q_u8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_u16() {
+ let a = u16x4::new(0, 1, 2, 3);
+ let r = u16x4::new(3, 2, 1, 0);
+ let e: u16x4 = transmute(vrev64_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_u16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = u16x8::new(3, 2, 1, 0, 7, 6, 5, 4);
+ let e: u16x8 = transmute(vrev64q_u16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_u32() {
+ let a = u32x2::new(0, 1);
+ let r = u32x2::new(1, 0);
+ let e: u32x2 = transmute(vrev64_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_u32() {
+ let a = u32x4::new(0, 1, 2, 3);
+ let r = u32x4::new(1, 0, 3, 2);
+ let e: u32x4 = transmute(vrev64q_u32(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_f32() {
+ let a = f32x2::new(1.0, 2.0);
+ let r = f32x2::new(2.0, 1.0);
+ let e: f32x2 = transmute(vrev64_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_f32() {
+ let a = f32x4::new(1.0, 2.0, -2.0, -1.0);
+ let r = f32x4::new(2.0, 1.0, -1.0, -2.0);
+ let e: f32x4 = transmute(vrev64q_f32(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_p8() {
+ let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = u8x8::new(7, 6, 5, 4, 3, 2, 1, 0);
+ let e: u8x8 = transmute(vrev64_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_p8() {
+ let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = u8x16::new(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ let e: u8x16 = transmute(vrev64q_p8(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64_p16() {
+ let a = u16x4::new(0, 1, 2, 3);
+ let r = u16x4::new(3, 2, 1, 0);
+ let e: u16x4 = transmute(vrev64_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vrev64q_p16() {
+ let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = u16x8::new(3, 2, 1, 0, 7, 6, 5, 4);
+ let e: u16x8 = transmute(vrev64q_p16(transmute(a)));
+ assert_eq!(r, e);
+ }
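+    // The *mmla tests below check the 8-bit integer matrix
+    // multiply-accumulate intrinsics. Following the Arm definition of these
+    // operations (treating `a` as a row-major 2x2 i32 matrix and `b` and
+    // `c` as row-major 2x8 byte matrices, the result is
+    // a + b * transpose(c)), the expected values are computed lane by lane;
+    // e.g. lane 0 is 1 + (1*12 + 21*22 + 31*3 + 14*4 + 5*5 + 6*56 + 17*7 +
+    // 8*8) = 1168.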
+ #[simd_test(enable = "neon,i8mm")]
+ unsafe fn test_vmmlaq_s32() {
+ let a: i32x4 = i32x4::new(1, 3, 4, 9);
+ let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16);
+ let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16);
+        let e: i32x4 = i32x4::new(1168, 1421, 1888, 2334);
+ let r: i32x4 = transmute(vmmlaq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,i8mm")]
+ unsafe fn test_vmmlaq_u32() {
+ let a: u32x4 = u32x4::new(1, 3, 4, 9);
+ let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16);
+ let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16);
+        let e: u32x4 = u32x4::new(1168, 1421, 1888, 2334);
+ let r: u32x4 = transmute(vmmlaq_u32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon,i8mm")]
+ unsafe fn test_vusmmlaq_s32() {
+ let a: i32x4 = i32x4::new(1, 3, 4, 9);
+ let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16);
+ let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16);
+        let e: i32x4 = i32x4::new(1168, 1421, 1888, 2334);
+ let r: i32x4 = transmute(vusmmlaq_s32(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+ }
+}
+
+#[cfg(all(test, target_arch = "arm", target_endian = "little"))]
+mod table_lookup_tests;
+
+#[cfg(all(test, target_arch = "arm"))]
+mod shift_and_insert_tests;
+
+#[cfg(all(test, target_arch = "arm"))]
+mod load_tests;
+
+#[cfg(all(test, target_arch = "arm"))]
+mod store_tests;
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs
new file mode 100644
index 000000000..ebb8b7b9e
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs
@@ -0,0 +1,93 @@
+//! Tests for ARM+v7+neon shift and insert (vsli[q]_n, vsri[q]_n) intrinsics.
+//!
+//! These are included in `{arm, aarch64}::neon`.
+
+use super::*;
+
+#[cfg(target_arch = "aarch64")]
+use crate::core_arch::aarch64::*;
+
+#[cfg(target_arch = "arm")]
+use crate::core_arch::arm::*;
+
+use crate::core_arch::simd::*;
+use std::mem::transmute;
+use stdarch_test::simd_test;
+
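+// vsli_n (shift left and insert) keeps the low N bits of each lane of `a`
+// and fills the remaining bits from `b << N`. A minimal scalar model of one
+// lane, kept only as executable documentation for the macro below (the
+// helper name is ours and it is unused by the tests):
+#[allow(dead_code)]
+fn sli_lane(a: u8, b: u8, n: u32) -> u8 {
+    let low_mask = (1u8 << n) - 1; // the bits of `a` that survive
+    (a & low_mask) | (b << n)
+}
+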
+macro_rules! test_vsli {
+ ($test_id:ident, $t:ty => $fn_id:ident ([$($a:expr),*], [$($b:expr),*], $n:expr)) => {
+ #[simd_test(enable = "neon")]
+ #[allow(unused_assignments)]
+ unsafe fn $test_id() {
+ let a = [$($a as $t),*];
+ let b = [$($b as $t),*];
+ let n_bit_mask: $t = (1 << $n) - 1;
+ let e = [$(($a as $t & n_bit_mask) | ($b as $t << $n)),*];
+ let r = $fn_id::<$n>(transmute(a), transmute(b));
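+            // `d` is initialized from `e` only to give the transmuted result
+            // the same array type as `e`; hence `allow(unused_assignments)`.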
+ let mut d = e;
+ d = transmute(r);
+ assert_eq!(d, e);
+ }
+ }
+}
+test_vsli!(test_vsli_n_s8, i8 => vsli_n_s8([3, -44, 127, -56, 0, 24, -97, 10], [-128, -14, 125, -77, 27, 8, -1, 110], 5));
+test_vsli!(test_vsliq_n_s8, i8 => vsliq_n_s8([3, -44, 127, -56, 0, 24, -97, 10, -33, 1, -6, -39, 15, 101, -80, -1], [-128, -14, 125, -77, 27, 8, -1, 110, -4, -92, 111, 32, 1, -4, -29, 99], 2));
+test_vsli!(test_vsli_n_s16, i16 => vsli_n_s16([3304, -44, 2300, -546], [-1208, -140, 1225, -707], 7));
+test_vsli!(test_vsliq_n_s16, i16 => vsliq_n_s16([3304, -44, 2300, -20046, 0, 9924, -907, 1190], [-1208, -140, 4225, -707, 2701, 804, -71, 2110], 14));
+test_vsli!(test_vsli_n_s32, i32 => vsli_n_s32([125683, -78901], [-128, -112944], 23));
+test_vsli!(test_vsliq_n_s32, i32 => vsliq_n_s32([125683, -78901, 127, -12009], [-128, -112944, 125, -707], 15));
+test_vsli!(test_vsli_n_s64, i64 => vsli_n_s64([-333333], [1028], 45));
+test_vsli!(test_vsliq_n_s64, i64 => vsliq_n_s64([-333333, -52023], [1028, -99814], 33));
+test_vsli!(test_vsli_n_u8, u8 => vsli_n_u8([3, 44, 127, 56, 0, 24, 97, 10], [127, 14, 125, 77, 27, 8, 1, 110], 5));
+test_vsli!(test_vsliq_n_u8, u8 => vsliq_n_u8([3, 44, 127, 56, 0, 24, 97, 10, 33, 1, 6, 39, 15, 101, 80, 1], [127, 14, 125, 77, 27, 8, 1, 110, 4, 92, 111, 32, 1, 4, 29, 99], 2));
+test_vsli!(test_vsli_n_u16, u16 => vsli_n_u16([3304, 44, 2300, 546], [1208, 140, 1225, 707], 7));
+test_vsli!(test_vsliq_n_u16, u16 => vsliq_n_u16([3304, 44, 2300, 20046, 0, 9924, 907, 1190], [1208, 140, 4225, 707, 2701, 804, 71, 2110], 14));
+test_vsli!(test_vsli_n_u32, u32 => vsli_n_u32([125683, 78901], [128, 112944], 23));
+test_vsli!(test_vsliq_n_u32, u32 => vsliq_n_u32([125683, 78901, 127, 12009], [128, 112944, 125, 707], 15));
+test_vsli!(test_vsli_n_u64, u64 => vsli_n_u64([333333], [1028], 45));
+test_vsli!(test_vsliq_n_u64, u64 => vsliq_n_u64([333333, 52023], [1028, 99814], 33));
+test_vsli!(test_vsli_n_p8, i8 => vsli_n_p8([3, 44, 127, 56, 0, 24, 97, 10], [127, 14, 125, 77, 27, 8, 1, 110], 5));
+test_vsli!(test_vsliq_n_p8, i8 => vsliq_n_p8([3, 44, 127, 56, 0, 24, 97, 10, 33, 1, 6, 39, 15, 101, 80, 1], [127, 14, 125, 77, 27, 8, 1, 110, 4, 92, 111, 32, 1, 4, 29, 99], 2));
+test_vsli!(test_vsli_n_p16, i16 => vsli_n_p16([3304, 44, 2300, 546], [1208, 140, 1225, 707], 7));
+test_vsli!(test_vsliq_n_p16, i16 => vsliq_n_p16([3304, 44, 2300, 20046, 0, 9924, 907, 1190], [1208, 140, 4225, 707, 2701, 804, 71, 2110], 14));
+test_vsli!(test_vsli_n_p64, i64 => vsli_n_p64([333333], [1028], 45));
+test_vsli!(test_vsliq_n_p64, i64 => vsliq_n_p64([333333, 52023], [1028, 99814], 33));
+
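+// vsri_n (shift right and insert) keeps the high N bits of each lane of `a`
+// and fills the remaining bits from `b >> N`. The matching scalar model of
+// one lane, again only as executable documentation (helper name is ours):
+#[allow(dead_code)]
+fn sri_lane(a: u8, b: u8, n: u32) -> u8 {
+    let high_mask = !((1u8 << (8 - n)) - 1); // the bits of `a` that survive
+    (a & high_mask) | ((b >> n) & !high_mask)
+}
+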
+macro_rules! test_vsri {
+ ($test_id:ident, $t:ty => $fn_id:ident ([$($a:expr),*], [$($b:expr),*], $n:expr)) => {
+ #[simd_test(enable = "neon")]
+ #[allow(unused_assignments)]
+ unsafe fn $test_id() {
+ let a = [$($a as $t),*];
+ let b = [$($b as $t),*];
+ let n_bit_mask = ((1 as $t << $n) - 1).rotate_right($n);
+ let e = [$(($a as $t & n_bit_mask) | (($b as $t >> $n) & !n_bit_mask)),*];
+ let r = $fn_id::<$n>(transmute(a), transmute(b));
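+            // As in test_vsli, `d` is initialized from `e` only to pin the
+            // array type for the transmute of `r`.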
+ let mut d = e;
+ d = transmute(r);
+ assert_eq!(d, e);
+ }
+ }
+}
+test_vsri!(test_vsri_n_s8, i8 => vsri_n_s8([3, -44, 127, -56, 0, 24, -97, 10], [-128, -14, 125, -77, 27, 8, -1, 110], 5));
+test_vsri!(test_vsriq_n_s8, i8 => vsriq_n_s8([3, -44, 127, -56, 0, 24, -97, 10, -33, 1, -6, -39, 15, 101, -80, -1], [-128, -14, 125, -77, 27, 8, -1, 110, -4, -92, 111, 32, 1, -4, -29, 99], 2));
+test_vsri!(test_vsri_n_s16, i16 => vsri_n_s16([3304, -44, 2300, -546], [-1208, -140, 1225, -707], 7));
+test_vsri!(test_vsriq_n_s16, i16 => vsriq_n_s16([3304, -44, 2300, -20046, 0, 9924, -907, 1190], [-1208, -140, 4225, -707, 2701, 804, -71, 2110], 14));
+test_vsri!(test_vsri_n_s32, i32 => vsri_n_s32([125683, -78901], [-128, -112944], 23));
+test_vsri!(test_vsriq_n_s32, i32 => vsriq_n_s32([125683, -78901, 127, -12009], [-128, -112944, 125, -707], 15));
+test_vsri!(test_vsri_n_s64, i64 => vsri_n_s64([-333333], [1028], 45));
+test_vsri!(test_vsriq_n_s64, i64 => vsriq_n_s64([-333333, -52023], [1028, -99814], 33));
+test_vsri!(test_vsri_n_u8, u8 => vsri_n_u8([3, 44, 127, 56, 0, 24, 97, 10], [127, 14, 125, 77, 27, 8, 1, 110], 5));
+test_vsri!(test_vsriq_n_u8, u8 => vsriq_n_u8([3, 44, 127, 56, 0, 24, 97, 10, 33, 1, 6, 39, 15, 101, 80, 1], [127, 14, 125, 77, 27, 8, 1, 110, 4, 92, 111, 32, 1, 4, 29, 99], 2));
+test_vsri!(test_vsri_n_u16, u16 => vsri_n_u16([3304, 44, 2300, 546], [1208, 140, 1225, 707], 7));
+test_vsri!(test_vsriq_n_u16, u16 => vsriq_n_u16([3304, 44, 2300, 20046, 0, 9924, 907, 1190], [1208, 140, 4225, 707, 2701, 804, 71, 2110], 14));
+test_vsri!(test_vsri_n_u32, u32 => vsri_n_u32([125683, 78901], [128, 112944], 23));
+test_vsri!(test_vsriq_n_u32, u32 => vsriq_n_u32([125683, 78901, 127, 12009], [128, 112944, 125, 707], 15));
+test_vsri!(test_vsri_n_u64, u64 => vsri_n_u64([333333], [1028], 45));
+test_vsri!(test_vsriq_n_u64, u64 => vsriq_n_u64([333333, 52023], [1028, 99814], 33));
+test_vsri!(test_vsri_n_p8, i8 => vsri_n_p8([3, 44, 127, 56, 0, 24, 97, 10], [127, 14, 125, 77, 27, 8, 1, 110], 5));
+test_vsri!(test_vsriq_n_p8, i8 => vsriq_n_p8([3, 44, 127, 56, 0, 24, 97, 10, 33, 1, 6, 39, 15, 101, 80, 1], [127, 14, 125, 77, 27, 8, 1, 110, 4, 92, 111, 32, 1, 4, 29, 99], 2));
+test_vsri!(test_vsri_n_p16, i16 => vsri_n_p16([3304, 44, 2300, 546], [1208, 140, 1225, 707], 7));
+test_vsri!(test_vsriq_n_p16, i16 => vsriq_n_p16([3304, 44, 2300, 20046, 0, 9924, 907, 1190], [1208, 140, 4225, 707, 2701, 804, 71, 2110], 14));
+test_vsri!(test_vsri_n_p64, i64 => vsri_n_p64([333333], [1028], 45));
+test_vsri!(test_vsriq_n_p64, i64 => vsriq_n_p64([333333, 52023], [1028, 99814], 33));
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/store_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/store_tests.rs
new file mode 100644
index 000000000..cad660e87
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/store_tests.rs
@@ -0,0 +1,389 @@
+//! Tests for ARM+v7+neon store (vst1) intrinsics.
+//!
+//! These are included in `{arm, aarch64}::neon`.
+
+use super::*;
+
+#[cfg(target_arch = "arm")]
+use crate::core_arch::arm::*;
+
+#[cfg(target_arch = "aarch64")]
+use crate::core_arch::aarch64::*;
+
+use crate::core_arch::simd::*;
+use stdarch_test::simd_test;
+
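+// Each test below stores through `vals[1..].as_mut_ptr()` into an array one
+// element longer than the vector, so the `vals[0] == 0` assertion also
+// verifies that nothing before the destination pointer was written.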
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_s8() {
+ let mut vals = [0_i8; 9];
+ let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+
+ vst1_s8(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_s8() {
+ let mut vals = [0_i8; 17];
+ let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+
+ vst1q_s8(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+ assert_eq!(vals[9], 9);
+ assert_eq!(vals[10], 10);
+ assert_eq!(vals[11], 11);
+ assert_eq!(vals[12], 12);
+ assert_eq!(vals[13], 13);
+ assert_eq!(vals[14], 14);
+ assert_eq!(vals[15], 15);
+ assert_eq!(vals[16], 16);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_s16() {
+ let mut vals = [0_i16; 5];
+ let a = i16x4::new(1, 2, 3, 4);
+
+ vst1_s16(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_s16() {
+ let mut vals = [0_i16; 9];
+ let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+
+ vst1q_s16(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_s32() {
+ let mut vals = [0_i32; 3];
+ let a = i32x2::new(1, 2);
+
+ vst1_s32(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_s32() {
+ let mut vals = [0_i32; 5];
+ let a = i32x4::new(1, 2, 3, 4);
+
+ vst1q_s32(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_s64() {
+ let mut vals = [0_i64; 2];
+ let a = i64x1::new(1);
+
+ vst1_s64(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_s64() {
+ let mut vals = [0_i64; 3];
+ let a = i64x2::new(1, 2);
+
+ vst1q_s64(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_u8() {
+ let mut vals = [0_u8; 9];
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+
+ vst1_u8(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_u8() {
+ let mut vals = [0_u8; 17];
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+
+ vst1q_u8(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+ assert_eq!(vals[9], 9);
+ assert_eq!(vals[10], 10);
+ assert_eq!(vals[11], 11);
+ assert_eq!(vals[12], 12);
+ assert_eq!(vals[13], 13);
+ assert_eq!(vals[14], 14);
+ assert_eq!(vals[15], 15);
+ assert_eq!(vals[16], 16);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_u16() {
+ let mut vals = [0_u16; 5];
+ let a = u16x4::new(1, 2, 3, 4);
+
+ vst1_u16(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_u16() {
+ let mut vals = [0_u16; 9];
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+
+ vst1q_u16(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_u32() {
+ let mut vals = [0_u32; 3];
+ let a = u32x2::new(1, 2);
+
+ vst1_u32(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_u32() {
+ let mut vals = [0_u32; 5];
+ let a = u32x4::new(1, 2, 3, 4);
+
+ vst1q_u32(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_u64() {
+ let mut vals = [0_u64; 2];
+ let a = u64x1::new(1);
+
+ vst1_u64(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_u64() {
+ let mut vals = [0_u64; 3];
+ let a = u64x2::new(1, 2);
+
+ vst1q_u64(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_p8() {
+ let mut vals = [0_u8; 9];
+ let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+
+ vst1_p8(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_p8() {
+ let mut vals = [0_u8; 17];
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+
+ vst1q_p8(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+ assert_eq!(vals[9], 9);
+ assert_eq!(vals[10], 10);
+ assert_eq!(vals[11], 11);
+ assert_eq!(vals[12], 12);
+ assert_eq!(vals[13], 13);
+ assert_eq!(vals[14], 14);
+ assert_eq!(vals[15], 15);
+ assert_eq!(vals[16], 16);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_p16() {
+ let mut vals = [0_u16; 5];
+ let a = u16x4::new(1, 2, 3, 4);
+
+ vst1_p16(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_p16() {
+ let mut vals = [0_u16; 9];
+ let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+
+ vst1q_p16(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+ assert_eq!(vals[3], 3);
+ assert_eq!(vals[4], 4);
+ assert_eq!(vals[5], 5);
+ assert_eq!(vals[6], 6);
+ assert_eq!(vals[7], 7);
+ assert_eq!(vals[8], 8);
+}
+
+#[simd_test(enable = "neon,aes")]
+unsafe fn test_vst1_p64() {
+ let mut vals = [0_u64; 2];
+ let a = u64x1::new(1);
+
+ vst1_p64(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+}
+
+#[simd_test(enable = "neon,aes")]
+unsafe fn test_vst1q_p64() {
+ let mut vals = [0_u64; 3];
+ let a = u64x2::new(1, 2);
+
+ vst1q_p64(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0);
+ assert_eq!(vals[1], 1);
+ assert_eq!(vals[2], 2);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1_f32() {
+ let mut vals = [0_f32; 3];
+ let a = f32x2::new(1., 2.);
+
+ vst1_f32(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0.);
+ assert_eq!(vals[1], 1.);
+ assert_eq!(vals[2], 2.);
+}
+
+#[simd_test(enable = "neon")]
+unsafe fn test_vst1q_f32() {
+ let mut vals = [0_f32; 5];
+ let a = f32x4::new(1., 2., 3., 4.);
+
+ vst1q_f32(vals[1..].as_mut_ptr(), transmute(a));
+
+ assert_eq!(vals[0], 0.);
+ assert_eq!(vals[1], 1.);
+ assert_eq!(vals[2], 2.);
+ assert_eq!(vals[3], 3.);
+ assert_eq!(vals[4], 4.);
+}
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs
new file mode 100644
index 000000000..15aa2f269
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs
@@ -0,0 +1,1042 @@
+//! Tests for ARM+v7+neon table lookup (vtbl, vtbx) intrinsics.
+//!
+//! These are included in `{arm, aarch64}::neon`.
+
+use super::*;
+
+#[cfg(target_arch = "aarch64")]
+use crate::core_arch::aarch64::*;
+
+#[cfg(target_arch = "arm")]
+use crate::core_arch::arm::*;
+
+use crate::core_arch::simd::*;
+use std::mem;
+use stdarch_test::simd_test;
+
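+// vtbl treats each byte of the control vector as an index into the table
+// formed by the table register(s); an index past the end of the table
+// yields 0 in that lane (e.g. in test_vtbl1_u8 the indices 8, 9, 10 and 15
+// are out of range for the 8-byte table, so those lanes become 0).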
+macro_rules! test_vtbl {
+ ($test_name:ident => $fn_id:ident:
+ - table[$table_t:ident]: [$($table_v:expr),*] |
+ $(- ctrl[$ctrl_t:ident]: [$($ctrl_v:expr),*] => [$($exp_v:expr),*])|*
+ ) => {
+ #[simd_test(enable = "neon")]
+ unsafe fn $test_name() {
+            // Create the table as an array and transmute it into ARM's
+            // table type.
+ let table: $table_t = mem::transmute([$($table_v),*]);
+
+ // For each control vector, perform a table lookup and
+ // verify the result:
+ $(
+ {
+ let ctrl: $ctrl_t = mem::transmute([$($ctrl_v),*]);
+ let result = $fn_id(table, mem::transmute(ctrl));
+ let result: $ctrl_t = mem::transmute(result);
+ let expected: $ctrl_t = mem::transmute([$($exp_v),*]);
+ assert_eq!(result, expected);
+ }
+ )*
+ }
+ }
+}
+
+// ARM+v7+neon and AArch64+neon tests
+
+test_vtbl!(
+ test_vtbl1_s8 => vtbl1_s8:
+ - table[int8x8_t]: [0_i8, -11, 2, 3, 4, 5, 6, 7] |
+ - ctrl[i8x8]: [3_i8, 4, 1, 6, 0, 2, 7, 5] => [3_i8, 4, -11, 6, 0, 2, 7, 5] |
+ - ctrl[i8x8]: [3_i8, 8, 1, -9, 10, 2, 15, 5] => [3_i8, 0, -11, 0, 0, 2, 0, 5]
+);
+
+test_vtbl!(
+ test_vtbl1_u8 => vtbl1_u8:
+ - table[uint8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] |
+ - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] |
+ - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 0, 1, 0, 0, 2, 0, 5]
+);
+
+test_vtbl!(
+ test_vtbl1_p8 => vtbl1_p8:
+ - table[poly8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] |
+ - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] |
+ - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 0, 1, 0, 0, 2, 0, 5]
+);
+
+test_vtbl!(
+ test_vtbl2_s8 => vtbl2_s8:
+ - table[int8x8x2_t]: [
+ 0_i8, -17, 34, 51, 68, 85, 102, 119,
+ -106, -93, -84, -117, -104, -116, -72, -121
+ ] |
+ - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [0_i8, -121, -17, -72, 34, -116, 51, -104] |
+ - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, -19, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, 0]
+);
+
+test_vtbl!(
+ test_vtbl2_u8 => vtbl2_u8:
+ - table[uint8x8x2_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 136, 153, 170, 187, 204, 221, 238, 255
+ ] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 255, 17, 238, 34, 221, 51, 204] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 0]
+);
+
+test_vtbl!(
+ test_vtbl2_p8 => vtbl2_p8:
+ - table[poly8x8x2_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 136, 153, 170, 187, 204, 221, 238, 255
+ ] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 255, 17, 238, 34, 221, 51, 204] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 0]
+);
+
+test_vtbl!(
+ test_vtbl3_s8 => vtbl3_s8:
+ - table[int8x8x3_t]: [
+ 0_i8, -17, 34, 51, 68, 85, 102, 119,
+ -106, -93, -84, -117, -104, -116, -72, -121,
+ 0, 1, -2, 3, 4, -5, 6, 7
+ ] |
+ - ctrl[i8x8]: [127_i8, 15, 1, 19, 2, 13, 21, 12] => [0_i8, -121, -17, 3, 34, -116, -5, -104] |
+ - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, -27, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, -2]
+);
+
+test_vtbl!(
+ test_vtbl3_u8 => vtbl3_u8:
+ - table[uint8x8x3_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 136, 153, 170, 187, 204, 221, 238, 255,
+ 0, 1, 2, 3, 4, 5, 6, 7
+ ] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 2]
+);
+
+test_vtbl!(
+ test_vtbl3_p8 => vtbl3_p8:
+ - table[poly8x8x3_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 136, 153, 170, 187, 204, 221, 238, 255,
+ 0, 1, 2, 3, 4, 5, 6, 7
+ ] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 0, 119, 2]
+);
+
+test_vtbl!(
+ test_vtbl4_s8 => vtbl4_s8:
+ - table[int8x8x4_t]: [
+ 0_i8, -17, 34, 51, 68, 85, 102, 119,
+ -106, -93, -84, -117, -104, -116, -72, -121,
+ 0, 1, -2, 3, 4, -5, 6, 7,
+ 8, -9, 10, 11, 12, -13, 14, 15
+ ] |
+ - ctrl[i8x8]: [127_i8, 15, 1, 19, 2, 13, 25, 12] => [0_i8, -121, -17, 3, 34, -116, -9, -104] |
+ - ctrl[i8x8]: [4_i8, 11, 32, 10, -33, 27, 7, 18] => [68_i8, -117, 0, -84, 0, 11, 119, -2]
+);
+
+test_vtbl!(
+ test_vtbl4_u8 => vtbl4_u8:
+ - table[uint8x8x4_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 136, 153, 170, 187, 204, 221, 238, 255,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15
+ ] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 11, 119, 2]
+);
+
+test_vtbl!(
+ test_vtbl4_p8 => vtbl4_p8:
+ - table[poly8x8x4_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 136, 153, 170, 187, 204, 221, 238, 255,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15
+ ] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 19, 2, 13, 21, 12] => [0_u8, 255, 17, 3, 34, 221, 5, 204] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 27, 7, 18] => [68_u8, 187, 0, 170, 102, 11, 119, 2]
+);
+
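+// vtbx is the "extension" variant of vtbl: an out-of-range index leaves the
+// corresponding lane of `ext` in place instead of producing 0 (e.g. in
+// test_vtbx1_u8 the out-of-range index 8 in lane 1 yields ext[1] == 51).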
+macro_rules! test_vtbx {
+ ($test_name:ident => $fn_id:ident:
+ - table[$table_t:ident]: [$($table_v:expr),*] |
+ - ext[$ext_t:ident]: [$($ext_v:expr),*] |
+ $(- ctrl[$ctrl_t:ident]: [$($ctrl_v:expr),*] => [$($exp_v:expr),*])|*
+ ) => {
+ #[simd_test(enable = "neon")]
+ unsafe fn $test_name() {
+            // Create the table and extension vector as arrays and transmute
+            // them into ARM's vector types.
+ let table: $table_t = mem::transmute([$($table_v),*]);
+ let ext: $ext_t = mem::transmute([$($ext_v),*]);
+
+ // For each control vector, perform a table lookup and
+ // verify the result:
+ $(
+ {
+ let ctrl: $ctrl_t = mem::transmute([$($ctrl_v),*]);
+ let result = $fn_id(ext, table, mem::transmute(ctrl));
+ let result: $ctrl_t = mem::transmute(result);
+ let expected: $ctrl_t = mem::transmute([$($exp_v),*]);
+ assert_eq!(result, expected);
+ }
+ )*
+ }
+ }
+}
+
+test_vtbx!(
+ test_vtbx1_s8 => vtbx1_s8:
+ - table[int8x8_t]: [0_i8, 1, 2, -3, 4, 5, 6, 7] |
+ - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+ - ctrl[i8x8]: [3_i8, 4, 1, 6, 0, 2, 7, 5] => [-3_i8, 4, 1, 6, 0, 2, 7, 5] |
+ - ctrl[i8x8]: [3_i8, 8, 1, 9, 10, 2, -15, 5] => [-3_i8, 51, 1, 53, 54, 2, 56, 5]
+);
+
+test_vtbx!(
+ test_vtbx1_u8 => vtbx1_u8:
+ - table[uint8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] |
+ - ext[uint8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+ - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] |
+ - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 51, 1, 53, 54, 2, 56, 5]
+);
+
+test_vtbx!(
+ test_vtbx1_p8 => vtbx1_p8:
+ - table[poly8x8_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7] |
+ - ext[poly8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+ - ctrl[u8x8]: [3_u8, 4, 1, 6, 0, 2, 7, 5] => [3_u8, 4, 1, 6, 0, 2, 7, 5] |
+ - ctrl[u8x8]: [3_u8, 8, 1, 9, 10, 2, 15, 5] => [3_u8, 51, 1, 53, 54, 2, 56, 5]
+);
+
+test_vtbx!(
+ test_vtbx2_s8 => vtbx2_s8:
+ - table[int8x8x2_t]: [0_i8, 1, 2, -3, 4, 5, 6, 7, 8, 9, -10, 11, 12, -13, 14, 15] |
+ - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+ - ctrl[i8x8]: [3_i8, 4, 1, 6, 10, 2, 7, 15] => [-3_i8, 4, 1, 6, -10, 2, 7, 15] |
+ - ctrl[i8x8]: [3_i8, 8, 1, 10, 17, 2, 15, -19] => [-3_i8, 8, 1, -10, 54, 2, 15, 57]
+);
+
+test_vtbx!(
+ test_vtbx2_u8 => vtbx2_u8:
+    - table[uint8x8x2_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] |
+    - ext[uint8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 1, 6, 10, 2, 7, 15] => [3_u8, 4, 1, 6, 10, 2, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 1, 10, 17, 2, 15, 19] => [3_u8, 8, 1, 10, 54, 2, 15, 57]
+);
+
+test_vtbx!(
+ test_vtbx2_p8 => vtbx2_p8:
+    - table[poly8x8x2_t]: [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] |
+    - ext[poly8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 1, 6, 10, 2, 7, 15] => [3_u8, 4, 1, 6, 10, 2, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 1, 10, 17, 2, 15, 19] => [3_u8, 8, 1, 10, 54, 2, 15, 57]
+);
+
+test_vtbx!(
+ test_vtbx3_s8 => vtbx3_s8:
+ - table[int8x8x3_t]: [
+ 0_i8, 1, 2, -3, 4, 5, 6, 7,
+ 8, 9, -10, 11, 12, -13, 14, 15,
+ 16, -17, 18, 19, 20, 21, 22, 23 ] |
+ - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+ - ctrl[i8x8]: [3_i8, 4, 17, 22, 10, 2, 7, 15] => [-3_i8, 4, -17, 22, -10, 2, 7, 15] |
+ - ctrl[i8x8]: [3_i8, 8, 17, 10, 37, 2, 19, -29] => [-3_i8, 8, -17, -10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+ test_vtbx3_u8 => vtbx3_u8:
+ - table[uint8x8x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23 ] |
+    - ext[uint8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 17, 22, 10, 2, 7, 15] => [3_u8, 4, 17, 22, 10, 2, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 29] => [3_u8, 8, 17, 10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+ test_vtbx3_p8 => vtbx3_p8:
+ - table[poly8x8x3_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23 ] |
+    - ext[poly8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 4, 17, 22, 10, 2, 7, 15] => [3_u8, 4, 17, 22, 10, 2, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 29] => [3_u8, 8, 17, 10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+ test_vtbx4_s8 => vtbx4_s8:
+ - table[int8x8x4_t]: [
+ 0_i8, 1, 2, -3, 4, 5, 6, 7,
+ 8, 9, -10, 11, 12, -13, 14, 15,
+ 16, -17, 18, 19, 20, 21, 22, 23,
+ -24, 25, 26, -27, 28, -29, 30, 31] |
+ - ext[int8x8_t]: [50_i8, 51, 52, 53, 54, 55, 56, 57] |
+ - ctrl[i8x8]: [3_i8, 31, 17, 22, 10, 29, 7, 15] => [-3_i8, 31, -17, 22, -10, -29, 7, 15] |
+ - ctrl[i8x8]: [3_i8, 8, 17, 10, 37, 2, 19, -42] => [-3_i8, 8, -17, -10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+ test_vtbx4_u8 => vtbx4_u8:
+ - table[uint8x8x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31] |
+    - ext[uint8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 31, 17, 22, 10, 29, 7, 15] => [3_u8, 31, 17, 22, 10, 29, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 42] => [3_u8, 8, 17, 10, 54, 2, 19, 57]
+);
+
+test_vtbx!(
+ test_vtbx4_p8 => vtbx4_p8:
+ - table[poly8x8x4_t]: [
+        0_u8, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31] |
+    - ext[poly8x8_t]: [50_u8, 51, 52, 53, 54, 55, 56, 57] |
+    - ctrl[u8x8]: [3_u8, 31, 17, 22, 10, 29, 7, 15] => [3_u8, 31, 17, 22, 10, 29, 7, 15] |
+    - ctrl[u8x8]: [3_u8, 8, 17, 10, 37, 2, 19, 42] => [3_u8, 8, 17, 10, 54, 2, 19, 57]
+);
+
+// AArch64 tests
+
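+// The vqtbl/vqtbx forms below take full 128-bit table registers; the `q`
+// suffix on the intrinsic name selects a 128-bit control/result vector,
+// otherwise the control and result are 64 bits wide.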
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl1_s8 => vqtbl1_s8:
+ - table[int8x16_t]: [
+ 0_i8, -17, 34, 51, 68, 85, 102, 119,
+ -106, -93, -84, -117, -104, -116, -72, -121
+ ] |
+ - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [0_i8, -121, -17, -72, 34, -116, 51, -104] |
+ - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, 19, 7, 18] => [68_i8, -117, 0, -84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl1q_s8 => vqtbl1q_s8:
+ - table[int8x16_t]: [
+ 0_i8, -17, 34, 51, 68, 85, 102, 119,
+ -106, -93, -84, -117, -104, -116, -72, -121
+ ] |
+ - ctrl[i8x16]: [127_i8, 15, 1, 14, 2, 13, 3, 12, 4_i8, 11, 16, 10, 6, 19, 7, 18]
+ => [0_i8, -121, -17, -72, 34, -116, 51, -104, 68, -117, 0, -84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl1_u8 => vqtbl1_u8:
+ - table[uint8x16_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 106, 93, 84, 117, 104, 116, 72, 121
+ ] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 121, 17, 72, 34, 116, 51, 104] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 0, 84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl1q_u8 => vqtbl1q_u8:
+ - table[uint8x16_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 106, 93, 84, 117, 104, 116, 72, 121
+ ] |
+ - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18]
+ => [0_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 0, 84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl1_p8 => vqtbl1_p8:
+ - table[poly8x16_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 106, 93, 84, 117, 104, 116, 72, 121
+ ] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [0_u8, 121, 17, 72, 34, 116, 51, 104] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 0, 84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl1q_p8 => vqtbl1q_p8:
+ - table[poly8x16_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 106, 93, 84, 117, 104, 116, 72, 121
+ ] |
+ - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18]
+ => [0_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 0, 84, 102, 0, 119, 0]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl2_s8 => vqtbl2_s8:
+ - table[int8x16x2_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31
+ ] |
+ - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] |
+ - ctrl[i8x8]: [4_i8, 31, 32, 10, 6, 49, 7, 18] => [4_i8, -31, 0, 10, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl2q_s8 => vqtbl2q_s8:
+ - table[int8x16x2_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31
+ ] |
+ - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 31, 32, 10, 6, 49, 7, 18]
+ => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, -31, 0, 10, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl2_u8 => vqtbl2_u8:
+ - table[uint8x16x2_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 0, 10, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl2q_u8 => vqtbl2q_u8:
+ - table[uint8x16x2_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18]
+ => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 0, 10, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl2_p8 => vqtbl2_p8:
+ - table[poly8x16x2_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 0, 10, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl2q_p8 => vqtbl2q_p8:
+ - table[poly8x16x2_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18]
+ => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 0, 10, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl3_s8 => vqtbl3_s8:
+ - table[int8x16x3_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31,
+ 32, -33, 34, -35, 36, -37, 38, -39,
+ 40, -41, 42, -43, 44, -45, 46, -47
+ ] |
+ - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] |
+ - ctrl[i8x8]: [4_i8, 32, 46, 51, 6, 49, 7, 18] => [4_i8, 32, 46, 0, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl3q_s8 => vqtbl3q_s8:
+ - table[int8x16x3_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31,
+ 32, -33, 34, -35, 36, -37, 38, -39,
+ 40, -41, 42, -43, 44, -45, 46, -47
+ ] |
+ - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 32, 46, 51, 6, 49, 7, 18]
+ => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, 32, 46, 0, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl3_u8 => vqtbl3_u8:
+ - table[uint8x16x3_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47
+ ] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 0, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl3q_u8 => vqtbl3q_u8:
+ - table[uint8x16x3_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18]
+ => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 0, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl3_p8 => vqtbl3_p8:
+ - table[poly8x16x3_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47
+ ] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 0, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl3q_p8 => vqtbl3q_p8:
+ - table[poly8x16x3_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18]
+ => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 0, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl4_s8 => vqtbl4_s8:
+ - table[int8x16x4_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31,
+ 32, -33, 34, -35, 36, -37, 38, -39,
+ 40, -41, 42, -43, 44, -45, 46, -47,
+ 48, -49, 50, -51, 52, -53, 54, -55,
+ 56, -57, 58, -59, 60, -61, 62, -63
+ ] |
+ - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [0_i8, -15, -1, 24, 2, -13, -3, -29] |
+ - ctrl[i8x8]: [4_i8, 46, 64, 51, 6, 71, 7, 18] => [4_i8, 46, 0, -51, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl4q_s8 => vqtbl4q_s8:
+ - table[int8x16x4_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31,
+ 32, -33, 34, -35, 36, -37, 38, -39,
+ 40, -41, 42, -43, 44, -45, 46, -47,
+ 48, -49, 50, -51, 52, -53, 54, -55,
+ 56, -57, 58, -59, 60, -61, 62, -63
+ ] |
+ - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 46, 64, 51, 6, 71, 7, 18]
+ => [0_i8, -15, -1, 24, 2, -13, -3, -29, 4, 46, 0, -51, 6, 0, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl4_u8 => vqtbl4_u8:
+ - table[uint8x16x4_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ ] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 0, 51, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl4q_u8 => vqtbl4q_u8:
+ - table[uint8x16x4_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18]
+ => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 0, 51, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl4_p8 => vqtbl4_p8:
+ - table[poly8x16x4_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ ] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [0_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 0, 51, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbl!(
+ test_vqtbl4q_p8 => vqtbl4q_p8:
+ - table[poly8x16x4_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18]
+ => [0_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 0, 51, 6, 0, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx1_s8 => vqtbx1_s8:
+ - table[int8x16_t]: [
+ 0_i8, -17, 34, 51, 68, 85, 102, 119,
+ -106, -93, -84, -117, -104, -116, -72, -121
+ ] |
+ - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] |
+ - ctrl[i8x8]: [127_i8, 15, 1, 14, 2, 13, 3, 12] => [100_i8, -121, -17, -72, 34, -116, 51, -104] |
+ - ctrl[i8x8]: [4_i8, 11, 16, 10, 6, 19, 7, 18] => [68_i8, -117, 102, -84, 102, -105, 119, -107]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx1q_s8 => vqtbx1q_s8:
+ - table[int8x16_t]: [
+ 0_i8, -17, 34, 51, 68, 85, 102, 119,
+ -106, -93, -84, -117, -104, -116, -72, -121
+ ] |
+ - ext[int8x16_t]: [
+ 100_i8, -101, 102, -103, 104, -105, 106, -107,
+ 108, -109, 110, -111, 112, -113, 114, -115
+ ] |
+ - ctrl[i8x16]: [127_i8, 15, 1, 14, 2, 13, 3, 12, 4_i8, 11, 16, 10, 6, 19, 7, 18]
+ => [100_i8, -121, -17, -72, 34, -116, 51, -104, 68, -117, 110, -84, 102, -113, 119, -115]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx1_u8 => vqtbx1_u8:
+ - table[uint8x16_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 106, 93, 84, 117, 104, 116, 72, 121
+ ] |
+ - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [100_u8, 121, 17, 72, 34, 116, 51, 104] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 102, 84, 102, 105, 119, 107]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx1q_u8 => vqtbx1q_u8:
+ - table[uint8x16_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 106, 93, 84, 117, 104, 116, 72, 121
+ ] |
+ - ext[uint8x16_t]: [
+ 100_u8, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115
+ ] |
+ - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18]
+ => [100_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 110, 84, 102, 113, 119, 115]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx1_p8 => vqtbx1_p8:
+ - table[poly8x16_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 106, 93, 84, 117, 104, 116, 72, 121
+ ] |
+ - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+ - ctrl[u8x8]: [127_u8, 15, 1, 14, 2, 13, 3, 12] => [100_u8, 121, 17, 72, 34, 116, 51, 104] |
+ - ctrl[u8x8]: [4_u8, 11, 16, 10, 6, 19, 7, 18] => [68_u8, 117, 102, 84, 102, 105, 119, 107]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx1q_p8 => vqtbx1q_p8:
+ - table[poly8x16_t]: [
+ 0_u8, 17, 34, 51, 68, 85, 102, 119,
+ 106, 93, 84, 117, 104, 116, 72, 121
+ ] |
+ - ext[poly8x16_t]: [
+ 100_u8, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115
+ ] |
+ - ctrl[u8x16]: [127_u8, 15, 1, 14, 2, 13, 3, 12, 4_u8, 11, 16, 10, 6, 19, 7, 18]
+ => [100_u8, 121, 17, 72, 34, 116, 51, 104, 68, 117, 110, 84, 102, 113, 119, 115]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx2_s8 => vqtbx2_s8:
+ - table[int8x16x2_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31
+ ] |
+ - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] |
+ - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] |
+ - ctrl[i8x8]: [4_i8, 31, 32, 10, 6, 49, 7, 18] => [4_i8, -31, 102, 10, 6, -105, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx2q_s8 => vqtbx2q_s8:
+ - table[int8x16x2_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31
+ ] |
+ - ext[int8x16_t]: [
+ 100_i8, -101, 102, -103, 104, -105, 106, -107,
+ 108, -109, 110, -111, 112, -113, 114, -115
+ ] |
+ - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 31, 32, 10, 6, 49, 7, 18]
+ => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, -31, 110, 10, 6, -113, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx2_u8 => vqtbx2_u8:
+ - table[uint8x16x2_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ] |
+ - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 102, 10, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx2q_u8 => vqtbx2q_u8:
+ - table[uint8x16x2_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ] |
+ - ext[uint8x16_t]: [
+ 100_u8, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18]
+ => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 110, 10, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx2_p8 => vqtbx2_p8:
+ - table[poly8x16x2_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ] |
+ - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 31, 32, 10, 6, 49, 7, 18] => [4_u8, 31, 102, 10, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx2q_p8 => vqtbx2q_p8:
+ - table[poly8x16x2_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ] |
+ - ext[poly8x16_t]: [
+ 100_u8, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 31, 32, 10, 6, 49, 7, 18]
+ => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 31, 110, 10, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx3_s8 => vqtbx3_s8:
+ - table[int8x16x3_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31,
+ 32, -33, 34, -35, 36, -37, 38, -39,
+ 40, -41, 42, -43, 44, -45, 46, -47
+ ] |
+ - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] |
+ - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] |
+ - ctrl[i8x8]: [4_i8, 32, 46, 51, 6, 49, 7, 18] => [4_i8, 32, 46, -103, 6, -105, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx3q_s8 => vqtbx3q_s8:
+ - table[int8x16x3_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31,
+ 32, -33, 34, -35, 36, -37, 38, -39,
+ 40, -41, 42, -43, 44, -45, 46, -47
+ ] |
+ - ext[int8x16_t]: [
+ 100_i8, -101, 102, -103, 104, -105, 106, -107,
+ 108, -109, 110, -111, 112, -113, 114, -115
+ ] |
+ - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 32, 46, 51, 6, 49, 7, 18]
+ => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, 32, 46, -111, 6, -113, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx3_u8 => vqtbx3_u8:
+ - table[uint8x16x3_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47
+ ] |
+ - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 103, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx3q_u8 => vqtbx3q_u8:
+ - table[uint8x16x3_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47
+ ] |
+ - ext[uint8x16_t]: [
+ 100_u8, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18]
+ => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 111, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx3_p8 => vqtbx3_p8:
+ - table[poly8x16x3_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47
+ ] |
+ - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 32, 46, 51, 6, 49, 7, 18] => [4_u8, 32, 46, 103, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx3q_p8 => vqtbx3q_p8:
+ - table[poly8x16x3_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47
+ ] |
+ - ext[poly8x16_t]: [
+ 100_u8, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 32, 46, 51, 6, 49, 7, 18]
+ => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 32, 46, 111, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx4_s8 => vqtbx4_s8:
+ - table[int8x16x4_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31,
+ 32, -33, 34, -35, 36, -37, 38, -39,
+ 40, -41, 42, -43, 44, -45, 46, -47,
+ 48, -49, 50, -51, 52, -53, 54, -55,
+ 56, -57, 58, -59, 60, -61, 62, -63
+ ] |
+ - ext[int8x8_t]: [100_i8, -101, 102, -103, 104, -105, 106, -107] |
+ - ctrl[i8x8]: [80_i8, 15, 1, 24, 2, 13, 3, 29] => [100_i8, -15, -1, 24, 2, -13, -3, -29] |
+ - ctrl[i8x8]: [4_i8, 46, 64, 51, 6, 71, 7, 18] => [4_i8, 46, 102, -51, 6, -105, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx4q_s8 => vqtbx4q_s8:
+ - table[int8x16x4_t]: [
+ 0_i8, -1, 2, -3, 4, -5, 6, -7,
+ 8, -9, 10, -11, 12, -13, 14, -15,
+ 16, -17, 18, -19, 20, -21, 22, -23,
+ 24, -25, 26, -27, 28, -29, 30, -31,
+ 32, -33, 34, -35, 36, -37, 38, -39,
+ 40, -41, 42, -43, 44, -45, 46, -47,
+ 48, -49, 50, -51, 52, -53, 54, -55,
+ 56, -57, 58, -59, 60, -61, 62, -63
+ ] |
+ - ext[int8x16_t]: [
+ 100_i8, -101, 102, -103, 104, -105, 106, -107,
+ 108, -109, 110, -111, 112, -113, 114, -115
+ ] |
+ - ctrl[i8x16]: [80_i8, 15, 1, 24, 2, 13, 3, 29, 4_i8, 46, 64, 51, 6, 71, 7, 18]
+ => [100_i8, -15, -1, 24, 2, -13, -3, -29, 4, 46, 110, -51, 6, -113, -7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx4_u8 => vqtbx4_u8:
+ - table[uint8x16x4_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ ] |
+ - ext[uint8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 102, 51, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx4q_u8 => vqtbx4q_u8:
+ - table[uint8x16x4_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ ] |
+ - ext[uint8x16_t]: [
+ 100_u8, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18]
+ => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 110, 51, 6, 113, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx4_p8 => vqtbx4_p8:
+ - table[poly8x16x4_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ ] |
+ - ext[poly8x8_t]: [100_u8, 101, 102, 103, 104, 105, 106, 107] |
+ - ctrl[u8x8]: [80_u8, 15, 1, 24, 2, 13, 3, 29] => [100_u8, 15, 1, 24, 2, 13, 3, 29] |
+ - ctrl[u8x8]: [4_u8, 46, 64, 51, 6, 71, 7, 18] => [4_u8, 46, 102, 51, 6, 105, 7, 18]
+);
+
+#[cfg(target_arch = "aarch64")]
+test_vtbx!(
+ test_vqtbx4q_p8 => vqtbx4q_p8:
+ - table[poly8x16x4_t]: [
+ 0_u8, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ ] |
+ - ext[poly8x16_t]: [
+ 100_u8, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115
+ ] |
+ - ctrl[u8x16]: [80_u8, 15, 1, 24, 2, 13, 3, 29, 4_u8, 46, 64, 51, 6, 71, 7, 18]
+ => [100_u8, 15, 1, 24, 2, 13, 3, 29, 4, 46, 110, 51, 6, 113, 7, 18]
+);
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/registers/aarch32.rs b/library/stdarch/crates/core_arch/src/arm_shared/registers/aarch32.rs
new file mode 100644
index 000000000..e0b71218a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/registers/aarch32.rs
@@ -0,0 +1,9 @@
+/// Application Program Status Register
+pub struct APSR;
+
+// Note (@Lokathor): Because this breaks the use of Rust on the Game Boy
+// Advance, this change must be reverted until Rust learns to handle cpu state
+// properly. See also: https://github.com/rust-lang/stdarch/issues/702
+
+//#[cfg(any(not(target_feature = "thumb-mode"), target_feature = "v6t2"))]
+//rsr!(APSR);
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/registers/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/registers/mod.rs
new file mode 100644
index 000000000..621efe2f5
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/registers/mod.rs
@@ -0,0 +1,121 @@
+#[allow(unused_macros)]
+macro_rules! rsr {
+ ($R:ident) => {
+ impl super::super::sealed::Rsr for $R {
+ unsafe fn __rsr(&self) -> u32 {
+ let r: u32;
+ crate::arch::asm!(concat!("mrs {},", stringify!($R)), out(reg) r, options(nomem, nostack));
+ r
+ }
+ }
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! rsrp {
+ ($R:ident) => {
+ impl super::super::sealed::Rsrp for $R {
+ unsafe fn __rsrp(&self) -> *const u8 {
+ let r: *const u8;
+ crate::arch::asm!(concat!("mrs {},", stringify!($R)), out(reg) r, options(nomem, nostack));
+ r
+ }
+ }
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! wsr {
+ ($R:ident) => {
+ impl super::super::sealed::Wsr for $R {
+ unsafe fn __wsr(&self, value: u32) {
+ crate::arch::asm!(concat!("msr ", stringify!($R), ", {}"), in(reg) value, options(nomem, nostack));
+ }
+ }
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! wsrp {
+ ($R:ident) => {
+ impl super::super::sealed::Wsrp for $R {
+ unsafe fn __wsrp(&self, value: *const u8) {
+ crate::arch::asm!(concat!("msr ", stringify!($R), ", {}"), in(reg) value, options(nomem, nostack));
+ }
+ }
+ };
+}
+
+#[cfg(target_feature = "mclass")]
+mod v6m;
+
+#[cfg(target_feature = "mclass")]
+pub use self::v6m::*;
+
+#[cfg(all(target_feature = "v7", target_feature = "mclass"))]
+mod v7m;
+
+#[cfg(all(target_feature = "v7", target_feature = "mclass"))]
+pub use self::v7m::*;
+
+#[cfg(not(target_arch = "aarch64"))]
+mod aarch32;
+
+#[cfg(not(target_arch = "aarch64"))]
+pub use self::aarch32::*;
+
+/// Reads a 32-bit system register
+#[inline(always)]
+pub unsafe fn __rsr<R>(reg: R) -> u32
+where
+ R: super::sealed::Rsr,
+{
+ reg.__rsr()
+}
+
+/// Reads a 64-bit system register
+#[cfg(target_arch = "aarch64")]
+#[inline(always)]
+pub unsafe fn __rsr64<R>(reg: R) -> u64
+where
+ R: super::sealed::Rsr64,
+{
+ reg.__rsr64()
+}
+
+/// Reads a system register containing an address
+#[inline(always)]
+pub unsafe fn __rsrp<R>(reg: R) -> *const u8
+where
+ R: super::sealed::Rsrp,
+{
+ reg.__rsrp()
+}
+
+/// Writes a 32-bit system register
+#[inline(always)]
+pub unsafe fn __wsr<R>(reg: R, value: u32)
+where
+ R: super::sealed::Wsr,
+{
+ reg.__wsr(value)
+}
+
+/// Writes a 64-bit system register
+#[cfg(target_arch = "aarch64")]
+#[inline(always)]
+pub unsafe fn __wsr64<R>(reg: R, value: u64)
+where
+ R: super::sealed::Wsr64,
+{
+ reg.__wsr64(value)
+}
+
+/// Writes a system register containing an address
+#[inline(always)]
+pub unsafe fn __wsrp<R>(reg: R, value: *const u8)
+where
+ R: super::sealed::Wsrp,
+{
+ reg.__wsrp(value)
+}
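+
+// A minimal usage sketch (assumes a Cortex-M target where `PRIMASK` is
+// provided by the `v6m` module above; hypothetical, not compiled here):
+//
+// unsafe {
+//     let primask: u32 = __rsr(PRIMASK); // read the PRIMASK register
+//     __wsr(PRIMASK, primask | 1);       // set bit 0 and write it back
+// }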
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/registers/v6m.rs b/library/stdarch/crates/core_arch/src/arm_shared/registers/v6m.rs
new file mode 100644
index 000000000..7acc63b6d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/registers/v6m.rs
@@ -0,0 +1,39 @@
+/// CONTROL register
+pub struct CONTROL;
+
+rsr!(CONTROL);
+wsr!(CONTROL);
+
+/// Execution Program Status Register
+pub struct EPSR;
+
+rsr!(EPSR);
+
+/// Interrupt Program Status Register
+pub struct IPSR;
+
+rsr!(IPSR);
+
+/// Main Stack Pointer
+pub struct MSP;
+
+rsrp!(MSP);
+wsrp!(MSP);
+
+/// Priority Mask Register
+pub struct PRIMASK;
+
+rsr!(PRIMASK);
+wsr!(PRIMASK);
+
+/// Process Stack Pointer
+pub struct PSP;
+
+rsrp!(PSP);
+wsrp!(PSP);
+
+/// Program Status Register
+#[allow(non_camel_case_types)]
+pub struct xPSR;
+
+rsr!(xPSR);
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/registers/v7m.rs b/library/stdarch/crates/core_arch/src/arm_shared/registers/v7m.rs
new file mode 100644
index 000000000..d1b1d474f
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/registers/v7m.rs
@@ -0,0 +1,17 @@
+/// Base Priority Mask Register
+pub struct BASEPRI;
+
+rsr!(BASEPRI);
+wsr!(BASEPRI);
+
+/// Base Priority Mask Register (conditional write)
+#[allow(non_camel_case_types)]
+pub struct BASEPRI_MAX;
+
+wsr!(BASEPRI_MAX);
+
+/// Fault Mask Register
+pub struct FAULTMASK;
+
+rsr!(FAULTMASK);
+wsr!(FAULTMASK);
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/test_support.rs b/library/stdarch/crates/core_arch/src/arm_shared/test_support.rs
new file mode 100644
index 000000000..ff752f25b
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm_shared/test_support.rs
@@ -0,0 +1,836 @@
+#[cfg(target_arch = "arm")]
+use crate::core_arch::arm::*;
+
+#[cfg(target_arch = "aarch64")]
+use crate::core_arch::aarch64::*;
+
+use crate::core_arch::simd::*;
+use std::{i16, i32, i8, mem::transmute, u16, u32, u8, vec::Vec};
+
+macro_rules! V_u8 {
+ () => {
+ vec![0x00u8, 0x01u8, 0x02u8, 0x0Fu8, 0x80u8, 0xF0u8, 0xFFu8]
+ };
+}
+macro_rules! V_u16 {
+ () => {
+ vec![
+ 0x0000u16, 0x0101u16, 0x0202u16, 0x0F0Fu16, 0x8000u16, 0xF0F0u16, 0xFFFFu16,
+ ]
+ };
+}
+macro_rules! V_u32 {
+ () => {
+ vec![
+ 0x00000000u32,
+ 0x01010101u32,
+ 0x02020202u32,
+ 0x0F0F0F0Fu32,
+ 0x80000000u32,
+ 0xF0F0F0F0u32,
+ 0xFFFFFFFFu32,
+ ]
+ };
+}
+macro_rules! V_u64 {
+ () => {
+ vec![
+ 0x0000000000000000u64,
+ 0x0101010101010101u64,
+ 0x0202020202020202u64,
+ 0x0F0F0F0F0F0F0F0Fu64,
+ 0x8080808080808080u64,
+ 0xF0F0F0F0F0F0F0F0u64,
+ 0xFFFFFFFFFFFFFFFFu64,
+ ]
+ };
+}
+
+macro_rules! V_i8 {
+ () => {
+ vec![
+ 0x00i8, 0x01i8, 0x02i8, 0x0Fi8, -128i8, /* 0x80 */
+ -16i8, /* 0xF0 */
+ -1i8, /* 0xFF */
+ ]
+ };
+}
+macro_rules! V_i16 {
+ () => {
+ vec![
+ 0x0000i16, 0x0101i16, 0x0202i16, 0x0F0Fi16, -32768i16, /* 0x8000 */
+ -3856i16, /* 0xF0F0 */
+            -1i16, /* 0xFFFF */
+ ]
+ };
+}
+macro_rules! V_i32 {
+ () => {
+ vec![
+ 0x00000000i32,
+ 0x01010101i32,
+ 0x02020202i32,
+ 0x0F0F0F0Fi32,
+            -2139062144i32, /* 0x80808080 */
+ -252645136i32, /* 0xF0F0F0F0 */
+ -1i32, /* 0xFFFFFFFF */
+ ]
+ };
+}
+
+macro_rules! V_i64 {
+ () => {
+ vec![
+ 0x0000000000000000i64,
+ 0x0101010101010101i64,
+ 0x0202020202020202i64,
+ 0x0F0F0F0F0F0F0F0Fi64,
+ -9223372036854775808i64, /* 0x8000000000000000 */
+ -1152921504606846976i64, /* 0xF000000000000000 */
+ -1i64, /* 0xFFFFFFFFFFFFFFFF */
+ ]
+ };
+}
+
+macro_rules! V_f32 {
+ () => {
+ vec![
+ 0.0f32,
+ 1.0f32,
+ -1.0f32,
+ 1.2f32,
+ 2.4f32,
+ std::f32::MAX,
+ std::f32::MIN,
+ std::f32::INFINITY,
+ std::f32::NEG_INFINITY,
+ std::f32::NAN,
+ ]
+ };
+}
+
+macro_rules! to64 {
+ ($t : ident) => {
+ |v: $t| -> u64 { transmute(v) }
+ };
+}
+
+macro_rules! to128 {
+ ($t : ident) => {
+ |v: $t| -> u128 { transmute(v) }
+ };
+}
+
+pub(crate) fn test<T, U, V, W, X>(
+ vals: Vec<T>,
+ fill1: fn(T) -> V,
+ fill2: fn(U) -> W,
+ cast: fn(W) -> X,
+ test_fun: fn(V, V) -> W,
+ verify_fun: fn(T, T) -> U,
+) where
+ T: Copy + core::fmt::Debug + std::cmp::PartialEq,
+ U: Copy + core::fmt::Debug + std::cmp::PartialEq,
+ V: Copy + core::fmt::Debug,
+ W: Copy + core::fmt::Debug,
+ X: Copy + core::fmt::Debug + std::cmp::PartialEq,
+{
+ let pairs = vals.iter().zip(vals.iter());
+
+ for (i, j) in pairs {
+ let a: V = fill1(*i);
+ let b: V = fill1(*j);
+
+ let actual_pre: W = test_fun(a, b);
+ let expected_pre: W = fill2(verify_fun(*i, *j));
+
+ let actual: X = cast(actual_pre);
+ let expected: X = cast(expected_pre);
+
+ assert_eq!(
+ actual, expected,
+ "[{:?}:{:?}] :\nf({:?}, {:?}) = {:?}\ng({:?}, {:?}) = {:?}\n",
+ *i, *j, &a, &b, actual_pre, &a, &b, expected_pre
+ );
+ }
+}
+
+macro_rules! gen_test_fn {
+ ($n: ident, $t: ident, $u: ident, $v: ident, $w: ident, $x: ident, $vals: expr, $fill1: expr, $fill2: expr, $cast: expr) => {
+ pub(crate) fn $n(test_fun: fn($v, $v) -> $w, verify_fun: fn($t, $t) -> $u) {
+ unsafe {
+ test::<$t, $u, $v, $w, $x>($vals, $fill1, $fill2, $cast, test_fun, verify_fun)
+ };
+ }
+ };
+}
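+
+// Sketch of how a generated helper is meant to be used from an intrinsic
+// test (hypothetical call site; assumes the NEON `vadd_u8` is in scope):
+//
+// test_ari_u8(
+//     |a: uint8x8_t, b: uint8x8_t| unsafe { vadd_u8(a, b) },
+//     |a: u8, b: u8| a.wrapping_add(b),
+// );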
+
+macro_rules! gen_fill_fn {
+ ($id: ident, $el_width: expr, $num_els: expr, $in_t : ident, $out_t: ident, $cmp_t: ident) => {
+ pub(crate) fn $id(val: $in_t) -> $out_t {
+ let initial: [$in_t; $num_els] = [val; $num_els];
+ let result: $cmp_t = unsafe { transmute(initial) };
+ let result_out: $out_t = unsafe { transmute(result) };
+
+ // println!("FILL: {:016x} as {} x {}: {:016x}", val.reverse_bits(), $el_width, $num_els, (result as u64).reverse_bits());
+
+ result_out
+ }
+ };
+}
+
+gen_fill_fn!(fill_u8, 8, 8, u8, uint8x8_t, u64);
+gen_fill_fn!(fill_s8, 8, 8, i8, int8x8_t, u64);
+gen_fill_fn!(fillq_u8, 8, 16, u8, uint8x16_t, u128);
+gen_fill_fn!(fillq_s8, 8, 16, i8, int8x16_t, u128);
+
+gen_fill_fn!(fill_u16, 16, 4, u16, uint16x4_t, u64);
+gen_fill_fn!(fill_s16, 16, 4, i16, int16x4_t, u64);
+gen_fill_fn!(fillq_u16, 16, 8, u16, uint16x8_t, u128);
+gen_fill_fn!(fillq_s16, 16, 8, i16, int16x8_t, u128);
+
+gen_fill_fn!(fill_u32, 32, 2, u32, uint32x2_t, u64);
+gen_fill_fn!(fill_s32, 32, 2, i32, int32x2_t, u64);
+gen_fill_fn!(fillq_u32, 32, 4, u32, uint32x4_t, u128);
+gen_fill_fn!(fillq_s32, 32, 4, i32, int32x4_t, u128);
+
+gen_fill_fn!(fill_u64, 64, 1, u64, uint64x1_t, u64);
+gen_fill_fn!(fill_s64, 64, 1, i64, int64x1_t, u64);
+gen_fill_fn!(fillq_u64, 64, 2, u64, uint64x2_t, u128);
+gen_fill_fn!(fillq_s64, 64, 2, i64, int64x2_t, u128);
+
+gen_fill_fn!(fill_f32, 32, 2, f32, float32x2_t, u64);
+gen_fill_fn!(fillq_f32, 32, 4, f32, float32x4_t, u128);
+
+gen_test_fn!(
+ test_ari_u8,
+ u8,
+ u8,
+ uint8x8_t,
+ uint8x8_t,
+ u64,
+ V_u8!(),
+ fill_u8,
+ fill_u8,
+ to64!(uint8x8_t)
+);
+gen_test_fn!(
+ test_bit_u8,
+ u8,
+ u8,
+ uint8x8_t,
+ uint8x8_t,
+ u64,
+ V_u8!(),
+ fill_u8,
+ fill_u8,
+ to64!(uint8x8_t)
+);
+gen_test_fn!(
+ test_cmp_u8,
+ u8,
+ u8,
+ uint8x8_t,
+ uint8x8_t,
+ u64,
+ V_u8!(),
+ fill_u8,
+ fill_u8,
+ to64!(uint8x8_t)
+);
+gen_test_fn!(
+ testq_ari_u8,
+ u8,
+ u8,
+ uint8x16_t,
+ uint8x16_t,
+ u128,
+ V_u8!(),
+ fillq_u8,
+ fillq_u8,
+ to128!(uint8x16_t)
+);
+gen_test_fn!(
+ testq_bit_u8,
+ u8,
+ u8,
+ uint8x16_t,
+ uint8x16_t,
+ u128,
+ V_u8!(),
+ fillq_u8,
+ fillq_u8,
+ to128!(uint8x16_t)
+);
+gen_test_fn!(
+ testq_cmp_u8,
+ u8,
+ u8,
+ uint8x16_t,
+ uint8x16_t,
+ u128,
+ V_u8!(),
+ fillq_u8,
+ fillq_u8,
+ to128!(uint8x16_t)
+);
+
+gen_test_fn!(
+ test_ari_s8,
+ i8,
+ i8,
+ int8x8_t,
+ int8x8_t,
+ u64,
+ V_i8!(),
+ fill_s8,
+ fill_s8,
+ to64!(int8x8_t)
+);
+gen_test_fn!(
+ test_bit_s8,
+ i8,
+ i8,
+ int8x8_t,
+ int8x8_t,
+ u64,
+ V_i8!(),
+ fill_s8,
+ fill_s8,
+ to64!(int8x8_t)
+);
+gen_test_fn!(
+ test_cmp_s8,
+ i8,
+ u8,
+ int8x8_t,
+ uint8x8_t,
+ u64,
+ V_i8!(),
+ fill_s8,
+ fill_u8,
+ to64!(uint8x8_t)
+);
+gen_test_fn!(
+ testq_ari_s8,
+ i8,
+ i8,
+ int8x16_t,
+ int8x16_t,
+ u128,
+ V_i8!(),
+ fillq_s8,
+ fillq_s8,
+ to128!(int8x16_t)
+);
+gen_test_fn!(
+ testq_bit_s8,
+ i8,
+ i8,
+ int8x16_t,
+ int8x16_t,
+ u128,
+ V_i8!(),
+ fillq_s8,
+ fillq_s8,
+ to128!(int8x16_t)
+);
+gen_test_fn!(
+ testq_cmp_s8,
+ i8,
+ u8,
+ int8x16_t,
+ uint8x16_t,
+ u128,
+ V_i8!(),
+ fillq_s8,
+ fillq_u8,
+ to128!(uint8x16_t)
+);
+
+gen_test_fn!(
+ test_ari_u16,
+ u16,
+ u16,
+ uint16x4_t,
+ uint16x4_t,
+ u64,
+ V_u16!(),
+ fill_u16,
+ fill_u16,
+ to64!(uint16x4_t)
+);
+gen_test_fn!(
+ test_bit_u16,
+ u16,
+ u16,
+ uint16x4_t,
+ uint16x4_t,
+ u64,
+ V_u16!(),
+ fill_u16,
+ fill_u16,
+ to64!(uint16x4_t)
+);
+gen_test_fn!(
+ test_cmp_u16,
+ u16,
+ u16,
+ uint16x4_t,
+ uint16x4_t,
+ u64,
+ V_u16!(),
+ fill_u16,
+ fill_u16,
+ to64!(uint16x4_t)
+);
+gen_test_fn!(
+ testq_ari_u16,
+ u16,
+ u16,
+ uint16x8_t,
+ uint16x8_t,
+ u128,
+ V_u16!(),
+ fillq_u16,
+ fillq_u16,
+ to128!(uint16x8_t)
+);
+gen_test_fn!(
+ testq_bit_u16,
+ u16,
+ u16,
+ uint16x8_t,
+ uint16x8_t,
+ u128,
+ V_u16!(),
+ fillq_u16,
+ fillq_u16,
+ to128!(uint16x8_t)
+);
+gen_test_fn!(
+ testq_cmp_u16,
+ u16,
+ u16,
+ uint16x8_t,
+ uint16x8_t,
+ u128,
+ V_u16!(),
+ fillq_u16,
+ fillq_u16,
+ to128!(uint16x8_t)
+);
+
+gen_test_fn!(
+ test_ari_s16,
+ i16,
+ i16,
+ int16x4_t,
+ int16x4_t,
+ u64,
+ V_i16!(),
+ fill_s16,
+ fill_s16,
+ to64!(int16x4_t)
+);
+gen_test_fn!(
+ test_bit_s16,
+ i16,
+ i16,
+ int16x4_t,
+ int16x4_t,
+ u64,
+ V_i16!(),
+ fill_s16,
+ fill_s16,
+ to64!(int16x4_t)
+);
+gen_test_fn!(
+ test_cmp_s16,
+ i16,
+ u16,
+ int16x4_t,
+ uint16x4_t,
+ u64,
+ V_i16!(),
+ fill_s16,
+ fill_u16,
+ to64!(uint16x4_t)
+);
+gen_test_fn!(
+ testq_ari_s16,
+ i16,
+ i16,
+ int16x8_t,
+ int16x8_t,
+ u128,
+ V_i16!(),
+ fillq_s16,
+ fillq_s16,
+ to128!(int16x8_t)
+);
+gen_test_fn!(
+ testq_bit_s16,
+ i16,
+ i16,
+ int16x8_t,
+ int16x8_t,
+ u128,
+ V_i16!(),
+ fillq_s16,
+ fillq_s16,
+ to128!(int16x8_t)
+);
+gen_test_fn!(
+ testq_cmp_s16,
+ i16,
+ u16,
+ int16x8_t,
+ uint16x8_t,
+ u128,
+ V_i16!(),
+ fillq_s16,
+ fillq_u16,
+ to128!(uint16x8_t)
+);
+
+gen_test_fn!(
+ test_ari_u32,
+ u32,
+ u32,
+ uint32x2_t,
+ uint32x2_t,
+ u64,
+ V_u32!(),
+ fill_u32,
+ fill_u32,
+ to64!(uint32x2_t)
+);
+gen_test_fn!(
+ test_bit_u32,
+ u32,
+ u32,
+ uint32x2_t,
+ uint32x2_t,
+ u64,
+ V_u32!(),
+ fill_u32,
+ fill_u32,
+ to64!(uint32x2_t)
+);
+gen_test_fn!(
+ test_cmp_u32,
+ u32,
+ u32,
+ uint32x2_t,
+ uint32x2_t,
+ u64,
+ V_u32!(),
+ fill_u32,
+ fill_u32,
+ to64!(uint32x2_t)
+);
+gen_test_fn!(
+ testq_ari_u32,
+ u32,
+ u32,
+ uint32x4_t,
+ uint32x4_t,
+ u128,
+ V_u32!(),
+ fillq_u32,
+ fillq_u32,
+ to128!(uint32x4_t)
+);
+gen_test_fn!(
+ testq_bit_u32,
+ u32,
+ u32,
+ uint32x4_t,
+ uint32x4_t,
+ u128,
+ V_u32!(),
+ fillq_u32,
+ fillq_u32,
+ to128!(uint32x4_t)
+);
+gen_test_fn!(
+ testq_cmp_u32,
+ u32,
+ u32,
+ uint32x4_t,
+ uint32x4_t,
+ u128,
+ V_u32!(),
+ fillq_u32,
+ fillq_u32,
+ to128!(uint32x4_t)
+);
+
+gen_test_fn!(
+ test_ari_s32,
+ i32,
+ i32,
+ int32x2_t,
+ int32x2_t,
+ u64,
+ V_i32!(),
+ fill_s32,
+ fill_s32,
+ to64!(int32x2_t)
+);
+gen_test_fn!(
+ test_bit_s32,
+ i32,
+ i32,
+ int32x2_t,
+ int32x2_t,
+ u64,
+ V_i32!(),
+ fill_s32,
+ fill_s32,
+ to64!(int32x2_t)
+);
+gen_test_fn!(
+ test_cmp_s32,
+ i32,
+ u32,
+ int32x2_t,
+ uint32x2_t,
+ u64,
+ V_i32!(),
+ fill_s32,
+ fill_u32,
+ to64!(uint32x2_t)
+);
+gen_test_fn!(
+ testq_ari_s32,
+ i32,
+ i32,
+ int32x4_t,
+ int32x4_t,
+ u128,
+ V_i32!(),
+ fillq_s32,
+ fillq_s32,
+ to128!(int32x4_t)
+);
+gen_test_fn!(
+ testq_bit_s32,
+ i32,
+ i32,
+ int32x4_t,
+ int32x4_t,
+ u128,
+ V_i32!(),
+ fillq_s32,
+ fillq_s32,
+ to128!(int32x4_t)
+);
+gen_test_fn!(
+ testq_cmp_s32,
+ i32,
+ u32,
+ int32x4_t,
+ uint32x4_t,
+ u128,
+ V_i32!(),
+ fillq_s32,
+ fillq_u32,
+ to128!(uint32x4_t)
+);
+
+gen_test_fn!(
+ test_ari_u64,
+ u64,
+ u64,
+ uint64x1_t,
+ uint64x1_t,
+ u64,
+ V_u64!(),
+ fill_u64,
+ fill_u64,
+ to64!(uint64x1_t)
+);
+gen_test_fn!(
+ test_bit_u64,
+ u64,
+ u64,
+ uint64x1_t,
+ uint64x1_t,
+ u64,
+ V_u64!(),
+ fill_u64,
+ fill_u64,
+ to64!(uint64x1_t)
+);
+gen_test_fn!(
+ test_cmp_u64,
+ u64,
+ u64,
+ uint64x1_t,
+ uint64x1_t,
+ u64,
+ V_u64!(),
+ fill_u64,
+ fill_u64,
+ to64!(uint64x1_t)
+);
+gen_test_fn!(
+ testq_ari_u64,
+ u64,
+ u64,
+ uint64x2_t,
+ uint64x2_t,
+ u128,
+ V_u64!(),
+ fillq_u64,
+ fillq_u64,
+ to128!(uint64x2_t)
+);
+gen_test_fn!(
+ testq_bit_u64,
+ u64,
+ u64,
+ uint64x2_t,
+ uint64x2_t,
+ u128,
+ V_u64!(),
+ fillq_u64,
+ fillq_u64,
+ to128!(uint64x2_t)
+);
+gen_test_fn!(
+ testq_cmp_u64,
+ u64,
+ u64,
+ uint64x2_t,
+ uint64x2_t,
+ u128,
+ V_u64!(),
+ fillq_u64,
+ fillq_u64,
+ to128!(uint64x2_t)
+);
+
+gen_test_fn!(
+ test_ari_s64,
+ i64,
+ i64,
+ int64x1_t,
+ int64x1_t,
+ u64,
+ V_i64!(),
+ fill_s64,
+ fill_s64,
+ to64!(int64x1_t)
+);
+gen_test_fn!(
+ test_bit_s64,
+ i64,
+ i64,
+ int64x1_t,
+ int64x1_t,
+ u64,
+ V_i64!(),
+ fill_s64,
+ fill_s64,
+ to64!(int64x1_t)
+);
+gen_test_fn!(
+ test_cmp_s64,
+ i64,
+ u64,
+ int64x1_t,
+ uint64x1_t,
+ u64,
+ V_i64!(),
+ fill_s64,
+ fill_u64,
+ to64!(uint64x1_t)
+);
+gen_test_fn!(
+ testq_ari_s64,
+ i64,
+ i64,
+ int64x2_t,
+ int64x2_t,
+ u128,
+ V_i64!(),
+ fillq_s64,
+ fillq_s64,
+ to128!(int64x2_t)
+);
+gen_test_fn!(
+ testq_bit_s64,
+ i64,
+ i64,
+ int64x2_t,
+ int64x2_t,
+ u128,
+ V_i64!(),
+ fillq_s64,
+ fillq_s64,
+ to128!(int64x2_t)
+);
+gen_test_fn!(
+ testq_cmp_s64,
+ i64,
+ u64,
+ int64x2_t,
+ uint64x2_t,
+ u128,
+ V_i64!(),
+ fillq_s64,
+ fillq_u64,
+ to128!(uint64x2_t)
+);
+
+gen_test_fn!(
+ test_ari_f32,
+ f32,
+ f32,
+ float32x2_t,
+ float32x2_t,
+ u64,
+ V_f32!(),
+ fill_f32,
+ fill_f32,
+ to64!(float32x2_t)
+);
+gen_test_fn!(
+ test_cmp_f32,
+ f32,
+ u32,
+ float32x2_t,
+ uint32x2_t,
+ u64,
+ V_f32!(),
+ fill_f32,
+ fill_u32,
+ to64!(uint32x2_t)
+);
+gen_test_fn!(
+ testq_ari_f32,
+ f32,
+ f32,
+ float32x4_t,
+ float32x4_t,
+ u128,
+ V_f32!(),
+ fillq_f32,
+ fillq_f32,
+ to128!(float32x4_t)
+);
+gen_test_fn!(
+ testq_cmp_f32,
+ f32,
+ u32,
+ float32x4_t,
+ uint32x4_t,
+ u128,
+ V_f32!(),
+ fillq_f32,
+ fillq_u32,
+ to128!(uint32x4_t)
+);
diff --git a/library/stdarch/crates/core_arch/src/core_arch_docs.md b/library/stdarch/crates/core_arch/src/core_arch_docs.md
new file mode 100644
index 000000000..eddd1fc0c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/core_arch_docs.md
@@ -0,0 +1,344 @@
+SIMD and vendor intrinsics module.
+
+This module is intended to be the gateway to architecture-specific
+intrinsic functions, typically related to SIMD (but not always!). Each
+architecture that Rust compiles to may contain a submodule here, which
+means that this is not a portable module! If you're writing a portable
+library, take care when using these APIs!
+
+Under this module you'll find an architecture-named module, such as
+`x86_64`. Each `#[cfg(target_arch)]` that Rust can compile to may have a
+module entry here, only present on that particular target. For example the
+`i686-pc-windows-msvc` target will have an `x86` module here, whereas
+`x86_64-pc-windows-msvc` has `x86_64`.
+
+# Overview
+
+This module exposes vendor-specific intrinsics that typically correspond to
+a single machine instruction. These intrinsics are not portable: their
+availability is architecture-dependent, and not every machine of that
+architecture provides every intrinsic.
+
+The `arch` module is intended to be a low-level implementation detail for
+higher-level APIs. Using it correctly can be quite tricky as you need to
+ensure at least a few guarantees are upheld:
+
+* The correct architecture's module is used. For example the `arm` module
+ isn't available on the `x86_64-unknown-linux-gnu` target. This is
+ typically done by ensuring that `#[cfg]` is used appropriately when using
+ this module.
+* The CPU the program is currently running on supports the function being
+ called. For example it is unsafe to call an AVX2 function on a CPU that
+ doesn't actually support AVX2.
+
+As a result of the latter of these guarantees all intrinsics in this module
+are `unsafe` and extra care needs to be taken when calling them!
+
+# CPU Feature Detection
+
+In order to call these APIs in a safe fashion there are a number of
+mechanisms available to ensure that the correct CPU feature is available
+before calling an intrinsic. Let's consider, for example, the
+`_mm256_add_epi64` intrinsic on the `x86` and `x86_64` architectures. This
+function requires the AVX2 feature as [documented by Intel][intel-dox], so
+to correctly call it we need to (a) guarantee we only call it on
+`x86`/`x86_64` and (b) ensure that the CPU feature is available.
+
+[intel-dox]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64&expand=100
+
+## Static CPU Feature Detection
+
+The first option available to us is to conditionally compile code via the
+`#[cfg]` attribute. CPU features correspond to the `target_feature` cfg
+available, and can be used like so:
+
+```ignore
+#[cfg(
+ all(
+ any(target_arch = "x86", target_arch = "x86_64"),
+ target_feature = "avx2"
+ )
+)]
+fn foo() {
+ #[cfg(target_arch = "x86")]
+ use std::arch::x86::_mm256_add_epi64;
+ #[cfg(target_arch = "x86_64")]
+ use std::arch::x86_64::_mm256_add_epi64;
+
+ unsafe {
+ _mm256_add_epi64(...);
+ }
+}
+```
+
+Here we're using `#[cfg(target_feature = "avx2")]` to conditionally compile
+this function into our module. This means that if the `avx2` feature is
+*enabled statically* then we'll use the `_mm256_add_epi64` function at
+runtime. The `unsafe` block here can be justified through the usage of
+`#[cfg]` to only compile the code in situations where the safety guarantees
+are upheld.
+
+Statically enabling a feature is typically done with the `-C
+target-feature` or `-C target-cpu` flags to the compiler. For example if
+your local CPU supports AVX2 then you can compile the above function with:
+
+```sh
+$ RUSTFLAGS='-C target-cpu=native' cargo build
+```
+
+Or otherwise you can specifically enable just the AVX2 feature:
+
+```sh
+$ RUSTFLAGS='-C target-feature=+avx2' cargo build
+```
+
+Note that when you compile a binary with a particular feature enabled it's
+important to ensure that you only run the binary on systems which satisfy
+the required feature set.
+
+## Dynamic CPU Feature Detection
+
+Sometimes static dispatch isn't quite what you want. Instead you might
+want to build a portable binary that runs across a variety of CPUs and
+selects the most optimized implementation available at runtime. This
+allows you to build a "least common denominator" binary whose hot
+sections are specialized for different CPUs.
+
+Taking our previous example, we're going to compile our binary *without*
+AVX2 support, but we'd like to enable it for just one function. We can do
+that like so:
+
+```ignore
+fn foo() {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx2") {
+ return unsafe { foo_avx2() };
+ }
+ }
+
+ // fallback implementation without using AVX2
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+unsafe fn foo_avx2() {
+ #[cfg(target_arch = "x86")]
+ use std::arch::x86::_mm256_add_epi64;
+ #[cfg(target_arch = "x86_64")]
+ use std::arch::x86_64::_mm256_add_epi64;
+
+ _mm256_add_epi64(...);
+}
+```
+
+There are a couple of components in play here, so let's go through them in
+detail!
+
+* First up we notice the `is_x86_feature_detected!` macro. Provided by
+  the standard library, this macro will perform necessary runtime detection
+  to determine whether the CPU the program is running on supports the
+  specified feature. In this case the macro will expand to a boolean
+  expression evaluating to whether the local CPU has the AVX2 feature or
+  not.
+
+  Note that this macro, like the `arch` module, is platform-specific. For
+  example calling `is_x86_feature_detected!("avx2")` on ARM will be a
+  compile-time error. To ensure we don't hit this error a statement-level
+  `#[cfg]` is used to only compile usage of the macro on `x86`/`x86_64`.
+
+* Next up we see our AVX2-enabled function, `foo_avx2`. This function is
+ decorated with the `#[target_feature]` attribute which enables a CPU
+ feature for just this one function. Using a compiler flag like `-C
+ target-feature=+avx2` will enable AVX2 for the entire program, but using
+ an attribute will only enable it for the one function. Usage of the
+ `#[target_feature]` attribute currently requires the function to also be
+  `unsafe`, as we see here. This is because the function can only be
+  correctly called on systems which have the AVX2 feature (just like the
+  intrinsics themselves).
+
+And with all that we should have a working program! This program will run
+across all machines and it'll use the optimized AVX2 implementation on
+machines where support is detected.
+
+# Ergonomics
+
+It's important to note that using the `arch` module is not the easiest
+thing in the world, so if you're curious to try it out you may want to
+brace yourself for some wordiness!
+
+The primary purpose of this module is to enable stable crates on crates.io
+to build up much more ergonomic abstractions which end up using SIMD under
+the hood. Over time these abstractions may also move into the standard
+library itself, but for now this module is tasked with providing the bare
+minimum necessary to use vendor intrinsics on stable Rust.
+
+# Other architectures
+
+This documentation covers only one particular architecture; you can find
+the others at:
+
+* [`x86`]
+* [`x86_64`]
+* [`arm`]
+* [`aarch64`]
+* [`riscv32`]
+* [`riscv64`]
+* [`mips`]
+* [`mips64`]
+* [`powerpc`]
+* [`powerpc64`]
+* [`nvptx`]
+* [`wasm32`]
+
+[`x86`]: ../../core/arch/x86/index.html
+[`x86_64`]: ../../core/arch/x86_64/index.html
+[`arm`]: ../../core/arch/arm/index.html
+[`aarch64`]: ../../core/arch/aarch64/index.html
+[`riscv32`]: ../../core/arch/riscv32/index.html
+[`riscv64`]: ../../core/arch/riscv64/index.html
+[`mips`]: ../../core/arch/mips/index.html
+[`mips64`]: ../../core/arch/mips64/index.html
+[`powerpc`]: ../../core/arch/powerpc/index.html
+[`powerpc64`]: ../../core/arch/powerpc64/index.html
+[`nvptx`]: ../../core/arch/nvptx/index.html
+[`wasm32`]: ../../core/arch/wasm32/index.html
+
+# Examples
+
+First, let's look at an example that doesn't use any intrinsics directly
+but instead relies on LLVM's auto-vectorization to produce optimized
+vectorized code for AVX2 as well as for the default platform.
+
+```rust
+fn main() {
+ let mut dst = [0];
+ add_quickly(&[1], &[2], &mut dst);
+ assert_eq!(dst[0], 3);
+}
+
+fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ // Note that this `unsafe` block is safe because we're testing
+ // that the `avx2` feature is indeed available on our CPU.
+ if is_x86_feature_detected!("avx2") {
+ return unsafe { add_quickly_avx2(a, b, c) };
+ }
+ }
+
+ add_quickly_fallback(a, b, c)
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
+ add_quickly_fallback(a, b, c) // the function below is inlined here
+}
+
+fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
+ for ((a, b), c) in a.iter().zip(b).zip(c) {
+ *c = *a + *b;
+ }
+}
+```
+
+Next up let's take a look at an example of manually using intrinsics. Here
+we'll be using SSE4.1 features to implement hex encoding.
+
+```
+fn main() {
+ let mut dst = [0; 32];
+ hex_encode(b"\x01\x02\x03", &mut dst);
+ assert_eq!(&dst[..6], b"010203");
+
+ let mut src = [0; 16];
+ for i in 0..16 {
+ src[i] = (i + 1) as u8;
+ }
+ hex_encode(&src, &mut dst);
+ assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10");
+}
+
+pub fn hex_encode(src: &[u8], dst: &mut [u8]) {
+ let len = src.len().checked_mul(2).unwrap();
+ assert!(dst.len() >= len);
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("sse4.1") {
+ return unsafe { hex_encode_sse41(src, dst) };
+ }
+ }
+
+ hex_encode_fallback(src, dst)
+}
+
+// translated from
+// <https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp>
+#[target_feature(enable = "sse4.1")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
+ #[cfg(target_arch = "x86")]
+ use std::arch::x86::*;
+ #[cfg(target_arch = "x86_64")]
+ use std::arch::x86_64::*;
+
+ let ascii_zero = _mm_set1_epi8(b'0' as i8);
+ let nines = _mm_set1_epi8(9);
+ let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
+ let and4bits = _mm_set1_epi8(0xf);
+
+ let mut i = 0_isize;
+ while src.len() >= 16 {
+ let invec = _mm_loadu_si128(src.as_ptr() as *const _);
+
+ let masked1 = _mm_and_si128(invec, and4bits);
+ let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
+
+        // yields 0xff for the elements > 9, or 0x00 otherwise
+ let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
+ let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
+
+ // add '0' or the offset depending on the masks
+ let masked1 = _mm_add_epi8(
+ masked1,
+ _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
+ );
+ let masked2 = _mm_add_epi8(
+ masked2,
+ _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
+ );
+
+ // interleave masked1 and masked2 bytes
+ let res1 = _mm_unpacklo_epi8(masked2, masked1);
+ let res2 = _mm_unpackhi_epi8(masked2, masked1);
+
+ _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
+ _mm_storeu_si128(
+ dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
+ res2,
+ );
+ src = &src[16..];
+ i += 16;
+ }
+
+ let i = i as usize;
+ hex_encode_fallback(src, &mut dst[i * 2..]);
+}
+
+fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
+ fn hex(byte: u8) -> u8 {
+ static TABLE: &[u8] = b"0123456789abcdef";
+ TABLE[byte as usize]
+ }
+
+ for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
+ slots[0] = hex((*byte >> 4) & 0xf);
+ slots[1] = hex(*byte & 0xf);
+ }
+}
+```
diff --git a/library/stdarch/crates/core_arch/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs
new file mode 100644
index 000000000..9240d0e84
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/lib.rs
@@ -0,0 +1,74 @@
+#![doc = include_str!("core_arch_docs.md")]
+#![allow(improper_ctypes_definitions)]
+#![allow(dead_code)]
+#![allow(unused_features)]
+#![deny(rust_2018_idioms)]
+#![feature(
+ custom_inner_attributes,
+ link_llvm_intrinsics,
+ platform_intrinsics,
+ repr_simd,
+ simd_ffi,
+ proc_macro_hygiene,
+ stmt_expr_attributes,
+ core_intrinsics,
+ no_core,
+ rustc_attrs,
+ stdsimd,
+ staged_api,
+ doc_cfg,
+ tbm_target_feature,
+ sse4a_target_feature,
+ arm_target_feature,
+ cmpxchg16b_target_feature,
+ avx512_target_feature,
+ mips_target_feature,
+ powerpc_target_feature,
+ wasm_target_feature,
+ abi_unadjusted,
+ rtm_target_feature,
+ f16c_target_feature,
+ allow_internal_unstable,
+ decl_macro,
+ bench_black_box,
+ asm_const
+)]
+#![cfg_attr(test, feature(test, abi_vectorcall))]
+#![deny(clippy::missing_inline_in_public_items)]
+#![allow(
+ clippy::inline_always,
+ clippy::too_many_arguments,
+ clippy::cast_sign_loss,
+ clippy::cast_lossless,
+ clippy::cast_possible_wrap,
+ clippy::cast_possible_truncation,
+ clippy::cast_precision_loss,
+ clippy::shadow_reuse,
+ clippy::cognitive_complexity,
+ clippy::similar_names,
+ clippy::many_single_char_names
+)]
+#![cfg_attr(test, allow(unused_imports))]
+#![no_std]
+#![unstable(feature = "stdsimd", issue = "27731")]
+#![doc(
+ test(attr(deny(warnings))),
+ test(attr(allow(dead_code, deprecated, unused_variables, unused_mut)))
+)]
+
+#[cfg(test)]
+#[macro_use]
+extern crate std;
+#[cfg(test)]
+#[macro_use]
+extern crate std_detect;
+#[path = "mod.rs"]
+mod core_arch;
+
+pub mod arch {
+ pub use crate::core_arch::arch::*;
+ pub use core::arch::asm;
+}
+
+#[allow(unused_imports)]
+use core::{convert, ffi, hint, intrinsics, marker, mem, ops, ptr, sync};
diff --git a/library/stdarch/crates/core_arch/src/macros.rs b/library/stdarch/crates/core_arch/src/macros.rs
new file mode 100644
index 000000000..1e6a3f405
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/macros.rs
@@ -0,0 +1,190 @@
+//! Utility macros.
+
+// Helper struct used to trigger a const-eval error when the const generic
+// immediate value `IMM` is outside the `[MIN, MAX]` range.
+pub(crate) struct ValidateConstImm<const IMM: i32, const MIN: i32, const MAX: i32>;
+impl<const IMM: i32, const MIN: i32, const MAX: i32> ValidateConstImm<IMM, MIN, MAX> {
+ pub(crate) const VALID: () = {
+ assert!(IMM >= MIN && IMM <= MAX, "IMM value not in expected range");
+ };
+}
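+
+// Sketch of the intended pattern (hypothetical intrinsic, not part of this
+// file): naming `VALID` via one of the `static_assert_imm*` macros below
+// forces the range check to happen at compile time.
+//
+// pub unsafe fn dup_lane<const LANE: i32>(a: int8x8_t) -> i8 {
+//     static_assert_imm3!(LANE); // fails to compile unless 0 <= LANE <= 7
+//     simd_extract(a, LANE as u32)
+// }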
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm1 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 1) - 1 }>::VALID;
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm2 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 2) - 1 }>::VALID;
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm3 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 3) - 1 }>::VALID;
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm4 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 4) - 1 }>::VALID;
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm5 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 5) - 1 }>::VALID;
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm6 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 6) - 1 }>::VALID;
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm8 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 8) - 1 }>::VALID;
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm16 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 16) - 1 }>::VALID;
+ };
+}
+
+#[allow(unused)]
+macro_rules! static_assert {
+ ($imm:ident : $ty:ty where $e:expr) => {{
+ struct Validate<const $imm: $ty>();
+ impl<const $imm: $ty> Validate<$imm> {
+ const VALID: () = {
+ assert!($e, concat!("Assertion failed: ", stringify!($e)));
+ };
+ }
+ let _ = Validate::<$imm>::VALID;
+ }};
+}
+
+#[allow(unused)]
+macro_rules! types {
+ ($(
+ $(#[$doc:meta])*
+ pub struct $name:ident($($fields:tt)*);
+ )*) => ($(
+ $(#[$doc])*
+ #[derive(Copy, Clone, Debug)]
+ #[allow(non_camel_case_types)]
+ #[repr(simd)]
+ #[allow(clippy::missing_inline_in_public_items)]
+ pub struct $name($($fields)*);
+ )*)
+}
+
+#[allow(unused)]
+macro_rules! simd_shuffle2 {
+ ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+ $(,)?> $idx:expr $(,)?) => {{
+ struct ConstParam<$(const $imm: $ty),+>;
+ impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+ const IDX: [u32; 2] = $idx;
+ }
+
+ simd_shuffle2($x, $y, ConstParam::<$($imm),+>::IDX)
+ }};
+ ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+ const IDX: [u32; 2] = $idx;
+ simd_shuffle2($x, $y, IDX)
+ }};
+}
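+
+// Sketch of the two invocation forms (hypothetical; assumes vectors `a` and
+// `b` and a const generic `IMM: i32` are in scope):
+//
+// let r = simd_shuffle2!(a, b, [0, 2]);
+// let r = simd_shuffle2!(a, b, <const IMM: i32> [IMM as u32, 1]);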
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle4 {
+ ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+ $(,)?> $idx:expr $(,)?) => {{
+ struct ConstParam<$(const $imm: $ty),+>;
+ impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+ const IDX: [u32; 4] = $idx;
+ }
+
+ simd_shuffle4($x, $y, ConstParam::<$($imm),+>::IDX)
+ }};
+ ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+ const IDX: [u32; 4] = $idx;
+ simd_shuffle4($x, $y, IDX)
+ }};
+}
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle8 {
+ ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+ $(,)?> $idx:expr $(,)?) => {{
+ struct ConstParam<$(const $imm: $ty),+>;
+ impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+ const IDX: [u32; 8] = $idx;
+ }
+
+ simd_shuffle8($x, $y, ConstParam::<$($imm),+>::IDX)
+ }};
+ ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+ const IDX: [u32; 8] = $idx;
+ simd_shuffle8($x, $y, IDX)
+ }};
+}
+
+#[allow(unused)]
+macro_rules! simd_shuffle16 {
+ ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+ $(,)?> $idx:expr $(,)?) => {{
+ struct ConstParam<$(const $imm: $ty),+>;
+ impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+ const IDX: [u32; 16] = $idx;
+ }
+
+ simd_shuffle16($x, $y, ConstParam::<$($imm),+>::IDX)
+ }};
+ ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+ const IDX: [u32; 16] = $idx;
+ simd_shuffle16($x, $y, IDX)
+ }};
+}
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle32 {
+ ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $(,)? $idx:expr $(,)?) => {{
+ struct ConstParam<$(const $imm: $ty),+>;
+ impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+ const IDX: [u32; 32] = $idx;
+ }
+
+ simd_shuffle32($x, $y, ConstParam::<$($imm),+>::IDX)
+ }};
+ ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+ const IDX: [u32; 32] = $idx;
+ simd_shuffle32($x, $y, IDX)
+ }};
+}
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle64 {
+ ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+ $(,)?> $idx:expr $(,)?) => {{
+ struct ConstParam<$(const $imm: $ty),+>;
+ impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+ const IDX: [u32; 64] = $idx;
+ }
+
+ simd_shuffle64($x, $y, ConstParam::<$($imm),+>::IDX)
+ }};
+ ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+ const IDX: [u32; 64] = $idx;
+ simd_shuffle64($x, $y, IDX)
+ }};
+}
diff --git a/library/stdarch/crates/core_arch/src/mips/mod.rs b/library/stdarch/crates/core_arch/src/mips/mod.rs
new file mode 100644
index 000000000..96905aedc
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/mips/mod.rs
@@ -0,0 +1,18 @@
+//! MIPS
+
+// Building this module (even if unused) for non-fp64 targets fails with an LLVM
+// error.
+#[cfg(target_feature = "fp64")]
+mod msa;
+#[cfg(target_feature = "fp64")]
+pub use self::msa::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Generates the trap instruction `BREAK`
+#[cfg_attr(test, assert_instr(break))]
+#[inline]
+pub unsafe fn break_() -> ! {
+ crate::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/mips/msa.rs b/library/stdarch/crates/core_arch/src/mips/msa.rs
new file mode 100644
index 000000000..85ed30d18
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/mips/msa.rs
@@ -0,0 +1,17894 @@
+//! MIPS SIMD Architecture intrinsics
+//!
+//! The reference is [MIPS Architecture for Programmers Volume IV-j: The
+//! MIPS32 SIMD Architecture Module Revision 1.12][msa_ref].
+//!
+//! [msa_ref]: http://cdn2.imgtec.com/documentation/MD00866-2B-MSA32-AFP-01.12.pdf
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::mem;
+
+#[macro_use]
+mod macros;
+
+types! {
+    /// MIPS-specific 128-bit wide vector of 16 packed `i8`.
+ pub struct v16i8(
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 8 packed `i16`.
+ pub struct v8i16(
+ i16, i16, i16, i16, i16, i16, i16, i16,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 4 packed `i32`.
+ pub struct v4i32(
+ i32, i32, i32, i32,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 2 packed `i64`.
+ pub struct v2i64(
+ i64, i64,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 16 packed `u8`.
+ pub struct v16u8(
+ u8, u8, u8, u8, u8, u8, u8, u8,
+ u8, u8, u8, u8, u8, u8, u8, u8,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 8 packed `u16`.
+ pub struct v8u16(
+ u16, u16, u16, u16, u16, u16, u16, u16,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 4 packed `u32`.
+ pub struct v4u32(
+ u32, u32, u32, u32,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 2 packed `u64`.
+ pub struct v2u64(
+ u64, u64,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 4 packed `f32`.
+ pub struct v4f32(
+ f32, f32, f32, f32,
+ );
+
+    /// MIPS-specific 128-bit wide vector of 2 packed `f64`.
+ pub struct v2f64(
+ f64, f64,
+ );
+}
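+
+// Within this crate, values of these types are typically produced by
+// transmuting an array of the lane type (sketch, not compiled here):
+//
+// let a: v16i8 = mem::transmute([0i8; 16]);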
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.mips.add.a.b"]
+ fn msa_add_a_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.add.a.h"]
+ fn msa_add_a_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.add.a.w"]
+ fn msa_add_a_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.add.a.d"]
+ fn msa_add_a_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.adds.a.b"]
+ fn msa_adds_a_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.adds.a.h"]
+ fn msa_adds_a_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.adds.a.w"]
+ fn msa_adds_a_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.adds.a.d"]
+ fn msa_adds_a_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.adds.s.b"]
+ fn msa_adds_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.adds.s.h"]
+ fn msa_adds_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.adds.s.w"]
+ fn msa_adds_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.adds.s.d"]
+ fn msa_adds_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.adds.u.b"]
+ fn msa_adds_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.adds.u.h"]
+ fn msa_adds_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.adds.u.w"]
+ fn msa_adds_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.adds.u.d"]
+ fn msa_adds_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.addv.b"]
+ fn msa_addv_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.addv.h"]
+ fn msa_addv_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.addv.w"]
+ fn msa_addv_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.addv.d"]
+ fn msa_addv_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.addvi.b"]
+ fn msa_addvi_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.addvi.h"]
+ fn msa_addvi_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.addvi.w"]
+ fn msa_addvi_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.addvi.d"]
+ fn msa_addvi_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.and.v"]
+ fn msa_and_v(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.andi.b"]
+ fn msa_andi_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.asub.s.b"]
+ fn msa_asub_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.asub.s.h"]
+ fn msa_asub_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.asub.s.w"]
+ fn msa_asub_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.asub.s.d"]
+ fn msa_asub_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.asub.u.b"]
+ fn msa_asub_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.asub.u.h"]
+ fn msa_asub_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.asub.u.w"]
+ fn msa_asub_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.asub.u.d"]
+ fn msa_asub_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.ave.s.b"]
+ fn msa_ave_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.ave.s.h"]
+ fn msa_ave_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.ave.s.w"]
+ fn msa_ave_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.ave.s.d"]
+ fn msa_ave_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.ave.u.b"]
+ fn msa_ave_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.ave.u.h"]
+ fn msa_ave_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.ave.u.w"]
+ fn msa_ave_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.ave.u.d"]
+ fn msa_ave_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.aver.s.b"]
+ fn msa_aver_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.aver.s.h"]
+ fn msa_aver_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.aver.s.w"]
+ fn msa_aver_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.aver.s.d"]
+ fn msa_aver_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.aver.u.b"]
+ fn msa_aver_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.aver.u.h"]
+ fn msa_aver_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.aver.u.w"]
+ fn msa_aver_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.aver.u.d"]
+ fn msa_aver_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.bclr.b"]
+ fn msa_bclr_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.bclr.h"]
+ fn msa_bclr_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.bclr.w"]
+ fn msa_bclr_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.bclr.d"]
+ fn msa_bclr_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.bclri.b"]
+ fn msa_bclri_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.bclri.h"]
+ fn msa_bclri_h(a: v8u16, b: i32) -> v8u16;
+ #[link_name = "llvm.mips.bclri.w"]
+ fn msa_bclri_w(a: v4u32, b: i32) -> v4u32;
+ #[link_name = "llvm.mips.bclri.d"]
+ fn msa_bclri_d(a: v2u64, b: i32) -> v2u64;
+ #[link_name = "llvm.mips.binsl.b"]
+ fn msa_binsl_b(a: v16u8, b: v16u8, c: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.binsl.h"]
+ fn msa_binsl_h(a: v8u16, b: v8u16, c: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.binsl.w"]
+ fn msa_binsl_w(a: v4u32, b: v4u32, c: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.binsl.d"]
+ fn msa_binsl_d(a: v2u64, b: v2u64, c: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.binsli.b"]
+ fn msa_binsli_b(a: v16u8, b: v16u8, c: i32) -> v16u8;
+ #[link_name = "llvm.mips.binsli.h"]
+ fn msa_binsli_h(a: v8u16, b: v8u16, c: i32) -> v8u16;
+ #[link_name = "llvm.mips.binsli.w"]
+ fn msa_binsli_w(a: v4u32, b: v4u32, c: i32) -> v4u32;
+ #[link_name = "llvm.mips.binsli.d"]
+ fn msa_binsli_d(a: v2u64, b: v2u64, c: i32) -> v2u64;
+ #[link_name = "llvm.mips.binsr.b"]
+ fn msa_binsr_b(a: v16u8, b: v16u8, c: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.binsr.h"]
+ fn msa_binsr_h(a: v8u16, b: v8u16, c: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.binsr.w"]
+ fn msa_binsr_w(a: v4u32, b: v4u32, c: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.binsr.d"]
+ fn msa_binsr_d(a: v2u64, b: v2u64, c: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.binsri.b"]
+ fn msa_binsri_b(a: v16u8, b: v16u8, c: i32) -> v16u8;
+ #[link_name = "llvm.mips.binsri.h"]
+ fn msa_binsri_h(a: v8u16, b: v8u16, c: i32) -> v8u16;
+ #[link_name = "llvm.mips.binsri.w"]
+ fn msa_binsri_w(a: v4u32, b: v4u32, c: i32) -> v4u32;
+ #[link_name = "llvm.mips.binsri.d"]
+ fn msa_binsri_d(a: v2u64, b: v2u64, c: i32) -> v2u64;
+ #[link_name = "llvm.mips.bmnz.v"]
+ fn msa_bmnz_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.bmnzi.b"]
+ fn msa_bmnzi_b(a: v16u8, b: v16u8, c: i32) -> v16u8;
+ #[link_name = "llvm.mips.bmz.v"]
+ fn msa_bmz_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.bmzi.b"]
+ fn msa_bmzi_b(a: v16u8, b: v16u8, c: i32) -> v16u8;
+ #[link_name = "llvm.mips.bneg.b"]
+ fn msa_bneg_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.bneg.h"]
+ fn msa_bneg_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.bneg.w"]
+ fn msa_bneg_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.bneg.d"]
+ fn msa_bneg_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.bnegi.b"]
+ fn msa_bnegi_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.bnegi.h"]
+ fn msa_bnegi_h(a: v8u16, b: i32) -> v8u16;
+ #[link_name = "llvm.mips.bnegi.w"]
+ fn msa_bnegi_w(a: v4u32, b: i32) -> v4u32;
+ #[link_name = "llvm.mips.bnegi.d"]
+ fn msa_bnegi_d(a: v2u64, b: i32) -> v2u64;
+ #[link_name = "llvm.mips.bnz.b"]
+ fn msa_bnz_b(a: v16u8) -> i32;
+ #[link_name = "llvm.mips.bnz.h"]
+ fn msa_bnz_h(a: v8u16) -> i32;
+ #[link_name = "llvm.mips.bnz.w"]
+ fn msa_bnz_w(a: v4u32) -> i32;
+ #[link_name = "llvm.mips.bnz.d"]
+ fn msa_bnz_d(a: v2u64) -> i32;
+ #[link_name = "llvm.mips.bnz.v"]
+ fn msa_bnz_v(a: v16u8) -> i32;
+ #[link_name = "llvm.mips.bsel.v"]
+ fn msa_bsel_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.bseli.b"]
+ fn msa_bseli_b(a: v16u8, b: v16u8, c: i32) -> v16u8;
+ #[link_name = "llvm.mips.bset.b"]
+ fn msa_bset_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.bset.h"]
+ fn msa_bset_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.bset.w"]
+ fn msa_bset_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.bset.d"]
+ fn msa_bset_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.bseti.b"]
+ fn msa_bseti_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.bseti.h"]
+ fn msa_bseti_h(a: v8u16, b: i32) -> v8u16;
+ #[link_name = "llvm.mips.bseti.w"]
+ fn msa_bseti_w(a: v4u32, b: i32) -> v4u32;
+ #[link_name = "llvm.mips.bseti.d"]
+ fn msa_bseti_d(a: v2u64, b: i32) -> v2u64;
+ #[link_name = "llvm.mips.bz.b"]
+ fn msa_bz_b(a: v16u8) -> i32;
+ #[link_name = "llvm.mips.bz.h"]
+ fn msa_bz_h(a: v8u16) -> i32;
+ #[link_name = "llvm.mips.bz.w"]
+ fn msa_bz_w(a: v4u32) -> i32;
+ #[link_name = "llvm.mips.bz.d"]
+ fn msa_bz_d(a: v2u64) -> i32;
+ #[link_name = "llvm.mips.bz.v"]
+ fn msa_bz_v(a: v16u8) -> i32;
+ #[link_name = "llvm.mips.ceq.b"]
+ fn msa_ceq_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.ceq.h"]
+ fn msa_ceq_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.ceq.w"]
+ fn msa_ceq_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.ceq.d"]
+ fn msa_ceq_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.ceqi.b"]
+ fn msa_ceqi_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.ceqi.h"]
+ fn msa_ceqi_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.ceqi.w"]
+ fn msa_ceqi_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.ceqi.d"]
+ fn msa_ceqi_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.cfcmsa"]
+ fn msa_cfcmsa(a: i32) -> i32;
+ #[link_name = "llvm.mips.cle.s.b"]
+ fn msa_cle_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.cle.s.h"]
+ fn msa_cle_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.cle.s.w"]
+ fn msa_cle_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.cle.s.d"]
+ fn msa_cle_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.cle.u.b"]
+ fn msa_cle_u_b(a: v16u8, b: v16u8) -> v16i8;
+ #[link_name = "llvm.mips.cle.u.h"]
+ fn msa_cle_u_h(a: v8u16, b: v8u16) -> v8i16;
+ #[link_name = "llvm.mips.cle.u.w"]
+ fn msa_cle_u_w(a: v4u32, b: v4u32) -> v4i32;
+ #[link_name = "llvm.mips.cle.u.d"]
+ fn msa_cle_u_d(a: v2u64, b: v2u64) -> v2i64;
+ #[link_name = "llvm.mips.clei.s.b"]
+ fn msa_clei_s_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.clei.s.h"]
+ fn msa_clei_s_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.clei.s.w"]
+ fn msa_clei_s_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.clei.s.d"]
+ fn msa_clei_s_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.clei.u.b"]
+ fn msa_clei_u_b(a: v16u8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.clei.u.h"]
+ fn msa_clei_u_h(a: v8u16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.clei.u.w"]
+ fn msa_clei_u_w(a: v4u32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.clei.u.d"]
+ fn msa_clei_u_d(a: v2u64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.clt.s.b"]
+ fn msa_clt_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.clt.s.h"]
+ fn msa_clt_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.clt.s.w"]
+ fn msa_clt_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.clt.s.d"]
+ fn msa_clt_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.clt.u.b"]
+ fn msa_clt_u_b(a: v16u8, b: v16u8) -> v16i8;
+ #[link_name = "llvm.mips.clt.u.h"]
+ fn msa_clt_u_h(a: v8u16, b: v8u16) -> v8i16;
+ #[link_name = "llvm.mips.clt.u.w"]
+ fn msa_clt_u_w(a: v4u32, b: v4u32) -> v4i32;
+ #[link_name = "llvm.mips.clt.u.d"]
+ fn msa_clt_u_d(a: v2u64, b: v2u64) -> v2i64;
+ #[link_name = "llvm.mips.clti.s.b"]
+ fn msa_clti_s_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.clti.s.h"]
+ fn msa_clti_s_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.clti.s.w"]
+ fn msa_clti_s_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.clti.s.d"]
+ fn msa_clti_s_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.clti.u.b"]
+ fn msa_clti_u_b(a: v16u8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.clti.u.h"]
+ fn msa_clti_u_h(a: v8u16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.clti.u.w"]
+ fn msa_clti_u_w(a: v4u32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.clti.u.d"]
+ fn msa_clti_u_d(a: v2u64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.copy.s.b"]
+ fn msa_copy_s_b(a: v16i8, b: i32) -> i32;
+ #[link_name = "llvm.mips.copy.s.h"]
+ fn msa_copy_s_h(a: v8i16, b: i32) -> i32;
+ #[link_name = "llvm.mips.copy.s.w"]
+ fn msa_copy_s_w(a: v4i32, b: i32) -> i32;
+ #[link_name = "llvm.mips.copy.s.d"]
+ fn msa_copy_s_d(a: v2i64, b: i32) -> i64;
+ #[link_name = "llvm.mips.copy.u.b"]
+ fn msa_copy_u_b(a: v16i8, b: i32) -> u32;
+ #[link_name = "llvm.mips.copy.u.h"]
+ fn msa_copy_u_h(a: v8i16, b: i32) -> u32;
+ #[link_name = "llvm.mips.copy.u.w"]
+ fn msa_copy_u_w(a: v4i32, b: i32) -> u32;
+ #[link_name = "llvm.mips.copy.u.d"]
+ fn msa_copy_u_d(a: v2i64, b: i32) -> u64;
+ #[link_name = "llvm.mips.ctcmsa"]
+ fn msa_ctcmsa(imm5: i32, a: i32) -> ();
+ #[link_name = "llvm.mips.div.s.b"]
+ fn msa_div_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.div.s.h"]
+ fn msa_div_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.div.s.w"]
+ fn msa_div_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.div.s.d"]
+ fn msa_div_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.div.u.b"]
+ fn msa_div_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.div.u.h"]
+ fn msa_div_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.div.u.w"]
+ fn msa_div_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.div.u.d"]
+ fn msa_div_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.dotp.s.h"]
+ fn msa_dotp_s_h(a: v16i8, b: v16i8) -> v8i16;
+ #[link_name = "llvm.mips.dotp.s.w"]
+ fn msa_dotp_s_w(a: v8i16, b: v8i16) -> v4i32;
+ #[link_name = "llvm.mips.dotp.s.d"]
+ fn msa_dotp_s_d(a: v4i32, b: v4i32) -> v2i64;
+ #[link_name = "llvm.mips.dotp.u.h"]
+ fn msa_dotp_u_h(a: v16u8, b: v16u8) -> v8u16;
+ #[link_name = "llvm.mips.dotp.u.w"]
+ fn msa_dotp_u_w(a: v8u16, b: v8u16) -> v4u32;
+ #[link_name = "llvm.mips.dotp.u.d"]
+ fn msa_dotp_u_d(a: v4u32, b: v4u32) -> v2u64;
+ #[link_name = "llvm.mips.dpadd.s.h"]
+ fn msa_dpadd_s_h(a: v8i16, b: v16i8, c: v16i8) -> v8i16;
+ #[link_name = "llvm.mips.dpadd.s.w"]
+ fn msa_dpadd_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32;
+ #[link_name = "llvm.mips.dpadd.s.d"]
+ fn msa_dpadd_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64;
+ #[link_name = "llvm.mips.dpadd.u.h"]
+ fn msa_dpadd_u_h(a: v8u16, b: v16u8, c: v16u8) -> v8u16;
+ #[link_name = "llvm.mips.dpadd.u.w"]
+ fn msa_dpadd_u_w(a: v4u32, b: v8u16, c: v8u16) -> v4u32;
+ #[link_name = "llvm.mips.dpadd.u.d"]
+ fn msa_dpadd_u_d(a: v2u64, b: v4u32, c: v4u32) -> v2u64;
+ #[link_name = "llvm.mips.dpsub.s.h"]
+ fn msa_dpsub_s_h(a: v8i16, b: v16i8, c: v16i8) -> v8i16;
+ #[link_name = "llvm.mips.dpsub.s.w"]
+ fn msa_dpsub_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32;
+ #[link_name = "llvm.mips.dpsub.s.d"]
+ fn msa_dpsub_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64;
+ #[link_name = "llvm.mips.dpsub.u.h"]
+ fn msa_dpsub_u_h(a: v8i16, b: v16u8, c: v16u8) -> v8i16;
+ #[link_name = "llvm.mips.dpsub.u.w"]
+ fn msa_dpsub_u_w(a: v4i32, b: v8u16, c: v8u16) -> v4i32;
+ #[link_name = "llvm.mips.dpsub.u.d"]
+ fn msa_dpsub_u_d(a: v2i64, b: v4u32, c: v4u32) -> v2i64;
+ #[link_name = "llvm.mips.fadd.w"]
+ fn msa_fadd_w(a: v4f32, b: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fadd.d"]
+ fn msa_fadd_d(a: v2f64, b: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fcaf.w"]
+ fn msa_fcaf_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcaf.d"]
+ fn msa_fcaf_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fceq.w"]
+ fn msa_fceq_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fceq.d"]
+ fn msa_fceq_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fclass.w"]
+ fn msa_fclass_w(a: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fclass.d"]
+ fn msa_fclass_d(a: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fcle.w"]
+ fn msa_fcle_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcle.d"]
+ fn msa_fcle_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fclt.w"]
+ fn msa_fclt_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fclt.d"]
+ fn msa_fclt_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fcne.w"]
+ fn msa_fcne_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcne.d"]
+ fn msa_fcne_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fcor.w"]
+ fn msa_fcor_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcor.d"]
+ fn msa_fcor_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fcueq.w"]
+ fn msa_fcueq_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcueq.d"]
+ fn msa_fcueq_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fcule.w"]
+ fn msa_fcule_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcule.d"]
+ fn msa_fcule_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fcult.w"]
+ fn msa_fcult_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcult.d"]
+ fn msa_fcult_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fcun.w"]
+ fn msa_fcun_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcun.d"]
+ fn msa_fcun_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fcune.w"]
+ fn msa_fcune_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fcune.d"]
+ fn msa_fcune_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fdiv.w"]
+ fn msa_fdiv_w(a: v4f32, b: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fdiv.d"]
+ fn msa_fdiv_d(a: v2f64, b: v2f64) -> v2f64;
+ // FIXME: 16-bit floats
+ // #[link_name = "llvm.mips.fexdo.h"]
+ // fn msa_fexdo_h(a: v4f32, b: v4f32) -> f16x8;
+ #[link_name = "llvm.mips.fexdo.w"]
+ fn msa_fexdo_w(a: v2f64, b: v2f64) -> v4f32;
+ #[link_name = "llvm.mips.fexp2.w"]
+ fn msa_fexp2_w(a: v4f32, b: v4i32) -> v4f32;
+ #[link_name = "llvm.mips.fexp2.d"]
+ fn msa_fexp2_d(a: v2f64, b: v2i64) -> v2f64;
+ // FIXME: 16-bit floats
+ // #[link_name = "llvm.mips.fexupl.w"]
+ // fn msa_fexupl_w(a: f16x8) -> v4f32;
+ #[link_name = "llvm.mips.fexupl.d"]
+ fn msa_fexupl_d(a: v4f32) -> v2f64;
+ // FIXME: 16-bit floats
+ // #[link_name = "llvm.mips.fexupr.w"]
+ // fn msa_fexupr_w(a: f16x8) -> v4f32;
+ #[link_name = "llvm.mips.fexupr.d"]
+ fn msa_fexupr_d(a: v4f32) -> v2f64;
+ #[link_name = "llvm.mips.ffint.s.w"]
+ fn msa_ffint_s_w(a: v4i32) -> v4f32;
+ #[link_name = "llvm.mips.ffint.s.d"]
+ fn msa_ffint_s_d(a: v2i64) -> v2f64;
+ #[link_name = "llvm.mips.ffint.u.w"]
+ fn msa_ffint_u_w(a: v4u32) -> v4f32;
+ #[link_name = "llvm.mips.ffint.u.d"]
+ fn msa_ffint_u_d(a: v2u64) -> v2f64;
+ #[link_name = "llvm.mips.ffql.w"]
+ fn msa_ffql_w(a: v8i16) -> v4f32;
+ #[link_name = "llvm.mips.ffql.d"]
+ fn msa_ffql_d(a: v4i32) -> v2f64;
+ #[link_name = "llvm.mips.ffqr.w"]
+ fn msa_ffqr_w(a: v8i16) -> v4f32;
+ #[link_name = "llvm.mips.ffqr.d"]
+ fn msa_ffqr_d(a: v4i32) -> v2f64;
+ #[link_name = "llvm.mips.fill.b"]
+ fn msa_fill_b(a: i32) -> v16i8;
+ #[link_name = "llvm.mips.fill.h"]
+ fn msa_fill_h(a: i32) -> v8i16;
+ #[link_name = "llvm.mips.fill.w"]
+ fn msa_fill_w(a: i32) -> v4i32;
+ #[link_name = "llvm.mips.fill.d"]
+ fn msa_fill_d(a: i64) -> v2i64;
+ #[link_name = "llvm.mips.flog2.w"]
+ fn msa_flog2_w(a: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.flog2.d"]
+ fn msa_flog2_d(a: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fmadd.w"]
+ fn msa_fmadd_w(a: v4f32, b: v4f32, c: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fmadd.d"]
+ fn msa_fmadd_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fmax.w"]
+ fn msa_fmax_w(a: v4f32, b: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fmax.d"]
+ fn msa_fmax_d(a: v2f64, b: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fmax.a.w"]
+ fn msa_fmax_a_w(a: v4f32, b: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fmax.a.d"]
+ fn msa_fmax_a_d(a: v2f64, b: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fmin.w"]
+ fn msa_fmin_w(a: v4f32, b: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fmin.d"]
+ fn msa_fmin_d(a: v2f64, b: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fmin.a.w"]
+ fn msa_fmin_a_w(a: v4f32, b: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fmin.a.d"]
+ fn msa_fmin_a_d(a: v2f64, b: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fmsub.w"]
+ fn msa_fmsub_w(a: v4f32, b: v4f32, c: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fmsub.d"]
+ fn msa_fmsub_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fmul.w"]
+ fn msa_fmul_w(a: v4f32, b: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fmul.d"]
+ fn msa_fmul_d(a: v2f64, b: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.frint.w"]
+ fn msa_frint_w(a: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.frint.d"]
+ fn msa_frint_d(a: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.frcp.w"]
+ fn msa_frcp_w(a: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.frcp.d"]
+ fn msa_frcp_d(a: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.frsqrt.w"]
+ fn msa_frsqrt_w(a: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.frsqrt.d"]
+ fn msa_frsqrt_d(a: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fsaf.w"]
+ fn msa_fsaf_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsaf.d"]
+ fn msa_fsaf_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fseq.w"]
+ fn msa_fseq_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fseq.d"]
+ fn msa_fseq_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fsle.w"]
+ fn msa_fsle_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsle.d"]
+ fn msa_fsle_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fslt.w"]
+ fn msa_fslt_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fslt.d"]
+ fn msa_fslt_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fsne.w"]
+ fn msa_fsne_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsne.d"]
+ fn msa_fsne_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fsor.w"]
+ fn msa_fsor_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsor.d"]
+ fn msa_fsor_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fsqrt.w"]
+ fn msa_fsqrt_w(a: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fsqrt.d"]
+ fn msa_fsqrt_d(a: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fsub.w"]
+ fn msa_fsub_w(a: v4f32, b: v4f32) -> v4f32;
+ #[link_name = "llvm.mips.fsub.d"]
+ fn msa_fsub_d(a: v2f64, b: v2f64) -> v2f64;
+ #[link_name = "llvm.mips.fsueq.w"]
+ fn msa_fsueq_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsueq.d"]
+ fn msa_fsueq_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fsule.w"]
+ fn msa_fsule_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsule.d"]
+ fn msa_fsule_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fsult.w"]
+ fn msa_fsult_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsult.d"]
+ fn msa_fsult_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fsun.w"]
+ fn msa_fsun_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsun.d"]
+ fn msa_fsun_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.fsune.w"]
+ fn msa_fsune_w(a: v4f32, b: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.fsune.d"]
+ fn msa_fsune_d(a: v2f64, b: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.ftint.s.w"]
+ fn msa_ftint_s_w(a: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.ftint.s.d"]
+ fn msa_ftint_s_d(a: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.ftint.u.w"]
+ fn msa_ftint_u_w(a: v4f32) -> v4u32;
+ #[link_name = "llvm.mips.ftint.u.d"]
+ fn msa_ftint_u_d(a: v2f64) -> v2u64;
+ #[link_name = "llvm.mips.ftq.h"]
+ fn msa_ftq_h(a: v4f32, b: v4f32) -> v8i16;
+ #[link_name = "llvm.mips.ftq.w"]
+ fn msa_ftq_w(a: v2f64, b: v2f64) -> v4i32;
+ #[link_name = "llvm.mips.ftrunc.s.w"]
+ fn msa_ftrunc_s_w(a: v4f32) -> v4i32;
+ #[link_name = "llvm.mips.ftrunc.s.d"]
+ fn msa_ftrunc_s_d(a: v2f64) -> v2i64;
+ #[link_name = "llvm.mips.ftrunc.u.w"]
+ fn msa_ftrunc_u_w(a: v4f32) -> v4u32;
+ #[link_name = "llvm.mips.ftrunc.u.d"]
+ fn msa_ftrunc_u_d(a: v2f64) -> v2u64;
+ #[link_name = "llvm.mips.hadd.s.h"]
+ fn msa_hadd_s_h(a: v16i8, b: v16i8) -> v8i16;
+ #[link_name = "llvm.mips.hadd.s.w"]
+ fn msa_hadd_s_w(a: v8i16, b: v8i16) -> v4i32;
+ #[link_name = "llvm.mips.hadd.s.d"]
+ fn msa_hadd_s_d(a: v4i32, b: v4i32) -> v2i64;
+ #[link_name = "llvm.mips.hadd.u.h"]
+ fn msa_hadd_u_h(a: v16u8, b: v16u8) -> v8u16;
+ #[link_name = "llvm.mips.hadd.u.w"]
+ fn msa_hadd_u_w(a: v8u16, b: v8u16) -> v4u32;
+ #[link_name = "llvm.mips.hadd.u.d"]
+ fn msa_hadd_u_d(a: v4u32, b: v4u32) -> v2u64;
+ #[link_name = "llvm.mips.hsub.s.h"]
+ fn msa_hsub_s_h(a: v16i8, b: v16i8) -> v8i16;
+ #[link_name = "llvm.mips.hsub.s.w"]
+ fn msa_hsub_s_w(a: v8i16, b: v8i16) -> v4i32;
+ #[link_name = "llvm.mips.hsub.s.d"]
+ fn msa_hsub_s_d(a: v4i32, b: v4i32) -> v2i64;
+ #[link_name = "llvm.mips.hsub.u.h"]
+ fn msa_hsub_u_h(a: v16u8, b: v16u8) -> v8i16;
+ #[link_name = "llvm.mips.hsub.u.w"]
+ fn msa_hsub_u_w(a: v8u16, b: v8u16) -> v4i32;
+ #[link_name = "llvm.mips.hsub.u.d"]
+ fn msa_hsub_u_d(a: v4u32, b: v4u32) -> v2i64;
+ #[link_name = "llvm.mips.ilvev.b"]
+ fn msa_ilvev_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.ilvev.h"]
+ fn msa_ilvev_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.ilvev.w"]
+ fn msa_ilvev_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.ilvev.d"]
+ fn msa_ilvev_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.ilvl.b"]
+ fn msa_ilvl_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.ilvl.h"]
+ fn msa_ilvl_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.ilvl.w"]
+ fn msa_ilvl_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.ilvl.d"]
+ fn msa_ilvl_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.ilvod.b"]
+ fn msa_ilvod_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.ilvod.h"]
+ fn msa_ilvod_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.ilvod.w"]
+ fn msa_ilvod_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.ilvod.d"]
+ fn msa_ilvod_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.ilvr.b"]
+ fn msa_ilvr_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.ilvr.h"]
+ fn msa_ilvr_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.ilvr.w"]
+ fn msa_ilvr_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.ilvr.d"]
+ fn msa_ilvr_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.insert.b"]
+ fn msa_insert_b(a: v16i8, b: i32, c: i32) -> v16i8;
+ #[link_name = "llvm.mips.insert.h"]
+ fn msa_insert_h(a: v8i16, b: i32, c: i32) -> v8i16;
+ #[link_name = "llvm.mips.insert.w"]
+ fn msa_insert_w(a: v4i32, b: i32, c: i32) -> v4i32;
+ #[link_name = "llvm.mips.insert.d"]
+ fn msa_insert_d(a: v2i64, b: i32, c: i64) -> v2i64;
+ #[link_name = "llvm.mips.insve.b"]
+ fn msa_insve_b(a: v16i8, b: i32, c: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.insve.h"]
+ fn msa_insve_h(a: v8i16, b: i32, c: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.insve.w"]
+ fn msa_insve_w(a: v4i32, b: i32, c: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.insve.d"]
+ fn msa_insve_d(a: v2i64, b: i32, c: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.ld.b"]
+ fn msa_ld_b(mem_addr: *mut u8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.ld.h"]
+ fn msa_ld_h(mem_addr: *mut u8, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.ld.w"]
+ fn msa_ld_w(mem_addr: *mut u8, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.ld.d"]
+ fn msa_ld_d(mem_addr: *mut u8, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.ldi.b"]
+ fn msa_ldi_b(a: i32) -> v16i8;
+ #[link_name = "llvm.mips.ldi.h"]
+ fn msa_ldi_h(a: i32) -> v8i16;
+ #[link_name = "llvm.mips.ldi.w"]
+ fn msa_ldi_w(a: i32) -> v4i32;
+ #[link_name = "llvm.mips.ldi.d"]
+ fn msa_ldi_d(a: i32) -> v2i64;
+ #[link_name = "llvm.mips.madd.q.h"]
+ fn msa_madd_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.madd.q.w"]
+ fn msa_madd_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.maddr.q.h"]
+ fn msa_maddr_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.maddr.q.w"]
+ fn msa_maddr_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.maddv.b"]
+ fn msa_maddv_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.maddv.h"]
+ fn msa_maddv_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.maddv.w"]
+ fn msa_maddv_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.maddv.d"]
+ fn msa_maddv_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.max.a.b"]
+ fn msa_max_a_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.max.a.h"]
+ fn msa_max_a_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.max.a.w"]
+ fn msa_max_a_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.max.a.d"]
+ fn msa_max_a_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.max.s.b"]
+ fn msa_max_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.max.s.h"]
+ fn msa_max_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.max.s.w"]
+ fn msa_max_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.max.s.d"]
+ fn msa_max_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.max.u.b"]
+ fn msa_max_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.max.u.h"]
+ fn msa_max_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.max.u.w"]
+ fn msa_max_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.max.u.d"]
+ fn msa_max_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.maxi.s.b"]
+ fn msa_maxi_s_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.maxi.s.h"]
+ fn msa_maxi_s_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.maxi.s.w"]
+ fn msa_maxi_s_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.maxi.s.d"]
+ fn msa_maxi_s_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.maxi.u.b"]
+ fn msa_maxi_u_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.maxi.u.h"]
+ fn msa_maxi_u_h(a: v8u16, b: i32) -> v8u16;
+ #[link_name = "llvm.mips.maxi.u.w"]
+ fn msa_maxi_u_w(a: v4u32, b: i32) -> v4u32;
+ #[link_name = "llvm.mips.maxi.u.d"]
+ fn msa_maxi_u_d(a: v2u64, b: i32) -> v2u64;
+ #[link_name = "llvm.mips.min.a.b"]
+ fn msa_min_a_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.min.a.h"]
+ fn msa_min_a_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.min.a.w"]
+ fn msa_min_a_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.min.a.d"]
+ fn msa_min_a_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.min.s.b"]
+ fn msa_min_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.min.s.h"]
+ fn msa_min_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.min.s.w"]
+ fn msa_min_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.min.s.d"]
+ fn msa_min_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.min.u.b"]
+ fn msa_min_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.min.u.h"]
+ fn msa_min_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.min.u.w"]
+ fn msa_min_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.min.u.d"]
+ fn msa_min_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.mini.s.b"]
+ fn msa_mini_s_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.mini.s.h"]
+ fn msa_mini_s_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.mini.s.w"]
+ fn msa_mini_s_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.mini.s.d"]
+ fn msa_mini_s_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.mini.u.b"]
+ fn msa_mini_u_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.mini.u.h"]
+ fn msa_mini_u_h(a: v8u16, b: i32) -> v8u16;
+ #[link_name = "llvm.mips.mini.u.w"]
+ fn msa_mini_u_w(a: v4u32, b: i32) -> v4u32;
+ #[link_name = "llvm.mips.mini.u.d"]
+ fn msa_mini_u_d(a: v2u64, b: i32) -> v2u64;
+ #[link_name = "llvm.mips.mod.s.b"]
+ fn msa_mod_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.mod.s.h"]
+ fn msa_mod_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.mod.s.w"]
+ fn msa_mod_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.mod.s.d"]
+ fn msa_mod_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.mod.u.b"]
+ fn msa_mod_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.mod.u.h"]
+ fn msa_mod_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.mod.u.w"]
+ fn msa_mod_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.mod.u.d"]
+ fn msa_mod_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.move.v"]
+ fn msa_move_v(a: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.msub.q.h"]
+ fn msa_msub_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.msub.q.w"]
+ fn msa_msub_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.msubr.q.h"]
+ fn msa_msubr_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.msubr.q.w"]
+ fn msa_msubr_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.msubv.b"]
+ fn msa_msubv_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.msubv.h"]
+ fn msa_msubv_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.msubv.w"]
+ fn msa_msubv_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.msubv.d"]
+ fn msa_msubv_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.mul.q.h"]
+ fn msa_mul_q_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.mul.q.w"]
+ fn msa_mul_q_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.mulr.q.h"]
+ fn msa_mulr_q_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.mulr.q.w"]
+ fn msa_mulr_q_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.mulv.b"]
+ fn msa_mulv_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.mulv.h"]
+ fn msa_mulv_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.mulv.w"]
+ fn msa_mulv_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.mulv.d"]
+ fn msa_mulv_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.nloc.b"]
+ fn msa_nloc_b(a: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.nloc.h"]
+ fn msa_nloc_h(a: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.nloc.w"]
+ fn msa_nloc_w(a: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.nloc.d"]
+ fn msa_nloc_d(a: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.nlzc.b"]
+ fn msa_nlzc_b(a: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.nlzc.h"]
+ fn msa_nlzc_h(a: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.nlzc.w"]
+ fn msa_nlzc_w(a: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.nlzc.d"]
+ fn msa_nlzc_d(a: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.nor.v"]
+ fn msa_nor_v(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.nori.b"]
+ fn msa_nori_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.or.v"]
+ fn msa_or_v(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.ori.b"]
+ fn msa_ori_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.pckev.b"]
+ fn msa_pckev_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.pckev.h"]
+ fn msa_pckev_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.pckev.w"]
+ fn msa_pckev_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.pckev.d"]
+ fn msa_pckev_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.pckod.b"]
+ fn msa_pckod_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.pckod.h"]
+ fn msa_pckod_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.pckod.w"]
+ fn msa_pckod_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.pckod.d"]
+ fn msa_pckod_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.pcnt.b"]
+ fn msa_pcnt_b(a: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.pcnt.h"]
+ fn msa_pcnt_h(a: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.pcnt.w"]
+ fn msa_pcnt_w(a: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.pcnt.d"]
+ fn msa_pcnt_d(a: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.sat.s.b"]
+ fn msa_sat_s_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.sat.s.h"]
+ fn msa_sat_s_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.sat.s.w"]
+ fn msa_sat_s_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.sat.s.d"]
+ fn msa_sat_s_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.sat.u.b"]
+ fn msa_sat_u_b(a: v16u8, b: i32) -> v16u8;
+ #[link_name = "llvm.mips.sat.u.h"]
+ fn msa_sat_u_h(a: v8u16, b: i32) -> v8u16;
+ #[link_name = "llvm.mips.sat.u.w"]
+ fn msa_sat_u_w(a: v4u32, b: i32) -> v4u32;
+ #[link_name = "llvm.mips.sat.u.d"]
+ fn msa_sat_u_d(a: v2u64, b: i32) -> v2u64;
+ #[link_name = "llvm.mips.shf.b"]
+ fn msa_shf_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.shf.h"]
+ fn msa_shf_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.shf.w"]
+ fn msa_shf_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.sld.b"]
+ fn msa_sld_b(a: v16i8, b: v16i8, c: i32) -> v16i8;
+ #[link_name = "llvm.mips.sld.h"]
+ fn msa_sld_h(a: v8i16, b: v8i16, c: i32) -> v8i16;
+ #[link_name = "llvm.mips.sld.w"]
+ fn msa_sld_w(a: v4i32, b: v4i32, c: i32) -> v4i32;
+ #[link_name = "llvm.mips.sld.d"]
+ fn msa_sld_d(a: v2i64, b: v2i64, c: i32) -> v2i64;
+ #[link_name = "llvm.mips.sldi.b"]
+ fn msa_sldi_b(a: v16i8, b: v16i8, c: i32) -> v16i8;
+ #[link_name = "llvm.mips.sldi.h"]
+ fn msa_sldi_h(a: v8i16, b: v8i16, c: i32) -> v8i16;
+ #[link_name = "llvm.mips.sldi.w"]
+ fn msa_sldi_w(a: v4i32, b: v4i32, c: i32) -> v4i32;
+ #[link_name = "llvm.mips.sldi.d"]
+ fn msa_sldi_d(a: v2i64, b: v2i64, c: i32) -> v2i64;
+ #[link_name = "llvm.mips.sll.b"]
+ fn msa_sll_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.sll.h"]
+ fn msa_sll_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.sll.w"]
+ fn msa_sll_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.sll.d"]
+ fn msa_sll_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.slli.b"]
+ fn msa_slli_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.slli.h"]
+ fn msa_slli_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.slli.w"]
+ fn msa_slli_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.slli.d"]
+ fn msa_slli_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.splat.b"]
+ fn msa_splat_b(a: v16i8, c: i32) -> v16i8;
+ #[link_name = "llvm.mips.splat.h"]
+ fn msa_splat_h(a: v8i16, c: i32) -> v8i16;
+ #[link_name = "llvm.mips.splat.w"]
+ fn msa_splat_w(a: v4i32, c: i32) -> v4i32;
+ #[link_name = "llvm.mips.splat.d"]
+ fn msa_splat_d(a: v2i64, c: i32) -> v2i64;
+ #[link_name = "llvm.mips.splati.b"]
+ fn msa_splati_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.splati.h"]
+ fn msa_splati_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.splati.w"]
+ fn msa_splati_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.splati.d"]
+ fn msa_splati_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.sra.b"]
+ fn msa_sra_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.sra.h"]
+ fn msa_sra_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.sra.w"]
+ fn msa_sra_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.sra.d"]
+ fn msa_sra_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.srai.b"]
+ fn msa_srai_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.srai.h"]
+ fn msa_srai_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.srai.w"]
+ fn msa_srai_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.srai.d"]
+ fn msa_srai_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.srar.b"]
+ fn msa_srar_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.srar.h"]
+ fn msa_srar_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.srar.w"]
+ fn msa_srar_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.srar.d"]
+ fn msa_srar_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.srari.b"]
+ fn msa_srari_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.srari.h"]
+ fn msa_srari_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.srari.w"]
+ fn msa_srari_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.srari.d"]
+ fn msa_srari_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.srl.b"]
+ fn msa_srl_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.srl.h"]
+ fn msa_srl_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.srl.w"]
+ fn msa_srl_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.srl.d"]
+ fn msa_srl_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.srli.b"]
+ fn msa_srli_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.srli.h"]
+ fn msa_srli_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.srli.w"]
+ fn msa_srli_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.srli.d"]
+ fn msa_srli_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.srlr.b"]
+ fn msa_srlr_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.srlr.h"]
+ fn msa_srlr_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.srlr.w"]
+ fn msa_srlr_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.srlr.d"]
+ fn msa_srlr_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.srlri.b"]
+ fn msa_srlri_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.srlri.h"]
+ fn msa_srlri_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.srlri.w"]
+ fn msa_srlri_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.srlri.d"]
+ fn msa_srlri_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.st.b"]
+ fn msa_st_b(a: v16i8, mem_addr: *mut u8, imm_s10: i32) -> ();
+ #[link_name = "llvm.mips.st.h"]
+ fn msa_st_h(a: v8i16, mem_addr: *mut u8, imm_s11: i32) -> ();
+ #[link_name = "llvm.mips.st.w"]
+ fn msa_st_w(a: v4i32, mem_addr: *mut u8, imm_s12: i32) -> ();
+ #[link_name = "llvm.mips.st.d"]
+ fn msa_st_d(a: v2i64, mem_addr: *mut u8, imm_s13: i32) -> ();
+ #[link_name = "llvm.mips.subs.s.b"]
+ fn msa_subs_s_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.subs.s.h"]
+ fn msa_subs_s_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.subs.s.w"]
+ fn msa_subs_s_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.subs.s.d"]
+ fn msa_subs_s_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.subs.u.b"]
+ fn msa_subs_u_b(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.subs.u.h"]
+ fn msa_subs_u_h(a: v8u16, b: v8u16) -> v8u16;
+ #[link_name = "llvm.mips.subs.u.w"]
+ fn msa_subs_u_w(a: v4u32, b: v4u32) -> v4u32;
+ #[link_name = "llvm.mips.subs.u.d"]
+ fn msa_subs_u_d(a: v2u64, b: v2u64) -> v2u64;
+ #[link_name = "llvm.mips.subsus.u.b"]
+ fn msa_subsus_u_b(a: v16u8, b: v16i8) -> v16u8;
+ #[link_name = "llvm.mips.subsus.u.h"]
+ fn msa_subsus_u_h(a: v8u16, b: v8i16) -> v8u16;
+ #[link_name = "llvm.mips.subsus.u.w"]
+ fn msa_subsus_u_w(a: v4u32, b: v4i32) -> v4u32;
+ #[link_name = "llvm.mips.subsus.u.d"]
+ fn msa_subsus_u_d(a: v2u64, b: v2i64) -> v2u64;
+ #[link_name = "llvm.mips.subsuu.s.b"]
+ fn msa_subsuu_s_b(a: v16u8, b: v16u8) -> v16i8;
+ #[link_name = "llvm.mips.subsuu.s.h"]
+ fn msa_subsuu_s_h(a: v8u16, b: v8u16) -> v8i16;
+ #[link_name = "llvm.mips.subsuu.s.w"]
+ fn msa_subsuu_s_w(a: v4u32, b: v4u32) -> v4i32;
+ #[link_name = "llvm.mips.subsuu.s.d"]
+ fn msa_subsuu_s_d(a: v2u64, b: v2u64) -> v2i64;
+ #[link_name = "llvm.mips.subv.b"]
+ fn msa_subv_b(a: v16i8, b: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.subv.h"]
+ fn msa_subv_h(a: v8i16, b: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.subv.w"]
+ fn msa_subv_w(a: v4i32, b: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.subv.d"]
+ fn msa_subv_d(a: v2i64, b: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.subvi.b"]
+ fn msa_subvi_b(a: v16i8, b: i32) -> v16i8;
+ #[link_name = "llvm.mips.subvi.h"]
+ fn msa_subvi_h(a: v8i16, b: i32) -> v8i16;
+ #[link_name = "llvm.mips.subvi.w"]
+ fn msa_subvi_w(a: v4i32, b: i32) -> v4i32;
+ #[link_name = "llvm.mips.subvi.d"]
+ fn msa_subvi_d(a: v2i64, b: i32) -> v2i64;
+ #[link_name = "llvm.mips.vshf.b"]
+ fn msa_vshf_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8;
+ #[link_name = "llvm.mips.vshf.h"]
+ fn msa_vshf_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16;
+ #[link_name = "llvm.mips.vshf.w"]
+ fn msa_vshf_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32;
+ #[link_name = "llvm.mips.vshf.d"]
+ fn msa_vshf_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64;
+ #[link_name = "llvm.mips.xor.v"]
+ fn msa_xor_v(a: v16u8, b: v16u8) -> v16u8;
+ #[link_name = "llvm.mips.xori.b"]
+ fn msa_xori_b(a: v16u8, b: i32) -> v16u8;
+}
+
+/// Vector Add Absolute Values
+///
+/// The absolute values of the elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are added to the absolute values of the elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The result is written to the destination vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(add_a.b))]
+pub unsafe fn __msa_add_a_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_add_a_b(a, mem::transmute(b))
+}
+
+/// Vector Add Absolute Values
+///
+/// The absolute values of the elements in vector `a` (eight signed 16-bit integer numbers)
+/// are added to the absolute values of the elements in vector `b` (eight signed 16-bit integer numbers).
+/// The result is written to the destination vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(add_a.h))]
+pub unsafe fn __msa_add_a_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_add_a_h(a, mem::transmute(b))
+}
+
+/// Vector Add Absolute Values
+///
+/// The absolute values of the elements in vector `a` (four signed 32-bit integer numbers)
+/// are added to the absolute values of the elements in vector `b` (four signed 32-bit integer numbers).
+/// The result is written to the destination vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(add_a.w))]
+pub unsafe fn __msa_add_a_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_add_a_w(a, mem::transmute(b))
+}
+
+/// Vector Add Absolute Values
+///
+/// The absolute values of the elements in vector `a` (two signed 64-bit integer numbers)
+/// are added to the absolute values of the elements in vector `b` (two signed 64-bit integer numbers).
+/// The result is written to the destination vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(add_a.d))]
+pub unsafe fn __msa_add_a_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_add_a_d(a, mem::transmute(b))
+}
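+
+// A plain-Rust sketch of the lane-wise `add_a` semantics described above
+// (`add_a_lane` is an illustrative helper, not part of this module): the
+// two's-complement absolute values are added with modulo wrap-around,
+// mirroring the unsaturated behaviour of the instruction.
+//
+//     fn add_a_lane(x: i8, y: i8) -> i8 {
+//         x.wrapping_abs().wrapping_add(y.wrapping_abs())
+//     }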
+
+/// Vector Saturated Add of Absolute Values
+///
+/// The absolute values of the elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are added to the absolute values of the elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The saturated signed result is written to the destination vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_a.b))]
+pub unsafe fn __msa_adds_a_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_adds_a_b(a, mem::transmute(b))
+}
+
+/// Vector Saturated Add of Absolute Values
+///
+/// The absolute values of the elements in vector `a` (eight signed 16-bit integer numbers)
+/// are added to the absolute values of the elements in vector `b` (eight signed 16-bit integer numbers).
+/// The saturated signed result is written to the destination vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_a.h))]
+pub unsafe fn __msa_adds_a_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_adds_a_h(a, mem::transmute(b))
+}
+
+/// Vector Saturated Add of Absolute Values
+///
+/// The absolute values of the elements in vector `a` (four signed 32-bit integer numbers)
+/// are added to the absolute values of the elements in vector `b` (four signed 32-bit integer numbers).
+/// The saturated signed result is written to the destination vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_a.w))]
+pub unsafe fn __msa_adds_a_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_adds_a_w(a, mem::transmute(b))
+}
+
+/// Vector Saturated Add of Absolute Values
+///
+/// The absolute values of the elements in vector `a` (two signed 64-bit integer numbers)
+/// are added to the absolute values of the elements in vector `b` (two signed 64-bit integer numbers).
+/// The saturated signed result is written to the destination vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_a.d))]
+pub unsafe fn __msa_adds_a_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_adds_a_d(a, mem::transmute(b))
+}
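+
+// Sketch of the saturating `adds_a` lanes above (illustrative helper only):
+// the true absolute values are summed at wider precision, then clamped to
+// the signed lane range.
+//
+//     fn adds_a_lane(x: i8, y: i8) -> i8 {
+//         let sum = (x as i16).abs() + (y as i16).abs(); // at most 256
+//         sum.min(i8::MAX as i16) as i8
+//     }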
+
+/// Vector Signed Saturated Add of Signed Values
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// Signed arithmetic is performed and overflows clamp to the largest or smallest
+/// representable signed value before writing the result to the destination vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_s.b))]
+pub unsafe fn __msa_adds_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_adds_s_b(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Add of Signed Values
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are added to the elements in vector `b` (eight signed 16-bit integer numbers).
+/// Signed arithmetic is performed and overflows clamp to the largest or smallest
+/// representable signed value before writing the result to the destination vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_s.h))]
+pub unsafe fn __msa_adds_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_adds_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Add of Signed Values
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are added to the elements in vector `b` (four signed 32-bit integer numbers).
+/// Signed arithmetic is performed and overflows clamp to the largest or smallest
+/// representable signed value before writing the result to the destination vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_s.w))]
+pub unsafe fn __msa_adds_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_adds_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Add of Signed Values
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are added to the elements in vector `b` (two signed 64-bit integer numbers).
+/// Signed arithmetic is performed and overflows clamp to the largest or smallest
+/// representable signed value before writing the result to the destination vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_s.d))]
+pub unsafe fn __msa_adds_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_adds_s_d(a, mem::transmute(b))
+}
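+
+// The `adds_s` lanes behave like Rust's saturating signed addition; a
+// one-line scalar model (illustrative helper, not part of the API):
+//
+//     fn adds_s_lane(x: i8, y: i8) -> i8 { x.saturating_add(y) }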
+
+/// Vector Unsigned Saturated Add of Unsigned Values
+///
+/// The elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen unsigned 8-bit integer numbers).
+/// Unsigned arithmetic is performed and overflows clamp to the largest
+/// representable unsigned value before writing the result to the destination vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_u.b))]
+pub unsafe fn __msa_adds_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_adds_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Add of Unsigned Values
+///
+/// The elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are added to the elements in vector `b` (eight unsigned 16-bit integer numbers).
+/// Unsigned arithmetic is performed and overflows clamp to the largest
+/// representable unsigned value before writing the result to the destination vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_u.h))]
+pub unsafe fn __msa_adds_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_adds_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Add of Unsigned Values
+///
+/// The elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are added to the elements in vector `b` (four unsigned 32-bit integer numbers).
+/// Unsigned arithmetic is performed and overflows clamp to the largest
+/// representable unsigned value before writing the result to the destination vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_u.w))]
+pub unsafe fn __msa_adds_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_adds_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Add of Unsigned Values
+///
+/// The elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are added to the elements in vector `b` (two unsigned 64-bit integer numbers).
+/// Unsigned arithmetic is performed and overflows clamp to the largest
+/// representable unsigned value before writing the result to the destination vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(adds_u.d))]
+pub unsafe fn __msa_adds_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_adds_u_d(a, mem::transmute(b))
+}
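+
+// Likewise, each `adds_u` lane is modelled by unsigned saturating addition
+// (illustrative helper only):
+//
+//     fn adds_u_lane(x: u8, y: u8) -> u8 { x.saturating_add(y) }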
+
+/// Vector Add
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The result is written to the destination vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(addv.b))]
+pub unsafe fn __msa_addv_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_addv_b(a, mem::transmute(b))
+}
+
+/// Vector Add
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are added to the elements in vector `b` (eight signed 16-bit integer numbers).
+/// The result is written to the destination vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(addv.h))]
+pub unsafe fn __msa_addv_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_addv_h(a, mem::transmute(b))
+}
+
+/// Vector Add
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are added to the elements in vector `b` (four signed 32-bit integer numbers).
+/// The result is written to the destination vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(addv.w))]
+pub unsafe fn __msa_addv_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_addv_w(a, mem::transmute(b))
+}
+
+/// Vector Add
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are added to the elements in vector `b` (two signed 64-bit integer numbers).
+/// The result is written to the destination vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(addv.d))]
+pub unsafe fn __msa_addv_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_addv_d(a, mem::transmute(b))
+}
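+
+// The unsaturated `addv` lanes wrap modulo 2^n, i.e. they match Rust's
+// wrapping addition (illustrative helper only):
+//
+//     fn addv_lane(x: i8, y: i8) -> i8 { x.wrapping_add(y) }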
+
+/// Immediate Add
+///
+/// The 5-bit unsigned immediate `imm5` is added to the elements
+/// in vector `a` (sixteen signed 8-bit integer numbers).
+/// The result is written to the destination vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(addvi.b, imm5 = 0b10111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_addvi_b<const IMM5: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm5!(IMM5);
+ msa_addvi_b(a, IMM5)
+}
+
+/// Immediate Add
+///
+/// The 5-bit unsigned immediate `imm5` is added to the elements
+/// in vector `a` (eight signed 16-bit integer numbers).
+/// The result is written to the destination vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(addvi.h, imm5 = 0b10111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_addvi_h<const IMM5: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm5!(IMM5);
+ msa_addvi_h(a, IMM5)
+}
+
+/// Immediate Add
+///
+/// The 5-bit unsigned immediate `imm5` is added to the elements
+/// in vector `a` (four signed 32-bit integer numbers).
+/// The result is written to the destination vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(addvi.w, imm5 = 0b10111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_addvi_w<const IMM5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm5!(IMM5);
+ msa_addvi_w(a, IMM5)
+}
+
+/// Immediate Add
+///
+/// The 5-bit unsigned immediate `imm5` is added to the elements
+/// in vector `a` (two signed 64-bit integer numbers).
+/// The result is written to the destination vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(addvi.d, imm5 = 0b10111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_addvi_d<const IMM5: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm5!(IMM5);
+ msa_addvi_d(a, IMM5)
+}
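+
+// The immediate forms take the constant as a const generic, so the value
+// must be known at compile time and fit in 5 bits (0..=31), which
+// `static_assert_imm5!` enforces. A hypothetical call site, assuming `a`
+// is a `v16i8` and the code runs in an `unsafe` block with the `msa`
+// target feature enabled:
+//
+//     let sum = __msa_addvi_b::<3>(a); // adds 3 to each of the 16 lanes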
+
+/// Vector Logical And
+///
+/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers)
+/// is combined with the corresponding bit of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// in a bitwise logical AND operation.
+/// The result is written to the destination vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(and.v))]
+pub unsafe fn __msa_and_v(a: v16u8, b: v16u8) -> v16u8 {
+ msa_and_v(a, mem::transmute(b))
+}
+
+/// Immediate Logical And
+///
+/// Each byte element of vector `a` (sixteen unsigned 8-bit integer numbers)
+/// is combined with the 8-bit immediate `imm8` (an unsigned 8-bit value) in a bitwise logical AND operation.
+/// The result is written to the destination vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(andi.b, imm8 = 0b10010111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_andi_b<const IMM8: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm8!(IMM8);
+ msa_andi_b(a, IMM8)
+}
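+
+// Both logical forms reduce to a per-byte bitwise AND; scalar models
+// (illustrative helpers only):
+//
+//     fn and_lane(x: u8, y: u8) -> u8 { x & y }           // and.v
+//     fn andi_lane(x: u8, imm8: u8) -> u8 { x & imm8 }    // andi.b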
+
+/// Vector Absolute Values of Signed Subtract
+///
+/// The signed elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are subtracted from the signed elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The absolute value of the signed result is written to the destination vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(asub_s.b))]
+pub unsafe fn __msa_asub_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_asub_s_b(a, mem::transmute(b))
+}
+
+/// Vector Absolute Values of Signed Subtract
+///
+/// The signed elements in vector `a` (eight signed 16-bit integer numbers)
+/// are subtracted from the signed elements in vector `b` (eight signed 16-bit integer numbers).
+/// The absolute value of the signed result is written to the destination vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(asub_s.h))]
+pub unsafe fn __msa_asub_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_asub_s_h(a, mem::transmute(b))
+}
+
+/// Vector Absolute Values of Signed Subtract
+///
+/// The signed elements in vector `a` (four signed 32-bit integer numbers)
+/// are subtracted from the signed elements in vector `b` (four signed 32-bit integer numbers).
+/// The absolute value of the signed result is written to the destination vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(asub_s.w))]
+pub unsafe fn __msa_asub_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_asub_s_w(a, mem::transmute(b))
+}
+
+/// Vector Absolute Values of Signed Subtract
+///
+/// The signed elements in vector `a` (two signed 64-bit integer numbers)
+/// are subtracted from the signed elements in vector `b` (two signed 64-bit integer numbers).
+/// The absolute value of the signed result is written to the destination vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(asub_s.d))]
+pub unsafe fn __msa_asub_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_asub_s_d(a, mem::transmute(b))
+}
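+
+// Scalar model of the `asub_s` lanes above (illustrative helper only): the
+// difference is taken at full precision and its absolute value is truncated
+// back to the lane width, matching the hardware's modulo result.
+//
+//     fn asub_s_lane(x: i8, y: i8) -> i8 {
+//         ((x as i16) - (y as i16)).abs() as i8
+//     }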
+
+/// Vector Absolute Values of Unsigned Subtract
+///
+/// The unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `b` (sixteen unsigned 8-bit integer numbers).
+/// The absolute value of the unsigned result is written to the destination vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(asub_u.b))]
+pub unsafe fn __msa_asub_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_asub_u_b(a, mem::transmute(b))
+}
+
+/// Vector Absolute Values of Unsigned Subtract
+///
+/// The unsigned elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `b` (eight unsigned 16-bit integer numbers).
+/// The absolute value of the unsigned result is written to the destination vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(asub_u.h))]
+pub unsafe fn __msa_asub_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_asub_u_h(a, mem::transmute(b))
+}
+
+/// Vector Absolute Values of Unsigned Subtract
+///
+/// The unsigned elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `b` (four unsigned 32-bit integer numbers).
+/// The absolute value of the unsigned result is written to the destination vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(asub_u.w))]
+pub unsafe fn __msa_asub_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_asub_u_w(a, mem::transmute(b))
+}
+
+/// Vector Absolute Values of Unsigned Subtract
+///
+/// The unsigned elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `b` (two unsigned 64-bit integer numbers).
+/// The absolute value of the unsigned result is written to the destination vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(asub_u.d))]
+pub unsafe fn __msa_asub_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_asub_u_d(a, mem::transmute(b))
+}
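+
+// The unsigned variant is the absolute difference, i.e. Rust's `abs_diff`
+// (illustrative helper only):
+//
+//     fn asub_u_lane(x: u8, y: u8) -> u8 { x.abs_diff(y) }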
+
+/// Vector Signed Average
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The addition is performed with full signed precision, i.e. the intermediate result has one extra bit.
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before
+/// writing the result to the destination vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ave_s.b))]
+pub unsafe fn __msa_ave_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_ave_s_b(a, mem::transmute(b))
+}
+
+/// Vector Signed Average
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are added to the elements in vector `b` (eight signed 16-bit integer numbers).
+/// The addition is performed with full signed precision, i.e. the intermediate result has one extra bit.
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before
+/// writing the result to the destination vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ave_s.h))]
+pub unsafe fn __msa_ave_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_ave_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Average
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are added to the elements in vector `b` (four signed 32-bit integer numbers).
+/// The addition is performed with full signed precision, i.e. the intermediate result has one extra bit.
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before
+/// writing the result to the destination vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ave_s.w))]
+pub unsafe fn __msa_ave_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_ave_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Average
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are added to the elements in vector `b` (two signed 64-bit integer numbers).
+/// The addition is performed with full signed precision, i.e. the intermediate result has one extra bit.
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before
+/// writing the result to the destination vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ave_s.d))]
+pub unsafe fn __msa_ave_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_ave_s_d(a, mem::transmute(b))
+}
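+
+// Scalar model of the signed average lanes above (illustrative helper
+// only): widening to 16 bits supplies the extra precision bit, and the
+// arithmetic right shift performs the signed division by 2.
+//
+//     fn ave_s_lane(x: i8, y: i8) -> i8 {
+//         ((x as i16 + y as i16) >> 1) as i8
+//     }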
+
+/// Vector Unsigned Average
+///
+/// The elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen unsigned 8-bit integer numbers).
+/// The addition is performed with full unsigned precision, i.e. the intermediate result has one extra bit.
+/// Unsigned division by 2 (or logical shift right by one bit) is performed before
+/// writing the result to the destination vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ave_u.b))]
+pub unsafe fn __msa_ave_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_ave_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Average
+///
+/// The elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are added to the elements in vector `b` (eight unsigned 16-bit integer numbers).
+/// The addition is performed with full unsigned precision, i.e. the intermediate result has one extra bit.
+/// Unsigned division by 2 (or logical shift right by one bit) is performed before
+/// writing the result to the destination vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ave_u.h))]
+pub unsafe fn __msa_ave_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_ave_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Average
+///
+/// The elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are added to the elements in vector `b` (four unsigned 32-bit integer numbers).
+/// The addition is done unsigned with full precision, i.e. the result has one extra bit.
+/// Unsigned division by 2 (or logical shift right by one bit) is performed before
+/// writing the result to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ave_u.w))]
+pub unsafe fn __msa_ave_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_ave_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Average
+///
+/// The elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are added to the elements in vector `b` (two unsigned 64-bit integer numbers).
+/// The addition is done unsigned with full precision, i.e. the result has one extra bit.
+/// Unsigned division by 2 (or logical shift right by one bit) is performed before
+/// writing the result to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ave_u.d))]
+pub unsafe fn __msa_ave_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_ave_u_d(a, mem::transmute(b))
+}
+
+/// Vector Signed Average Rounded
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The addition of the elements plus 1 (for rounding) is done signed with full precision,
+/// i.e. the result has one extra bit.
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before
+/// writing the result to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(aver_s.b))]
+pub unsafe fn __msa_aver_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_aver_s_b(a, mem::transmute(b))
+}
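+
+// Sketch contrasting the rounded form with `__msa_ave_s_b` above (illustrative
+// only; `demo_aver` is a hypothetical helper):
+//
+//     unsafe fn demo_aver(a: v16i8, b: v16i8) -> (v16i8, v16i8) {
+//         // for lanes holding -3 and 4: ave_s gives 1 >> 1 == 0,
+//         // while aver_s adds 1 first and gives (1 + 1) >> 1 == 1
+//         (__msa_ave_s_b(a, b), __msa_aver_s_b(a, b))
+//     }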
+
+/// Vector Signed Average Rounded
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are added to the elements in vector `b` (eight signed 16-bit integer numbers).
+/// The addition of the elements plus 1 (for rounding) is done signed with full precision,
+/// i.e. the result has one extra bit.
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before
+/// writing the result to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(aver_s.h))]
+pub unsafe fn __msa_aver_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_aver_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Average Rounded
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are added to the elements in vector `b` (four signed 32-bit integer numbers).
+/// The addition of the elements plus 1 (for rounding) is done signed with full precision,
+/// i.e. the result has one extra bit.
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before
+/// writing the result to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(aver_s.w))]
+pub unsafe fn __msa_aver_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_aver_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Average Rounded
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are added to the elements in vector `b` (two signed 64-bit integer numbers).
+/// The addition of the elements plus 1 (for rounding) is done signed with full precision,
+/// i.e. the result has one extra bit.
+/// Signed division by 2 (or arithmetic shift right by one bit) is performed before
+/// writing the result to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(aver_s.d))]
+pub unsafe fn __msa_aver_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_aver_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Average Rounded
+///
+/// The elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are added to the elements in vector `b` (sixteen unsigned 8-bit integer numbers).
+/// The addition of the elements plus 1 (for rounding) is done unsigned with full precision,
+/// i.e. the result has one extra bit.
+/// Unsigned division by 2 (or logical shift right by one bit) is performed before
+/// writing the result to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(aver_u.b))]
+pub unsafe fn __msa_aver_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_aver_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Average Rounded
+///
+/// The elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are added to the elements in vector `b` (eight unsigned 16-bit integer numbers).
+/// The addition of the elements plus 1 (for rounding) is done unsigned with full precision,
+/// i.e. the result has one extra bit.
+/// Unsigned division by 2 (or logical shift right by one bit) is performed before
+/// writing the result to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(aver_u.h))]
+pub unsafe fn __msa_aver_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_aver_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Average Rounded
+///
+/// The elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are added to the elements in vector `b` (four unsigned 32-bit integer numbers).
+/// The addition of the elements plus 1 (for rounding) is done unsigned with full precision,
+/// i.e. the result has one extra bit.
+/// Unsigned division by 2 (or logical shift right by one bit) is performed before
+/// writing the result to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(aver_u.w))]
+pub unsafe fn __msa_aver_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_aver_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Average Rounded
+///
+/// The elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are added to the elements in vector `b` (two unsigned 64-bit integer numbers).
+/// The addition of the elements plus 1 (for rounding) is done unsigned with full precision,
+/// i.e. the result has one extra bit.
+/// Unsigned division by 2 (or logical shift right by one bit) is performed before
+/// writing the result to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(aver_u.d))]
+pub unsafe fn __msa_aver_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_aver_u_d(a, mem::transmute(b))
+}
+
+/// Vector Bit Clear
+///
+/// Clear (set to 0) one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers).
+/// The bit position is given by the elements in `b` (sixteen unsigned 8-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bclr.b))]
+pub unsafe fn __msa_bclr_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_bclr_b(a, mem::transmute(b))
+}
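+
+// Equivalent single-lane picture (a sketch inferred from the description above,
+// not a quote of the hardware specification):
+//
+//     fn bclr_lane(a: u8, b: u8) -> u8 {
+//         // same result `__msa_bclr_b` produces in each 8-bit lane
+//         a & !(1u8 << (b % 8))
+//     }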
+
+/// Vector Bit Clear
+///
+/// Clear (set to 0) one bit in each element of vector `a` (eight unsigned 16-bit integer numbers).
+/// The bit position is given by the elements in `b` (eight unsigned 16-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bclr.h))]
+pub unsafe fn __msa_bclr_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_bclr_h(a, mem::transmute(b))
+}
+
+/// Vector Bit Clear
+///
+/// Clear (set to 0) one bit in each element of vector `a` (four unsigned 32-bit integer numbers).
+/// The bit position is given by the elements in `b` (four unsigned 32-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bclr.w))]
+pub unsafe fn __msa_bclr_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_bclr_w(a, mem::transmute(b))
+}
+
+/// Vector Bit Clear
+///
+/// Clear (set to 0) one bit in each element of vector `a` (two unsigned 64-bit integer numbers).
+/// The bit position is given by the elements in `b` (two unsigned 64-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bclr.d))]
+pub unsafe fn __msa_bclr_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_bclr_d(a, mem::transmute(b))
+}
+
+/// Immediate Bit Clear
+///
+/// Clear (set to 0) one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers).
+/// The bit position is given by the immediate `imm3` modulo the size of the element in bits.
+/// The result is written to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bclri.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bclri_b<const IMM3: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm3!(IMM3);
+ msa_bclri_b(a, IMM3)
+}
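+
+// Call-site sketch (hypothetical helper; the immediate is a const generic, so
+// the bit position must be known at compile time):
+//
+//     unsafe fn clear_bit3(a: v16u8) -> v16u8 {
+//         // clears bit 3 in every lane, e.g. 0b0000_1111 -> 0b0000_0111
+//         __msa_bclri_b::<3>(a)
+//     }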
+
+/// Immediate Bit Clear
+///
+/// Clear (set to 0) one bit in each element of vector `a` (eight unsigned 16-bit integer numbers).
+/// The bit position is given by the immediate `imm4` modulo the size of the element in bits.
+/// The result is written to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bclri.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bclri_h<const IMM4: i32>(a: v8u16) -> v8u16 {
+ static_assert_imm4!(IMM4);
+ msa_bclri_h(a, IMM4)
+}
+
+/// Immediate Bit Clear
+///
+/// Clear (set to 0) one bit in each element of vector `a` (four unsigned 32-bit integer numbers).
+/// The bit position is given by the immediate `imm5` modulo the size of the element in bits.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bclri.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bclri_w<const IMM5: i32>(a: v4u32) -> v4u32 {
+ static_assert_imm5!(IMM5);
+ msa_bclri_w(a, IMM5)
+}
+
+/// Immediate Bit Clear
+///
+/// Clear (set to 0) one bit in each element of vector `a` (two unsigned 64-bit integer numbers).
+/// The bit position is given by the immediate `imm6` modulo the size of the element in bits.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bclri.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bclri_d<const IMM6: i32>(a: v2u64) -> v2u64 {
+ static_assert_imm6!(IMM6);
+ msa_bclri_d(a, IMM6)
+}
+
+/// Vector Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// to elements in vector `a` (sixteen unsigned 8-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the elements in vector `c` (sixteen unsigned 8-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsl.b))]
+pub unsafe fn __msa_binsl_b(a: v16u8, b: v16u8, c: v16u8) -> v16u8 {
+ msa_binsl_b(a, mem::transmute(b), c)
+}
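+
+// Single-lane sketch of the insert-left behaviour (inferred from the
+// description above; `binsl_lane` is a hypothetical scalar model):
+//
+//     fn binsl_lane(a: u8, b: u8, c: u8) -> u8 {
+//         let n = (c % 8) + 1;        // bits to copy, 1..=8
+//         let mask = !0u8 << (8 - n); // top-n-bits mask
+//         (b & mask) | (a & !mask)
+//     }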
+
+/// Vector Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (eight unsigned 16-bit integer numbers)
+/// to elements in vector `a` (eight unsigned 16-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the elements in vector `c` (eight unsigned 16-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsl.h))]
+pub unsafe fn __msa_binsl_h(a: v8u16, b: v8u16, c: v8u16) -> v8u16 {
+ msa_binsl_h(a, mem::transmute(b), c)
+}
+
+/// Vector Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (four unsigned 32-bit integer numbers)
+/// to elements in vector `a` (four unsigned 32-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the elements in vector `c` (four unsigned 32-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsl.w))]
+pub unsafe fn __msa_binsl_w(a: v4u32, b: v4u32, c: v4u32) -> v4u32 {
+ msa_binsl_w(a, mem::transmute(b), c)
+}
+
+/// Vector Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (two unsigned 64-bit integer numbers)
+/// to elements in vector `a` (two unsigned 64-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the elements in vector `c` (two unsigned 64-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsl.d))]
+pub unsafe fn __msa_binsl_d(a: v2u64, b: v2u64, c: v2u64) -> v2u64 {
+ msa_binsl_d(a, mem::transmute(b), c)
+}
+
+/// Immediate Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// to elements in vector `a` (sixteen unsigned 8-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the immediate `imm3` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsli.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_binsli_b<const IMM3: i32>(a: v16u8, b: v16u8) -> v16u8 {
+ static_assert_imm3!(IMM3);
+ msa_binsli_b(a, mem::transmute(b), IMM3)
+}
+
+/// Immediate Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (eight unsigned 16-bit integer numbers)
+/// to elements in vector `a` (eight unsigned 16-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the immediate `imm4` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsli.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_binsli_h<const IMM4: i32>(a: v8u16, b: v8u16) -> v8u16 {
+ static_assert_imm4!(IMM4);
+ msa_binsli_h(a, mem::transmute(b), IMM4)
+}
+
+/// Immediate Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (four unsigned 32-bit integer numbers)
+/// to elements in vector `a` (four unsigned 32-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the immediate `imm5` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsli.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_binsli_w<const IMM5: i32>(a: v4u32, b: v4u32) -> v4u32 {
+ static_assert_imm5!(IMM5);
+ msa_binsli_w(a, mem::transmute(b), IMM5)
+}
+
+/// Immediate Bit Insert Left
+///
+/// Copy most significant (left) bits in each element of vector `b` (two unsigned 64-bit integer numbers)
+/// to elements in vector `a` (two unsigned 64-bit integer numbers) while preserving the least significant (right) bits.
+/// The number of bits to copy is given by the immediate `imm6` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsli.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_binsli_d<const IMM6: i32>(a: v2u64, b: v2u64) -> v2u64 {
+ static_assert_imm6!(IMM6);
+ msa_binsli_d(a, mem::transmute(b), IMM6)
+}
+
+/// Vector Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// to elements in vector `a` (sixteen unsigned 8-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the elements in vector `c` (sixteen unsigned 8-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsr.b))]
+pub unsafe fn __msa_binsr_b(a: v16u8, b: v16u8, c: v16u8) -> v16u8 {
+ msa_binsr_b(a, mem::transmute(b), c)
+}
+
+/// Vector Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (eight unsigned 16-bit integer numbers)
+/// to elements in vector `a` (eight unsigned 16-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the elements in vector `c` (eight unsigned 16-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsr.h))]
+pub unsafe fn __msa_binsr_h(a: v8u16, b: v8u16, c: v8u16) -> v8u16 {
+ msa_binsr_h(a, mem::transmute(b), c)
+}
+
+/// Vector Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (four unsigned 32-bit integer numbers)
+/// to elements in vector `a` (four unsigned 32-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the elements in vector `c` (four unsigned 32-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsr.w))]
+pub unsafe fn __msa_binsr_w(a: v4u32, b: v4u32, c: v4u32) -> v4u32 {
+ msa_binsr_w(a, mem::transmute(b), c)
+}
+
+/// Vector Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (two unsigned 64-bit integer numbers)
+/// to elements in vector `a` (two unsigned 64-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the elements in vector `c` (two unsigned 64-bit integer numbers)
+/// modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsr.d))]
+pub unsafe fn __msa_binsr_d(a: v2u64, b: v2u64, c: v2u64) -> v2u64 {
+ msa_binsr_d(a, mem::transmute(b), c)
+}
+
+/// Immediate Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// to elements in vector `a` (sixteen unsigned 8-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the immediate `imm3` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsri.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_binsri_b<const IMM3: i32>(a: v16u8, b: v16u8) -> v16u8 {
+ static_assert_imm3!(IMM3);
+ msa_binsri_b(a, mem::transmute(b), IMM3)
+}
+
+/// Immediate Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (eight unsigned 16-bit integer numbers)
+/// to elements in vector `a` (eight unsigned 16-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the immediate `imm4` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsri.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_binsri_h<const IMM4: i32>(a: v8u16, b: v8u16) -> v8u16 {
+ static_assert_imm4!(IMM4);
+ msa_binsri_h(a, mem::transmute(b), IMM4)
+}
+
+/// Immediate Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (four unsigned 32-bit integer numbers)
+/// to elements in vector `a` (four unsigned 32-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the immediate `imm5` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsri.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_binsri_w<const IMM5: i32>(a: v4u32, b: v4u32) -> v4u32 {
+ static_assert_imm5!(IMM5);
+ msa_binsri_w(a, mem::transmute(b), IMM5)
+}
+
+/// Immediate Bit Insert Right
+///
+/// Copy least significant (right) bits in each element of vector `b` (two unsigned 64-bit integer numbers)
+/// to elements in vector `a` (two unsigned 64-bit integer numbers) while preserving the most significant (left) bits.
+/// The number of bits to copy is given by the immediate `imm6` modulo the size of the element in bits plus 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(binsri.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_binsri_d<const IMM6: i32>(a: v2u64, b: v2u64) -> v2u64 {
+ static_assert_imm6!(IMM6);
+ msa_binsri_d(a, mem::transmute(b), IMM6)
+}
+
+/// Vector Bit Move If Not Zero
+///
+/// Copy to destination vector `a` (sixteen unsigned 8-bit integer numbers) all bits from source vector
+/// `b` (sixteen unsigned 8-bit integer numbers) for which the corresponding bits from target vector `c`
+/// (sixteen unsigned 8-bit integer numbers) are 1, and leave unchanged all destination bits
+/// for which the corresponding target bits are 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bmnz.v))]
+pub unsafe fn __msa_bmnz_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8 {
+ msa_bmnz_v(a, mem::transmute(b), c)
+}
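+
+// Bitwise identity sketch (an interpretation of the description above, not a
+// quote of the specification): bmnz is a mask-controlled merge:
+//
+//     fn bmnz_bits(a: u8, b: u8, c: u8) -> u8 {
+//         // take `b` where mask `c` is 1, keep `a` where `c` is 0
+//         (b & c) | (a & !c)
+//     }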
+
+/// Immediate Bit Move If Not Zero
+///
+/// Copy to destination vector `a` (sixteen unsigned 8-bit integer numbers) all bits from source vector
+/// `b` (sixteen unsigned 8-bit integer numbers) for which the corresponding bits from immediate `imm8`
+/// are 1, and leave unchanged all destination bits for which the corresponding immediate bits are 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bmnzi.b, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_bmnzi_b<const IMM8: i32>(a: v16u8, b: v16u8) -> v16u8 {
+ static_assert_imm8!(IMM8);
+ msa_bmnzi_b(a, mem::transmute(b), IMM8)
+}
+
+/// Vector Bit Move If Zero
+///
+/// Copy to destination vector `a` (sixteen unsigned 8-bit integer numbers) all bits from source vector
+/// `b` (sixteen unsigned 8-bit integer numbers) for which the corresponding bits from target vector `c`
+/// (sixteen unsigned 8-bit integer numbers) are 0, and leave unchanged all destination bits
+/// for which the corresponding target bits are 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bmz.v))]
+pub unsafe fn __msa_bmz_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8 {
+ msa_bmz_v(a, mem::transmute(b), c)
+}
+
+/// Immediate Bit Move If Zero
+///
+/// Copy to destination vector `a` (sixteen unsigned 8-bit integer numbers) all bits from source vector
+/// `b` (sixteen unsigned 8-bit integer numbers) for which the corresponding bits from immediate `imm8`
+/// are 0, and leave unchanged all destination bits for which the corresponding immediate bits are 1.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bmzi.b, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_bmzi_b<const IMM8: i32>(a: v16u8, b: v16u8) -> v16u8 {
+ static_assert_imm8!(IMM8);
+ msa_bmzi_b(a, mem::transmute(b), IMM8)
+}
+
+/// Vector Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers).
+/// The bit position is given by the elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bneg.b))]
+pub unsafe fn __msa_bneg_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_bneg_b(a, mem::transmute(b))
+}
+
+/// Vector Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (eight unsigned 16-bit integer numbers).
+/// The bit position is given by the elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bneg.h))]
+pub unsafe fn __msa_bneg_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_bneg_h(a, mem::transmute(b))
+}
+
+/// Vector Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (four unsigned 32-bit integer numbers).
+/// The bit position is given by the elements in vector `b` (four unsigned 32-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bneg.w))]
+pub unsafe fn __msa_bneg_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_bneg_w(a, mem::transmute(b))
+}
+
+/// Vector Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (two unsigned 64-bit integer numbers).
+/// The bit position is given by the elements in vector `b` (two unsigned 64-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bneg.d))]
+pub unsafe fn __msa_bneg_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_bneg_d(a, mem::transmute(b))
+}
+
+/// Immediate Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers).
+/// The bit position is given by immediate `imm3` modulo the size of the element in bits.
+/// The result is written to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnegi.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bnegi_b<const IMM3: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm3!(IMM3);
+ msa_bnegi_b(a, IMM3)
+}
+
+/// Immediate Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (eight unsigned 16-bit integer numbers).
+/// The bit position is given by immediate `imm4` modulo the size of the element in bits.
+/// The result is written to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnegi.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bnegi_h<const IMM4: i32>(a: v8u16) -> v8u16 {
+ static_assert_imm4!(IMM4);
+ msa_bnegi_h(a, IMM4)
+}
+
+/// Immediate Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (four unsigned 32-bit integer numbers).
+/// The bit position is given by immediate `imm5` modulo the size of the element in bits.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnegi.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bnegi_w<const IMM5: i32>(a: v4u32) -> v4u32 {
+ static_assert_imm5!(IMM5);
+ msa_bnegi_w(a, IMM5)
+}
+
+/// Immediate Bit Negate
+///
+/// Negate (complement) one bit in each element of vector `a` (two unsigned 64-bit integer numbers).
+/// The bit position is given by immediate `imm6` modulo the size of the element in bits.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnegi.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bnegi_d<const IMM6: i32>(a: v2u64) -> v2u64 {
+ static_assert_imm6!(IMM6);
+ msa_bnegi_d(a, IMM6)
+}
+
+/// Immediate Branch If All Elements Are Not Zero
+///
+/// PC-relative branch if all elements in `a` (sixteen unsigned 8-bit integer numbers) are not zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.b))]
+pub unsafe fn __msa_bnz_b(a: v16u8) -> i32 {
+ msa_bnz_b(a)
+}
+
+/// Immediate Branch If All Elements Are Not Zero
+///
+/// PC-relative branch if all elements in `a` (eight unsigned 16-bit integer numbers) are not zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.h))]
+pub unsafe fn __msa_bnz_h(a: v8u16) -> i32 {
+ msa_bnz_h(a)
+}
+
+/// Immediate Branch If All Elements Are Not Zero
+///
+/// PC-relative branch if all elements in `a` (four unsigned 32-bit integer numbers) are not zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.w))]
+pub unsafe fn __msa_bnz_w(a: v4u32) -> i32 {
+ msa_bnz_w(a)
+}
+
+/// Immediate Branch If All Elements Are Not Zero
+///
+/// PC-relative branch if all elements in `a` (two unsigned 64-bit integer numbers) are not zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.d))]
+pub unsafe fn __msa_bnz_d(a: v2u64) -> i32 {
+ msa_bnz_d(a)
+}
+
+/// Immediate Branch If Not Zero (At Least One Element of Any Format Is Not Zero)
+///
+/// PC-relative branch if at least one bit in `a` (sixteen unsigned 8-bit integer numbers) is not zero,
+/// i.e. at least one element is not zero regardless of the data format.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bnz.v))]
+pub unsafe fn __msa_bnz_v(a: v16u8) -> i32 {
+ msa_bnz_v(a)
+}
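+
+// Usage sketch (hypothetical helper): the branch intrinsics surface as an i32
+// truth value, so they compose with ordinary control flow:
+//
+//     unsafe fn any_bit_set(v: v16u8) -> bool {
+//         // nonzero when at least one bit of the vector is set
+//         __msa_bnz_v(v) != 0
+//     }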
+
+/// Vector Bit Select
+///
+/// Selectively copy bits from the source vectors `b` (sixteen unsigned 8-bit integer numbers)
+/// and `c` (sixteen unsigned 8-bit integer numbers)
+/// into destination vector `a` (sixteen unsigned 8-bit integer numbers) based on the corresponding bit in `a`:
+/// if 0, the bit is copied from `b`; if 1, from `c`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bsel.v))]
+pub unsafe fn __msa_bsel_v(a: v16u8, b: v16u8, c: v16u8) -> v16u8 {
+ msa_bsel_v(a, mem::transmute(b), c)
+}
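+
+// Bitwise identity sketch (an interpretation of the description above):
+//
+//     fn bsel_bits(a: u8, b: u8, c: u8) -> u8 {
+//         // bits of `a` select the source: 0 picks `b`, 1 picks `c`
+//         (b & !a) | (c & a)
+//     }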
+
+/// Immediate Bit Select
+///
+/// Selectively copy bits from vector `b` (sixteen unsigned 8-bit integer numbers) and the 8-bit immediate `imm8`
+/// into destination vector `a` (sixteen unsigned 8-bit integer numbers) based on the corresponding bit in `a`:
+/// if 0, the bit is copied from `b`; if 1, from `imm8`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bseli.b, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_bseli_b<const IMM8: i32>(a: v16u8, b: v16u8) -> v16u8 {
+ static_assert_imm8!(IMM8);
+ msa_bseli_b(a, mem::transmute(b), IMM8)
+}
+
+/// Vector Bit Set
+///
+/// Set to 1 one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers).
+/// The bit position is given by the elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bset.b))]
+pub unsafe fn __msa_bset_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_bset_b(a, mem::transmute(b))
+}
+
+/// Vector Bit Set
+///
+/// Set to 1 one bit in each element of vector `a` (eight unsigned 16-bit integer numbers).
+/// The bit position is given by the elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bset.h))]
+pub unsafe fn __msa_bset_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_bset_h(a, mem::transmute(b))
+}
+
+/// Vector Bit Set
+///
+/// Set to 1 one bit in each element of vector `a` (four unsigned 32-bit integer numbers).
+/// The bit position is given by the elements in vector `b` (four unsigned 32-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bset.w))]
+pub unsafe fn __msa_bset_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_bset_w(a, mem::transmute(b))
+}
+
+/// Vector Bit Set
+///
+/// Set to 1 one bit in each element of vector `a` (two unsigned 64-bit integer numbers).
+/// The bit position is given by the elements in vector `b` (two unsigned 64-bit integer numbers)
+/// modulo the size of the element in bits.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bset.d))]
+pub unsafe fn __msa_bset_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_bset_d(a, mem::transmute(b))
+}
+
+/// Immediate Bit Set
+///
+/// Set to 1 one bit in each element of vector `a` (sixteen unsigned 8-bit integer numbers).
+/// The bit position is given by immediate `imm3`.
+/// The result is written to vector `a` (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bseti.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bseti_b<const IMM3: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm3!(IMM3);
+ msa_bseti_b(a, IMM3)
+}
+
+/// Immediate Bit Set
+///
+/// Set to 1 one bit in each element of vector `a` (eight unsigned 16-bit integer numbers).
+/// The bit position is given by immediate `imm4`.
+/// The result is written to vector `a` (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bseti.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bseti_h<const IMM4: i32>(a: v8u16) -> v8u16 {
+ static_assert_imm4!(IMM4);
+ msa_bseti_h(a, IMM4)
+}
+
+/// Immediate Bit Set
+///
+/// Set to 1 one bit in each element of vector `a` (four unsigned 32-bit integer numbers).
+/// The bit position is given by immediate `imm5`.
+/// The result is written to vector `a` (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bseti.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bseti_w<const IMM5: i32>(a: v4u32) -> v4u32 {
+ static_assert_imm5!(IMM5);
+ msa_bseti_w(a, IMM5)
+}
+
+/// Immediate Bit Set
+///
+/// Set to 1 one bit in each element of vector `a` (two unsigned 64-bit integer numbers).
+/// The bit position is given by immediate `imm6`.
+/// The result is written to vector `a` (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bseti.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_bseti_d<const IMM6: i32>(a: v2u64) -> v2u64 {
+ static_assert_imm6!(IMM6);
+ msa_bseti_d(a, IMM6)
+}
+
+/// Immediate Branch If At Least One Element Is Zero
+///
+/// PC-relative branch if at least one element in `a` (sixteen unsigned 8-bit integer numbers) is zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bz.b))]
+pub unsafe fn __msa_bz_b(a: v16u8) -> i32 {
+ msa_bz_b(a)
+}
+
+/// Immediate Branch If At Least One Element Is Zero
+///
+/// PC-relative branch if at least one element in `a` (eight unsigned 16-bit integer numbers) is zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bz.h))]
+pub unsafe fn __msa_bz_h(a: v8u16) -> i32 {
+ msa_bz_h(a)
+}
+
+/// Immediate Branch If At Least One Element Is Zero
+///
+/// PC-relative branch if at least one element in `a` (four unsigned 32-bit integer numbers) is zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bz.w))]
+pub unsafe fn __msa_bz_w(a: v4u32) -> i32 {
+ msa_bz_w(a)
+}
+
+/// Immediate Branch If At Least One Element Is Zero
+///
+/// PC-relative branch if at least one element in `a` (two unsigned 64-bit integer numbers) is zero.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bz.d))]
+pub unsafe fn __msa_bz_d(a: v2u64) -> i32 {
+ msa_bz_d(a)
+}
+
+/// Immediate Branch If Zero (All Elements of Any Format Are Zero)
+///
+/// PC-relative branch if all bits in `a` (sixteen unsigned 8-bit integer numbers) are zero,
+/// i.e. all elements are zero regardless of the data format.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(bz.v))]
+pub unsafe fn __msa_bz_v(a: v16u8) -> i32 {
+ msa_bz_v(a)
+}
+
+/// Vector Compare Equal
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen signed 8-bit integer numbers) and `b` (sixteen signed 8-bit integer numbers)
+/// elements are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ceq.b))]
+pub unsafe fn __msa_ceq_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_ceq_b(a, mem::transmute(b))
+}
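+
+// Sketch of the usual compare-then-select pattern (hypothetical helper, not
+// from the original source): ceq yields all-ones/all-zeros lane masks that can
+// drive `__msa_bsel_v`:
+//
+//     unsafe fn select_where_equal(a: v16i8, b: v16i8, c: v16u8, d: v16u8) -> v16u8 {
+//         let mask: v16u8 = mem::transmute(__msa_ceq_b(a, b));
+//         // where a == b the mask lane is all ones, so those bits come from `d`
+//         __msa_bsel_v(mask, c, d)
+//     }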
+
+/// Vector Compare Equal
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight signed 16-bit integer numbers) and `b` (eight signed 16-bit integer numbers)
+/// elements are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ceq.h))]
+pub unsafe fn __msa_ceq_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_ceq_h(a, mem::transmute(b))
+}
+
+/// Vector Compare Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four signed 32-bit integer numbers) and `b` (four signed 32-bit integer numbers)
+/// elements are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ceq.w))]
+pub unsafe fn __msa_ceq_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_ceq_w(a, mem::transmute(b))
+}
+
+/// Vector Compare Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two signed 64-bit integer numbers) and `b` (two signed 64-bit integer numbers)
+/// elements are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ceq.d))]
+pub unsafe fn __msa_ceq_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_ceq_d(a, mem::transmute(b))
+}
+
+/// Immediate Compare Equal
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen signed 8-bit integer numbers) elements and the 5-bit signed immediate `imm_s5`
+/// are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ceqi.b, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ceqi_b<const IMM_S5: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_ceqi_b(a, IMM_S5)
+}
+
+/// Immediate Compare Equal
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight signed 16-bit integer numbers) elements and the 5-bit signed immediate `imm_s5`
+/// are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ceqi.h, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ceqi_h<const IMM_S5: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_ceqi_h(a, IMM_S5)
+}
+
+/// Immediate Compare Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four signed 32-bit integer numbers) elements and the 5-bit signed immediate `imm_s5`
+/// are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ceqi.w, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ceqi_w<const IMM_S5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_ceqi_w(a, IMM_S5)
+}
+
+/// Immediate Compare Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two signed 64-bit integer numbers) elements and the 5-bit signed immediate `imm_s5`
+/// are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ceqi.d, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ceqi_d<const IMM_S5: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_ceqi_d(a, IMM_S5)
+}
+
+/// GPR Copy from MSA Control Register
+///
+/// The sign-extended content of MSA control register `cs` is copied to GPR `rd`.
+///
+/// Cannot be tested in user mode.
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cfcmsa, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __msa_cfcmsa<const IMM5: i32>() -> i32 {
+ static_assert_imm5!(IMM5);
+ msa_cfcmsa(IMM5)
+}
+
+/// Vector Compare Signed Less Than or Equal
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen signed 8-bit integer numbers) elements
+/// are signed less than or equal to the `b` (sixteen signed 8-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cle_s.b))]
+pub unsafe fn __msa_cle_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_cle_s_b(a, mem::transmute(b))
+}
+
+/// Vector Compare Signed Less Than or Equal
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight signed 16-bit integer numbers) elements
+/// are signed less than or equal to the `b` (eight signed 16-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cle_s.h))]
+pub unsafe fn __msa_cle_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_cle_s_h(a, mem::transmute(b))
+}
+
+/// Vector Compare Signed Less Than or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four signed 32-bit integer numbers) elements
+/// are signed less than or equal to the `b` (four signed 32-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cle_s.w))]
+pub unsafe fn __msa_cle_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_cle_s_w(a, mem::transmute(b))
+}
+
+/// Vector Compare Signed Less Than or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two signed 64-bit integer numbers) elements
+/// are signed less than or equal to the `b` (two signed 64-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cle_s.d))]
+pub unsafe fn __msa_cle_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_cle_s_d(a, mem::transmute(b))
+}
+
+/// Vector Compare Unsigned Less Than or Equal
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen unsigned 8-bit integer numbers) elements
+/// are unsigned less than or equal to the `b` (sixteen unsigned 8-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cle_u.b))]
+pub unsafe fn __msa_cle_u_b(a: v16u8, b: v16u8) -> v16i8 {
+ msa_cle_u_b(a, mem::transmute(b))
+}
+
+/// Vector Compare Unsigned Less Than or Equal
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight unsigned 16-bit integer numbers) elements
+/// are unsigned less than or equal to the `b` (eight unsigned 16-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cle_u.h))]
+pub unsafe fn __msa_cle_u_h(a: v8u16, b: v8u16) -> v8i16 {
+ msa_cle_u_h(a, mem::transmute(b))
+}
+
+/// Vector Compare Unsigned Less Than or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four unsigned 32-bit integer numbers) elements
+/// are unsigned less than or equal to the `b` (four unsigned 32-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cle_u.w))]
+pub unsafe fn __msa_cle_u_w(a: v4u32, b: v4u32) -> v4i32 {
+ msa_cle_u_w(a, mem::transmute(b))
+}
+
+/// Vector Compare Unsigned Less Than or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two unsigned 64-bit integer numbers) elements
+/// are unsigned less than or equal to the `b` (two unsigned 64-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(cle_u.d))]
+pub unsafe fn __msa_cle_u_d(a: v2u64, b: v2u64) -> v2i64 {
+ msa_cle_u_d(a, mem::transmute(b))
+}
+
+/// Immediate Compare Signed Less Than or Equal
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen signed 8-bit integer numbers) element
+/// is less than or equal to the 5-bit signed immediate `imm_s5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clei_s.b, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clei_s_b<const IMM_S5: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_clei_s_b(a, IMM_S5)
+}
+
+/// Immediate Compare Signed Less Than or Equal
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight signed 16-bit integer numbers) element
+/// is less than or equal to the 5-bit signed immediate `imm_s5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clei_s.h, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clei_s_h<const IMM_S5: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_clei_s_h(a, IMM_S5)
+}
+
+/// Immediate Compare Signed Less Than or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four signed 32-bit integer numbers) element
+/// is less than or equal to the 5-bit signed immediate `imm_s5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clei_s.w, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clei_s_w<const IMM_S5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_clei_s_w(a, IMM_S5)
+}
+
+/// Immediate Compare Signed Less Than or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two signed 64-bit integer numbers) element
+/// is less than or equal to the 5-bit signed immediate `imm_s5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clei_s.d, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clei_s_d<const IMM_S5: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_clei_s_d(a, IMM_S5)
+}
+
+/// Immediate Compare Unsigned Less Than or Equal
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen unsigned 8-bit integer numbers) element
+/// is unsigned less than or equal to the 5-bit unsigned immediate `imm5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clei_u.b, imm5 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clei_u_b<const IMM5: i32>(a: v16u8) -> v16i8 {
+ static_assert_imm5!(IMM5);
+ msa_clei_u_b(a, IMM5)
+}
+
+/// Immediate Compare Unsigned Less Than or Equal
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight unsigned 16-bit integer numbers) element
+/// is unsigned less than or equal to the 5-bit unsigned immediate `imm5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clei_u.h, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clei_u_h<const IMM5: i32>(a: v8u16) -> v8i16 {
+ static_assert_imm5!(IMM5);
+ msa_clei_u_h(a, IMM5)
+}
+
+/// Immediate Compare Unsigned Less Than or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four unsigned 32-bit integer numbers) element
+/// is unsigned less than or equal to the 5-bit unsigned immediate `imm5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clei_u.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clei_u_w<const IMM5: i32>(a: v4u32) -> v4i32 {
+ static_assert_imm5!(IMM5);
+ msa_clei_u_w(a, IMM5)
+}
+
+/// Immediate Compare Unsigned Less Than or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two unsigned 64-bit integer numbers) element
+/// is unsigned less than or equal to the 5-bit unsigned immediate `imm5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clei_u.d, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clei_u_d<const IMM5: i32>(a: v2u64) -> v2i64 {
+ static_assert_imm5!(IMM5);
+ msa_clei_u_d(a, IMM5)
+}
+
+/// Vector Compare Signed Less Than
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen signed 8-bit integer numbers) elements
+/// are signed less than the `b` (sixteen signed 8-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clt_s.b))]
+pub unsafe fn __msa_clt_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_clt_s_b(a, mem::transmute(b))
+}
+
+/// Vector Compare Signed Less Than
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight signed 16-bit integer numbers) elements
+/// are signed less than the `b` (eight signed 16-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clt_s.h))]
+pub unsafe fn __msa_clt_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_clt_s_h(a, mem::transmute(b))
+}
+
+/// Vector Compare Signed Less Than
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four signed 32-bit integer numbers) elements
+/// are signed less than the `b` (four signed 32-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clt_s.w))]
+pub unsafe fn __msa_clt_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_clt_s_w(a, mem::transmute(b))
+}
+
+/// Vector Compare Signed Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two signed 64-bit integer numbers) elements
+/// are signed less than the `b` (two signed 64-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clt_s.d))]
+pub unsafe fn __msa_clt_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_clt_s_d(a, mem::transmute(b))
+}
+
+/// Vector Compare Unsigned Less Than
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen unsigned 8-bit integer numbers) elements
+/// are unsigned less than the `b` (sixteen unsigned 8-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clt_u.b))]
+pub unsafe fn __msa_clt_u_b(a: v16u8, b: v16u8) -> v16i8 {
+ msa_clt_u_b(a, mem::transmute(b))
+}
+
+/// Vector Compare Unsigned Less Than
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight unsigned 16-bit integer numbers) elements
+/// are unsigned less than the `b` (eight unsigned 16-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clt_u.h))]
+pub unsafe fn __msa_clt_u_h(a: v8u16, b: v8u16) -> v8i16 {
+ msa_clt_u_h(a, mem::transmute(b))
+}
+
+/// Vector Compare Unsigned Less Than
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four unsigned 32-bit integer numbers) elements
+/// are unsigned less than the `b` (four unsigned 32-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clt_u.w))]
+pub unsafe fn __msa_clt_u_w(a: v4u32, b: v4u32) -> v4i32 {
+ msa_clt_u_w(a, mem::transmute(b))
+}
+
+/// Vector Compare Unsigned Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two unsigned 64-bit integer numbers) elements
+/// are unsigned less than the `b` (two unsigned 64-bit integer numbers) elements.
+/// Otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clt_u.d))]
+pub unsafe fn __msa_clt_u_d(a: v2u64, b: v2u64) -> v2i64 {
+ msa_clt_u_d(a, mem::transmute(b))
+}
+
+/// Immediate Compare Signed Less Than
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen signed 8-bit integer numbers) element
+/// is less than the 5-bit signed immediate `imm_s5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clti_s.b, imm_s5 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clti_s_b<const IMM_S5: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_clti_s_b(a, IMM_S5)
+}
+
+/// Immediate Compare Signed Less Than
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight signed 16-bit integer numbers) element
+/// is less than the 5-bit signed immediate `imm_s5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clti_s.h, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clti_s_h<const IMM_S5: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_clti_s_h(a, IMM_S5)
+}
+
+/// Immediate Compare Signed Less Than
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four signed 32-bit integer numbers) element
+/// is less than the 5-bit signed immediate `imm_s5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clti_s.w, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clti_s_w<const IMM_S5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_clti_s_w(a, IMM_S5)
+}
+
+/// Immediate Compare Signed Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two signed 64-bit integer numbers) element
+/// is less than the 5-bit signed immediate `imm_s5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clti_s.d, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clti_s_d<const IMM_S5: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_clti_s_d(a, IMM_S5)
+}
+
+/// Immediate Compare Unsigned Less Than
+///
+/// Set all bits to 1 in vector (sixteen signed 8-bit integer numbers) elements
+/// if the corresponding `a` (sixteen unsigned 8-bit integer numbers) element
+/// is unsigned less than the 5-bit unsigned immediate `imm5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clti_u.b, imm5 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clti_u_b<const IMM5: i32>(a: v16u8) -> v16i8 {
+ static_assert_imm5!(IMM5);
+ msa_clti_u_b(a, IMM5)
+}
+
+/// Immediate Compare Unsigned Less Than
+///
+/// Set all bits to 1 in vector (eight signed 16-bit integer numbers) elements
+/// if the corresponding `a` (eight unsigned 16-bit integer numbers) element
+/// is unsigned less than the 5-bit unsigned immediate `imm5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clti_u.h, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clti_u_h<const IMM5: i32>(a: v8u16) -> v8i16 {
+ static_assert_imm5!(IMM5);
+ msa_clti_u_h(a, IMM5)
+}
+
+/// Immediate Compare Unsigned Less Than
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four unsigned 32-bit integer numbers) element
+/// is unsigned less than the 5-bit unsigned immediate `imm5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clti_u.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clti_u_w<const IMM5: i32>(a: v4u32) -> v4i32 {
+ static_assert_imm5!(IMM5);
+ msa_clti_u_w(a, IMM5)
+}
+
+/// Immediate Compare Unsigned Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two unsigned 64-bit integer numbers) element
+/// is unsigned less than the 5-bit unsigned immediate `imm5`,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(clti_u.d, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_clti_u_d<const IMM5: i32>(a: v2u64) -> v2i64 {
+ static_assert_imm5!(IMM5);
+ msa_clti_u_d(a, IMM5)
+}
+
+/// Element Copy to GPR Signed
+///
+/// Sign-extend element `imm4` of vector `a` (sixteen signed 8-bit integer numbers)
+/// and copy the result to GPR rd.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(copy_s.b, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_copy_s_b<const IMM4: i32>(a: v16i8) -> i32 {
+ static_assert_imm4!(IMM4);
+ msa_copy_s_b(a, IMM4)
+}
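+
+// Illustrative sketch (assuming `a: v16i8` is in scope): extract lane 3 into
+// a GPR value, sign-extended to i32:
+//
+//     let lane: i32 = __msa_copy_s_b::<3>(a);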
+
+/// Element Copy to GPR Signed
+///
+/// Sign-extend element `imm3` of vector `a` (eight signed 16-bit integer numbers)
+/// and copy the result to GPR rd.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(copy_s.h, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_copy_s_h<const IMM3: i32>(a: v8i16) -> i32 {
+ static_assert_imm3!(IMM3);
+ msa_copy_s_h(a, IMM3)
+}
+
+/// Element Copy to GPR Signed
+///
+/// Sign-extend element `imm2` of vector `a` (four signed 32-bit integer numbers)
+/// and copy the result to GPR rd.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(copy_s.w, imm2 = 0b11))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_copy_s_w<const IMM2: i32>(a: v4i32) -> i32 {
+ static_assert_imm2!(IMM2);
+ msa_copy_s_w(a, IMM2)
+}
+
+/// Element Copy to GPR Signed
+///
+/// Sign-extend element `imm1` of vector `a` (two signed 64-bit integer numbers)
+/// and copy the result to GPR rd.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(copy_s.d, imm1 = 0b1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_copy_s_d<const IMM1: i32>(a: v2i64) -> i64 {
+ static_assert_imm1!(IMM1);
+ msa_copy_s_d(a, IMM1)
+}
+
+/// Element Copy to GPR Unsigned
+///
+/// Zero-extend element `imm4` of vector `a` (sixteen signed 8-bit integer numbers)
+/// and copy the result to GPR rd.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(copy_u.b, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_copy_u_b<const IMM4: i32>(a: v16i8) -> u32 {
+ static_assert_imm4!(IMM4);
+ msa_copy_u_b(a, IMM4)
+}
+
+/// Element Copy to GPR Unsigned
+///
+/// Zero-extend element `imm3` of vector `a` (eight signed 16-bit integer numbers)
+/// and copy the result to GPR rd.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(copy_u.h, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_copy_u_h<const IMM3: i32>(a: v8i16) -> u32 {
+ static_assert_imm3!(IMM3);
+ msa_copy_u_h(a, IMM3)
+}
+
+/// Element Copy to GPR Unsigned
+///
+/// Zero-extend element `imm2` of vector `a` (four signed 32-bit integer numbers)
+/// and copy the result to GPR rd.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(copy_u.w, imm2 = 0b11))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_copy_u_w<const IMM2: i32>(a: v4i32) -> u32 {
+ static_assert_imm2!(IMM2);
+ msa_copy_u_w(a, IMM2)
+}
+
+/// Element Copy to GPR Unsigned
+///
+/// Zero-extend element `imm1` of vector `a` (two signed 64-bit integer numbers)
+/// and copy the result to GPR rd.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(copy_u.d, imm1 = 0b1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_copy_u_d<const IMM1: i32>(a: v2i64) -> u64 {
+ static_assert_imm1!(IMM1);
+ msa_copy_u_d(a, IMM1)
+}
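+
+// Note the pairing: `__msa_copy_s_*` sign-extends the selected lane while
+// `__msa_copy_u_*` zero-extends it. A hypothetical byte lane holding 0xFF
+// reads back differently (sketch, assuming `a: v16i8`):
+//
+//     let s: i32 = __msa_copy_s_b::<0>(a); // -1  if lane 0 is 0xFF
+//     let u: u32 = __msa_copy_u_b::<0>(a); // 255 if lane 0 is 0xFF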
+
+/// GPR Copy to MSA Control Register
+///
+/// The content of the least significant 31 bits of GPR `a` is copied to the
+/// MSA control register selected by `imm5`.
+///
+/// Cannot be tested in user mode.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ctcmsa, imm1 = 0b1))]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __msa_ctcmsa<const IMM5: i32>(a: i32) {
+ static_assert_imm5!(IMM5);
+ msa_ctcmsa(IMM5, a)
+}
+
+/// Vector Signed Divide
+///
+/// The signed integer elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are divided by signed integer elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(div_s.b))]
+pub unsafe fn __msa_div_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_div_s_b(a, mem::transmute(b))
+}
+
+/// Vector Signed Divide
+///
+/// The signed integer elements in vector `a` (eight signed 16-bit integer numbers)
+/// are divided by signed integer elements in vector `b` (eight signed 16-bit integer numbers).
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(div_s.h))]
+pub unsafe fn __msa_div_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_div_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Divide
+///
+/// The signed integer elements in vector `a` (four signed 32-bit integer numbers)
+/// are divided by signed integer elements in vector `b` (four signed 32-bit integer numbers).
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(div_s.w))]
+pub unsafe fn __msa_div_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_div_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Divide
+///
+/// The signed integer elements in vector `a` (two signed 64-bit integer numbers)
+/// are divided by signed integer elements in vector `b` (two signed 64-bit integer numbers).
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(div_s.d))]
+pub unsafe fn __msa_div_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_div_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Divide
+///
+/// The unsigned integer elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are divided by unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers).
+/// The result is written to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(div_u.b))]
+pub unsafe fn __msa_div_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_div_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Divide
+///
+/// The unsigned integer elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are divided by unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers).
+/// The result is written to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(div_u.h))]
+pub unsafe fn __msa_div_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_div_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Divide
+///
+/// The unsigned integer elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are divided by unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers).
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(div_u.w))]
+pub unsafe fn __msa_div_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_div_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Divide
+///
+/// The unsigned integer elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are divided by unsigned integer elements in vector `b` (two unsigned 64-bit integer numbers).
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(div_u.d))]
+pub unsafe fn __msa_div_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_div_u_d(a, mem::transmute(b))
+}
+
+/// Vector Signed Dot Product
+///
+/// The signed integer elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are multiplied by signed integer elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results of
+/// adjacent odd/even elements are added and stored to the destination
+/// vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dotp_s.h))]
+pub unsafe fn __msa_dotp_s_h(a: v16i8, b: v16i8) -> v8i16 {
+ msa_dotp_s_h(a, mem::transmute(b))
+}
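+
+// Per-lane semantics sketch, as read from the description above (not a
+// drop-in implementation): each i16 output lane is the widening product-sum
+// of one adjacent pair of i8 input lanes:
+//
+//     // out[i] = a[2*i] as i16 * b[2*i] as i16
+//     //        + a[2*i + 1] as i16 * b[2*i + 1] as i16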
+
+/// Vector Signed Dot Product
+///
+/// The signed integer elements in vector `a` (eight signed 16-bit integer numbers)
+/// are multiplied by signed integer elements in vector `b` (eight signed 16-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results of
+/// adjacent odd/even elements are added and stored to the destination
+/// vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dotp_s.w))]
+pub unsafe fn __msa_dotp_s_w(a: v8i16, b: v8i16) -> v4i32 {
+ msa_dotp_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Dot Product
+///
+/// The signed integer elements in vector `a` (four signed 32-bit integer numbers)
+/// are multiplied by signed integer elements in vector `b` (four signed 32-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results of
+/// adjacent odd/even elements are added and stored to the destination
+/// vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dotp_s.d))]
+pub unsafe fn __msa_dotp_s_d(a: v4i32, b: v4i32) -> v2i64 {
+ msa_dotp_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Dot Product
+///
+/// The unsigned integer elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results of
+/// adjacent odd/even elements are added and stored to the destination
+/// vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dotp_u.h))]
+pub unsafe fn __msa_dotp_u_h(a: v16u8, b: v16u8) -> v8u16 {
+ msa_dotp_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Dot Product
+///
+/// The unsigned integer elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results of
+/// adjacent odd/even elements are added and stored to the destination
+/// vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dotp_u.w))]
+pub unsafe fn __msa_dotp_u_w(a: v8u16, b: v8u16) -> v4u32 {
+ msa_dotp_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Dot Product
+///
+/// The unsigned integer elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results of
+/// adjacent odd/even elements are added and stored to the destination
+/// vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dotp_u.d))]
+pub unsafe fn __msa_dotp_u_d(a: v4u32, b: v4u32) -> v2u64 {
+ msa_dotp_u_d(a, mem::transmute(b))
+}
+
+/// Vector Signed Dot Product and Add
+///
+/// The signed integer elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// are multiplied by signed integer elements in vector `c` (sixteen signed 8-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are added to the vector `a` (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpadd_s.h))]
+pub unsafe fn __msa_dpadd_s_h(a: v8i16, b: v16i8, c: v16i8) -> v8i16 {
+ msa_dpadd_s_h(a, mem::transmute(b), c)
+}
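+
+// Sketch of the accumulate form: `dpadd` is `dotp` plus a running
+// accumulator, the usual building block of integer dot-product loops.
+// Roughly, per output lane:
+//
+//     // a[i] += b[2*i] as i16 * c[2*i] as i16
+//     //       + b[2*i + 1] as i16 * c[2*i + 1] as i16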
+
+/// Vector Signed Dot Product and Add
+///
+/// The signed integer elements in vector `b` (eight signed 16-bit integer numbers)
+/// are multiplied by signed integer elements in vector `c` (eight signed 16-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are added to the vector `a` (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpadd_s.w))]
+pub unsafe fn __msa_dpadd_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32 {
+ msa_dpadd_s_w(a, mem::transmute(b), c)
+}
+
+/// Vector Signed Dot Product and Add
+///
+/// The signed integer elements in vector `b` (four signed 32-bit integer numbers)
+/// are multiplied by signed integer elements in vector `c` (four signed 32-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are added to the vector `a` (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpadd_s.d))]
+pub unsafe fn __msa_dpadd_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64 {
+ msa_dpadd_s_d(a, mem::transmute(b), c)
+}
+
+/// Vector Unsigned Dot Product and Add
+///
+/// The unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `c` (sixteen unsigned 8-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are added to the vector `a` (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpadd_u.h))]
+pub unsafe fn __msa_dpadd_u_h(a: v8u16, b: v16u8, c: v16u8) -> v8u16 {
+ msa_dpadd_u_h(a, mem::transmute(b), c)
+}
+
+/// Vector Unsigned Dot Product and Add
+///
+/// The unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `c` (eight unsigned 16-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are added to the vector `a` (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpadd_u.w))]
+pub unsafe fn __msa_dpadd_u_w(a: v4u32, b: v8u16, c: v8u16) -> v4u32 {
+ msa_dpadd_u_w(a, mem::transmute(b), c)
+}
+
+/// Vector Unsigned Dot Product and Add
+///
+/// The unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `c` (four unsigned 32-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are added to the vector `a` (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpadd_u.d))]
+pub unsafe fn __msa_dpadd_u_d(a: v2u64, b: v4u32, c: v4u32) -> v2u64 {
+ msa_dpadd_u_d(a, mem::transmute(b), c)
+}
+
+/// Vector Signed Dot Product and Subtract
+///
+/// The signed integer elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// are multiplied by signed integer elements in vector `c` (sixteen signed 8-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are subtracted from the integer elements in vector `a`
+/// (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpsub_s.h))]
+pub unsafe fn __msa_dpsub_s_h(a: v8i16, b: v16i8, c: v16i8) -> v8i16 {
+ msa_dpsub_s_h(a, mem::transmute(b), c)
+}
+
+/// Vector Signed Dot Product and Subtract
+///
+/// The signed integer elements in vector `b` (eight signed 16-bit integer numbers)
+/// are multiplied by signed integer elements in vector `c` (eight signed 16-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are subtracted from the integer elements in vector `a`
+/// (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpsub_s.w))]
+pub unsafe fn __msa_dpsub_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32 {
+ msa_dpsub_s_w(a, mem::transmute(b), c)
+}
+
+/// Vector Signed Dot Product and Subtract
+///
+/// The signed integer elements in vector `b` (four signed 32-bit integer numbers)
+/// are multiplied by signed integer elements in vector `c` (four signed 32-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are subtracted from the integer elements in vector `a`
+/// (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpsub_s.d))]
+pub unsafe fn __msa_dpsub_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64 {
+ msa_dpsub_s_d(a, mem::transmute(b), c)
+}
+
+/// Vector Unsigned Dot Product and Subtract
+///
+/// The unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `c` (sixteen unsigned 8-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are subtracted from the integer elements in vector `a`
+/// (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpsub_u.h))]
+pub unsafe fn __msa_dpsub_u_h(a: v8i16, b: v16u8, c: v16u8) -> v8i16 {
+ msa_dpsub_u_h(a, mem::transmute(b), c)
+}
+
+/// Vector Unsigned Dot Product and Subtract
+///
+/// The unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `c` (eight unsigned 16-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are subtracted from the integer elements in vector `a`
+/// (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpsub_u.w))]
+pub unsafe fn __msa_dpsub_u_w(a: v4i32, b: v8u16, c: v8u16) -> v4i32 {
+ msa_dpsub_u_w(a, mem::transmute(b), c)
+}
+
+/// Vector Unsigned Dot Product and Subtract
+///
+/// The unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers)
+/// are multiplied by unsigned integer elements in vector `c` (four unsigned 32-bit integer numbers)
+/// producing a result twice the size of the input operands. The multiplication results
+/// of adjacent odd/even elements are subtracted from the integer elements in vector `a`
+/// (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(dpsub_u.d))]
+pub unsafe fn __msa_dpsub_u_d(a: v2i64, b: v4u32, c: v4u32) -> v2i64 {
+ msa_dpsub_u_d(a, mem::transmute(b), c)
+}
+
+/// Vector Floating-Point Addition
+///
+/// The floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are added to the floating-point elements in vector `b` (four 32-bit floating point numbers).
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fadd.w))]
+pub unsafe fn __msa_fadd_w(a: v4f32, b: v4f32) -> v4f32 {
+ msa_fadd_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Addition
+///
+/// The floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// are added to the floating-point elements in vector `b` (two 64-bit floating point numbers).
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fadd.d))]
+pub unsafe fn __msa_fadd_d(a: v2f64, b: v2f64) -> v2f64 {
+ msa_fadd_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Always False
+///
+/// Set all bits to 0 in vector (four signed 32-bit integer numbers).
+/// Signaling NaN elements in `a` (four 32-bit floating point numbers)
+/// or `b` (four 32-bit floating point numbers) signal Invalid Operation exception.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcaf.w))]
+pub unsafe fn __msa_fcaf_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcaf_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Always False
+///
+/// Set all bits to 0 in vector (two signed 64-bit integer numbers).
+/// Signaling NaN elements in `a` (two 64-bit floating point numbers)
+/// or `b` (two 64-bit floating point numbers) signal Invalid Operation exception.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcaf.d))]
+pub unsafe fn __msa_fcaf_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcaf_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding `a` (four 32-bit floating point numbers)
+/// and `b` (four 32-bit floating point numbers) elements are ordered and equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fceq.w))]
+pub unsafe fn __msa_fceq_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fceq_w(a, mem::transmute(b))
+}
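+
+// The all-ones/all-zeros lane masks produced by the compare family are meant
+// for bitwise selection. A hedged sketch (pairing the mask with a bitwise
+// select such as `__msa_bsel_v`, with the needed vector-type transmutes):
+//
+//     let mask: v4i32 = __msa_fceq_w(a, b); // -1 where a[i] == b[i], else 0
+//     // feed `mask` into a bitwise select to pick lanes from two sources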
+
+/// Vector Floating-Point Quiet Compare Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding `a` (two 64-bit floating point numbers)
+/// and `b` (two 64-bit floating point numbers) elements are ordered and equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fceq.d))]
+pub unsafe fn __msa_fceq_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fceq_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Class Mask
+///
+/// Store in each element of vector (four signed 32-bit integer numbers)
+/// a bit mask reflecting the floating-point class of the corresponding element of vector
+/// `a` (four 32-bit floating point numbers).
+/// The mask has 10 bits as follows. Bits 0 and 1 indicate NaN values: signaling NaN (bit 0) and quiet NaN (bit 1).
+/// Bits 2, 3, 4, 5 classify negative values: infinity (bit 2), normal (bit 3), subnormal (bit 4), and zero (bit 5).
+/// Bits 6, 7, 8, 9 classify positive values: infinity (bit 6), normal (bit 7), subnormal (bit 8), and zero (bit 9).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fclass.w))]
+pub unsafe fn __msa_fclass_w(a: v4f32) -> v4i32 {
+ msa_fclass_w(a)
+}
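+
+// Sketch of testing the class mask; bit positions are taken from the
+// description above, and the constant names are illustrative only:
+//
+//     const QUIET_NAN: i32 = 1 << 1;
+//     let class: v4i32 = __msa_fclass_w(a);
+//     let lane0_is_qnan = (__msa_copy_s_w::<0>(class) & QUIET_NAN) != 0;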
+
+/// Vector Floating-Point Class Mask
+///
+/// Store in each element of vector (two signed 64-bit integer numbers)
+/// a bit mask reflecting the floating-point class of the corresponding element of vector
+/// `a` (two 64-bit floating point numbers).
+/// The mask has 10 bits as follows. Bits 0 and 1 indicate NaN values: signaling NaN (bit 0) and quiet NaN (bit 1).
+/// Bits 2, 3, 4, 5 classify negative values: infinity (bit 2), normal (bit 3), subnormal (bit 4), and zero (bit 5).
+/// Bits 6, 7, 8, 9 classify positive values: infinity (bit 6), normal (bit 7), subnormal (bit 8), and zero (bit 9).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fclass.d))]
+pub unsafe fn __msa_fclass_d(a: v2f64) -> v2i64 {
+ msa_fclass_d(a)
+}
+
+/// Vector Floating-Point Quiet Compare Less or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding `a` (four 32-bit floating point numbers) elements are ordered
+/// and either less than or equal to `b` (four 32-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcle.w))]
+pub unsafe fn __msa_fcle_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcle_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Less or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding `a` (two 64-bit floating point numbers) elements are ordered
+/// and either less than or equal to `b` (two 64-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcle.d))]
+pub unsafe fn __msa_fcle_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcle_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Less Than
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding `a` (four 32-bit floating point numbers) elements are ordered
+/// and less than `b` (four 32-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fclt.w))]
+pub unsafe fn __msa_fclt_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fclt_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding `a` (two 64-bit floating point numbers) elements are ordered
+/// and less than `b` (two 64-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fclt.d))]
+pub unsafe fn __msa_fclt_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fclt_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Not Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are ordered and not equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcne.w))]
+pub unsafe fn __msa_fcne_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcne_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Not Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are ordered and not equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcne.d))]
+pub unsafe fn __msa_fcne_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcne_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Ordered
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are ordered, i.e. both elements are not NaN values,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcor.w))]
+pub unsafe fn __msa_fcor_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcor_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Ordered
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are ordered, i.e. both elements are not NaN values,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcor.d))]
+pub unsafe fn __msa_fcor_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcor_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are unordered or equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcueq.w))]
+pub unsafe fn __msa_fcueq_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcueq_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are unordered or equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcueq.d))]
+pub unsafe fn __msa_fcueq_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcueq_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered or Less or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding elements in `a` (four 32-bit floating point numbers)
+/// are unordered or less than or equal to `b` (four 32-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcule.w))]
+pub unsafe fn __msa_fcule_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcule_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered or Less or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding elements in `a` (two 64-bit floating point numbers)
+/// are unordered or less than or equal to `b` (two 64-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcule.d))]
+pub unsafe fn __msa_fcule_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcule_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered or Less Than
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding elements in `a` (four 32-bit floating point numbers)
+/// are unordered or less than `b` (four 32-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcult.w))]
+pub unsafe fn __msa_fcult_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcult_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered or Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding elements in `a` (two 64-bit floating point numbers)
+/// are unordered or less than `b` (two 64-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcult.d))]
+pub unsafe fn __msa_fcult_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcult_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding `a` (four 32-bit floating point numbers)
+/// and `b` (four 32-bit floating point numbers) elements are unordered,
+/// i.e. at least one element is a NaN value, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcun.w))]
+pub unsafe fn __msa_fcun_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcun_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding `a` (two 64-bit floating point numbers)
+/// and `b` (two 64-bit floating point numbers) elements are unordered,
+/// i.e. at least one element is a NaN value, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcun.d))]
+pub unsafe fn __msa_fcun_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcun_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered or Not Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers)
+/// elements if the corresponding `a` (four 32-bit floating point numbers)
+/// and `b` (four 32-bit floating point numbers) elements are unordered or not equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcune.w))]
+pub unsafe fn __msa_fcune_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fcune_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Quiet Compare Unordered or Not Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers)
+/// elements if the corresponding `a` (two 64-bit floating point numbers)
+/// and `b` (two 64-bit floating point numbers) elements are unordered or not equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fcune.d))]
+pub unsafe fn __msa_fcune_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fcune_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Division
+///
+/// The floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are divided by the floating-point elements in vector `b` (four 32-bit floating point numbers).
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fdiv.w))]
+pub unsafe fn __msa_fdiv_w(a: v4f32, b: v4f32) -> v4f32 {
+ msa_fdiv_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Division
+///
+/// The floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// are divided by the floating-point elements in vector `b` (two 64-bit floating point numbers).
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fdiv.d))]
+pub unsafe fn __msa_fdiv_d(a: v2f64, b: v2f64) -> v2f64 {
+ msa_fdiv_d(a, mem::transmute(b))
+}
+
+/* FIXME: 16-bit float
+/// Vector Floating-Point Down-Convert Interchange Format
+///
+/// The floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// and vector `b` (four 32-bit floating point numbers) are down-converted
+/// to a smaller interchange format, i.e. from 64-bit to 32-bit, or from 32-bit to 16-bit.
+/// The result is written to vector (8 16-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexdo.h))]
+pub unsafe fn __msa_fexdo_h(a: v4f32, b: v4f32) -> f16x8 {
+ msa_fexdo_h(a, mem::transmute(b))
+}*/
+
+/// Vector Floating-Point Down-Convert Interchange Format
+///
+/// The floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// and vector `b` (two 64-bit floating point numbers) are down-converted
+/// to a smaller interchange format, i.e. from 64-bit to 32-bit, or from 32-bit to 16-bit.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexdo.w))]
+pub unsafe fn __msa_fexdo_w(a: v2f64, b: v2f64) -> v4f32 {
+ msa_fexdo_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Base 2 Exponentiation
+///
+/// The floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are scaled, i.e. multiplied, by 2 to the power of integer elements in vector `b`
+/// (four signed 32-bit integer numbers).
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexp2.w))]
+pub unsafe fn __msa_fexp2_w(a: v4f32, b: v4i32) -> v4f32 {
+ msa_fexp2_w(a, mem::transmute(b))
+}
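+
+// In effect this is a vector `ldexp`: each lane is scaled by a power of two
+// taken from the integer vector. Sketch:
+//
+//     // r[i] = a[i] * 2f32.powi(b[i])
+//     let r: v4f32 = __msa_fexp2_w(a, b);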
+
+/// Vector Floating-Point Base 2 Exponentiation
+///
+/// The floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// are scaled, i.e. multiplied, by 2 to the power of integer elements in vector `b`
+/// (two signed 64-bit integer numbers).
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexp2.d))]
+pub unsafe fn __msa_fexp2_d(a: v2f64, b: v2i64) -> v2f64 {
+ msa_fexp2_d(a, mem::transmute(b))
+}
+
+/* FIXME: 16-bit float
+/// Vector Floating-Point Up-Convert Interchange Format Left
+///
+/// The left half floating-point elements in vector `a` (eight 16-bit floating point numbers)
+/// are up-converted to a larger interchange format,
+/// i.e. from 16-bit to 32-bit, or from 32-bit to 64-bit.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexupl.w))]
+pub unsafe fn __msa_fexupl_w(a: f16x8) -> v4f32 {
+ msa_fexupl_w(a)
+}*/
+
+/// Vector Floating-Point Up-Convert Interchange Format Left
+///
+/// The left half floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are up-converted to a larger interchange format,
+/// i.e. from 16-bit to 32-bit, or from 32-bit to 64-bit.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexupl.d))]
+pub unsafe fn __msa_fexupl_d(a: v4f32) -> v2f64 {
+ msa_fexupl_d(a)
+}
+
+/* FIXME: 16-bit float
+/// Vector Floating-Point Up-Convert Interchange Format Right
+///
+/// The right half floating-point elements in vector `a` (eight 16-bit floating point numbers)
+/// are up-converted to a larger interchange format,
+/// i.e. from 16-bit to 32-bit, or from 32-bit to 64-bit.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexupr.w))]
+pub unsafe fn __msa_fexupr_w(a: f16x8) -> v4f32 {
+ msa_fexupr_w(a)
+} */
+
+/// Vector Floating-Point Up-Convert Interchange Format Right
+///
+/// The right half floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are up-converted to a larger interchange format,
+/// i.e. from 16-bit to 32-bit, or from 32-bit to 64-bit.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fexupr.d))]
+pub unsafe fn __msa_fexupr_d(a: v4f32) -> v2f64 {
+ msa_fexupr_d(a)
+}
+
+/// Vector Floating-Point Round and Convert from Signed Integer
+///
+/// The signed integer elements in vector `a` (four signed 32-bit integer numbers)
+/// are converted to floating-point values.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffint_s.w))]
+pub unsafe fn __msa_ffint_s_w(a: v4i32) -> v4f32 {
+ msa_ffint_s_w(a)
+}
+
+/// Vector Floating-Point Round and Convert from Signed Integer
+///
+/// The signed integer elements in vector `a` (two signed 64-bit integer numbers)
+/// are converted to floating-point values.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffint_s.d))]
+pub unsafe fn __msa_ffint_s_d(a: v2i64) -> v2f64 {
+ msa_ffint_s_d(a)
+}
+
+/// Vector Floating-Point Round and Convert from Unsigned Integer
+///
+/// The unsigned integer elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are converted to floating-point values.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffint_u.w))]
+pub unsafe fn __msa_ffint_u_w(a: v4u32) -> v4f32 {
+ msa_ffint_u_w(a)
+}
+
+/// Vector Floating-Point Round and Convert from Unsigned Integer
+///
+/// The unsigned integer elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are converted to floating-point values.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffint_u.d))]
+pub unsafe fn __msa_ffint_u_d(a: v2u64) -> v2f64 {
+ msa_ffint_u_d(a)
+}
+
+/// Vector Floating-Point Convert from Fixed-Point Left
+///
+/// The left half fixed-point elements in vector `a` (eight signed 16-bit integer numbers)
+/// are up-converted to floating-point data format,
+/// i.e. from 16-bit Q15 to 32-bit floating-point, or from 32-bit Q31 to 64-bit floating-point.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffql.w))]
+pub unsafe fn __msa_ffql_w(a: v8i16) -> v4f32 {
+ msa_ffql_w(a)
+}
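+
+// Q-format sketch: a Q15 fixed-point value x denotes x / 2^15, so for the
+// left half of the vector the conversion is roughly:
+//
+//     // out[i] = left_half(a)[i] as f32 / 32768.0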
+
+/// Vector Floating-Point Convert from Fixed-Point Left
+///
+/// The left half fixed-point elements in vector `a` (four signed 32-bit integer numbers)
+/// are up-converted to floating-point data format,
+/// i.e. from 16-bit Q15 to 32-bit floating-point, or from 32-bit Q31 to 64-bit floating-point.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffql.d))]
+pub unsafe fn __msa_ffql_d(a: v4i32) -> v2f64 {
+ msa_ffql_d(a)
+}
+
+/// Vector Floating-Point Convert from Fixed-Point Right
+///
+/// The right half fixed-point elements in vector `a` (eight signed 16-bit integer numbers)
+/// are up-converted to floating-point data format,
+/// i.e. from 16-bit Q15 to 32-bit floating-point, or from 32-bit Q31 to 64-bit floating-point.
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffqr.w))]
+pub unsafe fn __msa_ffqr_w(a: v8i16) -> v4f32 {
+ msa_ffqr_w(a)
+}
+
+/// Vector Floating-Point Convert from Fixed-Point Right
+///
+/// The right half fixed-point elements in vector `a` (four signed 32-bit integer numbers)
+/// are up-converted to floating-point data format,
+/// i.e. from 16-bit Q15 to 32-bit floating-point, or from 32-bit Q31 to 64-bit floating-point.
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ffqr.d))]
+pub unsafe fn __msa_ffqr_d(a: v4i32) -> v2f64 {
+ msa_ffqr_d(a)
+}
+
+/// Vector Fill from GPR
+///
+/// Replicate GPR rs value to all elements in vector (sixteen signed 8-bit integer numbers).
+/// If the source GPR is wider than the destination data format, the destination's elements
+/// will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fill.b))]
+pub unsafe fn __msa_fill_b(a: i32) -> v16i8 {
+ msa_fill_b(a)
+}
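+
+// Splat sketch: broadcast a scalar into every lane; for the byte variant
+// only the low 8 bits of the i32 survive:
+//
+//     let v: v16i8 = __msa_fill_b(0x7F); // sixteen lanes of 127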
+
+/// Vector Fill from GPR
+///
+/// Replicate GPR rs value to all elements in vector (eight signed 16-bit integer numbers).
+/// If the source GPR is wider than the destination data format, the destination's elements
+/// will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fill.h))]
+pub unsafe fn __msa_fill_h(a: i32) -> v8i16 {
+ msa_fill_h(a)
+}
+
+/// Vector Fill from GPR
+///
+/// Replicate GPR rs value to all elements in vector (four signed 32-bit integer numbers).
+/// If the source GPR is wider than the destination data format, the destination's elements
+/// will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fill.w))]
+pub unsafe fn __msa_fill_w(a: i32) -> v4i32 {
+ msa_fill_w(a)
+}
+
+/// Vector Fill from GPR
+///
+/// Replicate GPR rs value to all elements in vector (two signed 64-bit integer numbers).
+/// If the source GPR is wider than the destination data format, the destination's elements
+/// will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fill.d))]
+pub unsafe fn __msa_fill_d(a: i64) -> v2i64 {
+ msa_fill_d(a)
+}
+
+/// Vector Floating-Point Base 2 Logarithm
+///
+/// The signed integral base 2 exponents of floating-point elements in vector `a`
+/// (four 32-bit floating point numbers) are written as floating-point values to vector elements
+/// (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(flog2.w))]
+pub unsafe fn __msa_flog2_w(a: v4f32) -> v4f32 {
+ msa_flog2_w(a)
+}
+
+/// Vector Floating-Point Base 2 Logarithm
+///
+/// The signed integral base 2 exponents of floating-point elements in vector `a`
+/// (two 64-bit floating point numbers) are written as floating-point values to vector elements
+/// (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(flog2.d))]
+pub unsafe fn __msa_flog2_d(a: v2f64) -> v2f64 {
+ msa_flog2_d(a)
+}
+
+/// Vector Floating-Point Multiply-Add
+///
+/// The floating-point elements in vector `b` (four 32-bit floating point numbers)
+/// multiplied by floating-point elements in vector `c` (four 32-bit floating point numbers)
+/// are added to the floating-point elements in vector `a` (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmadd.w))]
+pub unsafe fn __msa_fmadd_w(a: v4f32, b: v4f32, c: v4f32) -> v4f32 {
+ msa_fmadd_w(a, mem::transmute(b), c)
+}
+
+/// Vector Floating-Point Multiply-Add
+///
+/// The floating-point elements in vector `b` (two 64-bit floating point numbers)
+/// multiplied by floating-point elements in vector `c` (two 64-bit floating point numbers)
+/// are added to the floating-point elements in vector `a` (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmadd.d))]
+pub unsafe fn __msa_fmadd_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64 {
+ msa_fmadd_d(a, mem::transmute(b), c)
+}
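+
+// Note the operand order: the accumulator comes first, so per lane this
+// computes a + b * c (sketch):
+//
+//     // r[i] = a[i] + b[i] * c[i]
+//     let r: v2f64 = __msa_fmadd_d(a, b, c);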
+
+/// Vector Floating-Point Maximum
+///
+/// The largest values between corresponding floating-point elements in vector `a`
+/// (four 32-bit floating point numbers) and vector `b` (four 32-bit floating point numbers)
+/// are written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmax.w))]
+pub unsafe fn __msa_fmax_w(a: v4f32, b: v4f32) -> v4f32 {
+ msa_fmax_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Maximum
+///
+/// The largest values between corresponding floating-point elements in vector `a`
+/// (two 64-bit floating point numbers) and vector `b` (two 64-bit floating point numbers)
+/// are written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmax.d))]
+pub unsafe fn __msa_fmax_d(a: v2f64, b: v2f64) -> v2f64 {
+ msa_fmax_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Maximum Based on Absolute Values
+///
+/// The values with the largest magnitude, i.e. absolute value, between corresponding
+/// floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// and vector `b` (four 32-bit floating point numbers)
+/// are written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmax_a.w))]
+pub unsafe fn __msa_fmax_a_w(a: v4f32, b: v4f32) -> v4f32 {
+ msa_fmax_a_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Maximum Based on Absolute Values
+///
+/// The values with the largest magnitude, i.e. absolute value, between corresponding
+/// floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// and vector `b` (two 64-bit floating point numbers)
+/// are written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmax_a.d))]
+pub unsafe fn __msa_fmax_a_d(a: v2f64, b: v2f64) -> v2f64 {
+ msa_fmax_a_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Minimum
+///
+/// The smallest values between corresponding floating-point elements in vector `a`
+/// (four 32-bit floating point numbers) and vector `b` (four 32-bit floating point numbers)
+/// are written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmin.w))]
+pub unsafe fn __msa_fmin_w(a: v4f32, b: v4f32) -> v4f32 {
+ msa_fmin_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Minimum
+///
+/// The smallest values between corresponding floating-point elements in vector `a`
+/// (two 64-bit floating point numbers) and vector `b` (two 64-bit floating point numbers)
+/// are written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmin.d))]
+pub unsafe fn __msa_fmin_d(a: v2f64, b: v2f64) -> v2f64 {
+ msa_fmin_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Minimum Based on Absolute Values
+///
+/// The values with the smallest magnitude, i.e. absolute value, between corresponding
+/// floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// and vector `b` (four 32-bit floating point numbers)
+/// are written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmin_a.w))]
+pub unsafe fn __msa_fmin_a_w(a: v4f32, b: v4f32) -> v4f32 {
+ msa_fmin_a_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Minimum Based on Absolute Values
+///
+/// The values with the smallest magnitude, i.e. absolute value, between corresponding
+/// floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// and vector `b` (two 64-bit floating point numbers)
+/// are written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmin_a.d))]
+pub unsafe fn __msa_fmin_a_d(a: v2f64, b: v2f64) -> v2f64 {
+ msa_fmin_a_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Multiply-Sub
+///
+/// The floating-point elements in vector `b` (four 32-bit floating point numbers)
+/// multiplied by floating-point elements in vector `c` (four 32-bit floating point numbers)
+/// are subtracted from the floating-point elements in vector `a` (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmsub.w))]
+pub unsafe fn __msa_fmsub_w(a: v4f32, b: v4f32, c: v4f32) -> v4f32 {
+ msa_fmsub_w(a, mem::transmute(b), c)
+}
+
+/// Vector Floating-Point Multiply-Sub
+///
+/// The floating-point elements in vector `b` (two 64-bit floating point numbers)
+/// multiplied by floating-point elements in vector `c` (two 64-bit floating point numbers)
+/// are subtracted from the floating-point elements in vector `a` (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmsub.d))]
+pub unsafe fn __msa_fmsub_d(a: v2f64, b: v2f64, c: v2f64) -> v2f64 {
+ msa_fmsub_d(a, mem::transmute(b), c)
+}
+
+/// Vector Floating-Point Multiplication
+///
+/// The floating-point elements in vector `a` (four 32-bit floating point numbers) are
+/// multiplied by floating-point elements in vector `b` (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmul.w))]
+pub unsafe fn __msa_fmul_w(a: v4f32, b: v4f32) -> v4f32 {
+ msa_fmul_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Multiplication
+///
+/// The floating-point elements in vector `a` (two 64-bit floating point numbers) are
+/// multiplied by floating-point elements in vector `b` (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fmul.d))]
+pub unsafe fn __msa_fmul_d(a: v2f64, b: v2f64) -> v2f64 {
+ msa_fmul_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Round to Integer
+///
+/// The floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are rounded to an integral valued floating-point number in the same format based
+/// on the rounding mode bits RM in MSA Control and Status Register MSACSR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(frint.w))]
+pub unsafe fn __msa_frint_w(a: v4f32) -> v4f32 {
+ msa_frint_w(a)
+}
+
+/// Vector Floating-Point Round to Integer
+///
+/// The floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// are rounded to an integral valued floating-point number in the same format based
+/// on the rounding mode bits RM in MSA Control and Status Register MSACSR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(frint.d))]
+pub unsafe fn __msa_frint_d(a: v2f64) -> v2f64 {
+ msa_frint_d(a)
+}
+
+/// Vector Approximate Floating-Point Reciprocal
+///
+/// The reciprocals of floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are calculated and the result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(frcp.w))]
+pub unsafe fn __msa_frcp_w(a: v4f32) -> v4f32 {
+ msa_frcp_w(a)
+}
+
+/// Vector Approximate Floating-Point Reciprocal
+///
+/// The reciprocals of floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// are calculated and the result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(frcp.d))]
+pub unsafe fn __msa_frcp_d(a: v2f64) -> v2f64 {
+ msa_frcp_d(a)
+}
+
+/// Vector Approximate Floating-Point Reciprocal of Square Root
+///
+/// The reciprocals of the square roots of floating-point elements in vector `a` (four 32-bit floating point numbers)
+/// are calculated and the result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(frsqrt.w))]
+pub unsafe fn __msa_frsqrt_w(a: v4f32) -> v4f32 {
+ msa_frsqrt_w(a)
+}
+
+/// Vector Approximate Floating-Point Reciprocal of Square Root
+///
+/// The reciprocals of the square roots of floating-point elements in vector `a` (two 64-bit floating point numbers)
+/// are calculated and the result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(frsqrt.d))]
+pub unsafe fn __msa_frsqrt_d(a: v2f64) -> v2f64 {
+ msa_frsqrt_d(a)
+}
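+
+// Sketch: these are estimate instructions, so the results are only
+// approximate; compare them against the exact values with a tolerance
+// rather than for equality.
+//
+//     let a: v4f32 = mem::transmute([2.0f32, 4.0, 8.0, 16.0]);
+//     let rcp: [f32; 4] = mem::transmute(__msa_frcp_w(a));   // ~[0.5, 0.25, 0.125, 0.0625]
+//     let rsq: [f32; 4] = mem::transmute(__msa_frsqrt_w(a)); // ~[0.707, 0.5, 0.354, 0.25]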
+
+/// Vector Floating-Point Signaling Compare Always False
+///
+/// Set all bits to 0 in vector (four signed 32-bit integer numbers) elements.
+/// Signaling and quiet NaN elements in vector `a` (four 32-bit floating point numbers)
+/// or `b` (four 32-bit floating point numbers) signal Invalid Operation exception.
+/// In case of a floating-point exception, the default result has all bits set to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsaf.w))]
+pub unsafe fn __msa_fsaf_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsaf_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Always False
+///
+/// Set all bits to 0 in vector (two signed 64-bit integer numbers) elements.
+/// Signaling and quiet NaN elements in vector `a` (two 64-bit floating point numbers)
+/// or `b` (two 64-bit floating point numbers) signal Invalid Operation exception.
+/// In case of a floating-point exception, the default result has all bits set to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsaf.d))]
+pub unsafe fn __msa_fsaf_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsaf_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers)
+/// and `b` (four 32-bit floating point numbers) elements are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fseq.w))]
+pub unsafe fn __msa_fseq_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fseq_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers)
+/// and `b` (two 64-bit floating point numbers) elements are equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fseq.d))]
+pub unsafe fn __msa_fseq_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fseq_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Less or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) elements
+/// are less than or equal to `b` (four 32-bit floating point numbers) elements, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsle.w))]
+pub unsafe fn __msa_fsle_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsle_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Less or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) elements
+/// are less than or equal to `b` (two 64-bit floating point numbers) elements, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsle.d))]
+pub unsafe fn __msa_fsle_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsle_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Less Than
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) elements
+/// are less than `b` (four 32-bit floating point numbers) elements, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fslt.w))]
+pub unsafe fn __msa_fslt_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fslt_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) elements
+/// are less than `b` (two 64-bit floating point numbers) elements, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fslt.d))]
+pub unsafe fn __msa_fslt_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fslt_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Not Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are not equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsne.w))]
+pub unsafe fn __msa_fsne_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsne_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Not Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are not equal, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsne.d))]
+pub unsafe fn __msa_fsne_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsne_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Ordered
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are ordered,
+/// i.e. both elements are not NaN values, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsor.w))]
+pub unsafe fn __msa_fsor_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsor_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Ordered
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are ordered,
+/// i.e. both elements are not NaN values, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsor.d))]
+pub unsafe fn __msa_fsor_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsor_d(a, mem::transmute(b))
+}
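+
+// Sketch: the compare intrinsics return per-lane masks, i.e. lanes of all
+// ones (-1 as a signed integer) where the predicate holds and 0 elsewhere.
+//
+//     let a: v4f32 = mem::transmute([1.0f32, 2.0, 3.0, 4.0]);
+//     let b: v4f32 = mem::transmute([1.0f32, 0.0, 3.0, 0.0]);
+//     let m: [i32; 4] = mem::transmute(__msa_fseq_w(a, b)); // [-1, 0, -1, 0]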
+
+/// Vector Floating-Point Square Root
+///
+/// The square roots of floating-point elements in vector `a`
+/// (four 32-bit floating point numbers) are written to vector
+/// (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsqrt.w))]
+pub unsafe fn __msa_fsqrt_w(a: v4f32) -> v4f32 {
+ msa_fsqrt_w(a)
+}
+
+/// Vector Floating-Point Square Root
+///
+/// The square roots of floating-point elements in vector `a`
+/// (two 64-bit floating point numbers) are written to vector
+/// (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsqrt.d))]
+pub unsafe fn __msa_fsqrt_d(a: v2f64) -> v2f64 {
+ msa_fsqrt_d(a)
+}
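+
+// Sketch: unlike the `__msa_frsqrt_w` estimate, the square root here is
+// exact, so representable results can be compared for equality.
+//
+//     let a: v4f32 = mem::transmute([4.0f32, 9.0, 16.0, 25.0]);
+//     let r: [f32; 4] = mem::transmute(__msa_fsqrt_w(a)); // [2.0, 3.0, 4.0, 5.0]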
+
+/// Vector Floating-Point Subtraction
+///
+/// The floating-point elements in vector `b` (four 32-bit floating point numbers)
+/// are subtracted from the floating-point elements in vector `a`
+/// (four 32-bit floating point numbers).
+/// The result is written to vector (four 32-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsub.w))]
+pub unsafe fn __msa_fsub_w(a: v4f32, b: v4f32) -> v4f32 {
+ msa_fsub_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Subtraction
+///
+/// The floating-point elements in vector `b` (two 64-bit floating point numbers)
+/// are subtracted from the floating-point elements in vector `a`
+/// (two 64-bit floating point numbers).
+/// The result is written to vector (two 64-bit floating point numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsub.d))]
+pub unsafe fn __msa_fsub_d(a: v2f64, b: v2f64) -> v2f64 {
+ msa_fsub_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are unordered or equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsueq.w))]
+pub unsafe fn __msa_fsueq_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsueq_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are unordered or equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsueq.d))]
+pub unsafe fn __msa_fsueq_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsueq_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Less or Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) elements are
+/// unordered or less than or equal to `b` (four 32-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsule.w))]
+pub unsafe fn __msa_fsule_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsule_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Less or Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) elements are
+/// unordered or less than or equal to `b` (two 64-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsule.d))]
+pub unsafe fn __msa_fsule_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsule_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Less Than
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) elements
+/// are unordered or less than `b` (four 32-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsult.w))]
+pub unsafe fn __msa_fsult_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsult_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Less Than
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) elements
+/// are unordered or less than `b` (two 64-bit floating point numbers) elements,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsult.d))]
+pub unsafe fn __msa_fsult_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsult_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are unordered,
+/// i.e. at least one element is a NaN value, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsun.w))]
+pub unsafe fn __msa_fsun_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsun_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are unordered,
+/// i.e. at least one element is a NaN value, otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsun.d))]
+pub unsafe fn __msa_fsun_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsun_d(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Not Equal
+///
+/// Set all bits to 1 in vector (four signed 32-bit integer numbers) elements
+/// if the corresponding `a` (four 32-bit floating point numbers) and
+/// `b` (four 32-bit floating point numbers) elements are unordered or not equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsune.w))]
+pub unsafe fn __msa_fsune_w(a: v4f32, b: v4f32) -> v4i32 {
+ msa_fsune_w(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Signaling Compare Unordered or Not Equal
+///
+/// Set all bits to 1 in vector (two signed 64-bit integer numbers) elements
+/// if the corresponding `a` (two 64-bit floating point numbers) and
+/// `b` (two 64-bit floating point numbers) elements are unordered or not equal,
+/// otherwise set all bits to 0.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(fsune.d))]
+pub unsafe fn __msa_fsune_d(a: v2f64, b: v2f64) -> v2i64 {
+ msa_fsune_d(a, mem::transmute(b))
+}
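+
+// Sketch: the unordered predicates hold whenever at least one operand lane
+// is NaN.
+//
+//     let a: v4f32 = mem::transmute([f32::NAN, 1.0, 2.0, 3.0]);
+//     let b: v4f32 = mem::transmute([0.0f32, f32::NAN, 2.0, 4.0]);
+//     let m: [i32; 4] = mem::transmute(__msa_fsun_w(a, b)); // [-1, -1, 0, 0]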
+
+/// Vector Floating-Point Convert to Signed Integer
+///
+/// The elements in vector `a` (four 32-bit floating point numbers)
+/// are rounded and converted to signed integer values based on the
+/// rounding mode bits RM in MSA Control and Status Register MSACSR.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftint_s.w))]
+pub unsafe fn __msa_ftint_s_w(a: v4f32) -> v4i32 {
+ msa_ftint_s_w(a)
+}
+
+/// Vector Floating-Point Convert to Signed Integer
+///
+/// The elements in vector `a` (two 64-bit floating point numbers)
+/// are rounded and converted to signed integer values based on the
+/// rounding mode bits RM in MSA Control and Status Register MSACSR.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftint_s.d))]
+pub unsafe fn __msa_ftint_s_d(a: v2f64) -> v2i64 {
+ msa_ftint_s_d(a)
+}
+
+/// Vector Floating-Point Convert to Unsigned Integer
+///
+/// The elements in vector `a` (four 32-bit floating point numbers)
+/// are rounded and converted to unsigned integer values based on the
+/// rounding mode bits RM in MSA Control and Status Register MSACSR.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftint_u.w))]
+pub unsafe fn __msa_ftint_u_w(a: v4f32) -> v4u32 {
+ msa_ftint_u_w(a)
+}
+
+/// Vector Floating-Point Convert to Unsigned Integer
+///
+/// The elements in vector `a` (two 64-bit floating point numbers)
+/// are rounded and converted to unsigned integer values based on the
+/// rounding mode bits RM in MSA Control and Status Register MSACSR.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftint_u.d))]
+pub unsafe fn __msa_ftint_u_d(a: v2f64) -> v2u64 {
+ msa_ftint_u_d(a)
+}
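+
+// Sketch, again assuming the default round-to-nearest (ties to even) mode:
+//
+//     let a: v4f32 = mem::transmute([0.5f32, 1.5, 2.7, -1.2]);
+//     let r: [i32; 4] = mem::transmute(__msa_ftint_s_w(a)); // [0, 2, 3, -1]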
+
+/// Vector Floating-Point Convert to Fixed-Point
+///
+/// The elements in vector `a` (four 32-bit floating point numbers)
+/// and `b` (four 32-bit floating point numbers) are down-converted to a fixed-point
+/// representation, i.e. from 32-bit floating-point to 16-bit Q15 fixed-point representation.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftq.h))]
+pub unsafe fn __msa_ftq_h(a: v4f32, b: v4f32) -> v8i16 {
+ msa_ftq_h(a, mem::transmute(b))
+}
+
+/// Vector Floating-Point Convert to Fixed-Point
+///
+/// The elements in vector `a` (two 64-bit floating point numbers)
+/// and `b` (two 64-bit floating point numbers) are down-converted to a fixed-point
+/// representation, i.e. from 64-bit floating-point to 32-bit Q31 fixed-point representation.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftq.w))]
+pub unsafe fn __msa_ftq_w(a: v2f64, b: v2f64) -> v4i32 {
+ msa_ftq_w(a, mem::transmute(b))
+}
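+
+// Sketch: in Q15 a value x is encoded as x * 2^15, so 0.5 becomes 16384,
+// while values at or above 1.0 saturate to 32767 (i16::MAX).
+//
+//     let a: v4f32 = mem::transmute([0.5f32; 4]);
+//     let b: v4f32 = mem::transmute([1.0f32; 4]);
+//     let r: [i16; 8] = mem::transmute(__msa_ftq_h(a, b));
+//     // four lanes hold 16384 and the other four saturate to 32767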
+
+/// Vector Floating-Point Truncate and Convert to Signed Integer
+///
+/// The elements in vector `a` (four 32-bit floating point numbers)
+/// are truncated, i.e. rounded toward zero, to signed integer values.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftrunc_s.w))]
+pub unsafe fn __msa_ftrunc_s_w(a: v4f32) -> v4i32 {
+ msa_ftrunc_s_w(a)
+}
+
+/// Vector Floating-Point Truncate and Convert to Signed Integer
+///
+/// The elements in vector `a` (two 64-bit floating point numbers)
+/// are truncated, i.e. rounded toward zero, to signed integer values.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftrunc_s.d))]
+pub unsafe fn __msa_ftrunc_s_d(a: v2f64) -> v2i64 {
+ msa_ftrunc_s_d(a)
+}
+
+/// Vector Floating-Point Truncate and Convert to Unsigned Integer
+///
+/// The elements in vector `a` (four 32-bit floating point numbers)
+/// are truncated, i.e. rounded toward zero, to unsigned integer values.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftrunc_u.w))]
+pub unsafe fn __msa_ftrunc_u_w(a: v4f32) -> v4u32 {
+ msa_ftrunc_u_w(a)
+}
+
+/// Vector Floating-Point Truncate and Convert to Unsigned Integer
+///
+/// The elements in vector `a` (two 64-bit floating point numbers)
+/// are truncated, i.e. rounded toward zero, to unsigned integer values.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ftrunc_u.d))]
+pub unsafe fn __msa_ftrunc_u_d(a: v2f64) -> v2u64 {
+ msa_ftrunc_u_d(a)
+}
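+
+// Sketch: truncation always rounds toward zero, independent of the MSACSR
+// rounding mode (contrast with the `__msa_ftint_s_w` sketch above).
+//
+//     let a: v4f32 = mem::transmute([1.9f32, -1.9, 0.5, -0.5]);
+//     let r: [i32; 4] = mem::transmute(__msa_ftrunc_s_w(a)); // [1, -1, 0, 0]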
+
+/// Vector Signed Horizontal Add
+///
+/// The sign-extended odd elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are added to the sign-extended even elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hadd_s.h))]
+pub unsafe fn __msa_hadd_s_h(a: v16i8, b: v16i8) -> v8i16 {
+ msa_hadd_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Horizontal Add
+///
+/// The sign-extended odd elements in vector `a` (eight signed 16-bit integer numbers)
+/// are added to the sign-extended even elements in vector `b` (eight signed 16-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hadd_s.w))]
+pub unsafe fn __msa_hadd_s_w(a: v8i16, b: v8i16) -> v4i32 {
+ msa_hadd_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Horizontal Add
+///
+/// The sign-extended odd elements in vector `a` (four signed 32-bit integer numbers)
+/// are added to the sign-extended even elements in vector `b` (four signed 32-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hadd_s.d))]
+pub unsafe fn __msa_hadd_s_d(a: v4i32, b: v4i32) -> v2i64 {
+ msa_hadd_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Horizontal Add
+///
+/// The zero-extended odd elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are added to the zero-extended even elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hadd_u.h))]
+pub unsafe fn __msa_hadd_u_h(a: v16u8, b: v16u8) -> v8u16 {
+ msa_hadd_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Horizontal Add
+///
+/// The zero-extended odd elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are added to the zero-extended even elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hadd_u.w))]
+pub unsafe fn __msa_hadd_u_w(a: v8u16, b: v8u16) -> v4u32 {
+ msa_hadd_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Horizontal Add
+///
+/// The zero-extended odd elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are added to the zero-extended even elements in vector `b` (four unsigned 32-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hadd_u.d))]
+pub unsafe fn __msa_hadd_u_d(a: v4u32, b: v4u32) -> v2u64 {
+ msa_hadd_u_d(a, mem::transmute(b))
+}
+
+/// Vector Signed Horizontal Subtract
+///
+/// The sign-extended even elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// are subtracted from the sign-extended odd elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hsub_s.h))]
+pub unsafe fn __msa_hsub_s_h(a: v16i8, b: v16i8) -> v8i16 {
+ msa_hsub_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Horizontal Subtract
+///
+/// The sign-extended even elements in vector `b` (eight signed 16-bit integer numbers)
+/// are subtracted from the sign-extended odd elements in vector `a` (eight signed 16-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hsub_s.w))]
+pub unsafe fn __msa_hsub_s_w(a: v8i16, b: v8i16) -> v4i32 {
+ msa_hsub_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Horizontal Subtract
+///
+/// The sign-extended even elements in vector `b` (four signed 32-bit integer numbers)
+/// are subtracted from the sign-extended odd elements in vector `a` (four signed 32-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hsub_s.d))]
+pub unsafe fn __msa_hsub_s_d(a: v4i32, b: v4i32) -> v2i64 {
+ msa_hsub_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Horizontal Subtract
+///
+/// The zero-extended even elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// are subtracted from the zero-extended odd elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hsub_u.h))]
+pub unsafe fn __msa_hsub_u_h(a: v16u8, b: v16u8) -> v8i16 {
+ msa_hsub_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Horizontal Subtract
+///
+/// The zero-extended even elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// are subtracted from the zero-extended odd elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hsub_u.w))]
+pub unsafe fn __msa_hsub_u_w(a: v8u16, b: v8u16) -> v4i32 {
+ msa_hsub_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Horizontal Subtract
+///
+/// The zero-extended even elements in vector `b` (four unsigned 32-bit integer numbers)
+/// are subtracted from the zero-extended odd elements in vector `a` (four unsigned 32-bit integer numbers)
+/// producing a result twice the size of the input operands.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(hsub_u.d))]
+pub unsafe fn __msa_hsub_u_d(a: v4u32, b: v4u32) -> v2i64 {
+ msa_hsub_u_d(a, mem::transmute(b))
+}
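+
+// Sketch: with uniform lanes the odd/even pairing drops out, which keeps the
+// example independent of element ordering.
+//
+//     let a: v16i8 = mem::transmute([5i8; 16]);
+//     let b: v16i8 = mem::transmute([2i8; 16]);
+//     let add: [i16; 8] = mem::transmute(__msa_hadd_s_h(a, b)); // [7; 8]
+//     let sub: [i16; 8] = mem::transmute(__msa_hsub_s_h(a, b)); // [3; 8]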
+
+/// Vector Interleave Even
+///
+/// Even elements in vectors `a` (sixteen signed 8-bit integer numbers)
+/// and `b` (sixteen signed 8-bit integer numbers) are copied to the result
+/// (sixteen signed 8-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvev.b))]
+pub unsafe fn __msa_ilvev_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_ilvev_b(a, mem::transmute(b))
+}
+
+/// Vector Interleave Even
+///
+/// Even elements in vectors `a` (eight signed 16-bit integer numbers)
+/// and `b` (eight signed 16-bit integer numbers) are copied to the result
+/// (eight signed 16-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvev.h))]
+pub unsafe fn __msa_ilvev_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_ilvev_h(a, mem::transmute(b))
+}
+
+/// Vector Interleave Even
+///
+/// Even elements in vectors `a` (four signed 32-bit integer numbers)
+/// and `b` (four signed 32-bit integer numbers) are copied to the result
+/// (four signed 32-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvev.w))]
+pub unsafe fn __msa_ilvev_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_ilvev_w(a, mem::transmute(b))
+}
+
+/// Vector Interleave Even
+///
+/// Even elements in vectors `a` (two signed 64-bit integer numbers)
+/// and `b` (two signed 64-bit integer numbers) are copied to the result
+/// (two signed 64-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvev.d))]
+pub unsafe fn __msa_ilvev_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_ilvev_d(a, mem::transmute(b))
+}
+
+/// Vector Interleave Left
+///
+/// The left half elements in vectors `a` (sixteen signed 8-bit integer numbers)
+/// and `b` (sixteen signed 8-bit integer numbers) are copied to the result
+/// (sixteen signed 8-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvl.b))]
+pub unsafe fn __msa_ilvl_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_ilvl_b(a, mem::transmute(b))
+}
+
+/// Vector Interleave Left
+///
+/// The left half elements in vectors `a` (eight signed 16-bit integer numbers)
+/// and `b` (eight signed 16-bit integer numbers) are copied to the result
+/// (eight signed 16-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvl.h))]
+pub unsafe fn __msa_ilvl_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_ilvl_h(a, mem::transmute(b))
+}
+
+/// Vector Interleave Left
+///
+/// The left half elements in vectors `a` (four signed 32-bit integer numbers)
+/// and `b` (four signed 32-bit integer numbers) are copied to the result
+/// (four signed 32-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvl.w))]
+pub unsafe fn __msa_ilvl_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_ilvl_w(a, mem::transmute(b))
+}
+
+/// Vector Interleave Left
+///
+/// The left half elements in vectors `a` (two signed 64-bit integer numbers)
+/// and `b` (two signed 64-bit integer numbers) are copied to the result
+/// (two signed 64-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvl.d))]
+pub unsafe fn __msa_ilvl_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_ilvl_d(a, mem::transmute(b))
+}
+
+/// Vector Interleave Odd
+///
+/// Odd elements in vectors `a` (sixteen signed 8-bit integer numbers)
+/// and `b` (sixteen signed 8-bit integer numbers) are copied to the result
+/// (sixteen signed 8-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvod.b))]
+pub unsafe fn __msa_ilvod_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_ilvod_b(a, mem::transmute(b))
+}
+
+/// Vector Interleave Odd
+///
+/// Odd elements in vectors `a` (eight signed 16-bit integer numbers)
+/// and `b` (eight signed 16-bit integer numbers) are copied to the result
+/// (eight signed 16-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvod.h))]
+pub unsafe fn __msa_ilvod_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_ilvod_h(a, mem::transmute(b))
+}
+
+/// Vector Interleave Odd
+///
+/// Odd elements in vectors `a` (four signed 32-bit integer numbers)
+/// and `b` (four signed 32-bit integer numbers) are copied to the result
+/// (four signed 32-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvod.w))]
+pub unsafe fn __msa_ilvod_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_ilvod_w(a, mem::transmute(b))
+}
+
+/// Vector Interleave Odd
+///
+/// Odd elements in vectors `a` (two signed 64-bit integer numbers)
+/// and `b` (two signed 64-bit integer numbers) are copied to the result
+/// (two signed 64-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvod.d))]
+pub unsafe fn __msa_ilvod_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_ilvod_d(a, mem::transmute(b))
+}
+
+/// Vector Interleave Right
+///
+/// The right half elements in vectors `a` (sixteen signed 8-bit integer numbers)
+/// and `b` (sixteen signed 8-bit integer numbers) are copied to the result
+/// (sixteen signed 8-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvr.b))]
+pub unsafe fn __msa_ilvr_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_ilvr_b(a, mem::transmute(b))
+}
+
+/// Vector Interleave Right
+///
+/// The right half elements in vectors `a` (eight signed 16-bit integer numbers)
+/// and `b` (eight signed 16-bit integer numbers) are copied to the result
+/// (eight signed 16-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvr.h))]
+pub unsafe fn __msa_ilvr_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_ilvr_h(a, mem::transmute(b))
+}
+
+/// Vector Interleave Right
+///
+/// The right half elements in vectors `a` (four signed 32-bit integer numbers)
+/// and `b` (four signed 32-bit integer numbers) are copied to the result
+/// (four signed 32-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvr.w))]
+pub unsafe fn __msa_ilvr_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_ilvr_w(a, mem::transmute(b))
+}
+
+/// Vector Interleave Right
+///
+/// The right half elements in vectors `a` (two signed 64-bit integer numbers)
+/// and `b` (two signed 64-bit integer numbers) are copied to the result
+/// (two signed 64-bit integer numbers)
+/// alternating one element from `a` with one element from `b`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ilvr.d))]
+pub unsafe fn __msa_ilvr_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_ilvr_d(a, mem::transmute(b))
+}
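+
+// Sketch: all four interleave families alternate lanes drawn from `a` and
+// `b`; they differ only in which source lanes (even, odd, left half, right
+// half) feed the result.
+//
+//     let a: v16i8 = mem::transmute([1i8; 16]);
+//     let b: v16i8 = mem::transmute([2i8; 16]);
+//     let r: [i8; 16] = mem::transmute(__msa_ilvev_b(a, b));
+//     // lanes of `r` alternate between 1 and 2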
+
+/// GPR Insert Element
+///
+/// Set element `imm4` in vector `a` (sixteen signed 8-bit integer numbers) to GPR `c` value.
+/// All other elements in vector `a` are unchanged. If the source GPR is wider than the
+/// destination data format, the destination's elements will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(insert.b, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_insert_b<const IMM4: i32>(a: v16i8, c: i32) -> v16i8 {
+ static_assert_imm4!(IMM4);
+ msa_insert_b(a, IMM4, c)
+}
+
+/// GPR Insert Element
+///
+/// Set element `imm3` in vector `a` (eight signed 16-bit integer numbers) to GPR `c` value.
+/// All other elements in vector `a` are unchanged. If the source GPR is wider than the
+/// destination data format, the destination's elements will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(insert.h, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_insert_h<const IMM3: i32>(a: v8i16, c: i32) -> v8i16 {
+ static_assert_imm3!(IMM3);
+ msa_insert_h(a, IMM3, c)
+}
+
+/// GPR Insert Element
+///
+/// Set element `imm2` in vector `a` (four signed 32-bit integer numbers) to GPR `c` value.
+/// All other elements in vector `a` are unchanged. If the source GPR is wider than the
+/// destination data format, the destination's elements will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(insert.w, imm2 = 0b11))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_insert_w<const IMM2: i32>(a: v4i32, c: i32) -> v4i32 {
+ static_assert_imm2!(IMM2);
+ msa_insert_w(a, IMM2, c)
+}
+
+/// GPR Insert Element
+///
+/// Set element `imm1` in vector `a` (two signed 64-bit integer numbers) to GPR `c` value.
+/// All other elements in vector `a` are unchanged. If the source GPR is wider than the
+/// destination data format, the destination's elements will be set to the least significant bits of the GPR.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(insert.d, imm1 = 0b1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_insert_d<const IMM1: i32>(a: v2i64, c: i64) -> v2i64 {
+ static_assert_imm1!(IMM1);
+ msa_insert_d(a, IMM1, c)
+}
+
+/// Element Insert Element
+///
+/// Set element `imm4` in the result vector `a` (sixteen signed 8-bit integer numbers)
+/// to the value of element 0 in vector `c` (sixteen signed 8-bit integer numbers).
+/// All other elements in vector `a` are unchanged.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(insve.b, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_insve_b<const IMM4: i32>(a: v16i8, c: v16i8) -> v16i8 {
+ static_assert_imm4!(IMM4);
+ msa_insve_b(a, IMM4, c)
+}
+
+/// Element Insert Element
+///
+/// Set element `imm3` in the result vector `a` (eight signed 16-bit integer numbers)
+/// to the value of element 0 in vector `c` (eight signed 16-bit integer numbers).
+/// All other elements in vector `a` are unchanged.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(insve.h, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_insve_h<const IMM3: i32>(a: v8i16, c: v8i16) -> v8i16 {
+ static_assert_imm3!(IMM3);
+ msa_insve_h(a, IMM3, c)
+}
+
+/// Element Insert Element
+///
+/// Set element `imm2` in the result vector `a` (four signed 32-bit integer numbers)
+/// to the value of element 0 in vector `c` (four signed 32-bit integer numbers).
+/// All other elements in vector `a` are unchanged.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(insve.w, imm2 = 0b11))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_insve_w<const IMM2: i32>(a: v4i32, c: v4i32) -> v4i32 {
+ static_assert_imm2!(IMM2);
+ msa_insve_w(a, IMM2, c)
+}
+
+/// Element Insert Element
+///
+/// Set element `imm1` in the result vector `a` (two signed 64-bit integer numbers)
+/// to the value of element 0 in vector `c` (two signed 64-bit integer numbers).
+/// All other elements in vector `a` are unchanged.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(insve.d, imm1 = 0b1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_insve_d<const IMM1: i32>(a: v2i64, c: v2i64) -> v2i64 {
+ static_assert_imm1!(IMM1);
+ msa_insve_d(a, IMM1, c)
+}
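+
+// Sketch: the element index is a const generic, so it can be supplied with
+// turbofish syntax; only the addressed lane changes.
+//
+//     let a: v16i8 = mem::transmute([0i8; 16]);
+//     let r: [i8; 16] = mem::transmute(__msa_insert_b::<3>(a, 42));
+//     // r[3] == 42, every other lane stays 0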
+
+/// Vector Load
+///
+/// The WRLEN / 8 bytes at the effective memory location addressed by the base
+/// `mem_addr` and the 10-bit signed immediate offset `imm_s10` are fetched and placed in
+/// the vector (sixteen signed 8-bit integer numbers) value.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ld.b, imm_s10 = 0b1111111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ld_b<const IMM_S10: i32>(mem_addr: *mut u8) -> v16i8 {
+ static_assert_imm_s10!(IMM_S10);
+ msa_ld_b(mem_addr, IMM_S10)
+}
+
+/// Vector Load
+///
+/// The WRLEN / 8 bytes at the effective memory location addressed by the base
+/// `mem_addr` and the 11-bit signed immediate offset `imm_s11` (a multiple of 2) are fetched and placed in
+/// the vector (eight signed 16-bit integer numbers) value.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ld.h, imm_s11 = 0b11111111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ld_h<const IMM_S11: i32>(mem_addr: *mut u8) -> v8i16 {
+ static_assert_imm_s11!(IMM_S11);
+ static_assert!(IMM_S11: i32 where IMM_S11 % 2 == 0);
+ msa_ld_h(mem_addr, IMM_S11)
+}
+
+/// Vector Load
+///
+/// The WRLEN / 8 bytes at the effective memory location addressed by the base
+/// `mem_addr` and the 12-bit signed immediate offset `imm_s12` (a multiple of 4) are fetched and placed in
+/// the vector (four signed 32-bit integer numbers) value.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ld.w, imm_s12 = 0b111111111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ld_w<const IMM_S12: i32>(mem_addr: *mut u8) -> v4i32 {
+ static_assert_imm_s12!(IMM_S12);
+ static_assert!(IMM_S12: i32 where IMM_S12 % 4 == 0);
+ msa_ld_w(mem_addr, IMM_S12)
+}
+
+/// Vector Load
+///
+/// The WRLEN / 8 bytes at the effective memory location addressed by the base
+/// `mem_addr` and the 13-bit signed immediate offset `imm_s13` (a multiple of 8) are fetched and placed in
+/// the vector (two signed 64-bit integer numbers) value.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ld.d, imm_s13 = 0b1111111111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ld_d<const IMM_S13: i32>(mem_addr: *mut u8) -> v2i64 {
+ static_assert_imm_s13!(IMM_S13);
+ static_assert!(IMM_S13: i32 where IMM_S13 % 8 == 0);
+ msa_ld_d(mem_addr, IMM_S13)
+}
+
+/// Immediate Load
+///
+/// The signed immediate imm_s10 is replicated in all vector
+/// (sixteen signed 8-bit integer numbers) elements. For byte elements,
+/// only the least significant 8 bits of imm_s10 will be used.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ldi.b, imm_s10 = 0b1111111111))]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __msa_ldi_b<const IMM_S10: i32>() -> v16i8 {
+ static_assert_imm_s10!(IMM_S10);
+ msa_ldi_b(IMM_S10)
+}
+
+/// Immediate Load
+///
+/// The signed immediate imm_s10 is sign-extended and replicated in all vector
+/// (eight signed 16-bit integer numbers) elements.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ldi.h, imm_s10 = 0b1111111111))]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __msa_ldi_h<const IMM_S10: i32>() -> v8i16 {
+ static_assert_imm_s10!(IMM_S10);
+ msa_ldi_h(IMM_S10)
+}
+
+/// Immediate Load
+///
+/// The signed immediate imm_s10 is sign-extended and replicated in all vector
+/// (four signed 32-bit integer numbers) elements.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ldi.w, imm_s10 = 0b1111111111))]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __msa_ldi_w<const IMM_S10: i32>() -> v4i32 {
+ static_assert_imm_s10!(IMM_S10);
+ msa_ldi_w(IMM_S10)
+}
+
+/// Immediate Load
+///
+/// The signed immediate imm_s10 is sign-extended and replicated in all vector
+/// (two signed 64-bit integer numbers) elements.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ldi.d, imm_s10 = 0b1111111111))]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __msa_ldi_d<const IMM_S10: i32>() -> v2i64 {
+ static_assert_imm_s10!(IMM_S10);
+ msa_ldi_d(IMM_S10)
+}
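+
+// Sketch: a vector load with offset 0 plus an immediate splat. The offset is
+// a byte offset and must respect the alignment multiples asserted above.
+//
+//     let buf = [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+//     let v: v16i8 = __msa_ld_b::<0>(buf.as_ptr() as *mut u8);
+//     let ones: [i8; 16] = mem::transmute(__msa_ldi_b::<1>()); // [1; 16]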
+
+/// Vector Fixed-Point Multiply and Add
+///
+/// The products of fixed-point elements in `b` (eight signed 16-bit integer numbers)
+/// by fixed-point elements in vector `c` (eight signed 16-bit integer numbers)
+/// are added to the fixed-point elements in vector `a` (eight signed 16-bit integer numbers).
+/// The multiplication result is not saturated, i.e. exact (-1) * (-1) = 1 is added to the destination.
+/// The saturated fixed-point results are stored to vector `a`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(madd_q.h))]
+pub unsafe fn __msa_madd_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 {
+ msa_madd_q_h(a, mem::transmute(b), c)
+}
+
+/// Vector Fixed-Point Multiply and Add
+///
+/// The products of fixed-point elements in `b` (four signed 32-bit integer numbers)
+/// by fixed-point elements in vector `c` (four signed 32-bit integer numbers)
+/// are added to the fixed-point elements in vector `a` (four signed 32-bit integer numbers).
+/// The multiplication result is not saturated, i.e. exact (-1) * (-1) = 1 is added to the destination.
+/// The saturated fixed-point results are stored to vector `a`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(madd_q.w))]
+pub unsafe fn __msa_madd_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 {
+ msa_madd_q_w(a, mem::transmute(b), c)
+}
+
+/// Vector Fixed-Point Multiply and Add Rounded
+///
+/// The products of fixed-point elements in `b` (eight signed 16-bit integer numbers)
+/// by fixed-point elements in vector `c` (eight signed 16-bit integer numbers)
+/// are added to the fixed-point elements in vector `a` (eight signed 16-bit integer numbers).
+/// The multiplication result is not saturated, i.e. exact (-1) * (-1) = 1 is added to the destination.
+/// The rounded and saturated fixed-point results are stored to vector `a`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maddr_q.h))]
+pub unsafe fn __msa_maddr_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 {
+ msa_maddr_q_h(a, mem::transmute(b), c)
+}
+
+/// Vector Fixed-Point Multiply and Add Rounded
+///
+/// The products of fixed-point elements in `b` (four signed 32-bit integer numbers)
+/// by fixed-point elements in vector `c` (four signed 32-bit integer numbers)
+/// are added to the fixed-point elements in vector `a` (four signed 32-bit integer numbers).
+/// The multiplication result is not saturated, i.e. exact (-1) * (-1) = 1 is added to the destination.
+/// The rounded and saturated fixed-point results are stored to vector `a`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maddr_q.w))]
+pub unsafe fn __msa_maddr_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 {
+ msa_maddr_q_w(a, mem::transmute(b), c)
+}
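+
+// Sketch: in Q15, 0.5 is 16384 and 0.5 * 0.5 = 0.25 is 8192, so a zeroed
+// accumulator picks up 8192 in every lane.
+//
+//     let a: v8i16 = mem::transmute([0i16; 8]);
+//     let b: v8i16 = mem::transmute([16384i16; 8]);
+//     let r: [i16; 8] = mem::transmute(__msa_madd_q_h(a, b, b)); // [8192; 8]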
+
+/// Vector Multiply and Add
+///
+/// The integer elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// are multiplied by integer elements in vector `c` (sixteen signed 8-bit integer numbers)
+/// and added to the integer elements in vector `a` (sixteen signed 8-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maddv.b))]
+pub unsafe fn __msa_maddv_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 {
+ msa_maddv_b(a, mem::transmute(b), c)
+}
+
+/// Vector Multiply and Add
+///
+/// The integer elements in vector `b` (eight signed 16-bit integer numbers)
+/// are multiplied by integer elements in vector `c` (eight signed 16-bit integer numbers)
+/// and added to the integer elements in vector `a` (eight signed 16-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maddv.h))]
+pub unsafe fn __msa_maddv_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 {
+ msa_maddv_h(a, mem::transmute(b), c)
+}
+
+/// Vector Multiply and Add
+///
+/// The integer elements in vector `b` (four signed 32-bit integer numbers)
+/// are multiplied by integer elements in vector `c` (four signed 32-bit integer numbers)
+/// and added to the integer elements in vector `a` (four signed 32-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maddv.w))]
+pub unsafe fn __msa_maddv_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 {
+ msa_maddv_w(a, mem::transmute(b), c)
+}
+
+/// Vector Multiply and Add
+///
+/// The integer elements in vector `b` (two signed 64-bit integer numbers)
+/// are multiplied by integer elements in vector `c` (two signed 64-bit integer numbers)
+/// and added to the integer elements in vector `a` (two signed 64-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maddv.d))]
+pub unsafe fn __msa_maddv_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 {
+ msa_maddv_d(a, mem::transmute(b), c)
+}
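+
+// Sketch: plain modular multiply-add, computing a + b * c in every lane.
+//
+//     let a: v4i32 = mem::transmute([1i32; 4]);
+//     let b: v4i32 = mem::transmute([2i32; 4]);
+//     let c: v4i32 = mem::transmute([3i32; 4]);
+//     let r: [i32; 4] = mem::transmute(__msa_maddv_w(a, b, c)); // [7; 4]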
+
+/// Vector Maximum Based on Absolute Values
+///
+/// The values with the largest magnitude, i.e. absolute value, between corresponding
+/// signed elements in vector `a` (sixteen signed 8-bit integer numbers) and
+/// `b` (sixteen signed 8-bit integer numbers) are written to vector
+/// (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_a.b))]
+pub unsafe fn __msa_max_a_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_max_a_b(a, mem::transmute(b))
+}
+
+/// Vector Maximum Based on Absolute Values
+///
+/// The values with the largest magnitude, i.e. absolute value, between corresponding
+/// signed elements in vector `a` (eight signed 16-bit integer numbers) and
+/// `b` (eight signed 16-bit integer numbers) are written to vector
+/// (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_a.h))]
+pub unsafe fn __msa_max_a_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_max_a_h(a, mem::transmute(b))
+}
+
+/// Vector Maximum Based on Absolute Values
+///
+/// The values with the largest magnitude, i.e. absolute value, between corresponding
+/// signed elements in vector `a` (four signed 32-bit integer numbers) and
+/// `b` (four signed 32-bit integer numbers) are written to vector
+/// (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_a.w))]
+pub unsafe fn __msa_max_a_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_max_a_w(a, mem::transmute(b))
+}
+
+/// Vector Maximum Based on Absolute Values
+///
+/// The values with the largest magnitude, i.e. absolute value, between corresponding
+/// signed elements in vector `a` (two signed 64-bit integer numbers) and
+/// `b` (two signed 64-bit integer numbers) are written to vector
+/// (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_a.d))]
+pub unsafe fn __msa_max_a_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_max_a_d(a, mem::transmute(b))
+}
+
+/// Vector Signed Maximum
+///
+/// Maximum values between signed elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// and signed elements in vector `b` (sixteen signed 8-bit integer numbers) are written to vector
+/// (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_s.b))]
+pub unsafe fn __msa_max_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_max_s_b(a, mem::transmute(b))
+}
+
+/// Vector Signed Maximum
+///
+/// Maximum values between signed elements in vector `a` (eight signed 16-bit integer numbers)
+/// and signed elements in vector `b` (eight signed 16-bit integer numbers) are written to vector
+/// (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_s.h))]
+pub unsafe fn __msa_max_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_max_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Maximum
+///
+/// Maximum values between signed elements in vector `a` (four signed 32-bit integer numbers)
+/// and signed elements in vector `b` (four signed 32-bit integer numbers) are written to vector
+/// (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_s.w))]
+pub unsafe fn __msa_max_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_max_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Maximum
+///
+/// Maximum values between signed elements in vector `a` (two signed 64-bit integer numbers)
+/// and signed elements in vector `b` (two signed 64-bit integer numbers) are written to vector
+/// (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_s.d))]
+pub unsafe fn __msa_max_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_max_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Maximum
+///
+/// Maximum values between unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// and unsigned elements in vector `b` (sixteen unsigned 8-bit integer numbers) are written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_u.b))]
+pub unsafe fn __msa_max_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_max_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Maximum
+///
+/// Maximum values between unsigned elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// and unsigned elements in vector `b` (eight unsigned 16-bit integer numbers) are written to vector
+/// (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_u.h))]
+pub unsafe fn __msa_max_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_max_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Maximum
+///
+/// Maximum values between unsigned elements in vector `a` (four unsigned 32-bit integer numbers)
+/// and unsigned elements in vector `b` (four unsigned 32-bit integer numbers) are written to vector
+/// (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_u.w))]
+pub unsafe fn __msa_max_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_max_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Maximum
+///
+/// Maximum values between unsigned elements in vector `a` (two unsigned 64-bit integer numbers)
+/// and unsigned elements in vector `b` (two unsigned 64-bit integer numbers) are written to vector
+/// (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(max_u.d))]
+pub unsafe fn __msa_max_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_max_u_d(a, mem::transmute(b))
+}
+
+/// Immediate Signed Maximum
+///
+/// Maximum values between signed elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// and the 5-bit signed immediate imm_s5 are written to vector
+/// (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maxi_s.b, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_maxi_s_b<const IMM_S5: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_maxi_s_b(a, IMM_S5)
+}
+
+/// Immediate Signed Maximum
+///
+/// Maximum values between signed elements in vector `a` (eight signed 16-bit integer numbers)
+/// and the 5-bit signed immediate imm_s5 are written to vector
+/// (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maxi_s.h, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_maxi_s_h<const IMM_S5: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_maxi_s_h(a, IMM_S5)
+}
+
+/// Immediate Signed Maximum
+///
+/// Maximum values between signed elements in vector `a` (four signed 32-bit integer numbers)
+/// and the 5-bit signed immediate imm_s5 are written to vector
+/// (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maxi_s.w, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_maxi_s_w<const IMM_S5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_maxi_s_w(a, IMM_S5)
+}
+
+/// Immediate Signed Maximum
+///
+/// Maximum values between signed elements in vector `a` (two signed 64-bit integer numbers)
+/// and the 5-bit signed immediate imm_s5 are written to vector
+/// (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maxi_s.d, imm_s5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_maxi_s_d<const IMM_S5: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_maxi_s_d(a, IMM_S5)
+}
+
+/// Immediate Unsigned Maximum
+///
+/// Maximum values between unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// and the 5-bit unsigned immediate `imm5` are written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maxi_u.b, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_maxi_u_b<const IMM5: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm5!(IMM5);
+ msa_maxi_u_b(a, IMM5)
+}
+
+/// Immediate Unsigned Maximum
+///
+/// Maximum values between unsigned elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// and the 5-bit unsigned immediate `imm5` are written to vector
+/// (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maxi_u.h, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_maxi_u_h<const IMM5: i32>(a: v8u16) -> v8u16 {
+ static_assert_imm5!(IMM5);
+ msa_maxi_u_h(a, IMM5)
+}
+
+/// Immediate Unsigned Maximum
+///
+/// Maximum values between unsigned elements in vector `a` (four unsigned 32-bit integer numbers)
+/// and the 5-bit unsigned immediate `imm5` are written to vector
+/// (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maxi_u.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_maxi_u_w<const IMM5: i32>(a: v4u32) -> v4u32 {
+ static_assert_imm5!(IMM5);
+ msa_maxi_u_w(a, IMM5)
+}
+
+/// Immediate Unsigned Maximum
+///
+/// Maximum values between unsigned elements in vector `a` (two unsigned 64-bit integer numbers)
+/// and the 5-bit unsigned immediate `imm5` are written to vector
+/// (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(maxi_u.d, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_maxi_u_d<const IMM5: i32>(a: v2u64) -> v2u64 {
+ static_assert_imm5!(IMM5);
+ msa_maxi_u_d(a, IMM5)
+}
+
+/// Vector Minimum Based on Absolute Value
+///
+/// The values with the smallest magnitude, i.e. absolute value, between corresponding
+/// signed elements in vector `a` (sixteen signed 8-bit integer numbers) and
+/// `b` (sixteen signed 8-bit integer numbers) are written to vector
+/// (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_a.b))]
+pub unsafe fn __msa_min_a_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_min_a_b(a, mem::transmute(b))
+}
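+
+// A scalar sketch of one `.b` lane; `min_a_b_lane` is a hypothetical helper,
+// and the tie-breaking choice when |a| == |b| is an assumption, not taken
+// from the MSA specification:
+//
+// fn min_a_b_lane(a: i8, b: i8) -> i8 {
+//     // `unsigned_abs` avoids the i8::MIN overflow that `abs` would hit.
+//     if a.unsigned_abs() <= b.unsigned_abs() { a } else { b }
+// }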
+
+/// Vector Minimum Based on Absolute Value
+///
+/// The values with the smallest magnitude, i.e. absolute value, between corresponding
+/// signed elements in vector `a` (eight signed 16-bit integer numbers) and
+/// `b` (eight signed 16-bit integer numbers) are written to vector
+/// (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_a.h))]
+pub unsafe fn __msa_min_a_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_min_a_h(a, mem::transmute(b))
+}
+
+/// Vector Minimum Based on Absolute Value
+///
+/// The values with the smallest magnitude, i.e. absolute value, between corresponding
+/// signed elements in vector `a` (four signed 32-bit integer numbers) and
+/// `b` (four signed 32-bit integer numbers) are written to vector
+/// (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_a.w))]
+pub unsafe fn __msa_min_a_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_min_a_w(a, mem::transmute(b))
+}
+
+/// Vector Minimum Based on Absolute Value
+///
+/// The values with the smallest magnitude, i.e. absolute value, between corresponding
+/// signed elements in vector `a` (two signed 64-bit integer numbers) and
+/// `b` (two signed 64-bit integer numbers) are written to vector
+/// (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_a.d))]
+pub unsafe fn __msa_min_a_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_min_a_d(a, mem::transmute(b))
+}
+
+/// Vector Signed Minimum
+///
+/// Minimum values between signed elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// and signed elements in vector `b` (sixteen signed 8-bit integer numbers) are written to vector
+/// (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_s.b))]
+pub unsafe fn __msa_min_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_min_s_b(a, mem::transmute(b))
+}
+
+/// Vector Signed Minimum
+///
+/// Minimum values between signed elements in vector `a` (eight signed 16-bit integer numbers)
+/// and signed elements in vector `b` (eight signed 16-bit integer numbers) are written to vector
+/// (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_s.h))]
+pub unsafe fn __msa_min_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_min_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Minimum
+///
+/// Minimum values between signed elements in vector `a` (four signed 32-bit integer numbers)
+/// and signed elements in vector `b` (four signed 32-bit integer numbers) are written to vector
+/// (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_s.w))]
+pub unsafe fn __msa_min_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_min_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Minimum
+///
+/// Minimum values between signed elements in vector `a` (two signed 64-bit integer numbers)
+/// and signed elements in vector `b` (two signed 64-bit integer numbers) are written to vector
+/// (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_s.d))]
+pub unsafe fn __msa_min_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_min_s_d(a, mem::transmute(b))
+}
+
+/// Immediate Signed Minimum
+///
+/// Minimum values between signed elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// and the 5-bit signed immediate `imm_s5` are written to vector
+/// (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mini_s.b, imm_s5 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_mini_s_b<const IMM_S5: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_mini_s_b(a, IMM_S5)
+}
+
+/// Immediate Signed Minimum
+///
+/// Minimum values between signed elements in vector `a` (eight signed 16-bit integer numbers)
+/// and the 5-bit signed immediate `imm_s5` are written to vector
+/// (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mini_s.h, imm_s5 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_mini_s_h<const IMM_S5: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_mini_s_h(a, IMM_S5)
+}
+
+/// Immediate Signed Minimum
+///
+/// Minimum values between signed elements in vector `a` (four signed 32-bit integer numbers)
+/// and the 5-bit signed immediate `imm_s5` are written to vector
+/// (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mini_s.w, imm_s5 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_mini_s_w<const IMM_S5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_mini_s_w(a, IMM_S5)
+}
+
+/// Immediate Signed Minimum
+///
+/// Minimum values between signed elements in vector `a` (two signed 64-bit integer numbers)
+/// and the 5-bit signed immediate `imm_s5` are written to vector
+/// (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mini_s.d, imm_s5 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_mini_s_d<const IMM_S5: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm_s5!(IMM_S5);
+ msa_mini_s_d(a, IMM_S5)
+}
+
+/// Vector Unsigned Minimum
+///
+/// Minimum values between unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// and unsigned elements in vector `b` (sixteen unsigned 8-bit integer numbers) are written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_u.b))]
+pub unsafe fn __msa_min_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_min_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Minimum
+///
+/// Minimum values between unsigned elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// and unsigned elements in vector `b` (eight unsigned 16-bit integer numbers) are written to vector
+/// (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_u.h))]
+pub unsafe fn __msa_min_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_min_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Minimum
+///
+/// Minimum values between unsigned elements in vector `a` (four unsigned 32-bit integer numbers)
+/// and unsigned elements in vector `b` (four unsigned 32-bit integer numbers) are written to vector
+/// (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_u.w))]
+pub unsafe fn __msa_min_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_min_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Minimum
+///
+/// Minimum values between unsigned elements in vector `a` (two unsigned 64-bit integer numbers)
+/// and unsigned elements in vector `b` (two unsigned 64-bit integer numbers) are written to vector
+/// (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(min_u.d))]
+pub unsafe fn __msa_min_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_min_u_d(a, mem::transmute(b))
+}
+
+/// Immediate Unsigned Minimum
+///
+/// Minimum values between unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// and the 5-bit unsigned immediate `imm5` are written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mini_u.b, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_mini_u_b<const IMM5: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm5!(IMM5);
+ msa_mini_u_b(a, IMM5)
+}
+
+/// Immediate Unsigned Minimum
+///
+/// Minimum values between unsigned elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// and the 5-bit unsigned immediate `imm5` are written to vector
+/// (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mini_u.h, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_mini_u_h<const IMM5: i32>(a: v8u16) -> v8u16 {
+ static_assert_imm5!(IMM5);
+ msa_mini_u_h(a, IMM5)
+}
+
+/// Immediate Unsigned Minimum
+///
+/// Minimum values between unsigned elements in vector `a` (four unsigned 32-bit integer numbers)
+/// and the 5-bit unsigned immediate `imm5` are written to vector
+/// (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mini_u.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_mini_u_w<const IMM5: i32>(a: v4u32) -> v4u32 {
+ static_assert_imm5!(IMM5);
+ msa_mini_u_w(a, IMM5)
+}
+
+/// Immediate Unsigned Minimum
+///
+/// Minimum values between unsigned elements in vector `a` (two unsigned 64-bit integer numbers)
+/// and the 5-bit unsigned immediate `imm5` are written to vector
+/// (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mini_u.d, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_mini_u_d<const IMM5: i32>(a: v2u64) -> v2u64 {
+ static_assert_imm5!(IMM5);
+ msa_mini_u_d(a, IMM5)
+}
+
+/// Vector Signed Modulo
+///
+/// The signed integer elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are divided by signed integer elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The remainder of the same sign as the dividend is written to vector
+/// (sixteen signed 8-bit integer numbers). If a divisor element in vector `b` is zero,
+/// the result value is UNPREDICTABLE.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mod_s.b))]
+pub unsafe fn __msa_mod_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_mod_s_b(a, mem::transmute(b))
+}
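+
+// A scalar sketch of one `.b` lane: Rust's remainder on signed integers
+// already takes the sign of the dividend. `wrapping_rem` sidesteps the
+// i8::MIN % -1 overflow; the divisor-is-zero case is UNPREDICTABLE in
+// hardware, so this hypothetical helper leaves it unguarded:
+//
+// fn mod_s_b_lane(a: i8, b: i8) -> i8 {
+//     a.wrapping_rem(b) // remainder carries the sign of `a`
+// }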
+
+/// Vector Signed Modulo
+///
+/// The signed integer elements in vector `a` (eight signed 16-bit integer numbers)
+/// are divided by signed integer elements in vector `b` (eight signed 16-bit integer numbers).
+/// The remainder of the same sign as the dividend is written to vector
+/// (eight signed 16-bit integer numbers). If a divisor element in vector `b` is zero,
+/// the result value is UNPREDICTABLE.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mod_s.h))]
+pub unsafe fn __msa_mod_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_mod_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Modulo
+///
+/// The signed integer elements in vector `a` (four signed 32-bit integer numbers)
+/// are divided by signed integer elements in vector `b` (four signed 32-bit integer numbers).
+/// The remainder of the same sign as the dividend is written to vector
+/// (four signed 32-bit integer numbers). If a divisor element in vector `b` is zero,
+/// the result value is UNPREDICTABLE.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mod_s.w))]
+pub unsafe fn __msa_mod_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_mod_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Modulo
+///
+/// The signed integer elements in vector `a` (two signed 64-bit integer numbers)
+/// are divided by signed integer elements in vector `b` (two signed 64-bit integer numbers).
+/// The remainder of the same sign as the dividend is written to vector
+/// (two signed 64-bit integer numbers). If a divisor element in vector `b` is zero,
+/// the result value is UNPREDICTABLE.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mod_s.d))]
+pub unsafe fn __msa_mod_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_mod_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Modulo
+///
+/// The unsigned integer elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are divided by unsigned integer elements in vector `b` (sixteen unsigned 8-bit integer numbers).
+/// The remainder is written to vector
+/// (sixteen unsigned 8-bit integer numbers). If a divisor element in vector `b` is zero,
+/// the result value is UNPREDICTABLE.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mod_u.b))]
+pub unsafe fn __msa_mod_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_mod_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Modulo
+///
+/// The unsigned integer elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are divided by unsigned integer elements in vector `b` (eight unsigned 16-bit integer numbers).
+/// The remainder is written to vector
+/// (eight unsigned 16-bit integer numbers). If a divisor element in vector `b` is zero,
+/// the result value is UNPREDICTABLE.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mod_u.h))]
+pub unsafe fn __msa_mod_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_mod_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Modulo
+///
+/// The unsigned integer elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are divided by unsigned integer elements in vector `b` (four unsigned 32-bit integer numbers).
+/// The remainder is written to vector
+/// (four unsigned 32-bit integer numbers). If a divisor element in vector `b` is zero,
+/// the result value is UNPREDICTABLE.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mod_u.w))]
+pub unsafe fn __msa_mod_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_mod_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Modulo
+///
+/// The unsigned integer elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are divided by unsigned integer elements in vector `b` (two unsigned 64-bit integer numbers).
+/// The remainder is written to vector
+/// (two unsigned 64-bit integer numbers). If a divisor element in vector `b` is zero,
+/// the result value is UNPREDICTABLE.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mod_u.d))]
+pub unsafe fn __msa_mod_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_mod_u_d(a, mem::transmute(b))
+}
+
+/// Vector Move
+///
+/// Copy all WRLEN bits in vector `a` (sixteen signed 8-bit integer numbers)
+/// to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(move.v))]
+pub unsafe fn __msa_move_v(a: v16i8) -> v16i8 {
+ msa_move_v(a)
+}
+
+/// Vector Fixed-Point Multiply and Subtract
+///
+/// The products of fixed-point elements in vector `c` (eight signed 16-bit integer numbers)
+/// by fixed-point elements in vector `b` (eight signed 16-bit integer numbers)
+/// are subtracted from the fixed-point elements in vector `a`
+/// (eight signed 16-bit integer numbers). The multiplication result is not saturated,
+/// i.e. exact (-1) * (-1) = 1 is subtracted from the destination.
+/// The saturated fixed-point results are stored back to vector `a`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(msub_q.h))]
+pub unsafe fn __msa_msub_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 {
+ msa_msub_q_h(a, mem::transmute(b), c)
+}
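+
+// One plausible scalar model of a `.h` lane, treating the values as Q15
+// fixed-point (an interpretation, not a quote of the specification): widen
+// the accumulator to Q30, subtract the exact product, then saturate to Q15.
+// `msub_q_h_lane` is a hypothetical helper name:
+//
+// fn msub_q_h_lane(a: i16, b: i16, c: i16) -> i16 {
+//     let acc = (a as i32) << 15;          // accumulator widened to Q30
+//     let prod = (b as i32) * (c as i32);  // exact Q30 product, unsaturated
+//     ((acc - prod) >> 15).clamp(i16::MIN as i32, i16::MAX as i32) as i16
+// }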
+
+/// Vector Fixed-Point Multiply and Subtract
+///
+/// The products of fixed-point elements in vector `c` (four signed 32-bit integer numbers)
+/// by fixed-point elements in vector `b` (four signed 32-bit integer numbers)
+/// are subtracted from the fixed-point elements in vector `a`
+/// (four signed 32-bit integer numbers). The multiplication result is not saturated,
+/// i.e. exact (-1) * (-1) = 1 is subtracted from the destination.
+/// The saturated fixed-point results are stored back to vector `a`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(msub_q.w))]
+pub unsafe fn __msa_msub_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 {
+ msa_msub_q_w(a, mem::transmute(b), c)
+}
+
+/// Vector Fixed-Point Multiply and Subtract Rounded
+///
+/// The products of fixed-point elements in vector `c` (eight signed 16-bit integer numbers)
+/// by fixed-point elements in vector `b` (eight signed 16-bit integer numbers)
+/// are subtracted from the fixed-point elements in vector `a`
+/// (eight signed 16-bit integer numbers). The multiplication result is not saturated,
+/// i.e. exact (-1) * (-1) = 1 is subtracted from the destination.
+/// The rounded and saturated fixed-point results are stored back to vector `a`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(msubr_q.h))]
+pub unsafe fn __msa_msubr_q_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 {
+ msa_msubr_q_h(a, mem::transmute(b), c)
+}
+
+/// Vector Fixed-Point Multiply and Subtract Rounded
+///
+/// The products of fixed-point elements in vector `c` (four signed 32-bit integer numbers)
+/// by fixed-point elements in vector `b` (four signed 32-bit integer numbers)
+/// are subtracted from the fixed-point elements in vector `a`
+/// (four signed 32-bit integer numbers). The multiplication result is not saturated,
+/// i.e. exact (-1) * (-1) = 1 is subtracted from the destination.
+/// The rounded and saturated fixed-point results are stored back to vector `a`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(msubr_q.w))]
+pub unsafe fn __msa_msubr_q_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 {
+ msa_msubr_q_w(a, mem::transmute(b), c)
+}
+
+/// Vector Multiply and Subtract
+///
+/// The integer elements in vector `c` (sixteen signed 8-bit integer numbers)
+/// are multiplied by integer elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// and subtracted from the integer elements in vector `a` (sixteen signed 8-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(msubv.b))]
+pub unsafe fn __msa_msubv_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 {
+ msa_msubv_b(a, mem::transmute(b), c)
+}
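+
+// "The most significant half of the multiplication result is discarded" is
+// ordinary wrapping arithmetic; a scalar sketch of one `.b` lane
+// (hypothetical helper name):
+//
+// fn msubv_b_lane(a: i8, b: i8, c: i8) -> i8 {
+//     a.wrapping_sub(b.wrapping_mul(c))
+// }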
+
+/// Vector Multiply and Subtract
+///
+/// The integer elements in vector `c` (eight signed 16-bit integer numbers)
+/// are multiplied by integer elements in vector `b` (eight signed 16-bit integer numbers)
+/// and subtracted from the integer elements in vector `a` (eight signed 16-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(msubv.h))]
+pub unsafe fn __msa_msubv_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 {
+ msa_msubv_h(a, mem::transmute(b), c)
+}
+
+/// Vector Multiply and Subtract
+///
+/// The integer elements in vector `c` (four signed 32-bit integer numbers)
+/// are multiplied by integer elements in vector `b` (four signed 32-bit integer numbers)
+/// and subtracted from the integer elements in vector `a` (four signed 32-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(msubv.w))]
+pub unsafe fn __msa_msubv_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 {
+ msa_msubv_w(a, mem::transmute(b), c)
+}
+
+/// Vector Multiply and Subtract
+///
+/// The integer elements in vector `c` (two signed 64-bit integer numbers)
+/// are multiplied by integer elements in vector `b` (two signed 64-bit integer numbers)
+/// and subtracted from the integer elements in vector `a` (two signed 64-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(msubv.d))]
+pub unsafe fn __msa_msubv_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 {
+ msa_msubv_d(a, mem::transmute(b), c)
+}
+
+/// Vector Fixed-Point Multiply
+///
+/// The fixed-point elements in vector `a` (eight signed 16-bit integer numbers)
+/// are multiplied by fixed-point elements in vector `b` (eight signed 16-bit integer numbers).
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mul_q.h))]
+pub unsafe fn __msa_mul_q_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_mul_q_h(a, mem::transmute(b))
+}
+
+/// Vector Fixed-Point Multiply
+///
+/// The fixed-point elements in vector `a` (four signed 32-bit integer numbers)
+/// are multiplied by fixed-point elements in vector `b` (four signed 32-bit integer numbers).
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mul_q.w))]
+pub unsafe fn __msa_mul_q_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_mul_q_w(a, mem::transmute(b))
+}
+
+/// Vector Fixed-Point Multiply Rounded
+///
+/// The fixed-point elements in vector `a` (eight signed 16-bit integer numbers)
+/// are multiplied by fixed-point elements in vector `b` (eight signed 16-bit integer numbers).
+/// The rounded result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mulr_q.h))]
+pub unsafe fn __msa_mulr_q_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_mulr_q_h(a, mem::transmute(b))
+}
+
+/// Vector Fixed-Point Multiply Rounded
+///
+/// The fixed-point elements in vector `a` (four signed 32-bit integer numbers)
+/// are multiplied by fixed-point elements in vector `b` (four signed 32-bit integer numbers).
+/// The rounded result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mulr_q.w))]
+pub unsafe fn __msa_mulr_q_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_mulr_q_w(a, mem::transmute(b))
+}
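+
+// Plausible Q15 scalar models of the `.h` lanes above (a fixed-point
+// interpretation; hypothetical helper names). The only overflowing case,
+// (-1) * (-1), saturates to the largest Q15 value, and the rounded variant
+// adds half an LSB before truncating:
+//
+// fn mul_q_h_lane(a: i16, b: i16) -> i16 {
+//     let prod = (a as i32) * (b as i32);             // exact Q30 product
+//     (prod >> 15).clamp(i16::MIN as i32, i16::MAX as i32) as i16
+// }
+//
+// fn mulr_q_h_lane(a: i16, b: i16) -> i16 {
+//     let prod = (a as i32) * (b as i32) + (1 << 14); // round to nearest
+//     (prod >> 15).clamp(i16::MIN as i32, i16::MAX as i32) as i16
+// }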
+
+/// Vector Multiply
+///
+/// The integer elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are multiplied by integer elements in vector `b` (sixteen signed 8-bit integer numbers).
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mulv.b))]
+pub unsafe fn __msa_mulv_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_mulv_b(a, mem::transmute(b))
+}
+
+/// Vector Multiply
+///
+/// The integer elements in vector `a` (eight signed 16-bit integer numbers)
+/// are multiplied by integer elements in vector `b` (eight signed 16-bit integer numbers).
+/// The result is written to vector (eight signed 16-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mulv.h))]
+pub unsafe fn __msa_mulv_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_mulv_h(a, mem::transmute(b))
+}
+
+/// Vector Multiply
+///
+/// The integer elements in vector `a` (four signed 32-bit integer numbers)
+/// are multiplied by integer elements in vector `b` (four signed 32-bit integer numbers).
+/// The result is written to vector (four signed 32-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mulv.w))]
+pub unsafe fn __msa_mulv_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_mulv_w(a, mem::transmute(b))
+}
+
+/// Vector Multiply
+///
+/// The integer elements in vector `a` (two signed 64-bit integer numbers)
+/// are multiplied by integer elements in vector `b` (two signed 64-bit integer numbers).
+/// The result is written to vector (two signed 64-bit integer numbers).
+/// The most significant half of the multiplication result is discarded.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(mulv.d))]
+pub unsafe fn __msa_mulv_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_mulv_d(a, mem::transmute(b))
+}
+
+/// Vector Leading Ones Count
+///
+/// The number of leading ones for elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// is stored to the elements in the result vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nloc.b))]
+pub unsafe fn __msa_nloc_b(a: v16i8) -> v16i8 {
+ msa_nloc_b(a)
+}
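+
+// Leading-ones count has no direct counterpart in core, but it is the
+// leading-zeros count of the complement; a one-lane sketch for `.b`
+// (hypothetical helper name):
+//
+// fn nloc_b_lane(a: i8) -> i8 {
+//     (!(a as u8)).leading_zeros() as i8
+// }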
+
+/// Vector Leading Ones Count
+///
+/// The number of leading ones for elements in vector `a` (eight signed 16-bit integer numbers)
+/// is stored to the elements in the result vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nloc.h))]
+pub unsafe fn __msa_nloc_h(a: v8i16) -> v8i16 {
+ msa_nloc_h(a)
+}
+
+/// Vector Leading Ones Count
+///
+/// The number of leading ones for elements in vector `a` (four signed 32-bit integer numbers)
+/// is stored to the elements in the result vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nloc.w))]
+pub unsafe fn __msa_nloc_w(a: v4i32) -> v4i32 {
+ msa_nloc_w(a)
+}
+
+/// Vector Leading Ones Count
+///
+/// The number of leading ones for elements in vector `a` (two signed 64-bit integer numbers)
+/// is stored to the elements in the result vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nloc.d))]
+pub unsafe fn __msa_nloc_d(a: v2i64) -> v2i64 {
+ msa_nloc_d(a)
+}
+
+/// Vector Leading Zeros Count
+///
+/// The number of leading zeros for elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// is stored to the elements in the result vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nlzc.b))]
+pub unsafe fn __msa_nlzc_b(a: v16i8) -> v16i8 {
+ msa_nlzc_b(a)
+}
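+
+// The leading-zeros count maps directly onto `leading_zeros`; a one-lane
+// sketch for `.b` (hypothetical helper name):
+//
+// fn nlzc_b_lane(a: i8) -> i8 {
+//     (a as u8).leading_zeros() as i8
+// }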
+
+/// Vector Leading Zeros Count
+///
+/// The number of leading zeros for elements in vector `a` (eight signed 16-bit integer numbers)
+/// is stored to the elements in the result vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nlzc.h))]
+pub unsafe fn __msa_nlzc_h(a: v8i16) -> v8i16 {
+ msa_nlzc_h(a)
+}
+
+/// Vector Leading Zeros Count
+///
+/// The number of leading zeros for elements in vector `a` (four signed 32-bit integer numbers)
+/// is stored to the elements in the result vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nlzc.w))]
+pub unsafe fn __msa_nlzc_w(a: v4i32) -> v4i32 {
+ msa_nlzc_w(a)
+}
+
+/// Vector Leading Zeros Count
+///
+/// The number of leading zeros for elements in vector `a` (two signed 64-bit integer numbers)
+/// is stored to the elements in the result vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nlzc.d))]
+pub unsafe fn __msa_nlzc_d(a: v2i64) -> v2i64 {
+ msa_nlzc_d(a)
+}
+
+/// Vector Logical Negated Or
+///
+/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers)
+/// is combined with the corresponding bit of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// in a bitwise logical NOR operation. The result is written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nor.v))]
+pub unsafe fn __msa_nor_v(a: v16u8, b: v16u8) -> v16u8 {
+ msa_nor_v(a, mem::transmute(b))
+}
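+
+// Bitwise NOR is OR followed by complement; a one-byte sketch
+// (`__msa_nori_b` below is the same operation against an 8-bit immediate):
+//
+// fn nor_v_byte(a: u8, b: u8) -> u8 {
+//     !(a | b)
+// }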
+
+/// Immediate Logical Negated Or
+///
+/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers)
+/// is combined with the 8-bit immediate `imm8`
+/// in a bitwise logical NOR operation. The result is written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(nori.b, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_nori_b<const IMM8: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm8!(IMM8);
+ msa_nori_b(a, IMM8)
+}
+
+/// Vector Logical Or
+///
+/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers)
+/// is combined with the corresponding bit of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// in a bitwise logical OR operation. The result is written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(or.v))]
+pub unsafe fn __msa_or_v(a: v16u8, b: v16u8) -> v16u8 {
+ msa_or_v(a, mem::transmute(b))
+}
+
+/// Immediate Logical Or
+///
+/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers)
+/// is combined with the 8-bit immediate `imm8`
+/// in a bitwise logical OR operation. The result is written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(ori.b, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_ori_b<const IMM8: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm8!(IMM8);
+ msa_ori_b(a, IMM8)
+}
+
+/// Vector Pack Even
+///
+/// Even elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are copied to the left half of the result vector and even elements in vector `b`
+/// (sixteen signed 8-bit integer numbers) are copied to the right half of the result vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pckev.b))]
+pub unsafe fn __msa_pckev_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_pckev_b(a, mem::transmute(b))
+}
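+
+// A sketch of the `.b` packing, reading "right half" as the lower-indexed
+// lanes (an assumption about lane numbering; `pckev_b_model` is a
+// hypothetical helper):
+//
+// fn pckev_b_model(a: [i8; 16], b: [i8; 16]) -> [i8; 16] {
+//     let mut r = [0i8; 16];
+//     for i in 0..8 {
+//         r[i] = b[2 * i];     // even elements of `b` fill the right half
+//         r[8 + i] = a[2 * i]; // even elements of `a` fill the left half
+//     }
+//     r
+// }
+//
+// `__msa_pckod_b` below is the dual that gathers the odd-indexed elements.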
+
+/// Vector Pack Even
+///
+/// Even elements in vector `a` (eight signed 16-bit integer numbers)
+/// are copied to the left half of the result vector and even elements in vector `b`
+/// (eight signed 16-bit integer numbers) are copied to the right half of the result vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pckev.h))]
+pub unsafe fn __msa_pckev_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_pckev_h(a, mem::transmute(b))
+}
+
+/// Vector Pack Even
+///
+/// Even elements in vector `a` (four signed 32-bit integer numbers)
+/// are copied to the left half of the result vector and even elements in vector `b`
+/// (four signed 32-bit integer numbers) are copied to the right half of the result vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pckev.w))]
+pub unsafe fn __msa_pckev_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_pckev_w(a, mem::transmute(b))
+}
+
+/// Vector Pack Even
+///
+/// Even elements in vector `a` (two signed 64-bit integer numbers)
+/// are copied to the left half of the result vector and even elements in vector `b`
+/// (two signed 64-bit integer numbers) are copied to the right half of the result vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pckev.d))]
+pub unsafe fn __msa_pckev_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_pckev_d(a, mem::transmute(b))
+}
+
+/// Vector Pack Odd
+///
+/// Odd elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are copied to the left half of the result vector and odd elements in vector `b`
+/// (sixteen signed 8-bit integer numbers) are copied to the right half of the result vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pckod.b))]
+pub unsafe fn __msa_pckod_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_pckod_b(a, mem::transmute(b))
+}
+
+/// Vector Pack Odd
+///
+/// Odd elements in vector `a` (eight signed 16-bit integer numbers)
+/// are copied to the left half of the result vector and odd elements in vector `b`
+/// (eight signed 16-bit integer numbers) are copied to the right half of the result vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pckod.h))]
+pub unsafe fn __msa_pckod_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_pckod_h(a, mem::transmute(b))
+}
+
+/// Vector Pack Odd
+///
+/// Odd elements in vector `a` (four signed 32-bit integer numbers)
+/// are copied to the left half of the result vector and odd elements in vector `b`
+/// (four signed 32-bit integer numbers) are copied to the right half of the result vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pckod.w))]
+pub unsafe fn __msa_pckod_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_pckod_w(a, mem::transmute(b))
+}
+
+/// Vector Pack Odd
+///
+/// Odd elements in vector `a` (two signed 64-bit integer numbers)
+/// are copied to the left half of the result vector and odd elements in vector `b`
+/// (two signed 64-bit integer numbers) are copied to the right half of the result vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pckod.d))]
+pub unsafe fn __msa_pckod_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_pckod_d(a, mem::transmute(b))
+}
+
+/// Vector Population Count
+///
+/// The number of bits set to 1 for elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// is stored to the elements in the result vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pcnt.b))]
+pub unsafe fn __msa_pcnt_b(a: v16i8) -> v16i8 {
+ msa_pcnt_b(a)
+}
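+
+// Population count maps directly onto `count_ones`; a one-lane sketch for
+// `.b` (hypothetical helper name):
+//
+// fn pcnt_b_lane(a: i8) -> i8 {
+//     (a as u8).count_ones() as i8
+// }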
+
+/// Vector Population Count
+///
+/// The number of bits set to 1 for elements in vector `a` (eight signed 16-bit integer numbers)
+/// is stored to the elements in the result vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pcnt.h))]
+pub unsafe fn __msa_pcnt_h(a: v8i16) -> v8i16 {
+ msa_pcnt_h(a)
+}
+
+/// Vector Population Count
+///
+/// The number of bits set to 1 for elements in vector `a` (four signed 32-bit integer numbers)
+/// is stored to the elements in the result vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pcnt.w))]
+pub unsafe fn __msa_pcnt_w(a: v4i32) -> v4i32 {
+ msa_pcnt_w(a)
+}
+
+/// Vector Population Count
+///
+/// The number of bits set to 1 for elements in vector `a` (two signed 64-bit integer numbers)
+/// is stored to the elements in the result vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(pcnt.d))]
+pub unsafe fn __msa_pcnt_d(a: v2i64) -> v2i64 {
+ msa_pcnt_d(a)
+}
+
+/// Immediate Signed Saturate
+///
+/// Signed elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are saturated to signed values of `imm3+1` bits without changing the data width.
+/// The result is stored in the vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_s.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_sat_s_b<const IMM3: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm3!(IMM3);
+ msa_sat_s_b(a, IMM3)
+}
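+
+// A scalar sketch of the `.b` saturation: clamping to the signed range of an
+// (IMM3 + 1)-bit value, i.e. -2^IMM3 ..= 2^IMM3 - 1 (hypothetical helper):
+//
+// fn sat_s_b_lane(a: i8, imm3: u32) -> i8 {
+//     let hi = (1i16 << imm3) - 1; //  2^imm3 - 1
+//     let lo = -(1i16 << imm3);    // -2^imm3
+//     (a as i16).clamp(lo, hi) as i8
+// }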
+
+/// Immediate Signed Saturate
+///
+/// Signed elements in vector `a` (eight signed 16-bit integer numbers)
+/// are saturated to signed values of `imm4+1` bits without changing the data width.
+/// The result is stored in the vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_s.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_sat_s_h<const IMM4: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm4!(IMM4);
+ msa_sat_s_h(a, IMM4)
+}
+
+/// Immediate Signed Saturate
+///
+/// Signed elements in vector `a` (four signed 32-bit integer numbers)
+/// are saturated to signed values of `imm5+1` bits without changing the data width.
+/// The result is stored in the vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_s.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_sat_s_w<const IMM5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm5!(IMM5);
+ msa_sat_s_w(a, IMM5)
+}
+
+/// Immediate Signed Saturate
+///
+/// Signed elements in vector `a` (two signed 64-bit integer numbers)
+/// are saturated to signed values of `imm6+1` bits without changing the data width.
+/// The result is stored in the vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_s.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_sat_s_d<const IMM6: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm6!(IMM6);
+ msa_sat_s_d(a, IMM6)
+}
+
+/// Immediate Unsigned Saturate
+///
+/// Unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers)
+/// are saturated to unsigned values of `imm3+1` bits without changing the data width.
+/// The result is stored in the vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_u.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_sat_u_b<const IMM3: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm3!(IMM3);
+ msa_sat_u_b(a, IMM3)
+}
+
+/// Immediate Unsigned Saturate
+///
+/// Unsigned elements in vector `a` (eight unsigned 16-bit integer numbers)
+/// are saturated to unsigned values of `imm4+1` bits without changing the data width.
+/// The result is stored in the vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_u.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_sat_u_h<const IMM4: i32>(a: v8u16) -> v8u16 {
+ static_assert_imm4!(IMM4);
+ msa_sat_u_h(a, IMM4)
+}
+
+/// Immediate Unsigned Saturate
+///
+/// Unsigned elements in vector `a` (four unsigned 32-bit integer numbers)
+/// are saturated to unsigned values of `imm5+1` bits without changing the data width.
+/// The result is stored in the vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_u.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_sat_u_w<const IMM5: i32>(a: v4u32) -> v4u32 {
+ static_assert_imm5!(IMM5);
+ msa_sat_u_w(a, IMM5)
+}
+
+/// Immediate Unsigned Saturate
+///
+/// Unsigned elements in vector `a` (two unsigned 64-bit integer numbers)
+/// are saturated to unsigned values of `imm6+1` bits without changing the data width.
+/// The result is stored in the vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sat_u.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_sat_u_d<const IMM6: i32>(a: v2u64) -> v2u64 {
+ static_assert_imm6!(IMM6);
+ msa_sat_u_d(a, IMM6)
+}
+
+/// Immediate Set Shuffle Elements
+///
+/// The set shuffle instruction works on 4-element sets.
+/// All sets are shuffled in the same way: the element of `a`
+/// (sixteen signed 8-bit integer numbers) whose index is given by bits 2i+1..2i
+/// of the immediate `imm8` is copied over the element i in the result vector
+/// (sixteen signed 8-bit integer numbers), where i is 0, 1, 2, 3.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(shf.b, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_shf_b<const IMM8: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm8!(IMM8);
+ msa_shf_b(a, IMM8)
+}
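+
+// A sketch of the `.b` shuffle: within each 4-element set, bits 2i+1..2i of
+// `IMM8` select which source element lands in position i (hypothetical
+// helper name):
+//
+// fn shf_b_model(a: [i8; 16], imm8: u8) -> [i8; 16] {
+//     let mut r = [0i8; 16];
+//     for set in 0..4 {
+//         for i in 0..4 {
+//             let sel = ((imm8 >> (2 * i)) & 0b11) as usize; // 2-bit selector
+//             r[4 * set + i] = a[4 * set + sel];
+//         }
+//     }
+//     r
+// }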
+
+/// Immediate Set Shuffle Elements
+///
+/// The set shuffle instruction works on 4-element sets.
+/// All sets are shuffled in the same way: the element of `a`
+/// (eight signed 16-bit integer numbers) whose index is given by bits 2i+1..2i
+/// of the immediate `imm8` is copied over the element i in the result vector
+/// (eight signed 16-bit integer numbers), where i is 0, 1, 2, 3.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(shf.h, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_shf_h<const IMM8: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm8!(IMM8);
+ msa_shf_h(a, IMM8)
+}
+
+/// Immediate Set Shuffle Elements
+///
+/// The set shuffle instruction works on 4-element sets.
+/// All sets are shuffled in the same way: the element of `a`
+/// (four signed 32-bit integer numbers) whose index is given by bits 2i+1..2i
+/// of the immediate `imm8` is copied over the element i in the result vector
+/// (four signed 32-bit integer numbers), where i is 0, 1, 2, 3.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(shf.w, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_shf_w<const IMM8: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm8!(IMM8);
+ msa_shf_w(a, IMM8)
+}
+
+/// GPR Columns Slide
+///
+/// Vector registers `a` (sixteen signed 8-bit integer numbers) and `b`
+/// (sixteen signed 8-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by the number of columns given in GPR `c`.
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+/// GPR `c` value is interpreted modulo the number of columns in the destination rectangle,
+/// or equivalently, the number of data format df elements in the destination vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sld.b))]
+pub unsafe fn __msa_sld_b(a: v16i8, b: v16i8, c: i32) -> v16i8 {
+ msa_sld_b(a, mem::transmute(b), c)
+}
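+
+// One plausible reading of the `.b` case, where the rectangle degenerates to
+// a single row of bytes: the result is a 16-byte window slid `c % 16`
+// columns into the concatenation of `b` (lower elements) and `a`. The lane
+// ordering is an assumption, and `sld_b_model` is a hypothetical helper:
+//
+// fn sld_b_model(a: [i8; 16], b: [i8; 16], c: i32) -> [i8; 16] {
+//     let n = (c as usize) % 16;        // slide amount, modulo the columns
+//     let mut concat = [0i8; 32];
+//     concat[..16].copy_from_slice(&b); // `b` occupies the lower half
+//     concat[16..].copy_from_slice(&a); // `a` is concatenated above it
+//     let mut r = [0i8; 16];
+//     for i in 0..16 {
+//         r[i] = concat[i + n];
+//     }
+//     r
+// }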
+
+/// GPR Columns Slide
+///
+/// Vector registers `a` (eight signed 16-bit integer numbers) and `b`
+/// (eight signed 16-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by the number of columns given in GPR `c`.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+/// GPR `c` value is interpreted modulo the number of columns in the destination rectangle,
+/// or equivalently, the number of data format df elements in the destination vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sld.h))]
+pub unsafe fn __msa_sld_h(a: v8i16, b: v8i16, c: i32) -> v8i16 {
+ msa_sld_h(a, mem::transmute(b), c)
+}
+
+/// GPR Columns Slide
+///
+/// Vector registers `a` (four signed 32-bit integer numbers) and `b`
+/// (four signed 32-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by the number of columns given in GPR `c`.
+/// The result is written to vector (four signed 32-bit integer numbers).
+/// GPR `c` value is interpreted modulo the number of columns in the destination rectangle,
+/// or equivalently, the number of data format df elements in the destination vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sld.w))]
+pub unsafe fn __msa_sld_w(a: v4i32, b: v4i32, c: i32) -> v4i32 {
+ msa_sld_w(a, mem::transmute(b), c)
+}
+
+/// GPR Columns Slide
+///
+/// Vector registers `a` (two signed 64-bit integer numbers) and `b`
+/// (two signed 64-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by the number of columns given in GPR `c`.
+/// The result is written to vector (two signed 64-bit integer numbers).
+/// GPR `c` value is interpreted modulo the number of columns in the destination rectangle,
+/// or equivalently, the number of data format df elements in the destination vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sld.d))]
+pub unsafe fn __msa_sld_d(a: v2i64, b: v2i64, c: i32) -> v2i64 {
+ msa_sld_d(a, mem::transmute(b), c)
+}
+
+/// Immediate Columns Slide
+///
+/// Vector registers `a` (sixteen signed 8-bit integer numbers) and `b`
+/// (sixteen signed 8-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by `imm4` columns.
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sldi.b, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_sldi_b<const IMM4: i32>(a: v16i8, b: v16i8) -> v16i8 {
+ static_assert_imm4!(IMM4);
+ msa_sldi_b(a, mem::transmute(b), IMM4)
+}
+
+/// Immediate Columns Slide
+///
+/// Vector registers `a` (eight signed 16-bit integer numbers) and `b`
+/// (eight signed 16-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by `imm3` columns.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sldi.h, imm3 = 0b111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_sldi_h<const IMM3: i32>(a: v8i16, b: v8i16) -> v8i16 {
+ static_assert_imm3!(IMM3);
+ msa_sldi_h(a, mem::transmute(b), IMM3)
+}
+
+/// Immediate Columns Slide
+///
+/// Vector registers `a` (four signed 32-bit integer numbers) and `b`
+/// (four signed 32-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by `imm2` columns.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sldi.w, imm2 = 0b11))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_sldi_w<const IMM2: i32>(a: v4i32, b: v4i32) -> v4i32 {
+ static_assert_imm2!(IMM2);
+ msa_sldi_w(a, mem::transmute(b), IMM2)
+}
+
+/// Immediate Columns Slide
+///
+/// Vector registers `a` (two signed 64-bit integer numbers) and `b`
+/// (two signed 64-bit integer numbers) contain 2-dimensional byte arrays (rectangles)
+/// stored row-wise with as many rows as bytes in integer data format df.
+/// The two source rectangles `b` and `a` are concatenated horizontally in the order
+/// they appear in the syntax, i.e. first `a` and then `b`. Place a new destination
+/// rectangle over `b` and then slide it to the left over the concatenation of `a` and `b`
+/// by `imm1` columns.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sldi.d, imm1 = 0b1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_sldi_d<const IMM1: i32>(a: v2i64, b: v2i64) -> v2i64 {
+ static_assert_imm1!(IMM1);
+ msa_sldi_d(a, mem::transmute(b), IMM1)
+}
+
+/// Vector Shift Left
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted left by the number of bits the elements in vector `b`
+/// (sixteen signed 8-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sll.b))]
+pub unsafe fn __msa_sll_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_sll_b(a, mem::transmute(b))
+}
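+
+// The per-element shift amount is taken modulo the lane width; a one-lane
+// sketch for `.b` (hypothetical helper name):
+//
+// fn sll_b_lane(a: i8, b: i8) -> i8 {
+//     a.wrapping_shl((b as u32) & 0x7) // shift count modulo 8
+// }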
+
+/// Vector Shift Left
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted left by the number of bits the elements in vector `b`
+/// (eight signed 16-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sll.h))]
+pub unsafe fn __msa_sll_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_sll_h(a, mem::transmute(b))
+}
+
+/// Vector Shift Left
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted left by the number of bits the elements in vector `b`
+/// (four signed 32-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sll.w))]
+pub unsafe fn __msa_sll_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_sll_w(a, mem::transmute(b))
+}
+
+/// Vector Shift Left
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted left by the number of bits the elements in vector `b`
+/// (two signed 64-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sll.d))]
+pub unsafe fn __msa_sll_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_sll_d(a, mem::transmute(b))
+}
+
+/// Immediate Shift Left
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted left by `imm4` bits.
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(slli.b, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_slli_b<const IMM4: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm4!(IMM4);
+ msa_slli_b(a, IMM4)
+}
+
+/// Immediate Shift Left
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted left by `imm3` bits.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(slli.h, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_slli_h<const IMM3: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm3!(IMM3);
+ msa_slli_h(a, IMM3)
+}
+
+/// Immediate Shift Left
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted left by `imm2` bits.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(slli.w, imm2 = 0b11))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_slli_w<const IMM2: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm2!(IMM2);
+ msa_slli_w(a, IMM2)
+}
+
+/// Immediate Shift Left
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted left by `imm1` bits.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(slli.d, imm1 = 0b1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_slli_d<const IMM1: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm1!(IMM1);
+ msa_slli_d(a, IMM1)
+}
+
+/// GPR Element Splat
+///
+/// Replicate vector `a` (sixteen signed 8-bit integer numbers)
+/// element with index given by GPR `b` to all elements in vector
+/// (sixteen signed 8-bit integer numbers). GPR `b` value is interpreted
+/// modulo the number of data format df elements in the destination vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(splat.b))]
+pub unsafe fn __msa_splat_b(a: v16i8, b: i32) -> v16i8 {
+ msa_splat_b(a, mem::transmute(b))
+}
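+
+// A sketch of the `.b` splat: the GPR index is reduced modulo the element
+// count, and that element is broadcast to every lane (hypothetical helper):
+//
+// fn splat_b_model(a: [i8; 16], b: i32) -> [i8; 16] {
+//     [a[(b as usize) % 16]; 16]
+// }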
+
+/// GPR Element Splat
+///
+/// Replicate vector `a` (eight signed 16-bit integer numbers)
+/// element with index given by GPR `b` to all elements in vector
+/// (eight signed 16-bit integer numbers). GPR `b` value is interpreted
+/// modulo the number of data format df elements in the destination vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(splat.h))]
+pub unsafe fn __msa_splat_h(a: v8i16, b: i32) -> v8i16 {
+ msa_splat_h(a, mem::transmute(b))
+}
+
+/// GPR Element Splat
+///
+/// Replicate vector `a` (four signed 32-bit integer numbers)
+/// element with index given by GPR `b` to all elements in vector
+/// (four signed 32-bit integer numbers). GPR `b` value is interpreted
+/// modulo the number of data format df elements in the destination vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(splat.w))]
+pub unsafe fn __msa_splat_w(a: v4i32, b: i32) -> v4i32 {
+ msa_splat_w(a, mem::transmute(b))
+}
+
+/// GPR Element Splat
+///
+/// Replicate vector `a` (two signed 64-bit integer numbers)
+/// element with index given by GPR `b` to all elements in vector
+/// (two signed 64-bit integer numbers). GPR `b` value is interpreted
+/// modulo the number of data format df elements in the destination vector.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(splat.d))]
+pub unsafe fn __msa_splat_d(a: v2i64, b: i32) -> v2i64 {
+ msa_splat_d(a, mem::transmute(b))
+}
+
+/// Immediate Element Splat
+///
+/// Replicate the element with index `imm4` in vector `a` (sixteen signed 8-bit integer numbers)
+/// to all elements in vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(splati.b, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_splati_b<const IMM4: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm4!(IMM4);
+ msa_splati_b(a, IMM4)
+}
+
+/// Immediate Element Splat
+///
+/// Replicate the element with index `imm3` in vector `a` (eight signed 16-bit integer numbers)
+/// to all elements in vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(splati.h, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_splati_h<const IMM3: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm3!(IMM3);
+ msa_splati_h(a, IMM3)
+}
+
+/// Immediate Element Splat
+///
+/// Replicate the element with index `imm2` in vector `a` (four signed 32-bit integer numbers)
+/// to all elements in vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(splati.w, imm2 = 0b11))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_splati_w<const IMM2: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm2!(IMM2);
+ msa_splati_w(a, IMM2)
+}
+
+/// Immediate Element Splat
+///
+/// Replicate the element at index `imm1` of vector `a` (two signed 64-bit integer numbers)
+/// to all elements in vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(splati.d, imm1 = 0b1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_splati_d<const IMM1: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm1!(IMM1);
+ msa_splati_d(a, IMM1)
+}
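+
+// A minimal sketch of the immediate splat family, assuming the `msa`
+// feature; `demo_broadcast_lane_3` is an illustrative helper. The lane
+// index is a compile-time constant, passed in the same positional form the
+// tests below use.
+#[target_feature(enable = "msa")]
+unsafe fn demo_broadcast_lane_3(v: v8i16) -> v8i16 {
+    // Broadcast lane 3 of the eight 16-bit lanes; the index must fit in
+    // 3 bits, as enforced by static_assert_imm3!.
+    __msa_splati_h(v, 3)
+}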
+
+/// Vector Shift Right Arithmetic
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right arithmetic by the number of bits the elements in vector `b`
+/// (sixteen signed 8-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sra.b))]
+pub unsafe fn __msa_sra_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_sra_b(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Arithmetic
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right arithmetic by the number of bits the elements in vector `b`
+/// (eight signed 16-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sra.h))]
+pub unsafe fn __msa_sra_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_sra_h(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Arithmetic
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right arithmetic by the number of bits the elements in vector `b`
+/// (four signed 32-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sra.w))]
+pub unsafe fn __msa_sra_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_sra_w(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Arithmetic
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right arithmetic by the number of bits the elements in vector `b`
+/// (two signed 64-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(sra.d))]
+pub unsafe fn __msa_sra_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_sra_d(a, mem::transmute(b))
+}
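+
+// A minimal sketch of the per-lane arithmetic shift, assuming the `msa`
+// feature; `demo_sra_lanes` is an illustrative helper. Each lane of
+// `shifts` supplies its own shift amount, taken modulo the 8-bit element
+// width.
+#[target_feature(enable = "msa")]
+unsafe fn demo_sra_lanes(values: v16i8, shifts: v16i8) -> v16i8 {
+    // Lane i of the result is values[i] >> (shifts[i] % 8), sign-extending.
+    __msa_sra_b(values, shifts)
+}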
+
+/// Immediate Shift Right Arithmetic
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right arithmetic by `imm3` bits.
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srai.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srai_b<const IMM3: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm3!(IMM3);
+ msa_srai_b(a, IMM3)
+}
+
+/// Immediate Shift Right Arithmetic
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right arithmetic by `imm4` bits.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srai.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srai_h<const IMM4: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm4!(IMM4);
+ msa_srai_h(a, IMM4)
+}
+
+/// Immediate Shift Right Arithmetic
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right arithmetic by `imm5` bits.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srai.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srai_w<const IMM5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm5!(IMM5);
+ msa_srai_w(a, IMM5)
+}
+
+/// Immediate Shift Right Arithmetic
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right arithmetic by `imm6` bits.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srai.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srai_d<const IMM6: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm6!(IMM6);
+ msa_srai_d(a, IMM6)
+}
+
+/// Vector Shift Right Arithmetic Rounded
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right arithmetic by the number of bits the elements in vector `b`
+/// (sixteen signed 8-bit integer numbers) specify modulo the size of the
+/// element in bits. The most significant discarded bit is added to the shifted
+/// value (for rounding) and the result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srar.b))]
+pub unsafe fn __msa_srar_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_srar_b(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Arithmetic Rounded
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right arithmetic by the number of bits the elements in vector `b`
+/// (eight signed 16-bit integer numbers) specify modulo the size of the
+/// element in bits. The most significant discarded bit is added to the shifted
+/// value (for rounding) and the result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srar.h))]
+pub unsafe fn __msa_srar_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_srar_h(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Arithmetic Rounded
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right arithmetic by the number of bits the elements in vector `b`
+/// (four signed 32-bit integer numbers) specify modulo the size of the
+/// element in bits. The most significant discarded bit is added to the shifted
+/// value (for rounding) and the result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srar.w))]
+pub unsafe fn __msa_srar_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_srar_w(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Arithmetic Rounded
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right arithmetic by the number of bits the elements in vector `b`
+/// (two signed 64-bit integer numbers) specify modulo the size of the
+/// element in bits. The most significant discarded bit is added to the shifted
+/// value (for rounding) and the result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srar.d))]
+pub unsafe fn __msa_srar_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_srar_d(a, mem::transmute(b))
+}
+
+/// Immediate Shift Right Arithmetic Rounded
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right arithmetic by `imm3` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srari.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srari_b<const IMM3: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm3!(IMM3);
+ msa_srari_b(a, IMM3)
+}
+
+/// Immediate Shift Right Arithmetic Rounded
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right arithmetic by `imm4` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srari.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srari_h<const IMM4: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm4!(IMM4);
+ msa_srari_h(a, IMM4)
+}
+
+/// Immediate Shift Right Arithmetic Rounded
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right arithmetic by `imm5` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srari.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srari_w<const IMM5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm5!(IMM5);
+ msa_srari_w(a, IMM5)
+}
+
+/// Immediate Shift Right Arithmetic Rounded
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right arithmetic by `imm6` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srari.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srari_d<const IMM6: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm6!(IMM6);
+ msa_srari_d(a, IMM6)
+}
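+
+// The rounded shifts add the most significant discarded bit back into the
+// shifted value. A scalar sketch of what one `__msa_srari_b` lane computes
+// for a shift amount in 1..=7 (illustrative only; a shift of 0 returns the
+// input unchanged):
+fn demo_srari_scalar(x: i8, imm: u32) -> i8 {
+    // Widen so the rounding addend cannot overflow, add half an ULP of the
+    // discarded field, then shift arithmetically.
+    (((x as i16) + (1 << (imm - 1))) >> imm) as i8
+}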
+
+/// Vector Shift Right Logical
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right logical by the number of bits the elements in vector `b`
+/// (sixteen signed 8-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srl.b))]
+pub unsafe fn __msa_srl_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_srl_b(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Logical
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right logical by the number of bits the elements in vector `b`
+/// (eight signed 16-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srl.h))]
+pub unsafe fn __msa_srl_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_srl_h(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Logical
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right logical by the number of bits the elements in vector `b`
+/// (four signed 32-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srl.w))]
+pub unsafe fn __msa_srl_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_srl_w(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Logical
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right logical by the number of bits the elements in vector `b`
+/// (two signed 64-bit integer numbers) specify modulo the size of the
+/// element in bits. The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srl.d))]
+pub unsafe fn __msa_srl_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_srl_d(a, mem::transmute(b))
+}
+
+/// Immediate Shift Right Logical
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right logical by `imm4` bits.
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srli.b, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srli_b<const IMM4: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm4!(IMM4);
+ msa_srli_b(a, IMM4)
+}
+
+/// Immediate Shift Right Logical
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right logical by `imm3` bits.
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srli.h, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srli_h<const IMM3: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm3!(IMM3);
+ msa_srli_h(a, IMM3)
+}
+
+/// Immediate Shift Right Logical
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right logical by `imm2` bits.
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srli.w, imm2 = 0b11))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srli_w<const IMM2: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm2!(IMM2);
+ msa_srli_w(a, IMM2)
+}
+
+/// Immediate Shift Right Logical
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right logical by `imm1` bits.
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srli.d, imm1 = 0b1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srli_d<const IMM1: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm1!(IMM1);
+ msa_srli_d(a, IMM1)
+}
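+
+// A minimal sketch of an immediate logical shift, assuming the `msa`
+// feature; `demo_halve_bits` is an illustrative helper. The lanes are
+// treated as raw bits, so the vacated high bits are zero-filled even
+// though the lane type is signed.
+#[target_feature(enable = "msa")]
+unsafe fn demo_halve_bits(v: v2i64) -> v2i64 {
+    // Shift each 64-bit lane right by one bit position.
+    __msa_srli_d(v, 1)
+}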
+
+/// Vector Shift Right Logical Rounded
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right logical by the number of bits the elements in vector `b`
+/// (sixteen signed 8-bit integer numbers) specify modulo the size of the
+/// element in bits. The most significant discarded bit is added to the shifted
+/// value (for rounding) and the result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlr.b))]
+pub unsafe fn __msa_srlr_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_srlr_b(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Logical Rounded
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right logical by the number of bits the elements in vector `b`
+/// (eight signed 16-bit integer numbers) specify modulo the size of the
+/// element in bits. The most significant discarded bit is added to the shifted
+/// value (for rounding) and the result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlr.h))]
+pub unsafe fn __msa_srlr_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_srlr_h(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Logical Rounded
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right logical by the number of bits the elements in vector `b`
+/// (four signed 32-bit integer numbers) specify modulo the size of the
+/// element in bits. The most significant discarded bit is added to the shifted
+/// value (for rounding) and the result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlr.w))]
+pub unsafe fn __msa_srlr_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_srlr_w(a, mem::transmute(b))
+}
+
+/// Vector Shift Right Logical Rounded
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right logical by the number of bits the elements in vector `b`
+/// (two signed 64-bit integer numbers) specify modulo the size of the
+/// element in bits. The most significant discarded bit is added to the shifted
+/// value (for rounding) and the result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlr.d))]
+pub unsafe fn __msa_srlr_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_srlr_d(a, mem::transmute(b))
+}
+
+/// Immediate Shift Right Logical Rounded
+///
+/// The elements in vector `a` (sixteen signed 8-bit integer numbers)
+/// are shifted right logical by `imm3` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlri.b, imm3 = 0b111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srlri_b<const IMM3: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm3!(IMM3);
+ msa_srlri_b(a, IMM3)
+}
+
+/// Immediate Shift Right Logical Rounded
+///
+/// The elements in vector `a` (eight signed 16-bit integer numbers)
+/// are shifted right logical by `imm4` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlri.h, imm4 = 0b1111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srlri_h<const IMM4: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm4!(IMM4);
+ msa_srlri_h(a, IMM4)
+}
+
+/// Immediate Shift Right Logical Rounded
+///
+/// The elements in vector `a` (four signed 32-bit integer numbers)
+/// are shifted right logical by `imm5` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlri.w, imm5 = 0b11111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srlri_w<const IMM5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm5!(IMM5);
+ msa_srlri_w(a, IMM5)
+}
+
+/// Immediate Shift Right Logical Rounded
+///
+/// The elements in vector `a` (two signed 64-bit integer numbers)
+/// are shifted right logical by `imm6` bits. The most significant
+/// discarded bit is added to the shifted value (for rounding) and
+/// the result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(srlri.d, imm6 = 0b111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_srlri_d<const IMM6: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm6!(IMM6);
+ msa_srlri_d(a, IMM6)
+}
+
+/// Vector Store
+///
+/// The WRLEN / 8 bytes in vector `a` (sixteen signed 8-bit integer numbers)
+/// are stored as elements of data format df at the effective memory location
+/// addressed by the base `mem_addr` and the 10-bit signed immediate offset `imm_s10`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(st.b, imm_s10 = 0b1111111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_st_b<const IMM_S10: i32>(a: v16i8, mem_addr: *mut u8) -> () {
+ static_assert_imm_s10!(IMM_S10);
+ msa_st_b(a, mem_addr, IMM_S10)
+}
+
+/// Vector Store
+///
+/// The WRLEN / 8 bytes in vector `a` (eight signed 16-bit integer numbers)
+/// are stored as elements of data format df at the effective memory location
+/// addressed by the base `mem_addr` and the 11-bit signed immediate offset `imm_s11`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(st.h, imm_s11 = 0b11111111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_st_h<const IMM_S11: i32>(a: v8i16, mem_addr: *mut u8) -> () {
+ static_assert_imm_s11!(IMM_S11);
+ static_assert!(IMM_S11: i32 where IMM_S11 % 2 == 0);
+ msa_st_h(a, mem_addr, IMM_S11)
+}
+
+/// Vector Store
+///
+/// The WRLEN / 8 bytes in vector `a` (four signed 32-bit integer numbers)
+/// are stored as elements of data format df at the effective memory location
+/// addressed by the base `mem_addr` and the 12-bit signed immediate offset `imm_s12`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(st.w, imm_s12 = 0b111111111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_st_w<const IMM_S12: i32>(a: v4i32, mem_addr: *mut u8) -> () {
+ static_assert_imm_s12!(IMM_S12);
+ static_assert!(IMM_S12: i32 where IMM_S12 % 4 == 0);
+ msa_st_w(a, mem_addr, IMM_S12)
+}
+
+/// Vector Store
+///
+/// The WRLEN / 8 bytes in vector `a` (two signed 64-bit integer numbers)
+/// are stored as elements of data format df at the effective memory location
+/// addressed by the base `mem_addr` and the 13-bit signed immediate offset `imm_s13`.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(st.d, imm_s13 = 0b1111111111111))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn __msa_st_d<const IMM_S13: i32>(a: v2i64, mem_addr: *mut u8) -> () {
+ static_assert_imm_s13!(IMM_S13);
+ static_assert!(IMM_S13: i32 where IMM_S13 % 8 == 0);
+ msa_st_d(a, mem_addr, IMM_S13)
+}
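+
+// A minimal sketch of the vector stores, assuming the `msa` feature;
+// `demo_store_pair` is an illustrative helper. The byte offset for `st.w`
+// must be a multiple of 4 and fit the signed 12-bit range.
+#[target_feature(enable = "msa")]
+unsafe fn demo_store_pair(v: v4i32, buf: *mut u8) {
+    // Store the 16 vector bytes at buf + 16 and again at buf + 32.
+    __msa_st_w(v, buf, 16);
+    __msa_st_w(v, buf, 32);
+}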
+
+/// Vector Signed Saturated Subtract of Signed Values
+///
+/// The elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// are subtracted from the elements in vector `a` (sixteen signed 8-bit integer numbers).
+/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest
+/// representable signed values before writing the result to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subs_s.b))]
+pub unsafe fn __msa_subs_s_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_subs_s_b(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Subtract of Signed Values
+///
+/// The elements in vector `b` (eight signed 16-bit integer numbers)
+/// are subtracted from the elements in vector `a` (eight signed 16-bit integer numbers).
+/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest
+/// representable signed values before writing the result to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subs_s.h))]
+pub unsafe fn __msa_subs_s_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_subs_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Subtract of Signed Values
+///
+/// The elements in vector `b` (four signed 32-bit integer numbers)
+/// are subtracted from the elements in vector `a` (four signed 32-bit integer numbers).
+/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest
+/// representable signed values before writing the result to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subs_s.w))]
+pub unsafe fn __msa_subs_s_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_subs_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Subtract of Signed Values
+///
+/// The elements in vector `b` (two signed 64-bit integer numbers)
+/// are subtracted from the elements in vector `a` (two signed 64-bit integer numbers).
+/// Signed arithmetic is performed and overflows clamp to the largest and/or smallest
+/// representable signed values before writing the result to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subs_s.d))]
+pub unsafe fn __msa_subs_s_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_subs_s_d(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Subtract of Unsigned Values
+///
+/// The elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// are subtracted from the elements in vector `a` (sixteen unsigned 8-bit integer numbers).
+/// Unsigned arithmetic is performed and under-flows clamp to 0 before writing
+/// the result to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subs_u.b))]
+pub unsafe fn __msa_subs_u_b(a: v16u8, b: v16u8) -> v16u8 {
+ msa_subs_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Subtract of Unsigned Values
+///
+/// The elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// are subtracted from the elements in vector `a` (eight unsigned 16-bit integer numbers).
+/// Unsigned arithmetic is performed and under-flows clamp to 0 before writing
+/// the result to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subs_u.h))]
+pub unsafe fn __msa_subs_u_h(a: v8u16, b: v8u16) -> v8u16 {
+ msa_subs_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Subtract of Unsigned Values
+///
+/// The elements in vector `b` (four unsigned 32-bit integer numbers)
+/// are subtracted from the elements in vector `a` (four unsigned 32-bit integer numbers).
+/// Unsigned arithmetic is performed and under-flows clamp to 0 before writing
+/// the result to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subs_u.w))]
+pub unsafe fn __msa_subs_u_w(a: v4u32, b: v4u32) -> v4u32 {
+ msa_subs_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Subtract of Unsigned Values
+///
+/// The elements in vector `b` (two unsigned 64-bit integer numbers)
+/// are subtracted from the elements in vector `a` (two unsigned 64-bit integer numbers).
+/// Unsigned arithmetic is performed and under-flows clamp to 0 before writing
+/// the result to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subs_u.d))]
+pub unsafe fn __msa_subs_u_d(a: v2u64, b: v2u64) -> v2u64 {
+ msa_subs_u_d(a, mem::transmute(b))
+}
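+
+// Per lane, the unsigned saturating subtract behaves like Rust's scalar
+// `saturating_sub`; a scalar sketch of one `__msa_subs_u_b` lane
+// (illustrative only):
+fn demo_subs_u_scalar(a: u8, b: u8) -> u8 {
+    // Underflow clamps to 0 instead of wrapping.
+    a.saturating_sub(b)
+}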
+
+/// Vector Unsigned Saturated Subtract of Signed from Unsigned
+///
+/// The signed elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers).
+/// The signed result is unsigned saturated and written
+/// to vector (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subsus_u.b))]
+pub unsafe fn __msa_subsus_u_b(a: v16u8, b: v16i8) -> v16u8 {
+ msa_subsus_u_b(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Subtract of Signed from Unsigned
+///
+/// The signed elements in vector `b` (eight signed 16-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `a` (eight unsigned 16-bit integer numbers).
+/// The signed result is unsigned saturated and written
+/// to vector (eight unsigned 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subsus_u.h))]
+pub unsafe fn __msa_subsus_u_h(a: v8u16, b: v8i16) -> v8u16 {
+ msa_subsus_u_h(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Subtract of Signed from Unsigned
+///
+/// The signed elements in vector `b` (four signed 32-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `a` (four unsigned 32-bit integer numbers).
+/// The signed result is unsigned saturated and written
+/// to vector (four unsigned 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subsus_u.w))]
+pub unsafe fn __msa_subsus_u_w(a: v4u32, b: v4i32) -> v4u32 {
+ msa_subsus_u_w(a, mem::transmute(b))
+}
+
+/// Vector Unsigned Saturated Subtract of Signed from Unsigned
+///
+/// The signed elements in vector `b` (two signed 64-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `a` (two unsigned 64-bit integer numbers).
+/// The signed result is unsigned saturated and written
+/// to vector (two unsigned 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subsus_u.d))]
+pub unsafe fn __msa_subsus_u_d(a: v2u64, b: v2i64) -> v2u64 {
+ msa_subsus_u_d(a, mem::transmute(b))
+}
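+
+// A scalar sketch of one `__msa_subsus_u_b` lane (illustrative only): the
+// signed lane of `b` is subtracted from the unsigned lane of `a` and the
+// difference is clamped to the unsigned 8-bit range.
+fn demo_subsus_u_scalar(a: u8, b: i8) -> u8 {
+    // Widen both operands so the intermediate difference cannot wrap.
+    (a as i16 - b as i16).clamp(0, u8::MAX as i16) as u8
+}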
+
+/// Vector Signed Saturated Subtract of Unsigned Values
+///
+/// The unsigned elements in vector `b` (sixteen unsigned 8-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `a` (sixteen unsigned 8-bit integer numbers).
+/// The signed result is signed saturated and written
+/// to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subsuu_s.b))]
+pub unsafe fn __msa_subsuu_s_b(a: v16u8, b: v16u8) -> v16i8 {
+ msa_subsuu_s_b(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Subtract of Unsigned Values
+///
+/// The unsigned elements in vector `b` (eight unsigned 16-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `a` (eight unsigned 16-bit integer numbers).
+/// The signed result is signed saturated and written
+/// to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subsuu_s.h))]
+pub unsafe fn __msa_subsuu_s_h(a: v8u16, b: v8u16) -> v8i16 {
+ msa_subsuu_s_h(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Subtract of Unsigned Values
+///
+/// The unsigned elements in vector `b` (four unsigned 32-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `a` (four unsigned 32-bit integer numbers).
+/// The signed result is signed saturated and written
+/// to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subsuu_s.w))]
+pub unsafe fn __msa_subsuu_s_w(a: v4u32, b: v4u32) -> v4i32 {
+ msa_subsuu_s_w(a, mem::transmute(b))
+}
+
+/// Vector Signed Saturated Subtract of Unsigned Values
+///
+/// The unsigned elements in vector `b` (two unsigned 64-bit integer numbers)
+/// are subtracted from the unsigned elements in vector `a` (two unsigned 64-bit integer numbers).
+/// The signed result is signed saturated and written
+/// to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subsuu_s.d))]
+pub unsafe fn __msa_subsuu_s_d(a: v2u64, b: v2u64) -> v2i64 {
+ msa_subsuu_s_d(a, mem::transmute(b))
+}
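+
+// A scalar sketch of one `__msa_subsuu_s_b` lane (illustrative only): both
+// inputs are unsigned, but the difference is clamped to the signed 8-bit
+// range and written to a signed result lane.
+fn demo_subsuu_s_scalar(a: u8, b: u8) -> i8 {
+    // Widen so the subtraction cannot wrap, then saturate to i8.
+    (a as i16 - b as i16).clamp(i8::MIN as i16, i8::MAX as i16) as i8
+}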
+
+/// Vector Subtract
+///
+/// The elements in vector `b` (sixteen signed 8-bit integer numbers)
+/// are subtracted from the elements in vector `a` (sixteen signed 8-bit integer numbers).
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subv.b))]
+pub unsafe fn __msa_subv_b(a: v16i8, b: v16i8) -> v16i8 {
+ msa_subv_b(a, mem::transmute(b))
+}
+
+/// Vector Subtract
+///
+/// The elements in vector `b` (eight signed 16-bit integer numbers)
+/// are subtracted from the elements in vector `a` (eight signed 16-bit integer numbers).
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subv.h))]
+pub unsafe fn __msa_subv_h(a: v8i16, b: v8i16) -> v8i16 {
+ msa_subv_h(a, mem::transmute(b))
+}
+
+/// Vector Subtract
+///
+/// The elements in vector `b` (four signed 32-bit integer numbers)
+/// are subtracted from the elements in vector `a` (four signed 32-bit integer numbers).
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subv.w))]
+pub unsafe fn __msa_subv_w(a: v4i32, b: v4i32) -> v4i32 {
+ msa_subv_w(a, mem::transmute(b))
+}
+
+/// Vector Subtract
+///
+/// The elements in vector `b` (two signed 64-bit integer numbers)
+/// are subtracted from the elements in vector `a` (two signed 64-bit integer numbers).
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subv.d))]
+pub unsafe fn __msa_subv_d(a: v2i64, b: v2i64) -> v2i64 {
+ msa_subv_d(a, mem::transmute(b))
+}
+
+/// Immediate Subtract
+///
+/// The 5-bit immediate unsigned value `imm5`
+/// is subtracted from the elements in vector `a` (sixteen signed 8-bit integer numbers).
+/// The result is written to vector (sixteen signed 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subvi.b, imm5 = 0b10111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_subvi_b<const IMM5: i32>(a: v16i8) -> v16i8 {
+ static_assert_imm5!(IMM5);
+ msa_subvi_b(a, IMM5)
+}
+
+/// Immediate Subtract
+///
+/// The 5-bit immediate unsigned value `imm5`
+/// is subtracted from the elements in vector `a` (eight signed 16-bit integer numbers).
+/// The result is written to vector (eight signed 16-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subvi.h, imm5 = 0b10111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_subvi_h<const IMM5: i32>(a: v8i16) -> v8i16 {
+ static_assert_imm5!(IMM5);
+ msa_subvi_h(a, IMM5)
+}
+
+/// Immediate Subtract
+///
+/// The 5-bit immediate unsigned value `imm5`
+/// is subtracted from the elements in vector `a` (four signed 32-bit integer numbers).
+/// The result is written to vector (four signed 32-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subvi.w, imm5 = 0b10111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_subvi_w<const IMM5: i32>(a: v4i32) -> v4i32 {
+ static_assert_imm5!(IMM5);
+ msa_subvi_w(a, IMM5)
+}
+
+/// Immediate Subtract
+///
+/// The 5-bit immediate unsigned value `imm5`
+/// is subtracted from the elements in vector `a` (two signed 64-bit integer numbers).
+/// The result is written to vector (two signed 64-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(subvi.d, imm5 = 0b10111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_subvi_d<const IMM5: i32>(a: v2i64) -> v2i64 {
+ static_assert_imm5!(IMM5);
+ msa_subvi_d(a, IMM5)
+}
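+
+// A minimal sketch of the immediate subtract, assuming the `msa` feature;
+// `demo_subvi` is an illustrative helper. The unsigned 5-bit immediate
+// (0..=31) is subtracted from every lane with wrapping arithmetic.
+#[target_feature(enable = "msa")]
+unsafe fn demo_subvi(v: v4i32) -> v4i32 {
+    // Subtract 5 from each 32-bit lane.
+    __msa_subvi_w(v, 5)
+}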
+
+/// Vector Data Preserving Shuffle
+///
+/// The vector shuffle instructions selectively copy data elements from the
+/// concatenation of vectors `b` (sixteen signed 8-bit integer numbers)
+/// and `c` (sixteen signed 8-bit integer numbers) into vector `a`
+/// (sixteen signed 8-bit integer numbers) based on the corresponding control element in `a`.
+/// The least significant 6 bits of each control element in `a`, interpreted modulo
+/// the number of elements in the concatenated vectors `b` and `c`, specify the
+/// index of the source element. If bit 6 or bit 7 of a control element is 1,
+/// no copy takes place and the destination element is set to 0 instead.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(vshf.b))]
+pub unsafe fn __msa_vshf_b(a: v16i8, b: v16i8, c: v16i8) -> v16i8 {
+ msa_vshf_b(a, mem::transmute(b), c)
+}
+
+/// Vector Data Preserving Shuffle
+///
+/// The vector shuffle instructions selectively copy data elements from the
+/// concatenation of vectors `b` (eight signed 16-bit integer numbers)
+/// and `c` (eight signed 16-bit integer numbers) into vector `a`
+/// (eight signed 16-bit integer numbers) based on the corresponding control element in `a`.
+/// The least significant 6 bits of each control element in `a`, interpreted modulo
+/// the number of elements in the concatenated vectors `b` and `c`, specify the
+/// index of the source element. If bit 6 or bit 7 of a control element is 1,
+/// no copy takes place and the destination element is set to 0 instead.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(vshf.h))]
+pub unsafe fn __msa_vshf_h(a: v8i16, b: v8i16, c: v8i16) -> v8i16 {
+ msa_vshf_h(a, mem::transmute(b), c)
+}
+
+/// Vector Data Preserving Shuffle
+///
+/// The vector shuffle instructions selectively copy data elements from the
+/// concatenation of vectors `b` (four signed 32-bit integer numbers)
+/// and `c` (four signed 32-bit integer numbers) into vector `a`
+/// (four signed 32-bit integer numbers) based on the corresponding control element in `a`.
+/// The least significant 6 bits of each control element in `a`, interpreted modulo
+/// the number of elements in the concatenated vectors `b` and `c`, specify the
+/// index of the source element. If bit 6 or bit 7 of a control element is 1,
+/// no copy takes place and the destination element is set to 0 instead.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(vshf.w))]
+pub unsafe fn __msa_vshf_w(a: v4i32, b: v4i32, c: v4i32) -> v4i32 {
+ msa_vshf_w(a, mem::transmute(b), c)
+}
+
+/// Vector Data Preserving Shuffle
+///
+/// The vector shuffle instructions selectively copy data elements from the
+/// concatenation of vectors `b` (two signed 64-bit integer numbers)
+/// and `c` (two signed 64-bit integer numbers) into vector `a`
+/// (two signed 64-bit integer numbers) based on the corresponding control element in `a`.
+/// The least significant 6 bits of each control element in `a`, interpreted modulo
+/// the number of elements in the concatenated vectors `b` and `c`, specify the
+/// index of the source element. If bit 6 or bit 7 of a control element is 1,
+/// no copy takes place and the destination element is set to 0 instead.
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(vshf.d))]
+pub unsafe fn __msa_vshf_d(a: v2i64, b: v2i64, c: v2i64) -> v2i64 {
+ msa_vshf_d(a, mem::transmute(b), c)
+}
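+
+// A minimal sketch of the data-preserving shuffle, assuming the `msa`
+// feature; `demo_vshf` is an illustrative helper. Each control lane names
+// a source lane (modulo 32) in the concatenation of the two data vectors;
+// a control lane with bit 6 or bit 7 set produces a zero lane instead.
+#[target_feature(enable = "msa")]
+unsafe fn demo_vshf(control: v16i8, b: v16i8, c: v16i8) -> v16i8 {
+    __msa_vshf_b(control, b, c)
+}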
+
+/// Vector Logical Exclusive Or
+///
+/// Each bit of vector `a` (sixteen unsigned 8-bit integer numbers)
+/// is combined with the corresponding bit of vector `b` (sixteen unsigned 8-bit integer numbers)
+/// in a bitwise logical XOR operation. The result is written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(xor.v))]
+pub unsafe fn __msa_xor_v(a: v16u8, b: v16u8) -> v16u8 {
+ msa_xor_v(a, mem::transmute(b))
+}
+
+/// Immediate Logical Exclusive Or
+///
+/// Each byte of vector `a` (sixteen unsigned 8-bit integer numbers)
+/// is combined with the 8-bit immediate `imm8`
+/// in a bitwise logical XOR operation. The result is written to vector
+/// (sixteen unsigned 8-bit integer numbers).
+///
+#[inline]
+#[target_feature(enable = "msa")]
+#[cfg_attr(test, assert_instr(xori.b, imm8 = 0b11111111))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn __msa_xori_b<const IMM8: i32>(a: v16u8) -> v16u8 {
+ static_assert_imm8!(IMM8);
+ msa_xori_b(a, IMM8)
+}
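+
+// A minimal sketch of the bitwise XOR pair, assuming the `msa` feature;
+// `demo_toggle_low_bits` is an illustrative helper.
+#[target_feature(enable = "msa")]
+unsafe fn demo_toggle_low_bits(v: v16u8) -> v16u8 {
+    // Flip the low four bits of every byte; xori.b takes an 8-bit immediate.
+    __msa_xori_b(v, 0b0000_1111)
+}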
+
+#[cfg(test)]
+mod tests {
+ use crate::{
+ core_arch::{mips::msa::*, simd::*},
+ mem,
+ };
+ use std::{f32, f64};
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_add_a_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -4, -3, -2, -1,
+ -4, -3, -2, -1,
+ -4, -3, -2, -1,
+ -4, -3, -2, -1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 5, 5, 5, 5,
+ 5, 5, 5, 5,
+ 5, 5, 5, 5,
+ 5, 5, 5, 5
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_add_a_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_add_a_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i16x8::new(-4, -3, -2, -1, -4, -3, -2, -1);
+ #[rustfmt::skip]
+ let r = i16x8::new(5, 5, 5, 5, 5, 5, 5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_add_a_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_add_a_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(-4, -3, -2, -1);
+ #[rustfmt::skip]
+ let r = i32x4::new(5, 5, 5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_add_a_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_add_a_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(-4, -3);
+ #[rustfmt::skip]
+ let r = i64x2::new(5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_add_a_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_a_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 100, i8::MAX, 100, i8::MAX,
+ 100, i8::MAX, 100, i8::MAX,
+ 100, i8::MAX, 100, i8::MAX,
+ 100, i8::MAX, 100, i8::MAX
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -4, -3, -2, -100,
+ -4, -3, -2, -100,
+ -4, -3, -2, -100,
+ -4, -3, -2, -100
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 104, 127, 102, 127,
+ 104, 127, 102, 127,
+ 104, 127, 102, 127,
+ 104, 127, 102, 127
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_a_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_a_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 100, i16::MAX, 100, i16::MAX,
+ 100, i16::MAX, 100, i16::MAX
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(-4, -3, -2, -1, -4, -3, -2, -1);
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ 104, i16::MAX, 102, i16::MAX,
+ 104, i16::MAX, 102, i16::MAX
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_a_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_a_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, i32::MAX, 100, i32::MAX);
+ #[rustfmt::skip]
+ let b = i32x4::new(-4, -3, -2, -1);
+ #[rustfmt::skip]
+ let r = i32x4::new(104, i32::MAX, 102, i32::MAX);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_a_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_a_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(100, i64::MAX);
+ #[rustfmt::skip]
+ let b = i64x2::new(-4, -3);
+ #[rustfmt::skip]
+ let r = i64x2::new(104, i64::MAX);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_a_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 100, i8::MIN, 100, i8::MAX,
+ 100, i8::MIN, 100, i8::MAX,
+ 100, i8::MIN, 100, i8::MAX,
+ 100, i8::MIN, 100, i8::MAX
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -4, -3, -2, 100,
+ -4, -3, -2, 100,
+ -4, -3, -2, 100,
+ -4, -3, -2, 100
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 96, i8::MIN, 98, i8::MAX,
+ 96, i8::MIN, 98, i8::MAX,
+ 96, i8::MIN, 98, i8::MAX,
+ 96, i8::MIN, 98, i8::MAX
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 100, i16::MIN, 100, i16::MAX,
+ 100, i16::MIN, 100, i16::MAX
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(-4, -3, -2, 1, -4, -3, -2, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ 96, i16::MIN, 98, i16::MAX,
+ 96, i16::MIN, 98, i16::MAX
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, i32::MAX, 100, i32::MIN);
+ #[rustfmt::skip]
+ let b = i32x4::new(-4, 3, -2, -1);
+ #[rustfmt::skip]
+ let r = i32x4::new(96, i32::MAX, 98, i32::MIN);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(100, i64::MIN);
+ #[rustfmt::skip]
+ let b = i64x2::new(-4, -3);
+ #[rustfmt::skip]
+ let r = i64x2::new(96, i64::MIN);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 4, 3, 2, 100,
+ 4, 3, 2, 100,
+ 4, 3, 2, 100,
+ 4, 3, 2, 100
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 104, u8::MAX, 102, u8::MAX,
+ 104, u8::MAX, 102, u8::MAX,
+ 104, u8::MAX, 102, u8::MAX,
+ 104, u8::MAX, 102, u8::MAX
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 100, u16::MAX, 100, u16::MAX,
+ 100, u16::MAX, 100, u16::MAX
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(4, 3, 2, 1, 4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = u16x8::new(
+ 104, u16::MAX, 102, u16::MAX,
+ 104, u16::MAX, 102, u16::MAX
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(100, u32::MAX, 100, u32::MAX);
+ #[rustfmt::skip]
+ let b = u32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = u32x4::new(104, u32::MAX, 102, u32::MAX);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_adds_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(100, u64::MAX);
+ #[rustfmt::skip]
+ let b = u64x2::new(4, 3);
+ #[rustfmt::skip]
+ let r = u64x2::new(104, u64::MAX);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_adds_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_addv_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 100, i8::MIN, 100, i8::MAX,
+ 100, i8::MIN, 100, i8::MAX,
+ 100, i8::MIN, 100, i8::MAX,
+ 100, i8::MIN, 100, i8::MAX
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -4, -3, -2, 100,
+ -4, -3, -2, 100,
+ -4, -3, -2, 100,
+ -4, -3, -2, 100
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 96, 125, 98, -29,
+ 96, 125, 98, -29,
+ 96, 125, 98, -29,
+ 96, 125, 98, -29
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_addv_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_addv_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 100, i16::MIN, 100, i16::MAX,
+ 100, i16::MIN, 100, i16::MAX
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(-4, -3, -2, 1, -4, -3, -2, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(96, 32765, 98, -32768, 96, 32765, 98, -32768);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_addv_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_addv_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, i32::MAX, 100, i32::MIN);
+ #[rustfmt::skip]
+ let b = i32x4::new(-4, 3, -2, -1);
+ #[rustfmt::skip]
+ let r = i32x4::new(96, -2147483646, 98, 2147483647);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_addv_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_addv_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(100, i64::MIN);
+ #[rustfmt::skip]
+ let b = i64x2::new(-4, -3);
+ #[rustfmt::skip]
+ let r = i64x2::new(96, 9223372036854775805);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_addv_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_addvi_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 100, i8::MAX, 100, i8::MAX,
+ 100, i8::MAX, 100, i8::MAX,
+ 100, i8::MAX, 100, i8::MAX,
+ 100, i8::MAX, 100, i8::MAX
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 103, -126, 103, -126,
+ 103, -126, 103, -126,
+ 103, -126, 103, -126,
+ 103, -126, 103, -126
+ );
+
+ assert_eq!(r, mem::transmute(__msa_addvi_b(mem::transmute(a), 67)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_addvi_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MAX, 3276, -100, -127,
+ i16::MAX, 3276, -100, -127
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ -32766, 3279, -97, -124,
+ -32766, 3279, -97, -124
+ );
+
+ assert_eq!(r, mem::transmute(__msa_addvi_h(mem::transmute(a), 67)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_addvi_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, i32::MAX, 100, i32::MIN);
+ #[rustfmt::skip]
+ let r = i32x4::new(103, -2147483646, 103, -2147483645);
+
+ assert_eq!(r, mem::transmute(__msa_addvi_w(mem::transmute(a), 67)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_addvi_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(100, i64::MIN);
+ #[rustfmt::skip]
+ let r = i64x2::new(117, -9223372036854775791);
+
+ assert_eq!(r, mem::transmute(__msa_addvi_d(mem::transmute(a), 17)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_and_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 4, 3, 2, 100,
+ 4, 3, 2, 100,
+ 4, 3, 2, 100,
+ 4, 3, 2, 100
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 4, 3, 0, 100,
+ 4, 3, 0, 100,
+ 4, 3, 0, 100,
+ 4, 3, 0, 100
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_and_v(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_andi_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX,
+ 100, u8::MAX, 100, u8::MAX
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 4, 5, 4, 5,
+ 4, 5, 4, 5,
+ 4, 5, 4, 5,
+ 4, 5, 4, 5
+ );
+
+ assert_eq!(r, mem::transmute(__msa_andi_b(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_asub_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -1, -2, -3, -4,
+ -1, -2, -3, -4,
+ -1, -2, -3, -4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 5, 5, 5, 5,
+ 5, 5, 5, 5,
+ 5, 5, 5, 5,
+ 5, 5, 5, 5
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_asub_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_asub_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(-6, -7, -8, -9, -6, -7, -8, -9);
+ #[rustfmt::skip]
+ let r = i16x8::new(5, 5, 5, 5, 5, 5, 5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_asub_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_asub_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-1, -2, -3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(-6, -7, -8, -9);
+ #[rustfmt::skip]
+ let r = i32x4::new(5, 5, 5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_asub_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_asub_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, -2);
+ #[rustfmt::skip]
+ let b = i64x2::new(-6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_asub_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_asub_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 5, 5, 5, 5,
+ 5, 5, 5, 5,
+ 5, 5, 5, 5,
+ 5, 5, 5, 5
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_asub_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_asub_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u16x8::new(5, 5, 5, 5, 5, 5, 5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_asub_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_asub_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(5, 5, 5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_asub_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_asub_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = u64x2::new(5, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_asub_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ave_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -1, -2, -3, -4,
+ -1, -2, -3, -4,
+ -1, -2, -3, -4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 6, -7, 8, -9,
+ 6, -7, 8, -9,
+ 6, -7, 8, -9,
+ 6, -7, 8, -9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 2, -5, 2, -7,
+ 2, -5, 2, -7,
+ 2, -5, 2, -7,
+ 2, -5, 2, -7
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ave_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ave_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(6, -7, 8, -9, 6, -7, 8, -9);
+ #[rustfmt::skip]
+ let r = i16x8::new(2, -5, 2, -7, 2, -5, 2, -7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ave_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ave_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-1, -2, -3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(6, -7, 8, -9);
+ #[rustfmt::skip]
+ let r = i32x4::new(2, -5, 2, -7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ave_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ave_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, -2);
+ #[rustfmt::skip]
+ let b = i64x2::new(-6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(-4, -5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ave_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ave_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 3, 4, 5, 6,
+ 3, 4, 5, 6,
+ 3, 4, 5, 6,
+ 3, 4, 5, 6
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ave_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ave_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u16x8::new(3, 4, 5, 6, 3, 4, 5, 6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ave_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ave_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(3, 4, 5, 6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ave_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ave_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = u64x2::new(3, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ave_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_aver_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -1, -2, 3, -4,
+ -1, -2, 3, -4,
+ -1, -2, 3, -4,
+ -1, -2, 3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -6, 7, -8, -9,
+ -6, 7, -8, -9,
+ -6, 7, -8, -9,
+ -6, 7, -8, -9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -3, 3, -2, -6,
+ -3, 3, -2, -6,
+ -3, 3, -2, -6,
+ -3, 3, -2, -6
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_aver_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_aver_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-1, -2, 3, -4, -1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(-6, 7, -8, -9, -6, 7, -8, -9);
+ #[rustfmt::skip]
+ let r = i16x8::new(-3, 3, -2, -6, -3, 3, -2, -6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_aver_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_aver_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(-6, 7, -8, -9);
+ #[rustfmt::skip]
+ let r = i32x4::new(-3, 3, -2, -6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_aver_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_aver_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, -2);
+ #[rustfmt::skip]
+ let b = i64x2::new(-6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(-3, -4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_aver_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_aver_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 4, 5, 6, 7,
+ 4, 5, 6, 7,
+ 4, 5, 6, 7,
+ 4, 5, 6, 7
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_aver_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_aver_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u16x8::new(4, 5, 6, 7, 4, 5, 6, 7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_aver_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_aver_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(4, 5, 6, 7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_aver_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_aver_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = u64x2::new(4, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_aver_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
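+    // bclr_* clears bit (b % element_width) in each element of a;
+    // the bclri_* forms take the bit position as an immediate.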
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bclr_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 191, 27, 54, 1,
+ 191, 27, 54, 1,
+ 191, 27, 54, 1,
+ 191, 27, 54, 1
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bclr_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bclr_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(255, 155, 55, 1, 255, 155, 55, 1);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u16x8::new(191, 27, 55, 1, 191, 27, 55, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bclr_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bclr_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(255, 155, 55, 1);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(191, 27, 55, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bclr_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bclr_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(255, 155);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = u64x2::new(191, 27);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bclr_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bclri_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 247, 147, 55, 1,
+ 247, 147, 55, 1,
+ 247, 147, 55, 1,
+ 247, 147, 55, 1
+ );
+
+ assert_eq!(r, mem::transmute(__msa_bclri_b(mem::transmute(a), 3)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bclri_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(2155, 1155, 155, 1, 2155, 1155, 155, 1);
+ #[rustfmt::skip]
+ let r = u16x8::new(107, 1155, 155, 1, 107, 1155, 155, 1);
+
+ assert_eq!(r, mem::transmute(__msa_bclri_h(mem::transmute(a), 11)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bclri_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(211111155, 111111155, 11111155, 1);
+ #[rustfmt::skip]
+ let r = u32x4::new(202722547, 102722547, 2722547, 1);
+
+ assert_eq!(r, mem::transmute(__msa_bclri_w(mem::transmute(a), 23)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bclri_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(211111111155, 11111111111111155);
+ #[rustfmt::skip]
+ let r = u64x2::new(73672157683, 11110973672157683);
+
+ assert_eq!(r, mem::transmute(__msa_bclri_d(mem::transmute(a), 37)));
+ }
+
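+    // binsl_* copies the (c % element_width) + 1 most significant bits of b into a;
+    // binsli_* takes that count minus one as an immediate.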
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsl_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let c = u8x16::new(
+ 1, 3, 5, 9,
+ 1, 3, 5, 9,
+ 1, 3, 5, 9,
+ 1, 3, 5, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 63, 11, 11, 1,
+ 63, 11, 11, 1,
+ 63, 11, 11, 1,
+ 63, 11, 11, 1
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsl_b(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsl_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 32767, 16384, 8192, 4096,
+ 32767, 16384, 8192, 4096
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ 21656, 5273, 7081, 2985,
+ 21656, 5273, 7081, 2985
+ );
+ #[rustfmt::skip]
+ let c = u16x8::new(
+ 3, 7, 9, 13,
+ 15, 17, 21, 23
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(
+ 24575, 5120, 7040, 2984,
+ 21656, 0, 6144, 2816
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsl_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsl_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(2147483647, 536870912, 67108864, 8388608);
+ #[rustfmt::skip]
+ let b = u32x4::new(1036372536, 259093134, 78219975, 1119499719);
+ #[rustfmt::skip]
+ let c = u32x4::new(11, 15, 31, 37);
+ #[rustfmt::skip]
+ let r = u32x4::new(1037041663, 259063808, 78219975, 1082130432);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsl_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsl_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(8006399338, 2882303762);
+ #[rustfmt::skip]
+ let b = u64x2::new(9223372036854775805, 536870912);
+ #[rustfmt::skip]
+ let c = u64x2::new(12, 48);
+ #[rustfmt::skip]
+ let r = u64x2::new(9221120245047489898, 536901394);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsl_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsli_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 7, 7, 11, 9,
+ 7, 7, 11, 9,
+ 7, 7, 11, 9,
+ 7, 7, 11, 9
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsli_b(mem::transmute(a), mem::transmute(b), 5))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsli_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 32767, 16384, 8192, 4096,
+ 32767, 16384, 8192, 4096
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ 21656, 5273, 7081, 2985,
+ 21656, 5273, 7081, 2985
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(
+ 21659, 5272, 7080, 2984,
+ 21659, 5272, 7080, 2984
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsli_h(mem::transmute(a), mem::transmute(b), 13))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsli_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(2147483647, 536870912, 67108864, 8388608);
+ #[rustfmt::skip]
+ let b = u32x4::new(1036372536, 259093134, 78219975, 1119499719);
+ #[rustfmt::skip]
+ let r = u32x4::new(1036386303, 259080192, 78217216, 1119485952);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsli_w(mem::transmute(a), mem::transmute(b), 17))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsli_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(8006399338, 2882303762);
+ #[rustfmt::skip]
+ let b = u64x2::new(9223372036854775805, 536870912);
+ #[rustfmt::skip]
+ let r = u64x2::new(9223372036854773098, 536901394);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsli_d(mem::transmute(a), mem::transmute(b), 48))
+ );
+ }
+
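+    // binsr_* copies the (c % element_width) + 1 least significant bits of b into a;
+    // binsri_* takes that count minus one as an immediate.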
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsr_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let c = u8x16::new(
+ 1, 3, 5, 9,
+ 1, 3, 5, 9,
+ 1, 3, 5, 9,
+ 1, 3, 5, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 254, 151, 8, 1,
+ 254, 151, 8, 1,
+ 254, 151, 8, 1,
+ 254, 151, 8, 1
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsr_b(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsr_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 32767, 16384, 8192, 4096,
+ 32767, 16384, 8192, 4096
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ 21656, 5273, 7081, 2985,
+ 21656, 5273, 7081, 2985
+ );
+ #[rustfmt::skip]
+ let c = u16x8::new(
+ 3, 7, 9, 13,
+ 15, 17, 21, 23
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(
+ 32760, 16537, 9129, 2985,
+ 21656, 16385, 8233, 4265
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsr_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsr_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(2147483647, 536870912, 67108864, 8388608);
+ #[rustfmt::skip]
+ let b = u32x4::new(1036372536, 259093134, 78219975, 1119499719);
+ #[rustfmt::skip]
+ let c = u32x4::new(11, 15, 31, 37);
+ #[rustfmt::skip]
+ let r = u32x4::new(2147482168, 536900238, 78219975, 8388615);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsr_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsr_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(8006399338, 2882303762);
+ #[rustfmt::skip]
+ let b = u64x2::new(9223372036854775805, 536870912);
+ #[rustfmt::skip]
+ let c = u64x2::new(12, 48);
+ #[rustfmt::skip]
+ let r = u64x2::new(8006402045, 536870912);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsr_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsri_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 198, 135, 8, 9,
+ 198, 135, 8, 9,
+ 198, 135, 8, 9,
+ 198, 135, 8, 9
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsri_b(mem::transmute(a), mem::transmute(b), 5))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsri_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 32767, 16384, 8192, 4096,
+ 32767, 16384, 8192, 4096
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ 21656, 5273, 7081, 2985,
+ 21656, 5273, 7081, 2985
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(
+ 21656, 21657, 7081, 2985,
+ 21656, 21657, 7081, 2985
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsri_h(mem::transmute(a), mem::transmute(b), 13))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsri_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(2147483647, 536870912, 67108864, 8388608);
+ #[rustfmt::skip]
+ let b = u32x4::new(1036372536, 259093134, 78219975, 1119499719);
+ #[rustfmt::skip]
+ let r = u32x4::new(2147338808, 536965774, 67209927, 8533447);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsri_w(mem::transmute(a), mem::transmute(b), 17))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_binsri_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(8006399338, 2882303762);
+ #[rustfmt::skip]
+ let b = u64x2::new(9223372036854775805, 536870912);
+ #[rustfmt::skip]
+ let r = u64x2::new(562949953421309, 536870912);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_binsri_d(mem::transmute(a), mem::transmute(b), 48))
+ );
+ }
+
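+    // bmnz_v copies bits of b into a wherever the mask c has a 1: (b & c) | (a & !c).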
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bmnz_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ );
+ #[rustfmt::skip]
+ let c = u8x16::new(
+ 3, 5, 7, 1,
+ 3, 5, 7, 1,
+ 3, 5, 7, 1,
+ 3, 5, 7, 1
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 254, 159, 48, 1,
+ 254, 159, 48, 1,
+ 254, 159, 48, 1,
+ 254, 159, 48, 1
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bmnz_v(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bmnzi_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 1, u8::MAX, 155, 55,
+ 1, u8::MAX, 155, 55,
+ 1, u8::MAX, 155, 55,
+ 1, u8::MAX, 155, 55
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 249, 159, 51, 7,
+ 249, 159, 51, 7,
+ 249, 159, 51, 7,
+ 249, 159, 51, 7
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bmnzi_b(mem::transmute(a), mem::transmute(b), 7))
+ );
+ }
+
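+    // bmz_v copies bits of b into a wherever the mask c has a 0: (b & !c) | (a & c).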
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bmz_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let c = u8x16::new(
+ 3, 5, 7, 1,
+ 3, 5, 7, 1,
+ 3, 5, 7, 1,
+ 3, 5, 7, 1
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 7, 3, 15, 9,
+ 7, 3, 15, 9,
+ 7, 3, 15, 9,
+ 7, 3, 15, 9
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bmz_v(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bmzi_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1,
+ u8::MAX, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 1, 255, 155, 55,
+ 1, 255, 155, 55,
+ 1, 255, 155, 55,
+ 1, 255, 155, 55
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 7, 251, 159, 49,
+ 7, 251, 159, 49,
+ 7, 251, 159, 49,
+ 7, 251, 159, 49
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bmzi_b(mem::transmute(a), mem::transmute(b), 7))
+ );
+ }
+
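+    // bneg_* flips bit (b % element_width) in each element of a; bnegi_* uses an immediate.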
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bneg_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 191, 27, 54, 3,
+ 191, 27, 54, 3,
+ 191, 27, 54, 3,
+ 191, 27, 54, 3
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bneg_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bneg_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(255, 155, 55, 1, 255, 155, 55, 1);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u16x8::new(191, 27, 311, 513, 191, 27, 311, 513);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bneg_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bneg_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(255, 155, 55, 1);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(191, 27, 311, 513);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bneg_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bneg_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(255, 155);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = u64x2::new(191, 27);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bneg_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnegi_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 50, 100, 127, u8::MAX,
+ 50, 100, 127, u8::MAX,
+ 50, 100, 127, u8::MAX,
+ 50, 100, 127, u8::MAX
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 34, 116, 111, 239,
+ 34, 116, 111, 239,
+ 34, 116, 111, 239,
+ 34, 116, 111, 239
+ );
+
+ assert_eq!(r, mem::transmute(__msa_bnegi_b(mem::transmute(a), 4)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnegi_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 32767, 3276, 100, 127,
+ 32767, 3276, 100, 127
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(
+ 30719, 1228, 2148, 2175,
+ 30719, 1228, 2148, 2175
+ );
+
+ assert_eq!(r, mem::transmute(__msa_bnegi_h(mem::transmute(a), 11)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnegi_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(100, 2147483647, 100, 2147483648);
+ #[rustfmt::skip]
+ let r = u32x4::new(16777316, 2130706431, 16777316, 2164260864);
+
+ assert_eq!(r, mem::transmute(__msa_bnegi_w(mem::transmute(a), 24)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnegi_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(100, 9223372036854775808);
+ #[rustfmt::skip]
+ let r = u64x2::new(4398046511204, 9223376434901286912);
+
+ assert_eq!(r, mem::transmute(__msa_bnegi_d(mem::transmute(a), 42)));
+ }
+
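+    // __msa_bnz_{b,h,w,d} returns 1 only when every element is non-zero;
+    // a single zero lane gives 0.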
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnz_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 1, 1, 1,
+ 1, 1, 1, 1,
+ 2, 2, 2, 2,
+ 4, 4, 0, 4,
+ );
+ let r = 0 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bnz_b(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnz_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 32767, 3276, 100, 127,
+ 32767, 0, 100, 127
+ );
+ let r = 0 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bnz_h(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnz_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(100, 2147483647, 0, 2147483648);
+ let r = 0 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bnz_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnz_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(100, 9223372036854775808);
+ #[rustfmt::skip]
+ let r = 1 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bnz_d(mem::transmute(a))));
+ }
+
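+    // __msa_bnz_v returns 1 when any bit of the vector is set.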
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bnz_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 0, 0, 0, 1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ let r = 1 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bnz_v(mem::transmute(a))));
+ }
+
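+    // bsel_v picks each result bit with the selector a: a 0 bit takes the bit
+    // from b, a 1 bit takes it from c; bseli_b uses an immediate in place of c.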
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bsel_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 3, 5, 7, 1,
+ 3, 5, 7, 1,
+ 3, 5, 7, 1,
+ 3, 5, 7, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let c = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 7, 3, 15, 9,
+ 7, 3, 15, 9,
+ 7, 3, 15, 9,
+ 7, 3, 15, 9
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bsel_v(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bseli_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 121, 29, 57, 9,
+ 121, 29, 57, 9,
+ 121, 29, 57, 9,
+ 121, 29, 57, 9
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bseli_b(mem::transmute(a), mem::transmute(b), 121))
+ );
+ }
+
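+    // bset_* sets bit (b % element_width) in each element of a; bseti_* uses an immediate.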
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bset_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 255, 155, 55, 3,
+ 255, 155, 55, 3,
+ 255, 155, 55, 3,
+ 255, 155, 55, 3
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bset_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bset_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(255, 155, 55, 1, 255, 155, 55, 1);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u16x8::new(255, 155, 311, 513, 255, 155, 311, 513);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bset_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bset_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(255, 155, 55, 1);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(255, 155, 311, 513);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bset_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bset_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(255, 155);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = u64x2::new(255, 155);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_bset_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bseti_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 255, 159, 55, 5,
+ 255, 159, 55, 5,
+ 255, 159, 55, 5,
+ 255, 159, 55, 5
+ );
+
+ assert_eq!(r, mem::transmute(__msa_bseti_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bseti_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(255, 155, 55, 1, 255, 155, 55, 1);
+ #[rustfmt::skip]
+ let r = u16x8::new(255, 159, 55, 5, 255, 159, 55, 5);
+
+ assert_eq!(r, mem::transmute(__msa_bseti_h(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bseti_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(255, 155, 55, 1);
+ #[rustfmt::skip]
+ let r = u32x4::new(255, 159, 55, 5);
+
+ assert_eq!(r, mem::transmute(__msa_bseti_w(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bseti_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(255, 155);
+ #[rustfmt::skip]
+ let r = u64x2::new(255, 159);
+
+ assert_eq!(r, mem::transmute(__msa_bseti_d(mem::transmute(a), 2)));
+ }
+
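+    // __msa_bz_{b,h,w,d} returns 1 when at least one element is zero;
+    // here no lane is zero, so the result is 0.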
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bz_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1,
+ 255, 155, 55, 1
+ );
+ let r = 0 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bz_b(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bz_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+ let r = 1 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bz_h(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bz_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(255, 0, 55, 1);
+ let r = 1 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bz_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bz_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(255, 0);
+ let r = 1 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bz_d(mem::transmute(a))));
+ }
+
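+    // __msa_bz_v returns 1 only when the whole vector is zero.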
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_bz_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0
+ );
+ let r = 1 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_bz_v(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ceq_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, 127, 55, 1,
+ -128, 127, 55, 1,
+ -128, 127, 55, 1,
+ -128, 127, 55, 1
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -128, 126, 55, 1,
+ -128, 126, 55, 1,
+ -128, 126, 55, 1,
+ -128, 126, 55, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -1, 0, -1, -1,
+ -1, 0, -1, -1,
+ -1, 0, -1, -1,
+ -1, 0, -1, -1
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ceq_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ceq_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(255, 155, 55, 1, 255, 155, 55, 1);
+ #[rustfmt::skip]
+ let b = i16x8::new(255, 155, 56, 1, 255, 155, 56, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(-1, -1, 0, -1, -1, -1, 0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ceq_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ceq_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(255, 155, 55, 1);
+ #[rustfmt::skip]
+ let b = i32x4::new(255, 156, 55, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, 0, -1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ceq_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ceq_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(255, 155);
+ #[rustfmt::skip]
+ let b = i64x2::new(255, 156);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ceq_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ceqi_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 100, -1, -4, 15,
+ 100, -1, -4, 15,
+ 100, -1, -4, 15,
+ 100, -1, -4, 15
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 0, 0, -1, 0,
+ 0, 0, -1, 0,
+ 0, 0, -1, 0,
+ 0, 0, -1, 0
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ceqi_b(mem::transmute(a), -4)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ceqi_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 32767, 3276, 100, -11,
+ 32767, 3276, 100, -11
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(0, 0, 0, -1, 0, 0, 0, -1);
+
+ assert_eq!(r, mem::transmute(__msa_ceqi_h(mem::transmute(a), -11)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ceqi_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 3, 5, -3);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, 0, -1, 0);
+
+ assert_eq!(r, mem::transmute(__msa_ceqi_w(mem::transmute(a), 5)));
+ }
+
+    // FIXME: https://reviews.llvm.org/D59884
+    // If the target type is i64, a negative immediate loses its sign.
+    // The test passes if 4294967293 is used instead of -3 in vector `a`.
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_ceqi_d() {
+ // #[rustfmt::skip]
+ // let a = i64x2::new(-3, 2);
+ // #[rustfmt::skip]
+ // let r = i64x2::new(-1, 0);
+
+ // assert_eq!(r, mem::transmute(__msa_ceqi_d(mem::transmute(a), -3)));
+ // }
+
+    // Cannot be tested in user mode
+    // #[simd_test(enable = "msa")]
+    // unsafe fn test_msa_cfcmsa() {
+    //     let r = 5;
+
+    //     assert_eq!(r, mem::transmute(__msa_cfcmsa(5)));
+    // }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_cle_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, 127, 55, 2,
+ -128, 127, 55, 2,
+ -128, 127, 55, 2,
+ -128, 127, 55, 2
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -128, 126, 55, 1,
+ -128, 126, 55, 1,
+ -128, 126, 55, 1,
+ -128, 126, 55, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -1, 0, -1, 0,
+ -1, 0, -1, 0,
+ -1, 0, -1, 0,
+ -1, 0, -1, 0
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_cle_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_cle_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(255, 155, 55, 2, 255, 155, 55, 2);
+ #[rustfmt::skip]
+ let b = i16x8::new(255, 155, 56, 1, 255, 155, 56, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(-1, -1, -1, 0, -1, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_cle_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_cle_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(255, 155, 55, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(255, 156, 55, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_cle_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_cle_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(255, 155);
+ #[rustfmt::skip]
+ let b = i64x2::new(255, 156);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_cle_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_cle_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 127, 55, 2,
+ u8::MAX, 127, 55, 2,
+ u8::MAX, 127, 55, 2,
+ u8::MAX, 127, 55, 2
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ u8::MAX, 126, 55, 1,
+ u8::MAX, 126, 55, 1,
+ u8::MAX, 126, 55, 1,
+ u8::MAX, 126, 55, 1
+ );
+ #[rustfmt::skip]
+        let r = i8x16::new(
+            -1, 0, -1, 0,
+            -1, 0, -1, 0,
+            -1, 0, -1, 0,
+            -1, 0, -1, 0
+        );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_cle_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_cle_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ u16::MAX, 155, 55, 2,
+ u16::MAX, 155, 55, 2
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ u16::MAX, 155, 56, 1,
+ u16::MAX, 155, 56, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(-1, -1, -1, 0, -1, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_cle_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_cle_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(u32::MAX, 155, 55, 2);
+ #[rustfmt::skip]
+ let b = u32x4::new(u32::MAX, 156, 55, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_cle_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_cle_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(u64::MAX, 155);
+ #[rustfmt::skip]
+ let b = u64x2::new(u64::MAX, 156);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_cle_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clei_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -2, -127, 100, -127,
+ -2, -127, 100, -127,
+ -2, -127, 100, -127,
+ -2, -127, 100, -127
+ );
+ #[rustfmt::skip]
+        let r = i8x16::new(
+            -1, -1, 0, -1,
+            -1, -1, 0, -1,
+            -1, -1, 0, -1,
+            -1, -1, 0, -1
+        );
+
+ assert_eq!(r, mem::transmute(__msa_clei_s_b(mem::transmute(a), -2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clei_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 32767, 3276, 10, -1,
+ 32767, 3276, 10, -1,
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(0, 0, 0, -1, 0, 0, 0, -1);
+
+ assert_eq!(r, mem::transmute(__msa_clei_s_h(mem::transmute(a), -1)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clei_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, 2147483647, 6, 2147483647);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, 0, -1, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clei_s_w(mem::transmute(a), 6)));
+ }
+
+    // FIXME: https://reviews.llvm.org/D59884
+    // If the target type is i64, a negative immediate loses its sign;
+    // -3 is represented as 4294967293.
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_clei_s_d() {
+ // #[rustfmt::skip]
+ // let a = i64x2::new(-3, 11);
+ // #[rustfmt::skip]
+ // let r = i64x2::new(-1, 0);
+
+ // assert_eq!(r, mem::transmute(__msa_clei_s_d(mem::transmute(a), -3)));
+ // }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clei_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 2, 127, 100, 127,
+ 2, 127, 100, 127,
+ 2, 127, 100, 127,
+ 2, 127, 100, 127,
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -1, 0, 0, 0,
+ -1, 0, 0, 0,
+ -1, 0, 0, 0,
+ -1, 0, 0, 0
+ );
+
+ assert_eq!(r, mem::transmute(__msa_clei_u_b(mem::transmute(a), 25)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clei_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 1, 26, 15, 36,
+ 1, 26, 15, 36
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(-1, 0, -1, 0, -1, 0, -1, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clei_u_h(mem::transmute(a), 25)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clei_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(25, 32, 25, 32);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, 0, -1, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clei_u_w(mem::transmute(a), 31)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clei_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(10, 26);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clei_u_d(mem::transmute(a), 25)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clt_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, 127, 55, 2,
+ -128, 127, 55, 2,
+ -128, 127, 55, 2,
+ -128, 127, 55, 2
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -127, 126, 56, 1,
+ -127, 126, 56, 1,
+ -127, 126, 56, 1,
+ -127, 126, 56, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -1, 0, -1, 0,
+ -1, 0, -1, 0,
+ -1, 0, -1, 0,
+ -1, 0, -1, 0
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_clt_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clt_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-255, 155, 55, 2, -255, 155, 55, 2);
+ #[rustfmt::skip]
+ let b = i16x8::new(255, 156, 56, 1, 255, 156, 56, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(-1, -1, -1, 0, -1, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_clt_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clt_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-255, 155, 55, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(255, 156, 55, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_clt_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clt_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-255, 155);
+ #[rustfmt::skip]
+ let b = i64x2::new(255, 156);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_clt_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clt_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 128, 127, 55, 2,
+ 128, 127, 55, 2,
+ 128, 127, 55, 2,
+ 128, 127, 55, 2
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 127, 126, 56, 1,
+ 127, 126, 56, 1,
+ 127, 126, 56, 1,
+ 127, 126, 56, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 0, 0, -1, 0,
+ 0, 0, -1, 0,
+ 0, 0, -1, 0,
+ 0, 0, -1, 0
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_clt_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clt_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(255, 155, 55, 2, 255, 155, 55, 2);
+ #[rustfmt::skip]
+ let b = u16x8::new(255, 156, 56, 1, 255, 156, 56, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(0, -1, -1, 0, 0, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_clt_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clt_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(255, 155, 55, 2);
+ #[rustfmt::skip]
+ let b = u32x4::new(255, 156, 55, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, -1, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_clt_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clt_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(255, 155);
+ #[rustfmt::skip]
+ let b = u64x2::new(255, 156);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_clt_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clti_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 2, -127, -5, 127,
+ 2, -127, -5, 127,
+ 2, -127, -5, 127,
+ 2, -127, -5, 127
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 0, -1, 0, 0,
+ 0, -1, 0, 0,
+ 0, -1, 0, 0,
+ 0, -1, 0, 0
+ );
+
+ assert_eq!(r, mem::transmute(__msa_clti_s_b(mem::transmute(a), -5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clti_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ -1024, 3276, 15, 127,
+ -1024, 3276, 15, 127
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(-1, 0, 0, 0, -1, 0, 0, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clti_s_h(mem::transmute(a), 15)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clti_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-15, 2147483647, -15, 2147483647);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, 0, -1, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clti_s_w(mem::transmute(a), -10)));
+ }
+
+    // FIXME: https://reviews.llvm.org/D59884
+    // If the target type is i64, a negative immediate loses its sign;
+    // -3 is represented as 4294967293.
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_clti_s_d() {
+ // #[rustfmt::skip]
+ // let a = i64x2::new(-5, -2);
+ // #[rustfmt::skip]
+ // let r = i64x2::new(-1, 0);
+
+ // assert_eq!(r, mem::transmute(__msa_clti_s_d(mem::transmute(a), -3)));
+ // }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clti_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 2, 127, 49, 127,
+ 2, 127, 49, 127,
+ 2, 127, 49, 127,
+ 2, 127, 49, 127,
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -1, 0, 0, 0,
+ -1, 0, 0, 0,
+ -1, 0, 0, 0,
+ -1, 0, 0, 0
+ );
+
+ assert_eq!(r, mem::transmute(__msa_clti_u_b(mem::transmute(a), 50)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clti_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 327, 3276, 100, 127,
+ 327, 3276, 100, 127
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clti_u_h(mem::transmute(a), 30)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clti_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(100, 2147483647, 100, 2147483647);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, 0, 0, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clti_u_w(mem::transmute(a), 10)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_clti_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 9223372036854775807);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(r, mem::transmute(__msa_clti_u_d(mem::transmute(a), 10)));
+ }
+
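+    // copy_s_* extracts the element at the given index with sign extension;
+    // copy_u_* zero-extends instead.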
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_copy_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -100, 127, 4, 127,
+ -100, 127, 4, 127,
+ -100, 127, 4, 127,
+ -100, 127, 4, 127
+ );
+ #[rustfmt::skip]
+ let r = -100 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_copy_s_b(mem::transmute(a), 12)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_copy_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 32767, 3276, 100, 11,
+ 32767, 3276, 100, 11
+ );
+ #[rustfmt::skip]
+ let r = 32767 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_copy_s_h(mem::transmute(a), 4)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_copy_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, 2147483647, 5, -2147483647);
+ let r = 2147483647 as i32;
+
+ assert_eq!(r, mem::transmute(__msa_copy_s_w(mem::transmute(a), 1)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_copy_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(3, 9223372036854775807);
+ #[rustfmt::skip]
+ let r = 9223372036854775807 as i64;
+
+ assert_eq!(r, mem::transmute(__msa_copy_s_d(mem::transmute(a), 1)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_copy_u_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 100, 127, 4, 127,
+ 100, 127, 4, 127,
+ 100, 127, 4, 127,
+ 100, 127, 4, 127
+ );
+ #[rustfmt::skip]
+ let r = 100 as u32;
+
+ assert_eq!(r, mem::transmute(__msa_copy_u_b(mem::transmute(a), 12)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_copy_u_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 32767, 3276, 100, 11,
+ 32767, 3276, 100, 11
+ );
+ #[rustfmt::skip]
+ let r = 32767 as u32;
+
+ assert_eq!(r, mem::transmute(__msa_copy_u_h(mem::transmute(a), 4)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_copy_u_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, 2147483647, 5, 2147483647);
+ #[rustfmt::skip]
+ let r = 2147483647 as u32;
+
+ assert_eq!(r, mem::transmute(__msa_copy_u_w(mem::transmute(a), 1)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_copy_u_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(3, i64::MAX);
+ #[rustfmt::skip]
+ let r = 9223372036854775807 as u64;
+
+ assert_eq!(r, mem::transmute(__msa_copy_u_d(mem::transmute(a), 1)));
+ }
+
+    // Cannot be tested in user mode
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_ctcmsa() {
+ // }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_div_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -1, -2, -3, -4,
+ -1, -2, -3, -4,
+ -1, -2, -3, -4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 6, 3, 2, 2,
+ 6, 3, 2, 2,
+ 6, 3, 2, 2,
+ 6, 3, 2, 2
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_div_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_div_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-6, -7, -8, -9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let b = i16x8::new(-1, -2, -3, -4, -1, -2, -3, -4);
+ #[rustfmt::skip]
+ let r = i16x8::new(6, 3, 2, 2, -6, -3, -2, -2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_div_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_div_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-6, -7, 8, 9);
+ #[rustfmt::skip]
+ let b = i32x4::new(-1, -2, -3, -4);
+ #[rustfmt::skip]
+ let r = i32x4::new(6, 3, -2, -2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_div_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_div_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-6, 7);
+ #[rustfmt::skip]
+ let b = i64x2::new(-1, -2);
+ #[rustfmt::skip]
+ let r = i64x2::new(6, -3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_div_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_div_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 6, 3, 2, 2,
+ 6, 3, 2, 2,
+ 6, 3, 2, 2,
+ 6, 3, 2, 2
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_div_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_div_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let b = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = u16x8::new(6, 3, 2, 2, 6, 3, 2, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_div_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_div_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let b = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = u32x4::new(6, 3, 2, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_div_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_div_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let b = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let r = u64x2::new(6, 3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_div_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
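+    // dotp_* computes r[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1], widening each
+    // product to the double-width element type.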
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dotp_s_h() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -1, -2, -3, 4,
+ -1, -2, -3, -4,
+ -1, -2, -3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(20, -12, 20, 60, 20, -12, 20, 60);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dotp_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dotp_s_w() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, 4);
+ #[rustfmt::skip]
+ let b = i16x8::new(-6, -7, -8, -9, -6, -7, -8, -9);
+ #[rustfmt::skip]
+ let r = i32x4::new(20, 60, 20, -12);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dotp_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dotp_s_d() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-1, -2, -3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(-6, -7, -8, -9);
+ #[rustfmt::skip]
+ let r = i64x2::new(20, -12);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dotp_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dotp_u_h() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(20, 60, 20, 60, 20, 60, 20, 60);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dotp_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dotp_u_w() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(20, 60, 20, 60);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dotp_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dotp_u_d() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u64x2::new(20, 60);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dotp_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
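+    // dpadd_* adds the double-width dot product of b and c to the accumulator a.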
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpadd_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, 4);
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -1, -2, -3, 4,
+ -1, -2, -3, -4,
+ -1, -2, -3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let c = i8x16::new(
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(19, -14, 17, 56, 19, -14, 17, 64);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpadd_s_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpadd_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-1, -2, -3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ -1, -2, -3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let c = i16x8::new(
+ -6, -7, -8, -9,
+ -6, -7, -8, -9
+ );
+ #[rustfmt::skip]
+ let r = i32x4::new(19, -14, 17, 56);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpadd_s_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpadd_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, -2);
+ #[rustfmt::skip]
+ let b = i32x4::new(-1, -2, -3, 4);
+ #[rustfmt::skip]
+ let c = i32x4::new(-6, -7, -8, -9);
+ #[rustfmt::skip]
+ let r = i64x2::new(19, -14);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpadd_s_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpadd_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let c = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(21, 62, 23, 64, 21, 62, 23, 64);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpadd_u_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpadd_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let c = u16x8::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u32x4::new(21, 62, 23, 64);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpadd_u_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpadd_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let c = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u64x2::new(21, 62);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpadd_u_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
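+    // dpsub_* subtracts the double-width dot product of b and c from the accumulator a.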
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpsub_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-1, -2, -3, -4, -1, -2, -3, 4);
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -1, -2, -3, 4,
+ -1, -2, -3, -4,
+ -1, -2, -3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let c = i8x16::new(
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(-21, 10, -23, -64, -21, 10, -23, -56);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpsub_s_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpsub_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-1, -2, -3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ -1, -2, -3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let c = i16x8::new(
+ -6, -7, -8, -9,
+ -6, -7, -8, -9
+ );
+ #[rustfmt::skip]
+ let r = i32x4::new(-21, 10, -23, -64);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpsub_s_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpsub_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, -2);
+ #[rustfmt::skip]
+ let b = i32x4::new(-1, -2, -3, 4);
+ #[rustfmt::skip]
+ let c = i32x4::new(-6, -7, -8, -9);
+ #[rustfmt::skip]
+ let r = i64x2::new(-21, 10);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpsub_s_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpsub_u_h() {
+ #[rustfmt::skip]
+        let a = i16x8::new(1, -2, 3, -4, -1, 2, -3, 4);
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let c = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(-19, -62, -17, -64, -21, -58, -23, -56);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpsub_u_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpsub_u_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let c = u16x8::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = i32x4::new(-19, -62, -17, -64);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpsub_u_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_dpsub_u_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, -2);
+ #[rustfmt::skip]
+ let b = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let c = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = i64x2::new(-19, -62);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_dpsub_u_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fadd_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, -4.4);
+ #[rustfmt::skip]
+ let b = f32x4::new(4.4, -3.3, 2.2, -1.1);
+ #[rustfmt::skip]
+ let r = f32x4::new(5.5, -5.5, 5.5, -5.5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fadd_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fadd_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let b = f64x2::new(4.4, -3.3);
+ #[rustfmt::skip]
+ let r = f64x2::new(5.5, -5.5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fadd_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+    // FCAF is a quiet "compare always false": the result is all zeros regardless
+    // of the operands; the only other observable behaviour would be a SIGFPE
+    // signal, which cannot be tested here.
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcaf_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, -4.4);
+ #[rustfmt::skip]
+ let b = f32x4::new(0.0, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, 0, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcaf_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+    // FCAF is a quiet "compare always false": the result is all zeros regardless
+    // of the operands; the only other observable behaviour would be a SIGFPE
+    // signal, which cannot be tested here.
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcaf_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let b = f64x2::new(-2.2, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcaf_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
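+    // The ordered fc* comparisons return an all-ones (-1) lane for true and 0
+    // for false; any NaN operand makes the comparison false.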
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fceq_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(-4.4, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fceq_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fceq_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fceq_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
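+    // fclass_* returns a class bit mask per lane: 128 = positive normal,
+    // 8 = negative normal, 2 = quiet NaN.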
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fclass_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(128, 8, 128, 2);
+
+ assert_eq!(r, mem::transmute(__msa_fclass_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fclass_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let r = i64x2::new(128, 8);
+
+ assert_eq!(r, mem::transmute(__msa_fclass_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcle_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(-4.4, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcle_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcle_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcle_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fclt_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(-4.4, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, -1, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fclt_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fclt_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fclt_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcne_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(-4.4, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcne_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcne_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcne_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
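+    // fcor_* is true only when both operands are ordered (neither is NaN).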
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcor_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcor_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcor_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcor_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcueq_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, 0, -1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcueq_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcueq_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcueq_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcule_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, -1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcule_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcule_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcule_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcult_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, 0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcult_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcult_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcult_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcun_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, 0, 0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcun_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcun_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcun_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcune_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(f32::NAN, -1.2, 3.3, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, 0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcune_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fcune_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(1.1, 1.1);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fcune_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fdiv_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.25, -20.2, 333.333, -425.0);
+ #[rustfmt::skip]
+ let b = f32x4::new(4.0, -2.1, 11.11, 8.2);
+ #[rustfmt::skip]
+ let r = f32x4::new(1.3125, 9.619048, 30.002972, -51.82927);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fdiv_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fdiv_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1111.11, -222222.2);
+ #[rustfmt::skip]
+ let b = f64x2::new(-4.85, 3.33);
+ #[rustfmt::skip]
+ let r = f64x2::new(-229.09484536082473, -66733.3933933934);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fdiv_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ // FIXME: 16-bit floats
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_fexdo_h() {
+ // #[rustfmt::skip]
+ // let a = f32x4::new(20.5, 2.3, 4.5, 5.4);
+ // #[rustfmt::skip]
+ // let b = f32x4::new(1.1, 1.0, 1.0, 1.0);
+ // let r = i16x8::new(1, 9, 30, 51, 1, 9, 30, 51);
+
+ // assert_eq!(r, mem::transmute(__msa_fexdo_h(mem::transmute(a), mem::transmute(b))));
+ // }
+
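+ // As the expected values below suggest, fexdo_w narrows two f64 vectors into
+ // one f32 vector, with b's converted values filling the lower two lanes and
+ // a's the upper two.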
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fexdo_w() {
+ #[rustfmt::skip]
+ let a = f64x2::new(2000005.5, 2.3);
+ #[rustfmt::skip]
+ let b = f64x2::new(1235689784512.1, 2147483649998.5);
+ #[rustfmt::skip]
+ let r = f32x4::new(
+ 1235689800000.0, 2147483600000.0,
+ 2000005.5, 2.3
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fexdo_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
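+ // As the expected values below suggest, fexp2 scales each lane by a power of
+ // two, r = a * 2^b (e.g. 1.1 * 2^4 = 17.6 and -2.2 * 2^-3 = -0.275).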
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fexp2_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, -4.4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, -3, 2, 1);
+ #[rustfmt::skip]
+ let r = f32x4::new(17.6, -0.275, 13.2, -8.8);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fexp2_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fexp2_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let b = i64x2::new(-4, 3);
+ #[rustfmt::skip]
+ let r = f64x2::new(0.06875, -17.6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fexp2_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ // FIXME: 16-bit floats
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_fexupl_w() {
+ // #[rustfmt::skip]
+ // let a = f16x8(1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5);
+ // #[rustfmt::skip]
+ // let r = f32x4::new(5.5, 6.5, 7.5, 8.5);
+
+ // assert_eq!(r, mem::transmute(__msa_fexupl_w(mem::transmute(a))));
+ // }
+
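+ // fexupl_d/fexupr_d widen the left (upper) or right (lower) half of the f32
+ // vector to f64, as the expected values below suggest.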
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fexupl_d() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.5, 6.5, 7.5, 8.5);
+ #[rustfmt::skip]
+ let r = f64x2::new(7.5, 8.5);
+
+ assert_eq!(r, mem::transmute(__msa_fexupl_d(mem::transmute(a))));
+ }
+
+ // FIXME: 16-bit floats
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_fexupr_w() {
+ // #[rustfmt::skip]
+ // let a = f16x8(1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5);
+ // #[rustfmt::skip]
+ // let r = f32x4::new(1.5, 2.5, 3.5, 4.5);
+
+ // assert_eq!(r, mem::transmute(__msa_fexupr_w(mem::transmute(a))));
+ // }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fexupr_d() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.5, 6.5, 7.5, 8.5);
+ #[rustfmt::skip]
+ let r = f64x2::new(5.5, 6.5);
+
+ assert_eq!(r, mem::transmute(__msa_fexupr_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ffint_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(-1, 2, -3, 4);
+ #[rustfmt::skip]
+ let r = f32x4::new(-1.0, 2.0, -3.0, 4.0);
+
+ assert_eq!(r, mem::transmute(__msa_ffint_s_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ffint_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, 2);
+ #[rustfmt::skip]
+ let r = f64x2::new(-1.0, 2.0);
+
+ assert_eq!(r, mem::transmute(__msa_ffint_s_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ffint_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = f32x4::new(1.0, 2.0, 3.0, 4.0);
+
+ assert_eq!(r, mem::transmute(__msa_ffint_u_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ffint_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let r = f64x2::new(1.0, 2.0);
+
+ assert_eq!(r, mem::transmute(__msa_ffint_u_d(mem::transmute(a))));
+ }
+
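+ // The ffql/ffqr tests below convert fixed-point Q15 (_w) or Q31 (_d) values
+ // to float, i.e. they divide by 2^15 or 2^31 (11 / 32768 = 0.00033569336);
+ // ffql reads the left (upper) half of the source vector, ffqr the right half.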
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ffql_w() {
+ #[rustfmt::skip]
+ let a = i16x8::new(11, 25, 33, 47, 11, 25, 33, 47);
+ #[rustfmt::skip]
+ let r = f32x4::new(
+ 0.00033569336, 0.00076293945,
+ 0.0010070801, 0.0014343262
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ffql_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ffql_d() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1111, 2222, 3333, 4444);
+ #[rustfmt::skip]
+ let r = f64x2::new(
+ 0.000001552049070596695,
+ 0.0000020693987607955933
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ffql_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ffqr_w() {
+ #[rustfmt::skip]
+ let a = i16x8::new(12, 26, 34, 48, 11, 25, 33, 47);
+ #[rustfmt::skip]
+ let r = f32x4::new(
+ 0.00036621094, 0.00079345703,
+ 0.0010375977, 0.0014648438
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ffqr_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ffqr_d() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1111, 2555, 3333, 475);
+ #[rustfmt::skip]
+ let r = f64x2::new(
+ 0.0000005173496901988983,
+ 0.0000011897645890712738
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ffqr_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fill_b() {
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 2, 2, 2, 2,
+ 2, 2, 2, 2,
+ 2, 2, 2, 2,
+ 2, 2, 2, 2
+ );
+
+ assert_eq!(r, mem::transmute(__msa_fill_b(2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fill_h() {
+ #[rustfmt::skip]
+ let r = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+
+ assert_eq!(r, mem::transmute(__msa_fill_h(2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fill_w() {
+ #[rustfmt::skip]
+ let r = i32x4::new(2, 2, 2, 2);
+
+ assert_eq!(r, mem::transmute(__msa_fill_w(2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fill_d() {
+ #[rustfmt::skip]
+ let r = i64x2::new(2, 2);
+
+ assert_eq!(r, mem::transmute(__msa_fill_d(2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_flog2_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(8.0, 16.0, 32.0, 64.0);
+ #[rustfmt::skip]
+ let r = f32x4::new(3.0, 4.0, 5.0, 6.0);
+
+ assert_eq!(r, mem::transmute(__msa_flog2_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_flog2_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(8.0, 16.0);
+ #[rustfmt::skip]
+ let r = f64x2::new(3.0, 4.0);
+
+ assert_eq!(r, mem::transmute(__msa_flog2_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmadd_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
+ #[rustfmt::skip]
+ let c = f32x4::new(9.0, 10.0, 11.0, 12.0);
+ #[rustfmt::skip]
+ let r = f32x4::new(46.0, 62.0, 80.0, 100.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmadd_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmadd_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.0, 2.0);
+ #[rustfmt::skip]
+ let b = f64x2::new(3.0, 4.0);
+ #[rustfmt::skip]
+ let c = f64x2::new(5.0, 6.0);
+ #[rustfmt::skip]
+ let r = f64x2::new(16.0, 26.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmadd_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmax_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.0, -6.0, 7.0, 8.0);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.0, -2.0, 3.0, 4.0);
+ #[rustfmt::skip]
+ let r = f32x4::new(5.0, -2.0, 7.0, 8.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmax_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmax_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.0, 4.0);
+ #[rustfmt::skip]
+ let b = f64x2::new(3.0, 2.0);
+ #[rustfmt::skip]
+ let r = f64x2::new(3.0, 4.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmax_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmax_a_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.0, -6.0, -7.0, -8.0);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.0, -2.0, 3.0, 4.0);
+ #[rustfmt::skip]
+ let r = f32x4::new(5.0, -6.0, -7.0, -8.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmax_a_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmax_a_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.0, -4.0);
+ #[rustfmt::skip]
+ let b = f64x2::new(3.0, 2.0);
+ #[rustfmt::skip]
+ let r = f64x2::new(3.0, -4.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmax_a_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmin_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.0, -6.0, 7.0, 8.0);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.0, -2.0, 3.0, 4.0);
+ #[rustfmt::skip]
+ let r = f32x4::new(1.0, -6.0, 3.0, 4.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmin_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmin_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.0, 4.0);
+ #[rustfmt::skip]
+ let b = f64x2::new(3.0, 2.0);
+ #[rustfmt::skip]
+ let r = f64x2::new(1.0, 2.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmin_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmin_a_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.0, -6.0, -7.0, -8.0);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.0, -2.0, 3.0, 4.0);
+ #[rustfmt::skip]
+ let r = f32x4::new(1.0, -2.0, 3.0, 4.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmin_a_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmin_a_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.0, -4.0);
+ #[rustfmt::skip]
+ let b = f64x2::new(3.0, 2.0);
+ #[rustfmt::skip]
+ let r = f64x2::new(1.0, 2.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmin_a_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmsub_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
+ #[rustfmt::skip]
+ let c = f32x4::new(9.0, 10.0, 11.0, 12.0);
+ #[rustfmt::skip]
+ let r = f32x4::new(-44.0, -58.0, -74.0, -92.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmsub_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmsub_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.0, 2.0);
+ #[rustfmt::skip]
+ let b = f64x2::new(3.0, 4.0);
+ #[rustfmt::skip]
+ let c = f64x2::new(5.0, 6.0);
+ #[rustfmt::skip]
+ let r = f64x2::new(-14.0, -22.0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmsub_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmul_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(1.1, -2.2, 3.3, 4.4);
+ #[rustfmt::skip]
+ let b = f32x4::new(4.4, 3.3, 2.2, -1.1);
+ #[rustfmt::skip]
+ let r = f32x4::new(4.84, -7.26, 7.26, -4.84);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmul_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fmul_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(1.1, -2.2);
+ #[rustfmt::skip]
+ let b = f64x2::new(4.0, -3.3);
+ #[rustfmt::skip]
+ let r = f64x2::new(4.4, 7.26);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fmul_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_frint_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(2.6, -2.7, 1.3, -1.7);
+ #[rustfmt::skip]
+ let r = f32x4::new(3.0, -3.0, 1.0, -2.0);
+
+ assert_eq!(r, mem::transmute(__msa_frint_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_frint_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(2.6, 1.3);
+ #[rustfmt::skip]
+ let r = f64x2::new(3.0, 1.0);
+
+ assert_eq!(r, mem::transmute(__msa_frint_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_frcp_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(2.6, -2.7, 1.3, -1.7);
+ #[rustfmt::skip]
+ let r = f32x4::new(
+ 0.3846154, -0.37037036,
+ 0.7692308, -0.58823526
+ );
+
+ assert_eq!(r, mem::transmute(__msa_frcp_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_frcp_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(2.6, 1.3);
+ #[rustfmt::skip]
+ let r = f64x2::new(0.3846153846153846, 0.7692307692307692);
+
+ assert_eq!(r, mem::transmute(__msa_frcp_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_frsqrt_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(2.6, 2.7, 1.3, 1.7);
+ #[rustfmt::skip]
+ let r = f32x4::new(
+ 0.6201737, 0.6085806,
+ 0.87705797, 0.766965
+ );
+
+ assert_eq!(r, mem::transmute(__msa_frsqrt_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_frsqrt_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(2.6, 1.3);
+ #[rustfmt::skip]
+ let r = f64x2::new(0.6201736729460422, 0.8770580193070292);
+
+ assert_eq!(r, mem::transmute(__msa_frsqrt_d(mem::transmute(a))));
+ }
+
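+ // The fs* predicates below are the signaling counterparts of fc*: they use
+ // the same -1/0 encoding but raise IEEE invalid-operation on NaN operands;
+ // fsaf itself compares "always false", hence the all-zero results.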
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsaf_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, 5.5, 5.5, 5.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(-5.5, 5.5, 5.5, 5.5);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, 0, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsaf_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsaf_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-125.5, 5.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(125.5, 3.3);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsaf_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fseq_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, -3.3, f32::NAN, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.5, -3.3, f32::NAN, 1.1);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, -1, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fseq_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fseq_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-125.5, 5.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(125.5, 5.5);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fseq_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsle_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.5, 5.5, 5.5, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(-5.5, 3.3, 5.5, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, 0, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsle_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsle_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-125.5, 5.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(125.5, 3.3);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsle_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fslt_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, 5.5, 5.5, 5.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.5, 3.3, 5.5, 1.1);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, 0, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fslt_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fslt_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-125.5, 5.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(125.5, 3.3);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fslt_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsne_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, 5.5, 5.5, 5.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.5, 3.3, 5.5, 1.1);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, 0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsne_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsne_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-125.5, 5.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(125.5, 5.5);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsne_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsor_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, f32::NAN, 5.5, 5.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.5, 3.3, 5.5, 1.1);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, 0, -1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsor_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsor_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-125.5, 5.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(125.5, f64::NAN);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsor_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsqrt_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(9.0, 81.0, 1089.0, 10000.0);
+ #[rustfmt::skip]
+ let r = f32x4::new(3.0, 9.0, 33.0, 100.0);
+
+ assert_eq!(r, mem::transmute(__msa_fsqrt_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsqrt_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(81.0, 10000.0);
+ #[rustfmt::skip]
+ let r = f64x2::new(9.0, 100.0);
+
+ assert_eq!(r, mem::transmute(__msa_fsqrt_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsub_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.5, 6.5, 7.5, 8.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(1.25, 1.75, 2.25, 2.75);
+ #[rustfmt::skip]
+ let r = f32x4::new(4.25, 4.75, 5.25, 5.75);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsub_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsub_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(555.5, 55.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(4.25, 3.25);
+ #[rustfmt::skip]
+ let r = f64x2::new(551.25, 52.25);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsub_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsueq_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.5, f32::NAN, 5.5, 5.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.5, 5.5, -5.5, 5.5);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, 0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsueq_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsueq_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-5.5, 5.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(5.5, f64::NAN);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsueq_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsule_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.7, 5.8, 5.9, f32::NAN);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.6, 5.9, 5.9, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, -1, -1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsule_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsule_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(5.5, 5.5);
+ #[rustfmt::skip]
+ let b = f64x2::new(5.5, 5.5);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsule_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsult_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.5, 5.5, 5.5, 5.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(5.6, f32::NAN, 2.2, 1.1);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsult_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsult_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(5.5, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(4.4, 3.3);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsult_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsun_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.5, 5.5, f32::NAN, 5.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(4.4, 3.3, 2.2, f32::NAN);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, 0, -1, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsun_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsun_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(5.5, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(4.4, 3.3);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsun_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsune_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(5.5, 5.5, f32::NAN, 5.5);
+ #[rustfmt::skip]
+ let b = f32x4::new(4.4, 3.3, 2.2, 5.5);
+ #[rustfmt::skip]
+ let r = i32x4::new(-1, -1, -1, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsune_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_fsune_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(5.5, f64::NAN);
+ #[rustfmt::skip]
+ let b = f64x2::new(5.5, 3.3);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_fsune_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
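+ // ftint rounds to the nearest integer (ties to even: -5.5 -> -6), whereas
+ // ftrunc further below truncates toward zero (-5.5 -> -5); the _u variants
+ // clamp negative inputs to 0.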
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftint_s_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, 75.6, -1000.7, 1219.3);
+ #[rustfmt::skip]
+ let r = i32x4::new(-6, 76, -1001, 1219);
+
+ assert_eq!(r, mem::transmute(__msa_ftint_s_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftint_s_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-5.5, 25656.4);
+ #[rustfmt::skip]
+ let r = i64x2::new(-6, 25656);
+
+ assert_eq!(r, mem::transmute(__msa_ftint_s_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftint_u_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, 75.6, -1000.7, 1219.3);
+ #[rustfmt::skip]
+ let r = u32x4::new(0, 76, 0, 1219);
+
+ assert_eq!(r, mem::transmute(__msa_ftint_u_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftint_u_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(5.5, -25656.4);
+ #[rustfmt::skip]
+ let r = u64x2::new(6, 0);
+
+ assert_eq!(r, mem::transmute(__msa_ftint_u_d(mem::transmute(a))));
+ }
+
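+ // As the expected values below suggest, ftq converts float to fixed-point
+ // Q15 (_h) or Q31 (_w) with rounding and saturation, b filling the lower
+ // lanes: 0.0001 * 2^15 = 3.2768 -> 3 and -0.002 * 2^15 = -65.536 -> -66.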
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftq_h() {
+ #[rustfmt::skip]
+ let a = f32x4::new(0.00001, 0.0002, 0.00001, -0.0002);
+ #[rustfmt::skip]
+ let b = f32x4::new(0.0001, -0.002, 0.0001, 0.002);
+ #[rustfmt::skip]
+ let r = i16x8::new(3, -66, 3, 66, 0, 7, 0, -7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ftq_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftq_w() {
+ #[rustfmt::skip]
+ let a = f64x2::new(0.00001, -0.0002);
+ #[rustfmt::skip]
+ let b = f64x2::new(0.00000045, 0.000015);
+ #[rustfmt::skip]
+ let r = i32x4::new(966, 32212, 21475, -429497);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ftq_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftrunc_s_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, 75.6, -1000.7, 1219.3);
+ #[rustfmt::skip]
+ let r = i32x4::new(-5, 75, -1000, 1219);
+
+ assert_eq!(r, mem::transmute(__msa_ftrunc_s_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftrunc_s_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(-5.5, 25656.4);
+ #[rustfmt::skip]
+ let r = i64x2::new(-5, 25656);
+
+ assert_eq!(r, mem::transmute(__msa_ftrunc_s_d(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftrunc_u_w() {
+ #[rustfmt::skip]
+ let a = f32x4::new(-5.5, 75.6, -1000.7, 1219.3);
+ #[rustfmt::skip]
+ let r = u32x4::new(0, 75, 0, 1219);
+
+ assert_eq!(r, mem::transmute(__msa_ftrunc_u_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ftrunc_u_d() {
+ #[rustfmt::skip]
+ let a = f64x2::new(5.5, -25656.4);
+ #[rustfmt::skip]
+ let r = u64x2::new(5, 0);
+
+ assert_eq!(r, mem::transmute(__msa_ftrunc_u_d(mem::transmute(a))));
+ }
+
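+ // The hadd tests below exercise horizontal add: each result lane is an
+ // odd-indexed element of a plus the even-indexed element of b, widened to
+ // twice the width (r[0] = a[1] + b[0] = 2 + 4 = 6).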
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hadd_s_h() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(6, 6, 2, -2, 6, 6, 2, -2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hadd_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hadd_s_w() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i32x4::new(6, 6, 2, -2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hadd_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hadd_s_d() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i64x2::new(2, -2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hadd_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hadd_u_h() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(6, 6, 6, 6, 6, 6, 6, 6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hadd_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hadd_u_w() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = u32x4::new(6, 6, 6, 6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hadd_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hadd_u_d() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = u64x2::new(6, 6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hadd_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
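+ // hsub is the horizontal counterpart of hadd: r[i] = a[2*i+1] - b[2*i],
+ // widened (r[0] = a[1] - b[0] = 2 - 4 = -2).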
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hsub_s_h() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(-2, 2, -6, -6, -2, 2, -6, -6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hsub_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hsub_s_w() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i32x4::new(-2, 2, -6, -6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hsub_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hsub_s_d() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i64x2::new(-6, -6);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hsub_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hsub_u_h() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(-2, 2, -2, 2, -2, 2, -2, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hsub_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hsub_u_w() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i32x4::new(-2, 2, -2, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hsub_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_hsub_u_d() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i64x2::new(-2, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_hsub_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
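+ // The ilv* tests below exercise the interleave family; as the expected
+ // values suggest, b supplies the first element of each pair: ilvev pairs the
+ // even-indexed elements (b[0], a[0], b[2], a[2], ...), ilvod the odd-indexed
+ // ones, ilvl interleaves the left (upper) halves and ilvr the right halves.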
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvev_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 4, 1, 2, 3,
+ 4, 1, 2, 3,
+ 4, 1, 2, 3,
+ 4, 1, 2, 3
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvev_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvev_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(4, 1, 2, 3, 4, 1, 2, 3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvev_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvev_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(4, 1, 2, 3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvev_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvev_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(4, 3);
+ #[rustfmt::skip]
+ let r = i64x2::new(4, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvev_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvl_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 16, 15, 14, 13,
+ 12, 11, 10, 9,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 8, 9, 7, 10,
+ 6, 11, 5, 12,
+ 4, 13, 3, 14,
+ 2, 15, 1, 16
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvl_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvl_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(4, 5, 3, 6, 2, 7, 1, 8);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvl_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvl_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(2, 3, 1, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvl_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvl_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(2, 1);
+ #[rustfmt::skip]
+ let r = i64x2::new(1, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvl_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvod_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 16, 15, 14, 13,
+ 12, 11, 10, 9,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 15, 2, 13, 4,
+ 11, 6, 9, 8,
+ 7, 10, 5, 12,
+ 3, 14, 1, 16
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvod_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvod_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(7, 2, 5, 4, 3, 6, 1, 8);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvod_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvod_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(3, 2, 1, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvod_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvod_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(2, 1);
+ #[rustfmt::skip]
+ let r = i64x2::new(1, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvod_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvr_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 16, 15, 14, 13,
+ 12, 11, 10, 9,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 16, 1, 15, 2,
+ 14, 3, 13, 4,
+ 12, 5, 11, 6,
+ 10, 7, 9, 8
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvr_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvr_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 8, 7, 6, 5,
+ 4, 3, 2, 1,
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(8, 1, 7, 2, 6, 3, 5, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvr_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvr_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(4, 1, 3, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvr_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ilvr_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(2, 1);
+ #[rustfmt::skip]
+ let r = i64x2::new(2, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_ilvr_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
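+ // __msa_insert_* writes a scalar into the selected lane, leaving the rest of
+ // the vector unchanged (here lane 12 becomes 5); the insve_* tests further
+ // down instead copy element 0 of a second vector into the selected lane.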
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_insert_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -100, 127, 4, 127,
+ -100, 127, 4, 127,
+ -100, 127, 4, 127,
+ -100, 127, 4, 127
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -100, 127, 4, 127,
+ -100, 127, 4, 127,
+ -100, 127, 4, 127,
+ 5, 127, 4, 127
+ );
+
+ assert_eq!(r, mem::transmute(__msa_insert_b(mem::transmute(a), 12, 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_insert_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 32767, 3276, 100, 11,
+ 32767, 3276, 100, 11
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ 32767, 3276, 100, 11,
+ 5, 3276, 100, 11
+ );
+
+ assert_eq!(r, mem::transmute(__msa_insert_h(mem::transmute(a), 4, 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_insert_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, 2147483647, 5, -2147483647);
+ #[rustfmt::skip]
+ let r = i32x4::new(100, 7, 5, -2147483647);
+
+ assert_eq!(r, mem::transmute(__msa_insert_w(mem::transmute(a), 1, 7)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_insert_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(3, i64::MAX);
+ #[rustfmt::skip]
+ let r = i64x2::new(3, 100);
+
+ assert_eq!(r, mem::transmute(__msa_insert_d(mem::transmute(a), 1, 100)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_insve_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -100, i8::MAX, 4, i8::MAX,
+ -100, i8::MAX, 4, i8::MAX,
+ -100, i8::MAX, 4, i8::MAX,
+ -100, i8::MAX, 4, i8::MAX
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 5, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -100, 127, 4, 127,
+ -100, 127, 4, 127,
+ -100, 127, 4, 127,
+ 5, 127, 4, 127
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_insve_b(mem::transmute(a), 12, mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_insve_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MAX, 3276, 100, 11,
+ i16::MAX, 3276, 100, 11
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ 32767, 3276, 100, 11,
+ 1, 3276, 100, 11
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_insve_h(mem::transmute(a), 4, mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_insve_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, 2147483647, 5, -2147483647);
+ #[rustfmt::skip]
+ let b = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = i32x4::new(100, 2147483647, 5, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_insve_w(mem::transmute(a), 3, mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_insve_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(3, i64::MAX);
+ #[rustfmt::skip]
+ let b = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let r = i64x2::new(3, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_insve_d(mem::transmute(a), 1, mem::transmute(b)))
+ );
+ }
+
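+ // __msa_ld_* loads one vector from p plus an immediate *byte* offset, so for
+ // ld_b with p = &a[4] and offset 9 the load starts at a[13], and for ld_h an
+ // offset of -2 moves back exactly one i16 element.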
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ld_b() {
+ #[rustfmt::skip]
+ let mut a: [i8; 32] = [
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ ];
+ let p = &mut a[4] as *mut _ as *mut u8;
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 13, 14, 15, 16,
+ 17, 18, 19, 20,
+ 21, 22, 23, 24,
+ 25, 26, 27, 28
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ld_b(p, 9)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ld_h() {
+ #[rustfmt::skip]
+ let mut a: [i16; 16] = [
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15
+ ];
+ let p = &mut a[4] as *mut _ as *mut u8;
+ #[rustfmt::skip]
+ let r = i16x8::new(3, 4, 5, 6, 7, 8, 9, 10);
+
+ assert_eq!(r, mem::transmute(__msa_ld_h(p, -2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ld_w() {
+ #[rustfmt::skip]
+ let mut a: [i32; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+ let p = &mut a[3] as *mut _ as *mut u8;
+ #[rustfmt::skip]
+ let r = i32x4::new(2, 3, 4, 5);
+
+ assert_eq!(r, mem::transmute(__msa_ld_w(p, -4)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ld_d() {
+ #[rustfmt::skip]
+ let mut a: [i64; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+ let p = &mut a[4] as *mut _ as *mut u8;
+ #[rustfmt::skip]
+ let r = i64x2::new(0, 1);
+
+ assert_eq!(r, mem::transmute(__msa_ld_d(p, -32)));
+ }
+
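+ // ldi splats an immediate (a signed 10-bit field in the instruction) to
+ // every lane of the destination vector.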
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ldi_b() {
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -20, -20, -20, -20,
+ -20, -20, -20, -20,
+ -20, -20, -20, -20,
+ -20, -20, -20, -20
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ldi_b(-20)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ldi_h() {
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ 255, 255, 255, 255,
+ 255, 255, 255, 255
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ldi_h(255)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ldi_w() {
+ #[rustfmt::skip]
+ let r = i32x4::new(-509, -509, -509, -509);
+
+ assert_eq!(r, mem::transmute(__msa_ldi_w(-509)));
+ }
+
+ // FIXME: https://reviews.llvm.org/D59884
+ // If the target type is i64, a negative immediate loses its sign.
+ // The test passes if 4294967185 is used instead of -111 in vector `r`.
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_ldi_d() {
+ // let r = i64x2::new(-111, -111);
+
+ // assert_eq!(r, mem::transmute(__msa_ldi_d(-111)));
+ // }
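+
+ // (For reference: 4294967185 == 2^32 - 111, i.e. the immediate appears to be
+ // zero-extended from 32 bits; the maxi_s_d and mini_s_d cases further down
+ // hit the same issue.)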
+
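+ // madd_q is a fractional (Q15/Q31) multiply-accumulate with saturation:
+ // r = sat(a + ((b * c) >> 15)) for _h (>> 31 for _w), so 1 + (1024 * 33
+ // >> 15) = 2. The maddr_q variant rounds by adding 2^14 (or 2^30) before
+ // the shift: 1024 + ((1024 * 32767 + 2^14) >> 15) = 2048.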
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_madd_q_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MAX, 1024, i16::MIN, -1024,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 1024, 1024, 1024, 1024,
+ 1024, 1024, 1024, 1024
+ );
+ #[rustfmt::skip]
+ let c = i16x8::new(
+ i16::MAX, i16::MAX, 1, -1,
+ 33, 66, 99, 132
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(32767, 2047, -32768, -1025, 2, 4, 6, 8);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_madd_q_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_madd_q_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MAX, i32::MIN, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(102401, 102401, 102401, 102401);
+ #[rustfmt::skip]
+ let c = i32x4::new(10240, 20480, 30720, 40960);
+ #[rustfmt::skip]
+ let r = i32x4::new(2147483647, -2147483648, 2, 3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_madd_q_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maddr_q_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 32767, 1024, -32768, -1024,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 1024, 1024, 1024, 1024,
+ 1024, 1024, 1024, 1024
+ );
+ #[rustfmt::skip]
+ let c = i16x8::new(
+ 32767, 32767, 32767, 32767,
+ 33, 66, 99, 132
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(32767, 2048, -31744, 0, 2, 4, 6, 8);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_maddr_q_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maddr_q_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MAX, i32::MIN, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(102401, 102401, 102401, 102401);
+ #[rustfmt::skip]
+ let c = i32x4::new(10240, 20480, 30720, 40960);
+ #[rustfmt::skip]
+ let r = i32x4::new(2147483647, -2147483647, 2, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_maddr_q_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maddv_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 5, 6, 7, 8,
+ 5, 6, 7, 8,
+ 5, 6, 7, 8,
+ 5, 6, 7, 8
+ );
+ #[rustfmt::skip]
+ let c = i8x16::new(
+ 9, 10, 11, 12,
+ 9, 10, 11, 12,
+ 9, 10, 11, 12,
+ 9, 10, 11, 12
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 46, 62, 80, 100,
+ 46, 62, 80, 100,
+ 46, 62, 80, 100,
+ 46, 62, 80, 100
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_maddv_b(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maddv_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i16x8::new(5, 6, 7, 8, 5, 6, 7, 8);
+ #[rustfmt::skip]
+ let c = i16x8::new(9, 10, 11, 12, 9, 10, 11, 12);
+ #[rustfmt::skip]
+ let r = i16x8::new(46, 62, 80, 100, 46, 62, 80, 100);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_maddv_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maddv_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(3, 4, 3, 4);
+ #[rustfmt::skip]
+ let c = i32x4::new(5, 6, 5, 6);
+ #[rustfmt::skip]
+ let r = i32x4::new(16, 26, 16, 26);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_maddv_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maddv_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(3, 4);
+ #[rustfmt::skip]
+ let c = i64x2::new(5, 6);
+ #[rustfmt::skip]
+ let r = i64x2::new(16, 26);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_maddv_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
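+ // max_a picks, per lane, the element with the larger absolute value but
+ // returns it with its original sign (so max_a(-4, 9) = 9 and max_a(4, -9)
+ // = -9); min_a further below does the same with the smaller magnitude.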
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_a_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -6, -7, -8, -9,
+ 6, 7, 8, 9,
+ -6, -7, -8, -9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -6, -7, -8, -9,
+ 6, 7, 8, 9,
+ -6, -7, -8, -9,
+ 6, 7, 8, 9
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_a_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_a_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9);
+ #[rustfmt::skip]
+ let r = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_a_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_a_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = i32x4::new(6, 7, 8, 9);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_a_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_a_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(6, -7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_a_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -6, -7, -8, -9,
+ 6, 7, 8, 9,
+ -6, -7, -8, -9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 1, 2, 3, 4,
+ 6, 7, 8, 9,
+ 1, 2, 3, 4,
+ 6, 7, 8, 9
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9);
+ #[rustfmt::skip]
+ let r = i16x8::new(1, 7, 3, 9, 1, 7, 3, 9);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = i32x4::new(6, 7, 8, 9);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(6, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(6, 7, 8, 9);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_max_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = u64x2::new(6, 7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_max_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
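+ // maxi/mini compare each lane against a small (5-bit) immediate broadcast
+ // to the whole vector: signed forms take -16..15 (hence the -16 below),
+ // unsigned forms 0..31.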
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maxi_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, -20, -6, 8,
+ 1, -20, -6, 8,
+ 1, -20, -6, 8,
+ 1, -20, -6, 8
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 1, -16, -6, 8,
+ 1, -16, -6, 8,
+ 1, -16, -6, 8,
+ 1, -16, -6, 8
+ );
+
+ assert_eq!(r, mem::transmute(__msa_maxi_s_b(mem::transmute(a), -16)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maxi_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, 3, -60, -8, 1, 3, -6, -8);
+ #[rustfmt::skip]
+ let r = i16x8::new(15, 15, 15, 15, 15, 15, 15, 15);
+
+ assert_eq!(r, mem::transmute(__msa_maxi_s_h(mem::transmute(a), 15)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maxi_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 3, -6, -8);
+ #[rustfmt::skip]
+ let r = i32x4::new(1, 3, -5, -5);
+
+ assert_eq!(r, mem::transmute(__msa_maxi_s_w(mem::transmute(a), -5)));
+ }
+
+ // FIXME: https://reviews.llvm.org/D59884
+ // If the target type is i64, a negative immediate loses its sign.
+ // The test passes if 4294967293 is used instead of -3 in vector `r`.
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_maxi_s_d() {
+ // #[rustfmt::skip]
+ // let a = i64x2::new(1, -8);
+ // #[rustfmt::skip]
+ // let r = i64x2::new(-3, -3);
+
+ // assert_eq!(r, mem::transmute(__msa_maxi_s_d(mem::transmute(a), -3)));
+ // }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maxi_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 3, 6, 8,
+ 1, 3, 6, 8,
+ 1, 3, 6, 8,
+ 1, 3, 6, 8
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 5, 5, 6, 8,
+ 5, 5, 6, 8,
+ 5, 5, 6, 8,
+ 5, 5, 6, 8
+ );
+
+ assert_eq!(r, mem::transmute(__msa_maxi_u_b(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maxi_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 3, 6, 8, 1, 3, 6, 8);
+ #[rustfmt::skip]
+ let r = u16x8::new(5, 5, 6, 8, 5, 5, 6, 8);
+
+ assert_eq!(r, mem::transmute(__msa_maxi_u_h(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maxi_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 3, 6, 8);
+ #[rustfmt::skip]
+ let r = u32x4::new(5, 5, 6, 8);
+
+ assert_eq!(r, mem::transmute(__msa_maxi_u_w(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_maxi_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 8);
+ #[rustfmt::skip]
+ let r = u64x2::new(5, 8);
+
+ assert_eq!(r, mem::transmute(__msa_maxi_u_d(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_a_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -6, -7, -8, -9,
+ 6, 7, 8, 9,
+ -6, -7, -8, -9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_a_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_a_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9);
+ #[rustfmt::skip]
+ let r = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_a_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_a_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = i32x4::new(1, -2, 3, -4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_a_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_a_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_a_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -6, -7, -8, -9,
+ 6, 7, 8, 9,
+ -6, -7, -8, -9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -6, -7, -8, -9,
+ -1, -2, -3, -4,
+ -6, -7, -8, -9,
+ -1, -2, -3, -4
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9);
+ #[rustfmt::skip]
+ let r = i16x8::new(-6, -2, -8, -4, -6, -2, -8, -4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = i32x4::new(1, -2, 3, -4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(-1, -7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
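+    // __msa_mini_s_* takes the element-wise minimum against a signed 5-bit
+    // immediate (-16..=15).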
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mini_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -10, -10, -10, -10,
+ -10, -10, -10, -10,
+ -10, -10, -10, -10,
+ -10, -10, -10, -10
+ );
+
+ assert_eq!(r, mem::transmute(__msa_mini_s_b(mem::transmute(a), -10)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mini_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4);
+ #[rustfmt::skip]
+ let r = i16x8::new(-3, -3, -3, -4, -3, -3, -3, -4);
+
+ assert_eq!(r, mem::transmute(__msa_mini_s_h(mem::transmute(a), -3)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mini_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let r = i32x4::new(-3, -3, -3, -4);
+
+ assert_eq!(r, mem::transmute(__msa_mini_s_w(mem::transmute(a), -3)));
+ }
+
+    // FIXME: https://reviews.llvm.org/D59884
+    // If the target type is i64, a negative immediate loses its sign:
+    // -3 is represented as 4294967293.
+ // #[simd_test(enable = "msa")]
+ // unsafe fn test_msa_mini_s_d() {
+ // #[rustfmt::skip]
+ // let a = i64x2::new(-3, 2);
+ // #[rustfmt::skip]
+ // let r = i64x2::new(-1, -3);
+
+ // assert_eq!(r, mem::transmute(__msa_mini_s_d(mem::transmute(a), -3)));
+ // }
+
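+    // __msa_min_u_* is the element-wise unsigned minimum.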
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+        let r = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+        let r = u32x4::new(1, 2, 3, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_min_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+        let r = u64x2::new(1, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_min_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
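+    // __msa_mini_u_* takes the element-wise minimum against an unsigned 5-bit
+    // immediate (0..=31).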
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mini_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 3, 6, 8,
+ 1, 3, 6, 8,
+ 1, 3, 6, 8,
+ 1, 3, 6, 8
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 1, 3, 5, 5,
+ 1, 3, 5, 5,
+ 1, 3, 5, 5,
+ 1, 3, 5, 5
+ );
+
+ assert_eq!(r, mem::transmute(__msa_mini_u_b(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mini_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(1, 3, 6, 8, 1, 3, 6, 8);
+ #[rustfmt::skip]
+ let r = u16x8::new(1, 3, 5, 5, 1, 3, 5, 5);
+
+ assert_eq!(r, mem::transmute(__msa_mini_u_h(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mini_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(1, 3, 6, 8);
+ #[rustfmt::skip]
+ let r = u32x4::new(1, 3, 5, 5);
+
+ assert_eq!(r, mem::transmute(__msa_mini_u_w(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mini_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(1, 8);
+ #[rustfmt::skip]
+ let r = u64x2::new(1, 5);
+
+ assert_eq!(r, mem::transmute(__msa_mini_u_d(mem::transmute(a), 5)));
+ }
+
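+    // __msa_mod_s_* computes the signed remainder; like Rust's `%`, the result
+    // takes the sign of the dividend.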
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mod_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -6, -7, -8, -9,
+ 6, 7, 8, 9,
+ -6, -7, -8, -9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 1, 2, 3, 4,
+ -1, -2, -3, -4,
+ 1, 2, 3, 4,
+ -1, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 0, -1, -2, -1,
+ 0, 1, 2, 1,
+ 0, -1, -2, -1,
+ 0, 1, 2, 1
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mod_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mod_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(-6, 7, -8, 9, -6, 7, -8, 9);
+ #[rustfmt::skip]
+ let b = i16x8::new(1, -2, 3, -4, 1, -2, 3, -4);
+ #[rustfmt::skip]
+ let r = i16x8::new(0, 1, -2, 1, 0, 1, -2, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mod_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mod_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let b = i32x4::new(1, -2, 3, -4);
+ #[rustfmt::skip]
+ let r = i32x4::new(0, 1, 2, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mod_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mod_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(6, -7);
+ #[rustfmt::skip]
+ let b = i64x2::new(-1, 2);
+ #[rustfmt::skip]
+ let r = i64x2::new(0, -1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mod_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
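+    // __msa_mod_u_* computes the unsigned remainder.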
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mod_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 0, 1, 2, 1,
+ 0, 1, 2, 1,
+ 0, 1, 2, 1,
+ 0, 1, 2, 1
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mod_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mod_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let b = u16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = u16x8::new(0, 1, 2, 1, 0, 1, 2, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mod_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mod_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let b = u32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = u32x4::new(0, 1, 2, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mod_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mod_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let b = u64x2::new(1, 2);
+ #[rustfmt::skip]
+ let r = u64x2::new(0, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mod_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
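+    // __msa_move_v copies the source vector unchanged.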
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_move_v() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 1, 2, 3, 4,
+ 5, 6, 7, 8
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 1, 2, 3, 4,
+ 5, 6, 7, 8
+ );
+
+ assert_eq!(r, mem::transmute(__msa_move_v(mem::transmute(a))));
+ }
+
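+    // __msa_msub_q_* is a fractional (Q15/Q31 fixed-point) multiply-subtract,
+    // a - b*c, truncating the result; the __msa_msubr_q_* variants below round it.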
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_msub_q_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1024, -1024, 1024, -1024,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 1025, 1025, 1025, 1025,
+ 1025, 1025, 1025, 1025
+ );
+ #[rustfmt::skip]
+ let c = i16x8::new(
+ 1024, 2048, 3072, 4096,
+ 1024, 2048, 3072, 4096
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(991, -1089, 927, -1153, -32, -63, -94, -125);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_msub_q_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_msub_q_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(2147483647, -2147483647, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(10240, 10240, 10240, 10240);
+ #[rustfmt::skip]
+ let c = i32x4::new(10240, 20480, 30720, 40960);
+ #[rustfmt::skip]
+ let r = i32x4::new(2147483646, -2147483648, 0, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_msub_q_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_msubr_q_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1024, -1024, 1024, -1024,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 1025, 1025, 1025, 1025,
+ 1025, 1025, 1025, 1025
+ );
+ #[rustfmt::skip]
+ let c = i16x8::new(
+ 1024, 2048, 3072, 4096,
+ 1024, 2048, 3072, 4096
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(992, -1088, 928, -1152, -31, -62, -93, -124);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_msubr_q_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_msubr_q_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MAX, -2147483647, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(10240, 10240, 10240, 10240);
+ #[rustfmt::skip]
+ let c = i32x4::new(10240, 20480, 30720, 40960);
+ #[rustfmt::skip]
+ let r = i32x4::new(2147483647, -2147483647, 1, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_msubr_q_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
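+    // __msa_msubv_* computes a - b * c with wrapping integer arithmetic.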
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_msubv_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 5, 6, 7, 8,
+ 5, 6, 7, 8,
+ 5, 6, 7, 8,
+ 5, 6, 7, 8
+ );
+ #[rustfmt::skip]
+ let c = i8x16::new(
+ 9, 10, 11, 12,
+ 9, 10, 11, 12,
+ 9, 10, 11, 12,
+ 9, 10, 11, 12
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -44, -58, -74, -92,
+ -44, -58, -74, -92,
+ -44, -58, -74, -92,
+ -44, -58, -74, -92
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_msubv_b(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_msubv_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i16x8::new(5, 6, 7, 8, 5, 6, 7, 8);
+ #[rustfmt::skip]
+ let c = i16x8::new(9, 10, 11, 12, 9, 10, 11, 12);
+ #[rustfmt::skip]
+ let r = i16x8::new(-44, -58, -74, -92, -44, -58, -74, -92);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_msubv_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_msubv_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(3, 4, 3, 4);
+ #[rustfmt::skip]
+ let c = i32x4::new(5, 6, 5, 6);
+ #[rustfmt::skip]
+ let r = i32x4::new(-14, -22, -14, -22);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_msubv_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_msubv_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(3, 4);
+ #[rustfmt::skip]
+ let c = i64x2::new(5, 6);
+ #[rustfmt::skip]
+ let r = i64x2::new(-14, -22);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_msubv_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
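+    // __msa_mul_q_* multiplies Q15/Q31 fixed-point fractions, truncating the
+    // result; __msa_mulr_q_* rounds it instead.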
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mul_q_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 12500, -20, -300, 400,
+ 12500, 20, 300, 400
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 1250, 10240, -7585, 8456,
+ 1250, 10240, -7585, 8456
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(476, -7, 69, 103, 476, 6, -70, 103);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mul_q_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mul_q_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(
+ i32::MAX, i32::MAX,
+ i32::MIN, i32::MIN
+ );
+ #[rustfmt::skip]
+ let b = i32x4::new(30, 60, 30, 60);
+ #[rustfmt::skip]
+ let r = i32x4::new(29, 59, -30, -60);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mul_q_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mulr_q_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 12500, -20, -300, 400,
+ 12500, 20, 300, 400
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 1250, 10240, -7585, 8456,
+ 1250, 10240, -7585, 8456
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(477, -6, 69, 103, 477, 6, -69, 103);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mulr_q_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mulr_q_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(
+ i32::MAX, i32::MAX,
+ i32::MIN, i32::MIN
+ );
+ #[rustfmt::skip]
+ let b = i32x4::new(30, 60, 30, 60);
+ #[rustfmt::skip]
+ let r = i32x4::new(30, 60, -30, -60);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mulr_q_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
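+    // __msa_mulv_* is the wrapping element-wise product.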
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mulv_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 16, 15, 14, 13,
+ 12, 11, 10, 9,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 16, 30, 42, 52,
+ 60, 66, 70, 72,
+ 72, 70, 66, 60,
+ 52, 42, 30, 16
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mulv_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mulv_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(8, 14, 18, 20, 20, 18, 14, 8);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mulv_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mulv_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(4, 6, 6, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mulv_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_mulv_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(2, 1);
+ #[rustfmt::skip]
+ let r = i64x2::new(2, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_mulv_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
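+    // __msa_nloc_* counts the leading one bits in each element.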
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nloc_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, -64, -32, -16,
+ -8, -4, -2, -1,
+ 1, 2, 4, 8,
+ 16, 32, 64, 127
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0
+ );
+
+ assert_eq!(r, mem::transmute(__msa_nloc_b(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nloc_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ -32768, -16384, -8192, -4096,
+ 4096, 8192, 16384, 32767
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(1, 2, 3, 4, 0, 0, 0, 0);
+
+ assert_eq!(r, mem::transmute(__msa_nloc_h(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nloc_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(
+ i32::MIN, -1073741824,
+ 1073741824, i32::MAX
+ );
+ #[rustfmt::skip]
+ let r = i32x4::new(1, 2, 0, 0);
+
+ assert_eq!(r, mem::transmute(__msa_nloc_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nloc_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MIN, i64::MAX);
+ #[rustfmt::skip]
+ let r = i64x2::new(1, 0);
+
+ assert_eq!(r, mem::transmute(__msa_nloc_d(mem::transmute(a))));
+ }
+
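+    // __msa_nlzc_* counts the leading zero bits in each element.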
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nlzc_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 7, 6, 6, 5,
+ 5, 5, 5, 4,
+ 4, 4, 4, 4,
+ 4, 4, 4, 3
+ );
+
+ assert_eq!(r, mem::transmute(__msa_nlzc_b(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nlzc_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(15, 14, 14, 13, 13, 13, 13, 12);
+
+ assert_eq!(r, mem::transmute(__msa_nlzc_h(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nlzc_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = i32x4::new(31, 30, 30, 29);
+
+ assert_eq!(r, mem::transmute(__msa_nlzc_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nlzc_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let r = i64x2::new(63, 62);
+
+ assert_eq!(r, mem::transmute(__msa_nlzc_d(mem::transmute(a))));
+ }
+
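+    // __msa_nor_v and __msa_nori_b compute the bitwise NOR with a vector or an
+    // 8-bit immediate.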
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nor_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 254, 253, 252, 251,
+ 250, 249, 248, 247,
+ 246, 245, 244, 243,
+ 242, 241, 240, 239
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_nor_v(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_nori_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 250, 249, 248, 251,
+ 250, 249, 248, 243,
+ 242, 241, 240, 243,
+ 242, 241, 240, 235
+ );
+
+ assert_eq!(r, mem::transmute(__msa_nori_b(mem::transmute(a), 4)));
+ }
+
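+    // __msa_or_v and __msa_ori_b compute the bitwise OR with a vector or an
+    // 8-bit immediate.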
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_or_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_or_v(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_ori_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 5, 6, 7, 4,
+ 5, 6, 7, 12,
+ 13, 14, 15, 12,
+ 13, 14, 15, 20
+ );
+
+ assert_eq!(r, mem::transmute(__msa_ori_b(mem::transmute(a), 4)));
+ }
+
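+    // __msa_pckev_* packs the even-indexed elements: those of b fill the low
+    // half of the result, those of a the high half.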
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pckev_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 4, 2, 4, 2,
+ 4, 2, 4, 2,
+ 1, 3, 1, 3,
+ 1, 3, 1, 3
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_pckev_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pckev_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i16x8::new(4, 3, 2, 1, 4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(4, 2, 4, 2, 1, 3, 1, 3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_pckev_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pckev_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(4, 2, 1, 3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_pckev_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pckev_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(4, 3);
+ #[rustfmt::skip]
+ let r = i64x2::new(4, 1);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_pckev_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
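+    // __msa_pckod_* packs the odd-indexed elements the same way.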
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pckod_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 3, 1, 3, 1,
+ 3, 1, 3, 1,
+ 2, 4, 2, 4,
+ 2, 4, 2, 4
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_pckod_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pckod_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i16x8::new(4, 3, 2, 1, 4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(3, 1, 3, 1, 2, 4, 2, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_pckod_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pckod_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(3, 1, 2, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_pckod_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pckod_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(4, 3);
+ #[rustfmt::skip]
+ let r = i64x2::new(3, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_pckod_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
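+    // __msa_pcnt_* counts the set bits in each element.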
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pcnt_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, -64, -32, -16,
+ -8, -4, -2, -1,
+ 1, 2, 4, 8,
+ 16, 32, 64, 127
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 1, 1, 1, 1,
+ 1, 1, 1, 7
+ );
+
+ assert_eq!(r, mem::transmute(__msa_pcnt_b(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pcnt_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ -32768, -16384, -8192, -4096,
+ 4096, 8192, 16384, 32767
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(1, 2, 3, 4, 1, 1, 1, 15);
+
+ assert_eq!(r, mem::transmute(__msa_pcnt_h(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pcnt_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(
+ i32::MIN, -1073741824,
+ 1073741824, i32::MAX
+ );
+ #[rustfmt::skip]
+ let r = i32x4::new(1, 2, 1, 31);
+
+ assert_eq!(r, mem::transmute(__msa_pcnt_w(mem::transmute(a))));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_pcnt_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(-2147483648, 2147483647);
+ #[rustfmt::skip]
+ let r = i64x2::new(33, 31);
+
+ assert_eq!(r, mem::transmute(__msa_pcnt_d(mem::transmute(a))));
+ }
+
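+    // __msa_sat_s_* saturates each element to an (n+1)-bit signed range for
+    // immediate n; __msa_sat_u_* saturates to an (n+1)-bit unsigned range.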
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sat_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ i8::MAX, 105, 30, 1,
+ i8::MAX, 105, 30, 1,
+ i8::MAX, 105, 30, 1,
+ i8::MAX, 105, 30, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 3, 3, 3, 1,
+ 3, 3, 3, 1,
+ 3, 3, 3, 1,
+ 3, 3, 3, 1
+ );
+
+ assert_eq!(r, mem::transmute(__msa_sat_s_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sat_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MAX, 1155, 155, 1,
+ i16::MAX, 1155, 155, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(127, 127, 127, 1, 127, 127, 127, 1);
+
+ assert_eq!(r, mem::transmute(__msa_sat_s_h(mem::transmute(a), 7)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sat_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MAX, 111111155, i32::MAX, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(131071, 131071, 131071, 1);
+
+ assert_eq!(r, mem::transmute(__msa_sat_s_w(mem::transmute(a), 17)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sat_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MAX, 1);
+ #[rustfmt::skip]
+ let r = i64x2::new(137438953471, 1);
+
+ assert_eq!(r, mem::transmute(__msa_sat_s_d(mem::transmute(a), 37)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sat_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 105, 30, 1,
+ u8::MAX, 105, 30, 1,
+ u8::MAX, 105, 30, 1,
+ u8::MAX, 105, 30, 1
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 7, 7, 7, 1,
+ 7, 7, 7, 1,
+ 7, 7, 7, 1,
+ 7, 7, 7, 1
+ );
+
+ assert_eq!(r, mem::transmute(__msa_sat_u_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sat_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ u16::MAX, 1155, 155, 1,
+ u16::MAX, 1155, 155, 1
+ );
+ #[rustfmt::skip]
+ let r = u16x8::new(255, 255, 155, 1, 255, 255, 155, 1);
+
+ assert_eq!(r, mem::transmute(__msa_sat_u_h(mem::transmute(a), 7)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sat_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(u32::MAX, 111111155, u32::MAX, 1);
+ #[rustfmt::skip]
+ let r = u32x4::new(262143, 262143, 262143, 1);
+
+ assert_eq!(r, mem::transmute(__msa_sat_u_w(mem::transmute(a), 17)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sat_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(u64::MAX, 1);
+ #[rustfmt::skip]
+ let r = u64x2::new(274877906943, 1);
+
+ assert_eq!(r, mem::transmute(__msa_sat_u_d(mem::transmute(a), 37)));
+ }
+
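+    // __msa_shf_* reorders each group of four elements, using the 8-bit
+    // immediate as four 2-bit source indices.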
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_shf_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 11, 12, 3, 4,
+ 11, 12, 3, 4,
+ 11, 12, 3, 4,
+ 11, 12, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 11, 3, 4, 12,
+ 11, 3, 4, 12,
+ 11, 3, 4, 12,
+ 11, 3, 4, 12
+ );
+
+ assert_eq!(r, mem::transmute(__msa_shf_b(mem::transmute(a), 120)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_shf_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 11, 12, 13, 14,
+ 11, 12, 13, 14
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(11, 14, 12, 13, 11, 14, 12, 13);
+
+ assert_eq!(r, mem::transmute(__msa_shf_h(mem::transmute(a), 156)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_shf_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = i32x4::new(1, 3, 2, 4);
+
+ assert_eq!(r, mem::transmute(__msa_shf_w(mem::transmute(a), 216)));
+ }
+
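+    // __msa_sld_* slides columns of b towards element 0, with replacement
+    // columns taken from a; the slide amount comes from a GPR (the per-format
+    // geometry is described in the MSA reference).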
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sld_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 16, 17, 18, 19,
+ 20, 21, 22, 23,
+ 24, 25, 26, 27,
+ 28, 29, 30, 31
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 21, 22, 23, 24,
+ 25, 26, 27, 28,
+ 29, 30, 31, 0,
+ 1, 2, 3, 4
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sld_b(mem::transmute(a), mem::transmute(b), 5))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sld_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ #[rustfmt::skip]
+ let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+        #[rustfmt::skip]
+        let r = i16x8::new(9, 10, 11, 0, 13, 14, 15, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sld_h(mem::transmute(a), mem::transmute(b), 2))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sld_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(0, 1, 2, 3);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 5, 6, 7);
+ #[rustfmt::skip]
+ let r = i32x4::new(4, 5, 6, 7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sld_w(mem::transmute(a), mem::transmute(b), 4))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sld_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(0, 1);
+ #[rustfmt::skip]
+ let b = i64x2::new(2, 3);
+ #[rustfmt::skip]
+ let r = i64x2::new(2, 3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sld_d(mem::transmute(a), mem::transmute(b), 2))
+ );
+ }
+
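+    // __msa_sldi_* is the same slide with an immediate amount.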
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sldi_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 16, 17, 18, 19,
+ 20, 21, 22, 23,
+ 24, 25, 26, 27,
+ 28, 29, 30, 31
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 21, 22, 23, 24,
+ 25, 26, 27, 28,
+ 29, 30, 31, 0,
+ 1, 2, 3, 4
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sldi_b(mem::transmute(a), mem::transmute(b), 5))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sldi_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+ #[rustfmt::skip]
+ let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+        #[rustfmt::skip]
+        let r = i16x8::new(9, 10, 11, 0, 13, 14, 15, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sldi_h(mem::transmute(a), mem::transmute(b), 2))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sldi_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(0, 1, 2, 3);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 5, 6, 7);
+ #[rustfmt::skip]
+ let r = i32x4::new(4, 5, 6, 7);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sldi_w(mem::transmute(a), mem::transmute(b), 4))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sldi_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(0, 1);
+ #[rustfmt::skip]
+ let b = i64x2::new(2, 3);
+ #[rustfmt::skip]
+ let r = i64x2::new(2, 3);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sldi_d(mem::transmute(a), mem::transmute(b), 2))
+ );
+ }
+
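+    // __msa_sll_* shifts each element left by the corresponding element of b,
+    // taken modulo the element width.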
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sll_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 16, 16, 12, 8,
+ 16, 16, 12, 8,
+ 16, 16, 12, 8,
+ 16, 16, 12, 8
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sll_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sll_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i16x8::new(4, 3, 2, 1, 4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(16, 16, 12, 8, 16, 16, 12, 8);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sll_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sll_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(16, 16, 12, 8);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sll_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sll_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(4, 3);
+ #[rustfmt::skip]
+ let r = i64x2::new(16, 16);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sll_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
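+    // __msa_slli_* shifts each element left by an immediate amount.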
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_slli_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 4, 8, 12, 16,
+ 4, 8, 12, 16,
+ 4, 8, 12, 16,
+ 4, 8, 12, 16
+ );
+
+ assert_eq!(r, mem::transmute(__msa_slli_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_slli_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(4, 8, 12, 16, 4, 8, 12, 16);
+
+ assert_eq!(r, mem::transmute(__msa_slli_h(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_slli_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = i32x4::new(4, 8, 12, 16);
+
+ assert_eq!(r, mem::transmute(__msa_slli_w(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_slli_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let r = i64x2::new(2, 4);
+
+ assert_eq!(r, mem::transmute(__msa_slli_d(mem::transmute(a), 1)));
+ }
+
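+    // __msa_splat_* replicates the element selected by a GPR index, taken
+    // modulo the element count (so index 3 on an i64x2 picks element 1).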
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_splat_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 4, 4, 4, 4,
+ 4, 4, 4, 4,
+ 4, 4, 4, 4,
+ 4, 4, 4, 4
+ );
+
+ assert_eq!(r, mem::transmute(__msa_splat_b(mem::transmute(a), 3)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_splat_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(4, 4, 4, 4, 4, 4, 4, 4);
+
+ assert_eq!(r, mem::transmute(__msa_splat_h(mem::transmute(a), 3)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_splat_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = i32x4::new(4, 4, 4, 4);
+
+ assert_eq!(r, mem::transmute(__msa_splat_w(mem::transmute(a), 3)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_splat_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let r = i64x2::new(2, 2);
+
+ assert_eq!(r, mem::transmute(__msa_splat_d(mem::transmute(a), 3)));
+ }
+
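+    // __msa_splati_* replicates the element selected by an immediate index.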
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_splati_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 3, 3, 3, 3,
+ 3, 3, 3, 3,
+ 3, 3, 3, 3,
+ 3, 3, 3, 3
+ );
+
+ assert_eq!(r, mem::transmute(__msa_splati_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_splati_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+
+ assert_eq!(r, mem::transmute(__msa_splati_h(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_splati_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let r = i32x4::new(3, 3, 3, 3);
+
+ assert_eq!(r, mem::transmute(__msa_splati_w(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_splati_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let r = i64x2::new(2, 2);
+
+ assert_eq!(r, mem::transmute(__msa_splati_d(mem::transmute(a), 1)));
+ }
+
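+    // __msa_sra_* is an arithmetic (sign-extending) right shift by per-element
+    // amounts; __msa_srai_* takes an immediate.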
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sra_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, -64, -32, -16,
+ -8, -4, -2, -1,
+ 1, 2, 4, 8,
+ 16, 32, 64, 127
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 8, 7, 6, 5,
+ 4, 3, 2, 1,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -128, -1, -1, -1,
+ -1, -1, -1, -1,
+ 1, 0, 0, 0,
+ 1, 4, 16, 63
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sra_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sra_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ -32768, -16384, -8192, -4096,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 15, 14, 13, 12,
+ 12, 13, 14, 15
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ -1, -1, -1, -1,
+ 0, 0, 0, 0
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sra_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sra_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MIN, -1073741824, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(16, 15, 16, 15);
+ #[rustfmt::skip]
+ let r = i32x4::new(-32768, -32768, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sra_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_sra_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MIN, i64::MAX);
+ #[rustfmt::skip]
+ let b = i64x2::new(32, 31);
+ #[rustfmt::skip]
+ let r = i64x2::new(-2147483648, 4294967295);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_sra_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srai_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ i8::MAX, 125, 55, 1,
+ i8::MAX, 125, 55, 1,
+ i8::MAX, 125, 55, 1,
+ i8::MAX, 125, 55, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 31, 31, 13, 0,
+ 31, 31, 13, 0,
+ 31, 31, 13, 0,
+ 31, 31, 13, 0
+ );
+
+ assert_eq!(r, mem::transmute(__msa_srai_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srai_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MAX, 125, 55, 1,
+ i16::MAX, 125, 55, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(8191, 31, 13, 0, 8191, 31, 13, 0);
+
+ assert_eq!(r, mem::transmute(__msa_srai_h(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srai_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MAX, 125, 55, 1);
+        #[rustfmt::skip]
+        let r = i32x4::new(536870911, 31, 13, 0);
+
+ assert_eq!(r, mem::transmute(__msa_srai_w(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srai_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MAX, 55);
+ #[rustfmt::skip]
+ let r = i64x2::new(2305843009213693951, 13);
+
+ assert_eq!(r, mem::transmute(__msa_srai_d(mem::transmute(a), 2)));
+ }
+
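+    // __msa_srar_*/__msa_srari_* are arithmetic right shifts that round by
+    // adding back the last bit shifted out.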
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srar_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, -64, -32, -16,
+ -8, -4, -2, -1,
+ 1, 2, 4, 8,
+ 16, 32, 64, 127
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -8, -8, -8, -8,
+ 0, 0, 0, 0,
+ 1, 0, 0, 0,
+ 1, 4, 16, 64
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srar_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srar_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MIN, -16384, -8192, -4096,
+ 150, 50, 25, 15
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 4, 3, 2, 1,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ -2048, -2048, -2048, -2048,
+ 75, 13, 3, 1
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srar_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srar_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MIN, -1073741824, 100, 50);
+ #[rustfmt::skip]
+ let b = i32x4::new(16, 15, 1, 2);
+ #[rustfmt::skip]
+ let r = i32x4::new(-32768, -32768, 50, 13);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srar_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srar_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MIN, i64::MAX);
+ #[rustfmt::skip]
+ let b = i64x2::new(32, 31);
+ #[rustfmt::skip]
+ let r = i64x2::new(-2147483648, 4294967296);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srar_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srari_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 125, i8::MAX, 55, 1,
+ 125, i8::MAX, 55, 1,
+ 125, i8::MAX, 55, 1,
+ 125, i8::MAX, 55, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 31, 32, 14, 0,
+ 31, 32, 14, 0,
+ 31, 32, 14, 0,
+ 31, 32, 14, 0
+ );
+
+ assert_eq!(r, mem::transmute(__msa_srari_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srari_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(2155, 1155, 155, 1, 2155, 1155, 155, 1);
+ #[rustfmt::skip]
+ let r = i16x8::new(539, 289, 39, 0, 539, 289, 39, 0);
+
+ assert_eq!(r, mem::transmute(__msa_srari_h(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srari_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(211111155, 111111155, 11111155, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(52777789, 27777789, 2777789, 0);
+
+ assert_eq!(r, mem::transmute(__msa_srari_w(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srari_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(211111111155, 111111111155);
+ #[rustfmt::skip]
+ let r = i64x2::new(52777777789, 27777777789);
+
+ assert_eq!(r, mem::transmute(__msa_srari_d(mem::transmute(a), 2)));
+ }
+
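+    // __msa_srl_*/__msa_srli_* are logical (zero-filling) right shifts.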
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srl_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, -64, -32, -16,
+ -8, -4, -2, -1,
+ 1, 2, 4, 8,
+ 16, 32, 64, 127
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 8, 7, 6, 5,
+ 4, 3, 2, 1,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -128, 1, 3, 7,
+ 15, 31, 63, 127,
+ 1, 0, 0, 0,
+ 1, 4, 16, 63
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srl_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srl_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ -32768, -16384, -8192, -4096,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 15, 14, 13, 12,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(1, 3, 7, 15, 0, 0, 0, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srl_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srl_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MIN, -1073741824, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(16, 15, 16, 15);
+ #[rustfmt::skip]
+ let r = i32x4::new(32768, 98304, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srl_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srl_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MIN, i64::MAX);
+ #[rustfmt::skip]
+ let b = i64x2::new(32, 31);
+ #[rustfmt::skip]
+ let r = i64x2::new(2147483648, 4294967295);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srl_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srli_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 25, 50, 100, 127,
+ 25, 50, 100, 127,
+ 25, 50, 100, 127,
+ 25, 50, 100, 127
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 6, 12, 25, 31,
+ 6, 12, 25, 31,
+ 6, 12, 25, 31,
+ 6, 12, 25, 31
+ );
+
+ assert_eq!(r, mem::transmute(__msa_srli_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srli_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MAX, 3276, 100, 127,
+ i16::MAX, 3276, 100, 127
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ 8191, 819, 25, 31,
+ 8191, 819, 25, 31
+ );
+
+ assert_eq!(r, mem::transmute(__msa_srli_h(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srli_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, i32::MAX, 100, i32::MAX);
+ #[rustfmt::skip]
+ let r = i32x4::new(25, 536870911, 25, 536870911);
+
+ assert_eq!(r, mem::transmute(__msa_srli_w(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srli_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(100, i64::MAX);
+ #[rustfmt::skip]
+ let r = i64x2::new(50, 4611686018427387903);
+
+ assert_eq!(r, mem::transmute(__msa_srli_d(mem::transmute(a), 1)));
+ }
+
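+    // __msa_srlr_*/__msa_srlri_* are logical right shifts with rounding.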
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srlr_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ -128, -64, -32, -16,
+ -8, -4, -2, -1,
+ 1, 2, 4, 8,
+ 16, 32, 64, 127
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 8, 7, 6, 5,
+ 4, 3, 2, 1,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ -128, 2, 4, 8,
+ 16, 32, 64, -128,
+ 1, 0, 0, 0,
+ 1, 4, 16, 64
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srlr_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srlr_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ -32768, -16384, -8192, -4096,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 15, 14, 13, 12,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(1, 3, 7, 15, 0, 0, 1, 2);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srlr_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srlr_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MIN, -1073741824, 1, 2);
+ #[rustfmt::skip]
+ let b = i32x4::new(16, 15, 16, 15);
+        #[rustfmt::skip]
+        let r = i32x4::new(32768, 98304, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srlr_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srlr_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MIN, i64::MAX);
+ #[rustfmt::skip]
+ let b = i64x2::new(32, 31);
+ #[rustfmt::skip]
+ let r = i64x2::new(2147483648, 4294967296);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_srlr_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srlri_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 25, 50, 100, i8::MAX,
+ 25, 50, 100, i8::MAX,
+ 25, 50, 100, i8::MAX,
+ 25, 50, 100, i8::MAX
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 6, 13, 25, 32,
+ 6, 13, 25, 32,
+ 6, 13, 25, 32,
+ 6, 13, 25, 32
+ );
+
+ assert_eq!(r, mem::transmute(__msa_srlri_b(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srlri_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MAX, 3276, 100, 127,
+ i16::MAX, 3276, 100, 127
+ );
+        #[rustfmt::skip]
+        let r = i16x8::new(8192, 819, 25, 32, 8192, 819, 25, 32);
+
+ assert_eq!(r, mem::transmute(__msa_srlri_h(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srlri_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, 150, 200, i32::MAX);
+ #[rustfmt::skip]
+ let r = i32x4::new(25, 38, 50, 536870912);
+
+ assert_eq!(r, mem::transmute(__msa_srlri_w(mem::transmute(a), 2)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_srlri_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(100, i64::MAX);
+ #[rustfmt::skip]
+ let r = i64x2::new(50, 4611686018427387904);
+
+ assert_eq!(r, mem::transmute(__msa_srlri_d(mem::transmute(a), 1)));
+ }
+
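+    // __msa_st_* stores the vector to memory at the pointer plus an immediate
+    // byte offset.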
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_st_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 13, 14, 15, 16,
+ 17, 18, 19, 20,
+ 21, 22, 23, 24,
+ 25, 26, 27, 28
+ );
+ #[rustfmt::skip]
+        let mut arr: [i8; 16] = [
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0
+ ];
+ #[rustfmt::skip]
+        let r: [i8; 16] = [
+ 13, 14, 15, 16,
+ 17, 18, 19, 20,
+ 21, 22, 23, 24,
+ 25, 26, 27, 28
+ ];
+ __msa_st_b(mem::transmute(a), arr.as_mut_ptr() as *mut u8, 0);
+ assert_eq!(arr, r);
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_st_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(13, 14, 15, 16, 17, 18, 19, 20);
+ let mut arr: [i16; 8] = [0, 0, 0, 0, 0, 0, 0, 0];
+ #[rustfmt::skip]
+        let r: [i16; 8] = [13, 14, 15, 16, 17, 18, 19, 20];
+ __msa_st_h(mem::transmute(a), arr.as_mut_ptr() as *mut u8, 0);
+ assert_eq!(arr, r);
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_st_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(13, 14, 15, 16);
+ let mut arr: [i32; 4] = [0, 0, 0, 0];
+ #[rustfmt::skip]
+        let r: [i32; 4] = [13, 14, 15, 16];
+ __msa_st_w(mem::transmute(a), arr.as_mut_ptr() as *mut u8, 0);
+ assert_eq!(arr, r);
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_st_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(13, 14);
+ let mut arr: [i64; 2] = [0, 0];
+ #[rustfmt::skip]
+        let r: [i64; 2] = [13, 14];
+ __msa_st_d(mem::transmute(a), arr.as_mut_ptr() as *mut u8, 0);
+ assert_eq!(arr, r);
+ }
+
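+    // __msa_subs_s_* is a saturating signed subtraction; __msa_subs_u_*
+    // saturates in the unsigned range.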
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subs_s_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ i8::MIN, -2, -3, -4,
+ i8::MIN, -2, -3, -4,
+ i8::MIN, -2, -3, -4,
+ i8::MIN, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 6, -7, 8, -9,
+ 6, -7, 8, -9,
+ 6, -7, 8, -9,
+ 6, -7, 8, -9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ i8::MIN, 5, -11, 5,
+ i8::MIN, 5, -11, 5,
+ i8::MIN, 5, -11, 5,
+ i8::MIN, 5, -11, 5
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subs_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subs_s_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MIN, -2, -3, -4,
+ i16::MIN, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(6, -7, 8, -9, 6, -7, 8, -9);
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ i16::MIN, 5, -11, 5,
+ i16::MIN, 5, -11, 5
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subs_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subs_s_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MIN, -2, -3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(6, -7, 8, -9);
+ #[rustfmt::skip]
+ let r = i32x4::new(i32::MIN, 5, -11, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subs_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subs_s_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MIN, -2);
+ #[rustfmt::skip]
+ let b = i64x2::new(6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(i64::MIN, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subs_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subs_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9,
+ 6, 7, 8, 9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 249, 0, 0, 0,
+ 249, 0, 0, 0,
+ 249, 0, 0, 0,
+ 249, 0, 0, 0
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subs_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subs_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ u16::MAX, 2, 3, 4,
+ u16::MAX, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 9, 6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u16x8::new(65529, 0, 0, 0, 65529, 0, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subs_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subs_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(u32::MAX, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 9);
+ #[rustfmt::skip]
+ let r = u32x4::new(4294967289, 0, 0, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subs_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subs_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(u64::MAX, 2);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = u64x2::new(18446744073709551609, 0);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subs_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
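+    // __msa_subsus_u_* subtracts a signed vector from an unsigned one,
+    // saturating to the unsigned range.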
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subsus_u_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9,
+ -6, -7, -8, -9
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 255, 9, 11, 13,
+ 255, 9, 11, 13,
+ 255, 9, 11, 13,
+ 255, 9, 11, 13
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subsus_u_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subsus_u_h() {
+ #[rustfmt::skip]
+ let a = u16x8::new(
+ u16::MAX, 2, 3, 4,
+ u16::MAX, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(-6, -7, -8, -9, -6, -7, -8, -9);
+ #[rustfmt::skip]
+ let r = u16x8::new(65535, 9, 11, 13, 65535, 9, 11, 13);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subsus_u_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subsus_u_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(u32::MAX, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(-6, -7, -8, -9);
+ #[rustfmt::skip]
+ let r = u32x4::new(4294967295, 9, 11, 13);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subsus_u_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subsus_u_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(u64::MAX, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(-6, -7);
+ #[rustfmt::skip]
+ let r = u64x2::new(18446744073709551615, 9);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subsus_u_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
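+    // __msa_subsuu_s_* subtracts unsigned vectors and saturates the result to
+    // the signed range.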
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subsuu_s_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4,
+ u8::MAX, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 6, 7, 8, u8::MAX,
+ 6, 7, 8, u8::MAX,
+ 6, 7, 8, u8::MAX,
+ 6, 7, 8, u8::MAX
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 127, -5, -5, -128,
+ 127, -5, -5, -128,
+ 127, -5, -5, -128,
+ 127, -5, -5, -128
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subsuu_s_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subsuu_s_h() {
+ #[rustfmt::skip]
+        let a = u16x8::new(
+            u16::MAX, 2, 3, 4,
+            u16::MAX, 2, 3, 4
+        );
+ #[rustfmt::skip]
+ let b = u16x8::new(6, 7, 8, 65535, 6, 7, 8, 65535);
+ #[rustfmt::skip]
+ let r = i16x8::new(32767, -5, -5, -32768, 32767, -5, -5, -32768);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subsuu_s_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subsuu_s_w() {
+ #[rustfmt::skip]
+ let a = u32x4::new(u32::MAX, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = u32x4::new(6, 7, 8, 4294967295);
+ #[rustfmt::skip]
+ let r = i32x4::new(2147483647, -5, -5, -2147483648);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subsuu_s_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subsuu_s_d() {
+ #[rustfmt::skip]
+ let a = u64x2::new(u64::MAX, 2);
+ #[rustfmt::skip]
+ let b = u64x2::new(6, 7);
+ #[rustfmt::skip]
+ let r = i64x2::new(i64::MAX, -5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subsuu_s_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
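+    // __msa_subv_*/__msa_subvi_* are wrapping subtractions.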
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subv_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ i8::MIN, -2, -3, -4,
+ i8::MIN, -2, -3, -4,
+ i8::MIN, -2, -3, -4,
+ i8::MIN, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 6, -7, 8, -9,
+ 6, -7, 8, -9,
+ 6, -7, 8, -9,
+ 6, -7, 8, -9
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 122, 5, -11, 5,
+ 122, 5, -11, 5,
+ 122, 5, -11, 5,
+ 122, 5, -11, 5
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subv_b(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subv_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MIN, -2, -3, -4,
+ i16::MIN, -2, -3, -4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(6, -7, 8, -9, 6, -7, 8, -9);
+ #[rustfmt::skip]
+ let r = i16x8::new(32762, 5, -11, 5, 32762, 5, -11, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subv_h(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subv_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(i32::MIN, -2, -3, -4);
+ #[rustfmt::skip]
+ let b = i32x4::new(6, -7, 8, -9);
+ #[rustfmt::skip]
+ let r = i32x4::new(2147483642, 5, -11, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subv_w(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subv_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(i64::MAX, -2);
+ #[rustfmt::skip]
+ let b = i64x2::new(6, -7);
+ #[rustfmt::skip]
+ let r = i64x2::new(9223372036854775801, 5);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_subv_d(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subvi_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 100, i8::MAX, 50, i8::MIN,
+ 100, i8::MAX, 50, i8::MIN,
+ 100, i8::MAX, 50, i8::MIN,
+ 100, i8::MAX, 50, i8::MIN
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 95, 122, 45, 123,
+ 95, 122, 45, 123,
+ 95, 122, 45, 123,
+ 95, 122, 45, 123
+ );
+
+ assert_eq!(r, mem::transmute(__msa_subvi_b(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subvi_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ i16::MAX, 3276, -100, i16::MIN,
+ i16::MAX, 3276, -100, i16::MIN
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(
+ 32762, 3271, -105, 32763,
+ 32762, 3271, -105, 32763
+ );
+
+ assert_eq!(r, mem::transmute(__msa_subvi_h(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subvi_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(100, 150, 200, i32::MAX);
+ #[rustfmt::skip]
+ let r = i32x4::new(95, 145, 195, 2147483642);
+
+ assert_eq!(r, mem::transmute(__msa_subvi_w(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_subvi_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(100, i64::MAX);
+ #[rustfmt::skip]
+ let r = i64x2::new(95, 9223372036854775802);
+
+ assert_eq!(r, mem::transmute(__msa_subvi_d(mem::transmute(a), 5)));
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_vshf_b() {
+ #[rustfmt::skip]
+ let a = i8x16::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let c = i8x16::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i8x16::new(
+ 3, 2, 1, 4,
+ 3, 2, 1, 4,
+ 3, 2, 1, 4,
+ 3, 2, 1, 4
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_vshf_b(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_vshf_h() {
+ #[rustfmt::skip]
+ let a = i16x8::new(
+ 1, 2, 3, 4,
+ 1, 2, 3, 4
+ );
+ #[rustfmt::skip]
+ let b = i16x8::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let c = i16x8::new(
+ 4, 3, 2, 1,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = i16x8::new(3, 2, 1, 4, 3, 2, 1, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_vshf_h(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_vshf_w() {
+ #[rustfmt::skip]
+ let a = i32x4::new(1, 2, 3, 4);
+ #[rustfmt::skip]
+ let b = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let c = i32x4::new(4, 3, 2, 1);
+ #[rustfmt::skip]
+ let r = i32x4::new(3, 2, 1, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_vshf_w(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_vshf_d() {
+ #[rustfmt::skip]
+ let a = i64x2::new(1, 2);
+ #[rustfmt::skip]
+ let b = i64x2::new(4, 3);
+ #[rustfmt::skip]
+ let c = i64x2::new(4, 3);
+ #[rustfmt::skip]
+ let r = i64x2::new(3, 4);
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_vshf_d(
+ mem::transmute(a),
+ mem::transmute(b),
+ mem::transmute(c)
+ ))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_xor_v() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let b = u8x16::new(
+ 16, 15, 14, 13,
+ 12, 11, 10, 9,
+ 8, 7, 6, 5,
+ 4, 3, 2, 1
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 17, 13, 13, 9,
+ 9, 13, 13, 1,
+ 1, 13, 13, 9,
+ 9, 13, 13, 17
+ );
+
+ assert_eq!(
+ r,
+ mem::transmute(__msa_xor_v(mem::transmute(a), mem::transmute(b)))
+ );
+ }
+
+ #[simd_test(enable = "msa")]
+ unsafe fn test_msa_xori_b() {
+ #[rustfmt::skip]
+ let a = u8x16::new(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16
+ );
+ #[rustfmt::skip]
+ let r = u8x16::new(
+ 5, 6, 7, 0,
+ 1, 2, 3, 12,
+ 13, 14, 15, 8,
+ 9, 10, 11, 20
+ );
+
+ assert_eq!(r, mem::transmute(__msa_xori_b(mem::transmute(a), 4)));
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/mips/msa/macros.rs b/library/stdarch/crates/core_arch/src/mips/msa/macros.rs
new file mode 100644
index 000000000..de8905840
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/mips/msa/macros.rs
@@ -0,0 +1,31 @@
+//! Utility macros.
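+//!
+//! A minimal usage sketch (illustrative; `demo`, `demo_intrinsic`, and `v16i8`
+//! are hypothetical stand-ins, not items defined here): inside an intrinsic
+//! with a const immediate parameter, one of these macros rejects out-of-range
+//! immediates at compile time:
+//!
+//! ```rust,ignore
+//! pub unsafe fn demo<const IMM_S5: i32>(a: v16i8) -> v16i8 {
+//!     static_assert_imm_s5!(IMM_S5); // compile error unless -16 <= IMM_S5 <= 15
+//!     demo_intrinsic(a, IMM_S5)
+//! }
+//! ```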
+
+macro_rules! static_assert_imm_s5 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, -16, 15>::VALID;
+ };
+}
+
+macro_rules! static_assert_imm_s10 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, -512, 511>::VALID;
+ };
+}
+
+macro_rules! static_assert_imm_s11 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, -1024, 1023>::VALID;
+ };
+}
+
+macro_rules! static_assert_imm_s12 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, -2048, 2047>::VALID;
+ };
+}
+
+macro_rules! static_assert_imm_s13 {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, -4096, 4095>::VALID;
+ };
+}
diff --git a/library/stdarch/crates/core_arch/src/mod.rs b/library/stdarch/crates/core_arch/src/mod.rs
new file mode 100644
index 000000000..20751eeec
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/mod.rs
@@ -0,0 +1,305 @@
+//! `core_arch`
+
+#[macro_use]
+mod macros;
+
+#[cfg(any(target_arch = "arm", target_arch = "aarch64", doc))]
+mod arm_shared;
+
+mod simd;
+
+#[doc = include_str!("core_arch_docs.md")]
+#[stable(feature = "simd_arch", since = "1.27.0")]
+pub mod arch {
+ /// Platform-specific intrinsics for the `x86` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "x86", doc))]
+ #[doc(cfg(target_arch = "x86"))]
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub mod x86 {
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub use crate::core_arch::x86::*;
+ }
+
+ /// Platform-specific intrinsics for the `x86_64` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "x86_64", doc))]
+ #[doc(cfg(target_arch = "x86_64"))]
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub mod x86_64 {
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub use crate::core_arch::x86::*;
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub use crate::core_arch::x86_64::*;
+ }
+
+ /// Platform-specific intrinsics for the `arm` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "arm", doc))]
+ #[doc(cfg(target_arch = "arm"))]
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ pub mod arm {
+ pub use crate::core_arch::arm::*;
+ }
+
+ /// Platform-specific intrinsics for the `aarch64` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "aarch64", doc))]
+ #[doc(cfg(target_arch = "aarch64"))]
+ #[stable(feature = "neon_intrinsics", since = "1.59.0")]
+ pub mod aarch64 {
+ #[stable(feature = "neon_intrinsics", since = "1.59.0")]
+ pub use crate::core_arch::aarch64::*;
+ }
+
+ /// Platform-specific intrinsics for the `riscv32` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "riscv32", doc))]
+ #[doc(cfg(any(target_arch = "riscv32")))]
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ pub mod riscv32 {
+ pub use crate::core_arch::riscv_shared::*;
+ }
+
+ /// Platform-specific intrinsics for the `riscv64` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "riscv64", doc))]
+ #[doc(cfg(any(target_arch = "riscv64")))]
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ pub mod riscv64 {
+ pub use crate::core_arch::riscv64::*;
+ // As of the current specifications (2022-01-05), RISC-V RV64 also supports
+ // all RV32 instructions. Module `riscv_shared` therefore includes the
+ // instructions available on all RISC-V platforms,
+ // i.e. the RISC-V RV32 instructions.
+ pub use crate::core_arch::riscv_shared::*;
+ }
+
+ /// Platform-specific intrinsics for the `wasm32` platform.
+ ///
+ /// This module provides intrinsics specific to the WebAssembly
+ /// architecture. Here you'll find intrinsics specific to WebAssembly that
+ /// aren't otherwise surfaced somewhere in a cross-platform abstraction of
+ /// `std`, and you'll also find functions for leveraging WebAssembly
+ /// proposals such as [atomics] and [simd].
+ ///
+ /// Intrinsics in the `wasm32` module are modeled after the WebAssembly
+ /// instructions that they represent. Most functions are named after the
+ /// instruction they intend to correspond to, and the arguments/results
+ /// correspond to the type signature of the instruction itself. Stable
+ /// WebAssembly instructions are [documented online][instrdoc].
+ ///
+ /// [instrdoc]: https://webassembly.github.io/spec/core/valid/instructions.html
+ ///
+ /// If a proposal is not yet stable in WebAssembly itself then the functions
+ /// within this module may be unstable and require the nightly channel of
+ /// Rust to use. As the proposal itself stabilizes, the intrinsics in this
+ /// module should stabilize as well.
+ ///
+ /// [atomics]: https://github.com/webassembly/threads
+ /// [simd]: https://github.com/webassembly/simd
+ ///
+ /// See the [module documentation](../index.html) for general information
+ /// about the `arch` module and platform intrinsics.
+ ///
+ /// ## Atomics
+ ///
+ /// The [threads proposal][atomics] for WebAssembly adds a number of
+ /// instructions for dealing with multithreaded programs. Most instructions
+ /// added in the [atomics] proposal are exposed in Rust through the
+ /// `std::sync::atomic` module. Some instructions, however, don't have
+ /// direct equivalents in Rust so they're exposed here instead.
+ ///
+ /// Note that the instructions added in the [atomics] proposal can work both
+ /// in a context with a shared wasm memory and without one. These intrinsics
+ /// are always available in the standard library, but you likely won't be
+ /// able to use them too productively unless you recompile the standard
+ /// library (and all your code) with `-Ctarget-feature=+atomics`.
+ ///
+ /// It's also worth pointing out that multi-threaded WebAssembly and its
+ /// story in Rust is still in a somewhat "early days" phase as of the time
+ /// of this writing. Pieces should mostly work but it generally requires a
+ /// good deal of manual setup. At this time it's not as simple as "just call
+ /// `std::thread::spawn`", but it will hopefully get there one day!
+ ///
+ /// ## SIMD
+ ///
+ /// The [simd proposal][simd] for WebAssembly added a new `v128` type for a
+ /// 128-bit SIMD register. It also added a large array of instructions to
+ /// operate on the `v128` type to perform data processing. Using SIMD on
+ /// wasm is intended to feel similar to using it on `x86_64`. For example,
+ /// you'd write a function such as:
+ ///
+ /// ```rust,ignore
+ /// #[cfg(target_arch = "wasm32")]
+ /// #[target_feature(enable = "simd128")]
+ /// unsafe fn uses_simd() {
+ /// use std::arch::wasm32::*;
+ /// // ...
+ /// }
+ /// ```
+ ///
+ /// Unlike `x86_64`, however, WebAssembly does not currently have a way to
+ /// detect at runtime whether SIMD is supported (this is one of the
+ /// motivators for the [conditional sections][condsections] and [feature
+ /// detection] proposals, but those are still in their early days). This means
+ /// that your binary will either use SIMD and run only on engines
+ /// which support SIMD, or it will not use SIMD at all. For compatibility
+ /// the standard library itself does not use any SIMD internally.
+ /// Determining how best to ship your WebAssembly binary with SIMD is
+ /// largely left up to you, as it can be pretty nuanced depending on
+ /// your situation.
+ ///
+ /// [condsections]: https://github.com/webassembly/conditional-sections
+ /// [feature detection]: https://github.com/WebAssembly/feature-detection
+ ///
+ /// To enable SIMD support at compile time you need to do one of two things:
+ ///
+ /// * First you can annotate functions with `#[target_feature(enable =
+ /// "simd128")]`. This causes just that one function to have SIMD support
+ /// available to it, and intrinsics will get inlined as usual in this
+ /// situation.
+ ///
+ /// * Second you can compile your program with `-Ctarget-feature=+simd128`.
+ /// This compilation flag blanket enables SIMD support for your entire
+ /// compilation. Note that this does not include the standard library
+ /// unless you [recompile the standard library][buildstd].
+ ///
+ /// [buildstd]: https://doc.rust-lang.org/nightly/cargo/reference/unstable.html#build-std
+ ///
+ /// If you enable SIMD via either of these routes then you'll have a
+ /// WebAssembly binary that uses SIMD instructions, and you'll need to ship
+ /// that accordingly. Also note that if you call SIMD intrinsics but don't
+ /// enable SIMD via either of these mechanisms, you'll still have SIMD
+ /// generated in your program. This means that to produce a binary without
+ /// SIMD you'll need to avoid both options above and avoid calling any
+ /// intrinsics in this module.
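+ ///
+ /// As a concrete illustration (the exact command line is an example, not
+ /// prescriptive), a whole-compilation build with SIMD enabled might look like:
+ ///
+ /// ```text
+ /// RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown
+ /// ```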
+ #[cfg(any(target_arch = "wasm32", doc))]
+ #[doc(cfg(target_arch = "wasm32"))]
+ #[stable(feature = "simd_wasm32", since = "1.33.0")]
+ pub mod wasm32 {
+ #[stable(feature = "simd_wasm32", since = "1.33.0")]
+ pub use crate::core_arch::wasm32::*;
+ }
+
+ /// Platform-specific intrinsics for the `wasm64` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "wasm64", doc))]
+ #[doc(cfg(target_arch = "wasm64"))]
+ #[unstable(feature = "simd_wasm64", issue = "90599")]
+ pub mod wasm64 {
+ #[unstable(feature = "simd_wasm64", issue = "90599")]
+ pub use crate::core_arch::wasm32::*;
+ }
+
+ /// Platform-specific intrinsics for the `wasm` target family.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_family = "wasm", doc))]
+ #[doc(cfg(target_family = "wasm"))]
+ #[unstable(feature = "simd_wasm64", issue = "90599")]
+ pub mod wasm {
+ #[unstable(feature = "simd_wasm64", issue = "90599")]
+ pub use crate::core_arch::wasm32::*;
+ }
+
+ /// Platform-specific intrinsics for the `mips` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "mips", doc))]
+ #[doc(cfg(target_arch = "mips"))]
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ pub mod mips {
+ pub use crate::core_arch::mips::*;
+ }
+
+ /// Platform-specific intrinsics for the `mips64` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "mips64", doc))]
+ #[doc(cfg(target_arch = "mips64"))]
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ pub mod mips64 {
+ pub use crate::core_arch::mips::*;
+ }
+
+ /// Platform-specific intrinsics for the `PowerPC` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "powerpc", doc))]
+ #[doc(cfg(target_arch = "powerpc"))]
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ pub mod powerpc {
+ pub use crate::core_arch::powerpc::*;
+ }
+
+ /// Platform-specific intrinsics for the `PowerPC64` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "powerpc64", doc))]
+ #[doc(cfg(target_arch = "powerpc64"))]
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ pub mod powerpc64 {
+ pub use crate::core_arch::powerpc64::*;
+ }
+
+ /// Platform-specific intrinsics for the `NVPTX` platform.
+ ///
+ /// See the [module documentation](../index.html) for more details.
+ #[cfg(any(target_arch = "nvptx", target_arch = "nvptx64", doc))]
+ #[doc(cfg(any(target_arch = "nvptx", target_arch = "nvptx64")))]
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ pub mod nvptx {
+ pub use crate::core_arch::nvptx::*;
+ }
+}
+
+mod simd_llvm;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", doc))]
+#[doc(cfg(any(target_arch = "x86", target_arch = "x86_64")))]
+mod x86;
+#[cfg(any(target_arch = "x86_64", doc))]
+#[doc(cfg(target_arch = "x86_64"))]
+mod x86_64;
+
+#[cfg(any(target_arch = "aarch64", doc))]
+#[doc(cfg(target_arch = "aarch64"))]
+mod aarch64;
+#[cfg(any(target_arch = "arm", doc))]
+#[doc(cfg(any(target_arch = "arm")))]
+mod arm;
+
+#[cfg(any(target_arch = "riscv32", target_arch = "riscv64", doc))]
+#[doc(cfg(any(target_arch = "riscv32", target_arch = "riscv64")))]
+mod riscv_shared;
+
+#[cfg(any(target_arch = "riscv64", doc))]
+#[doc(cfg(any(target_arch = "riscv64")))]
+mod riscv64;
+
+#[cfg(any(target_family = "wasm", doc))]
+#[doc(cfg(target_family = "wasm"))]
+mod wasm32;
+
+#[cfg(any(target_arch = "mips", target_arch = "mips64", doc))]
+#[doc(cfg(any(target_arch = "mips", target_arch = "mips64")))]
+mod mips;
+
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64", doc))]
+#[doc(cfg(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+mod powerpc;
+
+#[cfg(any(target_arch = "powerpc64", doc))]
+#[doc(cfg(target_arch = "powerpc64"))]
+mod powerpc64;
+
+#[cfg(any(target_arch = "nvptx", target_arch = "nvptx64", doc))]
+#[doc(cfg(any(target_arch = "nvptx", target_arch = "nvptx64")))]
+mod nvptx;
diff --git a/library/stdarch/crates/core_arch/src/nvptx/mod.rs b/library/stdarch/crates/core_arch/src/nvptx/mod.rs
new file mode 100644
index 000000000..bf6673f47
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/nvptx/mod.rs
@@ -0,0 +1,213 @@
+//! NVPTX intrinsics (experimental)
+//!
+//! These intrinsics form the foundation of the CUDA
+//! programming model.
+//!
+//! The reference is the [CUDA C Programming Guide][cuda_c]. Relevant is also
+//! the [LLVM NVPTX Backend documentation][llvm_docs].
+//!
+//! [cuda_c]:
+//! http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
+//! [llvm_docs]:
+//! https://llvm.org/docs/NVPTXUsage.html
+
+use crate::ffi::c_void;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.nvvm.barrier0"]
+ fn syncthreads() -> ();
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.x"]
+ fn block_dim_x() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.y"]
+ fn block_dim_y() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.z"]
+ fn block_dim_z() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.x"]
+ fn block_idx_x() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.y"]
+ fn block_idx_y() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.z"]
+ fn block_idx_z() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.x"]
+ fn grid_dim_x() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.y"]
+ fn grid_dim_y() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.z"]
+ fn grid_dim_z() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.tid.x"]
+ fn thread_idx_x() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.tid.y"]
+ fn thread_idx_y() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.tid.z"]
+ fn thread_idx_z() -> i32;
+}
+
+/// Synchronizes all threads in the block.
+#[inline]
+pub unsafe fn _syncthreads() -> () {
+ syncthreads()
+}
+
+/// x-th thread-block dimension.
+#[inline]
+pub unsafe fn _block_dim_x() -> i32 {
+ block_dim_x()
+}
+
+/// y-th thread-block dimension.
+#[inline]
+pub unsafe fn _block_dim_y() -> i32 {
+ block_dim_y()
+}
+
+/// z-th thread-block dimension.
+#[inline]
+pub unsafe fn _block_dim_z() -> i32 {
+ block_dim_z()
+}
+
+/// x-th thread-block index.
+#[inline]
+pub unsafe fn _block_idx_x() -> i32 {
+ block_idx_x()
+}
+
+/// y-th thread-block index.
+#[inline]
+pub unsafe fn _block_idx_y() -> i32 {
+ block_idx_y()
+}
+
+/// z-th thread-block index.
+#[inline]
+pub unsafe fn _block_idx_z() -> i32 {
+ block_idx_z()
+}
+
+/// x-th block-grid dimension.
+#[inline]
+pub unsafe fn _grid_dim_x() -> i32 {
+ grid_dim_x()
+}
+
+/// y-th block-grid dimension.
+#[inline]
+pub unsafe fn _grid_dim_y() -> i32 {
+ grid_dim_y()
+}
+
+/// z-th block-grid dimension.
+#[inline]
+pub unsafe fn _grid_dim_z() -> i32 {
+ grid_dim_z()
+}
+
+/// x-th thread index.
+#[inline]
+pub unsafe fn _thread_idx_x() -> i32 {
+ thread_idx_x()
+}
+
+/// y-th thread index.
+#[inline]
+pub unsafe fn _thread_idx_y() -> i32 {
+ thread_idx_y()
+}
+
+/// z-th thread index.
+#[inline]
+pub unsafe fn _thread_idx_z() -> i32 {
+ thread_idx_z()
+}
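+
+// Illustrative sketch (not an NVPTX intrinsic): the accessors above compose
+// into the usual CUDA global-index computation, e.g. for the x dimension:
+//
+//     let i = _block_idx_x() * _block_dim_x() + _thread_idx_x();
+//     // threads with `i < n` each handle element `i` of an n-element buffer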
+
+/// Generates the trap instruction `TRAP`.
+#[inline]
+pub unsafe fn trap() -> ! {
+ crate::intrinsics::abort()
+}
+
+// Basic CUDA syscall declarations.
+extern "C" {
+ /// Print formatted output from a kernel to a host-side output stream.
+ ///
+ /// Syscall arguments:
+ /// * `status`: The status value that is returned by `vprintf`.
+ /// * `format`: A pointer to the format specifier input (uses common `printf` format).
+ /// * `valist`: A pointer to the valist input.
+ ///
+ /// ```rust,ignore
+ /// use core::mem::transmute;
+ ///
+ /// #[repr(C)]
+ /// struct PrintArgs(f32, f32, f32, i32);
+ ///
+ /// let (a, b) = (1.0_f32, 2.0_f32);
+ /// vprintf(
+ ///     "int(%f + %f) = int(%f) = %d\n\0".as_ptr(),
+ ///     transmute(&PrintArgs(a, b, a + b, (a + b) as i32)),
+ /// );
+ /// ```
+ ///
+ /// Sources:
+ /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#formatted-output),
+ /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls).
+ pub fn vprintf(format: *const u8, valist: *const c_void) -> i32;
+
+ /// Allocate memory dynamically from a fixed-size heap in global memory.
+ ///
+ /// The CUDA in-kernel `malloc()` function allocates at least `size` bytes
+ /// from the device heap and returns a pointer to the allocated memory
+ /// or `NULL` if insufficient memory exists to fulfill the request.
+ ///
+ /// The returned pointer is guaranteed to be aligned to a 16-byte boundary.
+ ///
+ /// The memory allocated by a given CUDA thread via `malloc()` remains allocated
+ /// for the lifetime of the CUDA context, or until it is explicitly released
+ /// by a call to `free()`. It can be used by any other CUDA threads
+ /// even from subsequent kernel launches.
+ ///
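+ /// A paired-use sketch (illustrative, kernel-side code):
+ ///
+ /// ```rust,ignore
+ /// let buf = malloc(256) as *mut u8;
+ /// if !buf.is_null() {
+ ///     // ... write up to 256 bytes through the 16-byte-aligned pointer ...
+ ///     free(buf as *mut c_void);
+ /// }
+ /// ```
+ ///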
+ /// Sources:
+ /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations),
+ /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls).
+ // FIXME(denzp): assign `malloc` and `nothrow` attributes.
+ pub fn malloc(size: usize) -> *mut c_void;
+
+ /// Free previously dynamically allocated memory.
+ ///
+ /// The CUDA in-kernel `free()` function deallocates the memory pointed to by `ptr`,
+ /// which must have been returned by a previous call to `malloc()`. If `ptr` is NULL,
+ /// the call to `free()` is ignored.
+ ///
+ /// Any CUDA thread may free memory allocated by another thread, but care should be taken
+ /// to ensure that the same pointer is not freed more than once. Repeated calls to `free()`
+ /// with the same `ptr` have undefined behavior.
+ ///
+ /// Sources:
+ /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations),
+ /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls).
+ // FIXME(denzp): assign `nothrow` attribute.
+ pub fn free(ptr: *mut c_void);
+
+ // Internal declaration of the syscall. The exported variant, `__assert_fail`,
+ // sets the `char_size` parameter to `1` (the size of a single character in bytes).
+ fn __assertfail(
+ message: *const u8,
+ file: *const u8,
+ line: u32,
+ function: *const u8,
+ char_size: usize,
+ );
+}
+
+/// Syscall to be used whenever an assert expression evaluates to `false`.
+///
+/// Syscall arguments:
+/// * `message`: The pointer to the string that should be output.
+/// * `file`: The pointer to the file name string associated with the assert.
+/// * `line`: The line number associated with the assert.
+/// * `function`: The pointer to the function name string associated with the assert.
+///
+/// Source:
+/// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls).
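+///
+/// A call sketch (illustrative; the strings must be NUL-terminated, since the
+/// syscall expects C strings):
+///
+/// ```rust,ignore
+/// __assert_fail(
+///     "index out of bounds\0".as_ptr(),
+///     "kernel.rs\0".as_ptr(),
+///     42,
+///     "my_kernel\0".as_ptr(),
+/// );
+/// ```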
+#[inline]
+pub unsafe fn __assert_fail(message: *const u8, file: *const u8, line: u32, function: *const u8) {
+ __assertfail(message, file, line, function, 1)
+}
diff --git a/library/stdarch/crates/core_arch/src/powerpc/altivec.rs b/library/stdarch/crates/core_arch/src/powerpc/altivec.rs
new file mode 100644
index 000000000..8b2be39dc
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/powerpc/altivec.rs
@@ -0,0 +1,2778 @@
+//! PowerPC AltiVec intrinsics.
+//!
+//! AltiVec is a brand name trademarked by Freescale (previously Motorola) for
+//! the standard `Category:Vector` part of the Power ISA v.2.03 specification.
+//! This Category is also known as VMX (the name used by IBM) and as "Velocity
+//! Engine" (a brand name previously used by Apple).
+//!
+//! The references are: [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA
+//! NVlink)] and [POWER ISA v3.0B (for POWER9)].
+//!
+//! [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA NVlink)]: https://ibm.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u
+//! [POWER ISA v3.0B (for POWER9)]: https://ibm.box.com/s/1hzcwkwf8rbju5h9iyf44wm94amnlcrv
+
+#![allow(non_camel_case_types)]
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+types! {
+ /// PowerPC-specific 128-bit wide vector of sixteen packed `i8`
+ pub struct vector_signed_char(i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8);
+ /// PowerPC-specific 128-bit wide vector of sixteen packed `u8`
+ pub struct vector_unsigned_char(u8, u8, u8, u8, u8, u8, u8, u8,
+ u8, u8, u8, u8, u8, u8, u8, u8);
+
+ /// PowerPC-specific 128-bit wide vector mask of sixteen packed elements
+ pub struct vector_bool_char(i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8);
+ /// PowerPC-specific 128-bit wide vector of eight packed `i16`
+ pub struct vector_signed_short(i16, i16, i16, i16, i16, i16, i16, i16);
+ /// PowerPC-specific 128-bit wide vector of eight packed `u16`
+ pub struct vector_unsigned_short(u16, u16, u16, u16, u16, u16, u16, u16);
+ /// PowerPC-specific 128-bit wide vector mask of eight packed elements
+ pub struct vector_bool_short(i16, i16, i16, i16, i16, i16, i16, i16);
+ // pub struct vector_pixel(???);
+ /// PowerPC-specific 128-bit wide vector of four packed `i32`
+ pub struct vector_signed_int(i32, i32, i32, i32);
+ /// PowerPC-specific 128-bit wide vector of four packed `u32`
+ pub struct vector_unsigned_int(u32, u32, u32, u32);
+ /// PowerPC-specific 128-bit wide vector mask of four packed elements
+ pub struct vector_bool_int(i32, i32, i32, i32);
+ /// PowerPC-specific 128-bit wide vector of four packed `f32`
+ pub struct vector_float(f32, f32, f32, f32);
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.ppc.altivec.vperm"]
+ fn vperm(
+ a: vector_signed_int,
+ b: vector_signed_int,
+ c: vector_unsigned_char,
+ ) -> vector_signed_int;
+ #[link_name = "llvm.ppc.altivec.vmhaddshs"]
+ fn vmhaddshs(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ c: vector_signed_short,
+ ) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vmhraddshs"]
+ fn vmhraddshs(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ c: vector_signed_short,
+ ) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vmsumuhs"]
+ fn vmsumuhs(
+ a: vector_unsigned_short,
+ b: vector_unsigned_short,
+ c: vector_unsigned_int,
+ ) -> vector_unsigned_int;
+ #[link_name = "llvm.ppc.altivec.vmsumshs"]
+ fn vmsumshs(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ c: vector_signed_int,
+ ) -> vector_signed_int;
+ #[link_name = "llvm.ppc.altivec.vmsumubm"]
+ fn vmsumubm(
+ a: vector_unsigned_char,
+ b: vector_unsigned_char,
+ c: vector_unsigned_int,
+ ) -> vector_unsigned_int;
+ #[link_name = "llvm.ppc.altivec.vmsummbm"]
+ fn vmsummbm(
+ a: vector_signed_char,
+ b: vector_unsigned_char,
+ c: vector_signed_int,
+ ) -> vector_signed_int;
+ #[link_name = "llvm.ppc.altivec.vmsumuhm"]
+ fn vmsumuhm(
+ a: vector_unsigned_short,
+ b: vector_unsigned_short,
+ c: vector_unsigned_int,
+ ) -> vector_unsigned_int;
+ #[link_name = "llvm.ppc.altivec.vmsumshm"]
+ fn vmsumshm(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ c: vector_signed_int,
+ ) -> vector_signed_int;
+ #[link_name = "llvm.ppc.altivec.vmaddfp"]
+ fn vmaddfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float;
+ #[link_name = "llvm.ppc.altivec.vnmsubfp"]
+ fn vnmsubfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float;
+ #[link_name = "llvm.ppc.altivec.vsum2sws"]
+ fn vsum2sws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+ #[link_name = "llvm.ppc.altivec.vsum4ubs"]
+ fn vsum4ubs(a: vector_unsigned_char, b: vector_unsigned_int) -> vector_unsigned_int;
+ #[link_name = "llvm.ppc.altivec.vsum4sbs"]
+ fn vsum4sbs(a: vector_signed_char, b: vector_signed_int) -> vector_signed_int;
+ #[link_name = "llvm.ppc.altivec.vsum4shs"]
+ fn vsum4shs(a: vector_signed_short, b: vector_signed_int) -> vector_signed_int;
+ #[link_name = "llvm.ppc.altivec.vmuleub"]
+ fn vmuleub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short;
+ #[link_name = "llvm.ppc.altivec.vmulesb"]
+ fn vmulesb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vmuleuh"]
+ fn vmuleuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int;
+ #[link_name = "llvm.ppc.altivec.vmulesh"]
+ fn vmulesh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int;
+ #[link_name = "llvm.ppc.altivec.vmuloub"]
+ fn vmuloub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_short;
+ #[link_name = "llvm.ppc.altivec.vmulosb"]
+ fn vmulosb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vmulouh"]
+ fn vmulouh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_int;
+ #[link_name = "llvm.ppc.altivec.vmulosh"]
+ fn vmulosh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int;
+
+ #[link_name = "llvm.ppc.altivec.vmaxsb"]
+ fn vmaxsb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
+ #[link_name = "llvm.ppc.altivec.vmaxsh"]
+ fn vmaxsh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vmaxsw"]
+ fn vmaxsw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+
+ #[link_name = "llvm.ppc.altivec.vmaxub"]
+ fn vmaxub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
+ #[link_name = "llvm.ppc.altivec.vmaxuh"]
+ fn vmaxuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
+ #[link_name = "llvm.ppc.altivec.vmaxuw"]
+ fn vmaxuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
+
+ #[link_name = "llvm.ppc.altivec.vminsb"]
+ fn vminsb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
+ #[link_name = "llvm.ppc.altivec.vminsh"]
+ fn vminsh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vminsw"]
+ fn vminsw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+
+ #[link_name = "llvm.ppc.altivec.vminub"]
+ fn vminub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
+ #[link_name = "llvm.ppc.altivec.vminuh"]
+ fn vminuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
+ #[link_name = "llvm.ppc.altivec.vminuw"]
+ fn vminuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
+
+ #[link_name = "llvm.ppc.altivec.vsubsbs"]
+ fn vsubsbs(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
+ #[link_name = "llvm.ppc.altivec.vsubshs"]
+ fn vsubshs(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vsubsws"]
+ fn vsubsws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+
+ #[link_name = "llvm.ppc.altivec.vsububs"]
+ fn vsububs(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
+ #[link_name = "llvm.ppc.altivec.vsubuhs"]
+ fn vsubuhs(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
+ #[link_name = "llvm.ppc.altivec.vsubuws"]
+ fn vsubuws(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
+
+ #[link_name = "llvm.ppc.altivec.vaddcuw"]
+ fn vaddcuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
+
+ #[link_name = "llvm.ppc.altivec.vaddsbs"]
+ fn vaddsbs(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
+ #[link_name = "llvm.ppc.altivec.vaddshs"]
+ fn vaddshs(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vaddsws"]
+ fn vaddsws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+
+ #[link_name = "llvm.ppc.altivec.vaddubs"]
+ fn vaddubs(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
+ #[link_name = "llvm.ppc.altivec.vadduhs"]
+ fn vadduhs(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
+ #[link_name = "llvm.ppc.altivec.vadduws"]
+ fn vadduws(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
+
+ #[link_name = "llvm.ppc.altivec.vavgsb"]
+ fn vavgsb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
+ #[link_name = "llvm.ppc.altivec.vavgsh"]
+ fn vavgsh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
+ #[link_name = "llvm.ppc.altivec.vavgsw"]
+ fn vavgsw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+
+ #[link_name = "llvm.ppc.altivec.vavgub"]
+ fn vavgub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
+ #[link_name = "llvm.ppc.altivec.vavguh"]
+ fn vavguh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
+ #[link_name = "llvm.ppc.altivec.vavguw"]
+ fn vavguw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
+
+ #[link_name = "llvm.ceil.v4f32"]
+ fn vceil(a: vector_float) -> vector_float;
+
+ #[link_name = "llvm.ppc.altivec.vcmpbfp"]
+ fn vcmpbfp(a: vector_float, b: vector_float) -> vector_signed_int;
+
+ #[link_name = "llvm.ppc.altivec.vcmpequb"]
+ fn vcmpequb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_bool_char;
+ #[link_name = "llvm.ppc.altivec.vcmpequh"]
+ fn vcmpequh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_bool_short;
+ #[link_name = "llvm.ppc.altivec.vcmpequw"]
+ fn vcmpequw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int;
+
+ #[link_name = "llvm.ppc.altivec.vcmpgefp"]
+ fn vcmpgefp(a: vector_float, b: vector_float) -> vector_bool_int;
+
+ #[link_name = "llvm.ppc.altivec.vcmpgtub"]
+ fn vcmpgtub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_bool_char;
+ #[link_name = "llvm.ppc.altivec.vcmpgtuh"]
+ fn vcmpgtuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_bool_short;
+ #[link_name = "llvm.ppc.altivec.vcmpgtuw"]
+ fn vcmpgtuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int;
+
+ #[link_name = "llvm.ppc.altivec.vcmpgtsb"]
+ fn vcmpgtsb(a: vector_signed_char, b: vector_signed_char) -> vector_bool_char;
+ #[link_name = "llvm.ppc.altivec.vcmpgtsh"]
+ fn vcmpgtsh(a: vector_signed_short, b: vector_signed_short) -> vector_bool_short;
+ #[link_name = "llvm.ppc.altivec.vcmpgtsw"]
+ fn vcmpgtsw(a: vector_signed_int, b: vector_signed_int) -> vector_bool_int;
+
+ #[link_name = "llvm.ppc.altivec.vexptefp"]
+ fn vexptefp(a: vector_float) -> vector_float;
+
+ #[link_name = "llvm.floor.v4f32"]
+ fn vfloor(a: vector_float) -> vector_float;
+}
+
+macro_rules! s_t_l {
+ (i32x4) => {
+ vector_signed_int
+ };
+ (i16x8) => {
+ vector_signed_short
+ };
+ (i8x16) => {
+ vector_signed_char
+ };
+
+ (u32x4) => {
+ vector_unsigned_int
+ };
+ (u16x8) => {
+ vector_unsigned_short
+ };
+ (u8x16) => {
+ vector_unsigned_char
+ };
+
+ (f32x4) => {
+ vector_float
+ };
+}
+
+macro_rules! t_t_l {
+ (i32) => {
+ vector_signed_int
+ };
+ (i16) => {
+ vector_signed_short
+ };
+ (i8) => {
+ vector_signed_char
+ };
+
+ (u32) => {
+ vector_unsigned_int
+ };
+ (u16) => {
+ vector_unsigned_short
+ };
+ (u8) => {
+ vector_unsigned_char
+ };
+
+ (f32) => {
+ vector_float
+ };
+}
+
+macro_rules! impl_from {
+ ($s: ident) => {
+ impl From<$s> for s_t_l!($s) {
+ fn from (v: $s) -> Self {
+ unsafe {
+ transmute(v)
+ }
+ }
+ }
+ };
+ ($($s: ident),*) => {
+ $(
+ impl_from! { $s }
+ )*
+ };
+}
+
+impl_from! { i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4 }
+
+macro_rules! impl_neg {
+ ($s: ident : $zero: expr) => {
+ impl crate::ops::Neg for s_t_l!($s) {
+ type Output = s_t_l!($s);
+ fn neg(self) -> Self::Output {
+ let zero = $s::splat($zero);
+ unsafe { transmute(simd_sub(zero, transmute(self))) }
+ }
+ }
+ };
+}
+
+impl_neg! { i8x16 : 0 }
+impl_neg! { i16x8 : 0 }
+impl_neg! { i32x4 : 0 }
+impl_neg! { f32x4 : 0f32 }
+
+mod sealed {
+ use super::*;
+
+ macro_rules! test_impl {
+ ($fun:ident ($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $instr:ident]) => {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr($instr))]
+ pub unsafe fn $fun ($($v : $ty),*) -> $r {
+ $call ($($v),*)
+ }
+ };
+ ($fun:ident ($($v:ident : $ty:ty),*) -> $r:ty [$call:ident, $instr_altivec:ident / $instr_vsx:ident]) => {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(all(test, not(target_feature="vsx")), assert_instr($instr_altivec))]
+ #[cfg_attr(all(test, target_feature="vsx"), assert_instr($instr_vsx))]
+ pub unsafe fn $fun ($($v : $ty),*) -> $r {
+ $call ($($v),*)
+ }
+ }
+
+ }
+
+ #[allow(unknown_lints, unused_macro_rules)]
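+ // In the arms below, `~` marks the `vector_bool_*` companion type: a `~`
+ // pattern expands to impls for the self/self, self/bool, and bool/self
+ // operand combinations.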
+ macro_rules! impl_vec_trait {
+ ([$Trait:ident $m:ident] $fun:ident ($a:ty)) => {
+ impl $Trait for $a {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn $m(self) -> Self {
+ $fun(transmute(self))
+ }
+ }
+ };
+ ([$Trait:ident $m:ident] $fun:ident ($a:ty) -> $r:ty) => {
+ impl $Trait for $a {
+ type Result = $r;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn $m(self) -> Self::Result {
+ $fun(transmute(self))
+ }
+ }
+ };
+ ([$Trait:ident $m:ident] 1 ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident, $sf: ident)) => {
+ impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char) -> vector_unsigned_char }
+ impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char) -> vector_signed_char }
+ impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short) -> vector_unsigned_short }
+ impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short) -> vector_signed_short }
+ impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int) -> vector_unsigned_int }
+ impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int) -> vector_signed_int }
+ impl_vec_trait!{ [$Trait $m] $sf (vector_float) -> vector_float }
+ };
+ ([$Trait:ident $m:ident] $fun:ident ($a:ty, $b:ty) -> $r:ty) => {
+ impl $Trait<$b> for $a {
+ type Result = $r;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn $m(self, b: $b) -> Self::Result {
+ $fun(transmute(self), transmute(b))
+ }
+ }
+ };
+ ([$Trait:ident $m:ident] $fun:ident ($a:ty, ~$b:ty) -> $r:ty) => {
+ impl_vec_trait!{ [$Trait $m] $fun ($a, $a) -> $r }
+ impl_vec_trait!{ [$Trait $m] $fun ($a, $b) -> $r }
+ impl_vec_trait!{ [$Trait $m] $fun ($b, $a) -> $r }
+ };
+ ([$Trait:ident $m:ident] ~($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident)) => {
+ impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, ~vector_bool_char) -> vector_unsigned_char }
+ impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, ~vector_bool_char) -> vector_signed_char }
+ impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, ~vector_bool_short) -> vector_unsigned_short }
+ impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, ~vector_bool_short) -> vector_signed_short }
+ impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, ~vector_bool_int) -> vector_unsigned_int }
+ impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, ~vector_bool_int) -> vector_signed_int }
+ };
+ ([$Trait:ident $m:ident] ~($fn:ident)) => {
+ impl_vec_trait!{ [$Trait $m] ~($fn, $fn, $fn, $fn, $fn, $fn) }
+ };
+ ([$Trait:ident $m:ident] 2 ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident)) => {
+ impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char }
+ impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, vector_signed_char) -> vector_signed_char }
+ impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short }
+ impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, vector_signed_short) -> vector_signed_short }
+ impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int }
+ impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, vector_signed_int) -> vector_signed_int }
+ };
+ ([$Trait:ident $m:ident] 2 ($fn:ident)) => {
+ impl_vec_trait!{ [$Trait $m] 2 ($fn, $fn, $fn, $fn, $fn, $fn) }
+ }
+ }
+
+ macro_rules! impl_vec_cmp {
+ ([$Trait:ident $m:ident] ($b:ident, $h:ident, $w:ident)) => {
+ impl_vec_cmp! { [$Trait $m] ($b, $b, $h, $h, $w, $w) }
+ };
+ ([$Trait:ident $m:ident] ($ub:ident, $sb:ident, $uh:ident, $sh:ident, $uw:ident, $sw:ident)) => {
+ impl_vec_trait!{ [$Trait $m] $ub (vector_unsigned_char, vector_unsigned_char) -> vector_bool_char }
+ impl_vec_trait!{ [$Trait $m] $sb (vector_signed_char, vector_signed_char) -> vector_bool_char }
+ impl_vec_trait!{ [$Trait $m] $uh (vector_unsigned_short, vector_unsigned_short) -> vector_bool_short }
+ impl_vec_trait!{ [$Trait $m] $sh (vector_signed_short, vector_signed_short) -> vector_bool_short }
+ impl_vec_trait!{ [$Trait $m] $uw (vector_unsigned_int, vector_unsigned_int) -> vector_bool_int }
+ impl_vec_trait!{ [$Trait $m] $sw (vector_signed_int, vector_signed_int) -> vector_bool_int }
+ }
+ }
+
+ #[inline(always)]
+ unsafe fn load(off: i32, p: *const i8) -> u32x4 {
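+ // `off` is a byte offset from `p`; the dereference below requires the
+ // resulting address to be 16-byte aligned (the alignment of `u32x4`).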
+ let addr = p.offset(off as isize);
+
+ *(addr as *const u32x4)
+ }
+
+ pub trait VectorLd {
+ type Result;
+ unsafe fn vec_ld(self, off: i32) -> Self::Result;
+ }
+
+ macro_rules! impl_vec_ld {
+ ($fun:ident $ty:ident [$instr:ident]) => {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr($instr))]
+ pub unsafe fn $fun(off: i32, p: *const $ty) -> t_t_l!($ty) {
+ transmute(load(off, p as *const i8))
+ }
+
+ impl VectorLd for *const $ty {
+ type Result = t_t_l!($ty);
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_ld(self, off: i32) -> Self::Result {
+ $fun(off, self)
+ }
+ }
+ };
+ ($fun:ident $ty:ident) => {
+ impl_vec_ld! { $fun $ty [lvx] }
+ };
+ }
+
+ impl_vec_ld! { vec_ld_u8 u8 }
+ impl_vec_ld! { vec_ld_i8 i8 }
+
+ impl_vec_ld! { vec_ld_u16 u16 }
+ impl_vec_ld! { vec_ld_i16 i16 }
+
+ impl_vec_ld! { vec_ld_u32 u32 }
+ impl_vec_ld! { vec_ld_i32 i32 }
+
+ impl_vec_ld! { vec_ld_f32 f32 }
+
+ test_impl! { vec_floor(a: vector_float) -> vector_float [ vfloor, vrfim / xvrspim ] }
+
+ test_impl! { vec_vexptefp(a: vector_float) -> vector_float [ vexptefp, vexptefp ] }
+
+ test_impl! { vec_vcmpgtub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_bool_char [ vcmpgtub, vcmpgtub ] }
+ test_impl! { vec_vcmpgtuh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_bool_short [ vcmpgtuh, vcmpgtuh ] }
+ test_impl! { vec_vcmpgtuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int [ vcmpgtuw, vcmpgtuw ] }
+
+ test_impl! { vec_vcmpgtsb(a: vector_signed_char, b: vector_signed_char) -> vector_bool_char [ vcmpgtsb, vcmpgtsb ] }
+ test_impl! { vec_vcmpgtsh(a: vector_signed_short, b: vector_signed_short) -> vector_bool_short [ vcmpgtsh, vcmpgtsh ] }
+ test_impl! { vec_vcmpgtsw(a: vector_signed_int, b: vector_signed_int) -> vector_bool_int [ vcmpgtsw, vcmpgtsw ] }
+
+ pub trait VectorCmpGt<Other> {
+ type Result;
+ unsafe fn vec_cmpgt(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_cmp! { [VectorCmpGt vec_cmpgt] ( vec_vcmpgtub, vec_vcmpgtsb, vec_vcmpgtuh, vec_vcmpgtsh, vec_vcmpgtuw, vec_vcmpgtsw ) }
+
+ test_impl! { vec_vcmpgefp(a: vector_float, b: vector_float) -> vector_bool_int [ vcmpgefp, vcmpgefp ] }
+
+ test_impl! { vec_vcmpequb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_bool_char [ vcmpequb, vcmpequb ] }
+ test_impl! { vec_vcmpequh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_bool_short [ vcmpequh, vcmpequh ] }
+ test_impl! { vec_vcmpequw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int [ vcmpequw, vcmpequw ] }
+
+ pub trait VectorCmpEq<Other> {
+ type Result;
+ unsafe fn vec_cmpeq(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_cmp! { [VectorCmpEq vec_cmpeq] (vec_vcmpequb, vec_vcmpequh, vec_vcmpequw) }
+
+ test_impl! { vec_vcmpbfp(a: vector_float, b: vector_float) -> vector_signed_int [vcmpbfp, vcmpbfp] }
+
+ test_impl! { vec_vceil(a: vector_float) -> vector_float [vceil, vrfip / xvrspip ] }
+
+ test_impl! { vec_vavgsb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ vavgsb, vavgsb ] }
+ test_impl! { vec_vavgsh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [ vavgsh, vavgsh ] }
+ test_impl! { vec_vavgsw(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [ vavgsw, vavgsw ] }
+ test_impl! { vec_vavgub(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [ vavgub, vavgub ] }
+ test_impl! { vec_vavguh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [ vavguh, vavguh ] }
+ test_impl! { vec_vavguw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [ vavguw, vavguw ] }
+
+ pub trait VectorAvg<Other> {
+ type Result;
+ unsafe fn vec_avg(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_trait! { [VectorAvg vec_avg] 2 (vec_vavgub, vec_vavgsb, vec_vavguh, vec_vavgsh, vec_vavguw, vec_vavgsw) }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(all(test, not(target_feature = "vsx")), assert_instr(vandc))]
+ #[cfg_attr(all(test, target_feature = "vsx"), assert_instr(xxlandc))]
+ unsafe fn andc(a: u8x16, b: u8x16) -> u8x16 {
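+ // Computes `a & !b`: complement `b` by XOR-ing it with all-ones, then AND with `a`.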
+ simd_and(simd_xor(u8x16::splat(0xff), b), a)
+ }
+
+ pub trait VectorAndc<Other> {
+ type Result;
+ unsafe fn vec_andc(self, b: Other) -> Self::Result;
+ }
+
+ macro_rules! impl_vec_andc {
+ (($a:ty, $b:ty) -> $r:ty) => {
+ impl VectorAndc<$b> for $a {
+ type Result = $r;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_andc(self, b: $b) -> Self::Result {
+ transmute(andc(transmute(self), transmute(b)))
+ }
+ }
+ };
+ (($a:ty, ~$b:ty) -> $r:ty) => {
+ impl_vec_andc! { ($a, $a) -> $r }
+ impl_vec_andc! { ($a, $b) -> $r }
+ impl_vec_andc! { ($b, $a) -> $r }
+ };
+ }
+
+ impl_vec_andc! { (vector_unsigned_char, ~vector_bool_char) -> vector_unsigned_char }
+ impl_vec_andc! { (vector_signed_char, ~vector_bool_char) -> vector_signed_char }
+ impl_vec_andc! { (vector_unsigned_short, ~vector_bool_short) -> vector_unsigned_short }
+ impl_vec_andc! { (vector_signed_short, ~vector_bool_short) -> vector_signed_short }
+ impl_vec_andc! { (vector_unsigned_int, ~vector_bool_int) -> vector_unsigned_int }
+ impl_vec_andc! { (vector_signed_int, ~vector_bool_int) -> vector_signed_int }
+
+ test_impl! { vec_vand(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ simd_and, vand / xxland ] }
+
+ pub trait VectorAnd<Other> {
+ type Result;
+ unsafe fn vec_and(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_trait! { [VectorAnd vec_and] ~(simd_and) }
+
+ test_impl! { vec_vaddsbs(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ vaddsbs, vaddsbs ] }
+ test_impl! { vec_vaddshs(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [ vaddshs, vaddshs ] }
+ test_impl! { vec_vaddsws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [ vaddsws, vaddsws ] }
+ test_impl! { vec_vaddubs(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [ vaddubs, vaddubs ] }
+ test_impl! { vec_vadduhs(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [ vadduhs, vadduhs ] }
+ test_impl! { vec_vadduws(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [ vadduws, vadduws ] }
+
+ pub trait VectorAdds<Other> {
+ type Result;
+ unsafe fn vec_adds(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_trait! { [VectorAdds vec_adds] ~(vaddubs, vaddsbs, vadduhs, vaddshs, vadduws, vaddsws) }
+
+ test_impl! { vec_vaddcuw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vaddcuw, vaddcuw] }
+
+ test_impl! { vec_vsubsbs(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [ vsubsbs, vsubsbs ] }
+ test_impl! { vec_vsubshs(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [ vsubshs, vsubshs ] }
+ test_impl! { vec_vsubsws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [ vsubsws, vsubsws ] }
+ test_impl! { vec_vsububs(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [ vsububs, vsububs ] }
+ test_impl! { vec_vsubuhs(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [ vsubuhs, vsubuhs ] }
+ test_impl! { vec_vsubuws(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [ vsubuws, vsubuws ] }
+
+ pub trait VectorSubs<Other> {
+ type Result;
+ unsafe fn vec_subs(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_trait! { [VectorSubs vec_subs] ~(vsububs, vsubsbs, vsubuhs, vsubshs, vsubuws, vsubsws) }
+
+ pub trait VectorAbs {
+ unsafe fn vec_abs(self) -> Self;
+ }
+
+ macro_rules! impl_abs {
+ ($name:ident, $ty: ident) => {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn $name(v: s_t_l!($ty)) -> s_t_l!($ty) {
+ v.vec_max(-v)
+ }
+
+ impl_vec_trait! { [VectorAbs vec_abs] $name (s_t_l!($ty)) }
+ };
+ }
+
+ impl_abs! { vec_abs_i8, i8x16 }
+ impl_abs! { vec_abs_i16, i16x8 }
+ impl_abs! { vec_abs_i32, i32x4 }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_abs_f32(v: vector_float) -> vector_float {
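+ // Clear the IEEE 754 sign bit of every lane; this is a branch-free `fabs`.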
+ let v: u32x4 = transmute(v);
+
+ transmute(simd_and(v, u32x4::splat(0x7FFFFFFF)))
+ }
+
+ impl_vec_trait! { [VectorAbs vec_abs] vec_abs_f32 (vector_float) }
+
+ pub trait VectorAbss {
+ unsafe fn vec_abss(self) -> Self;
+ }
+
+ macro_rules! impl_abss {
+ ($name:ident, $ty: ident) => {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn $name(v: s_t_l!($ty)) -> s_t_l!($ty) {
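+ // Saturating abs: `max(v, 0 -sat v)`, so `MIN` maps to `MAX` instead of
+ // wrapping back to `MIN`.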
+ let zero: s_t_l!($ty) = transmute(0u8.vec_splats());
+ v.vec_max(zero.vec_subs(v))
+ }
+
+ impl_vec_trait! { [VectorAbss vec_abss] $name (s_t_l!($ty)) }
+ };
+ }
+
+ impl_abss! { vec_abss_i8, i8x16 }
+ impl_abss! { vec_abss_i16, i16x8 }
+ impl_abss! { vec_abss_i32, i32x4 }
+
+ macro_rules! splats {
+ ($name:ident, $v:ident, $r:ident) => {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn $name(v: $v) -> s_t_l!($r) {
+ transmute($r::splat(v))
+ }
+ };
+ }
+
+ splats! { splats_u8, u8, u8x16 }
+ splats! { splats_u16, u16, u16x8 }
+ splats! { splats_u32, u32, u32x4 }
+ splats! { splats_i8, i8, i8x16 }
+ splats! { splats_i16, i16, i16x8 }
+ splats! { splats_i32, i32, i32x4 }
+ splats! { splats_f32, f32, f32x4 }
+
+ test_impl! { vec_splats_u8 (v: u8) -> vector_unsigned_char [splats_u8, vspltb] }
+ test_impl! { vec_splats_u16 (v: u16) -> vector_unsigned_short [splats_u16, vsplth] }
+ test_impl! { vec_splats_u32 (v: u32) -> vector_unsigned_int [splats_u32, vspltw / xxspltw] }
+ test_impl! { vec_splats_i8 (v: i8) -> vector_signed_char [splats_i8, vspltb] }
+ test_impl! { vec_splats_i16 (v: i16) -> vector_signed_short [splats_i16, vsplth] }
+ test_impl! { vec_splats_i32 (v: i32) -> vector_signed_int [splats_i32, vspltw / xxspltw] }
+ test_impl! { vec_splats_f32 (v: f32) -> vector_float [splats_f32, vspltw / xxspltw] }
+
+ pub trait VectorSplats {
+ type Result;
+ unsafe fn vec_splats(self) -> Self::Result;
+ }
+
+ macro_rules! impl_vec_splats {
+ ($(($fn:ident ($ty:ty) -> $r:ty)),*) => {
+ $(
+ impl_vec_trait!{ [VectorSplats vec_splats] $fn ($ty) -> $r }
+ )*
+ }
+ }
+
+ impl_vec_splats! {
+ (vec_splats_u8 (u8) -> vector_unsigned_char),
+ (vec_splats_i8 (i8) -> vector_signed_char),
+ (vec_splats_u16 (u16) -> vector_unsigned_short),
+ (vec_splats_i16 (i16) -> vector_signed_short),
+ (vec_splats_u32 (u32) -> vector_unsigned_int),
+ (vec_splats_i32 (i32) -> vector_signed_int),
+ (vec_splats_f32 (f32) -> vector_float)
+ }
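+
+ // Usage sketch (illustrative): `0u8.vec_splats()` yields a
+ // `vector_unsigned_char` with all lanes zero; `impl_abss!` above transmutes
+ // exactly such a splat into its zero vector.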
+
+ test_impl! { vec_vsububm (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [simd_sub, vsububm] }
+ test_impl! { vec_vsubuhm (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [simd_sub, vsubuhm] }
+ test_impl! { vec_vsubuwm (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [simd_sub, vsubuwm] }
+
+ pub trait VectorSub<Other> {
+ type Result;
+ unsafe fn vec_sub(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_trait! { [VectorSub vec_sub] ~(simd_sub, simd_sub, simd_sub, simd_sub, simd_sub, simd_sub) }
+ impl_vec_trait! { [VectorSub vec_sub] simd_sub(vector_float, vector_float) -> vector_float }
+
+ test_impl! { vec_vminsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vminsb, vminsb] }
+ test_impl! { vec_vminsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vminsh, vminsh] }
+ test_impl! { vec_vminsw (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vminsw, vminsw] }
+
+ test_impl! { vec_vminub (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vminub, vminub] }
+ test_impl! { vec_vminuh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vminuh, vminuh] }
+ test_impl! { vec_vminuw (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vminuw, vminuw] }
+
+ pub trait VectorMin<Other> {
+ type Result;
+ unsafe fn vec_min(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_trait! { [VectorMin vec_min] ~(vminub, vminsb, vminuh, vminsh, vminuw, vminsw) }
+
+ test_impl! { vec_vmaxsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vmaxsb, vmaxsb] }
+ test_impl! { vec_vmaxsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vmaxsh, vmaxsh] }
+ test_impl! { vec_vmaxsw (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vmaxsw, vmaxsw] }
+
+ test_impl! { vec_vmaxub (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vmaxub, vmaxub] }
+ test_impl! { vec_vmaxuh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vmaxuh, vmaxuh] }
+ test_impl! { vec_vmaxuw (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vmaxuw, vmaxuw] }
+
+ pub trait VectorMax<Other> {
+ type Result;
+ unsafe fn vec_max(self, b: Other) -> Self::Result;
+ }
+
+ impl_vec_trait! { [VectorMax vec_max] ~(vmaxub, vmaxsb, vmaxuh, vmaxsh, vmaxuw, vmaxsw) }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmuleub))]
+ unsafe fn vec_vmuleub(
+ a: vector_unsigned_char,
+ b: vector_unsigned_char,
+ ) -> vector_unsigned_short {
+ vmuleub(a, b)
+ }
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmulesb))]
+ unsafe fn vec_vmulesb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short {
+ vmulesb(a, b)
+ }
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmuleuh))]
+ unsafe fn vec_vmuleuh(
+ a: vector_unsigned_short,
+ b: vector_unsigned_short,
+ ) -> vector_unsigned_int {
+ vmuleuh(a, b)
+ }
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmulesh))]
+ unsafe fn vec_vmulesh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int {
+ vmulesh(a, b)
+ }
+
+ pub trait VectorMule<Result> {
+ unsafe fn vec_mule(self, b: Self) -> Result;
+ }
+
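+ // `vec_mule` multiplies the even-indexed lanes of its inputs and widens each
+ // product to the next lane size; `vec_mulo` below does the same for the
+ // odd-indexed lanes.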
+ impl VectorMule<vector_unsigned_short> for vector_unsigned_char {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mule(self, b: Self) -> vector_unsigned_short {
+ vmuleub(self, b)
+ }
+ }
+ impl VectorMule<vector_signed_short> for vector_signed_char {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mule(self, b: Self) -> vector_signed_short {
+ vmulesb(self, b)
+ }
+ }
+ impl VectorMule<vector_unsigned_int> for vector_unsigned_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mule(self, b: Self) -> vector_unsigned_int {
+ vmuleuh(self, b)
+ }
+ }
+ impl VectorMule<vector_signed_int> for vector_signed_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mule(self, b: Self) -> vector_signed_int {
+ vmulesh(self, b)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmuloub))]
+ unsafe fn vec_vmuloub(
+ a: vector_unsigned_char,
+ b: vector_unsigned_char,
+ ) -> vector_unsigned_short {
+ vmuloub(a, b)
+ }
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmulosb))]
+ unsafe fn vec_vmulosb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_short {
+ vmulosb(a, b)
+ }
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmulouh))]
+ unsafe fn vec_vmulouh(
+ a: vector_unsigned_short,
+ b: vector_unsigned_short,
+ ) -> vector_unsigned_int {
+ vmulouh(a, b)
+ }
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmulosh))]
+ unsafe fn vec_vmulosh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_int {
+ vmulosh(a, b)
+ }
+
+ pub trait VectorMulo<Result> {
+ unsafe fn vec_mulo(self, b: Self) -> Result;
+ }
+
+ impl VectorMulo<vector_unsigned_short> for vector_unsigned_char {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mulo(self, b: Self) -> vector_unsigned_short {
+ vmuloub(self, b)
+ }
+ }
+ impl VectorMulo<vector_signed_short> for vector_signed_char {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mulo(self, b: Self) -> vector_signed_short {
+ vmulosb(self, b)
+ }
+ }
+ impl VectorMulo<vector_unsigned_int> for vector_unsigned_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mulo(self, b: Self) -> vector_unsigned_int {
+ vmulouh(self, b)
+ }
+ }
+ impl VectorMulo<vector_signed_int> for vector_signed_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mulo(self, b: Self) -> vector_signed_int {
+ vmulosh(self, b)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vsum4ubs))]
+ unsafe fn vec_vsum4ubs(a: vector_unsigned_char, b: vector_unsigned_int) -> vector_unsigned_int {
+ vsum4ubs(a, b)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vsum4sbs))]
+ unsafe fn vec_vsum4sbs(a: vector_signed_char, b: vector_signed_int) -> vector_signed_int {
+ vsum4sbs(a, b)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vsum4shs))]
+ unsafe fn vec_vsum4shs(a: vector_signed_short, b: vector_signed_int) -> vector_signed_int {
+ vsum4shs(a, b)
+ }
+
+ pub trait VectorSum4s<Other> {
+ unsafe fn vec_sum4s(self, b: Other) -> Other;
+ }
+
+ impl VectorSum4s<vector_unsigned_int> for vector_unsigned_char {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_sum4s(self, b: vector_unsigned_int) -> vector_unsigned_int {
+ vsum4ubs(self, b)
+ }
+ }
+
+ impl VectorSum4s<vector_signed_int> for vector_signed_char {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_sum4s(self, b: vector_signed_int) -> vector_signed_int {
+ vsum4sbs(self, b)
+ }
+ }
+
+ impl VectorSum4s<vector_signed_int> for vector_signed_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_sum4s(self, b: vector_signed_int) -> vector_signed_int {
+ vsum4shs(self, b)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vsum2sws))]
+ unsafe fn vec_vsum2sws(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int {
+ vsum2sws(a, b)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vnmsubfp))]
+ unsafe fn vec_vnmsubfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float {
+ vnmsubfp(a, b, c)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmaddfp))]
+ unsafe fn vec_vmaddfp(a: vector_float, b: vector_float, c: vector_float) -> vector_float {
+ vmaddfp(a, b, c)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmsumubm))]
+ unsafe fn vec_vmsumubm(
+ a: vector_unsigned_char,
+ b: vector_unsigned_char,
+ c: vector_unsigned_int,
+ ) -> vector_unsigned_int {
+ vmsumubm(a, b, c)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmsummbm))]
+ unsafe fn vec_vmsummbm(
+ a: vector_signed_char,
+ b: vector_unsigned_char,
+ c: vector_signed_int,
+ ) -> vector_signed_int {
+ vmsummbm(a, b, c)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmsumuhm))]
+ unsafe fn vec_vmsumuhm(
+ a: vector_unsigned_short,
+ b: vector_unsigned_short,
+ c: vector_unsigned_int,
+ ) -> vector_unsigned_int {
+ vmsumuhm(a, b, c)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmsumshm))]
+ unsafe fn vec_vmsumshm(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ c: vector_signed_int,
+ ) -> vector_signed_int {
+ vmsumshm(a, b, c)
+ }
+
+ pub trait VectorMsum<B, Other> {
+ unsafe fn vec_msum(self, b: B, c: Other) -> Other;
+ }
+
+ impl VectorMsum<vector_unsigned_char, vector_unsigned_int> for vector_unsigned_char {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_msum(
+ self,
+ b: vector_unsigned_char,
+ c: vector_unsigned_int,
+ ) -> vector_unsigned_int {
+ vmsumubm(self, b, c)
+ }
+ }
+
+ impl VectorMsum<vector_unsigned_char, vector_signed_int> for vector_signed_char {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_msum(
+ self,
+ b: vector_unsigned_char,
+ c: vector_signed_int,
+ ) -> vector_signed_int {
+ vmsummbm(self, b, c)
+ }
+ }
+
+ impl VectorMsum<vector_unsigned_short, vector_unsigned_int> for vector_unsigned_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_msum(
+ self,
+ b: vector_unsigned_short,
+ c: vector_unsigned_int,
+ ) -> vector_unsigned_int {
+ vmsumuhm(self, b, c)
+ }
+ }
+
+ impl VectorMsum<vector_signed_short, vector_signed_int> for vector_signed_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_msum(
+ self,
+ b: vector_signed_short,
+ c: vector_signed_int,
+ ) -> vector_signed_int {
+ vmsumshm(self, b, c)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmsumuhs))]
+ unsafe fn vec_vmsumuhs(
+ a: vector_unsigned_short,
+ b: vector_unsigned_short,
+ c: vector_unsigned_int,
+ ) -> vector_unsigned_int {
+ vmsumuhs(a, b, c)
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmsumshs))]
+ unsafe fn vec_vmsumshs(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ c: vector_signed_int,
+ ) -> vector_signed_int {
+ vmsumshs(a, b, c)
+ }
+
+ pub trait VectorMsums<Other> {
+ unsafe fn vec_msums(self, b: Self, c: Other) -> Other;
+ }
+
+ impl VectorMsums<vector_unsigned_int> for vector_unsigned_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_msums(self, b: Self, c: vector_unsigned_int) -> vector_unsigned_int {
+ vmsumuhs(self, b, c)
+ }
+ }
+
+ impl VectorMsums<vector_signed_int> for vector_signed_short {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_msums(self, b: Self, c: vector_signed_int) -> vector_signed_int {
+ vmsumshs(self, b, c)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vperm))]
+ unsafe fn vec_vperm(
+ a: vector_signed_int,
+ b: vector_signed_int,
+ c: vector_unsigned_char,
+ ) -> vector_signed_int {
+ vperm(a, b, c)
+ }
+
+ pub trait VectorPerm {
+ unsafe fn vec_vperm(self, b: Self, c: vector_unsigned_char) -> Self;
+ }
+
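+    // vperm permutes at byte granularity, so a single intrinsic (typed over
+    // vector_signed_int) serves every element type; each impl below simply
+    // transmutes its lanes in and out.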
+ macro_rules! vector_perm {
+ {$impl: ident} => {
+ impl VectorPerm for $impl {
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_vperm(self, b: Self, c: vector_unsigned_char) -> Self {
+ transmute(vec_vperm(transmute(self), transmute(b), c))
+ }
+ }
+ }
+ }
+
+ vector_perm! { vector_signed_char }
+ vector_perm! { vector_unsigned_char }
+ vector_perm! { vector_bool_char }
+
+ vector_perm! { vector_signed_short }
+ vector_perm! { vector_unsigned_short }
+ vector_perm! { vector_bool_short }
+
+ vector_perm! { vector_signed_int }
+ vector_perm! { vector_unsigned_int }
+ vector_perm! { vector_bool_int }
+
+ vector_perm! { vector_float }
+
+ pub trait VectorAdd<Other> {
+ type Result;
+ unsafe fn vec_add(self, other: Other) -> Self::Result;
+ }
+
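+    // Boolean vectors are bit masks (all ones for true, all zeros for false),
+    // so the mixed bool/element impls reinterpret the mask as lanes of the
+    // element type before adding.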
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vaddubm))]
+ pub unsafe fn vec_add_bc_sc(a: vector_bool_char, b: vector_signed_char) -> vector_signed_char {
+ simd_add(transmute(a), b)
+ }
+ impl VectorAdd<vector_signed_char> for vector_bool_char {
+ type Result = vector_signed_char;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_signed_char) -> Self::Result {
+ vec_add_bc_sc(self, other)
+ }
+ }
+ impl VectorAdd<vector_bool_char> for vector_signed_char {
+ type Result = vector_signed_char;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_bool_char) -> Self::Result {
+ other.vec_add(self)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vaddubm))]
+ pub unsafe fn vec_add_sc_sc(
+ a: vector_signed_char,
+ b: vector_signed_char,
+ ) -> vector_signed_char {
+ simd_add(a, b)
+ }
+ impl VectorAdd<vector_signed_char> for vector_signed_char {
+ type Result = vector_signed_char;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_signed_char) -> Self::Result {
+ vec_add_sc_sc(self, other)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vaddubm))]
+ pub unsafe fn vec_add_bc_uc(
+ a: vector_bool_char,
+ b: vector_unsigned_char,
+ ) -> vector_unsigned_char {
+ simd_add(transmute(a), b)
+ }
+ impl VectorAdd<vector_unsigned_char> for vector_bool_char {
+ type Result = vector_unsigned_char;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_unsigned_char) -> Self::Result {
+ vec_add_bc_uc(self, other)
+ }
+ }
+ impl VectorAdd<vector_bool_char> for vector_unsigned_char {
+ type Result = vector_unsigned_char;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_bool_char) -> Self::Result {
+ other.vec_add(self)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vaddubm))]
+ pub unsafe fn vec_add_uc_uc(
+ a: vector_unsigned_char,
+ b: vector_unsigned_char,
+ ) -> vector_unsigned_char {
+ simd_add(a, b)
+ }
+ impl VectorAdd<vector_unsigned_char> for vector_unsigned_char {
+ type Result = vector_unsigned_char;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_unsigned_char) -> Self::Result {
+ vec_add_uc_uc(self, other)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vadduhm))]
+ pub unsafe fn vec_add_bs_ss(
+ a: vector_bool_short,
+ b: vector_signed_short,
+ ) -> vector_signed_short {
+ let a: i16x8 = transmute(a);
+ let a: vector_signed_short = simd_cast(a);
+ simd_add(a, b)
+ }
+
+ impl VectorAdd<vector_signed_short> for vector_bool_short {
+ type Result = vector_signed_short;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_signed_short) -> Self::Result {
+ vec_add_bs_ss(self, other)
+ }
+ }
+ impl VectorAdd<vector_bool_short> for vector_signed_short {
+ type Result = vector_signed_short;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_bool_short) -> Self::Result {
+ other.vec_add(self)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vadduhm))]
+ pub unsafe fn vec_add_ss_ss(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ ) -> vector_signed_short {
+ simd_add(a, b)
+ }
+ impl VectorAdd<vector_signed_short> for vector_signed_short {
+ type Result = vector_signed_short;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_signed_short) -> Self::Result {
+ vec_add_ss_ss(self, other)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vadduhm))]
+ pub unsafe fn vec_add_bs_us(
+ a: vector_bool_short,
+ b: vector_unsigned_short,
+ ) -> vector_unsigned_short {
+ let a: i16x8 = transmute(a);
+ let a: vector_unsigned_short = simd_cast(a);
+ simd_add(a, b)
+ }
+ impl VectorAdd<vector_unsigned_short> for vector_bool_short {
+ type Result = vector_unsigned_short;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_unsigned_short) -> Self::Result {
+ vec_add_bs_us(self, other)
+ }
+ }
+ impl VectorAdd<vector_bool_short> for vector_unsigned_short {
+ type Result = vector_unsigned_short;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_bool_short) -> Self::Result {
+ other.vec_add(self)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vadduhm))]
+ pub unsafe fn vec_add_us_us(
+ a: vector_unsigned_short,
+ b: vector_unsigned_short,
+ ) -> vector_unsigned_short {
+ simd_add(a, b)
+ }
+
+ impl VectorAdd<vector_unsigned_short> for vector_unsigned_short {
+ type Result = vector_unsigned_short;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_unsigned_short) -> Self::Result {
+ vec_add_us_us(self, other)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vadduwm))]
+ pub unsafe fn vec_add_bi_si(a: vector_bool_int, b: vector_signed_int) -> vector_signed_int {
+ let a: i32x4 = transmute(a);
+ let a: vector_signed_int = simd_cast(a);
+ simd_add(a, b)
+ }
+ impl VectorAdd<vector_signed_int> for vector_bool_int {
+ type Result = vector_signed_int;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_signed_int) -> Self::Result {
+ vec_add_bi_si(self, other)
+ }
+ }
+ impl VectorAdd<vector_bool_int> for vector_signed_int {
+ type Result = vector_signed_int;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_bool_int) -> Self::Result {
+ other.vec_add(self)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vadduwm))]
+ pub unsafe fn vec_add_si_si(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int {
+ simd_add(a, b)
+ }
+ impl VectorAdd<vector_signed_int> for vector_signed_int {
+ type Result = vector_signed_int;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_signed_int) -> Self::Result {
+ vec_add_si_si(self, other)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vadduwm))]
+ pub unsafe fn vec_add_bi_ui(a: vector_bool_int, b: vector_unsigned_int) -> vector_unsigned_int {
+ let a: i32x4 = transmute(a);
+ let a: vector_unsigned_int = simd_cast(a);
+ simd_add(a, b)
+ }
+ impl VectorAdd<vector_unsigned_int> for vector_bool_int {
+ type Result = vector_unsigned_int;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_unsigned_int) -> Self::Result {
+ vec_add_bi_ui(self, other)
+ }
+ }
+ impl VectorAdd<vector_bool_int> for vector_unsigned_int {
+ type Result = vector_unsigned_int;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_bool_int) -> Self::Result {
+ other.vec_add(self)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vadduwm))]
+ pub unsafe fn vec_add_ui_ui(
+ a: vector_unsigned_int,
+ b: vector_unsigned_int,
+ ) -> vector_unsigned_int {
+ simd_add(a, b)
+ }
+ impl VectorAdd<vector_unsigned_int> for vector_unsigned_int {
+ type Result = vector_unsigned_int;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_unsigned_int) -> Self::Result {
+ vec_add_ui_ui(self, other)
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(xvaddsp))]
+ pub unsafe fn vec_add_float_float(a: vector_float, b: vector_float) -> vector_float {
+ simd_add(a, b)
+ }
+
+ impl VectorAdd<vector_float> for vector_float {
+ type Result = vector_float;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_add(self, other: vector_float) -> Self::Result {
+ vec_add_float_float(self, other)
+ }
+ }
+
+ pub trait VectorMladd<Other> {
+ type Result;
+ unsafe fn vec_mladd(self, b: Other, c: Other) -> Self::Result;
+ }
+
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ #[cfg_attr(test, assert_instr(vmladduhm))]
+ unsafe fn mladd(a: i16x8, b: i16x8, c: i16x8) -> i16x8 {
+ simd_add(simd_mul(a, b), c)
+ }
+
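+    // Wrapping multiply-add is the same operation for signed and unsigned
+    // halfwords, so all four sign combinations of vec_mladd share the one
+    // i16x8 helper above.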
+ macro_rules! vector_mladd {
+ ($a: ident, $bc: ident, $d: ident) => {
+ impl VectorMladd<$bc> for $a {
+ type Result = $d;
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ unsafe fn vec_mladd(self, b: $bc, c: $bc) -> Self::Result {
+ let a: i16x8 = transmute(self);
+ let b: i16x8 = transmute(b);
+ let c: i16x8 = transmute(c);
+
+ transmute(mladd(a, b, c))
+ }
+ }
+ };
+ }
+
+ vector_mladd! { vector_unsigned_short, vector_unsigned_short, vector_unsigned_short }
+ vector_mladd! { vector_unsigned_short, vector_signed_short, vector_signed_short }
+ vector_mladd! { vector_signed_short, vector_unsigned_short, vector_signed_short }
+ vector_mladd! { vector_signed_short, vector_signed_short, vector_signed_short }
+}
+
+/// Vector ld.
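+///
+/// Loads a 16-byte vector from the effective address `off + p`; the
+/// underlying `lvx` instruction truncates that address to a 16-byte
+/// boundary (see `test_vec_ld` below).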
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_ld<T>(off: i32, p: T) -> <T as sealed::VectorLd>::Result
+where
+ T: sealed::VectorLd,
+{
+ p.vec_ld(off)
+}
+
+/// Vector floor.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_floor(a: vector_float) -> vector_float {
+ sealed::vec_floor(a)
+}
+
+/// Vector expte (`vexptefp`): a per-lane estimate of `2^a`.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_expte(a: vector_float) -> vector_float {
+ sealed::vec_vexptefp(a)
+}
+
+/// Vector cmplt.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_cmplt<T, U>(a: U, b: T) -> <T as sealed::VectorCmpGt<U>>::Result
+where
+ T: sealed::VectorCmpGt<U>,
+{
+ vec_cmpgt(b, a)
+}
+
+/// Vector cmple.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_cmple(a: vector_float, b: vector_float) -> vector_bool_int {
+ vec_cmpge(b, a)
+}
+
+/// Vector cmpgt.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_cmpgt<T, U>(a: T, b: U) -> <T as sealed::VectorCmpGt<U>>::Result
+where
+ T: sealed::VectorCmpGt<U>,
+{
+ a.vec_cmpgt(b)
+}
+
+/// Vector cmpge.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_cmpge(a: vector_float, b: vector_float) -> vector_bool_int {
+ sealed::vec_vcmpgefp(a, b)
+}
+
+/// Vector cmpeq.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_cmpeq<T, U>(a: T, b: U) -> <T as sealed::VectorCmpEq<U>>::Result
+where
+ T: sealed::VectorCmpEq<U>,
+{
+ a.vec_cmpeq(b)
+}
+
+/// Vector cmpb.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_cmpb(a: vector_float, b: vector_float) -> vector_signed_int {
+ sealed::vec_vcmpbfp(a, b)
+}
+
+/// Vector ceil.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_ceil(a: vector_float) -> vector_float {
+ sealed::vec_vceil(a)
+}
+
+/// Vector avg.
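+///
+/// Each lane computes the rounded average `(a + b + 1) >> 1` at extended
+/// precision, so the intermediate sum cannot overflow.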
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_avg<T, U>(a: T, b: U) -> <T as sealed::VectorAvg<U>>::Result
+where
+ T: sealed::VectorAvg<U>,
+{
+ a.vec_avg(b)
+}
+
+/// Vector andc.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_andc<T, U>(a: T, b: U) -> <T as sealed::VectorAndc<U>>::Result
+where
+ T: sealed::VectorAndc<U>,
+{
+ a.vec_andc(b)
+}
+
+/// Vector and.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_and<T, U>(a: T, b: U) -> <T as sealed::VectorAnd<U>>::Result
+where
+ T: sealed::VectorAnd<U>,
+{
+ a.vec_and(b)
+}
+
+/// Vector adds.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_adds<T, U>(a: T, b: U) -> <T as sealed::VectorAdds<U>>::Result
+where
+ T: sealed::VectorAdds<U>,
+{
+ a.vec_adds(b)
+}
+
+/// Vector addc.
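+///
+/// Each lane yields the carry-out of the unsigned add: 1 if `a + b` wraps,
+/// 0 otherwise.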
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_addc(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int {
+ sealed::vec_vaddcuw(a, b)
+}
+
+/// Vector abs.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_abs<T>(a: T) -> T
+where
+ T: sealed::VectorAbs,
+{
+ a.vec_abs()
+}
+
+/// Vector abss.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_abss<T>(a: T) -> T
+where
+ T: sealed::VectorAbss,
+{
+ a.vec_abss()
+}
+
+/// Vector splats.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_splats<T>(a: T) -> <T as sealed::VectorSplats>::Result
+where
+ T: sealed::VectorSplats,
+{
+ a.vec_splats()
+}
+
+/// Vector sub.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_sub<T, U>(a: T, b: U) -> <T as sealed::VectorSub<U>>::Result
+where
+ T: sealed::VectorSub<U>,
+{
+ a.vec_sub(b)
+}
+
+/// Vector subs.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_subs<T, U>(a: T, b: U) -> <T as sealed::VectorSubs<U>>::Result
+where
+ T: sealed::VectorSubs<U>,
+{
+ a.vec_subs(b)
+}
+
+/// Vector min.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_min<T, U>(a: T, b: U) -> <T as sealed::VectorMin<U>>::Result
+where
+ T: sealed::VectorMin<U>,
+{
+ a.vec_min(b)
+}
+
+/// Vector max.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_max<T, U>(a: T, b: U) -> <T as sealed::VectorMax<U>>::Result
+where
+ T: sealed::VectorMax<U>,
+{
+ a.vec_max(b)
+}
+
+/// Vector add.
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_add<T, U>(a: T, b: U) -> <T as sealed::VectorAdd<U>>::Result
+where
+ T: sealed::VectorAdd<U>,
+{
+ a.vec_add(b)
+}
+
+/// Endian-biased intrinsics
+#[cfg(target_endian = "little")]
+mod endian {
+ use super::*;
+ /// Vector permute.
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ pub unsafe fn vec_perm<T>(a: T, b: T, c: vector_unsigned_char) -> T
+ where
+ T: sealed::VectorPerm,
+ {
+        // vperm has a big-endian bias: mask indices select bytes from the
+        // concatenation a:b in big-endian order.
+        //
+        // XOR-ing the mask maps each index j to 31 - j (vperm reads only the
+        // low five bits), and swapping the arguments restores the expected
+        // concatenation order.
+ let d = transmute(u8x16::new(
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ ));
+ let c = simd_xor(c, d);
+
+ b.vec_vperm(a, c)
+ }
+
+ /// Vector Sum Across Partial (1/2) Saturated
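+    ///
+    /// The result is `[0, a[0] + a[1] + b[1], 0, a[2] + a[3] + b[3]]`,
+    /// matching the big-endian semantics of vsum2sws.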
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ pub unsafe fn vec_sum2s(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int {
+        // vsum2sws has a big-endian bias
+        //
+        // Swap the even and odd elements of b going in, and of the result
+        // coming back out.
+ let flip = transmute(u8x16::new(
+ 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11,
+ ));
+ let b = vec_perm(b, b, flip);
+ let c = vsum2sws(a, b);
+
+ vec_perm(c, c, flip)
+ }
+
+    // "Even" and "odd" element positions are numbered in big-endian order,
+    // so on little-endian targets vec_mule lowers to the odd-multiply
+    // instructions and vec_mulo to the even ones.
+ /// Vector Multiply Even
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ pub unsafe fn vec_mule<T, U>(a: T, b: T) -> U
+ where
+ T: sealed::VectorMulo<U>,
+ {
+ a.vec_mulo(b)
+ }
+ /// Vector Multiply Odd
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ pub unsafe fn vec_mulo<T, U>(a: T, b: T) -> U
+ where
+ T: sealed::VectorMule<U>,
+ {
+ a.vec_mule(b)
+ }
+}
+
+/// Vector Multiply Add Saturated
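+///
+/// Each lane computes `saturate(((a * b) >> 15) + c)`, with the product
+/// taken at full 32-bit precision.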
+#[inline]
+#[target_feature(enable = "altivec")]
+#[cfg_attr(test, assert_instr(vmhaddshs))]
+pub unsafe fn vec_madds(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ c: vector_signed_short,
+) -> vector_signed_short {
+ vmhaddshs(a, b, c)
+}
+
+/// Vector Multiply Low and Add Unsigned Half Word
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_mladd<T, U>(a: T, b: U, c: U) -> <T as sealed::VectorMladd<U>>::Result
+where
+ T: sealed::VectorMladd<U>,
+{
+ a.vec_mladd(b, c)
+}
+
+/// Vector Multiply Round and Add Saturated
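+///
+/// Like `vec_madds`, but a rounding constant of `0x4000` is added to the
+/// 32-bit product before the shift: `saturate(((a * b + 0x4000) >> 15) + c)`.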
+#[inline]
+#[target_feature(enable = "altivec")]
+#[cfg_attr(test, assert_instr(vmhraddshs))]
+pub unsafe fn vec_mradds(
+ a: vector_signed_short,
+ b: vector_signed_short,
+ c: vector_signed_short,
+) -> vector_signed_short {
+ vmhraddshs(a, b, c)
+}
+
+/// Vector Multiply Sum
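+///
+/// For char inputs, each word lane computes `c[i]` plus the four adjacent
+/// byte products `a[4*i + j] * b[4*i + j]`; for short inputs, two adjacent
+/// halfword products. All arithmetic is modular.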
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_msum<T, B, U>(a: T, b: B, c: U) -> U
+where
+ T: sealed::VectorMsum<B, U>,
+{
+ a.vec_msum(b, c)
+}
+
+/// Vector Multiply Sum Saturated
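+///
+/// Same shape as the halfword forms of `vec_msum`, except the accumulation
+/// saturates instead of wrapping.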
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_msums<T, U>(a: T, b: T, c: U) -> U
+where
+ T: sealed::VectorMsums<U>,
+{
+ a.vec_msums(b, c)
+}
+
+/// Vector Multiply Add
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_madd(a: vector_float, b: vector_float, c: vector_float) -> vector_float {
+ vmaddfp(a, b, c)
+}
+
+/// Vector Negative Multiply Subtract
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_nmsub(a: vector_float, b: vector_float, c: vector_float) -> vector_float {
+ vnmsubfp(a, b, c)
+}
+
+/// Vector Sum Across Partial (1/4) Saturated
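+///
+/// For char inputs, each word lane computes `sat(b[i] + a[4*i] + ... +
+/// a[4*i + 3])`; for signed shorts, `sat(b[i] + a[2*i] + a[2*i + 1])`.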
+#[inline]
+#[target_feature(enable = "altivec")]
+pub unsafe fn vec_sum4s<T, U>(a: T, b: U) -> U
+where
+ T: sealed::VectorSum4s<U>,
+{
+ a.vec_sum4s(b)
+}
+
+#[cfg(target_endian = "big")]
+mod endian {
+ use super::*;
+ /// Vector permute.
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ pub unsafe fn vec_perm<T>(a: T, b: T, c: vector_unsigned_char) -> T
+ where
+ T: sealed::VectorPerm,
+ {
+ a.vec_vperm(b, c)
+ }
+
+ /// Vector Sum Across Partial (1/2) Saturated
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ pub unsafe fn vec_sum2s(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int {
+ vsum2sws(a, b)
+ }
+
+ /// Vector Multiply Even
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ pub unsafe fn vec_mule<T, U>(a: T, b: T) -> U
+ where
+ T: sealed::VectorMule<U>,
+ {
+ a.vec_mule(b)
+ }
+ /// Vector Multiply Odd
+ #[inline]
+ #[target_feature(enable = "altivec")]
+ pub unsafe fn vec_mulo<T, U>(a: T, b: T) -> U
+ where
+ T: sealed::VectorMulo<U>,
+ {
+ a.vec_mulo(b)
+ }
+}
+
+pub use self::endian::*;
+
+#[cfg(test)]
+mod tests {
+ #[cfg(target_arch = "powerpc")]
+ use crate::core_arch::arch::powerpc::*;
+
+ #[cfg(target_arch = "powerpc64")]
+ use crate::core_arch::arch::powerpc64::*;
+
+ use std::mem::transmute;
+
+ use crate::core_arch::simd::*;
+ use stdarch_test::simd_test;
+
+ macro_rules! test_vec_2 {
+ { $name: ident, $fn:ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
+ test_vec_2! { $name, $fn, $ty -> $ty, [$($a),+], [$($b),+], [$($d),+] }
+ };
+ { $name: ident, $fn:ident, $ty: ident -> $ty_out: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a: s_t_l!($ty) = transmute($ty::new($($a),+));
+ let b: s_t_l!($ty) = transmute($ty::new($($b),+));
+
+ let d = $ty_out::new($($d),+);
+ let r : $ty_out = transmute($fn(a, b));
+ assert_eq!(d, r);
+ }
+ }
+ }
+
+ macro_rules! test_vec_1 {
+ { $name: ident, $fn:ident, f32x4, [$($a:expr),+], ~[$($d:expr),+] } => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a: vector_float = transmute(f32x4::new($($a),+));
+
+ let d: vector_float = transmute(f32x4::new($($d),+));
+ let r = transmute(vec_cmple(vec_abs(vec_sub($fn(a), d)), vec_splats(std::f32::EPSILON)));
+ let e = m32x4::new(true, true, true, true);
+ assert_eq!(e, r);
+ }
+ };
+ { $name: ident, $fn:ident, $ty: ident, [$($a:expr),+], [$($d:expr),+] } => {
+ test_vec_1! { $name, $fn, $ty -> $ty, [$($a),+], [$($d),+] }
+ };
+ { $name: ident, $fn:ident, $ty: ident -> $ty_out: ident, [$($a:expr),+], [$($d:expr),+] } => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a: s_t_l!($ty) = transmute($ty::new($($a),+));
+
+ let d = $ty_out::new($($d),+);
+ let r : $ty_out = transmute($fn(a));
+ assert_eq!(d, r);
+ }
+ }
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_ld() {
+ let pat = [
+ u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
+ u8x16::new(
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ ),
+ ];
+
+ for off in 0..16 {
+ let v: u8x16 = transmute(vec_ld(0, (pat.as_ptr() as *const u8).offset(off)));
+ assert_eq!(
+ v,
+ u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+ );
+ }
+ for off in 16..32 {
+ let v: u8x16 = transmute(vec_ld(0, (pat.as_ptr() as *const u8).offset(off)));
+ assert_eq!(
+ v,
+ u8x16::new(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)
+ );
+ }
+ }
+
+ test_vec_1! { test_vec_floor, vec_floor, f32x4,
+ [1.1, 1.9, -0.5, -0.9],
+ [1.0, 1.0, -1.0, -1.0]
+ }
+
+ test_vec_1! { test_vec_expte, vec_expte, f32x4,
+ [0.0, 2.0, 2.0, -1.0],
+ ~[1.0, 4.0, 4.0, 0.5]
+ }
+
+ test_vec_2! { test_vec_cmpgt_i8, vec_cmpgt, i8x16 -> m8x16,
+ [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [true, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false]
+ }
+
+ test_vec_2! { test_vec_cmpgt_u8, vec_cmpgt, u8x16 -> m8x16,
+ [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [true, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false]
+ }
+
+ test_vec_2! { test_vec_cmpgt_i16, vec_cmpgt, i16x8 -> m16x8,
+ [1, -1, 0, 0, 0, 0, 0, 0],
+ [0, 0, -1, 1, 0, 0, 0, 0],
+ [true, false, true, false, false, false, false, false]
+ }
+
+ test_vec_2! { test_vec_cmpgt_u16, vec_cmpgt, u16x8 -> m16x8,
+ [1, 255, 0, 0, 0, 0, 0, 0],
+ [0, 0, 255, 1, 0, 0, 0, 0],
+ [true, true, false, false, false, false, false, false]
+ }
+
+ test_vec_2! { test_vec_cmpgt_i32, vec_cmpgt, i32x4 -> m32x4,
+ [1, -1, 0, 0],
+ [0, -1, 0, 1],
+ [true, false, false, false]
+ }
+
+ test_vec_2! { test_vec_cmpgt_u32, vec_cmpgt, u32x4 -> m32x4,
+ [1, 255, 0, 0],
+ [0, 255, 0, 1],
+ [true, false, false, false]
+ }
+
+ test_vec_2! { test_vec_cmpge, vec_cmpge, f32x4 -> m32x4,
+ [0.1, -0.1, 0.0, 0.99],
+ [0.1, 0.0, 0.1, 1.0],
+ [true, false, false, false]
+ }
+
+ test_vec_2! { test_vec_cmpeq_i8, vec_cmpeq, i8x16 -> m8x16,
+ [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, true]
+ }
+
+ test_vec_2! { test_vec_cmpeq_u8, vec_cmpeq, u8x16 -> m8x16,
+ [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, true]
+ }
+
+ test_vec_2! { test_vec_cmpeq_i16, vec_cmpeq, i16x8 -> m16x8,
+ [1, -1, 0, 0, 0, 0, 0, 0],
+ [0, 0, -1, 1, 0, 0, 0, 0],
+ [false, false, false, false, true, true, true, true]
+ }
+
+ test_vec_2! { test_vec_cmpeq_u16, vec_cmpeq, u16x8 -> m16x8,
+ [1, 255, 0, 0, 0, 0, 0, 0],
+ [0, 0, 255, 1, 0, 0, 0, 0],
+ [false, false, false, false, true, true, true, true]
+ }
+
+ test_vec_2! { test_vec_cmpeq_i32, vec_cmpeq, i32x4 -> m32x4,
+ [1, -1, 0, 0],
+ [0, -1, 0, 1],
+ [false, true, true, false]
+ }
+
+ test_vec_2! { test_vec_cmpeq_u32, vec_cmpeq, u32x4 -> m32x4,
+ [1, 255, 0, 0],
+ [0, 255, 0, 1],
+ [false, true, true, false]
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_cmpb() {
+ let a: vector_float = transmute(f32x4::new(0.1, 0.5, 0.6, 0.9));
+ let b: vector_float = transmute(f32x4::new(-0.1, 0.5, -0.6, 0.9));
+ let d = i32x4::new(
+ -0b10000000000000000000000000000000,
+ 0,
+ -0b10000000000000000000000000000000,
+ 0,
+ );
+
+ assert_eq!(d, transmute(vec_cmpb(a, b)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_ceil() {
+ let a: vector_float = transmute(f32x4::new(0.1, 0.5, 0.6, 0.9));
+ let d = f32x4::new(1.0, 1.0, 1.0, 1.0);
+
+ assert_eq!(d, transmute(vec_ceil(a)));
+ }
+
+ test_vec_2! { test_vec_andc, vec_andc, i32x4,
+ [0b11001100, 0b11001100, 0b11001100, 0b11001100],
+ [0b00110011, 0b11110011, 0b00001100, 0b10000000],
+ [0b11001100, 0b00001100, 0b11000000, 0b01001100] }
+
+ test_vec_2! { test_vec_and, vec_and, i32x4,
+ [0b11001100, 0b11001100, 0b11001100, 0b11001100],
+ [0b00110011, 0b11110011, 0b00001100, 0b00000000],
+ [0b00000000, 0b11000000, 0b00001100, 0b00000000] }
+
+ macro_rules! test_vec_avg {
+ { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
+ test_vec_2! {$name, vec_avg, $ty, [$($a),+], [$($b),+], [$($d),+] }
+ }
+ }
+
+ test_vec_avg! { test_vec_avg_i32x4, i32x4,
+ [i32::MIN, i32::MAX, 1, -1],
+ [-1, 1, 1, -1],
+ [-1073741824, 1073741824, 1, -1] }
+
+ test_vec_avg! { test_vec_avg_u32x4, u32x4,
+ [u32::MAX, 0, 1, 2],
+ [2, 1, 0, 0],
+ [2147483649, 1, 1, 1] }
+
+ test_vec_avg! { test_vec_avg_i16x8, i16x8,
+ [i16::MIN, i16::MAX, 1, -1, 0, 0, 0, 0],
+ [-1, 1, 1, -1, 0, 0, 0, 0],
+ [-16384, 16384, 1, -1, 0, 0, 0, 0] }
+
+ test_vec_avg! { test_vec_avg_u16x8, u16x8,
+ [u16::MAX, 0, 1, 2, 0, 0, 0, 0],
+ [2, 1, 0, 0, 0, 0, 0, 0],
+ [32769, 1, 1, 1, 0, 0, 0, 0] }
+
+ test_vec_avg! { test_vec_avg_i8x16, i8x16,
+ [i8::MIN, i8::MAX, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [-1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [-64, 64, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }
+
+ test_vec_avg! { test_vec_avg_u8x16, u8x16,
+ [u8::MAX, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [129, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }
+
+ macro_rules! test_vec_adds {
+ { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
+ test_vec_2! {$name, vec_adds, $ty, [$($a),+], [$($b),+], [$($d),+] }
+ }
+ }
+
+ test_vec_adds! { test_vec_adds_i32x4, i32x4,
+ [i32::MIN, i32::MAX, 1, -1],
+ [-1, 1, 1, -1],
+ [i32::MIN, i32::MAX, 2, -2] }
+
+ test_vec_adds! { test_vec_adds_u32x4, u32x4,
+ [u32::MAX, 0, 1, 2],
+ [2, 1, 0, 0],
+ [u32::MAX, 1, 1, 2] }
+
+ test_vec_adds! { test_vec_adds_i16x8, i16x8,
+ [i16::MIN, i16::MAX, 1, -1, 0, 0, 0, 0],
+ [-1, 1, 1, -1, 0, 0, 0, 0],
+ [i16::MIN, i16::MAX, 2, -2, 0, 0, 0, 0] }
+
+ test_vec_adds! { test_vec_adds_u16x8, u16x8,
+ [u16::MAX, 0, 1, 2, 0, 0, 0, 0],
+ [2, 1, 0, 0, 0, 0, 0, 0],
+ [u16::MAX, 1, 1, 2, 0, 0, 0, 0] }
+
+ test_vec_adds! { test_vec_adds_i8x16, i8x16,
+ [i8::MIN, i8::MAX, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [-1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [i8::MIN, i8::MAX, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }
+
+ test_vec_adds! { test_vec_adds_u8x16, u8x16,
+ [u8::MAX, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [u8::MAX, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }
+
+ test_vec_2! { test_vec_addc, vec_addc, u32x4, [u32::MAX, 0, 0, 0], [1, 1, 1, 1], [1, 0, 0, 0] }
+
+ macro_rules! test_vec_abs {
+ { $name: ident, $ty: ident, $a: expr, $d: expr } => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a = vec_splats($a);
+ let a: s_t_l!($ty) = vec_abs(a);
+ let d = $ty::splat($d);
+ assert_eq!(d, transmute(a));
+ }
+ }
+ }
+
+ test_vec_abs! { test_vec_abs_i8, i8x16, -42i8, 42i8 }
+ test_vec_abs! { test_vec_abs_i16, i16x8, -42i16, 42i16 }
+ test_vec_abs! { test_vec_abs_i32, i32x4, -42i32, 42i32 }
+ test_vec_abs! { test_vec_abs_f32, f32x4, -42f32, 42f32 }
+
+ macro_rules! test_vec_abss {
+ { $name: ident, $ty: ident, $a: expr, $d: expr } => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a = vec_splats($a);
+ let a: s_t_l!($ty) = vec_abss(a);
+ let d = $ty::splat($d);
+ assert_eq!(d, transmute(a));
+ }
+ }
+ }
+
+ test_vec_abss! { test_vec_abss_i8, i8x16, -127i8, 127i8 }
+ test_vec_abss! { test_vec_abss_i16, i16x8, -42i16, 42i16 }
+ test_vec_abss! { test_vec_abss_i32, i32x4, -42i32, 42i32 }
+
+ macro_rules! test_vec_splats {
+ { $name: ident, $ty: ident, $a: expr } => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a: s_t_l!($ty) = vec_splats($a);
+ let d = $ty::splat($a);
+ assert_eq!(d, transmute(a));
+ }
+ }
+ }
+
+ test_vec_splats! { test_vec_splats_u8, u8x16, 42u8 }
+ test_vec_splats! { test_vec_splats_u16, u16x8, 42u16 }
+ test_vec_splats! { test_vec_splats_u32, u32x4, 42u32 }
+ test_vec_splats! { test_vec_splats_i8, i8x16, 42i8 }
+ test_vec_splats! { test_vec_splats_i16, i16x8, 42i16 }
+ test_vec_splats! { test_vec_splats_i32, i32x4, 42i32 }
+ test_vec_splats! { test_vec_splats_f32, f32x4, 42f32 }
+
+ macro_rules! test_vec_sub {
+ { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
+ test_vec_2! {$name, vec_sub, $ty, [$($a),+], [$($b),+], [$($d),+] }
+ }
+ }
+
+ test_vec_sub! { test_vec_sub_f32x4, f32x4,
+ [-1.0, 0.0, 1.0, 2.0],
+ [2.0, 1.0, -1.0, -2.0],
+ [-3.0, -1.0, 2.0, 4.0] }
+
+ test_vec_sub! { test_vec_sub_i32x4, i32x4,
+ [-1, 0, 1, 2],
+ [2, 1, -1, -2],
+ [-3, -1, 2, 4] }
+
+ test_vec_sub! { test_vec_sub_u32x4, u32x4,
+ [0, 0, 1, 2],
+ [2, 1, 0, 0],
+ [4294967294, 4294967295, 1, 2] }
+
+ test_vec_sub! { test_vec_sub_i16x8, i16x8,
+ [-1, 0, 1, 2, -1, 0, 1, 2],
+ [2, 1, -1, -2, 2, 1, -1, -2],
+ [-3, -1, 2, 4, -3, -1, 2, 4] }
+
+ test_vec_sub! { test_vec_sub_u16x8, u16x8,
+ [0, 0, 1, 2, 0, 0, 1, 2],
+ [2, 1, 0, 0, 2, 1, 0, 0],
+ [65534, 65535, 1, 2, 65534, 65535, 1, 2] }
+
+ test_vec_sub! { test_vec_sub_i8x16, i8x16,
+ [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2],
+ [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2],
+ [-3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4] }
+
+ test_vec_sub! { test_vec_sub_u8x16, u8x16,
+ [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2],
+ [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0],
+ [254, 255, 1, 2, 254, 255, 1, 2, 254, 255, 1, 2, 254, 255, 1, 2] }
+
+ macro_rules! test_vec_subs {
+ { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
+ test_vec_2! {$name, vec_subs, $ty, [$($a),+], [$($b),+], [$($d),+] }
+ }
+ }
+
+ test_vec_subs! { test_vec_subs_i32x4, i32x4,
+ [-1, 0, 1, 2],
+ [2, 1, -1, -2],
+ [-3, -1, 2, 4] }
+
+ test_vec_subs! { test_vec_subs_u32x4, u32x4,
+ [0, 0, 1, 2],
+ [2, 1, 0, 0],
+ [0, 0, 1, 2] }
+
+ test_vec_subs! { test_vec_subs_i16x8, i16x8,
+ [-1, 0, 1, 2, -1, 0, 1, 2],
+ [2, 1, -1, -2, 2, 1, -1, -2],
+ [-3, -1, 2, 4, -3, -1, 2, 4] }
+
+ test_vec_subs! { test_vec_subs_u16x8, u16x8,
+ [0, 0, 1, 2, 0, 0, 1, 2],
+ [2, 1, 0, 0, 2, 1, 0, 0],
+ [0, 0, 1, 2, 0, 0, 1, 2] }
+
+ test_vec_subs! { test_vec_subs_i8x16, i8x16,
+ [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2],
+ [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2],
+ [-3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4, -3, -1, 2, 4] }
+
+ test_vec_subs! { test_vec_subs_u8x16, u8x16,
+ [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2],
+ [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0],
+ [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2] }
+
+ macro_rules! test_vec_min {
+ { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a: s_t_l!($ty) = transmute($ty::new($($a),+));
+ let b: s_t_l!($ty) = transmute($ty::new($($b),+));
+
+ let d = $ty::new($($d),+);
+ let r : $ty = transmute(vec_min(a, b));
+ assert_eq!(d, r);
+ }
+ }
+ }
+
+ test_vec_min! { test_vec_min_i32x4, i32x4,
+ [-1, 0, 1, 2],
+ [2, 1, -1, -2],
+ [-1, 0, -1, -2] }
+
+ test_vec_min! { test_vec_min_u32x4, u32x4,
+ [0, 0, 1, 2],
+ [2, 1, 0, 0],
+ [0, 0, 0, 0] }
+
+ test_vec_min! { test_vec_min_i16x8, i16x8,
+ [-1, 0, 1, 2, -1, 0, 1, 2],
+ [2, 1, -1, -2, 2, 1, -1, -2],
+ [-1, 0, -1, -2, -1, 0, -1, -2] }
+
+ test_vec_min! { test_vec_min_u16x8, u16x8,
+ [0, 0, 1, 2, 0, 0, 1, 2],
+ [2, 1, 0, 0, 2, 1, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0] }
+
+ test_vec_min! { test_vec_min_i8x16, i8x16,
+ [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2],
+ [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2],
+ [-1, 0, -1, -2, -1, 0, -1, -2, -1, 0, -1, -2, -1, 0, -1, -2] }
+
+ test_vec_min! { test_vec_min_u8x16, u8x16,
+ [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2],
+ [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }
+
+ macro_rules! test_vec_max {
+ { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a: s_t_l!($ty) = transmute($ty::new($($a),+));
+ let b: s_t_l!($ty) = transmute($ty::new($($b),+));
+
+ let d = $ty::new($($d),+);
+ let r : $ty = transmute(vec_max(a, b));
+ assert_eq!(d, r);
+ }
+ }
+ }
+
+ test_vec_max! { test_vec_max_i32x4, i32x4,
+ [-1, 0, 1, 2],
+ [2, 1, -1, -2],
+ [2, 1, 1, 2] }
+
+ test_vec_max! { test_vec_max_u32x4, u32x4,
+ [0, 0, 1, 2],
+ [2, 1, 0, 0],
+ [2, 1, 1, 2] }
+
+ test_vec_max! { test_vec_max_i16x8, i16x8,
+ [-1, 0, 1, 2, -1, 0, 1, 2],
+ [2, 1, -1, -2, 2, 1, -1, -2],
+ [2, 1, 1, 2, 2, 1, 1, 2] }
+
+ test_vec_max! { test_vec_max_u16x8, u16x8,
+ [0, 0, 1, 2, 0, 0, 1, 2],
+ [2, 1, 0, 0, 2, 1, 0, 0],
+ [2, 1, 1, 2, 2, 1, 1, 2] }
+
+ test_vec_max! { test_vec_max_i8x16, i8x16,
+ [-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2],
+ [2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2, 2, 1, -1, -2],
+ [2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2] }
+
+ test_vec_max! { test_vec_max_u8x16, u8x16,
+ [0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2],
+ [2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0],
+ [2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2] }
+
+ macro_rules! test_vec_perm {
+ {$name:ident,
+ $shorttype:ident, $longtype:ident,
+ [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a: $longtype = transmute($shorttype::new($($a),+));
+ let b: $longtype = transmute($shorttype::new($($b),+));
+ let c: vector_unsigned_char = transmute(u8x16::new($($c),+));
+ let d = $shorttype::new($($d),+);
+
+ let r: $shorttype = transmute(vec_perm(a, b, c));
+ assert_eq!(d, r);
+ }
+ }
+ }
+
+ test_vec_perm! {test_vec_perm_u8x16,
+ u8x16, vector_unsigned_char,
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
+ [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+ 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+ [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]}
+ test_vec_perm! {test_vec_perm_i8x16,
+ i8x16, vector_signed_char,
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
+ [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+ 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+ [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]}
+
+ test_vec_perm! {test_vec_perm_m8x16,
+ m8x16, vector_bool_char,
+ [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false],
+ [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true],
+ [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+ 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+ [false, false, true, true, false, false, true, true, false, false, true, true, false, false, true, true]}
+ test_vec_perm! {test_vec_perm_u16x8,
+ u16x8, vector_unsigned_short,
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ [10, 11, 12, 13, 14, 15, 16, 17],
+ [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+ 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+ [0, 10, 1, 11, 2, 12, 3, 13]}
+ test_vec_perm! {test_vec_perm_i16x8,
+ i16x8, vector_signed_short,
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ [10, 11, 12, 13, 14, 15, 16, 17],
+ [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+ 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+ [0, 10, 1, 11, 2, 12, 3, 13]}
+ test_vec_perm! {test_vec_perm_m16x8,
+ m16x8, vector_bool_short,
+ [false, false, false, false, false, false, false, false],
+ [true, true, true, true, true, true, true, true],
+ [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+ 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+ [false, true, false, true, false, true, false, true]}
+
+ test_vec_perm! {test_vec_perm_u32x4,
+ u32x4, vector_unsigned_int,
+ [0, 1, 2, 3],
+ [10, 11, 12, 13],
+ [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+ [0, 10, 1, 11]}
+ test_vec_perm! {test_vec_perm_i32x4,
+ i32x4, vector_signed_int,
+ [0, 1, 2, 3],
+ [10, 11, 12, 13],
+ [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+ [0, 10, 1, 11]}
+ test_vec_perm! {test_vec_perm_m32x4,
+ m32x4, vector_bool_int,
+ [false, false, false, false],
+ [true, true, true, true],
+ [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+ [false, true, false, true]}
+ test_vec_perm! {test_vec_perm_f32x4,
+ f32x4, vector_float,
+ [0.0, 1.0, 2.0, 3.0],
+ [1.0, 1.1, 1.2, 1.3],
+ [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+ [0.0, 1.0, 1.0, 1.1]}
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_madds() {
+ let a: vector_signed_short = transmute(i16x8::new(
+ 0 * 256,
+ 1 * 256,
+ 2 * 256,
+ 3 * 256,
+ 4 * 256,
+ 5 * 256,
+ 6 * 256,
+ 7 * 256,
+ ));
+ let b: vector_signed_short = transmute(i16x8::new(256, 256, 256, 256, 256, 256, 256, 256));
+ let c: vector_signed_short = transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+
+ let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, 21);
+
+ assert_eq!(d, transmute(vec_madds(a, b, c)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_madd_float() {
+ let a: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4));
+ let b: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4));
+ let c: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4));
+ let d = f32x4::new(
+ 0.1 * 0.1 + 0.1,
+ 0.2 * 0.2 + 0.2,
+ 0.3 * 0.3 + 0.3,
+ 0.4 * 0.4 + 0.4,
+ );
+
+ assert_eq!(d, transmute(vec_madd(a, b, c)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_nmsub_float() {
+ let a: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4));
+ let b: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4));
+ let c: vector_float = transmute(f32x4::new(0.1, 0.2, 0.3, 0.4));
+ let d = f32x4::new(
+ -(0.1 * 0.1 - 0.1),
+ -(0.2 * 0.2 - 0.2),
+ -(0.3 * 0.3 - 0.3),
+ -(0.4 * 0.4 - 0.4),
+ );
+ assert_eq!(d, transmute(vec_nmsub(a, b, c)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mradds() {
+ let a: vector_signed_short = transmute(i16x8::new(
+ 0 * 256,
+ 1 * 256,
+ 2 * 256,
+ 3 * 256,
+ 4 * 256,
+ 5 * 256,
+ 6 * 256,
+ 7 * 256,
+ ));
+ let b: vector_signed_short = transmute(i16x8::new(256, 256, 256, 256, 256, 256, 256, 256));
+ let c: vector_signed_short = transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, i16::MAX - 1));
+
+ let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, i16::MAX);
+
+ assert_eq!(d, transmute(vec_mradds(a, b, c)));
+ }
+
+ macro_rules! test_vec_mladd {
+ {$name:ident, $sa:ident, $la:ident, $sbc:ident, $lbc:ident, $sd:ident,
+ [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => {
+ #[simd_test(enable = "altivec")]
+ unsafe fn $name() {
+ let a: $la = transmute($sa::new($($a),+));
+ let b: $lbc = transmute($sbc::new($($b),+));
+ let c = transmute($sbc::new($($c),+));
+ let d = $sd::new($($d),+);
+
+ assert_eq!(d, transmute(vec_mladd(a, b, c)));
+ }
+ }
+ }
+
+ test_vec_mladd! { test_vec_mladd_u16x8_u16x8, u16x8, vector_unsigned_short, u16x8, vector_unsigned_short, u16x8,
+ [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56]
+ }
+ test_vec_mladd! { test_vec_mladd_u16x8_i16x8, u16x8, vector_unsigned_short, i16x8, vector_unsigned_short, i16x8,
+ [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56]
+ }
+ test_vec_mladd! { test_vec_mladd_i16x8_u16x8, i16x8, vector_signed_short, u16x8, vector_unsigned_short, i16x8,
+ [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56]
+ }
+ test_vec_mladd! { test_vec_mladd_i16x8_i16x8, i16x8, vector_signed_short, i16x8, vector_unsigned_short, i16x8,
+ [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 6, 12, 20, 30, 42, 56]
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_msum_unsigned_char() {
+ let a: vector_unsigned_char =
+ transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+ let b: vector_unsigned_char = transmute(u8x16::new(
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ ));
+ let c: vector_unsigned_int = transmute(u32x4::new(0, 1, 2, 3));
+ let d = u32x4::new(
+ (0 + 1 + 2 + 3) * 255 + 0,
+ (4 + 5 + 6 + 7) * 255 + 1,
+ (0 + 1 + 2 + 3) * 255 + 2,
+ (4 + 5 + 6 + 7) * 255 + 3,
+ );
+
+ assert_eq!(d, transmute(vec_msum(a, b, c)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_msum_signed_char() {
+ let a: vector_signed_char = transmute(i8x16::new(
+ 0, -1, 2, -3, 1, -1, 1, -1, 0, 1, 2, 3, 4, -5, -6, -7,
+ ));
+ let b: vector_unsigned_char =
+ transmute(i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1));
+ let c: vector_signed_int = transmute(u32x4::new(0, 1, 2, 3));
+ let d = i32x4::new(
+ (0 - 1 + 2 - 3) + 0,
+ (0) + 1,
+ (0 + 1 + 2 + 3) + 2,
+ (4 - 5 - 6 - 7) + 3,
+ );
+
+ assert_eq!(d, transmute(vec_msum(a, b, c)));
+ }
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_msum_unsigned_short() {
+ let a: vector_unsigned_short = transmute(u16x8::new(
+ 0 * 256,
+ 1 * 256,
+ 2 * 256,
+ 3 * 256,
+ 4 * 256,
+ 5 * 256,
+ 6 * 256,
+ 7 * 256,
+ ));
+ let b: vector_unsigned_short =
+ transmute(u16x8::new(256, 256, 256, 256, 256, 256, 256, 256));
+ let c: vector_unsigned_int = transmute(u32x4::new(0, 1, 2, 3));
+ let d = u32x4::new(
+ (0 + 1) * 256 * 256 + 0,
+ (2 + 3) * 256 * 256 + 1,
+ (4 + 5) * 256 * 256 + 2,
+ (6 + 7) * 256 * 256 + 3,
+ );
+
+ assert_eq!(d, transmute(vec_msum(a, b, c)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_msum_signed_short() {
+ let a: vector_signed_short = transmute(i16x8::new(
+ 0 * 256,
+ -1 * 256,
+ 2 * 256,
+ -3 * 256,
+ 4 * 256,
+ -5 * 256,
+ 6 * 256,
+ -7 * 256,
+ ));
+ let b: vector_signed_short = transmute(i16x8::new(256, 256, 256, 256, 256, 256, 256, 256));
+ let c: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3));
+ let d = i32x4::new(
+ (0 - 1) * 256 * 256 + 0,
+ (2 - 3) * 256 * 256 + 1,
+ (4 - 5) * 256 * 256 + 2,
+ (6 - 7) * 256 * 256 + 3,
+ );
+
+ assert_eq!(d, transmute(vec_msum(a, b, c)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_msums_unsigned() {
+ let a: vector_unsigned_short = transmute(u16x8::new(
+ 0 * 256,
+ 1 * 256,
+ 2 * 256,
+ 3 * 256,
+ 4 * 256,
+ 5 * 256,
+ 6 * 256,
+ 7 * 256,
+ ));
+ let b: vector_unsigned_short =
+ transmute(u16x8::new(256, 256, 256, 256, 256, 256, 256, 256));
+ let c: vector_unsigned_int = transmute(u32x4::new(0, 1, 2, 3));
+ let d = u32x4::new(
+ (0 + 1) * 256 * 256 + 0,
+ (2 + 3) * 256 * 256 + 1,
+ (4 + 5) * 256 * 256 + 2,
+ (6 + 7) * 256 * 256 + 3,
+ );
+
+ assert_eq!(d, transmute(vec_msums(a, b, c)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_msums_signed() {
+ let a: vector_signed_short = transmute(i16x8::new(
+ 0 * 256,
+ -1 * 256,
+ 2 * 256,
+ -3 * 256,
+ 4 * 256,
+ -5 * 256,
+ 6 * 256,
+ -7 * 256,
+ ));
+ let b: vector_signed_short = transmute(i16x8::new(256, 256, 256, 256, 256, 256, 256, 256));
+ let c: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3));
+ let d = i32x4::new(
+ (0 - 1) * 256 * 256 + 0,
+ (2 - 3) * 256 * 256 + 1,
+ (4 - 5) * 256 * 256 + 2,
+ (6 - 7) * 256 * 256 + 3,
+ );
+
+ assert_eq!(d, transmute(vec_msums(a, b, c)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_sum2s() {
+ let a: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3));
+ let b: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3));
+ let d = i32x4::new(0, 0 + 1 + 1, 0, 2 + 3 + 3);
+
+ assert_eq!(d, transmute(vec_sum2s(a, b)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_sum4s_unsigned_char() {
+ let a: vector_unsigned_char =
+ transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+ let b: vector_unsigned_int = transmute(u32x4::new(0, 1, 2, 3));
+ let d = u32x4::new(
+ 0 + 1 + 2 + 3 + 0,
+ 4 + 5 + 6 + 7 + 1,
+ 0 + 1 + 2 + 3 + 2,
+ 4 + 5 + 6 + 7 + 3,
+ );
+
+ assert_eq!(d, transmute(vec_sum4s(a, b)));
+ }
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_sum4s_signed_char() {
+ let a: vector_signed_char =
+ transmute(i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+ let b: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3));
+ let d = i32x4::new(
+ 0 + 1 + 2 + 3 + 0,
+ 4 + 5 + 6 + 7 + 1,
+ 0 + 1 + 2 + 3 + 2,
+ 4 + 5 + 6 + 7 + 3,
+ );
+
+ assert_eq!(d, transmute(vec_sum4s(a, b)));
+ }
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_sum4s_signed_short() {
+ let a: vector_signed_short = transmute(i16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+ let b: vector_signed_int = transmute(i32x4::new(0, 1, 2, 3));
+ let d = i32x4::new(0 + 1 + 0, 2 + 3 + 1, 4 + 5 + 2, 6 + 7 + 3);
+
+ assert_eq!(d, transmute(vec_sum4s(a, b)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mule_unsigned_char() {
+ let a: vector_unsigned_char =
+ transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+ let d = u16x8::new(0 * 0, 2 * 2, 4 * 4, 6 * 6, 0 * 0, 2 * 2, 4 * 4, 6 * 6);
+
+ assert_eq!(d, transmute(vec_mule(a, a)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mule_signed_char() {
+ let a: vector_signed_char = transmute(i8x16::new(
+ 0, 1, -2, 3, -4, 5, -6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ ));
+ let d = i16x8::new(0 * 0, 2 * 2, 4 * 4, 6 * 6, 0 * 0, 2 * 2, 4 * 4, 6 * 6);
+
+ assert_eq!(d, transmute(vec_mule(a, a)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mule_unsigned_short() {
+ let a: vector_unsigned_short = transmute(u16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+ let d = u32x4::new(0 * 0, 2 * 2, 4 * 4, 6 * 6);
+
+ assert_eq!(d, transmute(vec_mule(a, a)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mule_signed_short() {
+ let a: vector_signed_short = transmute(i16x8::new(0, 1, -2, 3, -4, 5, -6, 7));
+ let d = i32x4::new(0 * 0, 2 * 2, 4 * 4, 6 * 6);
+
+ assert_eq!(d, transmute(vec_mule(a, a)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mulo_unsigned_char() {
+ let a: vector_unsigned_char =
+ transmute(u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
+ let d = u16x8::new(1 * 1, 3 * 3, 5 * 5, 7 * 7, 1 * 1, 3 * 3, 5 * 5, 7 * 7);
+
+ assert_eq!(d, transmute(vec_mulo(a, a)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mulo_signed_char() {
+ let a: vector_signed_char = transmute(i8x16::new(
+ 0, 1, -2, 3, -4, 5, -6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ ));
+ let d = i16x8::new(1 * 1, 3 * 3, 5 * 5, 7 * 7, 1 * 1, 3 * 3, 5 * 5, 7 * 7);
+
+ assert_eq!(d, transmute(vec_mulo(a, a)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mulo_unsigned_short() {
+ let a: vector_unsigned_short = transmute(u16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+ let d = u32x4::new(1 * 1, 3 * 3, 5 * 5, 7 * 7);
+
+ assert_eq!(d, transmute(vec_mulo(a, a)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn test_vec_mulo_signed_short() {
+ let a: vector_signed_short = transmute(i16x8::new(0, 1, -2, 3, -4, 5, -6, 7));
+ let d = i32x4::new(1 * 1, 3 * 3, 5 * 5, 7 * 7);
+
+ assert_eq!(d, transmute(vec_mulo(a, a)));
+ }
+
+ #[simd_test(enable = "altivec")]
+ unsafe fn vec_add_i32x4_i32x4() {
+ let x = i32x4::new(1, 2, 3, 4);
+ let y = i32x4::new(4, 3, 2, 1);
+ let x: vector_signed_int = transmute(x);
+ let y: vector_signed_int = transmute(y);
+ let z = vec_add(x, y);
+ assert_eq!(i32x4::splat(5), transmute(z));
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/powerpc/mod.rs b/library/stdarch/crates/core_arch/src/powerpc/mod.rs
new file mode 100644
index 000000000..9765d11d1
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/powerpc/mod.rs
@@ -0,0 +1,19 @@
+//! PowerPC intrinsics
+
+#[cfg(target_feature = "altivec")]
+mod altivec;
+#[cfg(target_feature = "altivec")]
+pub use self::altivec::*;
+
+mod vsx;
+pub use self::vsx::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Generates the trap instruction `TRAP`
+#[cfg_attr(test, assert_instr(trap))]
+#[inline]
+pub unsafe fn trap() -> ! {
+ crate::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/powerpc/vsx.rs b/library/stdarch/crates/core_arch/src/powerpc/vsx.rs
new file mode 100644
index 000000000..3141bc8bc
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/powerpc/vsx.rs
@@ -0,0 +1,118 @@
+//! PowerPC Vector Scalar eXtensions (VSX) intrinsics.
+//!
+//! The references are: [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA
+//! NVlink)] and [POWER ISA v3.0B (for POWER9)].
+//!
+//! [POWER ISA v2.07B (for POWER8 & POWER8 with NVIDIA NVlink)]: https://ibm.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u
+//! [POWER ISA v3.0B (for POWER9)]: https://ibm.box.com/s/1hzcwkwf8rbju5h9iyf44wm94amnlcrv
+
+#![allow(non_camel_case_types)]
+
+use crate::core_arch::simd_llvm::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::mem;
+
+types! {
+ // pub struct vector_Float16 = f16x8;
+ /// PowerPC-specific 128-bit wide vector of two packed `i64`
+ pub struct vector_signed_long(i64, i64);
+ /// PowerPC-specific 128-bit wide vector of two packed `u64`
+ pub struct vector_unsigned_long(u64, u64);
+ /// PowerPC-specific 128-bit wide vector mask of two `i64`
+ pub struct vector_bool_long(i64, i64);
+ /// PowerPC-specific 128-bit wide vector of two packed `f64`
+ pub struct vector_double(f64, f64);
+ // pub struct vector_signed_long_long = vector_signed_long;
+ // pub struct vector_unsigned_long_long = vector_unsigned_long;
+ // pub struct vector_bool_long_long = vector_bool_long;
+ // pub struct vector_signed___int128 = i128x1;
+ // pub struct vector_unsigned___int128 = i128x1;
+}
+
+mod sealed {
+ use super::*;
+ use crate::core_arch::simd::*;
+
+ pub trait VectorPermDI {
+ unsafe fn vec_xxpermdi(self, b: Self, dm: u8) -> Self;
+ }
+
+    // xxpermdi has a big-endian bias and extended mnemonics
+ #[inline]
+ #[target_feature(enable = "vsx")]
+ #[cfg_attr(all(test, target_endian = "little"), assert_instr(xxmrgld, dm = 0x0))]
+ #[cfg_attr(all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0))]
+ unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 {
+ match dm & 0b11 {
+ 0 => simd_shuffle2!(a, b, [0b00, 0b10]),
+ 1 => simd_shuffle2!(a, b, [0b01, 0b10]),
+ 2 => simd_shuffle2!(a, b, [0b00, 0b11]),
+ _ => simd_shuffle2!(a, b, [0b01, 0b11]),
+ }
+ }
+
+ macro_rules! vec_xxpermdi {
+ {$impl: ident} => {
+ impl VectorPermDI for $impl {
+ #[inline]
+ #[target_feature(enable = "vsx")]
+ unsafe fn vec_xxpermdi(self, b: Self, dm: u8) -> Self {
+ mem::transmute(xxpermdi(mem::transmute(self), mem::transmute(b), dm))
+ }
+ }
+ }
+ }
+
+ vec_xxpermdi! { vector_unsigned_long }
+ vec_xxpermdi! { vector_signed_long }
+ vec_xxpermdi! { vector_bool_long }
+ vec_xxpermdi! { vector_double }
+}
+
+/// Vector permute.
+#[inline]
+#[target_feature(enable = "vsx")]
+//#[rustc_legacy_const_generics(2)]
+pub unsafe fn vec_xxpermdi<T, const DM: i32>(a: T, b: T) -> T
+where
+ T: sealed::VectorPermDI,
+{
+ static_assert_imm2!(DM);
+ a.vec_xxpermdi(b, DM as u8)
+}
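+
+// Editor's note: a hedged usage sketch, not part of the upstream file. With
+// `a = [a0, a1]` and `b = [b0, b1]` (e.g. as `vector_double`), the two bits
+// of `DM` select one doubleword from each input, matching the tests below:
+//
+// let r0 = vec_xxpermdi::<_, 0>(a, b); // [a0, b0]
+// let r1 = vec_xxpermdi::<_, 1>(a, b); // [a1, b0]
+// let r2 = vec_xxpermdi::<_, 2>(a, b); // [a0, b1]
+// let r3 = vec_xxpermdi::<_, 3>(a, b); // [a1, b1]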
+
+#[cfg(test)]
+mod tests {
+ #[cfg(target_arch = "powerpc")]
+ use crate::core_arch::arch::powerpc::*;
+
+ #[cfg(target_arch = "powerpc64")]
+ use crate::core_arch::arch::powerpc64::*;
+
+ use super::mem;
+ use crate::core_arch::simd::*;
+ use stdarch_test::simd_test;
+
+ macro_rules! test_vec_xxpermdi {
+ {$name:ident, $shorttype:ident, $longtype:ident, [$($a:expr),+], [$($b:expr),+], [$($c:expr),+], [$($d:expr),+]} => {
+ #[simd_test(enable = "vsx")]
+ unsafe fn $name() {
+ let a: $longtype = mem::transmute($shorttype::new($($a),+, $($b),+));
+ let b = mem::transmute($shorttype::new($($c),+, $($d),+));
+
+ assert_eq!($shorttype::new($($a),+, $($c),+), mem::transmute(vec_xxpermdi::<_, 0>(a, b)));
+ assert_eq!($shorttype::new($($b),+, $($c),+), mem::transmute(vec_xxpermdi::<_, 1>(a, b)));
+ assert_eq!($shorttype::new($($a),+, $($d),+), mem::transmute(vec_xxpermdi::<_, 2>(a, b)));
+ assert_eq!($shorttype::new($($b),+, $($d),+), mem::transmute(vec_xxpermdi::<_, 3>(a, b)));
+ }
+ }
+ }
+
+ test_vec_xxpermdi! {test_vec_xxpermdi_u64x2, u64x2, vector_unsigned_long, [0], [1], [2], [3]}
+ test_vec_xxpermdi! {test_vec_xxpermdi_i64x2, i64x2, vector_signed_long, [0], [-1], [2], [-3]}
+ test_vec_xxpermdi! {test_vec_xxpermdi_m64x2, m64x2, vector_bool_long, [false], [true], [false], [true]}
+ test_vec_xxpermdi! {test_vec_xxpermdi_f64x2, f64x2, vector_double, [0.0], [1.0], [2.0], [3.0]}
+}
diff --git a/library/stdarch/crates/core_arch/src/powerpc64/mod.rs b/library/stdarch/crates/core_arch/src/powerpc64/mod.rs
new file mode 100644
index 000000000..3990a0e8d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/powerpc64/mod.rs
@@ -0,0 +1,8 @@
+//! PowerPC 64
+//!
+//! The reference is the [64-Bit ELF V2 ABI Specification - Power
+//! Architecture].
+//!
+//! [64-Bit ELF V2 ABI Specification - Power Architecture]: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf
+
+pub use crate::core_arch::powerpc::*;
diff --git a/library/stdarch/crates/core_arch/src/riscv64/mod.rs b/library/stdarch/crates/core_arch/src/riscv64/mod.rs
new file mode 100644
index 000000000..751b9a860
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/riscv64/mod.rs
@@ -0,0 +1,49 @@
+//! RISC-V RV64 specific intrinsics
+use crate::arch::asm;
+
+/// Loads virtual machine memory by unsigned word integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This operation is not available under the RV32 base instruction set.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLV.WU` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlv_wu(src: *const u32) -> u32 {
+ let value: u32;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x681", out(reg) value, in(reg) src, options(readonly, nostack));
+ value
+}
+
+/// Loads virtual machine memory by double integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This operation is not available under the RV32 base instruction set.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLV.D` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlv_d(src: *const i64) -> i64 {
+ let value: i64;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x6C0", out(reg) value, in(reg) src, options(readonly, nostack));
+ value
+}
+
+/// Stores virtual machine memory by double integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HSV.D` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hsv_d(dst: *mut i64, src: i64) {
+ asm!(".insn r 0x73, 0x4, 0x37, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack));
+}
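+
+// Editor's note: a hedged sketch of how the paired load/store intrinsics
+// might be exercised, assuming HS-mode with the hypervisor extension enabled
+// and guest translation configured; `gva` is a hypothetical guest virtual
+// address mapped in the active guest address space.
+//
+// unsafe fn guest_roundtrip(gva: *mut i64) -> bool {
+//     hsv_d(gva, 0x1234);  // store through the guest's address translation
+//     hlv_d(gva) == 0x1234 // read it back via HLV.D
+// }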
diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
new file mode 100644
index 000000000..347735df1
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
@@ -0,0 +1,771 @@
+//! Shared RISC-V intrinsics
+
+use crate::arch::asm;
+
+/// Generates the `PAUSE` instruction
+///
+/// The PAUSE instruction is a HINT that indicates the current hart's rate of instruction retirement
+/// should be temporarily reduced or paused. The duration of its effect must be bounded and may be zero.
+#[inline]
+pub fn pause() {
+ unsafe { asm!(".insn i 0x0F, 0, x0, x0, 0x010", options(nomem, nostack)) }
+}
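+
+// Editor's note: a hedged usage sketch, not upstream code. `pause` fits
+// naturally inside a spin-wait loop; `flag` is a hypothetical shared flag.
+//
+// use core::sync::atomic::{AtomicBool, Ordering};
+// fn spin_until(flag: &AtomicBool) {
+//     while !flag.load(Ordering::Acquire) {
+//         pause(); // hint: retire instructions more slowly while polling
+//     }
+// }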
+
+/// Generates the `NOP` instruction
+///
+/// The NOP instruction does not change any architecturally visible state, except for
+/// advancing the `pc` and incrementing any applicable performance counters.
+#[inline]
+pub fn nop() {
+ unsafe { asm!("nop", options(nomem, nostack)) }
+}
+
+/// Generates the `WFI` instruction
+///
+/// The WFI instruction provides a hint to the implementation that the current hart can be stalled
+/// until an interrupt might need servicing. This instruction is a hint,
+/// and a legal implementation is to simply implement WFI as a NOP.
+#[inline]
+pub unsafe fn wfi() {
+ asm!("wfi", options(nomem, nostack))
+}
+
+/// Generates the `FENCE.I` instruction
+///
+/// A FENCE.I instruction ensures that a subsequent instruction fetch on a RISC-V hart will see
+/// any previous data stores already visible to the same RISC-V hart.
+///
+/// FENCE.I does not ensure that other RISC-V harts' instruction fetches will observe the
+/// local hart's stores in a multiprocessor system.
+#[inline]
+pub unsafe fn fence_i() {
+ asm!("fence.i", options(nostack))
+}
+
+/// Supervisor memory management fence for given virtual address and address space
+///
+/// The fence orders only reads and writes made to leaf page table entries corresponding to
+/// the virtual address in parameter `vaddr`, for the address space identified by integer parameter
+/// `asid`. Accesses to global mappings are not ordered. The fence also invalidates all
+/// address-translation cache entries that contain leaf page table entries corresponding to the
+/// virtual address in parameter `vaddr` and that match the address space identified by integer
+/// parameter `asid`, except for entries containing global mappings.
+#[inline]
+pub unsafe fn sfence_vma(vaddr: usize, asid: usize) {
+ asm!("sfence.vma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack))
+}
+
+/// Supervisor memory management fence for given virtual address
+///
+/// The fence orders only reads and writes made to leaf page table entries corresponding to
+/// the virtual address in parameter `vaddr`, for all address spaces.
+/// The fence also invalidates all address-translation cache entries that contain leaf page
+/// table entries corresponding to the virtual address in parameter `vaddr`, for all address spaces.
+#[inline]
+pub unsafe fn sfence_vma_vaddr(vaddr: usize) {
+ asm!("sfence.vma {}, x0", in(reg) vaddr, options(nostack))
+}
+
+/// Supervisor memory management fence for given address space
+///
+/// The fence orders all reads and writes made to any level of the page tables,
+/// but only for the address space identified by integer parameter `asid`.
+///
+/// Accesses to global mappings are not ordered. The fence also invalidates all
+/// address-translation cache entries matching the address space identified by integer
+/// parameter `asid`, except for entries containing global mappings.
+#[inline]
+pub unsafe fn sfence_vma_asid(asid: usize) {
+ asm!("sfence.vma x0, {}", in(reg) asid, options(nostack))
+}
+
+/// Supervisor memory management fence for all address spaces and virtual addresses
+///
+/// The fence orders all reads and writes made to any level of the page
+/// tables, for all address spaces. The fence also invalidates all address-translation cache entries,
+/// for all address spaces.
+#[inline]
+pub unsafe fn sfence_vma_all() {
+ asm!("sfence.vma", options(nostack))
+}
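+
+// Editor's note: a hedged sketch of the usual pattern, assuming a
+// hypothetical `write_leaf_pte` helper that rewrites one leaf page-table
+// entry. Fencing the single mapping avoids a full `sfence_vma_all`.
+//
+// unsafe fn remap_page(va: usize, asid: usize) {
+//     write_leaf_pte(va, asid); // hypothetical page-table update
+//     sfence_vma(va, asid);     // order the PTE store, drop the stale entry
+// }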
+
+/// Invalidate supervisor translation cache for given virtual address and address space
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate.
+#[inline]
+pub unsafe fn sinval_vma(vaddr: usize, asid: usize) {
+ // asm!("sinval.vma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack))
+ asm!(".insn r 0x73, 0, 0x0B, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack))
+}
+
+/// Invalidate supervisor translation cache for given virtual address
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate.
+#[inline]
+pub unsafe fn sinval_vma_vaddr(vaddr: usize) {
+ asm!(".insn r 0x73, 0, 0x0B, x0, {}, x0", in(reg) vaddr, options(nostack))
+}
+
+/// Invalidate supervisor translation cache for given address space
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate.
+#[inline]
+pub unsafe fn sinval_vma_asid(asid: usize) {
+ asm!(".insn r 0x73, 0, 0x0B, x0, x0, {}", in(reg) asid, options(nostack))
+}
+
+/// Invalidate supervisor translation cache for all address spaces and virtual addresses
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate.
+#[inline]
+pub unsafe fn sinval_vma_all() {
+ asm!(".insn r 0x73, 0, 0x0B, x0, x0, x0", options(nostack))
+}
+
+/// Generates the `SFENCE.W.INVAL` instruction
+///
+/// This instruction guarantees that any previous stores already visible to the current RISC-V hart
+/// are ordered before subsequent `SINVAL.VMA` instructions executed by the same hart.
+#[inline]
+pub unsafe fn sfence_w_inval() {
+ // asm!("sfence.w.inval", options(nostack))
+ asm!(".insn i 0x73, 0, x0, x0, 0x180", options(nostack))
+}
+
+/// Generates the `SFENCE.INVAL.IR` instruction
+///
+/// This instruction guarantees that any previous SINVAL.VMA instructions executed by the current hart
+/// are ordered before subsequent implicit references by that hart to the memory-management data structures.
+#[inline]
+pub unsafe fn sfence_inval_ir() {
+ // asm!("sfence.inval.ir", options(nostack))
+ asm!(".insn i 0x73, 0, x0, x0, 0x181", options(nostack))
+}
+
+/// Loads virtual machine memory by signed byte integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLV.B` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlv_b(src: *const i8) -> i8 {
+ let value: i8;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x600", out(reg) value, in(reg) src, options(readonly, nostack));
+ value
+}
+
+/// Loads virtual machine memory by unsigned byte integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLV.BU` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlv_bu(src: *const u8) -> u8 {
+ let value: u8;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x601", out(reg) value, in(reg) src, options(readonly, nostack));
+ value
+}
+
+/// Loads virtual machine memory by signed half integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLV.H` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlv_h(src: *const i16) -> i16 {
+ let value: i16;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x640", out(reg) value, in(reg) src, options(readonly, nostack));
+ value
+}
+
+/// Loads virtual machine memory by unsigned half integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLV.HU` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlv_hu(src: *const u16) -> u16 {
+ let value: u16;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x641", out(reg) value, in(reg) src, options(readonly, nostack));
+ value
+}
+
+/// Accesses virtual machine instruction by unsigned half integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// the memory being read must be executable in both stages of address translation,
+/// but read permission is not required.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLVX.HU` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlvx_hu(src: *const u16) -> u16 {
+ let insn: u16;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x643", out(reg) insn, in(reg) src, options(readonly, nostack));
+ insn
+}
+
+/// Loads virtual machine memory by signed word integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLV.W` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlv_w(src: *const i32) -> i32 {
+ let value: i32;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x680", out(reg) value, in(reg) src, options(readonly, nostack));
+ value
+}
+
+/// Accesses virtual machine instruction by unsigned word integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// the memory being read must be executable in both stages of address translation,
+/// but read permission is not required.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HLVX.WU` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hlvx_wu(src: *const u32) -> u32 {
+ let insn: u32;
+ asm!(".insn i 0x73, 0x4, {}, {}, 0x683", out(reg) insn, in(reg) src, options(readonly, nostack));
+ insn
+}
+
+/// Stores virtual machine memory by byte integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HSV.B` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hsv_b(dst: *mut i8, src: i8) {
+ asm!(".insn r 0x73, 0x4, 0x31, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack));
+}
+
+/// Stores virtual machine memory by half integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HSV.H` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hsv_h(dst: *mut i16, src: i16) {
+ asm!(".insn r 0x73, 0x4, 0x33, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack));
+}
+
+/// Stores virtual machine memory by word integer
+///
+/// This instruction performs an explicit memory access as though `V=1`;
+/// i.e., with the address translation and protection, and the endianness, that apply to memory
+/// accesses in either VS-mode or VU-mode.
+///
+/// This function is unsafe because it accesses virtual supervisor or user memory via a
+/// `HSV.W` instruction, which is effectively a dereference of an arbitrary memory address.
+#[inline]
+pub unsafe fn hsv_w(dst: *mut i32, src: i32) {
+ asm!(".insn r 0x73, 0x4, 0x35, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack));
+}
+
+/// Hypervisor memory management fence for given guest virtual address and guest address space
+///
+/// Guarantees that any previous stores already visible to the current hart are ordered before all
+/// implicit reads by that hart done for VS-stage address translation for instructions that:
+/// - are subsequent to the `HFENCE.VVMA`, and
+/// - execute when `hgatp.VMID` has the same setting as it did when `HFENCE.VVMA` executed.
+///
+/// This fence specifies a single guest virtual address, and a single guest address-space identifier.
+#[inline]
+pub unsafe fn hfence_vvma(vaddr: usize, asid: usize) {
+ // asm!("hfence.vvma {}, {}", in(reg) vaddr, in(reg) asid)
+ asm!(".insn r 0x73, 0, 0x11, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack))
+}
+
+/// Hypervisor memory management fence for given guest virtual address
+///
+/// Guarantees that any previous stores already visible to the current hart are ordered before all
+/// implicit reads by that hart done for VS-stage address translation for instructions that:
+/// - are subsequent to the `HFENCE.VVMA`, and
+/// - execute when `hgatp.VMID` has the same setting as it did when `HFENCE.VVMA` executed.
+///
+/// This fence specifies a single guest virtual address.
+#[inline]
+pub unsafe fn hfence_vvma_vaddr(vaddr: usize) {
+ asm!(".insn r 0x73, 0, 0x11, x0, {}, x0", in(reg) vaddr, options(nostack))
+}
+
+/// Hypervisor memory management fence for given guest address space
+///
+/// Guarantees that any previous stores already visible to the current hart are ordered before all
+/// implicit reads by that hart done for VS-stage address translation for instructions that:
+/// - are subsequent to the `HFENCE.VVMA`, and
+/// - execute when `hgatp.VMID` has the same setting as it did when `HFENCE.VVMA` executed.
+///
+/// This fence specifies a single guest address-space identifier.
+#[inline]
+pub unsafe fn hfence_vvma_asid(asid: usize) {
+ asm!(".insn r 0x73, 0, 0x11, x0, x0, {}", in(reg) asid, options(nostack))
+}
+
+/// Hypervisor memory management fence for all guest address spaces and guest virtual addresses
+///
+/// Guarantees that any previous stores already visible to the current hart are ordered before all
+/// implicit reads by that hart done for VS-stage address translation for instructions that:
+/// - are subsequent to the `HFENCE.VVMA`, and
+/// - execute when `hgatp.VMID` has the same setting as it did when `HFENCE.VVMA` executed.
+///
+/// This fence applies to any guest address spaces and guest virtual addresses.
+#[inline]
+pub unsafe fn hfence_vvma_all() {
+ asm!(".insn r 0x73, 0, 0x11, x0, x0, x0", options(nostack))
+}
+
+/// Hypervisor memory management fence for guest physical address and virtual machine
+///
+/// Guarantees that any previous stores already visible to the current hart are ordered before all implicit reads
+/// by that hart done for G-stage address translation for instructions that follow the HFENCE.GVMA.
+///
+/// This fence specifies a single guest physical address, **shifted right by 2 bits**, and a single virtual machine
+/// by virtual machine identifier (VMID).
+#[inline]
+pub unsafe fn hfence_gvma(gaddr: usize, vmid: usize) {
+ // asm!("hfence.gvma {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack))
+ asm!(".insn r 0x73, 0, 0x31, x0, {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack))
+}
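+
+// Editor's note: a hedged sketch highlighting the pre-shift the ISA requires
+// for the guest physical address argument.
+//
+// unsafe fn fence_guest_page(gpa: usize, vmid: usize) {
+//     hfence_gvma(gpa >> 2, vmid); // gpa must be shifted right by 2 bits
+// }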
+
+/// Hypervisor memory management fence for guest physical address
+///
+/// Guarantees that any previous stores already visible to the current hart are ordered before all implicit reads
+/// by that hart done for G-stage address translation for instructions that follow the HFENCE.GVMA.
+///
+/// This fence specifies a single guest physical address; **the physical address should be shifted right by 2 bits**.
+#[inline]
+pub unsafe fn hfence_gvma_gaddr(gaddr: usize) {
+ asm!(".insn r 0x73, 0, 0x31, x0, {}, x0", in(reg) gaddr, options(nostack))
+}
+
+/// Hypervisor memory management fence for given virtual machine
+///
+/// Guarantees that any previous stores already visible to the current hart are ordered before all implicit reads
+/// by that hart done for G-stage address translation for instructions that follow the HFENCE.GVMA.
+///
+/// This fence specifies a single virtual machine by virtual machine identifier (VMID).
+#[inline]
+pub unsafe fn hfence_gvma_vmid(vmid: usize) {
+ asm!(".insn r 0x73, 0, 0x31, x0, x0, {}", in(reg) vmid, options(nostack))
+}
+
+/// Hypervisor memory management fence for all virtual machines and guest physical addresses
+///
+/// Guarantees that any previous stores already visible to the current hart are ordered before all implicit reads
+/// by that hart done for G-stage address translation for instructions that follow the HFENCE.GVMA.
+///
+/// This fence specifies all guest physical addresses and all virtual machines.
+#[inline]
+pub unsafe fn hfence_gvma_all() {
+ asm!(".insn r 0x73, 0, 0x31, x0, x0, x0", options(nostack))
+}
+
+/// Invalidate hypervisor translation cache for given guest virtual address and guest address space
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.VVMA` instruction with the same values of `vaddr` and `asid` would invalidate.
+///
+/// This fence specifies a single guest virtual address, and a single guest address-space identifier.
+#[inline]
+pub unsafe fn hinval_vvma(vaddr: usize, asid: usize) {
+ // asm!("hinval.vvma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack))
+ asm!(".insn r 0x73, 0, 0x13, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack))
+}
+
+/// Invalidate hypervisor translation cache for given guest virtual address
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.VVMA` instruction with the same values of `vaddr` and `asid` would invalidate.
+///
+/// This fence specifies a single guest virtual address.
+#[inline]
+pub unsafe fn hinval_vvma_vaddr(vaddr: usize) {
+ asm!(".insn r 0x73, 0, 0x13, x0, {}, x0", in(reg) vaddr, options(nostack))
+}
+
+/// Invalidate hypervisor translation cache for given guest address space
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.VVMA` instruction with the same values of `vaddr` and `asid` would invalidate.
+///
+/// This fence specifies a single guest address-space identifier.
+#[inline]
+pub unsafe fn hinval_vvma_asid(asid: usize) {
+ asm!(".insn r 0x73, 0, 0x13, x0, x0, {}", in(reg) asid, options(nostack))
+}
+
+/// Invalidate hypervisor translation cache for all guest address spaces and guest virtual addresses
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.VVMA` instruction with the same values of `vaddr` and `asid` would invalidate.
+///
+/// This fence applies to any guest address spaces and guest virtual addresses.
+#[inline]
+pub unsafe fn hinval_vvma_all() {
+ asm!(".insn r 0x73, 0, 0x13, x0, x0, x0", options(nostack))
+}
+
+/// Invalidate hypervisor translation cache for guest physical address and virtual machine
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate.
+///
+/// This fence specifies a single guest physical address, **shifted right by 2 bits**, and a single virtual machine
+/// by virtual machine identifier (VMID).
+#[inline]
+pub unsafe fn hinval_gvma(gaddr: usize, vmid: usize) {
+ // asm!("hinval.gvma {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack))
+ asm!(".insn r 0x73, 0, 0x33, x0, {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack))
+}
+
+/// Invalidate hypervisor translation cache for guest physical address
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate.
+///
+/// This fence specifies a single guest physical address; **the physical address should be shifted right by 2 bits**.
+#[inline]
+pub unsafe fn hinval_gvma_gaddr(gaddr: usize) {
+ asm!(".insn r 0x73, 0, 0x33, x0, {}, x0", in(reg) gaddr, options(nostack))
+}
+
+/// Invalidate hypervisor translation cache for given virtual machine
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate.
+///
+/// This fence specifies a single virtual machine by virtual machine identifier (VMID).
+#[inline]
+pub unsafe fn hinval_gvma_vmid(vmid: usize) {
+ asm!(".insn r 0x73, 0, 0x33, x0, x0, {}", in(reg) vmid, options(nostack))
+}
+
+/// Reads the floating-point control and status register `fcsr`
+///
+/// Register `fcsr` is a 32-bit read/write register that selects the dynamic rounding mode
+/// for floating-point arithmetic operations and holds the accrued exception flags.
+///
+/// According to "F" Standard Extension for Single-Precision Floating-Point, Version 2.2,
+/// register `fcsr` is defined as:
+///
+/// | Bit index | Meaning |
+/// |:----------|:--------|
+/// | 0..=4 | Accrued Exceptions (`fflags`) |
+/// | 5..=7 | Rounding Mode (`frm`) |
+/// | 8..=31 | _Reserved_ |
+///
+/// For definition of each field, visit [`frrm`] and [`frflags`].
+///
+/// [`frrm`]: fn.frrm.html
+/// [`frflags`]: fn.frflags.html
+#[inline]
+pub fn frcsr() -> u32 {
+ let value: u32;
+ unsafe { asm!("frcsr {}", out(reg) value, options(nomem, nostack)) };
+ value
+}
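+
+// Editor's note: a hedged decoding sketch implied by the table above; the
+// two architectural fields can be split out with plain bit operations.
+//
+// let fcsr = frcsr();
+// let fflags = fcsr & 0x1f;    // bits 0..=4: accrued exception flags
+// let frm = (fcsr >> 5) & 0x7; // bits 5..=7: rounding mode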
+
+/// Swaps the floating-point control and status register `fcsr`
+///
+/// This function swaps the value in `fcsr` by copying the original value to be returned,
+/// and then writing a new value obtained from input variable `value` into `fcsr`.
+#[inline]
+pub fn fscsr(value: u32) -> u32 {
+ let original: u32;
+ unsafe { asm!("fscsr {}, {}", out(reg) original, in(reg) value, options(nomem, nostack)) }
+ original
+}
+
+/// Reads the floating-point rounding mode register `frm`
+///
+/// According to "F" Standard Extension for Single-Precision Floating-Point, Version 2.2,
+/// the rounding mode field is defined as listed in the table below:
+///
+/// | Rounding Mode | Mnemonic | Meaning |
+/// |:-------------|:----------|:---------|
+/// | 000 | RNE | Round to Nearest, ties to Even |
+/// | 001 | RTZ | Round towards Zero |
+/// | 010 | RDN | Round Down (towards −∞) |
+/// | 011 | RUP | Round Up (towards +∞) |
+/// | 100 | RMM | Round to Nearest, ties to Max Magnitude |
+/// | 101 | | _Reserved for future use._ |
+/// | 110 | | _Reserved for future use._ |
+/// | 111 | DYN | In Rounding Mode register, _reserved_. |
+#[inline]
+pub fn frrm() -> u32 {
+ let value: u32;
+ unsafe { asm!("frrm {}", out(reg) value, options(nomem, nostack)) };
+ value
+}
+
+/// Swaps the floating-point rounding mode register `frm`
+///
+/// This function swaps the value in `frm` by copying the original value to be returned,
+/// and then writing a new value obtained from the three least-significant bits of
+/// input variable `value` into `frm`.
+#[inline]
+pub fn fsrm(value: u32) -> u32 {
+ let original: u32;
+ unsafe { asm!("fsrm {}, {}", out(reg) original, in(reg) value, options(nomem, nostack)) }
+ original
+}
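+
+// Editor's note: a hedged sketch of scoping a temporary rounding-mode
+// change, relying on `fsrm` returning the previous value; `compute` is a
+// hypothetical floating-point computation.
+//
+// let old = fsrm(0b001); // RTZ: round towards zero
+// let y = compute();     // runs under the temporary rounding mode
+// fsrm(old);             // restore the previous rounding mode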
+
+/// Reads the floating-point accrued exception flags register `fflags`
+///
+/// The accrued exception flags indicate the exception conditions that have arisen
+/// on any floating-point arithmetic instruction since the field was last reset by software.
+///
+/// According to "F" Standard Extension for Single-Precision Floating-Point, Version 2.2,
+/// the accrued exception flags field is defined as a 5-bit vector.
+/// The meaning of each binary bit is listed in the table below.
+///
+/// | Bit index | Mnemonic | Meaning |
+/// |:--|:---|:-----------------|
+/// | 4 | NV | Invalid Operation |
+/// | 3 | DZ | Divide by Zero |
+/// | 2 | OF | Overflow |
+/// | 1 | UF | Underflow |
+/// | 0 | NX | Inexact |
+#[inline]
+pub fn frflags() -> u32 {
+ let value: u32;
+ unsafe { asm!("frflags {}", out(reg) value, options(nomem, nostack)) };
+ value
+}
+
+/// Swaps the floating-point accrued exception flags register `fflags`
+///
+/// This function swaps the value in `fflags` by copying the original value to be returned,
+/// and then writing a new value obtained from the five least-significant bits of
+/// input variable `value` into `fflags`.
+#[inline]
+pub fn fsflags(value: u32) -> u32 {
+ let original: u32;
+ unsafe { asm!("fsflags {}, {}", out(reg) original, in(reg) value, options(nomem, nostack)) }
+ original
+}
+
+/// Invalidate hypervisor translation cache for all virtual machines and guest physical addresses
+///
+/// This instruction invalidates any address-translation cache entries that an
+/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate.
+///
+/// This fence specifies all guest physical addresses and all virtual machines.
+#[inline]
+pub unsafe fn hinval_gvma_all() {
+ asm!(".insn r 0x73, 0, 0x33, x0, x0, x0", options(nostack))
+}
+
+/// `P0` transformation function as used in the SM3 hash algorithm
+///
+/// This function is included in `Zksh` extension. It's defined as:
+///
+/// ```text
+/// P0(X) = X ⊕ (X ≪ 9) ⊕ (X ≪ 17)
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+///
+/// In the SM3 algorithm, the `P0` transformation is used as `E ← P0(TT2)`, where the
+/// compression function `CF` derives the variable `E` from the intermediate
+/// value `TT2` in each iteration.
+///
+/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
+/// this instruction must always be independent from the data it operates on.
+#[inline]
+pub fn sm3p0(x: u32) -> u32 {
+ let ans: u32;
+ unsafe {
+ // asm!("sm3p0 {}, {}", out(reg) ans, in(reg) x, options(nomem, nostack))
+ asm!(".insn i 0x13, 0x1, {}, {}, 0x108", out(reg) ans, in(reg) x, options(nomem, nostack))
+ };
+ ans
+}
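+
+// Editor's note: a hedged cross-check, not upstream code. On a hart with
+// `Zksh`, the instruction should agree with the plain-Rust definition of
+// `P0` given above.
+//
+// fn p0_reference(x: u32) -> u32 {
+//     x ^ x.rotate_left(9) ^ x.rotate_left(17)
+// }
+// assert_eq!(sm3p0(0x1234_5678), p0_reference(0x1234_5678));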
+
+/// `P1` transformation function as used in the SM3 hash algorithm
+///
+/// This function is included in `Zksh` extension. It's defined as:
+///
+/// ```text
+/// P1(X) = X ⊕ (X ≪ 15) ⊕ (X ≪ 23)
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+///
+/// In the SM3 algorithm, the `P1` transformation is used for message expansion,
+/// where the expanded word `Wj` is generated from the previous words.
+/// The whole process can be described as the following pseudocode:
+///
+/// ```text
+/// FOR j=16 TO 67
+/// Wj ← P1(Wj−16 ⊕ Wj−9 ⊕ (Wj−3 ≪ 15)) ⊕ (Wj−13 ≪ 7) ⊕ Wj−6
+/// ENDFOR
+/// ```
+///
+/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
+/// this instruction must always be independent from the data it operates on.
+#[inline]
+pub fn sm3p1(x: u32) -> u32 {
+ let ans: u32;
+ unsafe {
+ // asm!("sm3p1 {}, {}", out(reg) ans, in(reg) x, options(nomem, nostack))
+ asm!(".insn i 0x13, 0x1, {}, {}, 0x109", out(reg) ans, in(reg) x, options(nomem, nostack))
+ };
+ ans
+}
+
+/// Accelerates the round function `F` in the SM4 block cipher algorithm
+///
+/// This instruction is included in extension `Zksed`. It's defined as:
+///
+/// ```text
+/// SM4ED(x, a, BS) = x ⊕ T(ai)
+/// ... where
+/// ai = a.bytes[BS]
+/// T(ai) = L(τ(ai))
+/// bi = τ(ai) = SM4-S-Box(ai)
+/// ci = L(bi) = bi ⊕ (bi ≪ 2) ⊕ (bi ≪ 10) ⊕ (bi ≪ 18) ⊕ (bi ≪ 24)
+/// SM4ED = (ci ≪ (BS * 8)) ⊕ x
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+/// As defined above, `T` is a combined transformation of the nonlinear S-box
+/// transform `τ` and the linear layer transform `L`.
+///
+/// In the SM4 algorithm, the round function `F` is defined as:
+///
+/// ```text
+/// F(x0, x1, x2, x3, rk) = x0 ⊕ T(x1 ⊕ x2 ⊕ x3 ⊕ rk)
+/// ... where
+/// T(A) = L(τ(A))
+/// B = τ(A) = (SM4-S-Box(a0), SM4-S-Box(a1), SM4-S-Box(a2), SM4-S-Box(a3))
+/// C = L(B) = B ⊕ (B ≪ 2) ⊕ (B ≪ 10) ⊕ (B ≪ 18) ⊕ (B ≪ 24)
+/// ```
+///
+/// It can be implemented by the `sm4ed` instruction like:
+///
+/// ```text
+/// fn round(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+///     let a = x1 ^ x2 ^ x3 ^ rk;
+///     let c0 = sm4ed::<0>(x0, a);
+///     let c1 = sm4ed::<1>(c0, a); // c1 represents c[0..=1], etc.
+///     let c2 = sm4ed::<2>(c1, a);
+///     let c3 = sm4ed::<3>(c2, a);
+///     c3 // c3 represents c[0..=3]
+/// }
+/// ```
+///
+/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
+/// this instruction must always be independent from the data it operates on.
+#[inline]
+pub fn sm4ed<const BS: u8>(x: u32, a: u32) -> u32 {
+ static_assert!(BS: u8 where BS <= 3);
+ let ans: u32;
+ match BS {
+ 0 => unsafe {
+ asm!(".insn r 0x33, 0, 0x18, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack))
+ },
+ 1 => unsafe {
+ asm!(".insn r 0x33, 0, 0x38, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack))
+ },
+ 2 => unsafe {
+ asm!(".insn r 0x33, 0, 0x58, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack))
+ },
+ 3 => unsafe {
+ asm!(".insn r 0x33, 0, 0x78, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack))
+ },
+ _ => unreachable!(),
+ };
+ ans
+}
+
+/// Accelerates the key schedule operation in the SM4 block cipher algorithm
+///
+/// This instruction is included in extension `Zksed`. It's defined as:
+///
+/// ```text
+/// SM4KS(x, k, BS) = x ⊕ T'(ki)
+/// ... where
+/// ki = k.bytes[BS]
+/// T'(ki) = L'(τ(ki))
+/// bi = τ(ki) = SM4-S-Box(ki)
+/// ci = L'(bi) = bi ⊕ (bi ≪ 13) ⊕ (bi ≪ 23)
+/// SM4KS = (ci ≪ (BS * 8)) ⊕ x
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+/// As defined above, `T'` is a combined transformation of the nonlinear S-box
+/// transform `τ` and the replacement linear layer transform `L'`.
+///
+/// In the SM4 algorithm, the key schedule is defined as:
+///
+/// ```text
+/// rk[i] = K[i+4] = K[i] ⊕ T'(K[i+1] ⊕ K[i+2] ⊕ K[i+3] ⊕ CK[i])
+/// ... where
+/// K[0..=3] = MK[0..=3] ⊕ FK[0..=3]
+/// T'(K) = L'(τ(K))
+/// B = τ(K) = (SM4-S-Box(k0), SM4-S-Box(k1), SM4-S-Box(k2), SM4-S-Box(k3))
+/// C = L'(B) = B ⊕ (B ≪ 13) ⊕ (B ≪ 23)
+/// ```
+///
+/// where `MK` represents the input 128-bit encryption key,
+/// constants `FK` and `CK` are fixed system configuration constant values defined by the SM4 algorithm.
+/// Hence, the key schedule operation can be implemented by the `sm4ks` instruction like:
+///
+/// ```text
+/// fn key_schedule_step(k0: u32, k1: u32, k2: u32, k3: u32, ck_i: u32) -> u32 {
+///     let k = k1 ^ k2 ^ k3 ^ ck_i;
+///     let c0 = sm4ks::<0>(k0, k);
+///     let c1 = sm4ks::<1>(c0, k); // c1 represents c[0..=1], etc.
+///     let c2 = sm4ks::<2>(c1, k);
+///     let c3 = sm4ks::<3>(c2, k);
+///     c3 // c3 represents c[0..=3]
+/// }
+/// ```
+///
+/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
+/// this instruction must always be independent from the data it operates on.
+#[inline]
+pub fn sm4ks<const BS: u8>(x: u32, k: u32) -> u32 {
+ static_assert!(BS: u8 where BS <= 3);
+ let ans: u32;
+ match BS {
+ 0 => unsafe {
+ asm!(".insn r 0x33, 0, 0x1A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack))
+ },
+ 1 => unsafe {
+ asm!(".insn r 0x33, 0, 0x3A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack))
+ },
+ 2 => unsafe {
+ asm!(".insn r 0x33, 0, 0x5A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack))
+ },
+ 3 => unsafe {
+ asm!(".insn r 0x33, 0, 0x7A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack))
+ },
+ _ => unreachable!(),
+ };
+ ans
+}
diff --git a/library/stdarch/crates/core_arch/src/simd.rs b/library/stdarch/crates/core_arch/src/simd.rs
new file mode 100644
index 000000000..281fefba4
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/simd.rs
@@ -0,0 +1,1105 @@
+//! Internal `#[repr(simd)]` types
+
+#![allow(non_camel_case_types)]
+
+macro_rules! simd_ty {
+ ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => {
+ #[repr(simd)]
+ #[derive(Copy, Clone, Debug, PartialEq)]
+ pub(crate) struct $id($(pub $elem_ty),*);
+
+ #[allow(clippy::use_self)]
+ impl $id {
+ #[inline(always)]
+ pub(crate) const fn new($($elem_name: $elem_ty),*) -> Self {
+ $id($($elem_name),*)
+ }
+ // FIXME: Workaround rust@60637
+ #[inline(always)]
+ pub(crate) const fn splat(value: $ety) -> Self {
+ $id($({
+ #[allow(non_camel_case_types, dead_code)]
+ struct $elem_name;
+ value
+ }),*)
+ }
+
+ // FIXME: Workaround rust@60637
+ #[inline(always)]
+ pub(crate) fn extract(self, index: usize) -> $ety {
+ unsafe {
+ crate::core_arch::simd_llvm::simd_extract(self, index as u32)
+ }
+ }
+ }
+ }
+}
+
+macro_rules! simd_m_ty {
+ ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => {
+ #[repr(simd)]
+ #[derive(Copy, Clone, Debug, PartialEq)]
+ pub(crate) struct $id($(pub $elem_ty),*);
+
+ #[allow(clippy::use_self)]
+ impl $id {
+ #[inline(always)]
+ const fn bool_to_internal(x: bool) -> $ety {
+ [0 as $ety, !(0 as $ety)][x as usize]
+ }
+
+ #[inline(always)]
+ pub(crate) const fn new($($elem_name: bool),*) -> Self {
+ $id($(Self::bool_to_internal($elem_name)),*)
+ }
+
+ // FIXME: Workaround rust@60637
+ #[inline(always)]
+ pub(crate) const fn splat(value: bool) -> Self {
+ $id($({
+ #[allow(non_camel_case_types, dead_code)]
+ struct $elem_name;
+ Self::bool_to_internal(value)
+ }),*)
+ }
+
+ // FIXME: Workaround rust@60637
+ #[inline(always)]
+ pub(crate) fn extract(self, index: usize) -> bool {
+ let r: $ety = unsafe {
+ crate::core_arch::simd_llvm::simd_extract(self, index as u32)
+ };
+ r != 0
+ }
+ }
+ }
+}
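+
+// Editor's note: a hedged illustration of the generated API (the concrete
+// types are defined below); lanes are constructed, splatted, and read back:
+//
+// let v = u32x4::new(1, 2, 3, 4);
+// assert_eq!(v.extract(2), 3); // lane access via simd_extract
+// assert_eq!(u32x4::splat(7), u32x4::new(7, 7, 7, 7));
+// let m = m32x4::new(true, false, true, false);
+// assert!(m.extract(0) && !m.extract(1)); // masks hold 0 / !0 per lane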
+
+// 16-bit wide types:
+
+simd_ty!(u8x2[u8]: u8, u8 | x0, x1);
+simd_ty!(i8x2[i8]: i8, i8 | x0, x1);
+
+// 32-bit wide types:
+
+simd_ty!(u8x4[u8]: u8, u8, u8, u8 | x0, x1, x2, x3);
+simd_ty!(u16x2[u16]: u16, u16 | x0, x1);
+
+simd_ty!(i8x4[i8]: i8, i8, i8, i8 | x0, x1, x2, x3);
+simd_ty!(i16x2[i16]: i16, i16 | x0, x1);
+
+// 64-bit wide types:
+
+simd_ty!(
+ u8x8[u8]: u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+simd_ty!(u16x4[u16]: u16, u16, u16, u16 | x0, x1, x2, x3);
+simd_ty!(u32x2[u32]: u32, u32 | x0, x1);
+simd_ty!(u64x1[u64]: u64 | x1);
+
+simd_ty!(
+ i8x8[i8]: i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+simd_ty!(i16x4[i16]: i16, i16, i16, i16 | x0, x1, x2, x3);
+simd_ty!(i32x2[i32]: i32, i32 | x0, x1);
+simd_ty!(i64x1[i64]: i64 | x1);
+
+simd_ty!(f32x2[f32]: f32, f32 | x0, x1);
+simd_ty!(f64x1[f64]: f64 | x1);
+
+// 128-bit wide types:
+
+simd_ty!(
+ u8x16[u8]: u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15
+);
+simd_ty!(
+ u16x8[u16]: u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+simd_ty!(u32x4[u32]: u32, u32, u32, u32 | x0, x1, x2, x3);
+simd_ty!(u64x2[u64]: u64, u64 | x0, x1);
+
+simd_ty!(
+ i8x16[i8]: i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15
+);
+simd_ty!(
+ i16x8[i16]: i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+simd_ty!(i32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3);
+simd_ty!(i64x2[i64]: i64, i64 | x0, x1);
+
+simd_ty!(f32x4[f32]: f32, f32, f32, f32 | x0, x1, x2, x3);
+simd_ty!(f64x2[f64]: f64, f64 | x0, x1);
+simd_ty!(f64x4[f64]: f64, f64, f64, f64 | x0, x1, x2, x3);
+
+simd_m_ty!(
+ m8x16[i8]: i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15
+);
+simd_m_ty!(
+ m16x8[i16]: i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+simd_m_ty!(m32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3);
+simd_m_ty!(m64x2[i64]: i64, i64 | x0, x1);
+
+// 256-bit wide types:
+
+simd_ty!(
+ u8x32[u8]: u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15,
+ x16,
+ x17,
+ x18,
+ x19,
+ x20,
+ x21,
+ x22,
+ x23,
+ x24,
+ x25,
+ x26,
+ x27,
+ x28,
+ x29,
+ x30,
+ x31
+);
+simd_ty!(
+ u16x16[u16]: u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15
+);
+simd_ty!(
+ u32x8[u32]: u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+simd_ty!(u64x4[u64]: u64, u64, u64, u64 | x0, x1, x2, x3);
+
+simd_ty!(
+ i8x32[i8]: i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15,
+ x16,
+ x17,
+ x18,
+ x19,
+ x20,
+ x21,
+ x22,
+ x23,
+ x24,
+ x25,
+ x26,
+ x27,
+ x28,
+ x29,
+ x30,
+ x31
+);
+simd_ty!(
+ i16x16[i16]: i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15
+);
+simd_ty!(
+ i32x8[i32]: i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3);
+
+simd_ty!(
+ f32x8[f32]: f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+
+// 512-bit wide types:
+
+simd_ty!(
+ i8x64[i8]: i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8,
+ i8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15,
+ x16,
+ x17,
+ x18,
+ x19,
+ x20,
+ x21,
+ x22,
+ x23,
+ x24,
+ x25,
+ x26,
+ x27,
+ x28,
+ x29,
+ x30,
+ x31,
+ x32,
+ x33,
+ x34,
+ x35,
+ x36,
+ x37,
+ x38,
+ x39,
+ x40,
+ x41,
+ x42,
+ x43,
+ x44,
+ x45,
+ x46,
+ x47,
+ x48,
+ x49,
+ x50,
+ x51,
+ x52,
+ x53,
+ x54,
+ x55,
+ x56,
+ x57,
+ x58,
+ x59,
+ x60,
+ x61,
+ x62,
+ x63
+);
+
+simd_ty!(
+ u8x64[u8]: u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8,
+ u8 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15,
+ x16,
+ x17,
+ x18,
+ x19,
+ x20,
+ x21,
+ x22,
+ x23,
+ x24,
+ x25,
+ x26,
+ x27,
+ x28,
+ x29,
+ x30,
+ x31,
+ x32,
+ x33,
+ x34,
+ x35,
+ x36,
+ x37,
+ x38,
+ x39,
+ x40,
+ x41,
+ x42,
+ x43,
+ x44,
+ x45,
+ x46,
+ x47,
+ x48,
+ x49,
+ x50,
+ x51,
+ x52,
+ x53,
+ x54,
+ x55,
+ x56,
+ x57,
+ x58,
+ x59,
+ x60,
+ x61,
+ x62,
+ x63
+);
+
+simd_ty!(
+ i16x32[i16]: i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16,
+ i16 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15,
+ x16,
+ x17,
+ x18,
+ x19,
+ x20,
+ x21,
+ x22,
+ x23,
+ x24,
+ x25,
+ x26,
+ x27,
+ x28,
+ x29,
+ x30,
+ x31
+);
+
+simd_ty!(
+ u16x32[u16]: u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16,
+ u16 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15,
+ x16,
+ x17,
+ x18,
+ x19,
+ x20,
+ x21,
+ x22,
+ x23,
+ x24,
+ x25,
+ x26,
+ x27,
+ x28,
+ x29,
+ x30,
+ x31
+);
+
+simd_ty!(
+ i32x16[i32]: i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32,
+ i32 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15
+);
+
+simd_ty!(
+ u32x16[u32]: u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32,
+ u32 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15
+);
+
+simd_ty!(
+ f32x16[f32]: f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32,
+ f32 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7,
+ x8,
+ x9,
+ x10,
+ x11,
+ x12,
+ x13,
+ x14,
+ x15
+);
+
+simd_ty!(
+ i64x8[i64]: i64,
+ i64,
+ i64,
+ i64,
+ i64,
+ i64,
+ i64,
+ i64 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+
+simd_ty!(
+ u64x8[u64]: u64,
+ u64,
+ u64,
+ u64,
+ u64,
+ u64,
+ u64,
+ u64 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
+
+simd_ty!(
+ f64x8[f64]: f64,
+ f64,
+ f64,
+ f64,
+ f64,
+ f64,
+ f64,
+ f64 | x0,
+ x1,
+ x2,
+ x3,
+ x4,
+ x5,
+ x6,
+ x7
+);
diff --git a/library/stdarch/crates/core_arch/src/simd_llvm.rs b/library/stdarch/crates/core_arch/src/simd_llvm.rs
new file mode 100644
index 000000000..1970e5c69
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/simd_llvm.rs
@@ -0,0 +1,81 @@
+//! LLVM's SIMD platform intrinsics
+
+extern "platform-intrinsic" {
+ //pub fn simd_select_bitmask
+ pub fn simd_eq<T, U>(x: T, y: T) -> U;
+ pub fn simd_ne<T, U>(x: T, y: T) -> U;
+ pub fn simd_lt<T, U>(x: T, y: T) -> U;
+ pub fn simd_le<T, U>(x: T, y: T) -> U;
+ pub fn simd_gt<T, U>(x: T, y: T) -> U;
+ pub fn simd_ge<T, U>(x: T, y: T) -> U;
+
+ pub fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
+ pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
+ pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
+ pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
+ pub fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U;
+ pub fn simd_shuffle64<T, U>(x: T, y: T, idx: [u32; 64]) -> U;
+ pub fn simd_shuffle128<T, U>(x: T, y: T, idx: [u32; 128]) -> U;
+
+ #[rustc_const_unstable(feature = "const_simd_insert", issue = "none")]
+ pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
+ #[rustc_const_unstable(feature = "const_simd_extract", issue = "none")]
+ pub fn simd_extract<T, U>(x: T, idx: u32) -> U;
+ //pub fn simd_select
+ pub fn simd_bitmask<T, U>(x: T) -> U;
+
+ pub fn simd_cast<T, U>(x: T) -> U;
+
+ pub fn simd_add<T>(x: T, y: T) -> T;
+ pub fn simd_sub<T>(x: T, y: T) -> T;
+ pub fn simd_mul<T>(x: T, y: T) -> T;
+ pub fn simd_div<T>(x: T, y: T) -> T;
+ pub fn simd_shl<T>(x: T, y: T) -> T;
+ pub fn simd_shr<T>(x: T, y: T) -> T;
+ pub fn simd_and<T>(x: T, y: T) -> T;
+ pub fn simd_or<T>(x: T, y: T) -> T;
+ pub fn simd_xor<T>(x: T, y: T) -> T;
+
+ pub fn simd_neg<T>(x: T) -> T;
+
+ pub fn simd_saturating_add<T>(x: T, y: T) -> T;
+ pub fn simd_saturating_sub<T>(x: T, y: T) -> T;
+
+ pub fn simd_gather<T, U, V>(values: T, pointers: U, mask: V) -> T;
+ pub fn simd_scatter<T, U, V>(values: T, pointers: U, mask: V);
+
+ pub fn simd_reduce_add_unordered<T, U>(x: T) -> U;
+ pub fn simd_reduce_mul_unordered<T, U>(x: T) -> U;
+ pub fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U;
+ pub fn simd_reduce_mul_ordered<T, U>(x: T, acc: U) -> U;
+ pub fn simd_reduce_min<T, U>(x: T) -> U;
+ pub fn simd_reduce_max<T, U>(x: T) -> U;
+ pub fn simd_reduce_min_nanless<T, U>(x: T) -> U;
+ pub fn simd_reduce_max_nanless<T, U>(x: T) -> U;
+ pub fn simd_reduce_and<T, U>(x: T) -> U;
+ pub fn simd_reduce_or<T, U>(x: T) -> U;
+ pub fn simd_reduce_xor<T, U>(x: T) -> U;
+ pub fn simd_reduce_all<T>(x: T) -> bool;
+ pub fn simd_reduce_any<T>(x: T) -> bool;
+
+ pub fn simd_select<M, T>(m: M, a: T, b: T) -> T;
+ pub fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T;
+
+ pub fn simd_fmin<T>(a: T, b: T) -> T;
+ pub fn simd_fmax<T>(a: T, b: T) -> T;
+
+ pub fn simd_fsqrt<T>(a: T) -> T;
+ pub fn simd_fsin<T>(a: T) -> T;
+ pub fn simd_fcos<T>(a: T) -> T;
+ pub fn simd_fabs<T>(a: T) -> T;
+ pub fn simd_floor<T>(a: T) -> T;
+ pub fn simd_ceil<T>(a: T) -> T;
+ pub fn simd_fexp<T>(a: T) -> T;
+ pub fn simd_fexp2<T>(a: T) -> T;
+ pub fn simd_flog10<T>(a: T) -> T;
+ pub fn simd_flog2<T>(a: T) -> T;
+ pub fn simd_flog<T>(a: T) -> T;
+ //pub fn simd_fpowi
+ //pub fn simd_fpow
+ pub fn simd_fma<T>(a: T, b: T, c: T) -> T;
+}
diff --git a/library/stdarch/crates/core_arch/src/v64.rs b/library/stdarch/crates/core_arch/src/v64.rs
new file mode 100644
index 000000000..631b76a85
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/v64.rs
@@ -0,0 +1,85 @@
+//! 64-bit wide vector types
+
+use crate::prelude::v1::*;
+
+use crate::core_arch::simd_llvm::*;
+
+define_ty_doc! {
+ f32x2, f32, f32 |
+ /// A 64-bit vector with 2 `f32` lanes.
+}
+define_impl! { f32x2, f32, 2, i32x2, x0, x1 }
+
+define_ty_doc! {
+ u32x2, u32, u32 |
+ /// A 64-bit vector with 2 `u32` lanes.
+}
+define_impl! { u32x2, u32, 2, i32x2, x0, x1 }
+
+define_ty! { i32x2, i32, i32 }
+define_impl! { i32x2, i32, 2, i32x2, x0, x1 }
+
+define_ty! { u16x4, u16, u16, u16, u16 }
+define_impl! { u16x4, u16, 4, i16x4, x0, x1, x2, x3 }
+
+define_ty! { i16x4, i16, i16, i16, i16 }
+define_impl! { i16x4, i16, 4, i16x4, x0, x1, x2, x3 }
+
+define_ty! { u8x8, u8, u8, u8, u8, u8, u8, u8, u8 }
+define_impl! { u8x8, u8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+define_ty! { i8x8, i8, i8, i8, i8, i8, i8, i8, i8 }
+define_impl! { i8x8, i8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+define_from!(u32x2, i32x2, u16x4, i16x4, u8x8, i8x8);
+define_from!(i32x2, u32x2, u16x4, i16x4, u8x8, i8x8);
+define_from!(u16x4, u32x2, i32x2, i16x4, u8x8, i8x8);
+define_from!(i16x4, u32x2, i32x2, u16x4, u8x8, i8x8);
+define_from!(u8x8, u32x2, i32x2, u16x4, i16x4, i8x8);
+define_from!(i8x8, u32x2, i32x2, u16x4, i16x4, u8x8);
+
+define_common_ops!(f32x2, u32x2, i32x2, u16x4, i16x4, u8x8, i8x8);
+define_float_ops!(f32x2);
+define_integer_ops!(
+ (u32x2, u32),
+ (i32x2, i32),
+ (u16x4, u16),
+ (i16x4, i16),
+ (u8x8, u8),
+ (i8x8, i8)
+);
+define_signed_integer_ops!(i32x2, i16x4, i8x8);
+define_casts!(
+ (f32x2, f64x2, as_f64x2),
+ (f32x2, u32x2, as_u32x2),
+ (f32x2, i32x2, as_i32x2),
+ (u32x2, f32x2, as_f32x2),
+ (u32x2, i32x2, as_i32x2),
+ (i32x2, f32x2, as_f32x2),
+ (i32x2, u32x2, as_u32x2),
+ (u16x4, i16x4, as_i16x4),
+ (i16x4, u16x4, as_u16x4),
+ (u8x8, i8x8, as_i8x8),
+ (i8x8, u8x8, as_u8x8),
+ (i8x8, i16x8, as_i16x8),
+ (u8x8, i16x8, as_i16x8),
+ (i16x4, i32x4, as_i32x4),
+ (i32x2, i64x2, as_i64x2),
+ (u8x8, u16x8, as_u16x8),
+ (u16x4, u32x4, as_u32x4),
+ (u16x4, i32x4, as_i32x4),
+ (u32x2, u64x2, as_u64x2),
+ (u32x2, i64x2, as_i64x2)
+);
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn operators() {
+ test_ops_si!(i8x8, i16x4, i32x2);
+ test_ops_ui!(u8x8, u16x4, u32x2);
+ test_ops_f!(f32x2);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/wasm32/atomic.rs b/library/stdarch/crates/core_arch/src/wasm32/atomic.rs
new file mode 100644
index 000000000..52d4bea87
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/wasm32/atomic.rs
@@ -0,0 +1,93 @@
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+extern "C" {
+ #[link_name = "llvm.wasm.memory.atomic.wait32"]
+ fn llvm_atomic_wait_i32(ptr: *mut i32, exp: i32, timeout: i64) -> i32;
+ #[link_name = "llvm.wasm.memory.atomic.wait64"]
+ fn llvm_atomic_wait_i64(ptr: *mut i64, exp: i64, timeout: i64) -> i32;
+ #[link_name = "llvm.wasm.memory.atomic.notify"]
+ fn llvm_atomic_notify(ptr: *mut i32, cnt: i32) -> i32;
+}
+
+/// Corresponding intrinsic to wasm's [`memory.atomic.wait32` instruction][instr]
+///
+/// This function, when called, will block the current thread if the memory
+/// pointed to by `ptr` is equal to `expression` (performing this action
+/// atomically).
+///
+/// The argument `timeout_ns` is a maximum number of nanoseconds the calling
+/// thread will be blocked for, if it blocks. If the timeout is negative then
+/// the calling thread will be blocked forever.
+///
+/// The calling thread can only be woken up with a call to the `wake` intrinsic
+/// once it has been blocked. Changing the memory behind `ptr` will not wake
+/// the thread once it's blocked.
+///
+/// # Return value
+///
+/// * 0 - the thread blocked and was then woken up
+/// * 1 - the loaded value from `ptr` didn't match `expression`, so the
+///   thread didn't block
+/// * 2 - the thread blocked, but the timeout expired
+///
+/// [instr]: https://webassembly.github.io/threads/core/syntax/instructions.html#syntax-instr-atomic-memory
+#[inline]
+#[cfg_attr(test, assert_instr(memory.atomic.wait32))]
+#[target_feature(enable = "atomics")]
+#[doc(alias("memory.atomic.wait32"))]
+pub unsafe fn memory_atomic_wait32(ptr: *mut i32, expression: i32, timeout_ns: i64) -> i32 {
+ llvm_atomic_wait_i32(ptr, expression, timeout_ns)
+}
+
+/// Corresponding intrinsic to wasm's [`memory.atomic.wait64` instruction][instr]
+///
+/// This function, when called, will block the current thread if the memory
+/// pointed to by `ptr` is equal to `expression` (performing this action
+/// atomically).
+///
+/// The argument `timeout_ns` is a maximum number of nanoseconds the calling
+/// thread will be blocked for, if it blocks. If the timeout is negative then
+/// the calling thread will be blocked forever.
+///
+/// The calling thread can only be woken up with a call to the `wake` intrinsic
+/// once it has been blocked. Changing the memory behind `ptr` will not wake
+/// the thread once it's blocked.
+///
+/// # Return value
+///
+/// * 0 - the thread blocked and was then woken up
+/// * 1 - the loaded value from `ptr` didn't match `expression`, so the
+///   thread didn't block
+/// * 2 - the thread blocked, but the timeout expired
+///
+/// [instr]: https://webassembly.github.io/threads/core/syntax/instructions.html#syntax-instr-atomic-memory
+#[inline]
+#[cfg_attr(test, assert_instr(memory.atomic.wait64))]
+#[target_feature(enable = "atomics")]
+#[doc(alias("memory.atomic.wait64"))]
+pub unsafe fn memory_atomic_wait64(ptr: *mut i64, expression: i64, timeout_ns: i64) -> i32 {
+ llvm_atomic_wait_i64(ptr, expression, timeout_ns)
+}
+
+/// Corresponding intrinsic to wasm's [`memory.atomic.notify` instruction][instr]
+///
+/// This function will notify a number of threads blocked on the address
+/// indicated by `ptr`. Threads previously blocked with the
+/// `memory_atomic_wait32` and `memory_atomic_wait64` functions above will be
+/// woken up.
+///
+/// The `waiters` argument is the maximum number of waiters to wake up. If the
+/// value is zero, no waiters are woken up.
+///
+/// # Return value
+///
+/// Returns the number of waiters which were actually notified.
+///
+/// [instr]: https://webassembly.github.io/threads/core/syntax/instructions.html#syntax-instr-atomic-memory
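+///
+/// # Examples
+///
+/// A minimal sketch of waking at most one thread parked on the same
+/// hypothetical `flag` address used in the `memory_atomic_wait32` example:
+///
+/// ```rust,ignore
+/// // `flag` is a hypothetical pointer into shared linear memory.
+/// let woken = unsafe { memory_atomic_notify(flag, 1) };
+/// ```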
+#[inline]
+#[cfg_attr(test, assert_instr(memory.atomic.notify))]
+#[target_feature(enable = "atomics")]
+#[doc(alias("memory.atomic.notify"))]
+pub unsafe fn memory_atomic_notify(ptr: *mut i32, waiters: u32) -> u32 {
+ llvm_atomic_notify(ptr, waiters as i32) as u32
+}
diff --git a/library/stdarch/crates/core_arch/src/wasm32/memory.rs b/library/stdarch/crates/core_arch/src/wasm32/memory.rs
new file mode 100644
index 000000000..b5cf13e98
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/wasm32/memory.rs
@@ -0,0 +1,58 @@
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+extern "C" {
+ #[link_name = "llvm.wasm.memory.grow"]
+ fn llvm_memory_grow(mem: u32, pages: usize) -> usize;
+ #[link_name = "llvm.wasm.memory.size"]
+ fn llvm_memory_size(mem: u32) -> usize;
+}
+
+/// Corresponding intrinsic to wasm's [`memory.size` instruction][instr]
+///
+/// This function, when called, will return the current memory size in units of
+/// pages. The current WebAssembly page size is 65536 bytes (64 KiB).
+///
+/// The argument `MEM` is the numerical index of the memory whose size is
+/// returned. Note that currently the WebAssembly specification only supports
+/// one memory, so it is required that zero is passed in. The argument is
+/// present to be forward-compatible with future WebAssembly revisions.
+/// Passing a nonzero argument is currently a compile-time error.
+///
+/// [instr]: http://webassembly.github.io/spec/core/exec/instructions.html#exec-memory-size
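+///
+/// # Examples
+///
+/// A sketch of computing the current memory size in bytes; the page size of
+/// 65536 bytes is fixed by the specification:
+///
+/// ```rust,ignore
+/// const PAGE_SIZE: usize = 65536;
+/// let bytes = memory_size::<0>() * PAGE_SIZE;
+/// ```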
+#[inline]
+#[cfg_attr(test, assert_instr("memory.size", MEM = 0))]
+#[rustc_legacy_const_generics(0)]
+#[stable(feature = "simd_wasm32", since = "1.33.0")]
+#[doc(alias("memory.size"))]
+pub fn memory_size<const MEM: u32>() -> usize {
+ static_assert!(MEM: u32 where MEM == 0);
+ unsafe { llvm_memory_size(MEM) }
+}
+
+/// Corresponding intrinsic to wasm's [`memory.grow` instruction][instr]
+///
+/// This function, when called, will attempt to grow the default linear memory
+/// by the specified `delta` of pages. The current WebAssembly page size is
+/// 65536 bytes (64 KiB). If memory is successfully grown then the previous size
+/// of memory, in pages, is returned. If memory cannot be grown then
+/// `usize::MAX` is returned.
+///
+/// The argument `MEM` is the numerical index of the memory to grow. Note that
+/// currently the WebAssembly specification only supports one memory, so it is
+/// required that zero is passed in. The argument is present to be
+/// forward-compatible with future WebAssembly revisions. Passing a nonzero
+/// argument is currently a compile-time error.
+///
+/// [instr]: http://webassembly.github.io/spec/core/exec/instructions.html#exec-memory-grow
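+///
+/// # Examples
+///
+/// A sketch of growing memory by one page and checking for failure:
+///
+/// ```rust,ignore
+/// let previous_pages = memory_grow::<0>(1);
+/// if previous_pages == usize::MAX {
+///     // the engine refused to grow memory; the size is unchanged
+/// }
+/// ```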
+#[inline]
+#[cfg_attr(test, assert_instr("memory.grow", MEM = 0))]
+#[rustc_legacy_const_generics(0)]
+#[stable(feature = "simd_wasm32", since = "1.33.0")]
+#[doc(alias("memory.grow"))]
+pub fn memory_grow<const MEM: u32>(delta: usize) -> usize {
+    static_assert!(MEM: u32 where MEM == 0);
+    unsafe { llvm_memory_grow(MEM, delta) }
+}
diff --git a/library/stdarch/crates/core_arch/src/wasm32/mod.rs b/library/stdarch/crates/core_arch/src/wasm32/mod.rs
new file mode 100644
index 000000000..2fbe80e99
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/wasm32/mod.rs
@@ -0,0 +1,26 @@
+//! WASM32 intrinsics
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+mod atomic;
+pub use self::atomic::*;
+
+mod simd128;
+pub use self::simd128::*;
+
+mod memory;
+pub use self::memory::*;
+
+/// Generates the [`unreachable`] instruction, which causes an unconditional [trap].
+///
+/// This function is safe to call and immediately aborts execution.
+///
+/// [`unreachable`]: https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-instr-control
+/// [trap]: https://webassembly.github.io/spec/core/intro/overview.html#trap
+#[cfg_attr(test, assert_instr(unreachable))]
+#[inline]
+#[stable(feature = "unreachable_wasm32", since = "1.37.0")]
+pub fn unreachable() -> ! {
+ crate::intrinsics::abort()
+}
diff --git a/library/stdarch/crates/core_arch/src/wasm32/simd128.rs b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs
new file mode 100644
index 000000000..c0025696b
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs
@@ -0,0 +1,6136 @@
+//! This module implements the [WebAssembly `SIMD128` ISA].
+//!
+//! [WebAssembly `SIMD128` ISA]:
+//! https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md
+
+#![allow(non_camel_case_types)]
+#![allow(unused_imports)]
+
+use crate::{
+ core_arch::{simd, simd_llvm::*},
+ marker::Sized,
+ mem, ptr,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+types! {
+ /// WASM-specific 128-bit wide SIMD vector type.
+ ///
+ /// This type corresponds to the `v128` type in the [WebAssembly SIMD
+    /// proposal](https://github.com/webassembly/simd). This type is 128 bits
+    /// wide, and the meaning of all the bits is defined by the context in
+    /// which this value is used.
+ ///
+ /// This same type is used simultaneously for all 128-bit-wide SIMD types,
+ /// for example:
+ ///
+ /// * sixteen 8-bit integers (both `i8` and `u8`)
+ /// * eight 16-bit integers (both `i16` and `u16`)
+ /// * four 32-bit integers (both `i32` and `u32`)
+ /// * two 64-bit integers (both `i64` and `u64`)
+ /// * four 32-bit floats (`f32`)
+ /// * two 64-bit floats (`f64`)
+ ///
+ /// The `v128` type in Rust is intended to be quite analogous to the `v128`
+ /// type in WebAssembly. Operations on `v128` can only be performed with the
+ /// functions in this module.
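+    ///
+    /// For example (a sketch using constructors and accessors defined later
+    /// in this module), the same bits can be built as four 32-bit integers
+    /// and read back as 8-bit lanes:
+    ///
+    /// ```rust,ignore
+    /// let v = i32x4(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C);
+    /// // WebAssembly memory is little-endian, so lane 0's low byte is 0.
+    /// assert_eq!(u8x16_extract_lane::<0>(v), 0);
+    /// ```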
+ // N.B., internals here are arbitrary.
+ #[stable(feature = "wasm_simd", since = "1.54.0")]
+ pub struct v128(i32, i32, i32, i32);
+}
+
+macro_rules! conversions {
+ ($(($name:ident = $ty:ty))*) => {
+ impl v128 {
+ $(
+ #[inline(always)]
+ fn $name(self) -> $ty {
+ unsafe { mem::transmute(self) }
+ }
+ )*
+ }
+ $(
+ impl $ty {
+ #[inline(always)]
+ #[rustc_const_stable(feature = "wasm_simd_const", since = "1.56.0")]
+ const fn v128(self) -> v128 {
+ unsafe { mem::transmute(self) }
+ }
+ }
+ )*
+ }
+}
+
+conversions! {
+ (as_u8x16 = simd::u8x16)
+ (as_u16x8 = simd::u16x8)
+ (as_u32x4 = simd::u32x4)
+ (as_u64x2 = simd::u64x2)
+ (as_i8x16 = simd::i8x16)
+ (as_i16x8 = simd::i16x8)
+ (as_i32x4 = simd::i32x4)
+ (as_i64x2 = simd::i64x2)
+ (as_f32x4 = simd::f32x4)
+ (as_f64x2 = simd::f64x2)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.wasm.swizzle"]
+ fn llvm_swizzle(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16;
+
+ #[link_name = "llvm.wasm.bitselect.v16i8"]
+ fn llvm_bitselect(a: simd::i8x16, b: simd::i8x16, c: simd::i8x16) -> simd::i8x16;
+ #[link_name = "llvm.wasm.anytrue.v16i8"]
+ fn llvm_any_true_i8x16(x: simd::i8x16) -> i32;
+
+ #[link_name = "llvm.wasm.alltrue.v16i8"]
+ fn llvm_i8x16_all_true(x: simd::i8x16) -> i32;
+ #[link_name = "llvm.ctpop.v16i8"]
+ fn llvm_popcnt(a: simd::i8x16) -> simd::i8x16;
+ #[link_name = "llvm.wasm.bitmask.v16i8"]
+ fn llvm_bitmask_i8x16(a: simd::i8x16) -> i32;
+ #[link_name = "llvm.wasm.narrow.signed.v16i8.v8i16"]
+ fn llvm_narrow_i8x16_s(a: simd::i16x8, b: simd::i16x8) -> simd::i8x16;
+ #[link_name = "llvm.wasm.narrow.unsigned.v16i8.v8i16"]
+ fn llvm_narrow_i8x16_u(a: simd::i16x8, b: simd::i16x8) -> simd::i8x16;
+ #[link_name = "llvm.sadd.sat.v16i8"]
+ fn llvm_i8x16_add_sat_s(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16;
+ #[link_name = "llvm.uadd.sat.v16i8"]
+ fn llvm_i8x16_add_sat_u(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16;
+ #[link_name = "llvm.wasm.sub.sat.signed.v16i8"]
+ fn llvm_i8x16_sub_sat_s(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16;
+ #[link_name = "llvm.wasm.sub.sat.unsigned.v16i8"]
+ fn llvm_i8x16_sub_sat_u(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16;
+ #[link_name = "llvm.wasm.avgr.unsigned.v16i8"]
+ fn llvm_avgr_u_i8x16(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16;
+
+ #[link_name = "llvm.wasm.extadd.pairwise.signed.v8i16"]
+ fn llvm_i16x8_extadd_pairwise_i8x16_s(x: simd::i8x16) -> simd::i16x8;
+ #[link_name = "llvm.wasm.extadd.pairwise.unsigned.v8i16"]
+ fn llvm_i16x8_extadd_pairwise_i8x16_u(x: simd::i8x16) -> simd::i16x8;
+ #[link_name = "llvm.wasm.q15mulr.sat.signed"]
+ fn llvm_q15mulr(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8;
+ #[link_name = "llvm.wasm.alltrue.v8i16"]
+ fn llvm_i16x8_all_true(x: simd::i16x8) -> i32;
+ #[link_name = "llvm.wasm.bitmask.v8i16"]
+ fn llvm_bitmask_i16x8(a: simd::i16x8) -> i32;
+ #[link_name = "llvm.wasm.narrow.signed.v8i16.v4i32"]
+ fn llvm_narrow_i16x8_s(a: simd::i32x4, b: simd::i32x4) -> simd::i16x8;
+ #[link_name = "llvm.wasm.narrow.unsigned.v8i16.v4i32"]
+ fn llvm_narrow_i16x8_u(a: simd::i32x4, b: simd::i32x4) -> simd::i16x8;
+ #[link_name = "llvm.sadd.sat.v8i16"]
+ fn llvm_i16x8_add_sat_s(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8;
+ #[link_name = "llvm.uadd.sat.v8i16"]
+ fn llvm_i16x8_add_sat_u(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8;
+ #[link_name = "llvm.wasm.sub.sat.signed.v8i16"]
+ fn llvm_i16x8_sub_sat_s(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8;
+ #[link_name = "llvm.wasm.sub.sat.unsigned.v8i16"]
+ fn llvm_i16x8_sub_sat_u(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8;
+ #[link_name = "llvm.wasm.avgr.unsigned.v8i16"]
+ fn llvm_avgr_u_i16x8(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8;
+
+ #[link_name = "llvm.wasm.extadd.pairwise.signed.v16i8"]
+ fn llvm_i32x4_extadd_pairwise_i16x8_s(x: simd::i16x8) -> simd::i32x4;
+ #[link_name = "llvm.wasm.extadd.pairwise.unsigned.v16i8"]
+ fn llvm_i32x4_extadd_pairwise_i16x8_u(x: simd::i16x8) -> simd::i32x4;
+ #[link_name = "llvm.wasm.alltrue.v4i32"]
+ fn llvm_i32x4_all_true(x: simd::i32x4) -> i32;
+ #[link_name = "llvm.wasm.bitmask.v4i32"]
+ fn llvm_bitmask_i32x4(a: simd::i32x4) -> i32;
+ #[link_name = "llvm.wasm.dot"]
+ fn llvm_i32x4_dot_i16x8_s(a: simd::i16x8, b: simd::i16x8) -> simd::i32x4;
+
+ #[link_name = "llvm.wasm.alltrue.v2i64"]
+ fn llvm_i64x2_all_true(x: simd::i64x2) -> i32;
+ #[link_name = "llvm.wasm.bitmask.v2i64"]
+ fn llvm_bitmask_i64x2(a: simd::i64x2) -> i32;
+
+ #[link_name = "llvm.ceil.v4f32"]
+ fn llvm_f32x4_ceil(x: simd::f32x4) -> simd::f32x4;
+ #[link_name = "llvm.floor.v4f32"]
+ fn llvm_f32x4_floor(x: simd::f32x4) -> simd::f32x4;
+ #[link_name = "llvm.trunc.v4f32"]
+ fn llvm_f32x4_trunc(x: simd::f32x4) -> simd::f32x4;
+ #[link_name = "llvm.nearbyint.v4f32"]
+ fn llvm_f32x4_nearest(x: simd::f32x4) -> simd::f32x4;
+ #[link_name = "llvm.fabs.v4f32"]
+ fn llvm_f32x4_abs(x: simd::f32x4) -> simd::f32x4;
+ #[link_name = "llvm.sqrt.v4f32"]
+ fn llvm_f32x4_sqrt(x: simd::f32x4) -> simd::f32x4;
+ #[link_name = "llvm.minimum.v4f32"]
+ fn llvm_f32x4_min(x: simd::f32x4, y: simd::f32x4) -> simd::f32x4;
+ #[link_name = "llvm.maximum.v4f32"]
+ fn llvm_f32x4_max(x: simd::f32x4, y: simd::f32x4) -> simd::f32x4;
+
+ #[link_name = "llvm.ceil.v2f64"]
+ fn llvm_f64x2_ceil(x: simd::f64x2) -> simd::f64x2;
+ #[link_name = "llvm.floor.v2f64"]
+ fn llvm_f64x2_floor(x: simd::f64x2) -> simd::f64x2;
+ #[link_name = "llvm.trunc.v2f64"]
+ fn llvm_f64x2_trunc(x: simd::f64x2) -> simd::f64x2;
+ #[link_name = "llvm.nearbyint.v2f64"]
+ fn llvm_f64x2_nearest(x: simd::f64x2) -> simd::f64x2;
+ #[link_name = "llvm.fabs.v2f64"]
+ fn llvm_f64x2_abs(x: simd::f64x2) -> simd::f64x2;
+ #[link_name = "llvm.sqrt.v2f64"]
+ fn llvm_f64x2_sqrt(x: simd::f64x2) -> simd::f64x2;
+ #[link_name = "llvm.minimum.v2f64"]
+ fn llvm_f64x2_min(x: simd::f64x2, y: simd::f64x2) -> simd::f64x2;
+ #[link_name = "llvm.maximum.v2f64"]
+ fn llvm_f64x2_max(x: simd::f64x2, y: simd::f64x2) -> simd::f64x2;
+
+ #[link_name = "llvm.fptosi.sat.v4i32.v4f32"]
+ fn llvm_i32x4_trunc_sat_f32x4_s(x: simd::f32x4) -> simd::i32x4;
+ #[link_name = "llvm.fptoui.sat.v4i32.v4f32"]
+ fn llvm_i32x4_trunc_sat_f32x4_u(x: simd::f32x4) -> simd::i32x4;
+ #[link_name = "llvm.fptosi.sat.v2i32.v2f64"]
+ fn llvm_i32x2_trunc_sat_f64x2_s(x: simd::f64x2) -> simd::i32x2;
+ #[link_name = "llvm.fptoui.sat.v2i32.v2f64"]
+ fn llvm_i32x2_trunc_sat_f64x2_u(x: simd::f64x2) -> simd::i32x2;
+}
+
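+// Helper for the unaligned loads/stores below: `repr(packed)` lowers the
+// alignment requirement of the wrapped value to 1, so a read or write
+// through `*const Unaligned<T>` compiles to a 1-aligned memory access.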
+#[repr(packed)]
+#[derive(Copy)]
+struct Unaligned<T>(T);
+
+impl<T: Copy> Clone for Unaligned<T> {
+ fn clone(&self) -> Unaligned<T> {
+ *self
+ }
+}
+
+/// Loads a `v128` vector from the given heap address.
+///
+/// This intrinsic will emit a load with an alignment of 1. While it is
+/// provided for completeness it is not strictly necessary; you can also load
+/// through the pointer directly:
+///
+/// ```rust,ignore
+/// let a: &v128 = ...;
+/// let value = unsafe { v128_load(a) };
+/// // .. is the same as ..
+/// let value = *a;
+/// ```
+///
+/// The alignment of the load can be configured by doing a manual load without
+/// this intrinsic.
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 16 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load(m: *const v128) -> v128 {
+ (*(m as *const Unaligned<v128>)).0
+}
+
+/// Load eight 8-bit integers and sign extend each one to a 16-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
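+///
+/// # Examples
+///
+/// A sketch of sign-extending eight bytes from a local buffer:
+///
+/// ```rust,ignore
+/// let data: [i8; 8] = [-1, 2, -3, 4, -5, 6, -7, 8];
+/// let v = unsafe { i16x8_load_extend_i8x8(data.as_ptr()) };
+/// assert_eq!(i16x8_extract_lane::<0>(v), -1); // -1i8 sign-extends to -1i16
+/// ```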
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load8x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load8x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i16x8_load_extend_i8x8(m: *const i8) -> v128 {
+ let m = *(m as *const Unaligned<simd::i8x8>);
+ simd_cast::<_, simd::i16x8>(m.0).v128()
+}
+
+/// Load eight 8-bit integers and zero extend each one to a 16-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load8x8_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load8x8_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i16x8_load_extend_u8x8(m: *const u8) -> v128 {
+ let m = *(m as *const Unaligned<simd::u8x8>);
+ simd_cast::<_, simd::u16x8>(m.0).v128()
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_load_extend_u8x8 as u16x8_load_extend_u8x8;
+
+/// Load four 16-bit integers and sign extend each one to a 32-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load16x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load16x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i32x4_load_extend_i16x4(m: *const i16) -> v128 {
+ let m = *(m as *const Unaligned<simd::i16x4>);
+ simd_cast::<_, simd::i32x4>(m.0).v128()
+}
+
+/// Load four 16-bit integers and zero extend each one to a 32-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load16x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load16x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i32x4_load_extend_u16x4(m: *const u16) -> v128 {
+ let m = *(m as *const Unaligned<simd::u16x4>);
+ simd_cast::<_, simd::u32x4>(m.0).v128()
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_load_extend_u16x4 as u32x4_load_extend_u16x4;
+
+/// Load two 32-bit integers and sign extend each one to a 64-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load32x2_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load32x2_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i64x2_load_extend_i32x2(m: *const i32) -> v128 {
+ let m = *(m as *const Unaligned<simd::i32x2>);
+ simd_cast::<_, simd::i64x2>(m.0).v128()
+}
+
+/// Load two 32-bit integers and zero extend each one to a 64-bit lane
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load32x2_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load32x2_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn i64x2_load_extend_u32x2(m: *const u32) -> v128 {
+ let m = *(m as *const Unaligned<simd::u32x2>);
+ simd_cast::<_, simd::u64x2>(m.0).v128()
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_load_extend_u32x2 as u64x2_load_extend_u32x2;
+
+/// Load a single element and splat to all lanes of a v128 vector.
+///
+/// While this intrinsic is provided for completeness it can also be replaced
+/// with `u8x16_splat(*m)` and it should generate equivalent code (and also not
+/// require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 1 byte from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load8_splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load8_splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load8_splat(m: *const u8) -> v128 {
+ u8x16_splat(*m)
+}
+
+/// Load a single element and splat to all lanes of a v128 vector.
+///
+/// While this intrinsic is provided for completeness it can also be replaced
+/// with `u16x8_splat(*m)` and it should generate equivalent code (and also not
+/// require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 2 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load16_splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load16_splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load16_splat(m: *const u16) -> v128 {
+ u16x8_splat(ptr::read_unaligned(m))
+}
+
+/// Load a single element and splat to all lanes of a v128 vector.
+///
+/// While this intrinsic is provided for completeness it can also be replaced
+/// with `u32x4_splat(*m)` and it should generate equivalent code (and also not
+/// require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 4 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load32_splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load32_splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load32_splat(m: *const u32) -> v128 {
+ u32x4_splat(ptr::read_unaligned(m))
+}
+
+/// Load a single element and splat to all lanes of a v128 vector.
+///
+/// While this intrinsic is provided for completeness it can also be replaced
+/// with `u64x2_splat(*m)` and it should generate equivalent code (and also not
+/// require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load64_splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load64_splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load64_splat(m: *const u64) -> v128 {
+ u64x2_splat(ptr::read_unaligned(m))
+}
+
+/// Loads a 32-bit element into the low bits of the vector and sets all other
+/// bits to zero.
+///
+/// This intrinsic is provided for completeness and is equivalent to `u32x4(*m,
+/// 0, 0, 0)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 4 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load32_zero))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load32_zero"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load32_zero(m: *const u32) -> v128 {
+ u32x4(ptr::read_unaligned(m), 0, 0, 0)
+}
+
+/// Loads a 64-bit element into the low bits of the vector and sets all other
+/// bits to zero.
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `u64x2_replace_lane::<0>(u64x2(0, 0), *m)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load64_zero))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load64_zero"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load64_zero(m: *const u64) -> v128 {
+ u64x2_replace_lane::<0>(u64x2(0, 0), ptr::read_unaligned(m))
+}
+
+/// Stores a `v128` vector to the given heap address.
+///
+/// This intrinsic will emit a store with an alignment of 1. While it is
+/// provided for completeness it is not strictly necessary; you can also store
+/// through the pointer directly:
+///
+/// ```rust,ignore
+/// let a: &mut v128 = ...;
+/// unsafe { v128_store(a, value) };
+/// // .. is the same as ..
+/// *a = value;
+/// ```
+///
+/// The alignment of the store can be configured by doing a manual store without
+/// this intrinsic.
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to store 16 bytes to. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned store.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.store))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.store"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_store(m: *mut v128, a: v128) {
+ *(m as *mut Unaligned<v128>) = Unaligned(a);
+}
+
+/// Loads an 8-bit value from `m` and sets lane `L` of `v` to that value.
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `u8x16_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 1 byte from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load8_lane, L = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load8_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load8_lane<const L: usize>(v: v128, m: *const u8) -> v128 {
+ u8x16_replace_lane::<L>(v, *m)
+}
+
+/// Loads a 16-bit value from `m` and sets lane `L` of `v` to that value.
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `u16x8_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 2 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load16_lane, L = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load16_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load16_lane<const L: usize>(v: v128, m: *const u16) -> v128 {
+ u16x8_replace_lane::<L>(v, ptr::read_unaligned(m))
+}
+
+/// Loads a 32-bit value from `m` and sets lane `L` of `v` to that value.
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `u32x4_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 4 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load32_lane, L = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load32_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load32_lane<const L: usize>(v: v128, m: *const u32) -> v128 {
+ u32x4_replace_lane::<L>(v, ptr::read_unaligned(m))
+}
+
+/// Loads a 64-bit value from `m` and sets lane `L` of `v` to that value.
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `u64x2_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to load 8 bytes from. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned load.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.load64_lane, L = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.load64_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_load64_lane<const L: usize>(v: v128, m: *const u64) -> v128 {
+ u64x2_replace_lane::<L>(v, ptr::read_unaligned(m))
+}
+
+/// Stores the 8-bit value from lane `L` of `v` into `m`
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `*m = u8x16_extract_lane::<L>(v)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to store 1 byte to. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned store.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.store8_lane, L = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.store8_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_store8_lane<const L: usize>(v: v128, m: *mut u8) {
+ *m = u8x16_extract_lane::<L>(v);
+}
+
+/// Stores the 16-bit value from lane `L` of `v` into `m`
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `*m = u16x8_extract_lane::<L>(v)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to store 2 bytes to. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned store.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.store16_lane, L = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.store16_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_store16_lane<const L: usize>(v: v128, m: *mut u16) {
+ ptr::write_unaligned(m, u16x8_extract_lane::<L>(v))
+}
+
+/// Stores the 32-bit value from lane `L` of `v` into `m`
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `*m = u32x4_extract_lane::<L>(v)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to store 4 bytes to. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned store.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.store32_lane, L = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.store32_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_store32_lane<const L: usize>(v: v128, m: *mut u32) {
+ ptr::write_unaligned(m, u32x4_extract_lane::<L>(v))
+}
+
+/// Stores the 64-bit value from lane `L` of `v` into `m`
+///
+/// This intrinsic is provided for completeness and is equivalent to
+/// `*m = u64x2_extract_lane::<L>(v)` (which doesn't require `unsafe`).
+///
+/// # Unsafety
+///
+/// This intrinsic is unsafe because it takes a raw pointer as an argument, and
+/// the pointer must be valid to store 8 bytes to. Note that there is no
+/// alignment requirement on this pointer since this intrinsic performs a
+/// 1-aligned store.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.store64_lane, L = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.store64_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub unsafe fn v128_store64_lane<const L: usize>(v: v128, m: *mut u64) {
+ ptr::write_unaligned(m, u64x2_extract_lane::<L>(v))
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[cfg_attr(
+ test,
+ assert_instr(
+ v128.const,
+ a0 = 0,
+ a1 = 1,
+ a2 = 2,
+ a3 = 3,
+ a4 = 4,
+ a5 = 5,
+ a6 = 6,
+ a7 = 7,
+ a8 = 8,
+ a9 = 9,
+ a10 = 10,
+ a11 = 11,
+ a12 = 12,
+ a13 = 13,
+ a14 = 14,
+ a15 = 15,
+ )
+)]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")]
+pub const fn i8x16(
+ a0: i8,
+ a1: i8,
+ a2: i8,
+ a3: i8,
+ a4: i8,
+ a5: i8,
+ a6: i8,
+ a7: i8,
+ a8: i8,
+ a9: i8,
+ a10: i8,
+ a11: i8,
+ a12: i8,
+ a13: i8,
+ a14: i8,
+ a15: i8,
+) -> v128 {
+ simd::i8x16(
+ a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15,
+ )
+ .v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")]
+pub const fn u8x16(
+ a0: u8,
+ a1: u8,
+ a2: u8,
+ a3: u8,
+ a4: u8,
+ a5: u8,
+ a6: u8,
+ a7: u8,
+ a8: u8,
+ a9: u8,
+ a10: u8,
+ a11: u8,
+ a12: u8,
+ a13: u8,
+ a14: u8,
+ a15: u8,
+) -> v128 {
+ simd::u8x16(
+ a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15,
+ )
+ .v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[cfg_attr(
+ test,
+ assert_instr(
+ v128.const,
+ a0 = 0,
+ a1 = 1,
+ a2 = 2,
+ a3 = 3,
+ a4 = 4,
+ a5 = 5,
+ a6 = 6,
+ a7 = 7,
+ )
+)]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")]
+pub const fn i16x8(a0: i16, a1: i16, a2: i16, a3: i16, a4: i16, a5: i16, a6: i16, a7: i16) -> v128 {
+ simd::i16x8(a0, a1, a2, a3, a4, a5, a6, a7).v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")]
+pub const fn u16x8(a0: u16, a1: u16, a2: u16, a3: u16, a4: u16, a5: u16, a6: u16, a7: u16) -> v128 {
+ simd::u16x8(a0, a1, a2, a3, a4, a5, a6, a7).v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[cfg_attr(test, assert_instr(v128.const, a0 = 0, a1 = 1, a2 = 2, a3 = 3))]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")]
+pub const fn i32x4(a0: i32, a1: i32, a2: i32, a3: i32) -> v128 {
+ simd::i32x4(a0, a1, a2, a3).v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")]
+pub const fn u32x4(a0: u32, a1: u32, a2: u32, a3: u32) -> v128 {
+ simd::u32x4(a0, a1, a2, a3).v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[cfg_attr(test, assert_instr(v128.const, a0 = 1, a1 = 2))]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")]
+pub const fn i64x2(a0: i64, a1: i64) -> v128 {
+ simd::i64x2(a0, a1).v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")]
+pub const fn u64x2(a0: u64, a1: u64) -> v128 {
+ simd::u64x2(a0, a1).v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[cfg_attr(test, assert_instr(v128.const, a0 = 0.0, a1 = 1.0, a2 = 2.0, a3 = 3.0))]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd_const", since = "1.56.0")]
+pub const fn f32x4(a0: f32, a1: f32, a2: f32, a3: f32) -> v128 {
+ simd::f32x4(a0, a1, a2, a3).v128()
+}
+
+/// Materializes a SIMD value from the provided operands.
+///
+/// If possible this will generate a `v128.const` instruction, otherwise it may
+/// be lowered to a sequence of instructions to materialize the vector value.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[cfg_attr(test, assert_instr(v128.const, a0 = 0.0, a1 = 1.0))]
+#[doc(alias("v128.const"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+#[rustc_const_stable(feature = "wasm_simd_const", since = "1.56.0")]
+pub const fn f64x2(a0: f64, a1: f64) -> v128 {
+ simd::f64x2(a0, a1).v128()
+}
+
+/// Returns a new vector with lanes selected from the lanes of the two input
+/// vectors `a` and `b`, as specified by the 16 immediate operands `I0`
+/// through `I15`.
+///
+/// Both `a` and `b` must have type `v128`, and this function generates a wasm
+/// instruction that is encoded with 16 bytes providing the indices of the
+/// elements to return. An index `i` in the range [0, 15] selects the `i`-th
+/// element of `a`, and an index in the range [16, 31] selects the `i - 16`-th
+/// element of `b`.
+///
+/// All indices are `usize` const generic parameters and so must be constant;
+/// a compile-time error is generated if any index is 32 or greater.
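+///
+/// # Examples
+///
+/// A sketch that gathers the even-indexed bytes of `a` followed by the
+/// even-indexed bytes of `b` (matching the indices asserted in the test
+/// attribute below):
+///
+/// ```rust,ignore
+/// let evens = i8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14,
+///                             16, 18, 20, 22, 24, 26, 28, 30>(a, b);
+/// ```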
+#[inline]
+#[cfg_attr(test,
+ assert_instr(
+ i8x16.shuffle,
+ I0 = 0,
+ I1 = 2,
+ I2 = 4,
+ I3 = 6,
+ I4 = 8,
+ I5 = 10,
+ I6 = 12,
+ I7 = 14,
+ I8 = 16,
+ I9 = 18,
+ I10 = 20,
+ I11 = 22,
+ I12 = 24,
+ I13 = 26,
+ I14 = 28,
+ I15 = 30,
+ )
+)]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.shuffle"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_shuffle<
+ const I0: usize,
+ const I1: usize,
+ const I2: usize,
+ const I3: usize,
+ const I4: usize,
+ const I5: usize,
+ const I6: usize,
+ const I7: usize,
+ const I8: usize,
+ const I9: usize,
+ const I10: usize,
+ const I11: usize,
+ const I12: usize,
+ const I13: usize,
+ const I14: usize,
+ const I15: usize,
+>(
+ a: v128,
+ b: v128,
+) -> v128 {
+ static_assert!(I0: usize where I0 < 32);
+ static_assert!(I1: usize where I1 < 32);
+ static_assert!(I2: usize where I2 < 32);
+ static_assert!(I3: usize where I3 < 32);
+ static_assert!(I4: usize where I4 < 32);
+ static_assert!(I5: usize where I5 < 32);
+ static_assert!(I6: usize where I6 < 32);
+ static_assert!(I7: usize where I7 < 32);
+ static_assert!(I8: usize where I8 < 32);
+ static_assert!(I9: usize where I9 < 32);
+ static_assert!(I10: usize where I10 < 32);
+ static_assert!(I11: usize where I11 < 32);
+ static_assert!(I12: usize where I12 < 32);
+ static_assert!(I13: usize where I13 < 32);
+ static_assert!(I14: usize where I14 < 32);
+ static_assert!(I15: usize where I15 < 32);
+ let shuf: simd::u8x16 = unsafe {
+ simd_shuffle16!(
+ a.as_u8x16(),
+ b.as_u8x16(),
+ <
+ const I0: usize,
+ const I1: usize,
+ const I2: usize,
+ const I3: usize,
+ const I4: usize,
+ const I5: usize,
+ const I6: usize,
+ const I7: usize,
+ const I8: usize,
+ const I9: usize,
+ const I10: usize,
+ const I11: usize,
+ const I12: usize,
+ const I13: usize,
+ const I14: usize,
+ const I15: usize,
+ > [
+ I0 as u32, I1 as u32, I2 as u32, I3 as u32, I4 as u32, I5 as u32, I6 as u32, I7 as u32,
+ I8 as u32, I9 as u32, I10 as u32, I11 as u32, I12 as u32, I13 as u32, I14 as u32,
+ I15 as u32,
+ ],
+ )
+ };
+ shuf.v128()
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_shuffle as u8x16_shuffle;
+
+/// Same as [`i8x16_shuffle`], except operates as if the inputs were eight
+/// 16-bit integers, only taking 8 indices to shuffle.
+///
+/// Indices in the range [0, 7] select from `a` while [8, 15] select from `b`.
+/// Note that this will generate the `i8x16.shuffle` instruction, since there
+/// is no native `i16x8.shuffle` instruction (there is no need for one since
+/// `i8x16.shuffle` suffices).
+#[inline]
+#[cfg_attr(test,
+ assert_instr(
+ i8x16.shuffle,
+ I0 = 0,
+ I1 = 2,
+ I2 = 4,
+ I3 = 6,
+ I4 = 8,
+ I5 = 10,
+ I6 = 12,
+ I7 = 14,
+ )
+)]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.shuffle"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_shuffle<
+ const I0: usize,
+ const I1: usize,
+ const I2: usize,
+ const I3: usize,
+ const I4: usize,
+ const I5: usize,
+ const I6: usize,
+ const I7: usize,
+>(
+ a: v128,
+ b: v128,
+) -> v128 {
+ static_assert!(I0: usize where I0 < 16);
+ static_assert!(I1: usize where I1 < 16);
+ static_assert!(I2: usize where I2 < 16);
+ static_assert!(I3: usize where I3 < 16);
+ static_assert!(I4: usize where I4 < 16);
+ static_assert!(I5: usize where I5 < 16);
+ static_assert!(I6: usize where I6 < 16);
+ static_assert!(I7: usize where I7 < 16);
+ let shuf: simd::u16x8 = unsafe {
+ simd_shuffle8!(
+ a.as_u16x8(),
+ b.as_u16x8(),
+ <
+ const I0: usize,
+ const I1: usize,
+ const I2: usize,
+ const I3: usize,
+ const I4: usize,
+ const I5: usize,
+ const I6: usize,
+ const I7: usize,
+ > [
+ I0 as u32, I1 as u32, I2 as u32, I3 as u32, I4 as u32, I5 as u32, I6 as u32, I7 as u32,
+ ],
+ )
+ };
+ shuf.v128()
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_shuffle as u16x8_shuffle;
+
+/// Same as [`i8x16_shuffle`], except operates as if the inputs were four
+/// 32-bit integers, only taking 4 indices to shuffle.
+///
+/// Indices in the range [0, 3] select from `a` while [4, 7] select from `b`.
+/// Note that this will generate the `i8x16.shuffle` instruction, since there
+/// is no native `i32x4.shuffle` instruction (there is no need for one since
+/// `i8x16.shuffle` suffices).
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.shuffle, I0 = 0, I1 = 2, I2 = 4, I3 = 6))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.shuffle"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_shuffle<const I0: usize, const I1: usize, const I2: usize, const I3: usize>(
+ a: v128,
+ b: v128,
+) -> v128 {
+ static_assert!(I0: usize where I0 < 8);
+ static_assert!(I1: usize where I1 < 8);
+ static_assert!(I2: usize where I2 < 8);
+ static_assert!(I3: usize where I3 < 8);
+ let shuf: simd::u32x4 = unsafe {
+ simd_shuffle4!(
+ a.as_u32x4(),
+ b.as_u32x4(),
+ <const I0: usize, const I1: usize, const I2: usize, const I3: usize> [I0 as u32, I1 as u32, I2 as u32, I3 as u32],
+ )
+ };
+ shuf.v128()
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_shuffle as u32x4_shuffle;
+
+/// Same as [`i8x16_shuffle`], except operates as if the inputs were two
+/// 64-bit integers, only taking 2 indices to shuffle.
+///
+/// Indices in the range [0, 1] select from `a` while [2, 3] select from `b`.
+/// Note that this will generate the `i8x16.shuffle` instruction, since there
+/// is no native `i64x2.shuffle` instruction (there is no need for one since
+/// `i8x16.shuffle` suffices).
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.shuffle, I0 = 0, I1 = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.shuffle"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_shuffle<const I0: usize, const I1: usize>(a: v128, b: v128) -> v128 {
+ static_assert!(I0: usize where I0 < 4);
+ static_assert!(I1: usize where I1 < 4);
+ let shuf: simd::u64x2 = unsafe {
+ simd_shuffle2!(
+ a.as_u64x2(),
+ b.as_u64x2(),
+ <const I0: usize, const I1: usize> [I0 as u32, I1 as u32],
+ )
+ };
+ shuf.v128()
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_shuffle as u64x2_shuffle;
+
+/// Extracts a lane from a 128-bit vector interpreted as 16 packed i8 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
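+///
+/// # Examples
+///
+/// A sketch extracting lane 3 of a constant vector:
+///
+/// ```rust,ignore
+/// let v = i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// assert_eq!(i8x16_extract_lane::<3>(v), 3);
+/// ```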
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.extract_lane_s, N = 3))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.extract_lane_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_extract_lane<const N: usize>(a: v128) -> i8 {
+ static_assert!(N: usize where N < 16);
+ unsafe { simd_extract(a.as_i8x16(), N as u32) }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 16 packed u8 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.extract_lane_u, N = 3))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.extract_lane_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_extract_lane<const N: usize>(a: v128) -> u8 {
+ static_assert!(N: usize where N < 16);
+ unsafe { simd_extract(a.as_u8x16(), N as u32) }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 16 packed i8 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
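+///
+/// # Examples
+///
+/// A sketch replacing lane 2 of a zeroed vector:
+///
+/// ```rust,ignore
+/// let v = i8x16_replace_lane::<2>(i8x16_splat(0), 7);
+/// assert_eq!(i8x16_extract_lane::<2>(v), 7);
+/// ```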
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_replace_lane<const N: usize>(a: v128, val: i8) -> v128 {
+ static_assert!(N: usize where N < 16);
+ unsafe { simd_insert(a.as_i8x16(), N as u32, val).v128() }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 16 packed u8 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_replace_lane<const N: usize>(a: v128, val: u8) -> v128 {
+ static_assert!(N: usize where N < 16);
+ unsafe { simd_insert(a.as_u8x16(), N as u32, val).v128() }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 8 packed i16 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extract_lane_s, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extract_lane_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extract_lane<const N: usize>(a: v128) -> i16 {
+ static_assert!(N: usize where N < 8);
+ unsafe { simd_extract(a.as_i16x8(), N as u32) }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 8 packed u16 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extract_lane_u, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extract_lane_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_extract_lane<const N: usize>(a: v128) -> u16 {
+ static_assert!(N: usize where N < 8);
+ unsafe { simd_extract(a.as_u16x8(), N as u32) }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 8 packed i16 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_replace_lane<const N: usize>(a: v128, val: i16) -> v128 {
+ static_assert!(N: usize where N < 8);
+ unsafe { simd_insert(a.as_i16x8(), N as u32, val).v128() }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 8 packed u16 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_replace_lane<const N: usize>(a: v128, val: u16) -> v128 {
+ static_assert!(N: usize where N < 8);
+ unsafe { simd_insert(a.as_u16x8(), N as u32, val).v128() }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 4 packed i32 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extract_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extract_lane<const N: usize>(a: v128) -> i32 {
+ static_assert!(N: usize where N < 4);
+ unsafe { simd_extract(a.as_i32x4(), N as u32) }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 4 packed u32 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_extract_lane<const N: usize>(a: v128) -> u32 {
+ i32x4_extract_lane::<N>(a) as u32
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 4 packed i32 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.replace_lane, N = 2))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_replace_lane<const N: usize>(a: v128, val: i32) -> v128 {
+ static_assert!(N: usize where N < 4);
+ unsafe { simd_insert(a.as_i32x4(), N as u32, val).v128() }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 4 packed u32 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_replace_lane<const N: usize>(a: v128, val: u32) -> v128 {
+ i32x4_replace_lane::<N>(a, val as i32)
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 2 packed i64 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extract_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extract_lane<const N: usize>(a: v128) -> i64 {
+ static_assert!(N: usize where N < 2);
+ unsafe { simd_extract(a.as_i64x2(), N as u32) }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 2 packed u64 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u64x2_extract_lane<const N: usize>(a: v128) -> u64 {
+ i64x2_extract_lane::<N>(a) as u64
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 2 packed i64 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.replace_lane, N = 0))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_replace_lane<const N: usize>(a: v128, val: i64) -> v128 {
+ static_assert!(N: usize where N < 2);
+ unsafe { simd_insert(a.as_i64x2(), N as u32, val).v128() }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 2 packed u64 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u64x2_replace_lane<const N: usize>(a: v128, val: u64) -> v128 {
+ i64x2_replace_lane::<N>(a, val as i64)
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 4 packed f32 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.extract_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_extract_lane<const N: usize>(a: v128) -> f32 {
+ static_assert!(N: usize where N < 4);
+ unsafe { simd_extract(a.as_f32x4(), N as u32) }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 4 packed f32 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.replace_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_replace_lane<const N: usize>(a: v128, val: f32) -> v128 {
+ static_assert!(N: usize where N < 4);
+ unsafe { simd_insert(a.as_f32x4(), N as u32, val).v128() }
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 2 packed f64 numbers.
+///
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.extract_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.extract_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_extract_lane<const N: usize>(a: v128) -> f64 {
+ static_assert!(N: usize where N < 2);
+ unsafe { simd_extract(a.as_f64x2(), N as u32) }
+}
+
+/// Replaces a lane from a 128-bit vector interpreted as 2 packed f64 numbers.
+///
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.replace_lane, N = 1))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.replace_lane"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_replace_lane<const N: usize>(a: v128, val: f64) -> v128 {
+ static_assert!(N: usize where N < 2);
+ unsafe { simd_insert(a.as_f64x2(), N as u32, val).v128() }
+}
+
+/// Returns a new vector with lanes selected from the lanes of the first input
+/// vector `a` specified in the second input vector `s`.
+///
+/// The indices `i` in range [0, 15] select the `i`-th element of `a`. For
+/// indices outside of the range the resulting lane is 0.
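+///
+/// # Examples
+///
+/// A sketch reversing the sixteen lanes of `a`:
+///
+/// ```rust,ignore
+/// let indices = i8x16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+/// let reversed = i8x16_swizzle(a, indices);
+/// ```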
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.swizzle))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.swizzle"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_swizzle(a: v128, s: v128) -> v128 {
+ unsafe { llvm_swizzle(a.as_i8x16(), s.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_swizzle as u8x16_swizzle;
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 16 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_splat(a: i8) -> v128 {
+ simd::i8x16::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 16 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_splat(a: u8) -> v128 {
+ simd::u8x16::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 8 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_splat(a: i16) -> v128 {
+ simd::i16x8::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 8 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_splat(a: u16) -> v128 {
+ simd::u16x8::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 4 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_splat(a: i32) -> v128 {
+ simd::i32x4::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 4 lanes.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_splat(a: u32) -> v128 {
+ i32x4_splat(a as i32)
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 2 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_splat(a: i64) -> v128 {
+ simd::i64x2::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 2 lanes.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[doc(alias("u64x2.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u64x2_splat(a: u64) -> v128 {
+ i64x2_splat(a as i64)
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 4 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_splat(a: f32) -> v128 {
+ simd::f32x4::splat(a).v128()
+}
+
+/// Creates a vector with identical lanes.
+///
+/// Constructs a vector with `a` replicated to all 2 lanes.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.splat))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.splat"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_splat(a: f64) -> v128 {
+ simd::f64x2::splat(a).v128()
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// integers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were equal, or all zeros otherwise.
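+///
+/// # Examples
+///
+/// A sketch of the all-ones/all-zeros lane masks (marked `ignore` since it
+/// only compiles for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let a = i8x16_splat(1);
+/// let b = i8x16(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+/// let m = i8x16_eq(a, b);
+/// assert_eq!(u8x16_extract_lane::<0>(m), 0xff); // equal lanes are all ones
+/// assert_eq!(u8x16_extract_lane::<1>(m), 0x00); // unequal lanes are all zeros
+/// ```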
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.eq))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.eq"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_eq(a: v128, b: v128) -> v128 {
+ unsafe { simd_eq::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// integers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were not equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.ne))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.ne"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_ne(a: v128, b: v128) -> v128 {
+ unsafe { simd_ne::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_eq as u8x16_eq;
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_ne as u8x16_ne;
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
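+///
+/// # Examples
+///
+/// A sketch contrasting this signed comparison with the unsigned
+/// `u8x16_lt` (marked `ignore` since it only compiles for `wasm32` targets
+/// with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let a = i8x16_splat(-1); // the same bits are 0xff when viewed as unsigned
+/// let b = i8x16_splat(0);
+/// // Signed: -1 < 0, so the mask is all ones.
+/// assert_eq!(u8x16_extract_lane::<0>(i8x16_lt(a, b)), 0xff);
+/// // Unsigned: 0xff is not less than 0, so the mask is all zeros.
+/// assert_eq!(u8x16_extract_lane::<0>(u8x16_lt(a, b)), 0x00);
+/// ```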
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.lt_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.lt_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.lt_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.lt_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i8x16>(a.as_u8x16(), b.as_u8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.gt_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.gt_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.gt_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.gt_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i8x16>(a.as_u8x16(), b.as_u8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.le_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.le_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.le_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.le_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i8x16>(a.as_u8x16(), b.as_u8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.ge_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.ge_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i8x16>(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.ge_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.ge_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i8x16>(a.as_u8x16(), b.as_u8x16()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// integers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.eq))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.eq"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_eq(a: v128, b: v128) -> v128 {
+ unsafe { simd_eq::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// integers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were not equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.ne))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.ne"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_ne(a: v128, b: v128) -> v128 {
+ unsafe { simd_ne::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_eq as u16x8_eq;
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_ne as u16x8_ne;
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.lt_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.lt_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.lt_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.lt_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i16x8>(a.as_u16x8(), b.as_u16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.gt_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.gt_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.gt_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.gt_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i16x8>(a.as_u16x8(), b.as_u16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.le_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.le_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.le_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.le_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i16x8>(a.as_u16x8(), b.as_u16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.ge_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.ge_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i16x8>(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.ge_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.ge_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i16x8>(a.as_u16x8(), b.as_u16x8()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// integers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.eq))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.eq"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_eq(a: v128, b: v128) -> v128 {
+ unsafe { simd_eq::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// integers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were not equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.ne))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.ne"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_ne(a: v128, b: v128) -> v128 {
+ unsafe { simd_ne::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_eq as u32x4_eq;
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_ne as u32x4_ne;
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.lt_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.lt_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.lt_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.lt_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i32x4>(a.as_u32x4(), b.as_u32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.gt_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.gt_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.gt_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.gt_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i32x4>(a.as_u32x4(), b.as_u32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.le_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.le_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.le_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.le_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i32x4>(a.as_u32x4(), b.as_u32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.ge_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.ge_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i32x4>(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// unsigned integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.ge_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.ge_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i32x4>(a.as_u32x4(), b.as_u32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// integers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.eq))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.eq"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_eq(a: v128, b: v128) -> v128 {
+ unsafe { simd_eq::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// integers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were not equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.ne))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.ne"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_ne(a: v128, b: v128) -> v128 {
+ unsafe { simd_ne::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_eq as u64x2_eq;
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_ne as u64x2_ne;
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.lt_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.lt_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.gt_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.gt_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.le_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.le_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// signed integers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.ge_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.ge_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i64x2>(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.eq))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.eq"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_eq(a: v128, b: v128) -> v128 {
+ unsafe { simd_eq::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were not equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.ne))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.ne"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_ne(a: v128, b: v128) -> v128 {
+ unsafe { simd_ne::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.lt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.lt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.gt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.gt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.le))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.le"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.ge))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.ge"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i32x4>(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.eq))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.eq"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_eq(a: v128, b: v128) -> v128 {
+ unsafe { simd_eq::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the corresponding input elements
+/// were not equal, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.ne))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.ne"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_ne(a: v128, b: v128) -> v128 {
+ unsafe { simd_ne::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.lt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.lt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_lt(a: v128, b: v128) -> v128 {
+ unsafe { simd_lt::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.gt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.gt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_gt(a: v128, b: v128) -> v128 {
+ unsafe { simd_gt::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is less than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.le))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.le"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_le(a: v128, b: v128) -> v128 {
+ unsafe { simd_le::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit
+/// floating point numbers.
+///
+/// Returns a new vector where each lane is all ones if the lane-wise left
+/// element is greater than the right element, or all zeros otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.ge))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.ge"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_ge(a: v128, b: v128) -> v128 {
+ unsafe { simd_ge::<_, simd::i64x2>(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Flips each bit of the 128-bit input vector.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.not))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.not"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_not(a: v128) -> v128 {
+ unsafe { simd_xor(a.as_i64x2(), simd::i64x2(!0, !0)).v128() }
+}
+
+/// Performs a bitwise and of the two input 128-bit vectors, returning the
+/// resulting vector.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.and))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.and"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_and(a: v128, b: v128) -> v128 {
+ unsafe { simd_and(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+/// Bitwise AND of bits of `a` and the logical inverse of bits of `b`.
+///
+/// This operation is equivalent to `v128.and(a, v128.not(b))`.
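+///
+/// # Examples
+///
+/// A sketch using 32-bit lanes to show the masking (marked `ignore` since
+/// it only compiles for `wasm32` targets with the `simd128` feature
+/// enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let a = u32x4_splat(0b1100);
+/// let b = u32x4_splat(0b1010);
+/// // 0b1100 & !0b1010 == 0b0100
+/// assert_eq!(u32x4_extract_lane::<0>(v128_andnot(a, b)), 0b0100);
+/// ```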
+#[inline]
+#[cfg_attr(test, assert_instr(v128.andnot))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.andnot"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_andnot(a: v128, b: v128) -> v128 {
+ unsafe { simd_and(a.as_i64x2(), simd_xor(b.as_i64x2(), simd::i64x2(-1, -1))).v128() }
+}
+
+/// Performs a bitwise or of the two input 128-bit vectors, returning the
+/// resulting vector.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.or))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.or"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_or(a: v128, b: v128) -> v128 {
+ unsafe { simd_or(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+/// Performs a bitwise xor of the two input 128-bit vectors, returning the
+/// resulting vector.
+#[inline]
+#[cfg_attr(test, assert_instr(v128.xor))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.xor"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_xor(a: v128, b: v128) -> v128 {
+ unsafe { simd_xor(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+/// Uses the bitmask in `c` to select bits from `v1` when 1 and from `v2` when 0.
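+///
+/// # Examples
+///
+/// A sketch of bit-level selection (marked `ignore` since it only compiles
+/// for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let v1 = u32x4_splat(0xffff_ffff);
+/// let v2 = u32x4_splat(0x0000_0000);
+/// let mask = u32x4_splat(0x0f0f_0f0f);
+/// // Bits set in `mask` come from `v1`; the rest come from `v2`.
+/// assert_eq!(u32x4_extract_lane::<0>(v128_bitselect(v1, v2, mask)), 0x0f0f_0f0f);
+/// ```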
+#[inline]
+#[cfg_attr(test, assert_instr(v128.bitselect))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.bitselect"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_bitselect(v1: v128, v2: v128, c: v128) -> v128 {
+ unsafe { llvm_bitselect(v1.as_i8x16(), v2.as_i8x16(), c.as_i8x16()).v128() }
+}
+
+/// Returns `true` if any bit in `a` is set, or `false` otherwise.
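+///
+/// # Examples
+///
+/// A minimal sketch (marked `ignore` since it only compiles for `wasm32`
+/// targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// assert!(v128_any_true(u32x4(0, 0, 1, 0)));
+/// assert!(!v128_any_true(u32x4_splat(0)));
+/// ```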
+#[inline]
+#[cfg_attr(test, assert_instr(v128.any_true))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("v128.any_true"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn v128_any_true(a: v128) -> bool {
+ unsafe { llvm_any_true_i8x16(a.as_i8x16()) != 0 }
+}
+
+/// Lane-wise wrapping absolute value.
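+///
+/// # Examples
+///
+/// A sketch of the wrapping behavior at `i8::MIN` (marked `ignore` since it
+/// only compiles for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let v = i8x16(-5, i8::MIN, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// let r = i8x16_abs(v);
+/// assert_eq!(i8x16_extract_lane::<0>(r), 5);
+/// // `-i8::MIN` does not fit in `i8`, so the lane wraps back to `i8::MIN`.
+/// assert_eq!(i8x16_extract_lane::<1>(r), i8::MIN);
+/// ```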
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.abs))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.abs"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_abs(a: v128) -> v128 {
+ unsafe {
+ let a = a.as_i8x16();
+ let zero = simd::i8x16::splat(0);
+ simd_select::<simd::m8x16, simd::i8x16>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
+ }
+}
+
+/// Negates a 128-bit vector interpreted as sixteen 8-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_neg(a: v128) -> v128 {
+ unsafe { simd_mul(a.as_i8x16(), simd::i8x16::splat(-1)).v128() }
+}
+
+/// Counts the number of bits set to one within each lane.
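+///
+/// # Examples
+///
+/// A minimal sketch (marked `ignore` since it only compiles for `wasm32`
+/// targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let v = u8x16_splat(0b1011_0001); // four bits set in every lane
+/// assert_eq!(u8x16_extract_lane::<0>(u8x16_popcnt(v)), 4);
+/// ```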
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.popcnt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.popcnt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_popcnt(v: v128) -> v128 {
+ unsafe { llvm_popcnt(v.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_popcnt as u8x16_popcnt;
+
+/// Returns `true` if all lanes are non-zero, `false` otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.all_true))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.all_true"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_all_true(a: v128) -> bool {
+ unsafe { llvm_i8x16_all_true(a.as_i8x16()) != 0 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_all_true as u8x16_all_true;
+
+/// Extracts the high bit for each lane in `a` and produces a scalar mask
+/// with all bits concatenated.
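+///
+/// # Examples
+///
+/// A sketch of mask extraction (marked `ignore` since it only compiles for
+/// `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// // Lanes 0 and 2 have their high (sign) bit set.
+/// let v = i8x16(-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// assert_eq!(i8x16_bitmask(v), 0b0101);
+/// ```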
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.bitmask))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.bitmask"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_bitmask(a: v128) -> u16 {
+ // FIXME(https://bugs.llvm.org/show_bug.cgi?id=50507) - this produces an
+ // extraneous `i32.and` instruction against a mask of 65535 when converting
+ // from the native intrinsic's i32 return value to our desired u16. This
+ // shouldn't be necessary, though, but requires upstream LLVM changes.
+ unsafe { llvm_bitmask_i8x16(a.as_i8x16()) as u16 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_bitmask as u8x16_bitmask;
+
+/// Converts two input vectors into a smaller lane vector by narrowing each
+/// lane.
+///
+/// Signed saturation to 0x7f or 0x80 is used and the input lanes are always
+/// interpreted as signed integers.
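+///
+/// # Examples
+///
+/// A sketch of signed narrowing with saturation (marked `ignore` since it
+/// only compiles for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let a = i16x8(1, 300, -300, 0, 0, 0, 0, 0);
+/// let b = i16x8_splat(0);
+/// // Lanes of `a` become the low eight lanes, lanes of `b` the high eight.
+/// let r = i8x16_narrow_i16x8(a, b);
+/// assert_eq!(i8x16_extract_lane::<0>(r), 1);
+/// assert_eq!(i8x16_extract_lane::<1>(r), i8::MAX); // 300 saturates to 127
+/// assert_eq!(i8x16_extract_lane::<2>(r), i8::MIN); // -300 saturates to -128
+/// ```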
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.narrow_i16x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_narrow_i16x8(a: v128, b: v128) -> v128 {
+ unsafe { llvm_narrow_i8x16_s(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Converts two input vectors into a smaller lane vector by narrowing each
+/// lane.
+///
+/// Signed saturation to 0x00 or 0xff is used and the input lanes are always
+/// interpreted as signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.narrow_i16x8_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_narrow_i16x8(a: v128, b: v128) -> v128 {
+ unsafe { llvm_narrow_i8x16_u(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Shifts each lane to the left by the specified number of bits.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
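+///
+/// # Examples
+///
+/// A minimal sketch (marked `ignore` since it only compiles for `wasm32`
+/// targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let v = i8x16_splat(1);
+/// assert_eq!(i8x16_extract_lane::<0>(i8x16_shl(v, 3)), 8);
+/// // Per the note above, an amount of 9 behaves like 9 % 8 == 1 for 8-bit lanes.
+/// ```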
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.shl))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.shl"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_shl(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shl(a.as_i8x16(), simd::i8x16::splat(amt as i8)).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_shl as u8x16_shl;
+
+/// Shifts each lane to the right by the specified number of bits, sign
+/// extending.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.shr_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.shr_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_shr(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shr(a.as_i8x16(), simd::i8x16::splat(amt as i8)).v128() }
+}
+
+/// Shifts each lane to the right by the specified number of bits, shifting in
+/// zeros.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.shr_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.shr_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_shr(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shr(a.as_u8x16(), simd::u8x16::splat(amt as u8)).v128() }
+}
+
+/// Adds two 128-bit vectors as if they were two vectors of sixteen 8-bit
+/// integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.add"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_add(a: v128, b: v128) -> v128 {
+ unsafe { simd_add(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_add as u8x16_add;
+
+/// Adds two 128-bit vectors as if they were two vectors of sixteen 8-bit
+/// signed integers, saturating on overflow to `i8::MAX`.
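+///
+/// # Examples
+///
+/// A sketch of saturating addition (marked `ignore` since it only compiles
+/// for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let a = i8x16_splat(100);
+/// let b = i8x16_splat(100);
+/// // 100 + 100 overflows `i8`, so each lane saturates to `i8::MAX`.
+/// assert_eq!(i8x16_extract_lane::<0>(i8x16_add_sat(a, b)), i8::MAX);
+/// ```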
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add_sat_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.add_sat_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_add_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i8x16_add_sat_s(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Adds two 128-bit vectors as if they were two vectors of sixteen 8-bit
+/// unsigned integers, saturating on overflow to `u8::MAX`.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.add_sat_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.add_sat_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_add_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i8x16_add_sat_u(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Subtracts two 128-bit vectors as if they were two vectors of sixteen
+/// 8-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_sub(a: v128, b: v128) -> v128 {
+ unsafe { simd_sub(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i8x16_sub as u8x16_sub;
+
+/// Subtracts two 128-bit vectors as if they were two vectors of sixteen
+/// 8-bit signed integers, saturating on overflow to `i8::MIN`.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub_sat_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.sub_sat_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_sub_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i8x16_sub_sat_s(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Subtracts two 128-bit vectors as if they were two vectors of sixteen
+/// 8-bit unsigned integers, saturating on overflow to 0.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.sub_sat_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.sub_sat_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_sub_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i8x16_sub_sat_u(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Compares lane-wise signed integers, and returns the minimum of
+/// each pair.
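+///
+/// # Examples
+///
+/// A minimal sketch, also showing the `i8x16_max` counterpart (marked
+/// `ignore` since it only compiles for `wasm32` targets with the `simd128`
+/// feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let a = i8x16_splat(-1);
+/// let b = i8x16_splat(1);
+/// assert_eq!(i8x16_extract_lane::<0>(i8x16_min(a, b)), -1);
+/// assert_eq!(i8x16_extract_lane::<0>(i8x16_max(a, b)), 1);
+/// ```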
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.min_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.min_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_min(a: v128, b: v128) -> v128 {
+ let a = a.as_i8x16();
+ let b = b.as_i8x16();
+ unsafe { simd_select::<simd::i8x16, _>(simd_lt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise unsigned integers, and returns the minimum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.min_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.min_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_min(a: v128, b: v128) -> v128 {
+ let a = a.as_u8x16();
+ let b = b.as_u8x16();
+ unsafe { simd_select::<simd::i8x16, _>(simd_lt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise signed integers, and returns the maximum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.max_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.max_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i8x16_max(a: v128, b: v128) -> v128 {
+ let a = a.as_i8x16();
+ let b = b.as_i8x16();
+ unsafe { simd_select::<simd::i8x16, _>(simd_gt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise unsigned integers, and returns the maximum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.max_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.max_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_max(a: v128, b: v128) -> v128 {
+ let a = a.as_u8x16();
+ let b = b.as_u8x16();
+ unsafe { simd_select::<simd::i8x16, _>(simd_gt(a, b), a, b).v128() }
+}
+
+/// Lane-wise rounding average.
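+///
+/// # Examples
+///
+/// A sketch of the rounding average, computed lane-wise as
+/// `(a + b + 1) >> 1` (marked `ignore` since it only compiles for `wasm32`
+/// targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let a = u8x16_splat(3);
+/// let b = u8x16_splat(4);
+/// // (3 + 4 + 1) / 2 == 4: ties round up.
+/// assert_eq!(u8x16_extract_lane::<0>(u8x16_avgr(a, b)), 4);
+/// ```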
+#[inline]
+#[cfg_attr(test, assert_instr(i8x16.avgr_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i8x16.avgr_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u8x16_avgr(a: v128, b: v128) -> v128 {
+ unsafe { llvm_avgr_u_i8x16(a.as_i8x16(), b.as_i8x16()).v128() }
+}
+
+/// Integer extended pairwise addition, producing results twice as wide as
+/// the inputs.
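+///
+/// # Examples
+///
+/// A sketch of pairwise widening addition (marked `ignore` since it only
+/// compiles for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let v = i8x16_splat(100);
+/// // Each 16-bit result lane is the sum of two adjacent 8-bit lanes.
+/// assert_eq!(i16x8_extract_lane::<0>(i16x8_extadd_pairwise_i8x16(v)), 200);
+/// ```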
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extadd_pairwise_i8x16_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extadd_pairwise_i8x16_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extadd_pairwise_i8x16(a: v128) -> v128 {
+ unsafe { llvm_i16x8_extadd_pairwise_i8x16_s(a.as_i8x16()).v128() }
+}
+
+/// Integer extended pairwise addition, producing results twice as wide as
+/// the inputs.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extadd_pairwise_i8x16_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extadd_pairwise_i8x16_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extadd_pairwise_u8x16(a: v128) -> v128 {
+ unsafe { llvm_i16x8_extadd_pairwise_i8x16_u(a.as_i8x16()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_extadd_pairwise_u8x16 as u16x8_extadd_pairwise_u8x16;
+
+/// Lane-wise wrapping absolute value.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.abs))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.abs"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_abs(a: v128) -> v128 {
+ let a = a.as_i16x8();
+ let zero = simd::i16x8::splat(0);
+ unsafe {
+ simd_select::<simd::m16x8, simd::i16x8>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
+ }
+}
+
+/// Negates a 128-bit vector interpreted as eight 16-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_neg(a: v128) -> v128 {
+ unsafe { simd_mul(a.as_i16x8(), simd::i16x8::splat(-1)).v128() }
+}
+
+/// Lane-wise saturating rounding multiplication in Q15 format.
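+///
+/// # Examples
+///
+/// A sketch of the Q15 fixed-point product, computed lane-wise as
+/// `saturate((a * b + 0x4000) >> 15)` (marked `ignore` since it only
+/// compiles for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// // 0.5 in Q15 is 1 << 14, and 0.5 * 0.5 == 0.25, which is 1 << 13.
+/// let half = i16x8_splat(1 << 14);
+/// assert_eq!(i16x8_extract_lane::<0>(i16x8_q15mulr_sat(half, half)), 1 << 13);
+/// // i16::MIN * i16::MIN would be +1.0, which saturates to i16::MAX.
+/// let min = i16x8_splat(i16::MIN);
+/// assert_eq!(i16x8_extract_lane::<0>(i16x8_q15mulr_sat(min, min)), i16::MAX);
+/// ```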
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.q15mulr_sat_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.q15mulr_sat_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_q15mulr_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_q15mulr(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Returns `true` if all lanes are non-zero, `false` otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.all_true))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.all_true"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_all_true(a: v128) -> bool {
+ unsafe { llvm_i16x8_all_true(a.as_i16x8()) != 0 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_all_true as u16x8_all_true;
+
+/// Extracts the high bit for each lane in `a` and produces a scalar mask
+/// with all bits concatenated.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.bitmask))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.bitmask"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_bitmask(a: v128) -> u8 {
+ unsafe { llvm_bitmask_i16x8(a.as_i16x8()) as u8 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_bitmask as u16x8_bitmask;
+
+/// Converts two input vectors into a smaller lane vector by narrowing each
+/// lane.
+///
+/// Signed saturation to 0x7fff or 0x8000 is used and the input lanes are always
+/// interpreted as signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.narrow_i32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_narrow_i32x4(a: v128, b: v128) -> v128 {
+ unsafe { llvm_narrow_i16x8_s(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Converts two input vectors into a smaller lane vector by narrowing each
+/// lane.
+///
+/// Signed saturation to 0x0000 or 0xffff is used and the input lanes are always
+/// interpreted as signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.narrow_i32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_narrow_i32x4(a: v128, b: v128) -> v128 {
+ unsafe { llvm_narrow_i16x8_u(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+/// Converts the low half of the smaller lane vector to a larger lane
+/// vector, sign extended.
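+///
+/// # Examples
+///
+/// A sketch of sign extension of the low half (marked `ignore` since it
+/// only compiles for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let v = i8x16(-1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// let r = i16x8_extend_low_i8x16(v);
+/// assert_eq!(i16x8_extract_lane::<0>(r), -1); // the sign is preserved
+/// assert_eq!(i16x8_extract_lane::<1>(r), 2);
+/// ```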
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extend_low_i8x16_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extend_low_i8x16_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extend_low_i8x16(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
+ a.as_i8x16(),
+ a.as_i8x16(),
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ ))
+ .v128()
+ }
+}
+
+/// Converts the high half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extend_high_i8x16_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extend_high_i8x16_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extend_high_i8x16(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
+ a.as_i8x16(),
+ a.as_i8x16(),
+ [8, 9, 10, 11, 12, 13, 14, 15],
+ ))
+ .v128()
+ }
+}
+
+/// Converts the low half of the smaller lane vector to a larger lane
+/// vector, zero extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extend_low_i8x16_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extend_low_i8x16_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extend_low_u8x16(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
+ a.as_u8x16(),
+ a.as_u8x16(),
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ ))
+ .v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_extend_low_u8x16 as u16x8_extend_low_u8x16;
+
+/// Converts the high half of the smaller lane vector to a larger lane
+/// vector, zero extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extend_high_i8x16_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extend_high_i8x16_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extend_high_u8x16(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
+ a.as_u8x16(),
+ a.as_u8x16(),
+ [8, 9, 10, 11, 12, 13, 14, 15],
+ ))
+ .v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_extend_high_u8x16 as u16x8_extend_high_u8x16;
+
+/// Shifts each lane to the left by the specified number of bits.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.shl))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.shl"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_shl(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shl(a.as_i16x8(), simd::i16x8::splat(amt as i16)).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_shl as u16x8_shl;
+
+/// Shifts each lane to the right by the specified number of bits, sign
+/// extending.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.shr_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.shr_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_shr(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shr(a.as_i16x8(), simd::i16x8::splat(amt as i16)).v128() }
+}
+
+/// Shifts each lane to the right by the specified number of bits, shifting in
+/// zeros.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.shr_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.shr_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_shr(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shr(a.as_u16x8(), simd::u16x8::splat(amt as u16)).v128() }
+}
+
+/// Adds two 128-bit vectors as if they were two vectors of eight 16-bit
+/// integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.add))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.add"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_add(a: v128, b: v128) -> v128 {
+ unsafe { simd_add(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_add as u16x8_add;
+
+/// Adds two 128-bit vectors as if they were two vectors of eight 16-bit
+/// signed integers, saturating on overflow to `i16::MAX`.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.add_sat_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.add_sat_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_add_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i16x8_add_sat_s(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Adds two 128-bit vectors as if they were two vectors of eight 16-bit
+/// unsigned integers, saturating on overflow to `u16::MAX`.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.add_sat_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.add_sat_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_add_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i16x8_add_sat_u(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Subtracts two 128-bit vectors as if they were two vectors of eight
+/// 16-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_sub(a: v128, b: v128) -> v128 {
+ unsafe { simd_sub(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_sub as u16x8_sub;
+
+/// Subtracts two 128-bit vectors as if they were two vectors of eight
+/// 16-bit signed integers, saturating on overflow to `i16::MIN`.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.sub_sat_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.sub_sat_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_sub_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i16x8_sub_sat_s(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Subtracts two 128-bit vectors as if they were two vectors of eight
+/// 16-bit unsigned integers, saturating on overflow to 0.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.sub_sat_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.sub_sat_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_sub_sat(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i16x8_sub_sat_u(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Multiplies two 128-bit vectors as if they were two vectors of eight
+/// 16-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.mul))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.mul"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_mul(a: v128, b: v128) -> v128 {
+ unsafe { simd_mul(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_mul as u16x8_mul;
+
+/// Compares lane-wise signed integers, and returns the minimum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.min_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.min_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_min(a: v128, b: v128) -> v128 {
+ let a = a.as_i16x8();
+ let b = b.as_i16x8();
+ unsafe { simd_select::<simd::i16x8, _>(simd_lt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise unsigned integers, and returns the minimum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.min_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.min_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_min(a: v128, b: v128) -> v128 {
+ let a = a.as_u16x8();
+ let b = b.as_u16x8();
+ unsafe { simd_select::<simd::i16x8, _>(simd_lt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise signed integers, and returns the maximum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.max_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.max_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_max(a: v128, b: v128) -> v128 {
+ let a = a.as_i16x8();
+ let b = b.as_i16x8();
+ unsafe { simd_select::<simd::i16x8, _>(simd_gt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise unsigned integers, and returns the maximum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.max_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.max_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_max(a: v128, b: v128) -> v128 {
+ let a = a.as_u16x8();
+ let b = b.as_u16x8();
+ unsafe { simd_select::<simd::i16x8, _>(simd_gt(a, b), a, b).v128() }
+}
+
+/// Lane-wise rounding average.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.avgr_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.avgr_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u16x8_avgr(a: v128, b: v128) -> v128 {
+ unsafe { llvm_avgr_u_i16x8(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Lane-wise integer extended multiplication, producing a result twice as
+/// wide as the inputs.
+///
+/// Equivalent of `i16x8_mul(i16x8_extend_low_i8x16(a), i16x8_extend_low_i8x16(b))`.
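+///
+/// # Examples
+///
+/// A sketch of the widening multiply (marked `ignore` since it only
+/// compiles for `wasm32` targets with the `simd128` feature enabled):
+///
+/// ```ignore
+/// use core::arch::wasm32::*;
+///
+/// let a = i8x16_splat(100);
+/// let b = i8x16_splat(100);
+/// // 100 * 100 == 10_000 does not fit in `i8`, but the result lanes are 16-bit.
+/// assert_eq!(i16x8_extract_lane::<0>(i16x8_extmul_low_i8x16(a, b)), 10_000);
+/// ```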
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extmul_low_i8x16_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extmul_low_i8x16_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extmul_low_i8x16(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
+ a.as_i8x16(),
+ a.as_i8x16(),
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ ));
+ let rhs = simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
+ b.as_i8x16(),
+ b.as_i8x16(),
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+/// Lane-wise integer extended multiplication, producing a result twice as
+/// wide as the inputs.
+///
+/// Equivalent of `i16x8_mul(i16x8_extend_high_i8x16(a), i16x8_extend_high_i8x16(b))`.
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extmul_high_i8x16_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extmul_high_i8x16_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extmul_high_i8x16(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
+ a.as_i8x16(),
+ a.as_i8x16(),
+ [8, 9, 10, 11, 12, 13, 14, 15],
+ ));
+ let rhs = simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
+ b.as_i8x16(),
+ b.as_i8x16(),
+ [8, 9, 10, 11, 12, 13, 14, 15],
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i16x8_mul(i16x8_extend_low_u8x16(a), i16x8_extend_low_u8x16(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extmul_low_i8x16_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extmul_low_i8x16_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extmul_low_u8x16(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
+ a.as_u8x16(),
+ a.as_u8x16(),
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ ));
+ let rhs = simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
+ b.as_u8x16(),
+ b.as_u8x16(),
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_extmul_low_u8x16 as u16x8_extmul_low_u8x16;
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i16x8_mul(i16x8_extend_high_u8x16(a), i16x8_extend_high_u8x16(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i16x8.extmul_high_i8x16_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i16x8.extmul_high_i8x16_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i16x8_extmul_high_u8x16(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
+ a.as_u8x16(),
+ a.as_u8x16(),
+ [8, 9, 10, 11, 12, 13, 14, 15],
+ ));
+ let rhs = simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
+ b.as_u8x16(),
+ b.as_u8x16(),
+ [8, 9, 10, 11, 12, 13, 14, 15],
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i16x8_extmul_high_u8x16 as u16x8_extmul_high_u8x16;
+
+/// Integer extended pairwise addition producing extended results
+/// (twice wider results than the inputs).
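+///
+/// Output lane `i` is `a[2 * i] + a[2 * i + 1]`, computed as 32-bit
+/// integers. An illustrative sketch (requires a wasm target with `simd128`
+/// enabled, hence marked `ignore`):
+///
+/// ```ignore
+/// let a = i16x8(1, 2, 3, 4, 5, 6, 7, 8);
+/// // pairs: 1 + 2, 3 + 4, 5 + 6, 7 + 8
+/// let r = i32x4_extadd_pairwise_i16x8(a); // i32x4(3, 7, 11, 15)
+/// ```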
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extadd_pairwise_i16x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extadd_pairwise_i16x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extadd_pairwise_i16x8(a: v128) -> v128 {
+ unsafe { llvm_i32x4_extadd_pairwise_i16x8_s(a.as_i16x8()).v128() }
+}
+
+/// Integer extended pairwise addition producing extended results
+/// (twice wider results than the inputs).
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extadd_pairwise_i16x8_u))]
+#[doc(alias("i32x4.extadd_pairwise_i16x8_u"))]
+#[target_feature(enable = "simd128")]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extadd_pairwise_u16x8(a: v128) -> v128 {
+ unsafe { llvm_i32x4_extadd_pairwise_i16x8_u(a.as_i16x8()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_extadd_pairwise_u16x8 as u32x4_extadd_pairwise_u16x8;
+
+/// Lane-wise wrapping absolute value.
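+///
+/// "Wrapping" means the absolute value of `i32::MIN`, which has no positive
+/// 32-bit counterpart, wraps back to `i32::MIN` itself.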
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.abs))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.abs"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_abs(a: v128) -> v128 {
+ let a = a.as_i32x4();
+ let zero = simd::i32x4::splat(0);
+ unsafe {
+ simd_select::<simd::m32x4, simd::i32x4>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
+ }
+}
+
+/// Negates a 128-bit vector interpreted as four 32-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_neg(a: v128) -> v128 {
+ unsafe { simd_mul(a.as_i32x4(), simd::i32x4::splat(-1)).v128() }
+}
+
+/// Returns true if all lanes are non-zero, false otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.all_true))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.all_true"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_all_true(a: v128) -> bool {
+ unsafe { llvm_i32x4_all_true(a.as_i32x4()) != 0 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_all_true as u32x4_all_true;
+
+/// Extracts the high bit for each lane in `a` and produces a scalar mask
+/// with all bits concatenated.
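+///
+/// Bit `i` of the result is the sign bit of lane `i`, so for `i32x4` only
+/// the low 4 bits of the returned value can be set. An illustrative sketch
+/// (requires a wasm target with `simd128` enabled, hence marked `ignore`):
+///
+/// ```ignore
+/// let v = i32x4(-1, 0, i32::MIN, i32::MAX);
+/// // lanes 0 and 2 have their high bit set
+/// assert_eq!(i32x4_bitmask(v), 0b0101);
+/// ```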
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.bitmask))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.bitmask"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_bitmask(a: v128) -> u8 {
+ unsafe { llvm_bitmask_i32x4(a.as_i32x4()) as u8 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_bitmask as u32x4_bitmask;
+
+/// Converts low half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extend_low_i16x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extend_low_i16x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extend_low_i16x8(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
+ a.as_i16x8(),
+ a.as_i16x8(),
+ [0, 1, 2, 3]
+ ))
+ .v128()
+ }
+}
+
+/// Converts high half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extend_high_i16x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extend_high_i16x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extend_high_i16x8(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
+ a.as_i16x8(),
+ a.as_i16x8(),
+ [4, 5, 6, 7]
+ ))
+ .v128()
+ }
+}
+
+/// Converts low half of the smaller lane vector to a larger lane
+/// vector, zero extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extend_low_i16x8_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extend_low_i16x8_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extend_low_u16x8(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
+ a.as_u16x8(),
+ a.as_u16x8(),
+ [0, 1, 2, 3]
+ ))
+ .v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_extend_low_u16x8 as u32x4_extend_low_u16x8;
+
+/// Converts high half of the smaller lane vector to a larger lane
+/// vector, zero extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extend_high_i16x8_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extend_high_i16x8_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extend_high_u16x8(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
+ a.as_u16x8(),
+ a.as_u16x8(),
+ [4, 5, 6, 7]
+ ))
+ .v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_extend_high_u16x8 as u32x4_extend_high_u16x8;
+
+/// Shifts each lane to the left by the specified number of bits.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
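+///
+/// In other words, the effective shift for 32-bit lanes is `amt % 32`. An
+/// illustrative sketch (requires a wasm target with `simd128` enabled,
+/// hence marked `ignore`):
+///
+/// ```ignore
+/// let v = i32x4_splat(1);
+/// // 33 % 32 == 1, so each lane is shifted left by one bit
+/// let r = i32x4_shl(v, 33); // each lane is 2
+/// ```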
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.shl))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.shl"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_shl(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shl(a.as_i32x4(), simd::i32x4::splat(amt as i32)).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_shl as u32x4_shl;
+
+/// Shifts each lane to the right by the specified number of bits, sign
+/// extending.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.shr_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.shr_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_shr(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shr(a.as_i32x4(), simd::i32x4::splat(amt as i32)).v128() }
+}
+
+/// Shifts each lane to the right by the specified number of bits, shifting in
+/// zeros.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.shr_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.shr_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_shr(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shr(a.as_u32x4(), simd::u32x4::splat(amt as u32)).v128() }
+}
+
+/// Adds two 128-bit vectors as if they were two packed four 32-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.add))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.add"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_add(a: v128, b: v128) -> v128 {
+ unsafe { simd_add(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_add as u32x4_add;
+
+/// Subtracts two 128-bit vectors as if they were two packed four 32-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_sub(a: v128, b: v128) -> v128 {
+ unsafe { simd_sub(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_sub as u32x4_sub;
+
+/// Multiplies two 128-bit vectors as if they were two packed four 32-bit
+/// signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.mul))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.mul"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_mul(a: v128, b: v128) -> v128 {
+ unsafe { simd_mul(a.as_i32x4(), b.as_i32x4()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_mul as u32x4_mul;
+
+/// Compares lane-wise signed integers, and returns the minimum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.min_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.min_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_min(a: v128, b: v128) -> v128 {
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ unsafe { simd_select::<simd::i32x4, _>(simd_lt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise unsigned integers, and returns the minimum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.min_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.min_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_min(a: v128, b: v128) -> v128 {
+ let a = a.as_u32x4();
+ let b = b.as_u32x4();
+ unsafe { simd_select::<simd::i32x4, _>(simd_lt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise signed integers, and returns the maximum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.max_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.max_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_max(a: v128, b: v128) -> v128 {
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ unsafe { simd_select::<simd::i32x4, _>(simd_gt(a, b), a, b).v128() }
+}
+
+/// Compares lane-wise unsigned integers, and returns the maximum of
+/// each pair.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.max_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.max_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_max(a: v128, b: v128) -> v128 {
+ let a = a.as_u32x4();
+ let b = b.as_u32x4();
+ unsafe { simd_select::<simd::i32x4, _>(simd_gt(a, b), a, b).v128() }
+}
+
+/// Lane-wise multiply signed 16-bit integers in the two input vectors and add
+/// adjacent pairs of the full 32-bit results.
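+///
+/// Output lane `i` is `a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1]`,
+/// with each product computed as a full 32-bit result. An illustrative
+/// sketch (requires a wasm target with `simd128` enabled, hence marked
+/// `ignore`):
+///
+/// ```ignore
+/// let a = i16x8(1, 2, 3, 4, 5, 6, 7, 8);
+/// let b = i16x8(1, 1, 1, 1, 1, 1, 1, 1);
+/// // pairs: 1 + 2, 3 + 4, 5 + 6, 7 + 8
+/// let dot = i32x4_dot_i16x8(a, b); // i32x4(3, 7, 11, 15)
+/// ```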
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.dot_i16x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.dot_i16x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_dot_i16x8(a: v128, b: v128) -> v128 {
+ unsafe { llvm_i32x4_dot_i16x8_s(a.as_i16x8(), b.as_i16x8()).v128() }
+}
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i32x4_mul(i32x4_extend_low_i16x8(a), i32x4_extend_low_i16x8(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extmul_low_i16x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extmul_low_i16x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extmul_low_i16x8(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
+ a.as_i16x8(),
+ a.as_i16x8(),
+ [0, 1, 2, 3]
+ ));
+ let rhs = simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
+ b.as_i16x8(),
+ b.as_i16x8(),
+ [0, 1, 2, 3]
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i32x4_mul(i32x4_extend_high_i16x8(a), i32x4_extend_high_i16x8(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extmul_high_i16x8_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extmul_high_i16x8_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extmul_high_i16x8(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
+ a.as_i16x8(),
+ a.as_i16x8(),
+ [4, 5, 6, 7]
+ ));
+ let rhs = simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
+ b.as_i16x8(),
+ b.as_i16x8(),
+ [4, 5, 6, 7]
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i32x4_mul(i32x4_extend_low_u16x8(a), i32x4_extend_low_u16x8(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extmul_low_i16x8_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extmul_low_i16x8_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extmul_low_u16x8(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
+ a.as_u16x8(),
+ a.as_u16x8(),
+ [0, 1, 2, 3]
+ ));
+ let rhs = simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
+ b.as_u16x8(),
+ b.as_u16x8(),
+ [0, 1, 2, 3]
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_extmul_low_u16x8 as u32x4_extmul_low_u16x8;
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i32x4_mul(i32x4_extend_high_u16x8(a), i32x4_extend_high_u16x8(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.extmul_high_i16x8_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.extmul_high_i16x8_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_extmul_high_u16x8(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
+ a.as_u16x8(),
+ a.as_u16x8(),
+ [4, 5, 6, 7]
+ ));
+ let rhs = simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
+ b.as_u16x8(),
+ b.as_u16x8(),
+ [4, 5, 6, 7]
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i32x4_extmul_high_u16x8 as u32x4_extmul_high_u16x8;
+
+/// Lane-wise wrapping absolute value.
+#[inline]
+// #[cfg_attr(test, assert_instr(i64x2.abs))] // FIXME llvm
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.abs"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_abs(a: v128) -> v128 {
+ let a = a.as_i64x2();
+ let zero = simd::i64x2::splat(0);
+ unsafe {
+ simd_select::<simd::m64x2, simd::i64x2>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
+ }
+}
+
+/// Negates a 128-bit vector interpreted as two 64-bit signed integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_neg(a: v128) -> v128 {
+ unsafe { simd_mul(a.as_i64x2(), simd::i64x2::splat(-1)).v128() }
+}
+
+/// Returns true if all lanes are non-zero, false otherwise.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.all_true))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.all_true"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_all_true(a: v128) -> bool {
+ unsafe { llvm_i64x2_all_true(a.as_i64x2()) != 0 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_all_true as u64x2_all_true;
+
+/// Extracts the high bit for each lane in `a` and produces a scalar mask
+/// with all bits concatenated.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.bitmask))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.bitmask"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_bitmask(a: v128) -> u8 {
+ unsafe { llvm_bitmask_i64x2(a.as_i64x2()) as u8 }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_bitmask as u64x2_bitmask;
+
+/// Converts low half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extend_low_i32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extend_low_i32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extend_low_i32x4(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(a.as_i32x4(), a.as_i32x4(), [0, 1]))
+ .v128()
+ }
+}
+
+/// Converts high half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extend_high_i32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extend_high_i32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extend_high_i32x4(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(a.as_i32x4(), a.as_i32x4(), [2, 3]))
+ .v128()
+ }
+}
+
+/// Converts low half of the smaller lane vector to a larger lane
+/// vector, zero extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extend_low_i32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extend_low_i32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extend_low_u32x4(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::u32x2, simd::i64x2>(simd_shuffle2!(a.as_u32x4(), a.as_u32x4(), [0, 1]))
+ .v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_extend_low_u32x4 as u64x2_extend_low_u32x4;
+
+/// Converts high half of the smaller lane vector to a larger lane
+/// vector, zero extended.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extend_high_i32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extend_high_i32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extend_high_u32x4(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::u32x2, simd::i64x2>(simd_shuffle2!(a.as_u32x4(), a.as_u32x4(), [2, 3]))
+ .v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_extend_high_u32x4 as u64x2_extend_high_u32x4;
+
+/// Shifts each lane to the left by the specified number of bits.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.shl))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.shl"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_shl(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shl(a.as_i64x2(), simd::i64x2::splat(amt as i64)).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_shl as u64x2_shl;
+
+/// Shifts each lane to the right by the specified number of bits, sign
+/// extending.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.shr_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.shr_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_shr(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shr(a.as_i64x2(), simd::i64x2::splat(amt as i64)).v128() }
+}
+
+/// Shifts each lane to the right by the specified number of bits, shifting in
+/// zeros.
+///
+/// Only the low bits of the shift amount are used if the shift amount is
+/// greater than the lane width.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.shr_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.shr_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u64x2_shr(a: v128, amt: u32) -> v128 {
+ unsafe { simd_shr(a.as_u64x2(), simd::u64x2::splat(amt as u64)).v128() }
+}
+
+/// Adds two 128-bit vectors as if they were two packed two 64-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.add))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.add"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_add(a: v128, b: v128) -> v128 {
+ unsafe { simd_add(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_add as u64x2_add;
+
+/// Subtracts two 128-bit vectors as if they were two packed two 64-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_sub(a: v128, b: v128) -> v128 {
+ unsafe { simd_sub(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_sub as u64x2_sub;
+
+/// Multiplies two 128-bit vectors as if they were two packed two 64-bit integers.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.mul))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.mul"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_mul(a: v128, b: v128) -> v128 {
+ unsafe { simd_mul(a.as_i64x2(), b.as_i64x2()).v128() }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_mul as u64x2_mul;
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i64x2_mul(i64x2_extend_low_i32x4(a), i64x2_extend_low_i32x4(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extmul_low_i32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extmul_low_i32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extmul_low_i32x4(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(
+ a.as_i32x4(),
+ a.as_i32x4(),
+ [0, 1]
+ ));
+ let rhs = simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(
+ b.as_i32x4(),
+ b.as_i32x4(),
+ [0, 1]
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i64x2_mul(i64x2_extend_high_i32x4(a), i64x2_extend_high_i32x4(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extmul_high_i32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extmul_high_i32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extmul_high_i32x4(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(
+ a.as_i32x4(),
+ a.as_i32x4(),
+ [2, 3]
+ ));
+ let rhs = simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(
+ b.as_i32x4(),
+ b.as_i32x4(),
+ [2, 3]
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i64x2_mul(i64x2_extend_low_u32x4(a), i64x2_extend_low_u32x4(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extmul_low_i32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extmul_low_i32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extmul_low_u32x4(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::u32x2, simd::u64x2>(simd_shuffle2!(
+ a.as_u32x4(),
+ a.as_u32x4(),
+ [0, 1]
+ ));
+ let rhs = simd_cast::<simd::u32x2, simd::u64x2>(simd_shuffle2!(
+ b.as_u32x4(),
+ b.as_u32x4(),
+ [0, 1]
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_extmul_low_u32x4 as u64x2_extmul_low_u32x4;
+
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+///
+/// Equivalent of `i64x2_mul(i64x2_extend_high_u32x4(a), i64x2_extend_high_u32x4(b))`
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.extmul_high_i32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i64x2.extmul_high_i32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i64x2_extmul_high_u32x4(a: v128, b: v128) -> v128 {
+ unsafe {
+ let lhs = simd_cast::<simd::u32x2, simd::u64x2>(simd_shuffle2!(
+ a.as_u32x4(),
+ a.as_u32x4(),
+ [2, 3]
+ ));
+ let rhs = simd_cast::<simd::u32x2, simd::u64x2>(simd_shuffle2!(
+ b.as_u32x4(),
+ b.as_u32x4(),
+ [2, 3]
+ ));
+ simd_mul(lhs, rhs).v128()
+ }
+}
+
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub use i64x2_extmul_high_u32x4 as u64x2_extmul_high_u32x4;
+
+/// Lane-wise rounding to the nearest integral value not smaller than the input.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.ceil))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.ceil"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_ceil(a: v128) -> v128 {
+ unsafe { llvm_f32x4_ceil(a.as_f32x4()).v128() }
+}
+
+/// Lane-wise rounding to the nearest integral value not greater than the input.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.floor))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.floor"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_floor(a: v128) -> v128 {
+ unsafe { llvm_f32x4_floor(a.as_f32x4()).v128() }
+}
+
+/// Lane-wise rounding to the nearest integral value with the magnitude not
+/// larger than the input.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.trunc))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.trunc"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_trunc(a: v128) -> v128 {
+ unsafe { llvm_f32x4_trunc(a.as_f32x4()).v128() }
+}
+
+/// Lane-wise rounding to the nearest integral value; if two values are equally
+/// near, rounds to the even one.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.nearest))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.nearest"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_nearest(a: v128) -> v128 {
+ unsafe { llvm_f32x4_nearest(a.as_f32x4()).v128() }
+}
+
+/// Calculates the absolute value of each lane of a 128-bit vector interpreted
+/// as four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.abs))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.abs"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_abs(a: v128) -> v128 {
+ unsafe { llvm_f32x4_abs(a.as_f32x4()).v128() }
+}
+
+/// Negates each lane of a 128-bit vector interpreted as four 32-bit floating
+/// point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_neg(a: v128) -> v128 {
+ f32x4_mul(a, f32x4_splat(-1.))
+}
+
+/// Calculates the square root of each lane of a 128-bit vector interpreted as
+/// four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.sqrt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.sqrt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_sqrt(a: v128) -> v128 {
+ unsafe { llvm_f32x4_sqrt(a.as_f32x4()).v128() }
+}
+
+/// Lane-wise addition of two 128-bit vectors interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.add))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.add"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_add(a: v128, b: v128) -> v128 {
+ unsafe { simd_add(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Lane-wise subtraction of two 128-bit vectors interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_sub(a: v128, b: v128) -> v128 {
+ unsafe { simd_sub(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Lane-wise multiplication of two 128-bit vectors interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.mul))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.mul"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_mul(a: v128, b: v128) -> v128 {
+ unsafe { simd_mul(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Lane-wise division of two 128-bit vectors interpreted as four 32-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.div))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.div"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_div(a: v128, b: v128) -> v128 {
+ unsafe { simd_div(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Calculates the lane-wise minimum of two 128-bit vectors interpreted
+/// as four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.min))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.min"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_min(a: v128, b: v128) -> v128 {
+ unsafe { llvm_f32x4_min(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Calculates the lane-wise maximum of two 128-bit vectors interpreted
+/// as four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.max))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.max"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_max(a: v128, b: v128) -> v128 {
+ unsafe { llvm_f32x4_max(a.as_f32x4(), b.as_f32x4()).v128() }
+}
+
+/// Lane-wise minimum value, defined as `b < a ? b : a`
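+///
+/// This is a plain select on `b < a`, so unlike `f32x4_min` it does not
+/// propagate NaN from `b`: if `b` is NaN the comparison is false and the
+/// corresponding lane of `a` is returned. An illustrative sketch (requires
+/// a wasm target with `simd128` enabled, hence marked `ignore`):
+///
+/// ```ignore
+/// let a = f32x4_splat(1.0);
+/// let b = f32x4_splat(f32::NAN);
+/// // NAN < 1.0 is false, so each result lane is 1.0
+/// let r = f32x4_pmin(a, b);
+/// ```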
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.pmin))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.pmin"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_pmin(a: v128, b: v128) -> v128 {
+ unsafe {
+ simd_select::<simd::m32x4, simd::f32x4>(
+ simd_lt(b.as_f32x4(), a.as_f32x4()),
+ b.as_f32x4(),
+ a.as_f32x4(),
+ )
+ .v128()
+ }
+}
+
+/// Lane-wise maximum value, defined as `a < b ? b : a`
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.pmax))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.pmax"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_pmax(a: v128, b: v128) -> v128 {
+ unsafe {
+ simd_select::<simd::m32x4, simd::f32x4>(
+ simd_lt(a.as_f32x4(), b.as_f32x4()),
+ b.as_f32x4(),
+ a.as_f32x4(),
+ )
+ .v128()
+ }
+}
+
+/// Lane-wise rounding to the nearest integral value not smaller than the input.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.ceil))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.ceil"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_ceil(a: v128) -> v128 {
+ unsafe { llvm_f64x2_ceil(a.as_f64x2()).v128() }
+}
+
+/// Lane-wise rounding to the nearest integral value not greater than the input.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.floor))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.floor"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_floor(a: v128) -> v128 {
+ unsafe { llvm_f64x2_floor(a.as_f64x2()).v128() }
+}
+
+/// Lane-wise rounding to the nearest integral value with the magnitude not
+/// larger than the input.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.trunc))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.trunc"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_trunc(a: v128) -> v128 {
+ unsafe { llvm_f64x2_trunc(a.as_f64x2()).v128() }
+}
+
+/// Lane-wise rounding to the nearest integral value; if two values are equally
+/// near, rounds to the even one.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.nearest))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.nearest"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_nearest(a: v128) -> v128 {
+ unsafe { llvm_f64x2_nearest(a.as_f64x2()).v128() }
+}
+
+/// Calculates the absolute value of each lane of a 128-bit vector interpreted
+/// as two 64-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.abs))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.abs"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_abs(a: v128) -> v128 {
+ unsafe { llvm_f64x2_abs(a.as_f64x2()).v128() }
+}
+
+/// Negates each lane of a 128-bit vector interpreted as two 64-bit floating
+/// point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.neg))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.neg"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_neg(a: v128) -> v128 {
+ f64x2_mul(a, f64x2_splat(-1.0))
+}
+
+/// Calculates the square root of each lane of a 128-bit vector interpreted as
+/// two 64-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.sqrt))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.sqrt"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_sqrt(a: v128) -> v128 {
+ unsafe { llvm_f64x2_sqrt(a.as_f64x2()).v128() }
+}
+
+/// Lane-wise addition of two 128-bit vectors interpreted as two 64-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.add))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.add"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_add(a: v128, b: v128) -> v128 {
+ unsafe { simd_add(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Lane-wise subtraction of two 128-bit vectors interpreted as two 64-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.sub))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.sub"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_sub(a: v128, b: v128) -> v128 {
+ unsafe { simd_sub(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Lane-wise multiplication of two 128-bit vectors interpreted as two 64-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.mul))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.mul"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_mul(a: v128, b: v128) -> v128 {
+ unsafe { simd_mul(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Lane-wise division of two 128-bit vectors interpreted as two 64-bit
+/// floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.div))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.div"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_div(a: v128, b: v128) -> v128 {
+ unsafe { simd_div(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Calculates the lane-wise minimum of two 128-bit vectors interpreted
+/// as two 64-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.min))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.min"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_min(a: v128, b: v128) -> v128 {
+ unsafe { llvm_f64x2_min(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Calculates the lane-wise maximum of two 128-bit vectors interpreted
+/// as two 64-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.max))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.max"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_max(a: v128, b: v128) -> v128 {
+ unsafe { llvm_f64x2_max(a.as_f64x2(), b.as_f64x2()).v128() }
+}
+
+/// Lane-wise minimum value, defined as `b < a ? b : a`
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.pmin))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.pmin"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_pmin(a: v128, b: v128) -> v128 {
+ unsafe {
+ simd_select::<simd::m64x2, simd::f64x2>(
+ simd_lt(b.as_f64x2(), a.as_f64x2()),
+ b.as_f64x2(),
+ a.as_f64x2(),
+ )
+ .v128()
+ }
+}
+
+/// Lane-wise maximum value, defined as `a < b ? b : a`
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.pmax))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.pmax"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_pmax(a: v128, b: v128) -> v128 {
+ unsafe {
+ simd_select::<simd::m64x2, simd::f64x2>(
+ simd_lt(a.as_f64x2(), b.as_f64x2()),
+ b.as_f64x2(),
+ a.as_f64x2(),
+ )
+ .v128()
+ }
+}
+
+/// Converts a 128-bit vector interpreted as four 32-bit floating point numbers
+/// into a 128-bit vector of four 32-bit signed integers.
+///
+/// NaN lanes are converted to 0, and lanes whose truncated values are out
+/// of bounds are saturated to the nearest representable 32-bit signed
+/// integer.
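+///
+/// An illustrative sketch (requires a wasm target with `simd128` enabled,
+/// hence marked `ignore`):
+///
+/// ```ignore
+/// let v = f32x4(1.9, -1.9, f32::NAN, 3.0e9);
+/// // truncate towards zero; NaN => 0; overflow saturates to i32::MAX
+/// let r = i32x4_trunc_sat_f32x4(v); // i32x4(1, -1, 0, i32::MAX)
+/// ```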
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.trunc_sat_f32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_trunc_sat_f32x4(a: v128) -> v128 {
+ unsafe { llvm_i32x4_trunc_sat_f32x4_s(a.as_f32x4()).v128() }
+}
+
+/// Converts a 128-bit vector interpreted as four 32-bit floating point numbers
+/// into a 128-bit vector of four 32-bit unsigned integers.
+///
+/// NaN lanes are converted to 0, and lanes whose truncated values are out
+/// of bounds are saturated to the nearest representable 32-bit unsigned
+/// integer.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.trunc_sat_f32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_trunc_sat_f32x4(a: v128) -> v128 {
+ unsafe { llvm_i32x4_trunc_sat_f32x4_u(a.as_f32x4()).v128() }
+}
+
+/// Converts a 128-bit vector interpreted as four 32-bit signed integers into a
+/// 128-bit vector of four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.convert_i32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.convert_i32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_convert_i32x4(a: v128) -> v128 {
+ unsafe { simd_cast::<_, simd::f32x4>(a.as_i32x4()).v128() }
+}
+
+/// Converts a 128-bit vector interpreted as four 32-bit unsigned integers into a
+/// 128-bit vector of four 32-bit floating point numbers.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.convert_i32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.convert_i32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_convert_u32x4(a: v128) -> v128 {
+ unsafe { simd_cast::<_, simd::f32x4>(a.as_u32x4()).v128() }
+}
+
+/// Saturating conversion of the two double-precision floating point lanes to
+/// two lower integer lanes using the IEEE `convertToIntegerTowardZero`
+/// function.
+///
+/// The two higher lanes of the result are initialized to zero. If any input
+/// lane is a NaN, the resulting lane is 0. If the rounded integer value of a
+/// lane is outside the range of the destination type, the result is saturated
+/// to the nearest representable integer value.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f64x2_s_zero))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.trunc_sat_f64x2_s_zero"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn i32x4_trunc_sat_f64x2_zero(a: v128) -> v128 {
+ let ret: simd::i32x4 = unsafe {
+ simd_shuffle4!(
+ llvm_i32x2_trunc_sat_f64x2_s(a.as_f64x2()),
+ simd::i32x2::splat(0),
+ [0, 1, 2, 3],
+ )
+ };
+ ret.v128()
+}
+
+/// Saturating conversion of the two double-precision floating point lanes to
+/// two lower integer lanes using the IEEE `convertToIntegerTowardZero`
+/// function.
+///
+/// The two higher lanes of the result are initialized to zero. If any input
+/// lane is a NaN, the resulting lane is 0. If the rounded integer value of a
+/// lane is outside the range of the destination type, the result is saturated
+/// to the nearest representable integer value.
+#[inline]
+#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f64x2_u_zero))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("i32x4.trunc_sat_f64x2_u_zero"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn u32x4_trunc_sat_f64x2_zero(a: v128) -> v128 {
+ let ret: simd::i32x4 = unsafe {
+ simd_shuffle4!(
+ llvm_i32x2_trunc_sat_f64x2_u(a.as_f64x2()),
+ simd::i32x2::splat(0),
+ [0, 1, 2, 3],
+ )
+ };
+ ret.v128()
+}
+
+/// Lane-wise conversion from integer to floating point.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.convert_low_i32x4_s))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.convert_low_i32x4_s"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_convert_low_i32x4(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::i32x2, simd::f64x2>(simd_shuffle2!(a.as_i32x4(), a.as_i32x4(), [0, 1],))
+ .v128()
+ }
+}
+
+/// Lane-wise conversion from integer to floating point.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.convert_low_i32x4_u))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f64x2.convert_low_i32x4_u"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_convert_low_u32x4(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::u32x2, simd::f64x2>(simd_shuffle2!(a.as_u32x4(), a.as_u32x4(), [0, 1],))
+ .v128()
+ }
+}
+
+/// Conversion of the two double-precision floating point lanes to two lower
+/// single-precision lanes of the result. The two higher lanes of the result are
+/// initialized to zero. If the conversion result is not representable as a
+/// single-precision floating point number, it is rounded to the nearest-even
+/// representable number.
+#[inline]
+#[cfg_attr(test, assert_instr(f32x4.demote_f64x2_zero))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.demote_f64x2_zero"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f32x4_demote_f64x2_zero(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::f64x4, simd::f32x4>(simd_shuffle4!(
+ a.as_f64x2(),
+ simd::f64x2::splat(0.0),
+ [0, 1, 2, 3]
+ ))
+ .v128()
+ }
+}
+
+/// Conversion of the two lower single-precision floating point lanes to the two
+/// double-precision lanes of the result.
+#[inline]
+#[cfg_attr(test, assert_instr(f64x2.promote_low_f32x4))]
+#[target_feature(enable = "simd128")]
+#[doc(alias("f32x4.promote_low_f32x4"))]
+#[stable(feature = "wasm_simd", since = "1.54.0")]
+pub fn f64x2_promote_low_f32x4(a: v128) -> v128 {
+ unsafe {
+ simd_cast::<simd::f32x2, simd::f64x2>(simd_shuffle2!(a.as_f32x4(), a.as_f32x4(), [0, 1]))
+ .v128()
+ }
+}
+
+#[cfg(test)]
+pub mod tests {
+ use super::*;
+ use core::ops::{Add, Div, Mul, Neg, Sub};
+ use std;
+ use std::fmt::Debug;
+ use std::mem::transmute;
+ use std::num::Wrapping;
+ use std::prelude::v1::*;
+
+ fn compare_bytes(a: v128, b: v128) {
+ let a: [u8; 16] = unsafe { transmute(a) };
+ let b: [u8; 16] = unsafe { transmute(b) };
+ assert_eq!(a, b);
+ }
+
+ #[test]
+ fn test_load() {
+ unsafe {
+ let arr: [i32; 4] = [0, 1, 2, 3];
+ let vec = v128_load(arr.as_ptr() as *const v128);
+ compare_bytes(vec, i32x4(0, 1, 2, 3));
+ }
+ }
+
+ #[test]
+ fn test_load_extend() {
+ unsafe {
+ let arr: [i8; 8] = [-3, -2, -1, 0, 1, 2, 3, 4];
+ let vec = i16x8_load_extend_i8x8(arr.as_ptr());
+ compare_bytes(vec, i16x8(-3, -2, -1, 0, 1, 2, 3, 4));
+ let vec = i16x8_load_extend_u8x8(arr.as_ptr() as *const u8);
+ compare_bytes(vec, i16x8(253, 254, 255, 0, 1, 2, 3, 4));
+
+ let arr: [i16; 4] = [-1, 0, 1, 2];
+ let vec = i32x4_load_extend_i16x4(arr.as_ptr());
+ compare_bytes(vec, i32x4(-1, 0, 1, 2));
+ let vec = i32x4_load_extend_u16x4(arr.as_ptr() as *const u16);
+ compare_bytes(vec, i32x4(65535, 0, 1, 2));
+
+ let arr: [i32; 2] = [-1, 1];
+ let vec = i64x2_load_extend_i32x2(arr.as_ptr());
+ compare_bytes(vec, i64x2(-1, 1));
+ let vec = i64x2_load_extend_u32x2(arr.as_ptr() as *const u32);
+ compare_bytes(vec, i64x2(u32::max_value().into(), 1));
+ }
+ }
+
+ #[test]
+ fn test_load_splat() {
+ unsafe {
+ compare_bytes(v128_load8_splat(&8), i8x16_splat(8));
+ compare_bytes(v128_load16_splat(&9), i16x8_splat(9));
+ compare_bytes(v128_load32_splat(&10), i32x4_splat(10));
+ compare_bytes(v128_load64_splat(&11), i64x2_splat(11));
+ }
+ }
+
+ #[test]
+ fn test_load_zero() {
+ unsafe {
+ compare_bytes(v128_load32_zero(&10), i32x4(10, 0, 0, 0));
+ compare_bytes(v128_load64_zero(&11), i64x2(11, 0));
+ }
+ }
+
+ #[test]
+ fn test_store() {
+ unsafe {
+ let mut spot = i8x16_splat(0);
+ v128_store(&mut spot, i8x16_splat(1));
+ compare_bytes(spot, i8x16_splat(1));
+ }
+ }
+
+ #[test]
+ fn test_load_lane() {
+ unsafe {
+ let zero = i8x16_splat(0);
+ compare_bytes(
+ v128_load8_lane::<2>(zero, &1),
+ i8x16_replace_lane::<2>(zero, 1),
+ );
+
+ compare_bytes(
+ v128_load16_lane::<2>(zero, &1),
+ i16x8_replace_lane::<2>(zero, 1),
+ );
+
+ compare_bytes(
+ v128_load32_lane::<2>(zero, &1),
+ i32x4_replace_lane::<2>(zero, 1),
+ );
+
+ compare_bytes(
+ v128_load64_lane::<1>(zero, &1),
+ i64x2_replace_lane::<1>(zero, 1),
+ );
+ }
+ }
+
+ #[test]
+ fn test_store_lane() {
+ unsafe {
+ let mut spot = 0;
+ let zero = i8x16_splat(0);
+ v128_store8_lane::<5>(i8x16_replace_lane::<5>(zero, 7), &mut spot);
+ assert_eq!(spot, 7);
+
+ let mut spot = 0;
+ v128_store16_lane::<5>(i16x8_replace_lane::<5>(zero, 7), &mut spot);
+ assert_eq!(spot, 7);
+
+ let mut spot = 0;
+ v128_store32_lane::<3>(i32x4_replace_lane::<3>(zero, 7), &mut spot);
+ assert_eq!(spot, 7);
+
+ let mut spot = 0;
+ v128_store64_lane::<0>(i64x2_replace_lane::<0>(zero, 7), &mut spot);
+ assert_eq!(spot, 7);
+ }
+ }
+
+ #[test]
+ fn test_i8x16() {
+ const A: v128 = super::i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ compare_bytes(A, A);
+
+ const _: v128 = i16x8(0, 1, 2, 3, 4, 5, 6, 7);
+ const _: v128 = i32x4(0, 1, 2, 3);
+ const _: v128 = i64x2(0, 1);
+ const _: v128 = f32x4(0., 1., 2., 3.);
+ const _: v128 = f64x2(0., 1.);
+
+ let bytes: [i16; 8] = unsafe { mem::transmute(i16x8(-1, -2, -3, -4, -5, -6, -7, -8)) };
+ assert_eq!(bytes, [-1, -2, -3, -4, -5, -6, -7, -8]);
+ let bytes: [i8; 16] = unsafe {
+ mem::transmute(i8x16(
+ -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16,
+ ))
+ };
+ assert_eq!(
+ bytes,
+ [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16]
+ );
+ }
+
+ #[test]
+ fn test_shuffle() {
+ let vec_a = i8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let vec_b = i8x16(
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+
+ let vec_r = i8x16_shuffle::<0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30>(
+ vec_a, vec_b,
+ );
+ let vec_e = i8x16(0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+ compare_bytes(vec_r, vec_e);
+
+ let vec_a = i16x8(0, 1, 2, 3, 4, 5, 6, 7);
+ let vec_b = i16x8(8, 9, 10, 11, 12, 13, 14, 15);
+ let vec_r = i16x8_shuffle::<0, 8, 2, 10, 4, 12, 6, 14>(vec_a, vec_b);
+ let vec_e = i16x8(0, 8, 2, 10, 4, 12, 6, 14);
+ compare_bytes(vec_r, vec_e);
+
+ let vec_a = i32x4(0, 1, 2, 3);
+ let vec_b = i32x4(4, 5, 6, 7);
+ let vec_r = i32x4_shuffle::<0, 4, 2, 6>(vec_a, vec_b);
+ let vec_e = i32x4(0, 4, 2, 6);
+ compare_bytes(vec_r, vec_e);
+
+ let vec_a = i64x2(0, 1);
+ let vec_b = i64x2(2, 3);
+ let vec_r = i64x2_shuffle::<0, 2>(vec_a, vec_b);
+ let vec_e = i64x2(0, 2);
+ compare_bytes(vec_r, vec_e);
+ }
+
+ // tests extract and replace lanes
+ macro_rules! test_extract {
+ (
+ name: $test_id:ident,
+ extract: $extract:ident,
+ replace: $replace:ident,
+ elem: $elem:ty,
+ count: $count:expr,
+ indices: [$($idx:expr),*],
+ ) => {
+ #[test]
+ fn $test_id() {
+ unsafe {
+ let arr: [$elem; $count] = [123 as $elem; $count];
+ let vec: v128 = transmute(arr);
+ $(
+ assert_eq!($extract::<$idx>(vec), 123 as $elem);
+ )*
+
+ // create a vector from array and check that the indices contain
+ // the same values as in the array:
+ let arr: [$elem; $count] = [$($idx as $elem),*];
+ let vec: v128 = transmute(arr);
+ $(
+ assert_eq!($extract::<$idx>(vec), $idx as $elem);
+
+ let tmp = $replace::<$idx>(vec, 124 as $elem);
+ assert_eq!($extract::<$idx>(tmp), 124 as $elem);
+ )*
+ }
+ }
+ }
+ }
+
+ test_extract! {
+ name: test_i8x16_extract_replace,
+ extract: i8x16_extract_lane,
+ replace: i8x16_replace_lane,
+ elem: i8,
+ count: 16,
+ indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ }
+ test_extract! {
+ name: test_i16x8_extract_replace,
+ extract: i16x8_extract_lane,
+ replace: i16x8_replace_lane,
+ elem: i16,
+ count: 8,
+ indices: [0, 1, 2, 3, 4, 5, 6, 7],
+ }
+ test_extract! {
+ name: test_i32x4_extract_replace,
+ extract: i32x4_extract_lane,
+ replace: i32x4_replace_lane,
+ elem: i32,
+ count: 4,
+ indices: [0, 1, 2, 3],
+ }
+ test_extract! {
+ name: test_i64x2_extract_replace,
+ extract: i64x2_extract_lane,
+ replace: i64x2_replace_lane,
+ elem: i64,
+ count: 2,
+ indices: [0, 1],
+ }
+ test_extract! {
+ name: test_f32x4_extract_replace,
+ extract: f32x4_extract_lane,
+ replace: f32x4_replace_lane,
+ elem: f32,
+ count: 4,
+ indices: [0, 1, 2, 3],
+ }
+ test_extract! {
+ name: test_f64x2_extract_replace,
+ extract: f64x2_extract_lane,
+ replace: f64x2_replace_lane,
+ elem: f64,
+ count: 2,
+ indices: [0, 1],
+ }
+
+ #[test]
+ #[rustfmt::skip]
+ fn test_swizzle() {
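+ // Indices 16 and above are out of range and select 0; in-range
+ // indices pick bytes from the first operand, so the last lane
+ // gathers bytes 1, 2, 3, 4 into 0x04030201 (little-endian).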
+ compare_bytes(
+ i8x16_swizzle(
+ i32x4(1, 2, 3, 4),
+ i8x16(
+ 32, 31, 30, 29,
+ 0, 1, 2, 3,
+ 12, 13, 14, 15,
+ 0, 4, 8, 12),
+ ),
+ i32x4(0, 1, 4, 0x04030201),
+ );
+ }
+
+ macro_rules! test_splat {
+ ($test_id:ident: $val:expr => $($vals:expr),*) => {
+ #[test]
+ fn $test_id() {
+ let a = super::$test_id($val);
+ let b = u8x16($($vals as u8),*);
+ compare_bytes(a, b);
+ }
+ }
+ }
+
+ mod splats {
+ use super::*;
+ test_splat!(i8x16_splat: 42 => 42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42);
+ test_splat!(i16x8_splat: 42 => 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0);
+ test_splat!(i32x4_splat: 42 => 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0);
+ test_splat!(i64x2_splat: 42 => 42, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0);
+ test_splat!(f32x4_splat: 42. => 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66);
+ test_splat!(f64x2_splat: 42. => 0, 0, 0, 0, 0, 0, 69, 64, 0, 0, 0, 0, 0, 0, 69, 64);
+ }
+
+ #[test]
+ fn test_bitmasks() {
+ let zero = i8x16_splat(0);
+ let ones = i8x16_splat(!0);
+
+ assert_eq!(i8x16_bitmask(zero), 0);
+ assert_eq!(i8x16_bitmask(ones), 0xffff);
+ assert_eq!(i8x16_bitmask(i8x16_splat(i8::MAX)), 0);
+ assert_eq!(i8x16_bitmask(i8x16_splat(i8::MIN)), 0xffff);
+ assert_eq!(i8x16_bitmask(i8x16_replace_lane::<1>(zero, -1)), 0b10);
+
+ assert_eq!(i16x8_bitmask(zero), 0);
+ assert_eq!(i16x8_bitmask(ones), 0xff);
+ assert_eq!(i16x8_bitmask(i16x8_splat(i16::MAX)), 0);
+ assert_eq!(i16x8_bitmask(i16x8_splat(i16::MIN)), 0xff);
+ assert_eq!(i16x8_bitmask(i16x8_replace_lane::<1>(zero, -1)), 0b10);
+
+ assert_eq!(i32x4_bitmask(zero), 0);
+ assert_eq!(i32x4_bitmask(ones), 0b1111);
+ assert_eq!(i32x4_bitmask(i32x4_splat(i32::MAX)), 0);
+ assert_eq!(i32x4_bitmask(i32x4_splat(i32::MIN)), 0b1111);
+ assert_eq!(i32x4_bitmask(i32x4_replace_lane::<1>(zero, -1)), 0b10);
+
+ assert_eq!(i64x2_bitmask(zero), 0);
+ assert_eq!(i64x2_bitmask(ones), 0b11);
+ assert_eq!(i64x2_bitmask(i64x2_splat(i64::MAX)), 0);
+ assert_eq!(i64x2_bitmask(i64x2_splat(i64::MIN)), 0b11);
+ assert_eq!(i64x2_bitmask(i64x2_replace_lane::<1>(zero, -1)), 0b10);
+ }
+
+ #[test]
+ fn test_narrow() {
+ let zero = i8x16_splat(0);
+ let ones = i8x16_splat(!0);
+
+ compare_bytes(i8x16_narrow_i16x8(zero, zero), zero);
+ compare_bytes(u8x16_narrow_i16x8(zero, zero), zero);
+ compare_bytes(i8x16_narrow_i16x8(ones, ones), ones);
+ compare_bytes(u8x16_narrow_i16x8(ones, ones), zero);
+
+ compare_bytes(
+ i8x16_narrow_i16x8(
+ i16x8(
+ 0,
+ 1,
+ 2,
+ -1,
+ i8::MIN.into(),
+ i8::MAX.into(),
+ u8::MIN.into(),
+ u8::MAX.into(),
+ ),
+ i16x8(
+ i16::MIN.into(),
+ i16::MAX.into(),
+ u16::MIN as i16,
+ u16::MAX as i16,
+ 0,
+ 0,
+ 0,
+ 0,
+ ),
+ ),
+ i8x16(0, 1, 2, -1, -128, 127, 0, 127, -128, 127, 0, -1, 0, 0, 0, 0),
+ );
+
+ compare_bytes(
+ u8x16_narrow_i16x8(
+ i16x8(
+ 0,
+ 1,
+ 2,
+ -1,
+ i8::MIN.into(),
+ i8::MAX.into(),
+ u8::MIN.into(),
+ u8::MAX.into(),
+ ),
+ i16x8(
+ i16::MIN.into(),
+ i16::MAX.into(),
+ u16::MIN as i16,
+ u16::MAX as i16,
+ 0,
+ 0,
+ 0,
+ 0,
+ ),
+ ),
+ i8x16(0, 1, 2, 0, 0, 127, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0),
+ );
+
+ compare_bytes(i16x8_narrow_i32x4(zero, zero), zero);
+ compare_bytes(u16x8_narrow_i32x4(zero, zero), zero);
+ compare_bytes(i16x8_narrow_i32x4(ones, ones), ones);
+ compare_bytes(u16x8_narrow_i32x4(ones, ones), zero);
+
+ compare_bytes(
+ i16x8_narrow_i32x4(
+ i32x4(0, -1, i16::MIN.into(), i16::MAX.into()),
+ i32x4(
+ i32::MIN.into(),
+ i32::MAX.into(),
+ u32::MIN as i32,
+ u32::MAX as i32,
+ ),
+ ),
+ i16x8(0, -1, i16::MIN, i16::MAX, i16::MIN, i16::MAX, 0, -1),
+ );
+
+ compare_bytes(
+ u16x8_narrow_i32x4(
+ i32x4(u16::MAX.into(), -1, i16::MIN.into(), i16::MAX.into()),
+ i32x4(
+ i32::MIN.into(),
+ i32::MAX.into(),
+ u32::MIN as i32,
+ u32::MAX as i32,
+ ),
+ ),
+ i16x8(-1, 0, 0, i16::MAX, 0, -1, 0, 0),
+ );
+ }
+
+ #[test]
+ fn test_extend() {
+ let zero = i8x16_splat(0);
+ let ones = i8x16_splat(!0);
+
+ compare_bytes(i16x8_extend_low_i8x16(zero), zero);
+ compare_bytes(i16x8_extend_high_i8x16(zero), zero);
+ compare_bytes(i16x8_extend_low_u8x16(zero), zero);
+ compare_bytes(i16x8_extend_high_u8x16(zero), zero);
+ compare_bytes(i16x8_extend_low_i8x16(ones), ones);
+ compare_bytes(i16x8_extend_high_i8x16(ones), ones);
+ let halves = u16x8_splat(u8::MAX.into());
+ compare_bytes(i16x8_extend_low_u8x16(ones), halves);
+ compare_bytes(i16x8_extend_high_u8x16(ones), halves);
+
+ compare_bytes(i32x4_extend_low_i16x8(zero), zero);
+ compare_bytes(i32x4_extend_high_i16x8(zero), zero);
+ compare_bytes(i32x4_extend_low_u16x8(zero), zero);
+ compare_bytes(i32x4_extend_high_u16x8(zero), zero);
+ compare_bytes(i32x4_extend_low_i16x8(ones), ones);
+ compare_bytes(i32x4_extend_high_i16x8(ones), ones);
+ let halves = u32x4_splat(u16::MAX.into());
+ compare_bytes(i32x4_extend_low_u16x8(ones), halves);
+ compare_bytes(i32x4_extend_high_u16x8(ones), halves);
+
+ compare_bytes(i64x2_extend_low_i32x4(zero), zero);
+ compare_bytes(i64x2_extend_high_i32x4(zero), zero);
+ compare_bytes(i64x2_extend_low_u32x4(zero), zero);
+ compare_bytes(i64x2_extend_high_u32x4(zero), zero);
+ compare_bytes(i64x2_extend_low_i32x4(ones), ones);
+ compare_bytes(i64x2_extend_high_i32x4(ones), ones);
+ let halves = i64x2_splat(u32::MAX.into());
+ compare_bytes(u64x2_extend_low_u32x4(ones), halves);
+ compare_bytes(u64x2_extend_high_u32x4(ones), halves);
+ }
+
+ #[test]
+ fn test_dot() {
+ let zero = i8x16_splat(0);
+ let ones = i8x16_splat(!0);
+ let two = i32x4_splat(2);
+ compare_bytes(i32x4_dot_i16x8(zero, zero), zero);
+ compare_bytes(i32x4_dot_i16x8(ones, ones), two);
+ }
+
+ macro_rules! test_binop {
+ (
+ $($name:ident => {
+ $([$($vec1:tt)*] ($op:ident | $f:ident) [$($vec2:tt)*],)*
+ })*
+ ) => ($(
+ #[test]
+ fn $name() {
+ unsafe {
+ $(
+ let v1 = [$($vec1)*];
+ let v2 = [$($vec2)*];
+ let v1_v128: v128 = mem::transmute(v1);
+ let v2_v128: v128 = mem::transmute(v2);
+ let v3_v128 = super::$f(v1_v128, v2_v128);
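+ // Bind `v3` with the same array type as `v1`; the initial
+ // value is discarded and only gives the `transmute` below a
+ // concrete target type.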
+ let mut v3 = [$($vec1)*];
+ drop(v3);
+ v3 = mem::transmute(v3_v128);
+
+ for (i, actual) in v3.iter().enumerate() {
+ let expected = v1[i].$op(v2[i]);
+ assert_eq!(*actual, expected);
+ }
+ )*
+ }
+ }
+ )*)
+ }
+
+ macro_rules! test_unop {
+ (
+ $($name:ident => {
+ $(($op:ident | $f:ident) [$($vec1:tt)*],)*
+ })*
+ ) => ($(
+ #[test]
+ fn $name() {
+ unsafe {
+ $(
+ let v1 = [$($vec1)*];
+ let v1_v128: v128 = mem::transmute(v1);
+ let v2_v128 = super::$f(v1_v128);
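+ // Same trick as in `test_binop`: `v2` borrows its array type
+ // from `v1` so the `transmute` has a concrete target type.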
+ let mut v2 = [$($vec1)*];
+ drop(v2);
+ v2 = mem::transmute(v2_v128);
+
+ for (i, actual) in v2.iter().enumerate() {
+ let expected = v1[i].$op();
+ assert_eq!(*actual, expected);
+ }
+ )*
+ }
+ }
+ )*)
+ }
+
+ trait Avgr: Sized {
+ fn avgr(self, other: Self) -> Self;
+ }
+
+ macro_rules! impl_avgr {
+ ($($i:ident)*) => ($(impl Avgr for $i {
+ fn avgr(self, other: Self) -> Self {
+ ((self as u64 + other as u64 + 1) / 2) as $i
+ }
+ })*)
+ }
+
+ impl_avgr!(u8 u16);
+
+ test_binop! {
+ test_i8x16_add => {
+ [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (wrapping_add | i8x16_add)
+ [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (wrapping_add | i8x16_add)
+ [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (wrapping_add | i8x16_add)
+ [127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 9, -24],
+ }
+
+ test_i8x16_add_sat_s => {
+ [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (saturating_add | i8x16_add_sat)
+ [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (saturating_add | i8x16_add_sat)
+ [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (saturating_add | i8x16_add_sat)
+ [127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 9, -24],
+ }
+
+ test_i8x16_add_sat_u => {
+ [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (saturating_add | u8x16_add_sat)
+ [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (saturating_add | u8x16_add_sat)
+ [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (saturating_add | u8x16_add_sat)
+ [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8],
+ }
+
+ test_i8x16_sub => {
+ [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (wrapping_sub | i8x16_sub)
+ [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (wrapping_sub | i8x16_sub)
+ [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (wrapping_sub | i8x16_sub)
+ [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8],
+ }
+
+ test_i8x16_sub_sat_s => {
+ [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (saturating_sub | i8x16_sub_sat)
+ [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (saturating_sub | i8x16_sub_sat)
+ [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (saturating_sub | i8x16_sub_sat)
+ [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8],
+ }
+
+ test_i8x16_sub_sat_u => {
+ [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (saturating_sub | u8x16_sub_sat)
+ [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (saturating_sub | u8x16_sub_sat)
+ [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (saturating_sub | u8x16_sub_sat)
+ [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8],
+ }
+
+ test_i8x16_min_s => {
+ [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (min | i8x16_min)
+ [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (min | i8x16_min)
+ [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (min | i8x16_min)
+ [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8],
+ }
+
+ test_i8x16_min_u => {
+ [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (min | u8x16_min)
+ [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (min | u8x16_min)
+ [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (min | u8x16_min)
+ [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8],
+ }
+
+ test_i8x16_max_s => {
+ [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (max | i8x16_max)
+ [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (max | i8x16_max)
+ [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18],
+
+ [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (max | i8x16_max)
+ [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8],
+ }
+
+ test_i8x16_max_u => {
+ [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (max | u8x16_max)
+ [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (max | u8x16_max)
+ [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (max | u8x16_max)
+ [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8],
+ }
+
+ test_i8x16_avgr_u => {
+ [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ (avgr | u8x16_avgr)
+ [1u8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (avgr | u8x16_avgr)
+ [255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240],
+
+ [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+ (avgr | u8x16_avgr)
+ [127, -44i8 as u8, 43, 126, 4, 2, 9, -3i8 as u8, -59i8 as u8, -43i8 as u8, 39, -69i8 as u8, 79, -3i8 as u8, 9, -24i8 as u8],
+ }
+
+ test_i16x8_add => {
+ [0i16, 0, 0, 0, 0, 0, 0, 0]
+ (wrapping_add | i16x8_add)
+ [1i16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i16, 2, 3, 4, 5, 6, 7, 8]
+ (wrapping_add | i16x8_add)
+            [32767, 8, -2494, -4, 4882, -4, 848, 3830],
+ }
+
+ test_i16x8_add_sat_s => {
+ [0i16, 0, 0, 0, 0, 0, 0, 0]
+ (saturating_add | i16x8_add_sat)
+ [1i16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i16, 2, 3, 4, 5, 6, 7, 8]
+ (saturating_add | i16x8_add_sat)
+            [32767, 8, -2494, -4, 4882, -4, 848, 3830],
+ }
+
+ test_i16x8_add_sat_u => {
+ [0u16, 0, 0, 0, 0, 0, 0, 0]
+ (saturating_add | u16x8_add_sat)
+ [1u16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u16, 2, 3, 4, 5, 6, 7, 8]
+ (saturating_add | u16x8_add_sat)
+            [32767, 8, -2494i16 as u16, -4i16 as u16, 4882, -4i16 as u16, 848, 3830],
+ }
+
+ test_i16x8_sub => {
+ [0i16, 0, 0, 0, 0, 0, 0, 0]
+ (wrapping_sub | i16x8_sub)
+ [1i16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i16, 2, 3, 4, 5, 6, 7, 8]
+ (wrapping_sub | i16x8_sub)
+            [32767, 8, -2494, -4, 4882, -4, 848, 3830],
+ }
+
+ test_i16x8_sub_sat_s => {
+ [0i16, 0, 0, 0, 0, 0, 0, 0]
+ (saturating_sub | i16x8_sub_sat)
+ [1i16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i16, 2, 3, 4, 5, 6, 7, 8]
+ (saturating_sub | i16x8_sub_sat)
+            [32767, 8, -2494, -4, 4882, -4, 848, 3830],
+ }
+
+ test_i16x8_sub_sat_u => {
+ [0u16, 0, 0, 0, 0, 0, 0, 0]
+ (saturating_sub | u16x8_sub_sat)
+ [1u16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u16, 2, 3, 4, 5, 6, 7, 8]
+ (saturating_sub | u16x8_sub_sat)
+            [32767, 8, -2494i16 as u16, -4i16 as u16, 4882, -4i16 as u16, 848, 3830],
+ }
+
+ test_i16x8_mul => {
+ [0i16, 0, 0, 0, 0, 0, 0, 0]
+ (wrapping_mul | i16x8_mul)
+ [1i16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i16, 2, 3, 4, 5, 6, 7, 8]
+ (wrapping_mul | i16x8_mul)
+            [32767, 8, -2494, -4, 4882, -4, 848, 3830],
+ }
+
+ test_i16x8_min_s => {
+ [0i16, 0, 0, 0, 0, 0, 0, 0]
+ (min | i16x8_min)
+ [1i16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i16, 2, 3, 4, 5, 6, 7, 8]
+ (min | i16x8_min)
+            [32767, 8, -2494, -4, 4882, -4, 848, 3830],
+ }
+
+ test_i16x8_min_u => {
+ [0u16, 0, 0, 0, 0, 0, 0, 0]
+ (min | u16x8_min)
+ [1u16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u16, 2, 3, 4, 5, 6, 7, 8]
+ (min | u16x8_min)
+            [32767, 8, -2494i16 as u16, -4i16 as u16, 4882, -4i16 as u16, 848, 3830],
+ }
+
+ test_i16x8_max_s => {
+ [0i16, 0, 0, 0, 0, 0, 0, 0]
+ (max | i16x8_max)
+ [1i16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1i16, 2, 3, 4, 5, 6, 7, 8]
+ (max | i16x8_max)
+            [32767, 8, -2494, -4, 4882, -4, 848, 3830],
+ }
+
+ test_i16x8_max_u => {
+ [0u16, 0, 0, 0, 0, 0, 0, 0]
+ (max | u16x8_max)
+ [1u16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u16, 2, 3, 4, 5, 6, 7, 8]
+ (max | u16x8_max)
+            [32767, 8, -2494i16 as u16, -4i16 as u16, 4882, -4i16 as u16, 848, 3830],
+ }
+
+ test_i16x8_avgr_u => {
+ [0u16, 0, 0, 0, 0, 0, 0, 0]
+ (avgr | u16x8_avgr)
+ [1u16, 1, 1, 1, 1, 1, 1, 1],
+
+ [1u16, 2, 3, 4, 5, 6, 7, 8]
+ (avgr | u16x8_avgr)
+            [32767, 8, -2494i16 as u16, -4i16 as u16, 4882, -4i16 as u16, 848, 3830],
+ }
+
+ test_i32x4_add => {
+ [0i32, 0, 0, 0] (wrapping_add | i32x4_add) [1, 2, 3, 4],
+ [1i32, 1283, i32::MAX, i32::MIN]
+ (wrapping_add | i32x4_add)
+ [i32::MAX; 4],
+ }
+
+ test_i32x4_sub => {
+ [0i32, 0, 0, 0] (wrapping_sub | i32x4_sub) [1, 2, 3, 4],
+ [1i32, 1283, i32::MAX, i32::MIN]
+ (wrapping_sub | i32x4_sub)
+ [i32::MAX; 4],
+ }
+
+ test_i32x4_mul => {
+ [0i32, 0, 0, 0] (wrapping_mul | i32x4_mul) [1, 2, 3, 4],
+ [1i32, 1283, i32::MAX, i32::MIN]
+ (wrapping_mul | i32x4_mul)
+ [i32::MAX; 4],
+ }
+
+ test_i32x4_min_s => {
+ [0i32, 0, 0, 0] (min | i32x4_min) [1, 2, 3, 4],
+ [1i32, 1283, i32::MAX, i32::MIN]
+ (min | i32x4_min)
+ [i32::MAX; 4],
+ }
+
+ test_i32x4_min_u => {
+ [0u32, 0, 0, 0] (min | u32x4_min) [1, 2, 3, 4],
+ [1u32, 1283, i32::MAX as u32, i32::MIN as u32]
+ (min | u32x4_min)
+ [i32::MAX as u32; 4],
+ }
+
+ test_i32x4_max_s => {
+ [0i32, 0, 0, 0] (max | i32x4_max) [1, 2, 3, 4],
+ [1i32, 1283, i32::MAX, i32::MIN]
+ (max | i32x4_max)
+ [i32::MAX; 4],
+ }
+
+ test_i32x4_max_u => {
+ [0u32, 0, 0, 0] (max | u32x4_max) [1, 2, 3, 4],
+ [1u32, 1283, i32::MAX as u32, i32::MIN as u32]
+ (max | u32x4_max)
+ [i32::MAX as u32; 4],
+ }
+
+ test_i64x2_add => {
+ [0i64, 0] (wrapping_add | i64x2_add) [1, 2],
+ [i64::MIN, i64::MAX] (wrapping_add | i64x2_add) [i64::MAX, i64::MIN],
+ [i64::MAX; 2] (wrapping_add | i64x2_add) [i64::MAX; 2],
+ [-4i64, -4] (wrapping_add | i64x2_add) [800, 939],
+ }
+
+ test_i64x2_sub => {
+ [0i64, 0] (wrapping_sub | i64x2_sub) [1, 2],
+ [i64::MIN, i64::MAX] (wrapping_sub | i64x2_sub) [i64::MAX, i64::MIN],
+ [i64::MAX; 2] (wrapping_sub | i64x2_sub) [i64::MAX; 2],
+ [-4i64, -4] (wrapping_sub | i64x2_sub) [800, 939],
+ }
+
+ test_i64x2_mul => {
+ [0i64, 0] (wrapping_mul | i64x2_mul) [1, 2],
+ [i64::MIN, i64::MAX] (wrapping_mul | i64x2_mul) [i64::MAX, i64::MIN],
+ [i64::MAX; 2] (wrapping_mul | i64x2_mul) [i64::MAX; 2],
+ [-4i64, -4] (wrapping_mul | i64x2_mul) [800, 939],
+ }
+
+ test_f32x4_add => {
+ [-1.0f32, 2.0, 3.0, 4.0] (add | f32x4_add) [1., 2., 0., 0.],
+ [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0]
+ (add | f32x4_add)
+ [1., 2., 0., 0.],
+ }
+
+ test_f32x4_sub => {
+ [-1.0f32, 2.0, 3.0, 4.0] (sub | f32x4_sub) [1., 2., 0., 0.],
+ [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0]
+ (sub | f32x4_sub)
+ [1., 2., 0., 0.],
+ }
+
+ test_f32x4_mul => {
+ [-1.0f32, 2.0, 3.0, 4.0] (mul | f32x4_mul) [1., 2., 0., 0.],
+ [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0]
+ (mul | f32x4_mul)
+ [1., 2., 1., 0.],
+ }
+
+ test_f32x4_div => {
+ [-1.0f32, 2.0, 3.0, 4.0] (div | f32x4_div) [1., 2., 0., 0.],
+ [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0]
+ (div | f32x4_div)
+ [1., 2., 0., 0.],
+ }
+
+ test_f32x4_min => {
+ [-1.0f32, 2.0, 3.0, 4.0] (min | f32x4_min) [1., 2., 0., 0.],
+ [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0]
+ (min | f32x4_min)
+ [1., 2., 0., 0.],
+ }
+
+ test_f32x4_max => {
+ [-1.0f32, 2.0, 3.0, 4.0] (max | f32x4_max) [1., 2., 0., 0.],
+ [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0]
+ (max | f32x4_max)
+ [1., 2., 0., 0.],
+ }
+
+ test_f32x4_pmin => {
+ [-1.0f32, 2.0, 3.0, 4.0] (min | f32x4_pmin) [1., 2., 0., 0.],
+ [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0]
+ (min | f32x4_pmin)
+ [1., 2., 0., 0.],
+ }
+
+ test_f32x4_pmax => {
+ [-1.0f32, 2.0, 3.0, 4.0] (max | f32x4_pmax) [1., 2., 0., 0.],
+ [f32::INFINITY, -0.0, f32::NEG_INFINITY, 3.0]
+ (max | f32x4_pmax)
+ [1., 2., 0., 0.],
+ }
+
+ test_f64x2_add => {
+ [-1.0f64, 2.0] (add | f64x2_add) [1., 2.],
+ [f64::INFINITY, f64::NEG_INFINITY] (add | f64x2_add) [1., 2.],
+ }
+
+ test_f64x2_sub => {
+ [-1.0f64, 2.0] (sub | f64x2_sub) [1., 2.],
+ [f64::INFINITY, f64::NEG_INFINITY] (sub | f64x2_sub) [1., 2.],
+ }
+
+ test_f64x2_mul => {
+ [-1.0f64, 2.0] (mul | f64x2_mul) [1., 2.],
+ [f64::INFINITY, f64::NEG_INFINITY] (mul | f64x2_mul) [1., 2.],
+ }
+
+ test_f64x2_div => {
+ [-1.0f64, 2.0] (div | f64x2_div) [1., 2.],
+ [f64::INFINITY, f64::NEG_INFINITY] (div | f64x2_div) [1., 2.],
+ }
+
+ test_f64x2_min => {
+ [-1.0f64, 2.0] (min | f64x2_min) [1., 2.],
+ [f64::INFINITY, f64::NEG_INFINITY] (min | f64x2_min) [1., 2.],
+ }
+
+ test_f64x2_max => {
+ [-1.0f64, 2.0] (max | f64x2_max) [1., 2.],
+ [f64::INFINITY, f64::NEG_INFINITY] (max | f64x2_max) [1., 2.],
+ }
+
+ test_f64x2_pmin => {
+ [-1.0f64, 2.0] (min | f64x2_pmin) [1., 2.],
+ [f64::INFINITY, f64::NEG_INFINITY] (min | f64x2_pmin) [1., 2.],
+ }
+
+ test_f64x2_pmax => {
+ [-1.0f64, 2.0] (max | f64x2_pmax) [1., 2.],
+ [f64::INFINITY, f64::NEG_INFINITY] (max | f64x2_pmax) [1., 2.],
+ }
+ }
+
+ test_unop! {
+ test_i8x16_abs => {
+ (wrapping_abs | i8x16_abs)
+ [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ (wrapping_abs | i8x16_abs)
+ [-2i8, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18],
+
+ (wrapping_abs | i8x16_abs)
+ [-127i8, -44, 43, 126, 4, -128, 127, -59, -43, 39, -69, 79, -3, 35, 83, 13],
+ }
+
+ test_i8x16_neg => {
+ (wrapping_neg | i8x16_neg)
+ [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ (wrapping_neg | i8x16_neg)
+ [-2i8, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18],
+
+ (wrapping_neg | i8x16_neg)
+ [-127i8, -44, 43, 126, 4, -128, 127, -59, -43, 39, -69, 79, -3, 35, 83, 13],
+ }
+
+ test_i16x8_abs => {
+ (wrapping_abs | i16x8_abs) [1i16, 1, 1, 1, 1, 1, 1, 1],
+ (wrapping_abs | i16x8_abs) [2i16, 0x7fff, !0, 4, 42, -5, 33, -4847],
+ }
+
+ test_i16x8_neg => {
+ (wrapping_neg | i16x8_neg) [1i16, 1, 1, 1, 1, 1, 1, 1],
+ (wrapping_neg | i16x8_neg) [2i16, 0x7fff, !0, 4, 42, -5, 33, -4847],
+ }
+
+ test_i32x4_abs => {
+ (wrapping_abs | i32x4_abs) [1i32, 2, 3, 4],
+ (wrapping_abs | i32x4_abs) [i32::MIN, i32::MAX, 0, 4],
+ }
+
+ test_i32x4_neg => {
+ (wrapping_neg | i32x4_neg) [1i32, 2, 3, 4],
+ (wrapping_neg | i32x4_neg) [i32::MIN, i32::MAX, 0, 4],
+ }
+
+ test_i64x2_abs => {
+ (wrapping_abs | i64x2_abs) [1i64, 2],
+ (wrapping_abs | i64x2_abs) [i64::MIN, i64::MAX],
+ }
+
+ test_i64x2_neg => {
+ (wrapping_neg | i64x2_neg) [1i64, 2],
+ (wrapping_neg | i64x2_neg) [i64::MIN, i64::MAX],
+ }
+
+ test_f32x4_ceil => {
+ (ceil | f32x4_ceil) [1.0f32, 2., 2.5, 3.3],
+ (ceil | f32x4_ceil) [0.0, -0.3, f32::INFINITY, -0.0],
+ }
+
+ test_f32x4_floor => {
+ (floor | f32x4_floor) [1.0f32, 2., 2.5, 3.3],
+ (floor | f32x4_floor) [0.0, -0.3, f32::INFINITY, -0.0],
+ }
+
+ test_f32x4_trunc => {
+ (trunc | f32x4_trunc) [1.0f32, 2., 2.5, 3.3],
+ (trunc | f32x4_trunc) [0.0, -0.3, f32::INFINITY, -0.0],
+ }
+
+ test_f32x4_nearest => {
+ (round | f32x4_nearest) [1.0f32, 2., 2.6, 3.3],
+ (round | f32x4_nearest) [0.0, -0.3, f32::INFINITY, -0.0],
+ }
+
+ test_f32x4_abs => {
+ (abs | f32x4_abs) [1.0f32, 2., 2.6, 3.3],
+ (abs | f32x4_abs) [0.0, -0.3, f32::INFINITY, -0.0],
+ }
+
+ test_f32x4_neg => {
+ (neg | f32x4_neg) [1.0f32, 2., 2.6, 3.3],
+ (neg | f32x4_neg) [0.0, -0.3, f32::INFINITY, -0.0],
+ }
+
+ test_f32x4_sqrt => {
+ (sqrt | f32x4_sqrt) [1.0f32, 2., 2.6, 3.3],
+ (sqrt | f32x4_sqrt) [0.0, 0.3, f32::INFINITY, 0.1],
+ }
+
+ test_f64x2_ceil => {
+ (ceil | f64x2_ceil) [1.0f64, 2.3],
+ (ceil | f64x2_ceil) [f64::INFINITY, -0.1],
+ }
+
+ test_f64x2_floor => {
+ (floor | f64x2_floor) [1.0f64, 2.3],
+ (floor | f64x2_floor) [f64::INFINITY, -0.1],
+ }
+
+ test_f64x2_trunc => {
+ (trunc | f64x2_trunc) [1.0f64, 2.3],
+ (trunc | f64x2_trunc) [f64::INFINITY, -0.1],
+ }
+
+ test_f64x2_nearest => {
+ (round | f64x2_nearest) [1.0f64, 2.3],
+ (round | f64x2_nearest) [f64::INFINITY, -0.1],
+ }
+
+ test_f64x2_abs => {
+ (abs | f64x2_abs) [1.0f64, 2.3],
+ (abs | f64x2_abs) [f64::INFINITY, -0.1],
+ }
+
+ test_f64x2_neg => {
+ (neg | f64x2_neg) [1.0f64, 2.3],
+ (neg | f64x2_neg) [f64::INFINITY, -0.1],
+ }
+
+ test_f64x2_sqrt => {
+ (sqrt | f64x2_sqrt) [1.0f64, 2.3],
+ (sqrt | f64x2_sqrt) [f64::INFINITY, 0.1],
+ }
+ }
+
+ macro_rules! floating_point {
+ (f32) => {
+ true
+ };
+ (f64) => {
+ true
+ };
+ ($id:ident) => {
+ false
+ };
+ }
+
+ trait IsNan: Sized {
+ fn is_nan(self) -> bool {
+ false
+ }
+ }
+ impl IsNan for i8 {}
+ impl IsNan for i16 {}
+ impl IsNan for i32 {}
+ impl IsNan for i64 {}
+
+ macro_rules! test_bop {
+ ($id:ident[$ety:ident; $ecount:expr] |
+ $binary_op:ident [$op_test_id:ident] :
+ ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => {
+ test_bop!(
+ $id[$ety; $ecount] => $ety | $binary_op [ $op_test_id ]:
+ ([$($in_a),*], [$($in_b),*]) => [$($out),*]
+ );
+
+ };
+ ($id:ident[$ety:ident; $ecount:expr] => $oty:ident |
+ $binary_op:ident [$op_test_id:ident] :
+ ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => {
+ #[test]
+ fn $op_test_id() {
+ unsafe {
+ let a_input: [$ety; $ecount] = [$($in_a),*];
+ let b_input: [$ety; $ecount] = [$($in_b),*];
+ let output: [$oty; $ecount] = [$($out),*];
+
+ let a_vec_in: v128 = transmute(a_input);
+ let b_vec_in: v128 = transmute(b_input);
+ let vec_res: v128 = $binary_op(a_vec_in, b_vec_in);
+
+ let res: [$oty; $ecount] = transmute(vec_res);
+
+ if !floating_point!($ety) {
+ assert_eq!(res, output);
+ } else {
+ for i in 0..$ecount {
+ let r = res[i];
+ let o = output[i];
+ assert_eq!(r.is_nan(), o.is_nan());
+ if !r.is_nan() {
+ assert_eq!(r, o);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ macro_rules! test_bops {
+ ($id:ident[$ety:ident; $ecount:expr] |
+ $binary_op:ident [$op_test_id:ident]:
+ ([$($in_a:expr),*], $in_b:expr) => [$($out:expr),*]) => {
+ #[test]
+ fn $op_test_id() {
+ unsafe {
+ let a_input: [$ety; $ecount] = [$($in_a),*];
+ let output: [$ety; $ecount] = [$($out),*];
+
+ let a_vec_in: v128 = transmute(a_input);
+ let vec_res: v128 = $binary_op(a_vec_in, $in_b);
+
+ let res: [$ety; $ecount] = transmute(vec_res);
+ assert_eq!(res, output);
+ }
+ }
+ }
+ }
+
+ macro_rules! test_uop {
+ ($id:ident[$ety:ident; $ecount:expr] |
+ $unary_op:ident [$op_test_id:ident]: [$($in_a:expr),*] => [$($out:expr),*]) => {
+ #[test]
+ fn $op_test_id() {
+ unsafe {
+ let a_input: [$ety; $ecount] = [$($in_a),*];
+ let output: [$ety; $ecount] = [$($out),*];
+
+ let a_vec_in: v128 = transmute(a_input);
+ let vec_res: v128 = $unary_op(a_vec_in);
+
+ let res: [$ety; $ecount] = transmute(vec_res);
+ assert_eq!(res, output);
+ }
+ }
+ }
+ }
+
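+
+    // Note: WebAssembly SIMD shifts take the shift amount modulo the lane
+    // width in bits, so the count of 1 below is applied as-is at every width.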
+ test_bops!(i8x16[i8; 16] | i8x16_shl[i8x16_shl_test]:
+ ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+ [0, -2, 4, 6, 8, 10, 12, -2, 2, 2, 2, 2, 2, 2, 2, 2]);
+ test_bops!(i16x8[i16; 8] | i16x8_shl[i16x8_shl_test]:
+ ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) =>
+ [0, -2, 4, 6, 8, 10, 12, -2]);
+ test_bops!(i32x4[i32; 4] | i32x4_shl[i32x4_shl_test]:
+ ([0, -1, 2, 3], 1) => [0, -2, 4, 6]);
+ test_bops!(i64x2[i64; 2] | i64x2_shl[i64x2_shl_test]:
+ ([0, -1], 1) => [0, -2]);
+
+ test_bops!(i8x16[i8; 16] | i8x16_shr[i8x16_shr_s_test]:
+ ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+ [0, -1, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]);
+ test_bops!(i16x8[i16; 8] | i16x8_shr[i16x8_shr_s_test]:
+ ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) =>
+ [0, -1, 1, 1, 2, 2, 3, i16::MAX / 2]);
+ test_bops!(i32x4[i32; 4] | i32x4_shr[i32x4_shr_s_test]:
+ ([0, -1, 2, 3], 1) => [0, -1, 1, 1]);
+ test_bops!(i64x2[i64; 2] | i64x2_shr[i64x2_shr_s_test]:
+ ([0, -1], 1) => [0, -1]);
+
+    test_bops!(i8x16[i8; 16] | u8x16_shr[i8x16_shr_u_test]:
+ ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+ [0, i8::MAX, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]);
+    test_bops!(i16x8[i16; 8] | u16x8_shr[i16x8_shr_u_test]:
+ ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) =>
+ [0, i16::MAX, 1, 1, 2, 2, 3, i16::MAX / 2]);
+    test_bops!(i32x4[i32; 4] | u32x4_shr[i32x4_shr_u_test]:
+ ([0, -1, 2, 3], 1) => [0, i32::MAX, 1, 1]);
+    test_bops!(i64x2[i64; 2] | u64x2_shr[i64x2_shr_u_test]:
+ ([0, -1], 1) => [0, i64::MAX]);
+
+ #[test]
+ fn v128_bitwise_logical_ops() {
+ unsafe {
+ let a: [u32; 4] = [u32::MAX, 0, u32::MAX, 0];
+ let b: [u32; 4] = [u32::MAX; 4];
+ let c: [u32; 4] = [0; 4];
+
+ let vec_a: v128 = transmute(a);
+ let vec_b: v128 = transmute(b);
+ let vec_c: v128 = transmute(c);
+
+ let r: v128 = v128_and(vec_a, vec_a);
+ compare_bytes(r, vec_a);
+ let r: v128 = v128_and(vec_a, vec_b);
+ compare_bytes(r, vec_a);
+ let r: v128 = v128_andnot(vec_a, vec_b);
+ compare_bytes(r, vec_c);
+ let r: v128 = v128_andnot(vec_a, vec_a);
+ compare_bytes(r, vec_c);
+ let r: v128 = v128_andnot(vec_a, vec_c);
+ compare_bytes(r, vec_a);
+ let r: v128 = v128_or(vec_a, vec_b);
+ compare_bytes(r, vec_b);
+ let r: v128 = v128_not(vec_b);
+ compare_bytes(r, vec_c);
+ let r: v128 = v128_xor(vec_a, vec_c);
+ compare_bytes(r, vec_a);
+
+ let r: v128 = v128_bitselect(vec_b, vec_c, vec_b);
+ compare_bytes(r, vec_b);
+ let r: v128 = v128_bitselect(vec_b, vec_c, vec_c);
+ compare_bytes(r, vec_c);
+ let r: v128 = v128_bitselect(vec_b, vec_c, vec_a);
+ compare_bytes(r, vec_a);
+ }
+ }
+
+ macro_rules! test_bool_red {
+ ([$test_id:ident, $any:ident, $all:ident] | [$($true:expr),*] | [$($false:expr),*] | [$($alt:expr),*]) => {
+ #[test]
+ fn $test_id() {
+ unsafe {
+ let vec_a: v128 = transmute([$($true),*]); // true
+ let vec_b: v128 = transmute([$($false),*]); // false
+ let vec_c: v128 = transmute([$($alt),*]); // alternating
+
+ // TODO
+ // assert_eq!($any(vec_a), true);
+ // assert_eq!($any(vec_b), false);
+ // assert_eq!($any(vec_c), true);
+
+ assert_eq!($all(vec_a), true);
+ assert_eq!($all(vec_b), false);
+ assert_eq!($all(vec_c), false);
+ }
+ }
+ }
+ }
+
+ test_bool_red!(
+ [i8x16_boolean_reductions, v128_any_true, i8x16_all_true]
+ | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+ | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
+ );
+ test_bool_red!(
+ [i16x8_boolean_reductions, v128_any_true, i16x8_all_true]
+ | [1_i16, 1, 1, 1, 1, 1, 1, 1]
+ | [0_i16, 0, 0, 0, 0, 0, 0, 0]
+ | [1_i16, 0, 1, 0, 1, 0, 1, 0]
+ );
+ test_bool_red!(
+ [i32x4_boolean_reductions, v128_any_true, i32x4_all_true]
+ | [1_i32, 1, 1, 1]
+ | [0_i32, 0, 0, 0]
+ | [1_i32, 0, 1, 0]
+ );
+ test_bool_red!(
+ [i64x2_boolean_reductions, v128_any_true, i64x2_all_true]
+ | [1_i64, 1]
+ | [0_i64, 0]
+ | [1_i64, 0]
+ );
+
+ test_bop!(i8x16[i8; 16] | i8x16_eq[i8x16_eq_test]:
+ ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [-1, 0, -1, 0, -1, 0, -1, -1, -1, 0, -1, 0, -1, 0, -1, -1]);
+ test_bop!(i16x8[i16; 8] | i16x8_eq[i16x8_eq_test]:
+ ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [-1, 0, -1, 0, -1, 0, -1, -1]);
+ test_bop!(i32x4[i32; 4] | i32x4_eq[i32x4_eq_test]:
+ ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]);
+ test_bop!(i64x2[i64; 2] | i64x2_eq[i64x2_eq_test]:
+ ([0, 1], [0, 2]) => [-1, 0]);
+ test_bop!(f32x4[f32; 4] => i32 | f32x4_eq[f32x4_eq_test]:
+ ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]);
+ test_bop!(f64x2[f64; 2] => i64 | f64x2_eq[f64x2_eq_test]: ([0., 1.], [0., 2.]) => [-1, 0]);
+
+ test_bop!(i8x16[i8; 16] | i8x16_ne[i8x16_ne_test]:
+ ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0]);
+ test_bop!(i16x8[i16; 8] | i16x8_ne[i16x8_ne_test]:
+ ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [0, -1, 0, -1, 0, -1, 0, 0]);
+ test_bop!(i32x4[i32; 4] | i32x4_ne[i32x4_ne_test]:
+ ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]);
+ test_bop!(i64x2[i64; 2] | i64x2_ne[i64x2_ne_test]:
+ ([0, 1], [0, 2]) => [0, -1]);
+ test_bop!(f32x4[f32; 4] => i32 | f32x4_ne[f32x4_ne_test]:
+ ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]);
+ test_bop!(f64x2[f64; 2] => i64 | f64x2_ne[f64x2_ne_test]: ([0., 1.], [0., 2.]) => [0, -1]);
+
+ test_bop!(i8x16[i8; 16] | i8x16_lt[i8x16_lt_s_test]:
+ ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -12, 13, 14, 15],
+ [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, -1, -1, 0, 0]);
+ test_bop!(i8x16[i8; 16] | u8x16_lt[i8x16_lt_u_test]:
+ ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -12, 13, 14, 15],
+ [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0]);
+ test_bop!(i16x8[i16; 8] | i16x8_lt[i16x8_lt_s_test]:
+ ([0, 1, 2, 3, 4, 5, 6, -7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [0, -1, 0, -1, 0, -1, 0, -1]);
+ test_bop!(i16x8[i16; 8] | u16x8_lt[i16x8_lt_u_test]:
+ ([0, 1, 2, 3, 4, 5, 6, -7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [0, -1, 0, -1, 0, -1, 0, 0]);
+ test_bop!(i32x4[i32; 4] | i32x4_lt[i32x4_lt_s_test]:
+ ([-1, 1, 2, 3], [0, 2, 2, 4]) => [-1, -1, 0, -1]);
+ test_bop!(i32x4[i32; 4] | u32x4_lt[i32x4_lt_u_test]:
+ ([-1, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]);
+ test_bop!(i64x2[i64; 2] | i64x2_lt[i64x2_lt_s_test]:
+ ([-1, 3], [0, 2]) => [-1, 0]);
+ test_bop!(f32x4[f32; 4] => i32 | f32x4_lt[f32x4_lt_test]:
+ ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]);
+ test_bop!(f64x2[f64; 2] => i64 | f64x2_lt[f64x2_lt_test]: ([0., 1.], [0., 2.]) => [0, -1]);
+
+ test_bop!(i8x16[i8; 16] | i8x16_gt[i8x16_gt_s_test]:
+ ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, -15],
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) =>
+              [0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0]);
+ test_bop!(i8x16[i8; 16] | u8x16_gt[i8x16_gt_u_test]:
+ ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, -15],
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) =>
+              [0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1]);
+ test_bop!(i16x8[i16; 8] | i16x8_gt[i16x8_gt_s_test]:
+ ([0, 2, 2, 4, 4, 6, 6, -7], [0, 1, 2, 3, 4, 5, 6, 7]) =>
+              [0, -1, 0, -1, 0, -1, 0, 0]);
+ test_bop!(i16x8[i16; 8] | u16x8_gt[i16x8_gt_u_test]:
+ ([0, 2, 2, 4, 4, 6, 6, -7], [0, 1, 2, 3, 4, 5, 6, 7]) =>
+              [0, -1, 0, -1, 0, -1, 0, -1]);
+ test_bop!(i32x4[i32; 4] | i32x4_gt[i32x4_gt_s_test]:
+ ([0, 2, 2, -4], [0, 1, 2, 3]) => [0, -1, 0, 0]);
+ test_bop!(i32x4[i32; 4] | u32x4_gt[i32x4_gt_u_test]:
+ ([0, 2, 2, -4], [0, 1, 2, 3]) => [0, -1, 0, -1]);
+ test_bop!(i64x2[i64; 2] | i64x2_gt[i64x2_gt_s_test]:
+ ([-1, 2], [0, 1]) => [0, -1]);
+ test_bop!(f32x4[f32; 4] => i32 | f32x4_gt[f32x4_gt_test]:
+ ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [0, -1, 0, -1]);
+ test_bop!(f64x2[f64; 2] => i64 | f64x2_gt[f64x2_gt_test]: ([0., 2.], [0., 1.]) => [0, -1]);
+
+ test_bop!(i8x16[i8; 16] | i8x16_ge[i8x16_ge_s_test]:
+ ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -15],
+ [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [-1, 0, -1, 0, -1, 0, -1, -1, -1, 0, -1, 0, -1, 0, -1, 0]);
+ test_bop!(i8x16[i8; 16] | u8x16_ge[i8x16_ge_u_test]:
+ ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -15],
+ [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [-1, 0, -1, 0, -1, 0, -1, -1, -1, 0, -1, 0, -1, 0, -1, -1]);
+ test_bop!(i16x8[i16; 8] | i16x8_ge[i16x8_ge_s_test]:
+ ([0, 1, 2, 3, 4, 5, 6, -7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [-1, 0, -1, 0, -1, 0, -1, 0]);
+ test_bop!(i16x8[i16; 8] | u16x8_ge[i16x8_ge_u_test]:
+ ([0, 1, 2, 3, 4, 5, 6, -7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [-1, 0, -1, 0, -1, 0, -1, -1]);
+ test_bop!(i32x4[i32; 4] | i32x4_ge[i32x4_ge_s_test]:
+ ([0, 1, 2, -3], [0, 2, 2, 4]) => [-1, 0, -1, 0]);
+ test_bop!(i32x4[i32; 4] | u32x4_ge[i32x4_ge_u_test]:
+ ([0, 1, 2, -3], [0, 2, 2, 4]) => [-1, 0, -1, -1]);
+ test_bop!(i64x2[i64; 2] | i64x2_ge[i64x2_ge_s_test]:
+ ([0, 1], [-1, 2]) => [-1, 0]);
+ test_bop!(f32x4[f32; 4] => i32 | f32x4_ge[f32x4_ge_test]:
+ ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]);
+ test_bop!(f64x2[f64; 2] => i64 | f64x2_ge[f64x2_ge_test]: ([0., 1.], [0., 2.]) => [-1, 0]);
+
+ test_bop!(i8x16[i8; 16] | i8x16_le[i8x16_le_s_test]:
+ ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, -15],
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+ ) =>
+               [-1, 0, -1, 0, -1, 0, -1, -1, -1, 0, -1, 0, -1, 0, -1, -1]);
+ test_bop!(i8x16[i8; 16] | u8x16_le[i8x16_le_u_test]:
+ ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, -15],
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+ ) =>
+               [-1, 0, -1, 0, -1, 0, -1, -1, -1, 0, -1, 0, -1, 0, -1, 0]);
+ test_bop!(i16x8[i16; 8] | i16x8_le[i16x8_le_s_test]:
+ ([0, 2, 2, 4, 4, 6, 6, -7], [0, 1, 2, 3, 4, 5, 6, 7]) =>
+               [-1, 0, -1, 0, -1, 0, -1, -1]);
+ test_bop!(i16x8[i16; 8] | u16x8_le[i16x8_le_u_test]:
+ ([0, 2, 2, 4, 4, 6, 6, -7], [0, 1, 2, 3, 4, 5, 6, 7]) =>
+               [-1, 0, -1, 0, -1, 0, -1, 0]);
+ test_bop!(i32x4[i32; 4] | i32x4_le[i32x4_le_s_test]:
+ ([0, 2, 2, -4], [0, 1, 2, 3]) => [-1, 0, -1, -1]);
+ test_bop!(i32x4[i32; 4] | u32x4_le[i32x4_le_u_test]:
+ ([0, 2, 2, -4], [0, 1, 2, 3]) => [-1, 0, -1, 0]);
+ test_bop!(i64x2[i64; 2] | i64x2_le[i64x2_le_s_test]:
+ ([0, 2], [0, 1]) => [-1, 0]);
+ test_bop!(f32x4[f32; 4] => i32 | f32x4_le[f32x4_le_test]:
+                  ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, 0]);
+ test_bop!(f64x2[f64; 2] => i64 | f64x2_le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]);
+
+ test_uop!(f32x4[f32; 4] | f32x4_neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]);
+ test_uop!(f32x4[f32; 4] | f32x4_abs[f32x4_abs_test]: [0., -1., 2., -3.] => [ 0., 1., 2., 3.]);
+ test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test]:
+ ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., -3., -4., 8.]);
+ test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test_nan]:
+ ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN])
+ => [0., -3., -4., std::f32::NAN]);
+ test_bop!(f32x4[f32; 4] | f32x4_max[f32x4_max_test]:
+ ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -1., 7., 10.]);
+ test_bop!(f32x4[f32; 4] | f32x4_max[f32x4_max_test_nan]:
+ ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN])
+ => [1., -1., 7., std::f32::NAN]);
+ test_bop!(f32x4[f32; 4] | f32x4_add[f32x4_add_test]:
+ ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -4., 3., 18.]);
+ test_bop!(f32x4[f32; 4] | f32x4_sub[f32x4_sub_test]:
+ ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [-1., 2., 11., -2.]);
+ test_bop!(f32x4[f32; 4] | f32x4_mul[f32x4_mul_test]:
+ ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., 3., -28., 80.]);
+ test_bop!(f32x4[f32; 4] | f32x4_div[f32x4_div_test]:
+ ([0., -8., 70., 8.], [1., 4., 10., 2.]) => [0., -2., 7., 4.]);
+
+ test_uop!(f64x2[f64; 2] | f64x2_neg[f64x2_neg_test]: [0., 1.] => [ 0., -1.]);
+ test_uop!(f64x2[f64; 2] | f64x2_abs[f64x2_abs_test]: [0., -1.] => [ 0., 1.]);
+ test_bop!(f64x2[f64; 2] | f64x2_min[f64x2_min_test]:
+ ([0., -1.], [1., -3.]) => [0., -3.]);
+ test_bop!(f64x2[f64; 2] | f64x2_min[f64x2_min_test_nan]:
+ ([7., 8.], [-4., std::f64::NAN])
+ => [ -4., std::f64::NAN]);
+ test_bop!(f64x2[f64; 2] | f64x2_max[f64x2_max_test]:
+ ([0., -1.], [1., -3.]) => [1., -1.]);
+ test_bop!(f64x2[f64; 2] | f64x2_max[f64x2_max_test_nan]:
+ ([7., 8.], [ -4., std::f64::NAN])
+ => [7., std::f64::NAN]);
+ test_bop!(f64x2[f64; 2] | f64x2_add[f64x2_add_test]:
+ ([0., -1.], [1., -3.]) => [1., -4.]);
+ test_bop!(f64x2[f64; 2] | f64x2_sub[f64x2_sub_test]:
+ ([0., -1.], [1., -3.]) => [-1., 2.]);
+ test_bop!(f64x2[f64; 2] | f64x2_mul[f64x2_mul_test]:
+ ([0., -1.], [1., -3.]) => [0., 3.]);
+ test_bop!(f64x2[f64; 2] | f64x2_div[f64x2_div_test]:
+ ([0., -8.], [1., 4.]) => [0., -2.]);
+
+ macro_rules! test_conv {
+ ($test_id:ident | $conv_id:ident | $to_ty:ident | $from:expr, $to:expr) => {
+ #[test]
+ fn $test_id() {
+ unsafe {
+ let from: v128 = transmute($from);
+ let to: v128 = transmute($to);
+
+ let r: v128 = $conv_id(from);
+
+ compare_bytes(r, to);
+ }
+ }
+ };
+ }
+
+ test_conv!(
+ f32x4_convert_s_i32x4 | f32x4_convert_i32x4 | f32x4 | [1_i32, 2, 3, 4],
+ [1_f32, 2., 3., 4.]
+ );
+ test_conv!(
+ f32x4_convert_u_i32x4 | f32x4_convert_u32x4 | f32x4 | [u32::MAX, 2, 3, 4],
+ [u32::MAX as f32, 2., 3., 4.]
+ );
+
+ #[test]
+ fn test_conversions() {
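+        // The `trunc_sat` conversions saturate instead of trapping: NaN maps
+        // to 0, and -inf/+inf clamp to the target type's MIN/MAX.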
+ compare_bytes(
+ i32x4_trunc_sat_f32x4(f32x4(1., f32::NEG_INFINITY, f32::INFINITY, f32::NAN)),
+ i32x4(1, i32::MIN, i32::MAX, 0),
+ );
+ compare_bytes(
+ u32x4_trunc_sat_f32x4(f32x4(1., f32::NEG_INFINITY, f32::INFINITY, f32::NAN)),
+ u32x4(1, 0, u32::MAX, 0),
+ );
+ compare_bytes(f64x2_convert_low_i32x4(i32x4(1, 2, 3, 4)), f64x2(1., 2.));
+ compare_bytes(
+ f64x2_convert_low_i32x4(i32x4(i32::MIN, i32::MAX, 3, 4)),
+ f64x2(f64::from(i32::MIN), f64::from(i32::MAX)),
+ );
+ compare_bytes(f64x2_convert_low_u32x4(u32x4(1, 2, 3, 4)), f64x2(1., 2.));
+ compare_bytes(
+ f64x2_convert_low_u32x4(u32x4(u32::MIN, u32::MAX, 3, 4)),
+ f64x2(f64::from(u32::MIN), f64::from(u32::MAX)),
+ );
+
+ compare_bytes(
+ i32x4_trunc_sat_f64x2_zero(f64x2(1., f64::NEG_INFINITY)),
+ i32x4(1, i32::MIN, 0, 0),
+ );
+ compare_bytes(
+ i32x4_trunc_sat_f64x2_zero(f64x2(f64::NAN, f64::INFINITY)),
+ i32x4(0, i32::MAX, 0, 0),
+ );
+ compare_bytes(
+ u32x4_trunc_sat_f64x2_zero(f64x2(1., f64::NEG_INFINITY)),
+ u32x4(1, 0, 0, 0),
+ );
+ compare_bytes(
+ u32x4_trunc_sat_f64x2_zero(f64x2(f64::NAN, f64::INFINITY)),
+ u32x4(0, u32::MAX, 0, 0),
+ );
+ }
+
+ #[test]
+ fn test_popcnt() {
+ unsafe {
+ for i in 0..=255 {
+ compare_bytes(
+ i8x16_popcnt(u8x16_splat(i)),
+ u8x16_splat(i.count_ones() as u8),
+ )
+ }
+
+ let vectors = [
+ [0u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [
+ 100, 200, 50, 0, 10, 7, 38, 185, 192, 3, 34, 85, 93, 7, 31, 99,
+ ],
+ ];
+
+ for vector in vectors.iter() {
+ compare_bytes(
+ i8x16_popcnt(transmute(*vector)),
+ i8x16(
+ vector[0].count_ones() as i8,
+ vector[1].count_ones() as i8,
+ vector[2].count_ones() as i8,
+ vector[3].count_ones() as i8,
+ vector[4].count_ones() as i8,
+ vector[5].count_ones() as i8,
+ vector[6].count_ones() as i8,
+ vector[7].count_ones() as i8,
+ vector[8].count_ones() as i8,
+ vector[9].count_ones() as i8,
+ vector[10].count_ones() as i8,
+ vector[11].count_ones() as i8,
+ vector[12].count_ones() as i8,
+ vector[13].count_ones() as i8,
+ vector[14].count_ones() as i8,
+ vector[15].count_ones() as i8,
+ ),
+ )
+ }
+ }
+ }
+
+ #[test]
+ fn test_promote_demote() {
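+        // `demote` narrows each f64 lane to f32 and zeroes the two upper
+        // lanes; `promote_low` widens the two low f32 lanes back to f64.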
+ let tests = [
+ [1., 2.],
+ [f64::NAN, f64::INFINITY],
+ [100., 201.],
+ [0., -0.],
+ [f64::NEG_INFINITY, 0.],
+ ];
+
+ for [a, b] in tests {
+ compare_bytes(
+ f32x4_demote_f64x2_zero(f64x2(a, b)),
+ f32x4(a as f32, b as f32, 0., 0.),
+ );
+ compare_bytes(
+ f64x2_promote_low_f32x4(f32x4(a as f32, b as f32, 0., 0.)),
+ f64x2(a, b),
+ );
+ }
+ }
+
+ #[test]
+ fn test_extmul() {
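+        // `extmul` widens each lane before multiplying, so no product can
+        // overflow: the `low` variants use the lower half of the input lanes
+        // and the `high` variants the upper half.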
+ macro_rules! test {
+ ($(
+ $ctor:ident {
+ from: $from:ident,
+ to: $to:ident,
+ low: $low:ident,
+ high: $high:ident,
+ } => {
+ $(([$($a:tt)*] * [$($b:tt)*]))*
+ }
+ )*) => ($(
+ $(unsafe {
+ let a: [$from; 16 / mem::size_of::<$from>()] = [$($a)*];
+ let b: [$from; 16 / mem::size_of::<$from>()] = [$($b)*];
+ let low = mem::transmute::<_, [$to; 16 / mem::size_of::<$to>()]>($low($ctor($($a)*), $ctor($($b)*)));
+ let high = mem::transmute::<_, [$to; 16 / mem::size_of::<$to>()]>($high($ctor($($a)*), $ctor($($b)*)));
+
+ let half = a.len() / 2;
+ for i in 0..half {
+ assert_eq!(
+ (a[i] as $to).wrapping_mul((b[i] as $to)),
+ low[i],
+ "expected {} * {}", a[i] as $to, b[i] as $to,
+ );
+ assert_eq!(
+ (a[half + i] as $to).wrapping_mul((b[half + i] as $to)),
+ high[i],
+ "expected {} * {}", a[half + i] as $to, b[half + i] as $to,
+ );
+ }
+ })*
+ )*)
+ }
+ test! {
+ i8x16 {
+ from: i8,
+ to: i16,
+ low: i16x8_extmul_low_i8x16,
+ high: i16x8_extmul_high_i8x16,
+ } => {
+ (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ *
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ )
+ (
+ [-1, -2, 3, 100, 124, -38, 33, 87, 92, 108, 22, 8, -43, -128, 22, 0]
+ *
+ [-5, -2, 6, 10, 45, -4, 4, -2, 0, 88, 92, -102, -98, 83, 73, 54]
+ )
+ }
+ u8x16 {
+ from: u8,
+ to: u16,
+ low: u16x8_extmul_low_u8x16,
+ high: u16x8_extmul_high_u8x16,
+ } => {
+ (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ *
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ )
+ (
+ [1, 2, 3, 100, 124, 38, 33, 87, 92, 198, 22, 8, 43, 128, 22, 0]
+ *
+ [5, 200, 6, 10, 45, 248, 4, 2, 0, 2, 92, 102, 234, 83, 73, 54]
+ )
+ }
+ i16x8 {
+ from: i16,
+ to: i32,
+ low: i32x4_extmul_low_i16x8,
+ high: i32x4_extmul_high_i16x8,
+ } => {
+ (
+ [0, 0, 0, 0, 0, 0, 0, 0]
+ *
+ [0, 0, 0, 0, 0, 0, 0, 0]
+ )
+ (
+ [-1, 0, i16::MAX, 19931, -2259, 64, 200, 87]
+ *
+ [1, 1, i16::MIN, 29391, 105, 2, 100, -2]
+ )
+ }
+ u16x8 {
+ from: u16,
+ to: u32,
+ low: u32x4_extmul_low_u16x8,
+ high: u32x4_extmul_high_u16x8,
+ } => {
+ (
+ [0, 0, 0, 0, 0, 0, 0, 0]
+ *
+ [0, 0, 0, 0, 0, 0, 0, 0]
+ )
+ (
+ [1, 0, u16::MAX, 19931, 2259, 64, 200, 87]
+ *
+ [1, 1, 3, 29391, 105, 2, 100, 2]
+ )
+ }
+ i32x4 {
+ from: i32,
+ to: i64,
+ low: i64x2_extmul_low_i32x4,
+ high: i64x2_extmul_high_i32x4,
+ } => {
+ (
+ [0, 0, 0, 0]
+ *
+ [0, 0, 0, 0]
+ )
+ (
+ [-1, 0, i32::MAX, 19931]
+ *
+ [1, 1, i32::MIN, 29391]
+ )
+ (
+ [i32::MAX, 3003183, 3 << 20, 0xffffff]
+ *
+ [i32::MAX, i32::MIN, -40042, 300]
+ )
+ }
+ u32x4 {
+ from: u32,
+ to: u64,
+ low: u64x2_extmul_low_u32x4,
+ high: u64x2_extmul_high_u32x4,
+ } => {
+ (
+ [0, 0, 0, 0]
+ *
+ [0, 0, 0, 0]
+ )
+ (
+ [1, 0, u32::MAX, 19931]
+ *
+ [1, 1, 3, 29391]
+ )
+ (
+ [u32::MAX, 3003183, 3 << 20, 0xffffff]
+ *
+ [u32::MAX, 3000, 40042, 300]
+ )
+ }
+ }
+ }
+
+ #[test]
+ fn test_q15mulr_sat_s() {
+ fn test(a: [i16; 8], b: [i16; 8]) {
+ let a_v = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+ let b_v = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
+ let result = i16x8_q15mulr_sat(a_v, b_v);
+ let result = unsafe { mem::transmute::<v128, [i16; 8]>(result) };
+
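+        // Q15 reference result: multiply as i32, add 0x4000 (= 1 << 14) so
+        // the following `>> 15` rounds to nearest instead of truncating.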
+ for (i, (a, b)) in a.iter().zip(&b).enumerate() {
+ assert_eq!(
+ result[i],
+ (((*a as i32) * (*b as i32) + 0x4000) >> 15) as i16
+ );
+ }
+ }
+
+ test([0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]);
+ test([1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]);
+ test(
+ [-1, 100, 2003, -29494, 12, 128, 994, 1],
+ [-4049, 8494, -10483, 0, 5, 2222, 883, -9],
+ );
+ }
+
+ #[test]
+ fn test_extadd() {
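+        // `extadd_pairwise` widens each adjacent pair of lanes and adds the
+        // pair, halving the lane count while doubling the lane width.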
+ macro_rules! test {
+ ($(
+ $func:ident {
+ from: $from:ident,
+ to: $to:ident,
+ } => {
+ $([$($a:tt)*])*
+ }
+ )*) => ($(
+ $(unsafe {
+ let a: [$from; 16 / mem::size_of::<$from>()] = [$($a)*];
+ let a_v = mem::transmute::<_, v128>(a);
+ let r = mem::transmute::<v128, [$to; 16 / mem::size_of::<$to>()]>($func(a_v));
+
+ let half = a.len() / 2;
+ for i in 0..half {
+ assert_eq!(
+ (a[2 * i] as $to).wrapping_add((a[2 * i + 1] as $to)),
+ r[i],
+ "failed {} + {} != {}",
+ a[2 * i] as $to,
+ a[2 * i + 1] as $to,
+ r[i],
+ );
+ }
+ })*
+ )*)
+ }
+ test! {
+ i16x8_extadd_pairwise_i8x16 {
+ from: i8,
+ to: i16,
+ } => {
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ [-1, -2, 3, 100, 124, -38, 33, 87, 92, 108, 22, 8, -43, -128, 22, 0]
+ [-5, -2, 6, 10, 45, -4, 4, -2, 0, 88, 92, -102, -98, 83, 73, 54]
+ }
+ i16x8_extadd_pairwise_u8x16 {
+ from: u8,
+ to: i16,
+ } => {
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ [1, 2, 3, 100, 124, 38, 33, 87, 92, 198, 22, 8, 43, 128, 22, 0]
+ [5, 200, 6, 10, 45, 248, 4, 2, 0, 2, 92, 102, 234, 83, 73, 54]
+ }
+ i32x4_extadd_pairwise_i16x8 {
+ from: i16,
+ to: i32,
+ } => {
+ [0, 0, 0, 0, 0, 0, 0, 0]
+ [-1, 0, i16::MAX, 19931, -2259, 64, 200, 87]
+ [1, 1, i16::MIN, 29391, 105, 2, 100, -2]
+ }
+ i32x4_extadd_pairwise_u16x8 {
+ from: u16,
+ to: i32,
+ } => {
+ [0, 0, 0, 0, 0, 0, 0, 0]
+ [1, 0, u16::MAX, 19931, 2259, 64, 200, 87]
+ [1, 1, 3, 29391, 105, 2, 100, 2]
+ }
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/abm.rs b/library/stdarch/crates/core_arch/src/x86/abm.rs
new file mode 100644
index 000000000..50912f774
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/abm.rs
@@ -0,0 +1,62 @@
+//! Advanced Bit Manipulation (ABM) instructions
+//!
+//! The POPCNT and LZCNT instructions have their own CPUID bits to indicate
+//! support.
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Counts the number of leading zero bits, starting from the most
+/// significant bit.
+///
+/// When the operand is zero, the result is the operand's width in bits (32).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_lzcnt_u32)
+#[inline]
+#[target_feature(enable = "lzcnt")]
+#[cfg_attr(test, assert_instr(lzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _lzcnt_u32(x: u32) -> u32 {
+ x.leading_zeros()
+}
+
+/// Counts the bits that are set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_popcnt32)
+#[inline]
+#[target_feature(enable = "popcnt")]
+#[cfg_attr(test, assert_instr(popcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _popcnt32(x: i32) -> i32 {
+ x.count_ones() as i32
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "lzcnt")]
+ unsafe fn test_lzcnt_u32() {
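+        // The highest set bit of 0b0101_1010 is bit 6, so the remaining
+        // 32 - 7 = 25 high bits are leading zeros.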
+ assert_eq!(_lzcnt_u32(0b0101_1010), 25);
+ }
+
+ #[simd_test(enable = "popcnt")]
+ unsafe fn test_popcnt32() {
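+        // 0b0101_1010 has four bits set (bits 1, 3, 4, and 6).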
+ assert_eq!(_popcnt32(0b0101_1010), 4);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/adx.rs b/library/stdarch/crates/core_arch/src/x86/adx.rs
new file mode 100644
index 000000000..15fcec8a5
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/adx.rs
@@ -0,0 +1,158 @@
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ #[link_name = "llvm.x86.addcarry.32"]
+ fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32);
+ #[link_name = "llvm.x86.addcarryx.u32"]
+ fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u8) -> u8;
+ #[link_name = "llvm.x86.subborrow.32"]
+ fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32);
+}
+
+/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in
+/// `c_in` (carry flag), stores the unsigned 32-bit result in `out`, and
+/// returns the carry-out (carry or overflow flag).
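+///
+/// # Examples
+///
+/// A minimal sketch (marked `ignore` since it assumes an x86 target;
+/// `add_u64_as_limbs` is a hypothetical helper, not part of this crate): the
+/// carry-out of the low limbs feeds the carry-in of the high limbs, adding
+/// 64 bits in 32-bit steps.
+///
+/// ```ignore
+/// #[cfg(target_arch = "x86")]
+/// use core::arch::x86::_addcarry_u32;
+/// #[cfg(target_arch = "x86_64")]
+/// use core::arch::x86_64::_addcarry_u32;
+///
+/// unsafe fn add_u64_as_limbs(a: u64, b: u64) -> u64 {
+///     let (mut lo, mut hi) = (0u32, 0u32);
+///     // Low limbs first; the returned carry feeds the high limbs.
+///     let carry = _addcarry_u32(0, a as u32, b as u32, &mut lo);
+///     _addcarry_u32(carry, (a >> 32) as u32, (b >> 32) as u32, &mut hi);
+///     ((hi as u64) << 32) | lo as u64
+/// }
+///
+/// unsafe { assert_eq!(add_u64_as_limbs(u32::MAX as u64, 1), 1 << 32) };
+/// ```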
+#[inline]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+ let (a, b) = llvm_addcarry_u32(c_in, a, b);
+ *out = b;
+ a
+}
+
+/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in
+/// `c_in` (carry or overflow flag), stores the unsigned 32-bit result in
+/// `out`, and returns the carry-out (carry or overflow flag).
+#[inline]
+#[target_feature(enable = "adx")]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+ llvm_addcarryx_u32(c_in, a, b, out as *mut _ as *mut u8)
+}
+
+/// Subtracts unsigned 32-bit integer `b` from `a` with unsigned 8-bit
+/// borrow-in `c_in` (carry flag), stores the unsigned 32-bit result in
+/// `out`, and returns the borrow-out (carry flag).
+#[inline]
+#[cfg_attr(test, assert_instr(sbb))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+ let (a, b) = llvm_subborrow_u32(c_in, a, b);
+ *out = b;
+ a
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[test]
+ fn test_addcarry_u32() {
+ unsafe {
+ let a = u32::MAX;
+ let mut out = 0;
+
+ let r = _addcarry_u32(0, a, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 0);
+
+ let r = _addcarry_u32(0, a, 0, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, a);
+
+ let r = _addcarry_u32(1, a, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 1);
+
+ let r = _addcarry_u32(1, a, 0, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 0);
+
+ let r = _addcarry_u32(0, 3, 4, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 7);
+
+ let r = _addcarry_u32(1, 3, 4, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 8);
+ }
+ }
+
+ #[simd_test(enable = "adx")]
+ unsafe fn test_addcarryx_u32() {
+ let a = u32::MAX;
+ let mut out = 0;
+
+ let r = _addcarryx_u32(0, a, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 0);
+
+ let r = _addcarryx_u32(0, a, 0, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, a);
+
+ let r = _addcarryx_u32(1, a, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 1);
+
+ let r = _addcarryx_u32(1, a, 0, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 0);
+
+ let r = _addcarryx_u32(0, 3, 4, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 7);
+
+ let r = _addcarryx_u32(1, 3, 4, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 8);
+ }
+
+ #[simd_test(enable = "adx")]
+ unsafe fn test_addcarryx_u32_2() {
+ unsafe fn add_1_2_3() -> u32 {
+ let mut out = 0;
+ _addcarryx_u32(1, 2, 3, &mut out);
+ out
+ }
+ assert_eq!(6, add_1_2_3());
+ }
+
+ #[test]
+ fn test_subborrow_u32() {
+ unsafe {
+ let a = u32::MAX;
+ let mut out = 0;
+
+ let r = _subborrow_u32(0, 0, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, a);
+
+ let r = _subborrow_u32(0, 0, 0, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 0);
+
+ let r = _subborrow_u32(1, 0, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, a - 1);
+
+ let r = _subborrow_u32(1, 0, 0, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, a);
+
+ let r = _subborrow_u32(0, 7, 3, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 4);
+
+ let r = _subborrow_u32(1, 7, 3, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 3);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/aes.rs b/library/stdarch/crates/core_arch/src/x86/aes.rs
new file mode 100644
index 000000000..ffded1a0d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/aes.rs
@@ -0,0 +1,171 @@
+//! AES New Instructions (AES-NI)
+//!
+//! The intrinsics here correspond to those in the `wmmintrin.h` C header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use crate::core_arch::x86::__m128i;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.aesni.aesdec"]
+ fn aesdec(a: __m128i, round_key: __m128i) -> __m128i;
+ #[link_name = "llvm.x86.aesni.aesdeclast"]
+ fn aesdeclast(a: __m128i, round_key: __m128i) -> __m128i;
+ #[link_name = "llvm.x86.aesni.aesenc"]
+ fn aesenc(a: __m128i, round_key: __m128i) -> __m128i;
+ #[link_name = "llvm.x86.aesni.aesenclast"]
+ fn aesenclast(a: __m128i, round_key: __m128i) -> __m128i;
+ #[link_name = "llvm.x86.aesni.aesimc"]
+ fn aesimc(a: __m128i) -> __m128i;
+ #[link_name = "llvm.x86.aesni.aeskeygenassist"]
+ fn aeskeygenassist(a: __m128i, imm8: u8) -> __m128i;
+}
+
+/// Performs one round of an AES decryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesdec_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesdec))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesdec_si128(a: __m128i, round_key: __m128i) -> __m128i {
+ aesdec(a, round_key)
+}
+
+/// Performs the last round of an AES decryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesdeclast_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesdeclast))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesdeclast_si128(a: __m128i, round_key: __m128i) -> __m128i {
+ aesdeclast(a, round_key)
+}
+
+/// Performs one round of an AES encryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenc_si128)
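+///
+/// # Examples
+///
+/// A rough sketch of how the round instructions compose into a full AES-128
+/// block encryption (illustrative only: `aes128_encrypt_block` is a
+/// hypothetical helper and `round_keys` is assumed to already hold the 11
+/// expanded round keys):
+///
+/// ```ignore
+/// unsafe fn aes128_encrypt_block(block: __m128i, round_keys: &[__m128i; 11]) -> __m128i {
+///     // Initial AddRoundKey (key whitening).
+///     let mut state = _mm_xor_si128(block, round_keys[0]);
+///     // Rounds 1 through 9: SubBytes, ShiftRows, MixColumns, AddRoundKey.
+///     for rk in &round_keys[1..10] {
+///         state = _mm_aesenc_si128(state, *rk);
+///     }
+///     // The final round skips MixColumns.
+///     _mm_aesenclast_si128(state, round_keys[10])
+/// }
+/// ```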
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesenc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesenc_si128(a: __m128i, round_key: __m128i) -> __m128i {
+ aesenc(a, round_key)
+}
+
+/// Performs the last round of an AES encryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesenclast))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i {
+ aesenclast(a, round_key)
+}
+
+/// Performs the `InvMixColumns` transformation on `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesimc_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aesimc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aesimc_si128(a: __m128i) -> __m128i {
+ aesimc(a)
+}
+
+/// Assists in expanding the AES cipher key.
+///
+/// Computes steps towards generating a round key for an encryption cipher
+/// using data from `a` and an 8-bit round constant `IMM8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aeskeygenassist_si128)
+#[inline]
+#[target_feature(enable = "aes")]
+#[cfg_attr(test, assert_instr(aeskeygenassist, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_aeskeygenassist_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ aeskeygenassist(a, IMM8 as u8)
+}
+
+#[cfg(test)]
+mod tests {
+ // The constants in the tests below are just bit patterns. They should not
+ // be interpreted as integers; signedness does not make sense for them, but
+ // __m128i happens to be defined in terms of signed integers.
+ #![allow(overflowing_literals)]
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "aes")]
+ unsafe fn test_mm_aesdec_si128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx.
+ let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+ let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+ let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee);
+ let r = _mm_aesdec_si128(a, k);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "aes")]
+ unsafe fn test_mm_aesdeclast_si128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx.
+ let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+ let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+ let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493);
+ let r = _mm_aesdeclast_si128(a, k);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "aes")]
+ unsafe fn test_mm_aesenc_si128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx.
+ let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+ let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+ let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333);
+ let r = _mm_aesenc_si128(a, k);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "aes")]
+ unsafe fn test_mm_aesenclast_si128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx.
+ let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+ let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+ let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8);
+ let r = _mm_aesenclast_si128(a, k);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "aes")]
+ unsafe fn test_mm_aesimc_si128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc714195.aspx.
+ let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+ let e = _mm_set_epi64x(0xc66c82284ee40aa0, 0x6633441122770055);
+ let r = _mm_aesimc_si128(a);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "aes")]
+ unsafe fn test_mm_aeskeygenassist_si128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc714138.aspx.
+ let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+ let e = _mm_set_epi64x(0x857c266b7c266e85, 0xeac4eea9c4eeacea);
+ let r = _mm_aeskeygenassist_si128::<5>(a);
+ assert_eq_m128i(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs
new file mode 100644
index 000000000..ad9e68db6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx.rs
@@ -0,0 +1,4862 @@
+//! Advanced Vector Extensions (AVX)
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wiki] provides a quick overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ intrinsics,
+ mem::{self, transmute},
+ ptr,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Adds packed double-precision (64-bit) floating-point elements
+/// in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
+ simd_add(a, b)
+}
+
+/// Adds packed single-precision (32-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vaddps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
+ simd_add(a, b)
+}
+
+/// Computes the bitwise AND of packed double-precision (64-bit)
+/// floating-point elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// FIXME: Should be 'vandpd' instruction.
+// See https://github.com/rust-lang/stdarch/issues/71
+#[cfg_attr(test, assert_instr(vandps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
+ let a: u64x4 = transmute(a);
+ let b: u64x4 = transmute(b);
+ transmute(simd_and(a, b))
+}
+
+/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vandps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
+ let a: u32x8 = transmute(a);
+ let b: u32x8 = transmute(b);
+ transmute(simd_and(a, b))
+}
+
+/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// FIXME: should be `vorpd` instruction.
+// See <https://github.com/rust-lang/stdarch/issues/71>.
+#[cfg_attr(test, assert_instr(vorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
+ let a: u64x4 = transmute(a);
+ let b: u64x4 = transmute(b);
+ transmute(simd_or(a, b))
+}
+
+/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
+ let a: u32x8 = transmute(a);
+ let b: u32x8 = transmute(b);
+ transmute(simd_or(a, b))
+}
+
+/// Shuffles double-precision (64-bit) floating-point elements in `a` and `b`
+/// within 128-bit lanes using the control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
+ static_assert_imm8!(MASK);
+ simd_shuffle4!(
+ a,
+ b,
+ <const MASK: i32> [
+ MASK as u32 & 0b1,
+ ((MASK as u32 >> 1) & 0b1) + 4,
+ ((MASK as u32 >> 2) & 0b1) + 2,
+ ((MASK as u32 >> 3) & 0b1) + 6,
+ ],
+ )
+}
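+
+// Illustrative sketch (not part of the upstream source): each of the four
+// mask bits picks the low or high element of a 64-bit pair, with even result
+// lanes drawn from `a` and odd result lanes from `b`:
+//
+//     let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+//     let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
+//     let r = _mm256_shuffle_pd::<0b0101>(a, b); // [2.0, 5.0, 4.0, 7.0]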
+
+/// Shuffles single-precision (32-bit) floating-point elements in `a` and `b`
+/// within 128-bit lanes using the control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
+ static_assert_imm8!(MASK);
+ simd_shuffle8!(
+ a,
+ b,
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ ((MASK as u32 >> 4) & 0b11) + 8,
+ ((MASK as u32 >> 6) & 0b11) + 8,
+ (MASK as u32 & 0b11) + 4,
+ ((MASK as u32 >> 2) & 0b11) + 4,
+ ((MASK as u32 >> 4) & 0b11) + 12,
+ ((MASK as u32 >> 6) & 0b11) + 12,
+ ],
+ )
+}
+
+/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
+/// elements in `a`, and then AND with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// FIXME: should be `vandnpd` instruction.
+#[cfg_attr(test, assert_instr(vandnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
+ let a: u64x4 = transmute(a);
+ let b: u64x4 = transmute(b);
+ transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
+}
+
+/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
+/// elements in `a`, and then AND with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vandnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
+ let a: u32x8 = transmute(a);
+ let b: u32x8 = transmute(b);
+ transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
+}
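+
+// Illustrative sketch (not part of the upstream source): `(!a) & b` clears in
+// `b` every bit that is set in `a`; a common idiom is masking off the sign
+// bit to compute an absolute value:
+//
+//     let x = _mm256_setr_ps(-1.0, 2.0, -3.0, 4.0, -5.0, 6.0, -7.0, 8.0);
+//     let abs = _mm256_andnot_ps(_mm256_set1_ps(-0.0), x);
+//     // abs == [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]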
+
+/// Compares packed double-precision (64-bit) floating-point elements
+/// in `a` and `b`, and returns packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
+ vmaxpd(a, b)
+}
+
+/// Compares packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and returns packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
+ vmaxps(a, b)
+}
+
+/// Compares packed double-precision (64-bit) floating-point elements
+/// in `a` and `b`, and returns packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vminpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
+ vminpd(a, b)
+}
+
+/// Compares packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and returns packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vminps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
+ vminps(a, b)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements
+/// in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
+ simd_mul(a, b)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmulps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
+ simd_mul(a, b)
+}
+
+/// Alternately adds and subtracts packed double-precision (64-bit)
+/// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_addsub_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vaddsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
+ addsubpd256(a, b)
+}
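+
+// Illustrative sketch (not part of the upstream source): even result lanes
+// subtract and odd result lanes add:
+//
+//     let a = _mm256_setr_pd(10.0, 10.0, 10.0, 10.0);
+//     let b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+//     let r = _mm256_addsub_pd(a, b); // [9.0, 12.0, 7.0, 14.0]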
+
+/// Alternately adds and subtracts packed single-precision (32-bit)
+/// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_addsub_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vaddsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
+ addsubps256(a, b)
+}
+
+/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
+/// from packed elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
+ simd_sub(a, b)
+}
+
+/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
+/// from packed elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
+ simd_sub(a, b)
+}
+
+/// Computes the division of each of the 8 packed single-precision (32-bit)
+/// floating-point elements in `a` by the corresponding packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vdivps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
+ simd_div(a, b)
+}
+
+/// Computes the division of each of the 4 packed double-precision (64-bit)
+/// floating-point elements in `a` by the corresponding packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
+ simd_div(a, b)
+}
+
+/// Rounds packed double-precision (64-bit) floating-point elements in `a`
+/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
+///
+/// - `0x00`: Round to the nearest whole number.
+/// - `0x01`: Round down, toward negative infinity.
+/// - `0x02`: Round up, toward positive infinity.
+/// - `0x03`: Truncate the values.
+///
+/// For a complete list of options, check [the LLVM docs][llvm_docs].
+///
+/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_round_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
+ static_assert_imm4!(ROUNDING);
+ roundpd256(a, ROUNDING)
+}
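+
+// Illustrative sketch (not part of the upstream source): the four modes listed
+// above behave as follows (mode 0x00 resolves ties to even):
+//
+//     let a = _mm256_setr_pd(2.5, -1.5, 3.7, -3.7);
+//     let nearest = _mm256_round_pd::<0x00>(a); // [2.0, -2.0, 4.0, -4.0]
+//     let down    = _mm256_round_pd::<0x01>(a); // [2.0, -2.0, 3.0, -4.0]
+//     let up      = _mm256_round_pd::<0x02>(a); // [3.0, -1.0, 4.0, -3.0]
+//     let trunc   = _mm256_round_pd::<0x03>(a); // [2.0, -1.0, 3.0, -3.0]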
+
+/// Rounds packed double-precision (64-bit) floating-point elements in `a`
+/// toward positive infinity.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ceil_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
+ simd_ceil(a)
+}
+
+/// Rounds packed double-precision (64-bit) floating-point elements in `a`
+/// toward negative infinity.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_floor_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
+ simd_floor(a)
+}
+
+/// Rounds packed single-precision (32-bit) floating-point elements in `a`
+/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
+///
+/// - `0x00`: Round to the nearest whole number.
+/// - `0x01`: Round down, toward negative infinity.
+/// - `0x02`: Round up, toward positive infinity.
+/// - `0x03`: Truncate the values.
+///
+/// For a complete list of options, check [the LLVM docs][llvm_docs].
+///
+/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_round_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
+ static_assert_imm4!(ROUNDING);
+ roundps256(a, ROUNDING)
+}
+
+/// Rounds packed single-precision (32-bit) floating-point elements in `a`
+/// toward positive infinity.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ceil_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
+ simd_ceil(a)
+}
+
+/// Rounds packed single-precision (32-bit) floating-point elements in `a`
+/// toward negative infinity.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_floor_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vroundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
+ simd_floor(a)
+}
+
+/// Returns the square root of packed single-precision (32-bit) floating-point
+/// elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
+ sqrtps256(a)
+}
+
+/// Returns the square root of packed double-precision (64-bit) floating-point
+/// elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
+ simd_fsqrt(a)
+}
+
+/// Blends packed double-precision (64-bit) floating-point elements from
+/// `a` and `b` using control mask `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// Note: LLVM7 prefers single-precision blend instructions when
+// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
+// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
+#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
+ static_assert_imm4!(IMM4);
+ simd_shuffle4!(
+ a,
+ b,
+ <const IMM4: i32> [
+ ((IMM4 as u32 >> 0) & 1) * 4 + 0,
+ ((IMM4 as u32 >> 1) & 1) * 4 + 1,
+ ((IMM4 as u32 >> 2) & 1) * 4 + 2,
+ ((IMM4 as u32 >> 3) & 1) * 4 + 3,
+ ],
+ )
+}
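+
+// Illustrative sketch (not part of the upstream source): bit `i` of the mask
+// selects lane `i` from `b` when set and from `a` when clear:
+//
+//     let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+//     let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
+//     let r = _mm256_blend_pd::<0b0110>(a, b); // [1.0, 6.0, 7.0, 4.0]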
+
+/// Blends packed single-precision (32-bit) floating-point elements from
+/// `a` and `b` using control mask `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
+ static_assert_imm8!(IMM8);
+ simd_shuffle8!(
+ a,
+ b,
+ <const IMM8: i32> [
+ ((IMM8 as u32 >> 0) & 1) * 8 + 0,
+ ((IMM8 as u32 >> 1) & 1) * 8 + 1,
+ ((IMM8 as u32 >> 2) & 1) * 8 + 2,
+ ((IMM8 as u32 >> 3) & 1) * 8 + 3,
+ ((IMM8 as u32 >> 4) & 1) * 8 + 4,
+ ((IMM8 as u32 >> 5) & 1) * 8 + 5,
+ ((IMM8 as u32 >> 6) & 1) * 8 + 6,
+ ((IMM8 as u32 >> 7) & 1) * 8 + 7,
+ ],
+ )
+}
+
+/// Blends packed double-precision (64-bit) floating-point elements from
+/// `a` and `b` using `c` as a mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vblendvpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ vblendvpd(a, b, c)
+}
+
+/// Blends packed single-precision (32-bit) floating-point elements from
+/// `a` and `b` using `c` as a mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vblendvps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+ vblendvps(a, b, c)
+}
+
+/// Conditionally multiplies the packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` using the high 4 bits in `imm8`,
+/// sums the four products, and conditionally returns the sum
+/// using the low 4 bits of `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dp_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
+ static_assert_imm8!(IMM8);
+ vdpps(a, b, IMM8)
+}
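+
+// Illustrative sketch (not part of the upstream source): `vdpps` works on each
+// 128-bit lane independently, so with `IMM8 = 0xFF` every element of a lane
+// receives the dot product of the corresponding four-element halves:
+//
+//     let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0);
+//     let b = _mm256_set1_ps(1.0);
+//     let r = _mm256_dp_ps::<0xFF>(a, b);
+//     // r == [10.0, 10.0, 10.0, 10.0, 4.0, 4.0, 4.0, 4.0]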
+
+/// Horizontal addition of adjacent pairs in the two packed vectors
+/// of 4 double-precision (64-bit) floating-point elements `a` and `b`.
+/// In the result, sums of elements from `a` are returned in even locations,
+/// while sums of elements from `b` are returned in odd locations.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vhaddpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
+ vhaddpd(a, b)
+}
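+
+// Illustrative sketch (not part of the upstream source): with the interleaved
+// layout described above,
+//
+//     let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+//     let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
+//     let r = _mm256_hadd_pd(a, b); // [3.0, 30.0, 7.0, 70.0]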
+
+/// Horizontal addition of adjacent pairs in the two packed vectors
+/// of 8 single-precision (32-bit) floating-point elements `a` and `b`.
+/// In the result, sums of elements from `a` are returned in locations
+/// 0, 1, 4, 5, while sums of elements from `b` are returned in locations
+/// 2, 3, 6, 7.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vhaddps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
+ vhaddps(a, b)
+}
+
+/// Horizontal subtraction of adjacent pairs in the two packed vectors
+/// of 4 double-precision (64-bit) floating-point elements `a` and `b`.
+/// In the result, differences of elements from `a` are returned in even
+/// locations, while differences of elements from `b` are returned in odd
+/// locations.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vhsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
+ vhsubpd(a, b)
+}
+
+/// Horizontal subtraction of adjacent pairs in the two packed vectors
+/// of 8 single-precision (32-bit) floating-point elements `a` and `b`.
+/// In the result, differences of elements from `a` are returned in locations
+/// 0, 1, 4, 5, while differences of elements from `b` are returned in
+/// locations 2, 3, 6, 7.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vhsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
+ vhsubps(a, b)
+}
+
+/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// FIXME Should be 'vxorpd' instruction.
+#[cfg_attr(test, assert_instr(vxorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
+ let a: u64x4 = transmute(a);
+ let b: u64x4 = transmute(b);
+ transmute(simd_xor(a, b))
+}
+
+/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
+ let a: u32x8 = transmute(a);
+ let b: u32x8 = transmute(b);
+ transmute(simd_xor(a, b))
+}
+
+/// Equal (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_EQ_OQ: i32 = 0x00;
+/// Less-than (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_LT_OS: i32 = 0x01;
+/// Less-than-or-equal (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_LE_OS: i32 = 0x02;
+/// Unordered (non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_UNORD_Q: i32 = 0x03;
+/// Not-equal (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NEQ_UQ: i32 = 0x04;
+/// Not-less-than (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NLT_US: i32 = 0x05;
+/// Not-less-than-or-equal (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NLE_US: i32 = 0x06;
+/// Ordered (non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_ORD_Q: i32 = 0x07;
+/// Equal (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_EQ_UQ: i32 = 0x08;
+/// Not-greater-than-or-equal (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NGE_US: i32 = 0x09;
+/// Not-greater-than (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NGT_US: i32 = 0x0a;
+/// False (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_FALSE_OQ: i32 = 0x0b;
+/// Not-equal (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NEQ_OQ: i32 = 0x0c;
+/// Greater-than-or-equal (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_GE_OS: i32 = 0x0d;
+/// Greater-than (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_GT_OS: i32 = 0x0e;
+/// True (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_TRUE_UQ: i32 = 0x0f;
+/// Equal (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_EQ_OS: i32 = 0x10;
+/// Less-than (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_LT_OQ: i32 = 0x11;
+/// Less-than-or-equal (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_LE_OQ: i32 = 0x12;
+/// Unordered (signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_UNORD_S: i32 = 0x13;
+/// Not-equal (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NEQ_US: i32 = 0x14;
+/// Not-less-than (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NLT_UQ: i32 = 0x15;
+/// Not-less-than-or-equal (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NLE_UQ: i32 = 0x16;
+/// Ordered (signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_ORD_S: i32 = 0x17;
+/// Equal (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_EQ_US: i32 = 0x18;
+/// Not-greater-than-or-equal (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NGE_UQ: i32 = 0x19;
+/// Not-greater-than (unordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NGT_UQ: i32 = 0x1a;
+/// False (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_FALSE_OS: i32 = 0x1b;
+/// Not-equal (ordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_NEQ_OS: i32 = 0x1c;
+/// Greater-than-or-equal (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_GE_OQ: i32 = 0x1d;
+/// Greater-than (ordered, non-signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_GT_OQ: i32 = 0x1e;
+/// True (unordered, signaling)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _CMP_TRUE_US: i32 = 0x1f;
+
+/// Compares packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the comparison operand
+/// specified by `IMM5`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_pd)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_imm5!(IMM5);
+ vcmppd(a, b, IMM5 as i8)
+}
+
+/// Compares packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the comparison operand
+/// specified by `IMM5`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
+ static_assert_imm5!(IMM5);
+ vcmppd256(a, b, IMM5 as u8)
+}
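+
+// Illustrative sketch (not part of the upstream source): each result lane is
+// all-ones where the predicate holds and all-zeros elsewhere, which pairs
+// naturally with `_mm256_blendv_pd` or `_mm256_movemask_pd`:
+//
+//     let a = _mm256_setr_pd(1.0, 5.0, 3.0, 7.0);
+//     let b = _mm256_setr_pd(2.0, 4.0, 6.0, 6.0);
+//     let lt = _mm256_cmp_pd::<_CMP_LT_OQ>(a, b);
+//     let bits = _mm256_movemask_pd(lt); // 0b0101: lanes 0 and 2 have a < b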
+
+/// Compares packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the comparison operand
+/// specified by `IMM5`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ps)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_imm5!(IMM5);
+ vcmpps(a, b, IMM5 as i8)
+}
+
+/// Compares packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the comparison operand
+/// specified by `IMM5`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
+ static_assert_imm5!(IMM5);
+ vcmpps256(a, b, IMM5 as u8)
+}
+
+/// Compares the lower double-precision (64-bit) floating-point element in
+/// `a` and `b` based on the comparison operand specified by `IMM5`,
+/// stores the result in the lower element of the returned vector,
+/// and copies the upper element from `a` to the upper element of the
+/// returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_imm5!(IMM5);
+ vcmpsd(a, b, IMM5 as i8)
+}
+
+/// Compares the lower single-precision (32-bit) floating-point element in
+/// `a` and `b` based on the comparison operand specified by `IMM5`,
+/// stores the result in the lower element of the returned vector,
+/// and copies the upper 3 packed elements from `a` to the upper elements of
+/// the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_imm5!(IMM5);
+ vcmpss(a, b, IMM5 as i8)
+}
+
+/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
+ simd_cast(a.as_i32x4())
+}
+
+/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
+ vcvtdq2ps(a.as_i32x8())
+}
+
+/// Converts packed double-precision (64-bit) floating-point elements in `a`
+/// to packed single-precision (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
+ vcvtpd2ps(a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a`
+/// to packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
+ transmute(vcvtps2dq(a))
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a`
+/// to packed double-precision (64-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtps2pd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d {
+ simd_cast(a)
+}
+
+/// Converts packed double-precision (64-bit) floating-point elements in `a`
+/// to packed 32-bit integers with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttpd_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
+ transmute(vcvttpd2dq(a))
+}
+
+/// Converts packed double-precision (64-bit) floating-point elements in `a`
+/// to packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
+ transmute(vcvtpd2dq(a))
+}
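+
+// Illustrative sketch (not part of the upstream source): the `t` variant
+// truncates toward zero, while the plain variant uses the current rounding
+// mode (round-to-nearest-even by default):
+//
+//     let a = _mm256_setr_pd(1.7, -1.7, 2.5, -2.5);
+//     let t = _mm256_cvttpd_epi32(a); // [1, -1, 2, -2]
+//     let r = _mm256_cvtpd_epi32(a);  // [2, -2, 2, -2]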
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a`
+/// to packed 32-bit integers with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttps_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
+ transmute(vcvttps2dq(a))
+}
+
+/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
+/// floating-point elements) from `a`, selected with `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
+ static_assert_imm1!(IMM1);
+ simd_shuffle4!(
+ a,
+ _mm256_undefined_ps(),
+ <const IMM1: i32> [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
+ )
+}
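+
+// Illustrative sketch (not part of the upstream source): `IMM1` selects the
+// low (0) or high (1) 128-bit half:
+//
+//     let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+//     let lo = _mm256_extractf128_ps::<0>(a); // [1.0, 2.0, 3.0, 4.0]
+//     let hi = _mm256_extractf128_ps::<1>(a); // [5.0, 6.0, 7.0, 8.0]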
+
+/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from `a`, selected with `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
+ static_assert_imm1!(IMM1);
+ simd_shuffle2!(a, _mm256_undefined_pd(), <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize])
+}
+
+/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
+ static_assert_imm1!(IMM1);
+ let dst: i64x2 = simd_shuffle2!(
+ a.as_i64x4(),
+ _mm256_undefined_si256().as_i64x4(),
+ <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize],
+ );
+ transmute(dst)
+}
+
+/// Zeroes the contents of all XMM or YMM registers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zeroall)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vzeroall))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zeroall() {
+ vzeroall()
+}
+
+/// Zeroes the upper 128 bits of all YMM registers;
+/// the lower 128 bits of the registers are unmodified.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zeroupper)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vzeroupper))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zeroupper() {
+ vzeroupper()
+}
+
+/// Shuffles single-precision (32-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
+ vpermilps256(a, b.as_i32x8())
+}
+
+/// Shuffles single-precision (32-bit) floating-point elements in `a`
+/// using the control in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutevar_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
+ vpermilps(a, b.as_i32x4())
+}
+
+/// Shuffles single-precision (32-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
+ static_assert_imm8!(IMM8);
+ simd_shuffle8!(
+ a,
+ _mm256_undefined_ps(),
+ <const IMM8: i32> [
+ (IMM8 as u32 >> 0) & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ ((IMM8 as u32 >> 0) & 0b11) + 4,
+ ((IMM8 as u32 >> 2) & 0b11) + 4,
+ ((IMM8 as u32 >> 4) & 0b11) + 4,
+ ((IMM8 as u32 >> 6) & 0b11) + 4,
+ ],
+ )
+}
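+
+// Illustrative sketch (not part of the upstream source): each two-bit field of
+// `IMM8` selects one element within each 128-bit lane, so `0b00_01_10_11`
+// (0x1B) reverses both lanes:
+//
+//     let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+//     let r = _mm256_permute_ps::<0x1B>(a);
+//     // r == [4.0, 3.0, 2.0, 1.0, 8.0, 7.0, 6.0, 5.0]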
+
+/// Shuffles single-precision (32-bit) floating-point elements in `a`
+/// using the control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_ps)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
+ static_assert_imm8!(IMM8);
+ simd_shuffle4!(
+ a,
+ _mm_undefined_ps(),
+ <const IMM8: i32> [
+ (IMM8 as u32 >> 0) & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ ],
+ )
+}
+
+/// Shuffles double-precision (64-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
+ vpermilpd256(a, b.as_i64x4())
+}
+
+/// Shuffles double-precision (64-bit) floating-point elements in `a`
+/// using the control in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutevar_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
+ vpermilpd(a, b.as_i64x2())
+}
+
+/// Shuffles double-precision (64-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpermilpd, IMM4 = 0x1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
+ static_assert_imm4!(IMM4);
+ simd_shuffle4!(
+ a,
+ _mm256_undefined_pd(),
+ <const IMM4: i32> [
+ ((IMM4 as u32 >> 0) & 1),
+ ((IMM4 as u32 >> 1) & 1),
+ ((IMM4 as u32 >> 2) & 1) + 2,
+ ((IMM4 as u32 >> 3) & 1) + 2,
+ ],
+ )
+}
+
+/// Shuffles double-precision (64-bit) floating-point elements in `a`
+/// using the control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_pd)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0x1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
+ static_assert_imm2!(IMM2);
+ simd_shuffle2!(
+ a,
+ _mm_undefined_pd(),
+ <const IMM2: i32> [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
+ )
+}
+
+/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) selected by `imm8` from `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
+ static_assert_imm8!(IMM8);
+ vperm2f128ps256(a, b, IMM8 as i8)
+}
+
+/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) selected by `imm8` from `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
+ static_assert_imm8!(IMM8);
+ vperm2f128pd256(a, b, IMM8 as i8)
+}
+
+/// Shuffles 128 bits (composed of integer data) selected by `imm8`
+/// from `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8))
+}
+
+/// Broadcasts a single-precision (32-bit) floating-point element from memory
+/// to all elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_ss)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::trivially_copy_pass_by_ref)]
+pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 {
+ _mm256_set1_ps(*f)
+}
+
+/// Broadcasts a single-precision (32-bit) floating-point element from memory
+/// to all elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcast_ss)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::trivially_copy_pass_by_ref)]
+pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 {
+ _mm_set1_ps(*f)
+}
+
+/// Broadcasts a double-precision (64-bit) floating-point element from memory
+/// to all elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_sd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::trivially_copy_pass_by_ref)]
+pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d {
+ _mm256_set1_pd(*f)
+}
+
+/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
+/// (32-bit) floating-point elements) to all elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
+ vbroadcastf128ps256(a)
+}
+
+/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
+/// (64-bit) floating-point elements) to all elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vbroadcastf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
+ vbroadcastf128pd256(a)
+}
+
+/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
+/// single-precision (32-bit) floating-point elements) from `b` into result
+/// at the location specified by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsertf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
+ static_assert_imm1!(IMM1);
+ simd_shuffle8!(
+ a,
+ _mm256_castps128_ps256(b),
+ <const IMM1: i32> [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
+ )
+}
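+
+// Illustrative sketch (not part of the upstream source): a common way to build
+// a 256-bit vector from two 128-bit halves; the cast leaves the upper half
+// undefined, and the insert then overwrites it:
+//
+//     let lo = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+//     let hi = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+//     let v = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo), hi);
+//     // v == [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]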
+
+/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
+/// double-precision (64-bit) floating-point elements) from `b` into result
+/// at the location specified by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsertf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
+ static_assert_imm1!(IMM1);
+ simd_shuffle4!(
+ a,
+ _mm256_castpd128_pd256(b),
+ <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+ )
+}
+
+/// Copies `a` to result, then inserts 128 bits from `b` into result
+/// at the location specified by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsertf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
+ static_assert_imm1!(IMM1);
+ let dst: i64x4 = simd_shuffle4!(
+ a.as_i64x4(),
+ _mm256_castsi128_si256(b).as_i64x4(),
+ <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+ );
+ transmute(dst)
+}
+
+/// Copies `a` to result, and inserts the 8-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi8)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
+ static_assert_imm5!(INDEX);
+ transmute(simd_insert(a.as_i8x32(), INDEX as u32, i))
+}
+
+/// Copies `a` to result, and inserts the 16-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi16)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
+ static_assert_imm4!(INDEX);
+ transmute(simd_insert(a.as_i16x16(), INDEX as u32, i))
+}
+
+/// Copies `a` to result, and inserts the 32-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
+ static_assert_imm3!(INDEX);
+ transmute(simd_insert(a.as_i32x8(), INDEX as u32, i))
+}
+
+/// Loads 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
+ *(mem_addr as *const __m256d)
+}
+
+/// Stores 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
+ *(mem_addr as *mut __m256d) = a;
+}
+
+/// Loads 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
+ *(mem_addr as *const __m256)
+}
+
+/// Stores 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
+ *(mem_addr as *mut __m256) = a;
+}
+
+/// Loads 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
+ let mut dst = _mm256_undefined_pd();
+ ptr::copy_nonoverlapping(
+ mem_addr as *const u8,
+ &mut dst as *mut __m256d as *mut u8,
+ mem::size_of::<__m256d>(),
+ );
+ dst
+}
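+
+// Illustrative sketch (not part of the upstream source): the unaligned load
+// can read straight from an ordinary array or slice:
+//
+//     let data: [f64; 4] = [1.0, 2.0, 3.0, 4.0];
+//     let v = _mm256_loadu_pd(data.as_ptr()); // [1.0, 2.0, 3.0, 4.0]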
+
+/// Stores 256 bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
+ storeupd256(mem_addr, a);
+}
+
+/// Loads 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
+ let mut dst = _mm256_undefined_ps();
+ ptr::copy_nonoverlapping(
+ mem_addr as *const u8,
+ &mut dst as *mut __m256 as *mut u8,
+ mem::size_of::<__m256>(),
+ );
+ dst
+}
+
+/// Stores 256 bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
+ storeups256(mem_addr, a);
+}
+
+/// Loads 256 bits of integer data from memory into result.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
+ *mem_addr
+}
+
+/// Stores 256 bits of integer data from `a` into memory.
+/// `mem_addr` must be aligned on a 32-byte boundary or a
+/// general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
+ *mem_addr = a;
+}
+
+/// Loads 256 bits of integer data from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
+ let mut dst = _mm256_undefined_si256();
+ ptr::copy_nonoverlapping(
+ mem_addr as *const u8,
+ &mut dst as *mut __m256i as *mut u8,
+ mem::size_of::<__m256i>(),
+ );
+ dst
+}
+
+/// Stores 256 bits of integer data from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
+ storeudq256(mem_addr as *mut i8, a.as_i8x32());
+}
+
+/// Loads packed double-precision (64-bit) floating-point elements from memory
+/// into result using `mask` (elements are zeroed out when the high bit of the
+/// corresponding element is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
+ maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
+}
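+
+// Illustrative sketch (not part of the upstream source): only the high (sign)
+// bit of each mask element is consulted, so `-1` loads and `0` zeroes:
+//
+//     let data: [f64; 4] = [1.0, 2.0, 3.0, 4.0];
+//     let mask = _mm256_setr_epi64x(-1, 0, -1, 0);
+//     let v = _mm256_maskload_pd(data.as_ptr(), mask); // [1.0, 0.0, 3.0, 0.0]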
+
+/// Stores packed double-precision (64-bit) floating-point elements from `a`
+/// into memory using `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
+ maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
+}
+
+/// Loads packed double-precision (64-bit) floating-point elements from memory
+/// into result using `mask` (elements are zeroed out when the high bit of the
+/// corresponding element is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
+ maskloadpd(mem_addr as *const i8, mask.as_i64x2())
+}
+
+/// Stores packed double-precision (64-bit) floating-point elements from `a`
+/// into memory using `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
+ maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
+}
+
+/// Loads packed single-precision (32-bit) floating-point elements from memory
+/// into result using `mask` (elements are zeroed out when the high bit of the
+/// corresponding element is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
+ maskloadps256(mem_addr as *const i8, mask.as_i32x8())
+}
+
+/// Stores packed single-precision (32-bit) floating-point elements from `a`
+/// into memory using `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
+ maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
+}
+
+/// Loads packed single-precision (32-bit) floating-point elements from memory
+/// into result using `mask` (elements are zeroed out when the high bit of the
+/// corresponding element is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
+ maskloadps(mem_addr as *const i8, mask.as_i32x4())
+}
+
+/// Stores packed single-precision (32-bit) floating-point elements from `a`
+/// into memory using `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmaskmovps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
+ maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
+}
+
+/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
+/// from `a`, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movehdup_ps)
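+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation),
+/// showing the odd-indexed elements copied into the even slots:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+///     let r = _mm256_movehdup_ps(a);
+///     let mut out = [0.0f32; 8];
+///     _mm256_storeu_ps(out.as_mut_ptr(), r);
+///     assert_eq!(out, [1., 1., 3., 3., 5., 5., 7., 7.]);
+/// }
+/// # }
+/// # }
+/// ```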
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
+ simd_shuffle8!(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
+}
+
+/// Duplicates even-indexed single-precision (32-bit) floating-point elements
+/// from `a`, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_moveldup_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
+ simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
+}
+
+/// Duplicates even-indexed double-precision (64-bit) floating-point elements
+/// from `a`, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movedup_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
+ simd_shuffle4!(a, a, [0, 0, 2, 2])
+}
+
+/// Loads 256 bits of integer data from unaligned memory into result.
+/// This intrinsic may perform better than `_mm256_loadu_si256` when the
+/// data crosses a cache line boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_lddqu_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vlddqu))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
+ transmute(vlddqu(mem_addr as *const i8))
+}
+
+/// Moves integer data from a 256-bit integer vector to a 32-byte
+/// aligned memory location. To minimize caching, the data is flagged as
+/// non-temporal (unlikely to be used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
+ intrinsics::nontemporal_store(mem_addr, a);
+}
+
+/// Moves double-precision values from a 256-bit vector of `[4 x double]`
+/// to a 32-byte aligned memory location. To minimize caching, the data is
+/// flagged as non-temporal (unlikely to be used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m256d, a);
+}
+
+/// Moves single-precision floating point values from a 256-bit vector
+/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
+/// caching, the data is flagged as non-temporal (unlikely to be used again
+/// soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovntps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m256, a);
+}
+
+/// Computes the approximate reciprocal of packed single-precision (32-bit)
+/// floating-point elements in `a`, and returns the results. The maximum
+/// relative error for this approximation is less than 1.5*2^-12.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ps)
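+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation);
+/// the comparison tolerance is loose because the result is only an
+/// approximation:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm256_set1_ps(4.0);
+///     let r = _mm256_rcp_ps(a);
+///     // The first element should be close to 1/4 = 0.25.
+///     let approx = _mm256_cvtss_f32(r);
+///     assert!((approx - 0.25).abs() < 1e-3);
+/// }
+/// # }
+/// # }
+/// ```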
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vrcpps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 {
+ vrcpps(a)
+}
+
+/// Computes the approximate reciprocal square root of packed single-precision
+/// (32-bit) floating-point elements in `a`, and returns the results.
+/// The maximum relative error for this approximation is less than 1.5*2^-12.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vrsqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
+ vrsqrtps(a)
+}
+
+/// Unpacks and interleaves double-precision (64-bit) floating-point elements
+/// from the high half of each 128-bit lane in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
+ simd_shuffle4!(a, b, [1, 5, 3, 7])
+}
+
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the high half of each 128-bit lane in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
+ simd_shuffle8!(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
+}
+
+/// Unpacks and interleaves double-precision (64-bit) floating-point elements
+/// from the low half of each 128-bit lane in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
+ simd_shuffle4!(a, b, [0, 4, 2, 6])
+}
+
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the low half of each 128-bit lane in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_ps)
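+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation);
+/// note that the interleaving happens independently within each 128-bit lane:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+///     let b = _mm256_setr_ps(10., 11., 12., 13., 14., 15., 16., 17.);
+///     let r = _mm256_unpacklo_ps(a, b);
+///     let mut out = [0.0f32; 8];
+///     _mm256_storeu_ps(out.as_mut_ptr(), r);
+///     assert_eq!(out, [0., 10., 1., 11., 4., 14., 5., 15.]);
+/// }
+/// # }
+/// # }
+/// ```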
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
+ simd_shuffle8!(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
+}
+
+/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
+/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
+/// the result is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_si256)
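+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation),
+/// using two bit patterns with no overlap, so the AND is all zeros and `ZF`
+/// is set:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm256_set1_epi8(0x0F);
+///     let b = _mm256_set1_epi8(0x70);
+///     assert_eq!(_mm256_testz_si256(a, b), 1);
+/// }
+/// # }
+/// # }
+/// ```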
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
+ ptestz256(a.as_i64x4(), b.as_i64x4())
+}
+
+/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
+/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
+/// the result is zero, otherwise sets `CF` to 0. Returns the `CF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
+ ptestc256(a.as_i64x4(), b.as_i64x4())
+}
+
+/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
+/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
+/// the result is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
+/// `CF` values are zero, otherwise returns 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
+ ptestnzc256(a.as_i64x4(), b.as_i64x4())
+}
+
+/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
+ vtestzpd256(a, b)
+}
+
+/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
+ vtestcpd256(a, b)
+}
+
+/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
+/// are zero, otherwise returns 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
+ vtestnzcpd256(a, b)
+}
+
+/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
+ vtestzpd(a, b)
+}
+
+/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
+ vtestcpd(a, b)
+}
+
+/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
+/// are zero, otherwise returns 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
+ vtestnzcpd(a, b)
+}
+
+/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
+ vtestzps256(a, b)
+}
+
+/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
+ vtestcps256(a, b)
+}
+
+/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
+/// are zero, otherwise returns 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
+ vtestnzcps256(a, b)
+}
+
+/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
+ vtestzps(a, b)
+}
+
+/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
+ vtestcps(a, b)
+}
+
+/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
+/// are zero, otherwise returns 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vtestps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
+ vtestnzcps(a, b)
+}
+
+/// Sets each bit of the returned mask based on the most significant bit of the
+/// corresponding packed double-precision (64-bit) floating-point element in
+/// `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_pd)
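+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation);
+/// the negative elements at indices 0 and 2 set bits 0 and 2 of the mask:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm256_setr_pd(-1.0, 2.0, -3.0, 4.0);
+///     assert_eq!(_mm256_movemask_pd(a), 0b0101);
+/// }
+/// # }
+/// # }
+/// ```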
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovmskpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
+ movmskpd256(a)
+}
+
+/// Sets each bit of the returned mask based on the most significant bit of the
+/// corresponding packed single-precision (32-bit) floating-point element in
+/// `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vmovmskps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
+ movmskps256(a)
+}
+
+/// Returns vector of type __m256d with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setzero_pd() -> __m256d {
+ _mm256_set1_pd(0.0)
+}
+
+/// Returns vector of type __m256 with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setzero_ps() -> __m256 {
+ _mm256_set1_ps(0.0)
+}
+
+/// Returns vector of type __m256i with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vxor))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setzero_si256() -> __m256i {
+ _mm256_set1_epi8(0)
+}
+
+/// Sets packed double-precision (64-bit) floating-point elements in returned
+/// vector with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
+ _mm256_setr_pd(d, c, b, a)
+}
+
+/// Sets packed single-precision (32-bit) floating-point elements in returned
+/// vector with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_ps(
+ a: f32,
+ b: f32,
+ c: f32,
+ d: f32,
+ e: f32,
+ f: f32,
+ g: f32,
+ h: f32,
+) -> __m256 {
+ _mm256_setr_ps(h, g, f, e, d, c, b, a)
+}
+
+/// Sets packed 8-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi8)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_epi8(
+ e00: i8,
+ e01: i8,
+ e02: i8,
+ e03: i8,
+ e04: i8,
+ e05: i8,
+ e06: i8,
+ e07: i8,
+ e08: i8,
+ e09: i8,
+ e10: i8,
+ e11: i8,
+ e12: i8,
+ e13: i8,
+ e14: i8,
+ e15: i8,
+ e16: i8,
+ e17: i8,
+ e18: i8,
+ e19: i8,
+ e20: i8,
+ e21: i8,
+ e22: i8,
+ e23: i8,
+ e24: i8,
+ e25: i8,
+ e26: i8,
+ e27: i8,
+ e28: i8,
+ e29: i8,
+ e30: i8,
+ e31: i8,
+) -> __m256i {
+ #[rustfmt::skip]
+ _mm256_setr_epi8(
+ e31, e30, e29, e28, e27, e26, e25, e24,
+ e23, e22, e21, e20, e19, e18, e17, e16,
+ e15, e14, e13, e12, e11, e10, e09, e08,
+ e07, e06, e05, e04, e03, e02, e01, e00,
+ )
+}
+
+/// Sets packed 16-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi16)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_epi16(
+ e00: i16,
+ e01: i16,
+ e02: i16,
+ e03: i16,
+ e04: i16,
+ e05: i16,
+ e06: i16,
+ e07: i16,
+ e08: i16,
+ e09: i16,
+ e10: i16,
+ e11: i16,
+ e12: i16,
+ e13: i16,
+ e14: i16,
+ e15: i16,
+) -> __m256i {
+ #[rustfmt::skip]
+ _mm256_setr_epi16(
+ e15, e14, e13, e12,
+ e11, e10, e09, e08,
+ e07, e06, e05, e04,
+ e03, e02, e01, e00,
+ )
+}
+
+/// Sets packed 32-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi32)
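+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation);
+/// the first argument becomes the highest-indexed element, so memory order is
+/// reversed relative to the argument list:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+///     let mut out = [0i32; 8];
+///     _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, a);
+///     assert_eq!(out, [0, 1, 2, 3, 4, 5, 6, 7]);
+/// }
+/// # }
+/// # }
+/// ```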
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_epi32(
+ e0: i32,
+ e1: i32,
+ e2: i32,
+ e3: i32,
+ e4: i32,
+ e5: i32,
+ e6: i32,
+ e7: i32,
+) -> __m256i {
+ _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
+}
+
+/// Sets packed 64-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi64x)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
+ _mm256_setr_epi64x(d, c, b, a)
+}
+
+/// Sets packed double-precision (64-bit) floating-point elements in returned
+/// vector with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
+ __m256d(a, b, c, d)
+}
+
+/// Sets packed single-precision (32-bit) floating-point elements in returned
+/// vector with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_ps(
+ a: f32,
+ b: f32,
+ c: f32,
+ d: f32,
+ e: f32,
+ f: f32,
+ g: f32,
+ h: f32,
+) -> __m256 {
+ __m256(a, b, c, d, e, f, g, h)
+}
+
+/// Sets packed 8-bit integers in returned vector with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi8)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_epi8(
+ e00: i8,
+ e01: i8,
+ e02: i8,
+ e03: i8,
+ e04: i8,
+ e05: i8,
+ e06: i8,
+ e07: i8,
+ e08: i8,
+ e09: i8,
+ e10: i8,
+ e11: i8,
+ e12: i8,
+ e13: i8,
+ e14: i8,
+ e15: i8,
+ e16: i8,
+ e17: i8,
+ e18: i8,
+ e19: i8,
+ e20: i8,
+ e21: i8,
+ e22: i8,
+ e23: i8,
+ e24: i8,
+ e25: i8,
+ e26: i8,
+ e27: i8,
+ e28: i8,
+ e29: i8,
+ e30: i8,
+ e31: i8,
+) -> __m256i {
+ #[rustfmt::skip]
+ transmute(i8x32::new(
+ e00, e01, e02, e03, e04, e05, e06, e07,
+ e08, e09, e10, e11, e12, e13, e14, e15,
+ e16, e17, e18, e19, e20, e21, e22, e23,
+ e24, e25, e26, e27, e28, e29, e30, e31,
+ ))
+}
+
+/// Sets packed 16-bit integers in returned vector with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi16)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_epi16(
+ e00: i16,
+ e01: i16,
+ e02: i16,
+ e03: i16,
+ e04: i16,
+ e05: i16,
+ e06: i16,
+ e07: i16,
+ e08: i16,
+ e09: i16,
+ e10: i16,
+ e11: i16,
+ e12: i16,
+ e13: i16,
+ e14: i16,
+ e15: i16,
+) -> __m256i {
+ #[rustfmt::skip]
+ transmute(i16x16::new(
+ e00, e01, e02, e03,
+ e04, e05, e06, e07,
+ e08, e09, e10, e11,
+ e12, e13, e14, e15,
+ ))
+}
+
+/// Sets packed 32-bit integers in returned vector with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi32)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_epi32(
+ e0: i32,
+ e1: i32,
+ e2: i32,
+ e3: i32,
+ e4: i32,
+ e5: i32,
+ e6: i32,
+ e7: i32,
+) -> __m256i {
+ transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
+}
+
+/// Sets packed 64-bit integers in returned vector with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi64x)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
+ transmute(i64x4::new(a, b, c, d))
+}
+
+/// Broadcasts double-precision (64-bit) floating-point value `a` to all
+/// elements of returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d {
+ _mm256_setr_pd(a, a, a, a)
+}
+
+/// Broadcasts single-precision (32-bit) floating-point value `a` to all
+/// elements of returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 {
+ _mm256_setr_ps(a, a, a, a, a, a, a, a)
+}
+
+/// Broadcasts 8-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastb` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi8)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i {
+ #[rustfmt::skip]
+ _mm256_setr_epi8(
+ a, a, a, a, a, a, a, a,
+ a, a, a, a, a, a, a, a,
+ a, a, a, a, a, a, a, a,
+ a, a, a, a, a, a, a, a,
+ )
+}
+
+/// Broadcasts 16-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastw` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi16)
+#[inline]
+#[target_feature(enable = "avx")]
+//#[cfg_attr(test, assert_instr(vpshufb))]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i {
+ _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
+}
+
+/// Broadcasts 32-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastd` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi32)
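+///
+/// # Examples
+///
+/// A broadcast sketch added for illustration (not part of Intel's
+/// documentation):
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm256_set1_epi32(42);
+///     let mut out = [0i32; 8];
+///     _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, a);
+///     assert_eq!(out, [42; 8]);
+/// }
+/// # }
+/// # }
+/// ```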
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i {
+ _mm256_setr_epi32(a, a, a, a, a, a, a, a)
+}
+
+/// Broadcasts 64-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastq` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi64x)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
+#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i {
+ _mm256_setr_epi64x(a, a, a, a)
+}
+
+/// Casts vector of type __m256d to type __m256.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 {
+ transmute(a)
+}
+
+/// Casts vector of type __m256 to type __m256d.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d {
+ transmute(a)
+}
+
+/// Casts vector of type __m256 to type __m256i.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_si256)
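+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation);
+/// the cast reinterprets bits without conversion, so `1.0f32` shows up as its
+/// IEEE-754 bit pattern `0x3F80_0000`:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm256_set1_ps(1.0);
+///     let bits = _mm256_castps_si256(a);
+///     let mut out = [0i32; 8];
+///     _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, bits);
+///     assert_eq!(out, [0x3F80_0000; 8]);
+/// }
+/// # }
+/// # }
+/// ```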
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i {
+ transmute(a)
+}
+
+/// Casts vector of type __m256i to type __m256.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
+ transmute(a)
+}
+
+/// Casts vector of type __m256d to type __m256i.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i {
+ transmute(a)
+}
+
+/// Casts vector of type __m256i to type __m256d.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
+ transmute(a)
+}
+
+/// Casts vector of type __m256 to type __m128.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps256_ps128)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
+ simd_shuffle4!(a, a, [0, 1, 2, 3])
+}
+
+/// Casts vector of type __m256d to type __m128d.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd256_pd128)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
+ simd_shuffle2!(a, a, [0, 1])
+}
+
+/// Casts vector of type __m256i to type __m128i.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_si128)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
+ let a = a.as_i64x4();
+ let dst: i64x2 = simd_shuffle2!(a, a, [0, 1]);
+ transmute(dst)
+}
+
+/// Casts vector of type __m128 to type __m256;
+/// the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps128_ps256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
+ // FIXME simd_shuffle8!(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
+ simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
+}
+
+/// Casts vector of type __m128d to type __m256d;
+/// the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd128_pd256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
+ // FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
+ simd_shuffle4!(a, a, [0, 1, 0, 0])
+}
+
+/// Casts vector of type __m128i to type __m256i;
+/// the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi128_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
+ let a = a.as_i64x2();
+ // FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
+ let dst: i64x4 = simd_shuffle4!(a, a, [0, 1, 0, 0]);
+ transmute(dst)
+}
+
+/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
+/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
+/// the value of the source vector. The upper 128 bits are set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextps128_ps256)
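+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation);
+/// unlike the plain cast, the upper half is guaranteed to be zero:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let a = _mm_setr_ps(1., 2., 3., 4.);
+///     let r = _mm256_zextps128_ps256(a);
+///     let mut out = [9.0f32; 8];
+///     _mm256_storeu_ps(out.as_mut_ptr(), r);
+///     assert_eq!(out, [1., 2., 3., 4., 0., 0., 0., 0.]);
+/// }
+/// # }
+/// # }
+/// ```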
+#[inline]
+#[target_feature(enable = "avx,sse")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
+ simd_shuffle8!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The upper
+/// 128 bits are set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextsi128_si256)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
+ let b = _mm_setzero_si128().as_i64x2();
+ let dst: i64x4 = simd_shuffle4!(a.as_i64x2(), b, [0, 1, 2, 3]);
+ transmute(dst)
+}
+
+/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
+/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextpd128_pd256)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
+ simd_shuffle4!(a, _mm_setzero_pd(), [0, 1, 2, 3])
+}
+
+/// Returns vector of type `__m256` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ps)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_undefined_ps() -> __m256 {
+ _mm256_set1_ps(0.0)
+}
+
+/// Returns vector of type `__m256d` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_pd)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_undefined_pd() -> __m256d {
+ _mm256_set1_pd(0.0)
+}
+
+/// Returns vector of type __m256i with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_si256)
+#[inline]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_undefined_si256() -> __m256i {
+ __m256i(0, 0, 0, 0)
+}
+
+/// Sets packed __m256 returned vector with the supplied values; `hi` supplies
+/// the upper 128 bits and `lo` the lower 128 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128)
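+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation),
+/// combining two 128-bit halves:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let hi = _mm_setr_ps(4., 5., 6., 7.);
+///     let lo = _mm_setr_ps(0., 1., 2., 3.);
+///     let r = _mm256_set_m128(hi, lo);
+///     let mut out = [0.0f32; 8];
+///     _mm256_storeu_ps(out.as_mut_ptr(), r);
+///     assert_eq!(out, [0., 1., 2., 3., 4., 5., 6., 7.]);
+/// }
+/// # }
+/// # }
+/// ```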
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
+ simd_shuffle8!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Sets packed __m256d returned vector with the supplied values; `hi` supplies
+/// the upper 128 bits and `lo` the lower 128 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128d)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
+ let hi: __m128 = transmute(hi);
+ let lo: __m128 = transmute(lo);
+ transmute(_mm256_set_m128(hi, lo))
+}
+
+/// Sets packed __m256i returned vector with the supplied values; `hi` supplies
+/// the upper 128 bits and `lo` the lower 128 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128i)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
+ let hi: __m128 = transmute(hi);
+ let lo: __m128 = transmute(lo);
+ transmute(_mm256_set_m128(hi, lo))
+}
+
+/// Sets packed __m256 returned vector with the supplied values; `lo` supplies
+/// the lower 128 bits and `hi` the upper 128 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
+ _mm256_set_m128(hi, lo)
+}
+
+/// Sets packed __m256d returned vector with the supplied values; `lo` supplies
+/// the lower 128 bits and `hi` the upper 128 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128d)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
+ _mm256_set_m128d(hi, lo)
+}
+
+/// Sets packed __m256i returned vector with the supplied values; `lo` supplies
+/// the lower 128 bits and `hi` the upper 128 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128i)
+#[inline]
+#[target_feature(enable = "avx")]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
+ _mm256_set_m128i(hi, lo)
+}
+
+/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
+/// floating-point elements) from memory, and combines them into a 256-bit
+/// value.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128)
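+///
+/// # Examples
+///
+/// A usage sketch added for illustration (not part of Intel's documentation),
+/// gathering two unrelated, unaligned 128-bit loads into one 256-bit vector:
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")] {
+/// use std::arch::x86_64::*;
+/// # if is_x86_feature_detected!("avx") {
+/// unsafe {
+///     let hi = [4.0f32, 5.0, 6.0, 7.0];
+///     let lo = [0.0f32, 1.0, 2.0, 3.0];
+///     let r = _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr());
+///     let mut out = [0.0f32; 8];
+///     _mm256_storeu_ps(out.as_mut_ptr(), r);
+///     assert_eq!(out, [0., 1., 2., 3., 4., 5., 6., 7.]);
+/// }
+/// # }
+/// # }
+/// ```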
+#[inline]
+#[target_feature(enable = "avx,sse")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
+ let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
+ _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
+}
+
+/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from memory, and combines them into a 256-bit
+/// value.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128d)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
+ let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
+ _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
+}
+
+/// Loads two 128-bit values (composed of integer data) from memory, and combines
+/// them into a 256-bit value.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128i)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
+ let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
+ _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
+}
+
+/// Stores the high and low 128-bit halves (each composed of 4 packed
+/// single-precision (32-bit) floating-point elements) from `a` into memory at two
+/// different 128-bit locations.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128)
+#[inline]
+#[target_feature(enable = "avx,sse")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
+ let lo = _mm256_castps256_ps128(a);
+ _mm_storeu_ps(loaddr, lo);
+ let hi = _mm256_extractf128_ps::<1>(a);
+ _mm_storeu_ps(hiaddr, hi);
+}
+
+/// Stores the high and low 128-bit halves (each composed of 2 packed
+/// double-precision (64-bit) floating-point elements) from `a` into memory at two
+/// different 128-bit locations.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128d)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
+ let lo = _mm256_castpd256_pd128(a);
+ _mm_storeu_pd(loaddr, lo);
+ let hi = _mm256_extractf128_pd::<1>(a);
+ _mm_storeu_pd(hiaddr, hi);
+}
+
+/// Stores the high and low 128-bit halves (each composed of integer data) from
+/// `a` into memory at two different 128-bit locations.
+/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128i)
+#[inline]
+#[target_feature(enable = "avx,sse2")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
+ let lo = _mm256_castsi256_si128(a);
+ _mm_storeu_si128(loaddr, lo);
+ let hi = _mm256_extractf128_si256::<1>(a);
+ _mm_storeu_si128(hiaddr, hi);
+}
+
+/// Returns the first element of the input vector of `[8 x float]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtss_f32)
+#[inline]
+#[target_feature(enable = "avx")]
+//#[cfg_attr(test, assert_instr(movss))] FIXME
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
+ simd_extract(a, 0)
+}
+
+// LLVM intrinsics used in the above functions
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx.addsub.pd.256"]
+ fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.avx.addsub.ps.256"]
+ fn addsubps256(a: __m256, b: __m256) -> __m256;
+ #[link_name = "llvm.x86.avx.round.pd.256"]
+ fn roundpd256(a: __m256d, b: i32) -> __m256d;
+ #[link_name = "llvm.x86.avx.round.ps.256"]
+ fn roundps256(a: __m256, b: i32) -> __m256;
+ #[link_name = "llvm.x86.avx.sqrt.ps.256"]
+ fn sqrtps256(a: __m256) -> __m256;
+ #[link_name = "llvm.x86.avx.blendv.pd.256"]
+ fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.avx.blendv.ps.256"]
+ fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256;
+ #[link_name = "llvm.x86.avx.dp.ps.256"]
+ fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
+ #[link_name = "llvm.x86.avx.hadd.pd.256"]
+ fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.avx.hadd.ps.256"]
+ fn vhaddps(a: __m256, b: __m256) -> __m256;
+ #[link_name = "llvm.x86.avx.hsub.pd.256"]
+ fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.avx.hsub.ps.256"]
+ fn vhsubps(a: __m256, b: __m256) -> __m256;
+ #[link_name = "llvm.x86.sse2.cmp.pd"]
+ fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
+ #[link_name = "llvm.x86.avx.cmp.pd.256"]
+ fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
+ #[link_name = "llvm.x86.sse.cmp.ps"]
+ fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
+ #[link_name = "llvm.x86.avx.cmp.ps.256"]
+ fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
+ #[link_name = "llvm.x86.sse2.cmp.sd"]
+ fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
+ #[link_name = "llvm.x86.sse.cmp.ss"]
+ fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
+ #[link_name = "llvm.x86.avx.cvtdq2.ps.256"]
+ fn vcvtdq2ps(a: i32x8) -> __m256;
+ #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"]
+ fn vcvtpd2ps(a: __m256d) -> __m128;
+ #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
+ fn vcvtps2dq(a: __m256) -> i32x8;
+ #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
+ fn vcvttpd2dq(a: __m256d) -> i32x4;
+ #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
+ fn vcvtpd2dq(a: __m256d) -> i32x4;
+ #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
+ fn vcvttps2dq(a: __m256) -> i32x8;
+ #[link_name = "llvm.x86.avx.vzeroall"]
+ fn vzeroall();
+ #[link_name = "llvm.x86.avx.vzeroupper"]
+ fn vzeroupper();
+ #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
+ fn vpermilps256(a: __m256, b: i32x8) -> __m256;
+ #[link_name = "llvm.x86.avx.vpermilvar.ps"]
+ fn vpermilps(a: __m128, b: i32x4) -> __m128;
+ #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
+ fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
+ #[link_name = "llvm.x86.avx.vpermilvar.pd"]
+ fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
+ #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
+ fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
+ #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
+ fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
+ #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
+ fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
+ #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"]
+ fn vbroadcastf128ps256(a: &__m128) -> __m256;
+ #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"]
+ fn vbroadcastf128pd256(a: &__m128d) -> __m256d;
+ #[link_name = "llvm.x86.avx.storeu.pd.256"]
+ fn storeupd256(mem_addr: *mut f64, a: __m256d);
+ #[link_name = "llvm.x86.avx.storeu.ps.256"]
+ fn storeups256(mem_addr: *mut f32, a: __m256);
+ #[link_name = "llvm.x86.avx.storeu.dq.256"]
+ fn storeudq256(mem_addr: *mut i8, a: i8x32);
+ #[link_name = "llvm.x86.avx.maskload.pd.256"]
+ fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
+ #[link_name = "llvm.x86.avx.maskstore.pd.256"]
+ fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
+ #[link_name = "llvm.x86.avx.maskload.pd"]
+ fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
+ #[link_name = "llvm.x86.avx.maskstore.pd"]
+ fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
+ #[link_name = "llvm.x86.avx.maskload.ps.256"]
+ fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
+ #[link_name = "llvm.x86.avx.maskstore.ps.256"]
+ fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
+ #[link_name = "llvm.x86.avx.maskload.ps"]
+ fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
+ #[link_name = "llvm.x86.avx.maskstore.ps"]
+ fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
+ #[link_name = "llvm.x86.avx.ldu.dq.256"]
+ fn vlddqu(mem_addr: *const i8) -> i8x32;
+ #[link_name = "llvm.x86.avx.rcp.ps.256"]
+ fn vrcpps(a: __m256) -> __m256;
+ #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
+ fn vrsqrtps(a: __m256) -> __m256;
+ #[link_name = "llvm.x86.avx.ptestz.256"]
+ fn ptestz256(a: i64x4, b: i64x4) -> i32;
+ #[link_name = "llvm.x86.avx.ptestc.256"]
+ fn ptestc256(a: i64x4, b: i64x4) -> i32;
+ #[link_name = "llvm.x86.avx.ptestnzc.256"]
+ fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
+ #[link_name = "llvm.x86.avx.vtestz.pd.256"]
+ fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
+ #[link_name = "llvm.x86.avx.vtestc.pd.256"]
+ fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
+ #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
+ fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
+ #[link_name = "llvm.x86.avx.vtestz.pd"]
+ fn vtestzpd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.avx.vtestc.pd"]
+ fn vtestcpd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.avx.vtestnzc.pd"]
+ fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.avx.vtestz.ps.256"]
+ fn vtestzps256(a: __m256, b: __m256) -> i32;
+ #[link_name = "llvm.x86.avx.vtestc.ps.256"]
+ fn vtestcps256(a: __m256, b: __m256) -> i32;
+ #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
+ fn vtestnzcps256(a: __m256, b: __m256) -> i32;
+ #[link_name = "llvm.x86.avx.vtestz.ps"]
+ fn vtestzps(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.avx.vtestc.ps"]
+ fn vtestcps(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.avx.vtestnzc.ps"]
+ fn vtestnzcps(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.avx.movmsk.pd.256"]
+ fn movmskpd256(a: __m256d) -> i32;
+ #[link_name = "llvm.x86.avx.movmsk.ps.256"]
+ fn movmskps256(a: __m256) -> i32;
+ #[link_name = "llvm.x86.avx.min.ps.256"]
+ fn vminps(a: __m256, b: __m256) -> __m256;
+ #[link_name = "llvm.x86.avx.max.ps.256"]
+ fn vmaxps(a: __m256, b: __m256) -> __m256;
+ #[link_name = "llvm.x86.avx.min.pd.256"]
+ fn vminpd(a: __m256d, b: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.avx.max.pd.256"]
+ fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::hint::black_box;
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
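+ // The glob import above also brings in the `assert_eq_m256*` and
+ // `get_m256*` comparison helpers used throughout these tests.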
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_add_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_add_pd(a, b);
+ let e = _mm256_setr_pd(6., 8., 10., 12.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_add_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+ let r = _mm256_add_ps(a, b);
+ let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_and_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set1_pd(0.6);
+ let r = _mm256_and_pd(a, b);
+ let e = _mm256_set1_pd(0.5);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_and_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set1_ps(0.6);
+ let r = _mm256_and_ps(a, b);
+ let e = _mm256_set1_ps(0.5);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_or_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set1_pd(0.6);
+ let r = _mm256_or_pd(a, b);
+ let e = _mm256_set1_pd(1.2);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_or_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set1_ps(0.6);
+ let r = _mm256_or_ps(a, b);
+ let e = _mm256_set1_ps(1.2);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_shuffle_pd() {
+ let a = _mm256_setr_pd(1., 4., 5., 8.);
+ let b = _mm256_setr_pd(2., 3., 6., 7.);
+ let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
+ let e = _mm256_setr_pd(4., 3., 8., 7.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_shuffle_ps() {
+ let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
+ let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_andnot_pd() {
+ let a = _mm256_set1_pd(0.);
+ let b = _mm256_set1_pd(0.6);
+ let r = _mm256_andnot_pd(a, b);
+ assert_eq_m256d(r, b);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_andnot_ps() {
+ let a = _mm256_set1_ps(0.);
+ let b = _mm256_set1_ps(0.6);
+ let r = _mm256_andnot_ps(a, b);
+ assert_eq_m256(r, b);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_max_pd() {
+ let a = _mm256_setr_pd(1., 4., 5., 8.);
+ let b = _mm256_setr_pd(2., 3., 6., 7.);
+ let r = _mm256_max_pd(a, b);
+ let e = _mm256_setr_pd(2., 4., 6., 8.);
+ assert_eq_m256d(r, e);
+ // > If the values being compared are both 0.0s (of either sign), the
+ // > value in the second operand (source operand) is returned.
+ let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
+ let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
+ let wu: [u64; 4] = transmute(w);
+ let xu: [u64; 4] = transmute(x);
+ assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
+ assert_eq!(xu, [0u64; 4]);
+ // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
+ // > second operand (source operand), either a NaN or a valid
+ // > floating-point value, is written to the result.
+ let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
+ let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
+ let yf: [f64; 4] = transmute(y);
+ let zf: [f64; 4] = transmute(z);
+ assert_eq!(yf, [0.0; 4]);
+ assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_max_ps() {
+ let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_max_ps(a, b);
+ let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
+ assert_eq_m256(r, e);
+ // > If the values being compared are both 0.0s (of either sign), the
+ // > value in the second operand (source operand) is returned.
+ let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
+ let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
+ let wu: [u32; 8] = transmute(w);
+ let xu: [u32; 8] = transmute(x);
+ assert_eq!(wu, [0x8000_0000u32; 8]);
+ assert_eq!(xu, [0u32; 8]);
+ // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
+ // > second operand (source operand), either a NaN or a valid
+ // > floating-point value, is written to the result.
+ let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
+ let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
+ let yf: [f32; 8] = transmute(y);
+ let zf: [f32; 8] = transmute(z);
+ assert_eq!(yf, [0.0; 8]);
+ assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_min_pd() {
+ let a = _mm256_setr_pd(1., 4., 5., 8.);
+ let b = _mm256_setr_pd(2., 3., 6., 7.);
+ let r = _mm256_min_pd(a, b);
+ let e = _mm256_setr_pd(1., 3., 5., 7.);
+ assert_eq_m256d(r, e);
+ // > If the values being compared are both 0.0s (of either sign), the
+ // > value in the second operand (source operand) is returned.
+ let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
+ let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
+ let wu: [u64; 4] = transmute(w);
+ let xu: [u64; 4] = transmute(x);
+ assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
+ assert_eq!(xu, [0u64; 4]);
+ // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
+ // > second operand (source operand), either a NaN or a valid
+ // > floating-point value, is written to the result.
+ let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
+ let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
+ let yf: [f64; 4] = transmute(y);
+ let zf: [f64; 4] = transmute(z);
+ assert_eq!(yf, [0.0; 4]);
+ assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_min_ps() {
+ let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_min_ps(a, b);
+ let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
+ assert_eq_m256(r, e);
+ // > If the values being compared are both 0.0s (of either sign), the
+ // > value in the second operand (source operand) is returned.
+ let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
+ let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
+ let wu: [u32; 8] = transmute(w);
+ let xu: [u32; 8] = transmute(x);
+ assert_eq!(wu, [0x8000_0000u32; 8]);
+ assert_eq!(xu, [0u32; 8]);
+ // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
+ // > second operand (source operand), either a NaN or a valid
+ // > floating-point value, is written to the result.
+ let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
+ let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
+ let yf: [f32; 8] = transmute(y);
+ let zf: [f32; 8] = transmute(z);
+ assert_eq!(yf, [0.0; 8]);
+ assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_mul_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_mul_pd(a, b);
+ let e = _mm256_setr_pd(5., 12., 21., 32.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_mul_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+ let r = _mm256_mul_ps(a, b);
+ let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_addsub_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_addsub_pd(a, b);
+ let e = _mm256_setr_pd(-4., 8., -4., 12.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_addsub_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+ let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+ let r = _mm256_addsub_ps(a, b);
+ let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_sub_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_sub_pd(a, b);
+ let e = _mm256_setr_pd(-4., -4., -4., -4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_sub_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
+ let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
+ let r = _mm256_sub_ps(a, b);
+ let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_round_pd() {
+ let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
+ let result_closest = _mm256_round_pd::<0b0000>(a);
+ let result_down = _mm256_round_pd::<0b0001>(a);
+ let result_up = _mm256_round_pd::<0b0010>(a);
+ let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
+ let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
+ let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
+ assert_eq_m256d(result_closest, expected_closest);
+ assert_eq_m256d(result_down, expected_down);
+ assert_eq_m256d(result_up, expected_up);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_floor_pd() {
+ let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
+ let result_down = _mm256_floor_pd(a);
+ let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
+ assert_eq_m256d(result_down, expected_down);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_ceil_pd() {
+ let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
+ let result_up = _mm256_ceil_pd(a);
+ let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
+ assert_eq_m256d(result_up, expected_up);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_round_ps() {
+ let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
+ let result_closest = _mm256_round_ps::<0b0000>(a);
+ let result_down = _mm256_round_ps::<0b0001>(a);
+ let result_up = _mm256_round_ps::<0b0010>(a);
+ let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
+ let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
+ let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
+ assert_eq_m256(result_closest, expected_closest);
+ assert_eq_m256(result_down, expected_down);
+ assert_eq_m256(result_up, expected_up);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_floor_ps() {
+ let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
+ let result_down = _mm256_floor_ps(a);
+ let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
+ assert_eq_m256(result_down, expected_down);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_ceil_ps() {
+ let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
+ let result_up = _mm256_ceil_ps(a);
+ let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
+ assert_eq_m256(result_up, expected_up);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_sqrt_pd() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let r = _mm256_sqrt_pd(a);
+ let e = _mm256_setr_pd(2., 3., 4., 5.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_sqrt_ps() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let r = _mm256_sqrt_ps(a);
+ let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_div_ps() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let r = _mm256_div_ps(a, b);
+ let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_div_pd() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let b = _mm256_setr_pd(4., 3., 2., 5.);
+ let r = _mm256_div_pd(a, b);
+ let e = _mm256_setr_pd(1., 3., 8., 5.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_blend_pd() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let b = _mm256_setr_pd(4., 3., 2., 5.);
+ let r = _mm256_blend_pd::<0x0>(a, b);
+ assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
+ let r = _mm256_blend_pd::<0x3>(a, b);
+ assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
+ let r = _mm256_blend_pd::<0xF>(a, b);
+ assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_blend_ps() {
+ let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_blend_ps::<0x0>(a, b);
+ assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
+ let r = _mm256_blend_ps::<0x3>(a, b);
+ assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
+ let r = _mm256_blend_ps::<0xF>(a, b);
+ assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_blendv_pd() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let b = _mm256_setr_pd(4., 3., 2., 5.);
+ let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
+ let r = _mm256_blendv_pd(a, b, c);
+ let e = _mm256_setr_pd(4., 9., 2., 5.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_blendv_ps() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ #[rustfmt::skip]
+ let c = _mm256_setr_ps(
+ 0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
+ );
+ let r = _mm256_blendv_ps(a, b, c);
+ let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_dp_ps() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let r = _mm256_dp_ps::<0xFF>(a, b);
+ let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_hadd_pd() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let b = _mm256_setr_pd(4., 3., 2., 5.);
+ let r = _mm256_hadd_pd(a, b);
+ let e = _mm256_setr_pd(13., 7., 41., 7.);
+ assert_eq_m256d(r, e);
+
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_hadd_pd(a, b);
+ let e = _mm256_setr_pd(3., 11., 7., 15.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_hadd_ps() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let r = _mm256_hadd_ps(a, b);
+ let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
+ assert_eq_m256(r, e);
+
+ let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+ let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+ let r = _mm256_hadd_ps(a, b);
+ let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_hsub_pd() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let b = _mm256_setr_pd(4., 3., 2., 5.);
+ let r = _mm256_hsub_pd(a, b);
+ let e = _mm256_setr_pd(-5., 1., -9., -3.);
+ assert_eq_m256d(r, e);
+
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_hsub_pd(a, b);
+ let e = _mm256_setr_pd(-1., -1., -1., -1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_hsub_ps() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let r = _mm256_hsub_ps(a, b);
+ let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
+ assert_eq_m256(r, e);
+
+ let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+ let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+ let r = _mm256_hsub_ps(a, b);
+ let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_xor_pd() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let b = _mm256_set1_pd(0.);
+ let r = _mm256_xor_pd(a, b);
+ assert_eq_m256d(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_xor_ps() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let b = _mm256_set1_ps(0.);
+ let r = _mm256_xor_ps(a, b);
+ assert_eq_m256(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_cmp_pd() {
+ let a = _mm_setr_pd(4., 9.);
+ let b = _mm_setr_pd(4., 3.);
+ let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
+ assert!(get_m128d(r, 0).is_nan());
+ assert!(get_m128d(r, 1).is_nan());
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cmp_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
+ let e = _mm256_set1_pd(0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_cmp_ps() {
+ let a = _mm_setr_ps(4., 3., 2., 5.);
+ let b = _mm_setr_ps(4., 9., 16., 25.);
+ let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
+ assert!(get_m128(r, 0).is_nan());
+ assert_eq!(get_m128(r, 1), 0.);
+ assert_eq!(get_m128(r, 2), 0.);
+ assert_eq!(get_m128(r, 3), 0.);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cmp_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+ let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+ let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
+ let e = _mm256_set1_ps(0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_cmp_sd() {
+ let a = _mm_setr_pd(4., 9.);
+ let b = _mm_setr_pd(4., 3.);
+ let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
+ assert!(get_m128d(r, 0).is_nan());
+ assert_eq!(get_m128d(r, 1), 9.);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_cmp_ss() {
+ let a = _mm_setr_ps(4., 3., 2., 5.);
+ let b = _mm_setr_ps(4., 9., 16., 25.);
+ let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
+ assert!(get_m128(r, 0).is_nan());
+ assert_eq!(get_m128(r, 1), 3.);
+ assert_eq!(get_m128(r, 2), 2.);
+ assert_eq!(get_m128(r, 3), 5.);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvtepi32_pd() {
+ let a = _mm_setr_epi32(4, 9, 16, 25);
+ let r = _mm256_cvtepi32_pd(a);
+ let e = _mm256_setr_pd(4., 9., 16., 25.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvtepi32_ps() {
+ let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
+ let r = _mm256_cvtepi32_ps(a);
+ let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvtpd_ps() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let r = _mm256_cvtpd_ps(a);
+ let e = _mm_setr_ps(4., 9., 16., 25.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvtps_epi32() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let r = _mm256_cvtps_epi32(a);
+ let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvtps_pd() {
+ let a = _mm_setr_ps(4., 9., 16., 25.);
+ let r = _mm256_cvtps_pd(a);
+ let e = _mm256_setr_pd(4., 9., 16., 25.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvttpd_epi32() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let r = _mm256_cvttpd_epi32(a);
+ let e = _mm_setr_epi32(4, 9, 16, 25);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvtpd_epi32() {
+ let a = _mm256_setr_pd(4., 9., 16., 25.);
+ let r = _mm256_cvtpd_epi32(a);
+ let e = _mm_setr_epi32(4, 9, 16, 25);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvttps_epi32() {
+ let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
+ let r = _mm256_cvttps_epi32(a);
+ let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_extractf128_ps() {
+ let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let r = _mm256_extractf128_ps::<0>(a);
+ let e = _mm_setr_ps(4., 3., 2., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_extractf128_pd() {
+ let a = _mm256_setr_pd(4., 3., 2., 5.);
+ let r = _mm256_extractf128_pd::<0>(a);
+ let e = _mm_setr_pd(4., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_extractf128_si256() {
+ let a = _mm256_setr_epi64x(4, 3, 2, 5);
+ let r = _mm256_extractf128_si256::<0>(a);
+ let e = _mm_setr_epi64x(4, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_zeroall() {
+ _mm256_zeroall();
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_zeroupper() {
+ _mm256_zeroupper();
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_permutevar_ps() {
+ let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_permutevar_ps(a, b);
+ let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_permutevar_ps() {
+ let a = _mm_setr_ps(4., 3., 2., 5.);
+ let b = _mm_setr_epi32(1, 2, 3, 4);
+ let r = _mm_permutevar_ps(a, b);
+ let e = _mm_setr_ps(3., 2., 5., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_permute_ps() {
+ let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let r = _mm256_permute_ps::<0x1b>(a);
+ let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_permute_ps() {
+ let a = _mm_setr_ps(4., 3., 2., 5.);
+ let r = _mm_permute_ps::<0x1b>(a);
+ let e = _mm_setr_ps(5., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_permutevar_pd() {
+ let a = _mm256_setr_pd(4., 3., 2., 5.);
+ let b = _mm256_setr_epi64x(1, 2, 3, 4);
+ let r = _mm256_permutevar_pd(a, b);
+ let e = _mm256_setr_pd(4., 3., 5., 2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_permutevar_pd() {
+ let a = _mm_setr_pd(4., 3.);
+ let b = _mm_setr_epi64x(3, 0);
+ let r = _mm_permutevar_pd(a, b);
+ let e = _mm_setr_pd(3., 4.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_permute_pd() {
+ let a = _mm256_setr_pd(4., 3., 2., 5.);
+ let r = _mm256_permute_pd::<5>(a);
+ let e = _mm256_setr_pd(3., 4., 5., 2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_permute_pd() {
+ let a = _mm_setr_pd(4., 3.);
+ let r = _mm_permute_pd::<1>(a);
+ let e = _mm_setr_pd(3., 4.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_permute2f128_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
+ let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
+ let r = _mm256_permute2f128_ps::<0x13>(a, b);
+ let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_permute2f128_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_permute2f128_pd::<0x31>(a, b);
+ let e = _mm256_setr_pd(3., 4., 7., 8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_permute2f128_si256() {
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
+ let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
+ let r = _mm256_permute2f128_si256::<0x20>(a, b);
+ let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_broadcast_ss() {
+ let r = _mm256_broadcast_ss(&3.);
+ let e = _mm256_set1_ps(3.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_broadcast_ss() {
+ let r = _mm_broadcast_ss(&3.);
+ let e = _mm_set1_ps(3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_broadcast_sd() {
+ let r = _mm256_broadcast_sd(&3.);
+ let e = _mm256_set1_pd(3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_broadcast_ps() {
+ let a = _mm_setr_ps(4., 3., 2., 5.);
+ let r = _mm256_broadcast_ps(&a);
+ let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_broadcast_pd() {
+ let a = _mm_setr_pd(4., 3.);
+ let r = _mm256_broadcast_pd(&a);
+ let e = _mm256_setr_pd(4., 3., 4., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_insertf128_ps() {
+ let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let b = _mm_setr_ps(4., 9., 16., 25.);
+ let r = _mm256_insertf128_ps::<0>(a, b);
+ let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_insertf128_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm_setr_pd(5., 6.);
+ let r = _mm256_insertf128_pd::<0>(a, b);
+ let e = _mm256_setr_pd(5., 6., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_insertf128_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let b = _mm_setr_epi64x(5, 6);
+ let r = _mm256_insertf128_si256::<0>(a, b);
+ let e = _mm256_setr_epi64x(5, 6, 3, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_insert_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm256_insert_epi8::<31>(a, 0);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_insert_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r = _mm256_insert_epi16::<15>(a, 0);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_insert_epi32() {
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_insert_epi32::<7>(a, 0);
+ let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_load_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let p = &a as *const _ as *const f64;
+ let r = _mm256_load_pd(p);
+ let e = _mm256_setr_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_store_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let mut r = _mm256_undefined_pd();
+ _mm256_store_pd(&mut r as *mut _ as *mut f64, a);
+ assert_eq_m256d(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_load_ps() {
+ let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let p = &a as *const _ as *const f32;
+ let r = _mm256_load_ps(p);
+ let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_store_ps() {
+ let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ let mut r = _mm256_undefined_ps();
+ _mm256_store_ps(&mut r as *mut _ as *mut f32, a);
+ assert_eq_m256(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_loadu_pd() {
+ let a = &[1.0f64, 2., 3., 4.];
+ let p = a.as_ptr();
+ let r = _mm256_loadu_pd(black_box(p));
+ let e = _mm256_setr_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_storeu_pd() {
+ let a = _mm256_set1_pd(9.);
+ let mut r = _mm256_undefined_pd();
+ _mm256_storeu_pd(&mut r as *mut _ as *mut f64, a);
+ assert_eq_m256d(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_loadu_ps() {
+ let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
+ let p = a.as_ptr();
+ let r = _mm256_loadu_ps(black_box(p));
+ let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_storeu_ps() {
+ let a = _mm256_set1_ps(9.);
+ let mut r = _mm256_undefined_ps();
+ _mm256_storeu_ps(&mut r as *mut _ as *mut f32, a);
+ assert_eq_m256(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_load_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let p = &a as *const _;
+ let r = _mm256_load_si256(p);
+ let e = _mm256_setr_epi64x(1, 2, 3, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_store_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let mut r = _mm256_undefined_si256();
+ _mm256_store_si256(&mut r as *mut _, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_loadu_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let p = &a as *const _;
+ let r = _mm256_loadu_si256(black_box(p));
+ let e = _mm256_setr_epi64x(1, 2, 3, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_storeu_si256() {
+ let a = _mm256_set1_epi8(9);
+ let mut r = _mm256_undefined_si256();
+ _mm256_storeu_si256(&mut r as *mut _, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_maskload_pd() {
+ let a = &[1.0f64, 2., 3., 4.];
+ let p = a.as_ptr();
+ let mask = _mm256_setr_epi64x(0, !0, 0, !0);
+ let r = _mm256_maskload_pd(black_box(p), mask);
+ let e = _mm256_setr_pd(0., 2., 0., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_maskstore_pd() {
+ let mut r = _mm256_set1_pd(0.);
+ let mask = _mm256_setr_epi64x(0, !0, 0, !0);
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ _mm256_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a);
+ let e = _mm256_setr_pd(0., 2., 0., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_maskload_pd() {
+ let a = &[1.0f64, 2.];
+ let p = a.as_ptr();
+ let mask = _mm_setr_epi64x(0, !0);
+ let r = _mm_maskload_pd(black_box(p), mask);
+ let e = _mm_setr_pd(0., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_maskstore_pd() {
+ let mut r = _mm_set1_pd(0.);
+ let mask = _mm_setr_epi64x(0, !0);
+ let a = _mm_setr_pd(1., 2.);
+ _mm_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a);
+ let e = _mm_setr_pd(0., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_maskload_ps() {
+ let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
+ let p = a.as_ptr();
+ let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
+ let r = _mm256_maskload_ps(black_box(p), mask);
+ let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_maskstore_ps() {
+ let mut r = _mm256_set1_ps(0.);
+ let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ _mm256_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a);
+ let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_maskload_ps() {
+ let a = &[1.0f32, 2., 3., 4.];
+ let p = a.as_ptr();
+ let mask = _mm_setr_epi32(0, !0, 0, !0);
+ let r = _mm_maskload_ps(black_box(p), mask);
+ let e = _mm_setr_ps(0., 2., 0., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_maskstore_ps() {
+ let mut r = _mm_set1_ps(0.);
+ let mask = _mm_setr_epi32(0, !0, 0, !0);
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ _mm_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a);
+ let e = _mm_setr_ps(0., 2., 0., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_movehdup_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_movehdup_ps(a);
+ let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_moveldup_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_moveldup_ps(a);
+ let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_movedup_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let r = _mm256_movedup_pd(a);
+ let e = _mm256_setr_pd(1., 1., 3., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_lddqu_si256() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let p = &a as *const _;
+ let r = _mm256_lddqu_si256(black_box(p));
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_stream_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let mut r = _mm256_undefined_si256();
+ _mm256_stream_si256(&mut r as *mut _, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_stream_pd() {
+ #[repr(align(32))]
+ struct Memory {
+ pub data: [f64; 4],
+ }
+ let a = _mm256_set1_pd(7.0);
+ let mut mem = Memory { data: [-1.0; 4] };
+
+ _mm256_stream_pd(&mut mem.data[0] as *mut f64, a);
+ for i in 0..4 {
+ assert_eq!(mem.data[i], get_m256d(a, i));
+ }
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_stream_ps() {
+ #[repr(align(32))]
+ struct Memory {
+ pub data: [f32; 8],
+ }
+ let a = _mm256_set1_ps(7.0);
+ let mut mem = Memory { data: [-1.0; 8] };
+
+ _mm256_stream_ps(&mut mem.data[0] as *mut f32, a);
+ for i in 0..8 {
+ assert_eq!(mem.data[i], get_m256(a, i));
+ }
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_rcp_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_rcp_ps(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_ps(
+ 0.99975586, 0.49987793, 0.33325195, 0.24993896,
+ 0.19995117, 0.16662598, 0.14282227, 0.12496948,
+ );
+ let rel_err = 0.00048828125;
+ for i in 0..8 {
+ assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
+ }
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_rsqrt_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_rsqrt_ps(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_ps(
+ 0.99975586, 0.7069092, 0.5772705, 0.49987793,
+ 0.44714355, 0.40820313, 0.3779297, 0.3534546,
+ );
+ let rel_err = 0.00048828125;
+ for i in 0..8 {
+ assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
+ }
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_unpackhi_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_unpackhi_pd(a, b);
+ let e = _mm256_setr_pd(2., 6., 4., 8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_unpackhi_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+ let r = _mm256_unpackhi_ps(a, b);
+ let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_unpacklo_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_unpacklo_pd(a, b);
+ let e = _mm256_setr_pd(1., 5., 3., 7.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_unpacklo_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+ let r = _mm256_unpacklo_ps(a, b);
+ let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testz_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let b = _mm256_setr_epi64x(5, 6, 7, 8);
+ let r = _mm256_testz_si256(a, b);
+ assert_eq!(r, 0);
+ let b = _mm256_set1_epi64x(0);
+ let r = _mm256_testz_si256(a, b);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testc_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let b = _mm256_setr_epi64x(5, 6, 7, 8);
+ let r = _mm256_testc_si256(a, b);
+ assert_eq!(r, 0);
+ let b = _mm256_set1_epi64x(0);
+ let r = _mm256_testc_si256(a, b);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testnzc_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let b = _mm256_setr_epi64x(5, 6, 7, 8);
+ let r = _mm256_testnzc_si256(a, b);
+ assert_eq!(r, 1);
+ let a = _mm256_setr_epi64x(0, 0, 0, 0);
+ let b = _mm256_setr_epi64x(0, 0, 0, 0);
+ let r = _mm256_testnzc_si256(a, b);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testz_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_testz_pd(a, b);
+ assert_eq!(r, 1);
+ let a = _mm256_set1_pd(-1.);
+ let r = _mm256_testz_pd(a, a);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testc_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_testc_pd(a, b);
+ assert_eq!(r, 1);
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set1_pd(-1.);
+ let r = _mm256_testc_pd(a, b);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testnzc_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 6., 7., 8.);
+ let r = _mm256_testnzc_pd(a, b);
+ assert_eq!(r, 0);
+ let a = _mm256_setr_pd(1., -1., -1., -1.);
+ let b = _mm256_setr_pd(-1., -1., 1., 1.);
+ let r = _mm256_testnzc_pd(a, b);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_testz_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 6.);
+ let r = _mm_testz_pd(a, b);
+ assert_eq!(r, 1);
+ let a = _mm_set1_pd(-1.);
+ let r = _mm_testz_pd(a, a);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_testc_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 6.);
+ let r = _mm_testc_pd(a, b);
+ assert_eq!(r, 1);
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(-1.);
+ let r = _mm_testc_pd(a, b);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_testnzc_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 6.);
+ let r = _mm_testnzc_pd(a, b);
+ assert_eq!(r, 0);
+ let a = _mm_setr_pd(1., -1.);
+ let b = _mm_setr_pd(-1., -1.);
+ let r = _mm_testnzc_pd(a, b);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testz_ps() {
+ let a = _mm256_set1_ps(1.);
+ let r = _mm256_testz_ps(a, a);
+ assert_eq!(r, 1);
+ let a = _mm256_set1_ps(-1.);
+ let r = _mm256_testz_ps(a, a);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testc_ps() {
+ let a = _mm256_set1_ps(1.);
+ let r = _mm256_testc_ps(a, a);
+ assert_eq!(r, 1);
+ let b = _mm256_set1_ps(-1.);
+ let r = _mm256_testc_ps(a, b);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_testnzc_ps() {
+ let a = _mm256_set1_ps(1.);
+ let r = _mm256_testnzc_ps(a, a);
+ assert_eq!(r, 0);
+ let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
+ let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
+ let r = _mm256_testnzc_ps(a, b);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_testz_ps() {
+ let a = _mm_set1_ps(1.);
+ let r = _mm_testz_ps(a, a);
+ assert_eq!(r, 1);
+ let a = _mm_set1_ps(-1.);
+ let r = _mm_testz_ps(a, a);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_testc_ps() {
+ let a = _mm_set1_ps(1.);
+ let r = _mm_testc_ps(a, a);
+ assert_eq!(r, 1);
+ let b = _mm_set1_ps(-1.);
+ let r = _mm_testc_ps(a, b);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm_testnzc_ps() {
+ let a = _mm_set1_ps(1.);
+ let r = _mm_testnzc_ps(a, a);
+ assert_eq!(r, 0);
+ let a = _mm_setr_ps(1., -1., -1., -1.);
+ let b = _mm_setr_ps(-1., -1., 1., 1.);
+ let r = _mm_testnzc_ps(a, b);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_movemask_pd() {
+ let a = _mm256_setr_pd(1., -2., 3., -4.);
+ let r = _mm256_movemask_pd(a);
+ assert_eq!(r, 0xA);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_movemask_ps() {
+ let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
+ let r = _mm256_movemask_ps(a);
+ assert_eq!(r, 0xAA);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setzero_pd() {
+ let r = _mm256_setzero_pd();
+ assert_eq_m256d(r, _mm256_set1_pd(0.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setzero_ps() {
+ let r = _mm256_setzero_ps();
+ assert_eq_m256(r, _mm256_set1_ps(0.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setzero_si256() {
+ let r = _mm256_setzero_si256();
+ assert_eq_m256i(r, _mm256_set1_epi8(0));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_pd() {
+ let r = _mm256_set_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_ps() {
+ let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_epi8() {
+ #[rustfmt::skip]
+ let r = _mm256_set_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 32, 31, 30, 29, 28, 27, 26, 25,
+ 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4, 3, 2, 1
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_epi16() {
+ #[rustfmt::skip]
+ let r = _mm256_set_epi16(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 16, 15, 14, 13, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4, 3, 2, 1,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_epi32() {
+ let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_epi64x() {
+ let r = _mm256_set_epi64x(1, 2, 3, 4);
+ assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_pd() {
+ let r = _mm256_setr_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_ps() {
+ let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_epi8() {
+ #[rustfmt::skip]
+ let r = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_epi16() {
+ #[rustfmt::skip]
+ let r = _mm256_setr_epi16(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_epi32() {
+ let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_epi64x() {
+ let r = _mm256_setr_epi64x(1, 2, 3, 4);
+ assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set1_pd() {
+ let r = _mm256_set1_pd(1.);
+ assert_eq_m256d(r, _mm256_set1_pd(1.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set1_ps() {
+ let r = _mm256_set1_ps(1.);
+ assert_eq_m256(r, _mm256_set1_ps(1.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set1_epi8() {
+ let r = _mm256_set1_epi8(1);
+ assert_eq_m256i(r, _mm256_set1_epi8(1));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set1_epi16() {
+ let r = _mm256_set1_epi16(1);
+ assert_eq_m256i(r, _mm256_set1_epi16(1));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set1_epi32() {
+ let r = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, _mm256_set1_epi32(1));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set1_epi64x() {
+ let r = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, _mm256_set1_epi64x(1));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castpd_ps() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let r = _mm256_castpd_ps(a);
+ let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castps_pd() {
+ let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
+ let r = _mm256_castps_pd(a);
+ let e = _mm256_setr_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castps_si256() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_castps_si256(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 0, 0, -128, 63, 0, 0, 0, 64,
+ 0, 0, 64, 64, 0, 0, -128, 64,
+ 0, 0, -96, 64, 0, 0, -64, 64,
+ 0, 0, -32, 64, 0, 0, 0, 65,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castsi256_ps() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 0, -128, 63, 0, 0, 0, 64,
+ 0, 0, 64, 64, 0, 0, -128, 64,
+ 0, 0, -96, 64, 0, 0, -64, 64,
+ 0, 0, -32, 64, 0, 0, 0, 65,
+ );
+ let r = _mm256_castsi256_ps(a);
+ let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castpd_si256() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let r = _mm256_castpd_si256(a);
+ assert_eq_m256d(transmute(r), a);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castsi256_pd() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let r = _mm256_castsi256_pd(a);
+ assert_eq_m256d(r, transmute(a));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castps256_ps128() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_castps256_ps128(a);
+ assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castpd256_pd128() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let r = _mm256_castpd256_pd128(a);
+ assert_eq_m128d(r, _mm_setr_pd(1., 2.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_castsi256_si128() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let r = _mm256_castsi256_si128(a);
+ assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_zextps128_ps256() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let r = _mm256_zextps128_ps256(a);
+ let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_zextsi128_si256() {
+ let a = _mm_setr_epi64x(1, 2);
+ let r = _mm256_zextsi128_si256(a);
+ let e = _mm256_setr_epi64x(1, 2, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_zextpd128_pd256() {
+ let a = _mm_setr_pd(1., 2.);
+ let r = _mm256_zextpd128_pd256(a);
+ let e = _mm256_setr_pd(1., 2., 0., 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_m128() {
+ let hi = _mm_setr_ps(5., 6., 7., 8.);
+ let lo = _mm_setr_ps(1., 2., 3., 4.);
+ let r = _mm256_set_m128(hi, lo);
+ let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_m128d() {
+ let hi = _mm_setr_pd(3., 4.);
+ let lo = _mm_setr_pd(1., 2.);
+ let r = _mm256_set_m128d(hi, lo);
+ let e = _mm256_setr_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_set_m128i() {
+ #[rustfmt::skip]
+ let hi = _mm_setr_epi8(
+ 17, 18, 19, 20,
+ 21, 22, 23, 24,
+ 25, 26, 27, 28,
+ 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let lo = _mm_setr_epi8(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16,
+ );
+ let r = _mm256_set_m128i(hi, lo);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_m128() {
+ let lo = _mm_setr_ps(1., 2., 3., 4.);
+ let hi = _mm_setr_ps(5., 6., 7., 8.);
+ let r = _mm256_setr_m128(lo, hi);
+ let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_m128d() {
+ let lo = _mm_setr_pd(1., 2.);
+ let hi = _mm_setr_pd(3., 4.);
+ let r = _mm256_setr_m128d(lo, hi);
+ let e = _mm256_setr_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_setr_m128i() {
+ #[rustfmt::skip]
+ let lo = _mm_setr_epi8(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16,
+ );
+ #[rustfmt::skip]
+ let hi = _mm_setr_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm256_setr_m128i(lo, hi);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_loadu2_m128() {
+ let hi = &[5., 6., 7., 8.];
+ let hiaddr = hi.as_ptr();
+ let lo = &[1., 2., 3., 4.];
+ let loaddr = lo.as_ptr();
+ let r = _mm256_loadu2_m128(hiaddr, loaddr);
+ let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_loadu2_m128d() {
+ let hi = &[3., 4.];
+ let hiaddr = hi.as_ptr();
+ let lo = &[1., 2.];
+ let loaddr = lo.as_ptr();
+ let r = _mm256_loadu2_m128d(hiaddr, loaddr);
+ let e = _mm256_setr_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_loadu2_m128i() {
+ #[rustfmt::skip]
+ let hi = _mm_setr_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let lo = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ let r = _mm256_loadu2_m128i(&hi as *const _ as *const _, &lo as *const _ as *const _);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_storeu2_m128() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let mut hi = _mm_undefined_ps();
+ let mut lo = _mm_undefined_ps();
+ _mm256_storeu2_m128(
+ &mut hi as *mut _ as *mut f32,
+ &mut lo as *mut _ as *mut f32,
+ a,
+ );
+ assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
+ assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_storeu2_m128d() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let mut hi = _mm_undefined_pd();
+ let mut lo = _mm_undefined_pd();
+ _mm256_storeu2_m128d(
+ &mut hi as *mut _ as *mut f64,
+ &mut lo as *mut _ as *mut f64,
+ a,
+ );
+ assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
+ assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_storeu2_m128i() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let mut hi = _mm_undefined_si128();
+ let mut lo = _mm_undefined_si128();
+ _mm256_storeu2_m128i(&mut hi as *mut _, &mut lo as *mut _, a);
+ #[rustfmt::skip]
+ let e_hi = _mm_setr_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32
+ );
+ #[rustfmt::skip]
+ let e_lo = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16
+ );
+
+ assert_eq_m128i(hi, e_hi);
+ assert_eq_m128i(lo, e_lo);
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_cvtss_f32() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_cvtss_f32(a);
+ assert_eq!(r, 1.);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
new file mode 100644
index 000000000..081609ece
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -0,0 +1,5908 @@
+//! Advanced Vector Extensions 2 (AVX2)
+//!
+//! AVX2 expands most AVX commands to 256-bit wide vector registers and
+//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
+//! overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
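+// Editor's note (illustrative sketch, not part of the upstream file): user
+// code is expected to guard calls to these intrinsics with a runtime
+// feature check before entering an `unsafe` block. A minimal pattern,
+// assuming `std` is available (the wrapper name `add_i32x8` is
+// hypothetical):
+//
+// fn add_i32x8(a: __m256i, b: __m256i) -> Option<__m256i> {
+//     if std::is_x86_feature_detected!("avx2") {
+//         // SAFETY: AVX2 support was just verified at runtime.
+//         Some(unsafe { _mm256_add_epi32(a, b) })
+//     } else {
+//         None
+//     }
+// }
+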
+/// Computes the absolute values of packed 32-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
+ transmute(pabsd(a.as_i32x8()))
+}
+
+/// Computes the absolute values of packed 16-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
+ transmute(pabsw(a.as_i16x16()))
+}
+
+/// Computes the absolute values of packed 8-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
+ transmute(pabsb(a.as_i8x32()))
+}
+
+/// Adds packed 64-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Adds packed 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Adds packed 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Adds packed 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Adds packed 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
+}
+
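+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): saturating addition clamps at the
+// type's bounds instead of wrapping.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_adds_epi8_saturates() -> __m256i {
+ let a = _mm256_set1_epi8(120);
+ let b = _mm256_set1_epi8(120);
+ // Every lane is 127 (i8::MAX), not the -16 that wrapping addition gives.
+ _mm256_adds_epi8(a, b)
+}
+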
+/// Adds packed 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
+/// result, shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ // If palignr is shifting the pair of vectors by at least the size of two
+ // lanes, emit zero.
+ if IMM8 >= 32 {
+ return _mm256_set1_epi8(0);
+ }
+ // If palignr is shifting the pair of input vectors by at least one lane,
+ // but less than two lanes, convert to shifting in zeroes.
+ let (a, b) = if IMM8 >= 16 {
+ (_mm256_set1_epi8(0), a)
+ } else {
+ (a, b)
+ };
+
+ let a = a.as_i8x32();
+ let b = b.as_i8x32();
+
+ let r: i8x32 = match IMM8 % 16 {
+ 0 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ ],
+ ),
+ 1 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 48,
+ ],
+ ),
+ 2 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 48, 49,
+ ],
+ ),
+ 3 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
+ ],
+ ),
+ 4 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
+ ],
+ ),
+ 5 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
+ ],
+ ),
+ 6 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
+ ],
+ ),
+ 7 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
+ ],
+ ),
+ 8 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
+ 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
+ ],
+ ),
+ 9 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
+ 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+ ],
+ ),
+ 10 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
+ 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+ ],
+ ),
+ 11 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ ],
+ ),
+ 12 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ ],
+ ),
+ 13 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ ],
+ ),
+ 14 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ ],
+ ),
+ 15 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ ],
+ ),
+ _ => b,
+ };
+ transmute(r)
+}
+
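+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): passing the same vector as both
+// operands turns `alignr` into a byte rotation within each 128-bit lane.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_rotate_lanes_right_by_3(a: __m256i) -> __m256i {
+ // Concatenating each lane with itself and shifting right by 3 bytes
+ // rotates the 16 bytes of that lane right by 3.
+ _mm256_alignr_epi8::<3>(a, a)
+}
+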
+/// Computes the bitwise AND of 256 bits (representing integer data)
+/// in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vandps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Computes the bitwise NOT of 256 bits (representing integer data)
+/// in `a` and then AND with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vandnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
+ let all_ones = _mm256_set1_epi8(-1);
+ transmute(simd_and(
+ simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
+ b.as_i64x4(),
+ ))
+}
+
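+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): `andnot` computes `(!a) & b`, so it
+// clears in `b` every bit that is set in `a`.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_clear_low_nibbles(v: __m256i) -> __m256i {
+ let low_nibbles = _mm256_set1_epi8(0x0f);
+ // (!0x0f) & v == v & 0xf0 in every byte.
+ _mm256_andnot_si256(low_nibbles, v)
+}
+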
+/// Averages packed unsigned 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Averages packed unsigned 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm4!(IMM4);
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let r: i32x4 = simd_shuffle4!(
+ a,
+ b,
+ <const IMM4: i32> [
+ [0, 4, 0, 4][IMM4 as usize & 0b11],
+ [1, 1, 5, 5][IMM4 as usize & 0b11],
+ [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
+ [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
+ ],
+ );
+ transmute(r)
+}
+
+/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let r: i32x8 = simd_shuffle8!(
+ a,
+ b,
+ <const IMM8: i32> [
+ [0, 8, 0, 8][IMM8 as usize & 0b11],
+ [1, 1, 9, 9][IMM8 as usize & 0b11],
+ [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
+ [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
+ [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
+ [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
+ [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
+ [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
+ ],
+ );
+ transmute(r)
+}
+
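+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): bit `i` of the immediate selects 32-bit
+// lane `i` from `b` (bit set) or from `a` (bit clear).
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_take_low_half_from_b(a: __m256i, b: __m256i) -> __m256i {
+ // 0b0000_1111: lanes 0-3 come from `b`, lanes 4-7 from `a`.
+ _mm256_blend_epi32::<0b0000_1111>(a, b)
+}
+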
+/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x16();
+ let b = b.as_i16x16();
+
+ let r: i16x16 = simd_shuffle16!(
+ a,
+ b,
+ <const IMM8: i32> [
+ [0, 16, 0, 16][IMM8 as usize & 0b11],
+ [1, 1, 17, 17][IMM8 as usize & 0b11],
+ [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
+ [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
+ [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
+ [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
+ [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
+ [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
+ [8, 24, 8, 24][IMM8 as usize & 0b11],
+ [9, 9, 25, 25][IMM8 as usize & 0b11],
+ [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
+ [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
+ [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
+ [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
+ [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
+ [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
+ ],
+ );
+ transmute(r)
+}
+
+/// Blends packed 8-bit integers from `a` and `b` using `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpblendvb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
+ transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
+}
+
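+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): a variable blend keyed off a comparison
+// implements a per-byte signed maximum.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_max_epi8(a: __m256i, b: __m256i) -> __m256i {
+ // Bytes where `b > a` have all bits set in `gt`, so `blendv` picks the
+ // byte from `b` there and from `a` everywhere else.
+ let gt = _mm256_cmpgt_epi8(b, a);
+ _mm256_blendv_epi8(a, b, gt)
+}
+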
+/// Broadcasts the low packed 8-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle16!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
+ transmute::<i8x16, _>(ret)
+}
+
+/// Broadcasts the low packed 8-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle32!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
+ transmute::<i8x32, _>(ret)
+}
+
+// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
+// often compiled to `vbroadcastss`.
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle4!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
+ transmute::<i32x4, _>(ret)
+}
+
+// N.B., `simd_shuffle8` with integer data types for `a` and `b` is
+// often compiled to `vbroadcastss`.
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle8!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
+ transmute::<i32x8, _>(ret)
+}
+
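+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): splatting a scalar across all eight
+// 32-bit lanes, equivalent to `_mm256_set1_epi32(x)`.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_splat_i32(x: i32) -> __m256i {
+ // Move `x` into the low element of a 128-bit vector, then broadcast it.
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128(x))
+}
+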
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+// FIXME: https://github.com/rust-lang/stdarch/issues/791
+#[cfg_attr(test, assert_instr(vmovddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
+ let ret = simd_shuffle2!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
+ transmute::<i64x2, _>(ret)
+}
+
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
+ let ret = simd_shuffle4!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
+ transmute::<i64x4, _>(ret)
+}
+
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
+ simd_shuffle2!(a, _mm_setzero_pd(), [0_u32; 2])
+}
+
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
+ simd_shuffle4!(a, _mm_setzero_pd(), [0_u32; 4])
+}
+
+// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
+// `vbroadcastf128`.
+/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle4!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
+ transmute::<i64x4, _>(ret)
+}
+
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
+ simd_shuffle4!(a, _mm_setzero_ps(), [0_u32; 4])
+}
+
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
+ simd_shuffle8!(a, _mm_setzero_ps(), [0_u32; 8])
+}
+
+/// Broadcasts the low packed 16-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle8!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
+ transmute::<i16x8, _>(ret)
+}
+
+/// Broadcasts the low packed 16-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle16!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
+ transmute::<i16x16, _>(ret)
+}
+
+/// Compares packed 64-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
+}
+
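+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): comparisons return all-ones (-1) in
+// matching lanes and zero elsewhere, a mask shape that combines directly
+// with the blend and movemask intrinsics.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_count_equal_bytes(a: __m256i, b: __m256i) -> u32 {
+ let eq = _mm256_cmpeq_epi8(a, b);
+ // `_mm256_movemask_epi8` packs one bit per byte lane; count the hits.
+ _mm256_movemask_epi8(eq).count_ones()
+}
+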
+/// Compares packed 64-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Sign-extend 16-bit integers to 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
+ transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
+}
+
+/// Sign-extend 16-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
+ let a = a.as_i16x8();
+ let v64: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute::<i64x4, _>(simd_cast(v64))
+}
+
+/// Sign-extend 32-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
+ transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
+}
+
+/// Sign-extend 8-bit integers to 16-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
+ transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
+}
+
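+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): sign extension preserves negative
+// values, unlike the zero-extending `_mm256_cvtepu8_epi16` below.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_sign_extend_bytes() -> __m256i {
+ let a = _mm_set1_epi8(-5);
+ // Every 16-bit lane is -5 (0xFFFB), not 251 (0x00FB).
+ _mm256_cvtepi8_epi16(a)
+}
+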
+/// Sign-extend 8-bit integers to 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
+ let a = a.as_i8x16();
+ let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute::<i32x8, _>(simd_cast(v64))
+}
+
+/// Sign-extend 8-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
+ let a = a.as_i8x16();
+ let v32: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute::<i64x4, _>(simd_cast(v32))
+}
+
+/// Zero-extend unsigned 16-bit integers in `a` to 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
+ transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
+}
+
+/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
+/// integers. The upper four elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
+ let a = a.as_u16x8();
+ let v64: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute::<i64x4, _>(simd_cast(v64))
+}
+
+/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
+ transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
+}
+
+/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
+ transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
+}
+
+/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
+/// integers. The upper eight elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
+ let a = a.as_u8x16();
+ let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute::<i32x8, _>(simd_cast(v64))
+}
+
+/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
+/// integers. The upper twelve elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
+ let a = a.as_u8x16();
+ let v32: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute::<i64x4, _>(simd_cast(v32))
+}
+
+/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
+ static_assert_imm1!(IMM1);
+ let a = a.as_i64x4();
+ let b = _mm256_undefined_si256().as_i64x4();
+ let dst: i64x2 = simd_shuffle2!(a, b, <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize]);
+ transmute(dst)
+}
+
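+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): `IMM1` picks which 128-bit half to
+// extract; 0 is the lower half, 1 the upper.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_high_half(a: __m256i) -> __m128i {
+ _mm256_extracti128_si256::<1>(a)
+}
+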
+/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
+}
+
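+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): horizontal adds interleave per 128-bit
+// lane; the result is not simply the pair sums of `a` followed by `b`.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_hadd_epi32() -> __m256i {
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+ // Yields [3, 7, 30, 70, 11, 15, 110, 150].
+ _mm256_hadd_epi32(a, b)
+}
+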
+/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
+/// using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
+/// using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
+ slice: *const i32,
+ offsets: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i32x4();
+ let neg_one = _mm_set1_epi32(-1).as_i32x4();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
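+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): with `SCALE = 4`, the 32-bit offsets
+// are element indices into an `i32` slice. The caller must guarantee that
+// every index is in bounds.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_gather_even_indices(data: &[i32; 8]) -> __m128i {
+ let idx = _mm_setr_epi32(0, 2, 4, 6);
+ // Loads data[0], data[2], data[4] and data[6].
+ _mm_i32gather_epi32::<4>(data.as_ptr(), idx)
+}
+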
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i32,
+ offsets: __m128i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x4();
+ let mask = mask.as_i32x4();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
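+// Editor's illustrative sketch (not part of the upstream source; the
+// `example_*` name is hypothetical): only lanes whose `mask` element has its
+// highest bit set are loaded; the remaining lanes are filled from `src` and
+// no load is performed for them.
+#[cfg(test)]
+#[target_feature(enable = "avx2")]
+unsafe fn example_masked_gather(data: &[i32; 8]) -> __m128i {
+ let idx = _mm_setr_epi32(0, 1, 2, 3);
+ let src = _mm_set1_epi32(-1);
+ // -1 has its MSB set (gather); 0 does not (keep `src`). Lanes 0 and 2
+ // load data[0] and data[2]; lanes 1 and 3 stay -1.
+ let mask = _mm_setr_epi32(-1, 0, -1, 0);
+ _mm_mask_i32gather_epi32::<4>(src, data.as_ptr(), idx, mask)
+}
+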
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
+ slice: *const i32,
+ offsets: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ let neg_one = _mm256_set1_epi32(-1).as_i32x8();
+ let offsets = offsets.as_i32x8();
+ let slice = slice as *const i8;
+ let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
+ src: __m256i,
+ slice: *const i32,
+ offsets: __m256i,
+ mask: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x8();
+ let mask = mask.as_i32x8();
+ let offsets = offsets.as_i32x8();
+ let slice = slice as *const i8;
+ let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_ps();
+ let neg_one = _mm_set1_ps(-1.0);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
+ src: __m128,
+ slice: *const f32,
+ offsets: __m128i,
+ mask: __m128,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ pgatherdps(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_ps();
+ let neg_one = _mm256_set1_ps(-1.0);
+ let offsets = offsets.as_i32x8();
+ let slice = slice as *const i8;
+ vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
+ src: __m256,
+ slice: *const f32,
+ offsets: __m256i,
+ mask: __m256,
+) -> __m256 {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i32x8();
+ let slice = slice as *const i8;
+ vpgatherdps(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
+ slice: *const i64,
+ offsets: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i64x2();
+ let neg_one = _mm_set1_epi64x(-1).as_i64x2();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i64,
+ offsets: __m128i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x2();
+ let mask = mask.as_i64x2();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
+ slice: *const i64,
+ offsets: __m128i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
+ src: __m256i,
+ slice: *const i64,
+ offsets: __m128i,
+ mask: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x4();
+ let mask = mask.as_i64x4();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_pd();
+ let neg_one = _mm_set1_pd(-1.0);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
+ src: __m128d,
+ slice: *const f64,
+ offsets: __m128i,
+ mask: __m128d,
+) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ pgatherdpd(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
+ slice: *const f64,
+ offsets: __m128i,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_pd();
+ let neg_one = _mm256_set1_pd(-1.0);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
+ src: __m256d,
+ slice: *const f64,
+ offsets: __m128i,
+ mask: __m256d,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
+ slice: *const i32,
+ offsets: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i32x4();
+ let neg_one = _mm_set1_epi64x(-1).as_i32x4();
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i32,
+ offsets: __m128i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x4();
+ let mask = mask.as_i32x4();
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
+ slice: *const i32,
+ offsets: __m256i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i32x4();
+ let neg_one = _mm_set1_epi64x(-1).as_i32x4();
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i32,
+ offsets: __m256i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x4();
+ let mask = mask.as_i32x4();
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_ps();
+ let neg_one = _mm_set1_ps(-1.0);
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
+ src: __m128,
+ slice: *const f32,
+ offsets: __m128i,
+ mask: __m128,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ pgatherqps(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_ps();
+ let neg_one = _mm_set1_ps(-1.0);
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
+/// of the corresponding `mask` element is not set, the value is taken from
+/// `src` instead of being loaded.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
+ src: __m128,
+ slice: *const f32,
+ offsets: __m256i,
+ mask: __m128,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ vpgatherqps(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
+ slice: *const i64,
+ offsets: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i64x2();
+ let neg_one = _mm_set1_epi64x(-1).as_i64x2();
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x2();
+ let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
+/// the corresponding element of `mask`, the value from `src` in that
+/// position is used instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i64,
+ offsets: __m128i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x2();
+ let mask = mask.as_i64x2();
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
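+/// A short doc-test sketch with illustrative data; `SCALE` is 8 to match
+/// the byte width of `i64`:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let data = [10_i64, 20, 30, 40, 50];
+/// let offsets = _mm256_setr_epi64x(4, 2, 1, 0);
+/// let r = _mm256_i64gather_epi64(data.as_ptr(), offsets, 8);
+/// let e = _mm256_setr_epi64x(50, 30, 20, 10);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///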
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
+ slice: *const i64,
+ offsets: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x4();
+ let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
+/// the corresponding element of `mask`, the value from `src` in that
+/// position is used instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
+ src: __m256i,
+ slice: *const i64,
+ offsets: __m256i,
+ mask: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x4();
+ let mask = mask.as_i64x4();
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_pd();
+ let neg_one = _mm_set1_pd(-1.0);
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x2();
+ pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
+/// the corresponding element of `mask`, the value from `src` in that
+/// position is used instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
+ src: __m128d,
+ slice: *const f64,
+ offsets: __m128i,
+ mask: __m128d,
+) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x2();
+ pgatherqpd(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
+ slice: *const f64,
+ offsets: __m256i,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_pd();
+ let neg_one = _mm256_set1_pd(-1.0);
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x4();
+ vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
+/// the corresponding element of `mask`, the value from `src` in that
+/// position is used instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
+ src: __m256d,
+ slice: *const f64,
+ offsets: __m256i,
+ mask: __m256d,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x4();
+ vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at
+/// the location specified by `IMM1`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsertf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
+ static_assert_imm1!(IMM1);
+ let a = a.as_i64x4();
+ let b = _mm256_castsi128_si256(b).as_i64x4();
+ let dst: i64x4 =
+ simd_shuffle4!(a, b, <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
+ transmute(dst)
+}
+
+/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
+/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
+/// of intermediate 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Vertically multiplies each unsigned 8-bit integer from `a` with the
+/// corresponding signed 8-bit integer from `b`, producing intermediate
+/// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
+/// signed 16-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
+/// (elements are zeroed out when the highest bit is not set in the
+/// corresponding element).
+///
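+/// A small sketch of a partial load with illustrative values; only lanes
+/// whose mask element has its highest bit set are read from memory:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let data = [1_i32, 2, 3, 4];
+/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
+/// let r = _mm_maskload_epi32(data.as_ptr(), mask);
+/// let e = _mm_setr_epi32(1, 0, 3, 0);
+/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, e)), 0xffff);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///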
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
+ transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
+}
+
+/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
+/// (elements are zeroed out when the highest bit is not set in the
+/// corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
+ transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
+}
+
+/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
+/// (elements are zeroed out when the highest bit is not set in the
+/// corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
+ transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
+}
+
+/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
+/// (elements are zeroed out when the highest bit is not set in the
+/// corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
+ transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
+}
+
+/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
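+/// A small sketch of a partial store with illustrative values:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let mut out = [0_i32; 4];
+/// let a = _mm_setr_epi32(10, 20, 30, 40);
+/// // Only lanes 0 and 3 are written; the other lanes keep their old value.
+/// let mask = _mm_setr_epi32(-1, 0, 0, -1);
+/// _mm_maskstore_epi32(out.as_mut_ptr(), mask, a);
+/// assert_eq!(out, [10, 0, 0, 40]);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///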
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
+ maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
+}
+
+/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
+ maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
+}
+
+/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
+ maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
+}
+
+/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
+ maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
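+/// For example, with illustrative constants:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi16(2);
+/// let b = _mm256_set1_epi16(7);
+/// // Every lane of the result holds max(2, 7) == 7.
+/// let r = _mm256_max_epi16(a, b);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, b)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///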
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminud(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminub(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Creates a mask from the most significant bit of each 8-bit element in
+/// `a`, and returns the result.
+///
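+/// A small sketch of the two extreme cases:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // All sign bits set yields an all-ones mask; none set yields zero.
+/// assert_eq!(_mm256_movemask_epi8(_mm256_set1_epi8(-1)), !0);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_set1_epi8(1)), 0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///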
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovmskb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
+ pmovmskb(a.as_i8x32())
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
+/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
+/// results in dst. Eight SADs are performed for each 128-bit lane using one
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight
+/// quadruplets are formed from sequential 8-bit integers selected from `a`
+/// starting at the offset specified in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8))
+}
+
+/// Multiplies the low 32-bit integers from each packed 64-bit element in
+/// `a` and `b`.
+///
+/// Returns the 64-bit results.
+///
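+/// A short sketch with illustrative values:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Only the even (low) 32-bit lanes take part in the multiplication.
+/// let a = _mm256_setr_epi32(1, 99, -2, 99, 3, 99, 4, 99);
+/// let b = _mm256_setr_epi32(5, 99, 6, 99, -7, 99, 8, 99);
+/// let r = _mm256_mul_epi32(a, b);
+/// let e = _mm256_setr_epi64x(5, -12, -21, 32);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///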
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
+/// element in `a` and `b`.
+///
+/// Returns the unsigned 64-bit results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers and returning the high 16 bits of the
+/// intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers and returning the high 16 bits of the
+/// intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers, and returns the low 16 bits of the
+/// intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Multiplies the packed 32-bit integers in `a` and `b`, producing
+/// intermediate 64-bit integers, and returns the low 32 bits of the
+/// intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiplies packed 16-bit integers in `a` and `b`, producing
+/// intermediate signed 32-bit integers. Truncates each intermediate
+/// integer to the 18 most significant bits, rounds by adding 1, and
+/// returns bits `[16:1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
+/// and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using unsigned saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using unsigned saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Permutes packed 32-bit integers from `a` according to the content of `b`.
+///
+/// The last 3 bits of each integer of `b` are used as addresses into the 8
+/// integers of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(permd(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Permutes 64-bit integers from `a` using control mask `imm8`.
+///
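+/// A small sketch that reverses the four elements (the control value is
+/// illustrative):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi64x(10, 20, 30, 40);
+/// // 0b00_01_10_11 selects elements 3, 2, 1, 0 in that order.
+/// let r = _mm256_permute4x64_epi64(a, 0b00_01_10_11);
+/// let e = _mm256_setr_epi64x(40, 30, 20, 10);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///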
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ let r: i64x4 = simd_shuffle4!(
+ a.as_i64x4(),
+ zero,
+ <const IMM8: i32> [
+ IMM8 as u32 & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8))
+}
+
+/// Shuffles 64-bit floating-point elements in `a` across lanes using the
+/// control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
+ static_assert_imm8!(IMM8);
+ simd_shuffle4!(
+ a,
+ _mm256_undefined_pd(),
+ <const IMM8: i32> [
+ IMM8 as u32 & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ ],
+ )
+}
+
+/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
+/// the corresponding 32-bit integer index in `idx`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
+ permps(a, idx.as_i32x8())
+}
+
+/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
+/// and `b`, then horizontally sums each run of 8 consecutive differences to
+/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
+/// integers in the low 16 bits of the four 64-bit elements of the result.
+///
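+/// A short sketch with illustrative constants:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi8(3);
+/// let b = _mm256_set1_epi8(1);
+/// // Each 64-bit element receives the sum of eight |3 - 1| differences.
+/// let r = _mm256_sad_epu8(a, b);
+/// let e = _mm256_set1_epi64x(16);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///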
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsadbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Shuffles bytes from `a` according to the content of `b`.
+///
+/// For each of the 128-bit low and high halves of the vectors, the last
+/// 4 bits of each byte of `b` are used as addresses into the respective
+/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
+///
+/// In addition, if the most significant bit of a byte of `b` is set, the
+/// respective destination byte is set to 0.
+///
+/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
+/// equivalent to:
+///
+/// ```
+/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
+/// let mut r = [0; 32];
+/// for i in 0..16 {
+/// // if the most significant bit of b is set,
+/// // then the destination byte is set to 0.
+/// if b[i] & 0x80 == 0u8 {
+/// r[i] = a[(b[i] % 16) as usize];
+/// }
+/// if b[i + 16] & 0x80 == 0u8 {
+/// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
+/// }
+/// }
+/// r
+/// }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
+/// `imm8`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+///
+/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
+/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
+///
+/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
+/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
+///
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r: i32x8 = simd_shuffle8!(
+ a.as_i32x8(),
+ a.as_i32x8(),
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ (MASK as u32 >> 4) & 0b11,
+ (MASK as u32 >> 6) & 0b11,
+ (MASK as u32 & 0b11) + 4,
+ ((MASK as u32 >> 2) & 0b11) + 4,
+ ((MASK as u32 >> 4) & 0b11) + 4,
+ ((MASK as u32 >> 6) & 0b11) + 4,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x16();
+ let r: i16x16 = simd_shuffle16!(
+ a,
+ a,
+ <const IMM8: i32> [
+ 0,
+ 1,
+ 2,
+ 3,
+ 4 + (IMM8 as u32 & 0b11),
+ 4 + ((IMM8 as u32 >> 2) & 0b11),
+ 4 + ((IMM8 as u32 >> 4) & 0b11),
+ 4 + ((IMM8 as u32 >> 6) & 0b11),
+ 8,
+ 9,
+ 10,
+ 11,
+ 12 + (IMM8 as u32 & 0b11),
+ 12 + ((IMM8 as u32 >> 2) & 0b11),
+ 12 + ((IMM8 as u32 >> 4) & 0b11),
+ 12 + ((IMM8 as u32 >> 6) & 0b11),
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x16();
+ let r: i16x16 = simd_shuffle16!(
+ a,
+ a,
+ <const IMM8: i32> [
+ 0 + (IMM8 as u32 & 0b11),
+ 0 + ((IMM8 as u32 >> 2) & 0b11),
+ 0 + ((IMM8 as u32 >> 4) & 0b11),
+ 0 + ((IMM8 as u32 >> 6) & 0b11),
+ 4,
+ 5,
+ 6,
+ 7,
+ 8 + (IMM8 as u32 & 0b11),
+ 8 + ((IMM8 as u32 >> 2) & 0b11),
+ 8 + ((IMM8 as u32 >> 4) & 0b11),
+ 8 + ((IMM8 as u32 >> 6) & 0b11),
+ 12,
+ 13,
+ 14,
+ 15,
+ ],
+ );
+ transmute(r)
+}
+
+/// Negates packed 16-bit integers in `a` when the corresponding signed
+/// 16-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
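+/// A small sketch of the negation case with illustrative constants:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi16(5);
+/// // A negative element in `b` negates the corresponding lane of `a`.
+/// let r = _mm256_sign_epi16(a, _mm256_set1_epi16(-1));
+/// let e = _mm256_set1_epi16(-5);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, e)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///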
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(psignw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Negates packed 32-bit integers in `a` when the corresponding signed
+/// 32-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(psignd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Negates packed 8-bit integers in `a` when the corresponding signed
+/// 8-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(psignb(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Shifts packed 16-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psllw(a.as_i16x16(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
+ transmute(pslld(a.as_i32x8(), count.as_i32x4()))
+}
+
+/// Shifts packed 64-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psllq(a.as_i64x4(), count.as_i64x2()))
+}
+
+/// Shifts packed 16-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(pslliw(a.as_i16x16(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psllid(a.as_i32x8(), IMM8))
+}
+
+/// Shifts packed 64-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(pslliq(a.as_i64x4(), IMM8))
+}
+
+/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ _mm256_bslli_epi128::<IMM8>(a)
+}
+
+/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ // Computes the shuffle index for output byte `i`: bytes shifted in from
+ // the left become `zero` (index 0); the remaining bytes come from `a`,
+ // whose bytes occupy indices 32..=63 of the `(zero, a)` shuffle input.
+ const fn mask(shift: i32, i: u32) -> u32 {
+ let shift = shift as u32 & 0xff;
+ if shift > 15 || i % 16 < shift {
+ 0
+ } else {
+ 32 + (i - shift)
+ }
+ }
+ let a = a.as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ let r: i8x32 = simd_shuffle32!(
+ zero,
+ a,
+ <const IMM8: i32> [
+ mask(IMM8, 0),
+ mask(IMM8, 1),
+ mask(IMM8, 2),
+ mask(IMM8, 3),
+ mask(IMM8, 4),
+ mask(IMM8, 5),
+ mask(IMM8, 6),
+ mask(IMM8, 7),
+ mask(IMM8, 8),
+ mask(IMM8, 9),
+ mask(IMM8, 10),
+ mask(IMM8, 11),
+ mask(IMM8, 12),
+ mask(IMM8, 13),
+ mask(IMM8, 14),
+ mask(IMM8, 15),
+ mask(IMM8, 16),
+ mask(IMM8, 17),
+ mask(IMM8, 18),
+ mask(IMM8, 19),
+ mask(IMM8, 20),
+ mask(IMM8, 21),
+ mask(IMM8, 22),
+ mask(IMM8, 23),
+ mask(IMM8, 24),
+ mask(IMM8, 25),
+ mask(IMM8, 26),
+ mask(IMM8, 27),
+ mask(IMM8, 28),
+ mask(IMM8, 29),
+ mask(IMM8, 30),
+ mask(IMM8, 31),
+ ],
+ );
+ transmute(r)
+}
+
+/// Shifts packed 32-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
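+/// A short sketch of per-lane shift amounts (values are illustrative):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm_set1_epi32(1);
+/// // Each lane uses its own shift amount: 1 << {0, 1, 2, 3}.
+/// let r = _mm_sllv_epi32(a, _mm_setr_epi32(0, 1, 2, 3));
+/// let e = _mm_setr_epi32(1, 2, 4, 8);
+/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, e)), 0xffff);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///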
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts packed 32-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
+}
+
+/// Shifts packed 64-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
+}
+
+/// Shifts packed 64-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psraw(a.as_i16x16(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psrad(a.as_i32x8(), count.as_i32x4()))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psraiw(a.as_i16x16(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while
+/// shifting in sign bits.
+///
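+/// A small sketch with an illustrative shift count:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi32(-16);
+/// // The arithmetic shift preserves the sign: -16 >> 2 == -4 per lane.
+/// let r = _mm256_srai_epi32(a, 2);
+/// let e = _mm256_set1_epi32(-4);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///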
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psraid(a.as_i32x8(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by the
+/// corresponding element in `count` while shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psravd(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by the
+/// corresponding element in `count` while shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
+}
+
+/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ _mm256_bsrli_epi128::<IMM8>(a)
+}
+
+/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ // Output bytes come from `a` (indices 0..=31) or from `zero` (index 32).
+ // `IMM8` is asserted to fit in 8 bits, so any shift of 16 or more bytes
+ // falls through to the all-zero arm, as the instruction requires.
+ let r: i8x32 = match IMM8 {
+ 0 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ ],
+ ),
+ 1 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ],
+ ),
+ 2 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 32,
+ ],
+ ),
+ 3 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
+ ],
+ ),
+ 4 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
+ ],
+ ),
+ 5 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 6 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 7 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 8 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, 28,
+ 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 9 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, 29,
+ 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 10 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, 30,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 11 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 12 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 13 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 14 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 15 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ _ => zero,
+ };
+ transmute(r)
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psrld(a.as_i32x8(), count.as_i32x4()))
+}
+
+/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psrliw(a.as_i16x16(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psrlid(a.as_i32x8(), IMM8))
+}
+
+/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psrliq(a.as_i64x4(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
+}
+
+/// Shifts packed 64-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
+}
+
+/// Shifts packed 64-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
+}
+
+// TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
+
+/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
+///
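+/// A minimal lane-wise sketch (illustrative values; AVX2 support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Illustrative values: 5 is subtracted from every 32-bit lane.
+/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+/// let b = _mm256_set1_epi32(5);
+/// let r = _mm256_sub_epi32(a, b);
+/// let expected = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///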
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
+///
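+/// A minimal lane-wise sketch (illustrative values; AVX2 support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Illustrative values: each lane of `b` is subtracted from the same lane of `a`.
+/// let a = _mm256_setr_epi64x(100, 200, 300, 400);
+/// let b = _mm256_setr_epi64x(1, 2, 3, 4);
+/// let r = _mm256_sub_epi64(a, b);
+/// let expected = _mm256_setr_epi64x(99, 198, 297, 396);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///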
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
+/// `a` using saturation.
+///
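+/// A brief sketch of the saturating behavior at the lower bound (illustrative
+/// values; AVX2 support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi16(i16::MIN);
+/// let b = _mm256_set1_epi16(1);
+/// // Illustrative values: the difference saturates at i16::MIN instead of wrapping.
+/// let r = _mm256_subs_epi16(a, b);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, a)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///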
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
+/// `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
+/// 16-bit integers in `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
+/// 8-bit integers in `a` using saturation.
+///
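+/// A short sketch of unsigned saturation at zero (illustrative values; AVX2
+/// support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi8(1);
+/// let b = _mm256_set1_epi8(3);
+/// // Illustrative values: unsigned 1 - 3 saturates to 0 rather than wrapping.
+/// let r = _mm256_subs_epu8(a, b);
+/// let expected = _mm256_set1_epi8(0);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///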
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Unpacks and interleaves 8-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi8(
+/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+/// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+/// -30, -31,
+/// );
+///
+/// let c = _mm256_unpackhi_epi8(a, b);
+///
+/// let expected = _mm256_setr_epi8(
+/// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
+/// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
+/// -31,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
+ #[rustfmt::skip]
+ let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
+ 8, 40, 9, 41, 10, 42, 11, 43,
+ 12, 44, 13, 45, 14, 46, 15, 47,
+ 24, 56, 25, 57, 26, 58, 27, 59,
+ 28, 60, 29, 61, 30, 62, 31, 63,
+ ]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 8-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi8(
+/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+/// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+/// -30, -31,
+/// );
+///
+/// let c = _mm256_unpacklo_epi8(a, b);
+///
+/// let expected = _mm256_setr_epi8(
+/// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
+/// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
+ #[rustfmt::skip]
+ let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
+ 0, 32, 1, 33, 2, 34, 3, 35,
+ 4, 36, 5, 37, 6, 38, 7, 39,
+ 16, 48, 17, 49, 18, 50, 19, 51,
+ 20, 52, 21, 53, 22, 54, 23, 55,
+ ]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 16-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi16(
+/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
+///
+/// let c = _mm256_unpackhi_epi16(a, b);
+///
+/// let expected = _mm256_setr_epi16(
+/// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
+ let r: i16x16 = simd_shuffle16!(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
+ );
+ transmute(r)
+}
+
+/// Unpacks and interleaves 16-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi16(
+/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
+///
+/// let c = _mm256_unpacklo_epi16(a, b);
+///
+/// let expected = _mm256_setr_epi16(
+/// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
+ let r: i16x16 = simd_shuffle16!(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
+ );
+ transmute(r)
+}
+
+/// Unpacks and interleaves 32-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
+///
+/// let c = _mm256_unpackhi_epi32(a, b);
+///
+/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
+ let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 32-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
+///
+/// let c = _mm256_unpacklo_epi32(a, b);
+///
+/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
+ let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 64-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
+/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
+///
+/// let c = _mm256_unpackhi_epi64(a, b);
+///
+/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
+ let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 64-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
+/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
+///
+/// let c = _mm256_unpacklo_epi64(a, b);
+///
+/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
+ let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
+ transmute(r)
+}
+
+/// Computes the bitwise XOR of 256 bits (representing integer data)
+/// in `a` and `b`.
+///
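+/// A minimal sketch of the bitwise behavior (illustrative values; AVX2
+/// support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Illustrative values: 0b0101 ^ 0b0011 == 0b0110 in every byte.
+/// let a = _mm256_set1_epi8(0b0101);
+/// let b = _mm256_set1_epi8(0b0011);
+/// let r = _mm256_xor_si256(a, b);
+/// let expected = _mm256_set1_epi8(0b0110);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///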
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vxorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
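+/// A small sketch of the zero-extension behavior (illustrative value; AVX2
+/// support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi8(-1);
+/// // Illustrative value: the selected byte (0xFF) is zero-extended, so 255 is returned.
+/// assert_eq!(_mm256_extract_epi8::<7>(a), 255);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///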
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
+ static_assert_imm5!(INDEX);
+ simd_extract::<_, u8>(a.as_u8x32(), INDEX as u32) as i32
+}
+
+/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
+ static_assert_imm4!(INDEX);
+ simd_extract::<_, u16>(a.as_u16x16(), INDEX as u32) as i32
+}
+
+/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
+///
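+/// A minimal usage sketch (illustrative values; AVX2 support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Illustrative values: lane 1 holds -1.
+/// let a = _mm256_setr_epi32(0, -1, 2, 3, 4, 5, 6, 7);
+/// assert_eq!(_mm256_extract_epi32::<1>(a), -1);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///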
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
+ static_assert_imm3!(INDEX);
+ simd_extract(a.as_i32x8(), INDEX as u32)
+}
+
+/// Returns the first element of the input vector of `[4 x double]`.
+///
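+/// A minimal usage sketch (illustrative values; AVX2 support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Illustrative values: only the lowest double is returned.
+/// let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+/// assert_eq!(_mm256_cvtsd_f64(a), 1.0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///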
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
+#[inline]
+#[target_feature(enable = "avx2")]
+//#[cfg_attr(test, assert_instr(movsd))] FIXME
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
+ simd_extract(a, 0)
+}
+
+/// Returns the first element of the input vector of `[8 x i32]`.
+///
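+/// A minimal usage sketch (illustrative values; AVX2 support assumed):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Illustrative values: only the lowest 32-bit lane is returned.
+/// let a = _mm256_setr_epi32(7, 1, 2, 3, 4, 5, 6, 8);
+/// assert_eq!(_mm256_cvtsi256_si32(a), 7);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///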
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
+#[inline]
+#[target_feature(enable = "avx2")]
+//#[cfg_attr(test, assert_instr(movd))] FIXME
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
+ simd_extract(a.as_i32x8(), 0)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx2.pabs.b"]
+ fn pabsb(a: i8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.pabs.w"]
+ fn pabsw(a: i16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pabs.d"]
+ fn pabsd(a: i32x8) -> u32x8;
+ #[link_name = "llvm.x86.avx2.pavg.b"]
+ fn pavgb(a: u8x32, b: u8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.pavg.w"]
+ fn pavgw(a: u16x16, b: u16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pblendvb"]
+ fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx2.phadd.w"]
+ fn phaddw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.phadd.d"]
+ fn phaddd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.phadd.sw"]
+ fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.phsub.w"]
+ fn phsubw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.phsub.d"]
+ fn phsubd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.phsub.sw"]
+ fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pmadd.wd"]
+ fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
+ #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
+ fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
+ #[link_name = "llvm.x86.avx2.maskload.d"]
+ fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.maskload.d.256"]
+ fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.maskload.q"]
+ fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.avx2.maskload.q.256"]
+ fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx2.maskstore.d"]
+ fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
+ #[link_name = "llvm.x86.avx2.maskstore.d.256"]
+ fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
+ #[link_name = "llvm.x86.avx2.maskstore.q"]
+ fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
+ #[link_name = "llvm.x86.avx2.maskstore.q.256"]
+ fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
+ #[link_name = "llvm.x86.avx2.pmaxs.w"]
+ fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pmaxs.d"]
+ fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.pmaxs.b"]
+ fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx2.pmaxu.w"]
+ fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pmaxu.d"]
+ fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
+ #[link_name = "llvm.x86.avx2.pmaxu.b"]
+ fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.pmins.w"]
+ fn pminsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pmins.d"]
+ fn pminsd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.pmins.b"]
+ fn pminsb(a: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx2.pminu.w"]
+ fn pminuw(a: u16x16, b: u16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pminu.d"]
+ fn pminud(a: u32x8, b: u32x8) -> u32x8;
+ #[link_name = "llvm.x86.avx2.pminu.b"]
+ fn pminub(a: u8x32, b: u8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.pmovmskb"]
+ fn pmovmskb(a: i8x32) -> i32;
+ #[link_name = "llvm.x86.avx2.mpsadbw"]
+ fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pmulhu.w"]
+ fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pmulh.w"]
+ fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pmul.dq"]
+ fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
+ #[link_name = "llvm.x86.avx2.pmulu.dq"]
+ fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
+ #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
+ fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.packsswb"]
+ fn packsswb(a: i16x16, b: i16x16) -> i8x32;
+ #[link_name = "llvm.x86.avx2.packssdw"]
+ fn packssdw(a: i32x8, b: i32x8) -> i16x16;
+ #[link_name = "llvm.x86.avx2.packuswb"]
+ fn packuswb(a: i16x16, b: i16x16) -> u8x32;
+ #[link_name = "llvm.x86.avx2.packusdw"]
+ fn packusdw(a: i32x8, b: i32x8) -> u16x16;
+ #[link_name = "llvm.x86.avx2.psad.bw"]
+ fn psadbw(a: u8x32, b: u8x32) -> u64x4;
+ #[link_name = "llvm.x86.avx2.psign.b"]
+ fn psignb(a: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx2.psign.w"]
+ fn psignw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psign.d"]
+ fn psignd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psll.w"]
+ fn psllw(a: i16x16, count: i16x8) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psll.d"]
+ fn pslld(a: i32x8, count: i32x4) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psll.q"]
+ fn psllq(a: i64x4, count: i64x2) -> i64x4;
+ #[link_name = "llvm.x86.avx2.pslli.w"]
+ fn pslliw(a: i16x16, imm8: i32) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pslli.d"]
+ fn psllid(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx2.pslli.q"]
+ fn pslliq(a: i64x4, imm8: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psllv.d"]
+ fn psllvd(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.psllv.d.256"]
+ fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psllv.q"]
+ fn psllvq(a: i64x2, count: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.avx2.psllv.q.256"]
+ fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psra.w"]
+ fn psraw(a: i16x16, count: i16x8) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psra.d"]
+ fn psrad(a: i32x8, count: i32x4) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrai.w"]
+ fn psraiw(a: i16x16, imm8: i32) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psrai.d"]
+ fn psraid(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrav.d"]
+ fn psravd(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.psrav.d.256"]
+ fn psravd256(a: i32x8, count: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrl.w"]
+ fn psrlw(a: i16x16, count: i16x8) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psrl.d"]
+ fn psrld(a: i32x8, count: i32x4) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrl.q"]
+ fn psrlq(a: i64x4, count: i64x2) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psrli.w"]
+ fn psrliw(a: i16x16, imm8: i32) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psrli.d"]
+ fn psrlid(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrli.q"]
+ fn psrliq(a: i64x4, imm8: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psrlv.d"]
+ fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.psrlv.d.256"]
+ fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrlv.q"]
+ fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.avx2.psrlv.q.256"]
+ fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx2.pshuf.b"]
+ fn pshufb(a: u8x32, b: u8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.permd"]
+ fn permd(a: u32x8, b: u32x8) -> u32x8;
+ #[link_name = "llvm.x86.avx2.permps"]
+ fn permps(a: __m256, b: i32x8) -> __m256;
+ #[link_name = "llvm.x86.avx2.vperm2i128"]
+ fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
+ #[link_name = "llvm.x86.avx2.gather.d.d"]
+ fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
+ #[link_name = "llvm.x86.avx2.gather.d.d.256"]
+ fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.gather.d.q"]
+ fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
+ #[link_name = "llvm.x86.avx2.gather.d.q.256"]
+ fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
+ #[link_name = "llvm.x86.avx2.gather.q.d"]
+ fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
+ #[link_name = "llvm.x86.avx2.gather.q.d.256"]
+ fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
+ #[link_name = "llvm.x86.avx2.gather.q.q"]
+ fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
+ #[link_name = "llvm.x86.avx2.gather.q.q.256"]
+ fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
+ #[link_name = "llvm.x86.avx2.gather.d.pd"]
+ fn pgatherdpd(
+ src: __m128d,
+ slice: *const i8,
+ offsets: i32x4,
+ mask: __m128d,
+ scale: i8,
+ ) -> __m128d;
+ #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
+ fn vpgatherdpd(
+ src: __m256d,
+ slice: *const i8,
+ offsets: i32x4,
+ mask: __m256d,
+ scale: i8,
+ ) -> __m256d;
+ #[link_name = "llvm.x86.avx2.gather.q.pd"]
+ fn pgatherqpd(
+ src: __m128d,
+ slice: *const i8,
+ offsets: i64x2,
+ mask: __m128d,
+ scale: i8,
+ ) -> __m128d;
+ #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
+ fn vpgatherqpd(
+ src: __m256d,
+ slice: *const i8,
+ offsets: i64x4,
+ mask: __m256d,
+ scale: i8,
+ ) -> __m256d;
+ #[link_name = "llvm.x86.avx2.gather.d.ps"]
+ fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
+ -> __m128;
+ #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
+ fn vpgatherdps(
+ src: __m256,
+ slice: *const i8,
+ offsets: i32x8,
+ mask: __m256,
+ scale: i8,
+ ) -> __m256;
+ #[link_name = "llvm.x86.avx2.gather.q.ps"]
+ fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
+ -> __m128;
+ #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
+ fn vpgatherqps(
+ src: __m128,
+ slice: *const i8,
+ offsets: i64x4,
+ mask: __m128,
+ scale: i8,
+ ) -> __m128;
+ #[link_name = "llvm.x86.avx2.psll.dq"]
+ fn vpslldq(a: i64x4, b: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psrl.dq"]
+ fn vpsrldq(a: i64x4, b: i32) -> i64x4;
+}
+
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi32() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi32(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi32(
+ 0, 1, 1, i32::MAX,
+ i32::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi16(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi8(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_add_epi64() {
+ let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
+ let b = _mm256_setr_epi64x(-1, 0, 1, 2);
+ let r = _mm256_add_epi64(a, b);
+ let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_add_epi32() {
+ let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
+ let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_add_epi32(a, b);
+ let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_add_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r = _mm256_add_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_add_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm256_add_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ );
+ let r = _mm256_adds_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78,
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi8_saturate_positive() {
+ let a = _mm256_set1_epi8(0x7F);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_adds_epi8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi8_saturate_negative() {
+ let a = _mm256_set1_epi8(-0x80);
+ let b = _mm256_set1_epi8(-1);
+ let r = _mm256_adds_epi8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi16(
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ );
+ let r = _mm256_adds_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi16_saturate_positive() {
+ let a = _mm256_set1_epi16(0x7FFF);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_adds_epi16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi16_saturate_negative() {
+ let a = _mm256_set1_epi16(-0x8000);
+ let b = _mm256_set1_epi16(-1);
+ let r = _mm256_adds_epi16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epu8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ );
+ let r = _mm256_adds_epu8(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78,
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epu8_saturate() {
+ let a = _mm256_set1_epi8(!0);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_adds_epu8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epu16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi16(
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ );
+ let r = _mm256_adds_epu16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epu16_saturate() {
+ let a = _mm256_set1_epi16(!0);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_adds_epu16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_and_si256() {
+ let a = _mm256_set1_epi8(5);
+ let b = _mm256_set1_epi8(3);
+ let got = _mm256_and_si256(a, b);
+ assert_eq_m256i(got, _mm256_set1_epi8(1));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_andnot_si256() {
+ let a = _mm256_set1_epi8(5);
+ let b = _mm256_set1_epi8(3);
+ let got = _mm256_andnot_si256(a, b);
+ assert_eq_m256i(got, _mm256_set1_epi8(2));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_avg_epu8() {
+ let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
+ let r = _mm256_avg_epu8(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi8(6));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_avg_epu16() {
+ let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
+ let r = _mm256_avg_epu16(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi16(6));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_blend_epi32() {
+ let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
+ let e = _mm_setr_epi32(9, 3, 3, 3);
+ let r = _mm_blend_epi32::<0x01>(a, b);
+ assert_eq_m128i(r, e);
+
+ let r = _mm_blend_epi32::<0x0E>(b, a);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_blend_epi32() {
+ let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
+ let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
+ let r = _mm256_blend_epi32::<0x01>(a, b);
+ assert_eq_m256i(r, e);
+
+ let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
+ let r = _mm256_blend_epi32::<0x82>(a, b);
+ assert_eq_m256i(r, e);
+
+ let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
+ let r = _mm256_blend_epi32::<0x7C>(a, b);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_blend_epi16() {
+ let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
+ let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
+ let r = _mm256_blend_epi16::<0x01>(a, b);
+ assert_eq_m256i(r, e);
+
+ let r = _mm256_blend_epi16::<0xFE>(b, a);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_blendv_epi8() {
+ let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
+ let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
+ let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
+ let r = _mm256_blendv_epi8(a, b, mask);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastb_epi8() {
+ let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
+ let res = _mm_broadcastb_epi8(a);
+ assert_eq_m128i(res, _mm_set1_epi8(0x2a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastb_epi8() {
+ let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
+ let res = _mm256_broadcastb_epi8(a);
+ assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastd_epi32() {
+ let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
+ let res = _mm_broadcastd_epi32(a);
+ assert_eq_m128i(res, _mm_set1_epi32(0x2a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastd_epi32() {
+ let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
+ let res = _mm256_broadcastd_epi32(a);
+ assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastq_epi64() {
+ let a = _mm_setr_epi64x(0x1ffffffff, 0);
+ let res = _mm_broadcastq_epi64(a);
+ assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastq_epi64() {
+ let a = _mm_setr_epi64x(0x1ffffffff, 0);
+ let res = _mm256_broadcastq_epi64(a);
+ assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastsd_pd() {
+ let a = _mm_setr_pd(6.28, 3.14);
+ let res = _mm_broadcastsd_pd(a);
+ assert_eq_m128d(res, _mm_set1_pd(6.28f64));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastsd_pd() {
+ let a = _mm_setr_pd(6.28, 3.14);
+ let res = _mm256_broadcastsd_pd(a);
+ assert_eq_m256d(res, _mm256_set1_pd(6.28f64));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastsi128_si256() {
+ let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
+ let res = _mm256_broadcastsi128_si256(a);
+ let retval = _mm256_setr_epi64x(
+ 0x0987654321012334,
+ 0x5678909876543210,
+ 0x0987654321012334,
+ 0x5678909876543210,
+ );
+ assert_eq_m256i(res, retval);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastss_ps() {
+ let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
+ let res = _mm_broadcastss_ps(a);
+ assert_eq_m128(res, _mm_set1_ps(6.28f32));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastss_ps() {
+ let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
+ let res = _mm256_broadcastss_ps(a);
+ assert_eq_m256(res, _mm256_set1_ps(6.28f32));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastw_epi16() {
+ let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
+ let res = _mm_broadcastw_epi16(a);
+ assert_eq_m128i(res, _mm_set1_epi16(0x22b));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastw_epi16() {
+ let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
+ let res = _mm256_broadcastw_epi16(a);
+ assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpeq_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 31, 30, 2, 28, 27, 26, 25, 24,
+ 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,
+ );
+ let r = _mm256_cmpeq_epi8(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpeq_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi16(
+ 15, 14, 2, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,
+ );
+ let r = _mm256_cmpeq_epi16(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpeq_epi32() {
+ let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
+ let r = _mm256_cmpeq_epi32(a, b);
+ let e = _mm256_set1_epi32(0);
+ let e = _mm256_insert_epi32::<2>(e, !0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpeq_epi64() {
+ let a = _mm256_setr_epi64x(0, 1, 2, 3);
+ let b = _mm256_setr_epi64x(3, 2, 2, 0);
+ let r = _mm256_cmpeq_epi64(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpgt_epi8() {
+ let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
+ let b = _mm256_set1_epi8(0);
+ let r = _mm256_cmpgt_epi8(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpgt_epi16() {
+ let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
+ let b = _mm256_set1_epi16(0);
+ let r = _mm256_cmpgt_epi16(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpgt_epi32() {
+ let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
+ let b = _mm256_set1_epi32(0);
+ let r = _mm256_cmpgt_epi32(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpgt_epi64() {
+ let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
+ let b = _mm256_set1_epi64x(0);
+ let r = _mm256_cmpgt_epi64(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi8_epi16() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 0, -1, 1, -2, 2, -3, 3,
+ -4, 4, -5, 5, -6, 6, -7, 7,
+ );
+ #[rustfmt::skip]
+ let r = _mm256_setr_epi16(
+ 0, 0, -1, 1, -2, 2, -3, 3,
+ -4, 4, -5, 5, -6, 6, -7, 7,
+ );
+ assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi8_epi32() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 0, -1, 1, -2, 2, -3, 3,
+ -4, 4, -5, 5, -6, 6, -7, 7,
+ );
+ let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
+ assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi8_epi64() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 0, -1, 1, -2, 2, -3, 3,
+ -4, 4, -5, 5, -6, 6, -7, 7,
+ );
+ let r = _mm256_setr_epi64x(0, 0, -1, 1);
+ assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi16_epi32() {
+ let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
+ let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
+ assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi16_epi64() {
+ let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
+ let r = _mm256_setr_epi64x(0, 0, -1, 1);
+ assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi32_epi64() {
+ let a = _mm_setr_epi32(0, 0, -1, 1);
+ let r = _mm256_setr_epi64x(0, 0, -1, 1);
+ assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu16_epi32() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu16_epi64() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_setr_epi64x(0, 1, 2, 3);
+ assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu32_epi64() {
+ let a = _mm_setr_epi32(0, 1, 2, 3);
+ let r = _mm256_setr_epi64x(0, 1, 2, 3);
+ assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu8_epi16() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let r = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu8_epi32() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu8_epi64() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r = _mm256_setr_epi64x(0, 1, 2, 3);
+ assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_extracti128_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let r = _mm256_extracti128_si256::<1>(a);
+ let e = _mm_setr_epi64x(3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hadd_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hadd_epi16(a, b);
+ let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hadd_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_hadd_epi32(a, b);
+ let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hadds_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let a = _mm256_insert_epi16::<0>(a, 0x7fff);
+ let a = _mm256_insert_epi16::<1>(a, 1);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hadds_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0x7FFF, 4, 4, 4, 8, 8, 8, 8,
+ 4, 4, 4, 4, 8, 8, 8, 8,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hsub_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hsub_epi16(a, b);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hsub_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_hsub_epi32(a, b);
+ let e = _mm256_set1_epi32(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hsubs_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let a = _mm256_insert_epi16::<0>(a, 0x7fff);
+ let a = _mm256_insert_epi16::<1>(a, -1);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hsubs_epi16(a, b);
+ let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_madd_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_madd_epi16(a, b);
+ let e = _mm256_set1_epi32(16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_inserti128_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let b = _mm_setr_epi64x(7, 8);
+ let r = _mm256_inserti128_si256::<1>(a, b);
+ let e = _mm256_setr_epi64x(1, 2, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maddubs_epi16() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_maddubs_epi16(a, b);
+ let e = _mm256_set1_epi16(16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_maskload_epi32() {
+ let nums = [1, 2, 3, 4];
+ let a = &nums as *const i32;
+ let mask = _mm_setr_epi32(-1, 0, 0, -1);
+ let r = _mm_maskload_epi32(a, mask);
+ let e = _mm_setr_epi32(1, 0, 0, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maskload_epi32() {
+ let nums = [1, 2, 3, 4, 5, 6, 7, 8];
+ let a = &nums as *const i32;
+ let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+ let r = _mm256_maskload_epi32(a, mask);
+ let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_maskload_epi64() {
+ let nums = [1_i64, 2_i64];
+ let a = &nums as *const i64;
+ let mask = _mm_setr_epi64x(0, -1);
+ let r = _mm_maskload_epi64(a, mask);
+ let e = _mm_setr_epi64x(0, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maskload_epi64() {
+ let nums = [1_i64, 2_i64, 3_i64, 4_i64];
+ let a = &nums as *const i64;
+ let mask = _mm256_setr_epi64x(0, -1, -1, 0);
+ let r = _mm256_maskload_epi64(a, mask);
+ let e = _mm256_setr_epi64x(0, 2, 3, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_maskstore_epi32() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let mut arr = [-1, -1, -1, -1];
+ let mask = _mm_setr_epi32(-1, 0, 0, -1);
+ _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
+ let e = [1, -1, -1, 4];
+ assert_eq!(arr, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maskstore_epi32() {
+ let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
+ let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
+ let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+ _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
+ let e = [1, -1, -1, 42, -1, 6, 7, -1];
+ assert_eq!(arr, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_maskstore_epi64() {
+ let a = _mm_setr_epi64x(1_i64, 2_i64);
+ let mut arr = [-1_i64, -1_i64];
+ let mask = _mm_setr_epi64x(0, -1);
+ _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
+ let e = [-1, 2];
+ assert_eq!(arr, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maskstore_epi64() {
+ let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
+ let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
+ let mask = _mm256_setr_epi64x(0, -1, -1, 0);
+ _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
+ let e = [-1, 2, 3, -1];
+ assert_eq!(arr, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_max_epi16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_max_epi32(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epi8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_max_epi8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epu16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_max_epu16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epu32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_max_epu32(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_max_epu8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_min_epi16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_min_epi32(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epi8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_min_epi8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epu16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_min_epu16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epu32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_min_epu32(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_min_epu8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_movemask_epi8() {
+ let a = _mm256_set1_epi8(-1);
+ let r = _mm256_movemask_epi8(a);
+ let e = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mpsadbw_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_mpsadbw_epu8::<0>(a, b);
+ let e = _mm256_set1_epi16(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mul_epi32() {
+ let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
+ let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_mul_epi32(a, b);
+ let e = _mm256_setr_epi64x(0, 0, 10, 14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mul_epu32() {
+ let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
+ let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_mul_epu32(a, b);
+ let e = _mm256_setr_epi64x(0, 0, 10, 14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mulhi_epi16() {
+ let a = _mm256_set1_epi16(6535);
+ let b = _mm256_set1_epi16(6535);
+ let r = _mm256_mulhi_epi16(a, b);
+ let e = _mm256_set1_epi16(651);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mulhi_epu16() {
+ let a = _mm256_set1_epi16(6535);
+ let b = _mm256_set1_epi16(6535);
+ let r = _mm256_mulhi_epu16(a, b);
+ let e = _mm256_set1_epi16(651);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mullo_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_mullo_epi16(a, b);
+ let e = _mm256_set1_epi16(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mullo_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_mullo_epi32(a, b);
+ let e = _mm256_set1_epi32(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mulhrs_epi16() {
+ // `_mm256_mulhrs_epi16` computes (((a * b) >> 14) + 1) >> 1 per lane.
+ // In Q15 fixed point: 0.5 * 0.25 == 0.125, i.e. 0x4000 * 0x2000 -> 0x1000.
+ let a = _mm256_set1_epi16(0x4000);
+ let b = _mm256_set1_epi16(0x2000);
+ let r = _mm256_mulhrs_epi16(a, b);
+ let e = _mm256_set1_epi16(0x1000);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_or_si256() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(0);
+ let r = _mm256_or_si256(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_packs_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_packs_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_packs_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_packs_epi32(a, b);
+ let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_packus_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_packus_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_packus_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_packus_epi32(a, b);
+ let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sad_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_sad_epu8(a, b);
+ let e = _mm256_set1_epi64x(16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_shufflehi_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 11, 22, 33, 44,
+ 4, 5, 6, 7, 55, 66, 77, 88,
+ );
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0, 1, 2, 3, 44, 22, 22, 11,
+ 4, 5, 6, 7, 88, 66, 66, 55,
+ );
+ let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_shufflelo_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 11, 22, 33, 44, 0, 1, 2, 3,
+ 55, 66, 77, 88, 4, 5, 6, 7,
+ );
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 44, 22, 22, 11, 0, 1, 2, 3,
+ 88, 66, 66, 55, 4, 5, 6, 7,
+ );
+ let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(-1);
+ let r = _mm256_sign_epi16(a, b);
+ let e = _mm256_set1_epi16(-2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(-1);
+ let r = _mm256_sign_epi32(a, b);
+ let e = _mm256_set1_epi32(-2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(-1);
+ let r = _mm256_sign_epi8(a, b);
+ let e = _mm256_set1_epi8(-2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi16() {
+ let a = _mm256_set1_epi16(0xFF);
+ let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
+ let r = _mm256_sll_epi16(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi32() {
+ let a = _mm256_set1_epi32(0xFFFF);
+ let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
+ let r = _mm256_sll_epi32(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi64() {
+ let a = _mm256_set1_epi64x(0xFFFFFFFF);
+ let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
+ let r = _mm256_sll_epi64(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_slli_epi16() {
+ assert_eq_m256i(
+ _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
+ _mm256_set1_epi16(0xFF0),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_slli_epi32() {
+ assert_eq_m256i(
+ _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
+ _mm256_set1_epi32(0xFFFF0),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_slli_epi64() {
+ assert_eq_m256i(
+ _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
+ _mm256_set1_epi64x(0xFFFFFFFF0),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_slli_si256() {
+ let a = _mm256_set1_epi64x(0xFFFFFFFF);
+ let r = _mm256_slli_si256::<3>(a);
+ assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_sllv_epi32() {
+ let a = _mm_set1_epi32(2);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_sllv_epi32(a, b);
+ let e = _mm_set1_epi32(4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sllv_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_sllv_epi32(a, b);
+ let e = _mm256_set1_epi32(4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_sllv_epi64() {
+ let a = _mm_set1_epi64x(2);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_sllv_epi64(a, b);
+ let e = _mm_set1_epi64x(4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sllv_epi64() {
+ let a = _mm256_set1_epi64x(2);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_sllv_epi64(a, b);
+ let e = _mm256_set1_epi64x(4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sra_epi16() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm256_sra_epi16(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi16(-1));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sra_epi32() {
+ let a = _mm256_set1_epi32(-1);
+ let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
+ let r = _mm256_sra_epi32(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi32(-1));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srai_epi16() {
+ assert_eq_m256i(
+ _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
+ _mm256_set1_epi16(-1),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srai_epi32() {
+ assert_eq_m256i(
+ _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
+ _mm256_set1_epi32(-1),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_srav_epi32() {
+ let a = _mm_set1_epi32(4);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_srav_epi32(a, count);
+ let e = _mm_set1_epi32(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srav_epi32() {
+ let a = _mm256_set1_epi32(4);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_srav_epi32(a, count);
+ let e = _mm256_set1_epi32(2);
+ assert_eq_m256i(r, e);
+ }
+
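+    // srli_si256 shifts bytes right within each 128-bit lane independently,
+    // which is why the top three bytes of each lane become zero below.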
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srli_si256() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm256_srli_si256::<3>(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 0, 0, 0,
+ 20, 21, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 0, 0, 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi16() {
+ let a = _mm256_set1_epi16(0xFF);
+ let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
+ let r = _mm256_srl_epi16(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi16(0xF));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi32() {
+ let a = _mm256_set1_epi32(0xFFFF);
+ let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
+ let r = _mm256_srl_epi32(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi64() {
+ let a = _mm256_set1_epi64x(0xFFFFFFFF);
+ let b = _mm_setr_epi64x(4, 0);
+ let r = _mm256_srl_epi64(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srli_epi16() {
+ assert_eq_m256i(
+ _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
+ _mm256_set1_epi16(0xF),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srli_epi32() {
+ assert_eq_m256i(
+ _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
+ _mm256_set1_epi32(0xFFF),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srli_epi64() {
+ assert_eq_m256i(
+ _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
+ _mm256_set1_epi64x(0xFFFFFFF),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_srlv_epi32() {
+ let a = _mm_set1_epi32(2);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_srlv_epi32(a, count);
+ let e = _mm_set1_epi32(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srlv_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_srlv_epi32(a, count);
+ let e = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_srlv_epi64() {
+ let a = _mm_set1_epi64x(2);
+ let count = _mm_set1_epi64x(1);
+ let r = _mm_srlv_epi64(a, count);
+ let e = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srlv_epi64() {
+ let a = _mm256_set1_epi64x(2);
+ let count = _mm256_set1_epi64x(1);
+ let r = _mm256_srlv_epi64(a, count);
+ let e = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sub_epi16() {
+ let a = _mm256_set1_epi16(4);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_sub_epi16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sub_epi32() {
+ let a = _mm256_set1_epi32(4);
+ let b = _mm256_set1_epi32(2);
+ let r = _mm256_sub_epi32(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sub_epi64() {
+ let a = _mm256_set1_epi64x(4);
+ let b = _mm256_set1_epi64x(2);
+ let r = _mm256_sub_epi64(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sub_epi8() {
+ let a = _mm256_set1_epi8(4);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_sub_epi8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_subs_epi16() {
+ let a = _mm256_set1_epi16(4);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_subs_epi16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_subs_epi8() {
+ let a = _mm256_set1_epi8(4);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_subs_epi8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_subs_epu16() {
+ let a = _mm256_set1_epi16(4);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_subs_epu16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_subs_epu8() {
+ let a = _mm256_set1_epi8(4);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_subs_epu8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_xor_si256() {
+ let a = _mm256_set1_epi8(5);
+ let b = _mm256_set1_epi8(3);
+ let r = _mm256_xor_si256(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi8(6));
+ }
+
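+    // _mm256_alignr_epi8 concatenates corresponding 128-bit lanes of `a`
+    // (high) and `b` (low), shifts each 32-byte intermediate right by IMM8
+    // bytes, and keeps the low 16 bytes of each lane; shifting past the
+    // whole 32-byte intermediate yields zero, as the IMM8 = 33 case below
+    // checks.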
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_alignr_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ -1, -2, -3, -4, -5, -6, -7, -8,
+ -9, -10, -11, -12, -13, -14, -15, -16,
+ -17, -18, -19, -20, -21, -22, -23, -24,
+ -25, -26, -27, -28, -29, -30, -31, -32,
+ );
+ let r = _mm256_alignr_epi8::<33>(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi8(0));
+
+ let r = _mm256_alignr_epi8::<17>(a, b);
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 0,
+ 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 0,
+ );
+ assert_eq_m256i(r, expected);
+
+ let r = _mm256_alignr_epi8::<4>(a, b);
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ -5, -6, -7, -8, -9, -10, -11, -12,
+ -13, -14, -15, -16, 1, 2, 3, 4,
+ -21, -22, -23, -24, -25, -26, -27, -28,
+ -29, -30, -31, -32, 17, 18, 19, 20,
+ );
+ assert_eq_m256i(r, expected);
+
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+            -1, -2, -3, -4, -5, -6, -7, -8,
+            -9, -10, -11, -12, -13, -14, -15, -16,
+            -17, -18, -19, -20, -21, -22, -23, -24,
+            -25, -26, -27, -28, -29, -30, -31, -32,
+ );
+ let r = _mm256_alignr_epi8::<16>(a, b);
+ assert_eq_m256i(r, expected);
+
+ let r = _mm256_alignr_epi8::<15>(a, b);
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ -16, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ -32, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ assert_eq_m256i(r, expected);
+
+ let r = _mm256_alignr_epi8::<0>(a, b);
+ assert_eq_m256i(r, b);
+ }
+
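+    // For _mm256_shuffle_epi8, each output byte is selected from the same
+    // 128-bit lane of `a` by the low 4 bits of the control byte in `b`; a
+    // control byte with its high bit set (e.g. 128u8 below) zeroes the
+    // output byte.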
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+ 12, 5, 5, 10, 4, 1, 8, 0,
+ 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+ 12, 5, 5, 10, 4, 1, 8, 0,
+ );
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ 5, 0, 5, 4, 9, 13, 7, 4,
+ 13, 6, 6, 11, 5, 2, 9, 1,
+ 21, 0, 21, 20, 25, 29, 23, 20,
+ 29, 22, 22, 27, 21, 18, 25, 17,
+ );
+ let r = _mm256_shuffle_epi8(a, b);
+ assert_eq_m256i(r, expected);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permutevar8x32_epi32() {
+ let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
+ let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+ let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
+ let r = _mm256_permutevar8x32_epi32(a, b);
+ assert_eq_m256i(r, expected);
+ }
+
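+    // _mm256_permute4x64_epi64 reads two control bits per destination lane,
+    // low lane first: 0b00_01_00_11 selects source elements 3, 0, 1, 0,
+    // i.e. (400, 100, 200, 100).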
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permute4x64_epi64() {
+ let a = _mm256_setr_epi64x(100, 200, 300, 400);
+ let expected = _mm256_setr_epi64x(400, 100, 200, 100);
+ let r = _mm256_permute4x64_epi64::<0b00010011>(a);
+ assert_eq_m256i(r, expected);
+ }
+
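+    // _mm256_permute2x128_si256 builds each 128-bit half from two control
+    // bits: 0 = a.lo, 1 = a.hi, 2 = b.lo, 3 = b.hi (bits 3 and 7 would zero
+    // a half instead). Here imm[1:0] = 0b11 picks b.hi = (700, 800) for the
+    // low half and imm[5:4] = 0b01 picks a.hi = (500, 600) for the high half.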
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permute2x128_si256() {
+ let a = _mm256_setr_epi64x(100, 200, 500, 600);
+ let b = _mm256_setr_epi64x(300, 400, 700, 800);
+ let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
+ let e = _mm256_setr_epi64x(700, 800, 500, 600);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permute4x64_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
+ let e = _mm256_setr_pd(4., 1., 2., 1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permutevar8x32_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+ let r = _mm256_permutevar8x32_ps(a, b);
+ let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
+ assert_eq_m256(r, e);
+ }
+
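+    // A scalar model of the gathers tested below (a hedged sketch, not part
+    // of the tested API): SCALE is a byte multiplier, so element i is loaded
+    // from base + index[i] * SCALE. With 4-byte elements and SCALE = 4,
+    // index 16 therefore reads arr[16]. The i64/f32/f64 variants follow the
+    // same addressing rule with their own element sizes.
+    #[allow(dead_code)]
+    unsafe fn i32gather_model<const SCALE: usize>(base: *const i32, idx: [i32; 4]) -> [i32; 4] {
+        let mut out = [0i32; 4];
+        for i in 0..4 {
+            // SCALE is applied to the byte address, not the element index.
+            out[i] = *((base as *const u8).add(idx[i] as usize * SCALE) as *const i32);
+        }
+        out
+    }
+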
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i32gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm_mask_i32gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm_setr_epi32(-1, -1, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r =
+ _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_mask_i32gather_epi32::<4>(
+ _mm256_set1_epi32(256),
+ arr.as_ptr(),
+ _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
+ _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i32gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_mask_i32gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r =
+ _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_mask_i32gather_ps::<4>(
+ _mm256_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
+ _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
+ );
+ assert_eq_m256(
+ r,
+ _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
+ assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_mask_i32gather_epi64::<8>(
+ _mm_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(16, 16, 16, 16),
+ _mm_setr_epi64x(-1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_mask_i32gather_epi64::<8>(
+ _mm256_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm256_setr_epi64x(-1, -1, -1, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i32gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
+ assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_mask_i32gather_pd::<8>(
+ _mm_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(16, 16, 16, 16),
+ _mm_setr_pd(-1.0, 0.0),
+ );
+ assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_mask_i32gather_pd::<8>(
+ _mm256_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i64gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm_mask_i64gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm_setr_epi64x(0, 16),
+ _mm_setr_epi32(-1, 0, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_mask_i64gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm_setr_epi32(-1, -1, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i64gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_mask_i64gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi64x(0, 16),
+ _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_mask_i64gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_mask_i64gather_epi64::<8>(
+ _mm_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi64x(16, 16),
+ _mm_setr_epi64x(-1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_mask_i64gather_epi64::<8>(
+ _mm256_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm256_setr_epi64x(-1, -1, -1, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i64gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_mask_i64gather_pd::<8>(
+ _mm_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi64x(16, 16),
+ _mm_setr_pd(-1.0, 0.0),
+ );
+ assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_mask_i64gather_pd::<8>(
+ _mm256_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
+ }
+
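+    // _mm256_extract_epi8/epi16 zero-extend the selected element into an
+    // i32, so the -1 lanes below read back as 0xFF and 0xFFFF, while
+    // extract_epi32 returns the i32 element unchanged (hence -1).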
+    #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_extract_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ -1, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ );
+ let r1 = _mm256_extract_epi8::<0>(a);
+ let r2 = _mm256_extract_epi8::<3>(a);
+ assert_eq!(r1, 0xFF);
+ assert_eq!(r2, 3);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_extract_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ -1, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r1 = _mm256_extract_epi16::<0>(a);
+ let r2 = _mm256_extract_epi16::<3>(a);
+ assert_eq!(r1, 0xFFFF);
+ assert_eq!(r2, 3);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_extract_epi32() {
+ let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
+ let r1 = _mm256_extract_epi32::<0>(a);
+ let r2 = _mm256_extract_epi32::<3>(a);
+ assert_eq!(r1, -1);
+ assert_eq!(r2, 3);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtsd_f64() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let r = _mm256_cvtsd_f64(a);
+ assert_eq!(r, 1.);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtsi256_si32() {
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_cvtsi256_si32(a);
+ assert_eq!(r, 1);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
new file mode 100644
index 000000000..e9977e018
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
@@ -0,0 +1,1573 @@
+//! [AVX512BF16 intrinsics].
+//!
+//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"]
+ fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8;
+ #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"]
+ fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16;
+ #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"]
+ fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32;
+ #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"]
+ fn cvtneps2bf16_256(a: f32x8) -> i16x8;
+ #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"]
+ fn cvtneps2bf16_512(a: f32x16) -> i16x16;
+ #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"]
+ fn dpbf16ps(a: f32x4, b: i32x4, c: i32x4) -> f32x4;
+ #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"]
+ fn dpbf16ps_256(a: f32x8, b: i32x8, c: i32x8) -> f32x8;
+ #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"]
+ fn dpbf16ps_512(a: f32x16, b: i32x16, c: i32x16) -> f32x16;
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
+/// 128-bit wide vector.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
+ transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4()))
+}
+
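+// A scalar sketch of the conversion these intrinsics perform, assuming
+// round-to-nearest-even and ignoring NaN special cases (the hypothetical
+// helper name is ours, not part of the API): add a bias derived from bit 16
+// of the f32 representation, then keep the upper 16 bits. Note that the
+// cvtne2 variants place the converted elements of `b` in the lower half of
+// the destination and those of `a` in the upper half, as the tests below
+// check.
+#[allow(dead_code)]
+fn f32_to_bf16_model(x: f32) -> u16 {
+    let bits = x.to_bits();
+    // Round to nearest even: bias is 0x7FFF, plus 1 if the bit that will
+    // become the LSB of the BF16 result is already set.
+    let bias = 0x7FFF + ((bits >> 16) & 1);
+    (bits.wrapping_add(bias) >> 16) as u16
+}
+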
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
+/// in a single vector dst using writemask k (elements are copied from src when the
+/// corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
+ let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
+ transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
+/// in a single vector dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
+ let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
+ let zero = _mm_setzero_si128().as_u16x8();
+ transmute(simd_select_bitmask(k, cvt, zero))
+}
+
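+// The write/zero-mask pattern used throughout this module, as a hedged scalar
+// sketch (the helper name is illustrative only): bit i of the mask selects
+// the freshly computed element, while a clear bit falls back to the matching
+// element of src (writemask) or to zero (zeromask).
+#[allow(dead_code)]
+fn mask_select_model(k: u8, computed: [u16; 8], fallback: [u16; 8]) -> [u16; 8] {
+    let mut out = [0u16; 8];
+    for i in 0..8 {
+        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { fallback[i] };
+    }
+    out
+}
+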
+/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
+/// 256-bit wide vector.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
+ transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
+/// to packed BF16 (16-bit) floating-point elements, and store the results in a single vector
+/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm256_mask_cvtne2ps_pbh(
+ src: __m256bh,
+ k: __mmask16,
+ a: __m256,
+ b: __m256,
+) -> __m256bh {
+ let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
+ transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
+/// to packed BF16 (16-bit) floating-point elements, and store the results in a single vector
+/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
+ let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
+ let zero = _mm256_setzero_si256().as_u16x16();
+ transmute(simd_select_bitmask(k, cvt, zero))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
+/// 512-bit wide vector.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
+ transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
+/// in a single vector dst using writemask k (elements are copied from src when the
+/// corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm512_mask_cvtne2ps_pbh(
+ src: __m512bh,
+ k: __mmask32,
+ a: __m512,
+ b: __m512,
+) -> __m512bh {
+ let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
+ transmute(simd_select_bitmask(k, cvt, src.as_u16x32()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
+/// in a single vector dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
+pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
+ let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
+ let zero = _mm512_setzero_si512().as_u16x32();
+ transmute(simd_select_bitmask(k, cvt, zero))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
+ transmute(cvtneps2bf16_256(a.as_f32x8()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
+ let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
+ transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
+ let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
+ let zero = _mm_setzero_si128().as_u16x8();
+ transmute(simd_select_bitmask(k, cvt, zero))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
+ transmute(cvtneps2bf16_512(a.as_f32x16()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
+ let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
+ transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
+ let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
+ let zero = _mm256_setzero_si256().as_u16x16();
+ transmute(simd_select_bitmask(k, cvt, zero))
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
+ transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
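+// Per 32-bit lane, a hedged scalar model of the dpbf16 intrinsics (hardware
+// may differ on denormal inputs and intermediate rounding; the helper name is
+// ours): each lane of a and b packs two BF16 values, and both products are
+// accumulated into the matching f32 lane of src.
+#[allow(dead_code)]
+fn dpbf16_lane_model(src: f32, a: u32, b: u32) -> f32 {
+    // Widening BF16 to f32 is a 16-bit left shift of the bit pattern.
+    let bf = |v: u32| f32::from_bits(v << 16);
+    src + bf(a & 0xFFFF) * bf(b & 0xFFFF) + bf(a >> 16) * bf(b >> 16)
+}
+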
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
+ let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, rst, src.as_f32x4()))
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
+ let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
+ let zero = _mm_set1_ps(0.0_f32).as_f32x4();
+ transmute(simd_select_bitmask(k, rst, zero))
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
+ transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
+ let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, rst, src.as_f32x8()))
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
+ let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, rst, zero))
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
+ transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
+ let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, rst, src.as_f32x16()))
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[cfg_attr(test, assert_instr("vdpbf16ps"))]
+pub unsafe fn _mm512_maskz_dpbf16_ps(
+ k: __mmask16,
+ src: __m512,
+ a: __m512bh,
+ b: __m512bh,
+) -> __m512 {
+ let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, rst, zero))
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::{core_arch::x86::*, mem::transmute};
+ use stdarch_test::simd_test;
+
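+    // Expected values in these tests are BF16 bit patterns written as
+    // 0b sign _ exponent(8) _ mantissa(7); e.g. 178.125 (f32 0x4332_2000)
+    // rounds to 0x4332 = 0b0_10000110_0110010.
+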
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_cvtne2ps_pbh() {
+ let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+ let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
+ let a: __m128 = transmute(a_array);
+ let b: __m128 = transmute(b_array);
+ let c: __m128bh = _mm_cvtne2ps_pbh(a, b);
+ let result: [u16; 8] = transmute(c.as_u16x8());
+ #[rustfmt::skip]
+ let expected_result: [u16; 8] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_mask_cvtne2ps_pbh() {
+ let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+ let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
+ #[rustfmt::skip]
+ let src_array: [u16; 8] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ ];
+ let src: __m128bh = transmute(src_array);
+ let a: __m128 = transmute(a_array);
+ let b: __m128 = transmute(b_array);
+ let k: __mmask8 = 0b1111_1111;
+ let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b);
+ let result: [u16; 8] = transmute(c.as_u16x8());
+ #[rustfmt::skip]
+ let expected_result: [u16; 8] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ ];
+ assert_eq!(result, expected_result);
+ let k = 0b0000_0000;
+ let c = _mm_mask_cvtne2ps_pbh(src, k, a, b);
+ let result: [u16; 8] = transmute(c.as_u16x8());
+ let expected_result = src_array;
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_maskz_cvtne2ps_pbh() {
+ let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+ let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
+ let a: __m128 = transmute(a_array);
+ let b: __m128 = transmute(b_array);
+ let k: __mmask8 = 0b1111_1111;
+ let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b);
+ let result: [u16; 8] = transmute(c.as_u16x8());
+ #[rustfmt::skip]
+ let expected_result: [u16; 8] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ ];
+ assert_eq!(result, expected_result);
+ let k = 0b0011_1100;
+ let c = _mm_maskz_cvtne2ps_pbh(k, a, b);
+ let result: [u16; 8] = transmute(c.as_u16x8());
+ #[rustfmt::skip]
+ let expected_result: [u16; 8] = [
+ 0,
+ 0,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0,
+ 0,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_cvtne2ps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let b_array = [
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ ];
+ let a: __m256 = transmute(a_array);
+ let b: __m256 = transmute(b_array);
+ let c: __m256bh = _mm256_cvtne2ps_pbh(a, b);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ #[rustfmt::skip]
+ let expected_result: [u16; 16] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_mask_cvtne2ps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let b_array = [
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ ];
+ let src_array: [u16; 16] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ ];
+ let src: __m256bh = transmute(src_array);
+ let a: __m256 = transmute(a_array);
+ let b: __m256 = transmute(b_array);
+ let k: __mmask16 = 0xffff;
+ let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ #[rustfmt::skip]
+ let expected_result: [u16; 16] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask16 = 0;
+ let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ let expected_result = src_array;
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtne2ps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let b_array = [
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ ];
+ let a: __m256 = transmute(a_array);
+ let b: __m256 = transmute(b_array);
+ let k: __mmask16 = 0xffff;
+ let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ #[rustfmt::skip]
+ let expected_result: [u16; 16] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask16 = 0b0110_1100_0011_0110;
+ let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ #[rustfmt::skip]
+ let expected_result: [u16; 16] = [
+ 0,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_cvtne2ps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let b_array = [
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ ];
+ let a: __m512 = transmute(a_array);
+ let b: __m512 = transmute(b_array);
+ let c: __m512bh = _mm512_cvtne2ps_pbh(a, b);
+ let result: [u16; 32] = transmute(c.as_u16x32());
+ #[rustfmt::skip]
+ let expected_result: [u16; 32] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_mask_cvtne2ps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let b_array = [
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ ];
+ let src_array: [u16; 32] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ ];
+ let src: __m512bh = transmute(src_array);
+ let a: __m512 = transmute(a_array);
+ let b: __m512 = transmute(b_array);
+ let k: __mmask32 = 0xffffffff;
+ let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
+ let result: [u16; 32] = transmute(c.as_u16x32());
+ #[rustfmt::skip]
+ let expected_result: [u16; 32] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask32 = 0;
+ let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
+ let result: [u16; 32] = transmute(c.as_u16x32());
+ let expected_result = src_array;
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_maskz_cvtne2ps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let b_array = [
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ -178.125_f32,
+ -10.5_f32,
+ -3.75_f32,
+ -50.25_f32,
+ -16.5_f32,
+ -255.11_f32,
+ -1000.158_f32,
+ -575.575_f32,
+ ];
+ let a: __m512 = transmute(a_array);
+ let b: __m512 = transmute(b_array);
+ let k: __mmask32 = 0xffffffff;
+ let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
+ let result: [u16; 32] = transmute(c.as_u16x32());
+ #[rustfmt::skip]
+ let expected_result: [u16; 32] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110;
+ let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
+ let result: [u16; 32] = transmute(c.as_u16x32());
+ #[rustfmt::skip]
+ let expected_result: [u16; 32] = [
+ 0,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0,
+ 0b1_10000011_0000100,
+ 0,
+ 0b1_10001000_1111010,
+ 0,
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0,
+ 0,
+ 0,
+ 0b1_10000110_1111111,
+ 0,
+ 0b1_10001000_0010000,
+ 0,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0,
+ 0b0_10000011_0000100,
+ 0,
+ 0,
+ 0b0_10001000_0010000,
+ 0,
+ 0b0_10000010_0101000,
+ 0,
+ 0b0_10000100_1001001,
+ 0,
+ 0,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_cvtneps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let a: __m256 = transmute(a_array);
+ let c: __m128bh = _mm256_cvtneps_pbh(a);
+ let result: [u16; 8] = transmute(c.as_u16x8());
+ #[rustfmt::skip]
+ let expected_result: [u16; 8] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_mask_cvtneps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let src_array: [u16; 8] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ ];
+ let src: __m128bh = transmute(src_array);
+ let a: __m256 = transmute(a_array);
+ let k: __mmask8 = 0xff;
+ let b = _mm256_mask_cvtneps_pbh(src, k, a);
+ let result: [u16; 8] = transmute(b.as_u16x8());
+ #[rustfmt::skip]
+ let expected_result: [u16; 8] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0x0;
+ let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a);
+ let result: [u16; 8] = transmute(b.as_u16x8());
+ let expected_result: [u16; 8] = src_array;
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtneps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let a: __m256 = transmute(a_array);
+ let k: __mmask8 = 0xff;
+ let b = _mm256_maskz_cvtneps_pbh(k, a);
+ let result: [u16; 8] = transmute(b.as_u16x8());
+ #[rustfmt::skip]
+ let expected_result: [u16; 8] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0x6;
+ let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a);
+ let result: [u16; 8] = transmute(b.as_u16x8());
+ let expected_result: [u16; 8] =
+ [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_cvtneps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let a: __m512 = transmute(a_array);
+ let c: __m256bh = _mm512_cvtneps_pbh(a);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ #[rustfmt::skip]
+ let expected_result: [u16; 16] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_mask_cvtneps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let src_array: [u16; 16] = [
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ 0b1_10000110_0110010,
+ 0b1_10000010_0101000,
+ 0b1_10000000_1110000,
+ 0b1_10000100_1001001,
+ 0b1_10000011_0000100,
+ 0b1_10000110_1111111,
+ 0b1_10001000_1111010,
+ 0b1_10001000_0010000,
+ ];
+ let src: __m256bh = transmute(src_array);
+ let a: __m512 = transmute(a_array);
+ let k: __mmask16 = 0xffff;
+ let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ #[rustfmt::skip]
+ let expected_result: [u16; 16] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask16 = 0;
+ let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ let expected_result = src_array;
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_maskz_cvtneps_pbh() {
+ #[rustfmt::skip]
+ let a_array = [
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ 178.125_f32,
+ 10.5_f32,
+ 3.75_f32,
+ 50.25_f32,
+ 16.5_f32,
+ 255.11_f32,
+ 1000.158_f32,
+ 575.575_f32,
+ ];
+ let a: __m512 = transmute(a_array);
+ let k: __mmask16 = 0xffff;
+ let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ #[rustfmt::skip]
+ let expected_result: [u16; 16] = [
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ 0b0_10000110_0110010,
+ 0b0_10000010_0101000,
+ 0b0_10000000_1110000,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0b0_10001000_0010000,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask16 = 0x653a;
+ let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
+ let result: [u16; 16] = transmute(c.as_u16x16());
+ #[rustfmt::skip]
+ let expected_result: [u16; 16] = [
+ 0,
+ 0b0_10000010_0101000,
+ 0,
+ 0b0_10000100_1001001,
+ 0b0_10000011_0000100,
+ 0b0_10000110_1111111,
+ 0,
+ 0,
+ 0b0_10000110_0110010,
+ 0,
+ 0b0_10000000_1110000,
+ 0,
+ 0,
+ 0b0_10000110_1111111,
+ 0b0_10001000_1111010,
+ 0,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_dpbf16_ps() {
+ let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+ let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+ let a1: __m128 = transmute(a_array);
+ let b1: __m128 = transmute(b_array);
+ let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+ let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
+ let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
+ let c: __m128 = _mm_dpbf16_ps(src, a, b);
+ let result: [f32; 4] = transmute(c.as_f32x4());
+ let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_mask_dpbf16_ps() {
+ let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+ let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+ let a1: __m128 = transmute(a_array);
+ let b1: __m128 = transmute(b_array);
+ let k: __mmask8 = 0xf3;
+ let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+ let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
+ let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
+ let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 4] = transmute(c.as_f32x4());
+ let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0xff;
+ let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 4] = transmute(c.as_f32x4());
+ let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0;
+ let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 4] = transmute(c.as_f32x4());
+ let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_maskz_dpbf16_ps() {
+ let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+ let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+ let a1: __m128 = transmute(a_array);
+ let b1: __m128 = transmute(b_array);
+ let k: __mmask8 = 0xf3;
+ let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+ let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
+ let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
+ let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 4] = transmute(c.as_f32x4());
+ let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0xff;
+ let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 4] = transmute(c.as_f32x4());
+ let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0;
+ let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 4] = transmute(c.as_f32x4());
+ let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_dpbf16_ps() {
+ #[rustfmt::skip]
+ let a_array = [
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ ];
+ let b_array = [
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ ];
+ let a1: __m256 = transmute(a_array);
+ let b1: __m256 = transmute(b_array);
+ #[rustfmt::skip]
+ let src: __m256 = transmute([
+ 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+ ]);
+ let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
+ let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
+ let c: __m256 = _mm256_dpbf16_ps(src, a, b);
+ let result: [f32; 8] = transmute(c.as_f32x8());
+ #[rustfmt::skip]
+ let expected_result: [f32; 8] = [
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_mask_dpbf16_ps() {
+ #[rustfmt::skip]
+ let a_array = [
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ ];
+ let b_array = [
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ ];
+ let a1: __m256 = transmute(a_array);
+ let b1: __m256 = transmute(b_array);
+ let k: __mmask8 = 0x33;
+ #[rustfmt::skip]
+ let src: __m256 = transmute([
+ 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+ ]);
+ let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
+ let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
+ let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 8] = transmute(c.as_f32x8());
+ #[rustfmt::skip]
+ let expected_result: [f32; 8] = [
+ -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0xff;
+ let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 8] = transmute(c.as_f32x8());
+ #[rustfmt::skip]
+ let expected_result: [f32; 8] = [
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0;
+ let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 8] = transmute(c.as_f32x8());
+ #[rustfmt::skip]
+ let expected_result: [f32; 8] = [
+ 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_maskz_dpbf16_ps() {
+ #[rustfmt::skip]
+ let a_array = [
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ ];
+ let b_array = [
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ ];
+ let a1: __m256 = transmute(a_array);
+ let b1: __m256 = transmute(b_array);
+ let k: __mmask8 = 0x33;
+ #[rustfmt::skip]
+ let src: __m256 = transmute([
+ 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+ ]);
+ let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
+ let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
+ let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 8] = transmute(c.as_f32x8());
+ #[rustfmt::skip]
+ let expected_result: [f32; 8] = [
+ -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0xff;
+ let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 8] = transmute(c.as_f32x8());
+ #[rustfmt::skip]
+ let expected_result: [f32; 8] = [
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask8 = 0;
+ let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 8] = transmute(c.as_f32x8());
+ let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_dpbf16_ps() {
+ #[rustfmt::skip]
+ let a_array = [
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ ];
+ let b_array = [
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ ];
+ let a1: __m512 = transmute(a_array);
+ let b1: __m512 = transmute(b_array);
+ let src: __m512 = transmute([
+ 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
+ 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+ ]);
+ let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
+ let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
+ let c: __m512 = _mm512_dpbf16_ps(src, a, b);
+ let result: [f32; 16] = transmute(c.as_f32x16());
+ #[rustfmt::skip]
+ let expected_result: [f32; 16] = [
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_mask_dpbf16_ps() {
+ #[rustfmt::skip]
+ let a_array = [
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ ];
+ let b_array = [
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ ];
+ let a1: __m512 = transmute(a_array);
+ let b1: __m512 = transmute(b_array);
+ let k: __mmask16 = 0x3333;
+ #[rustfmt::skip]
+ let src: __m512 = transmute([
+ 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
+ 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+ ]);
+ let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
+ let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
+ let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 16] = transmute(c.as_f32x16());
+ #[rustfmt::skip]
+ let expected_result: [f32; 16] = [
+ -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
+ -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask16 = 0xffff;
+ let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 16] = transmute(c.as_f32x16());
+ #[rustfmt::skip]
+ let expected_result: [f32; 16] = [
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask16 = 0;
+ let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
+ let result: [f32; 16] = transmute(c.as_f32x16());
+ #[rustfmt::skip]
+ let expected_result: [f32; 16] = [
+ 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
+ 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512f")]
+ unsafe fn test_mm512_maskz_dpbf16_ps() {
+ #[rustfmt::skip]
+ let a_array = [
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+ ];
+ let b_array = [
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+ ];
+ let a1: __m512 = transmute(a_array);
+ let b1: __m512 = transmute(b_array);
+ let k: __mmask16 = 0x3333;
+ #[rustfmt::skip]
+ let src: __m512 = transmute([
+ 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
+ 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+ ]);
+ let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
+ let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
+ let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 16] = transmute(c.as_f32x16());
+ #[rustfmt::skip]
+ let expected_result: [f32; 16] = [
+ -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32,
+ 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask16 = 0xffff;
+ let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 16] = transmute(c.as_f32x16());
+ #[rustfmt::skip]
+ let expected_result: [f32; 16] = [
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+ ];
+ assert_eq!(result, expected_result);
+ let k: __mmask16 = 0;
+ let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
+ let result: [f32; 16] = transmute(c.as_f32x16());
+ #[rustfmt::skip]
+ let expected_result: [f32; 16] = [
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ ];
+ assert_eq!(result, expected_result);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
new file mode 100644
index 000000000..3c9df3912
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
@@ -0,0 +1,760 @@
+//! Bit-oriented Algorithms (BITALG)
+//!
+//! The intrinsics here correspond to those in the `immintrin.h` C header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use crate::core_arch::simd::i16x16;
+use crate::core_arch::simd::i16x32;
+use crate::core_arch::simd::i16x8;
+use crate::core_arch::simd::i8x16;
+use crate::core_arch::simd::i8x32;
+use crate::core_arch::simd::i8x64;
+use crate::core_arch::simd_llvm::simd_select_bitmask;
+use crate::core_arch::x86::__m128i;
+use crate::core_arch::x86::__m256i;
+use crate::core_arch::x86::__m512i;
+use crate::core_arch::x86::__mmask16;
+use crate::core_arch::x86::__mmask32;
+use crate::core_arch::x86::__mmask64;
+use crate::core_arch::x86::__mmask8;
+use crate::core_arch::x86::_mm256_setzero_si256;
+use crate::core_arch::x86::_mm512_setzero_si512;
+use crate::core_arch::x86::_mm_setzero_si128;
+use crate::core_arch::x86::m128iExt;
+use crate::core_arch::x86::m256iExt;
+use crate::core_arch::x86::m512iExt;
+use crate::mem::transmute;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
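+// These declarations bind directly to LLVM: the generic `llvm.ctpop.*`
+// intrinsics back the popcount family below, and the x86-specific
+// `vpshufbitqmb` intrinsics back the bit-shuffle family.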
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.ctpop.v32i16"]
+ fn popcnt_v32i16(x: i16x32) -> i16x32;
+ #[link_name = "llvm.ctpop.v16i16"]
+ fn popcnt_v16i16(x: i16x16) -> i16x16;
+ #[link_name = "llvm.ctpop.v8i16"]
+ fn popcnt_v8i16(x: i16x8) -> i16x8;
+
+ #[link_name = "llvm.ctpop.v64i8"]
+ fn popcnt_v64i8(x: i8x64) -> i8x64;
+ #[link_name = "llvm.ctpop.v32i8"]
+ fn popcnt_v32i8(x: i8x32) -> i8x32;
+ #[link_name = "llvm.ctpop.v16i8"]
+ fn popcnt_v16i8(x: i8x16) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.512"]
+ fn bitshuffle_512(data: i8x64, indices: i8x64, mask: __mmask64) -> __mmask64;
+ #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.256"]
+ fn bitshuffle_256(data: i8x32, indices: i8x32, mask: __mmask32) -> __mmask32;
+ #[link_name = "llvm.x86.avx512.mask.vpshufbitqmb.128"]
+ fn bitshuffle_128(data: i8x16, indices: i8x16, mask: __mmask16) -> __mmask16;
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_popcnt_epi16)
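+///
+/// A minimal usage sketch (illustrative values; requires a CPU with
+/// `avx512bitalg` support):
+///
+/// ```ignore
+/// let v = _mm512_set1_epi16(0b0111);
+/// // Every 16-bit lane of `counts` holds 3, the number of set bits in 0b0111.
+/// let counts = _mm512_popcnt_epi16(v);
+/// ```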
+#[inline]
+#[target_feature(enable = "avx512bitalg")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm512_popcnt_epi16(a: __m512i) -> __m512i {
+ transmute(popcnt_v32i16(a.as_i16x32()))
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_popcnt_epi16)
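+///
+/// A zero-masking sketch (illustrative values): only lanes whose mask bit is
+/// set keep their popcount; the rest become zero.
+///
+/// ```ignore
+/// let v = _mm512_set1_epi16(0xFF);
+/// // Lane 0 holds 8; lanes 1..=31 are zeroed because their mask bits are clear.
+/// let r = _mm512_maskz_popcnt_epi16(0b1, v);
+/// ```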
+#[inline]
+#[target_feature(enable = "avx512bitalg")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i {
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, popcnt_v32i16(a.as_i16x32()), zero))
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_popcnt_epi16)
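+///
+/// A merge-masking sketch (illustrative values): lanes whose mask bit is clear
+/// pass the corresponding src lane through unchanged.
+///
+/// ```ignore
+/// let src = _mm512_set1_epi16(-1);
+/// let v = _mm512_set1_epi16(0b11);
+/// // Lane 0 holds 2; lanes 1..=31 keep the -1 from `src`.
+/// let r = _mm512_mask_popcnt_epi16(src, 0b1, v);
+/// ```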
+#[inline]
+#[target_feature(enable = "avx512bitalg")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v32i16(a.as_i16x32()),
+ src.as_i16x32(),
+ ))
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_popcnt_epi16)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm256_popcnt_epi16(a: __m256i) -> __m256i {
+ transmute(popcnt_v16i16(a.as_i16x16()))
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_popcnt_epi16)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i {
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, popcnt_v16i16(a.as_i16x16()), zero))
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_popcnt_epi16)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v16i16(a.as_i16x16()),
+ src.as_i16x16(),
+ ))
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_epi16)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm_popcnt_epi16(a: __m128i) -> __m128i {
+ transmute(popcnt_v8i16(a.as_i16x8()))
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_popcnt_epi16)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, popcnt_v8i16(a.as_i16x8()), zero))
+}
+
+/// For each packed 16-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_popcnt_epi16)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntw))]
+pub unsafe fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v8i16(a.as_i16x8()),
+ src.as_i16x8(),
+ ))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm512_popcnt_epi8(a: __m512i) -> __m512i {
+ transmute(popcnt_v64i8(a.as_i8x64()))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i {
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, popcnt_v64i8(a.as_i8x64()), zero))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v64i8(a.as_i8x64()),
+ src.as_i8x64(),
+ ))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm256_popcnt_epi8(a: __m256i) -> __m256i {
+ transmute(popcnt_v32i8(a.as_i8x32()))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i {
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, popcnt_v32i8(a.as_i8x32()), zero))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v32i8(a.as_i8x32()),
+ src.as_i8x32(),
+ ))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm_popcnt_epi8(a: __m128i) -> __m128i {
+ transmute(popcnt_v16i8(a.as_i8x16()))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, popcnt_v16i8(a.as_i8x16()), zero))
+}
+
+/// For each packed 8-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_popcnt_epi8)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntb))]
+pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v16i8(a.as_i8x16()),
+ src.as_i8x16(),
+ ))
+}
+
+/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
+/// It then selects these bits and packs them into the output.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_bitshuffle_epi64_mask)
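+///
+/// A worked sketch (illustrative values): within each 64-bit lane, the eight
+/// index bytes of `c` each pick one bit of `b` (only the low 6 bits of the
+/// index are used, i.e. the index is taken modulo 64), and the picked bits
+/// form one byte of the output mask.
+///
+/// ```ignore
+/// let data = _mm512_set1_epi64(0b1000_0001);       // bits 0 and 7 set
+/// let idx = _mm512_set1_epi64(0x0706050403020100); // select bits 0..=7
+/// // Every byte of `m` is 0b1000_0001.
+/// let m = _mm512_bitshuffle_epi64_mask(data, idx);
+/// ```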
+#[inline]
+#[target_feature(enable = "avx512bitalg")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 {
+ transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0))
+}
+
+/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
+/// It then selects these bits and packs them into the output.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_bitshuffle_epi64_mask)
+#[inline]
+#[target_feature(enable = "avx512bitalg")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 {
+ transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k))
+}
+
+/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
+/// It then selects these bits and packs them into the output.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bitshuffle_epi64_mask)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 {
+ transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0))
+}
+
+/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
+/// It then selects these bits and packs them into the output.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_bitshuffle_epi64_mask)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 {
+ transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k))
+}
+
+/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
+/// It then selects these bits and packs them into the output.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bitshuffle_epi64_mask)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
+ transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0))
+}
+
+/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
+/// Then groups 8 8-bit values from `c` as indices into the bits of the corresponding 64-bit integer.
+/// It then selects these bits and packs them into the output.
+///
+/// Uses the writemask in k: elements are zeroed in the result if the corresponding mask bit is not set;
+/// otherwise the computation result is written to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_bitshuffle_epi64_mask)
+#[inline]
+#[target_feature(enable = "avx512bitalg,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufbitqmb))]
+pub unsafe fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 {
+ transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k))
+}
+
+#[cfg(test)]
+mod tests {
+ // Some of the constants in the tests below are just bit patterns. They should not
+ // be interpreted as integers; signedness does not make sense for them, but
+ // __mXXXi happens to be defined in terms of signed integers.
+ #![allow(overflowing_literals)]
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "avx512bitalg,avx512f")]
+ unsafe fn test_mm512_popcnt_epi16() {
+ let test_data = _mm512_set_epi16(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF,
+ 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512,
+ 1024, 2048,
+ );
+ let actual_result = _mm512_popcnt_epi16(test_data);
+ let reference_result = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 12, 8, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1,
+ );
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f")]
+ unsafe fn test_mm512_maskz_popcnt_epi16() {
+ let test_data = _mm512_set_epi16(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF,
+ 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512,
+ 1024, 2048,
+ );
+ let mask = 0xFF_FF_00_00;
+ let actual_result = _mm512_maskz_popcnt_epi16(mask, test_data);
+ let reference_result = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0,
+ );
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f")]
+ unsafe fn test_mm512_mask_popcnt_epi16() {
+ let test_data = _mm512_set_epi16(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF,
+ 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512,
+ 1024, 2048,
+ );
+ let mask = 0xFF_FF_00_00;
+ let actual_result = _mm512_mask_popcnt_epi16(test_data, mask, test_data);
+ let reference_result = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xFF_FF, -1, -100, 255, 256, 2,
+ 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048,
+ );
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm256_popcnt_epi16() {
+ let test_data = _mm256_set_epi16(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF,
+ 0x3F_FF, 0x7F_FF,
+ );
+ let actual_result = _mm256_popcnt_epi16(test_data);
+ let reference_result =
+ _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_popcnt_epi16() {
+ let test_data = _mm256_set_epi16(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF,
+ 0x3F_FF, 0x7F_FF,
+ );
+ let mask = 0xFF_00;
+ let actual_result = _mm256_maskz_popcnt_epi16(mask, test_data);
+ let reference_result = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_popcnt_epi16() {
+ let test_data = _mm256_set_epi16(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF,
+ 0x3F_FF, 0x7F_FF,
+ );
+ let mask = 0xFF_00;
+ let actual_result = _mm256_mask_popcnt_epi16(test_data, mask, test_data);
+ let reference_result = _mm256_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, 0x3F_FF, 0x7F_FF,
+ );
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm_popcnt_epi16() {
+ let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F);
+ let actual_result = _mm_popcnt_epi16(test_data);
+ let reference_result = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_popcnt_epi16() {
+ let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F);
+ let mask = 0xF0;
+ let actual_result = _mm_maskz_popcnt_epi16(mask, test_data);
+ let reference_result = _mm_set_epi16(0, 1, 2, 3, 0, 0, 0, 0);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm_mask_popcnt_epi16() {
+ let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F);
+ let mask = 0xF0;
+ let actual_result = _mm_mask_popcnt_epi16(test_data, mask, test_data);
+ let reference_result = _mm_set_epi16(0, 1, 2, 3, 0xF, 0x1F, 0x3F, 0x7F);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f")]
+ unsafe fn test_mm512_popcnt_epi8() {
+ let test_data = _mm512_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100,
+ 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189,
+ 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90,
+ 225, 21, 249, 211, 155, 228, 70,
+ );
+ let actual_result = _mm512_popcnt_epi8(test_data);
+ let reference_result = _mm512_set_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5,
+ 2, 4, 4, 6, 4, 3, 3, 5, 6, 3, 3, 5, 6, 4, 4, 4, 3, 3, 6, 7, 3, 5, 5, 3, 4, 5, 3, 4, 4,
+ 3, 6, 5, 5, 4, 3,
+ );
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f")]
+ unsafe fn test_mm512_maskz_popcnt_epi8() {
+ let test_data = _mm512_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100,
+ 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189,
+ 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90,
+ 225, 21, 249, 211, 155, 228, 70,
+ );
+ let mask = 0xFF_FF_FF_FF_00_00_00_00;
+ let actual_result = _mm512_maskz_popcnt_epi8(mask, test_data);
+ let reference_result = _mm512_set_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5,
+ 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0,
+ );
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f")]
+ unsafe fn test_mm512_mask_popcnt_epi8() {
+ let test_data = _mm512_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100,
+ 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189,
+ 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90,
+ 225, 21, 249, 211, 155, 228, 70,
+ );
+ let mask = 0xFF_FF_FF_FF_00_00_00_00;
+ let actual_result = _mm512_mask_popcnt_epi8(test_data, mask, test_data);
+ let reference_result = _mm512_set_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5,
+ 2, 4, 4, 183, 154, 84, 56, 227, 189, 140, 35, 117, 219, 169, 226, 170, 13, 22, 159,
+ 251, 73, 121, 143, 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70,
+ );
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm256_popcnt_epi8() {
+ let test_data = _mm256_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100,
+ 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172,
+ );
+ let actual_result = _mm256_popcnt_epi8(test_data);
+ let reference_result = _mm256_set_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5,
+ 2, 4, 4,
+ );
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_popcnt_epi8() {
+ let test_data = _mm256_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 251, 73, 121, 143,
+ 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70,
+ );
+ let mask = 0xFF_FF_00_00;
+ let actual_result = _mm256_maskz_popcnt_epi8(mask, test_data);
+ let reference_result = _mm256_set_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0,
+ );
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_popcnt_epi8() {
+ let test_data = _mm256_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 251, 73, 121, 143,
+ 145, 85, 91, 137, 90, 225, 21, 249, 211, 155, 228, 70,
+ );
+ let mask = 0xFF_FF_00_00;
+ let actual_result = _mm256_mask_popcnt_epi8(test_data, mask, test_data);
+ let reference_result = _mm256_set_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 251, 73, 121, 143, 145, 85, 91, 137,
+ 90, 225, 21, 249, 211, 155, 228, 70,
+ );
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm_popcnt_epi8() {
+ let test_data = _mm_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64,
+ );
+ let actual_result = _mm_popcnt_epi8(test_data);
+ let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_popcnt_epi8() {
+ let test_data = _mm_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 90, 225, 21, 249, 211, 155, 228, 70,
+ );
+ let mask = 0xFF_00;
+ let actual_result = _mm_maskz_popcnt_epi8(mask, test_data);
+ let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm_mask_popcnt_epi8() {
+ let test_data = _mm_set_epi8(
+ 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 90, 225, 21, 249, 211, 155, 228, 70,
+ );
+ let mask = 0xFF_00;
+ let actual_result = _mm_mask_popcnt_epi8(test_data, mask, test_data);
+ let reference_result =
+ _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 90, 225, 21, 249, 211, 155, 228, 70);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f")]
+ unsafe fn test_mm512_bitshuffle_epi64_mask() {
+ let test_indices = _mm512_set_epi8(
+ 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0,
+ 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59,
+ 58, 57, 56, 32, 32, 16, 16, 0, 0, 8, 8, 56, 48, 40, 32, 24, 16, 8, 0,
+ );
+ let test_data = _mm512_setr_epi64(
+ 0xFF_FF_FF_FF_00_00_00_00,
+ 0xFF_00_FF_00_FF_00_FF_00,
+ 0xFF_00_00_00_00_00_00_00,
+ 0xAC_00_00_00_00_00_00_00,
+ 0xFF_FF_FF_FF_00_00_00_00,
+ 0xFF_00_FF_00_FF_00_FF_00,
+ 0xFF_00_00_00_00_00_00_00,
+ 0xAC_00_00_00_00_00_00_00,
+ );
+ let actual_result = _mm512_bitshuffle_epi64_mask(test_data, test_indices);
+ let reference_result = 0xF0 << 0
+ | 0x03 << 8
+ | 0xFF << 16
+ | 0xAC << 24
+ | 0xF0 << 32
+ | 0x03 << 40
+ | 0xFF << 48
+ | 0xAC << 56;
+
+ assert_eq!(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f")]
+ unsafe fn test_mm512_mask_bitshuffle_epi64_mask() {
+ let test_indices = _mm512_set_epi8(
+ 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0,
+ 8, 8, 56, 48, 40, 32, 24, 16, 8, 0, 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59,
+ 58, 57, 56, 32, 32, 16, 16, 0, 0, 8, 8, 56, 48, 40, 32, 24, 16, 8, 0,
+ );
+ let test_data = _mm512_setr_epi64(
+ 0xFF_FF_FF_FF_00_00_00_00,
+ 0xFF_00_FF_00_FF_00_FF_00,
+ 0xFF_00_00_00_00_00_00_00,
+ 0xAC_00_00_00_00_00_00_00,
+ 0xFF_FF_FF_FF_00_00_00_00,
+ 0xFF_00_FF_00_FF_00_FF_00,
+ 0xFF_00_00_00_00_00_00_00,
+ 0xAC_00_00_00_00_00_00_00,
+ );
+ let mask = 0xFF_FF_FF_FF_00_00_00_00;
+ let actual_result = _mm512_mask_bitshuffle_epi64_mask(mask, test_data, test_indices);
+ let reference_result = 0x00 << 0
+ | 0x00 << 8
+ | 0x00 << 16
+ | 0x00 << 24
+ | 0xF0 << 32
+ | 0x03 << 40
+ | 0xFF << 48
+ | 0xAC << 56;
+
+ assert_eq!(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm256_bitshuffle_epi64_mask() {
+ let test_indices = _mm256_set_epi8(
+ 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0,
+ 8, 8, 56, 48, 40, 32, 24, 16, 8, 0,
+ );
+ let test_data = _mm256_setr_epi64x(
+ 0xFF_FF_FF_FF_00_00_00_00,
+ 0xFF_00_FF_00_FF_00_FF_00,
+ 0xFF_00_00_00_00_00_00_00,
+ 0xAC_00_00_00_00_00_00_00,
+ );
+ let actual_result = _mm256_bitshuffle_epi64_mask(test_data, test_indices);
+ let reference_result = 0xF0 << 0 | 0x03 << 8 | 0xFF << 16 | 0xAC << 24;
+
+ assert_eq!(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_bitshuffle_epi64_mask() {
+ let test_indices = _mm256_set_epi8(
+ 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56, 32, 32, 16, 16, 0, 0,
+ 8, 8, 56, 48, 40, 32, 24, 16, 8, 0,
+ );
+ let test_data = _mm256_setr_epi64x(
+ 0xFF_FF_FF_FF_00_00_00_00,
+ 0xFF_00_FF_00_FF_00_FF_00,
+ 0xFF_00_00_00_00_00_00_00,
+ 0xAC_00_00_00_00_00_00_00,
+ );
+ let mask = 0xFF_FF_00_00;
+ let actual_result = _mm256_mask_bitshuffle_epi64_mask(mask, test_data, test_indices);
+ let reference_result = 0x00 << 0 | 0x00 << 8 | 0xFF << 16 | 0xAC << 24;
+
+ assert_eq!(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm_bitshuffle_epi64_mask() {
+ let test_indices = _mm_set_epi8(
+ 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56,
+ );
+ let test_data = _mm_setr_epi64x(0xFF_00_00_00_00_00_00_00, 0xAC_00_00_00_00_00_00_00);
+ let actual_result = _mm_bitshuffle_epi64_mask(test_data, test_indices);
+ let reference_result = 0xFF << 0 | 0xAC << 8;
+
+ assert_eq!(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512bitalg,avx512f,avx512vl")]
+ unsafe fn test_mm_mask_bitshuffle_epi64_mask() {
+ let test_indices = _mm_set_epi8(
+ 63, 62, 61, 60, 59, 58, 57, 56, 63, 62, 61, 60, 59, 58, 57, 56,
+ );
+ let test_data = _mm_setr_epi64x(0xFF_00_00_00_00_00_00_00, 0xAC_00_00_00_00_00_00_00);
+ let mask = 0xFF_00;
+ let actual_result = _mm_mask_bitshuffle_epi64_mask(mask, test_data, test_indices);
+ let reference_result = 0x00 << 0 | 0xAC << 8;
+
+ assert_eq!(actual_result, reference_result);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
new file mode 100644
index 000000000..47d565cea
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -0,0 +1,18785 @@
+use crate::{
+ arch::asm,
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::{self, transmute},
+ ptr,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use super::avx512f::{vpl, vps};
+
+/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_epi16&expand=30)
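+///
+/// A minimal sketch (illustrative values; requires `avx512bw`):
+///
+/// ```ignore
+/// let v = _mm512_set1_epi16(-7);
+/// // Every 16-bit lane of `r` holds 7.
+/// let r = _mm512_abs_epi16(v);
+/// ```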
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+pub unsafe fn _mm512_abs_epi16(a: __m512i) -> __m512i {
+ let a = a.as_i16x32();
+ // all-0 is a properly initialized i16x32
+ let zero: i16x32 = mem::zeroed();
+ let sub = simd_sub(zero, a);
+ let cmp: i16x32 = simd_gt(a, zero);
+ transmute(simd_select(cmp, a, sub))
+}
+
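+// A scalar sketch of the branchless absolute-value idiom used above (an
+// illustration only; the helper name is hypothetical and not part of the
+// crate): select `a` where `a > 0`, otherwise `0 - a`, with wrapping
+// arithmetic so that i16::MIN maps to itself, matching vpabsw.
+#[allow(dead_code)]
+fn abs_epi16_lane_model(a: i16) -> i16 {
+    let sub = 0i16.wrapping_sub(a);
+    if a > 0 {
+        a
+    } else {
+        sub
+    }
+}
+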
+/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_epi16&expand=31)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+pub unsafe fn _mm512_mask_abs_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+ let abs = _mm512_abs_epi16(a).as_i16x32();
+ transmute(simd_select_bitmask(k, abs, src.as_i16x32()))
+}
+
+/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_abs_epi16&expand=32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+pub unsafe fn _mm512_maskz_abs_epi16(k: __mmask32, a: __m512i) -> __m512i {
+ let abs = _mm512_abs_epi16(a).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
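+// A scalar model of the writemask/zeromask pattern that recurs throughout
+// this file (a sketch of the observable behavior of simd_select_bitmask,
+// not its implementation; the helper name is hypothetical): lane i keeps
+// the computed value when bit i of k is set, and otherwise falls back to
+// src (for the mask_* variants) or zero (for the maskz_* variants).
+#[allow(dead_code)]
+fn select_bitmask_model(k: u32, computed: [i16; 32], fallback: [i16; 32]) -> [i16; 32] {
+    let mut out = [0i16; 32];
+    for i in 0..32 {
+        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { fallback[i] };
+    }
+    out
+}
+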
+/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_abs_epi16&expand=28)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+pub unsafe fn _mm256_mask_abs_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+ let abs = _mm256_abs_epi16(a).as_i16x16();
+ transmute(simd_select_bitmask(k, abs, src.as_i16x16()))
+}
+
+/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_abs_epi16&expand=29)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+pub unsafe fn _mm256_maskz_abs_epi16(k: __mmask16, a: __m256i) -> __m256i {
+ let abs = _mm256_abs_epi16(a).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_abs_epi16&expand=25)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+pub unsafe fn _mm_mask_abs_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let abs = _mm_abs_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(k, abs, src.as_i16x8()))
+}
+
+/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_abs_epi16&expand=26)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+pub unsafe fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ let abs = _mm_abs_epi16(a).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_epi8&expand=57)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+pub unsafe fn _mm512_abs_epi8(a: __m512i) -> __m512i {
+ let a = a.as_i8x64();
+ // all-0 is a properly initialized i8x64
+ let zero: i8x64 = mem::zeroed();
+ let sub = simd_sub(zero, a);
+ let cmp: i8x64 = simd_gt(a, zero);
+ transmute(simd_select(cmp, a, sub))
+}
+
+/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_epi8&expand=58)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+pub unsafe fn _mm512_mask_abs_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
+ let abs = _mm512_abs_epi8(a).as_i8x64();
+ transmute(simd_select_bitmask(k, abs, src.as_i8x64()))
+}
+
+/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_abs_epi8&expand=59)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+pub unsafe fn _mm512_maskz_abs_epi8(k: __mmask64, a: __m512i) -> __m512i {
+ let abs = _mm512_abs_epi8(a).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_abs_epi8&expand=55)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+pub unsafe fn _mm256_mask_abs_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
+ let abs = _mm256_abs_epi8(a).as_i8x32();
+ transmute(simd_select_bitmask(k, abs, src.as_i8x32()))
+}
+
+/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_abs_epi8&expand=56)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+pub unsafe fn _mm256_maskz_abs_epi8(k: __mmask32, a: __m256i) -> __m256i {
+ let abs = _mm256_abs_epi8(a).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_abs_epi8&expand=52)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+pub unsafe fn _mm_mask_abs_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+ let abs = _mm_abs_epi8(a).as_i8x16();
+ transmute(simd_select_bitmask(k, abs, src.as_i8x16()))
+}
+
+/// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_abs_epi8&expand=53)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+pub unsafe fn _mm_maskz_abs_epi8(k: __mmask16, a: __m128i) -> __m128i {
+ let abs = _mm_abs_epi8(a).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Add packed 16-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_epi16&expand=91)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+pub unsafe fn _mm512_add_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_add(a.as_i16x32(), b.as_i16x32()))
+}
+
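+// Lanewise, this packed add wraps on overflow rather than saturating; a
+// scalar sketch of one lane (helper name hypothetical, illustration only):
+#[allow(dead_code)]
+fn add_epi16_lane_model(a: i16, b: i16) -> i16 {
+    // e.g. i16::MAX.wrapping_add(1) == i16::MIN
+    a.wrapping_add(b)
+}
+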
+/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_epi16&expand=92)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+pub unsafe fn _mm512_mask_add_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let add = _mm512_add_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, add, src.as_i16x32()))
+}
+
+/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_epi16&expand=93)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+pub unsafe fn _mm512_maskz_add_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let add = _mm512_add_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_epi16&expand=89)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+pub unsafe fn _mm256_mask_add_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let add = _mm256_add_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, add, src.as_i16x16()))
+}
+
+/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_epi16&expand=90)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+pub unsafe fn _mm256_maskz_add_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let add = _mm256_add_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_epi16&expand=86)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+pub unsafe fn _mm_mask_add_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let add = _mm_add_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, add, src.as_i16x8()))
+}
+
+/// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_epi16&expand=87)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+pub unsafe fn _mm_maskz_add_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let add = _mm_add_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 8-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_epi8&expand=118)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+pub unsafe fn _mm512_add_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_add(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_epi8&expand=119)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+pub unsafe fn _mm512_mask_add_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let add = _mm512_add_epi8(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, add, src.as_i8x64()))
+}
+
+/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_epi8&expand=120)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+pub unsafe fn _mm512_maskz_add_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let add = _mm512_add_epi8(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_epi8&expand=116)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+pub unsafe fn _mm256_mask_add_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let add = _mm256_add_epi8(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, add, src.as_i8x32()))
+}
+
+/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_epi8&expand=117)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+pub unsafe fn _mm256_maskz_add_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let add = _mm256_add_epi8(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_epi8&expand=113)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+pub unsafe fn _mm_mask_add_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let add = _mm_add_epi8(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, add, src.as_i8x16()))
+}
+
+/// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_epi8&expand=114)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+pub unsafe fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let add = _mm_add_epi8(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_adds_epu16&expand=197)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+pub unsafe fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddusw(
+ a.as_u16x32(),
+ b.as_u16x32(),
+ _mm512_setzero_si512().as_u16x32(),
+ 0b11111111_11111111_11111111_11111111,
+ ))
+}
+
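+// Note that the unmasked wrapper above passes an all-ones __mmask32 (i.e.
+// u32::MAX) so every lane of the intrinsic's result is kept. Lanewise the
+// operation saturates instead of wrapping; a scalar sketch (helper name
+// hypothetical, assuming saturating_add mirrors vpaddusw per lane):
+#[allow(dead_code)]
+fn adds_epu16_lane_model(a: u16, b: u16) -> u16 {
+    // e.g. 0xFFFF_u16.saturating_add(1) == 0xFFFF
+    a.saturating_add(b)
+}
+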
+/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_adds_epu16&expand=198)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+pub unsafe fn _mm512_mask_adds_epu16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ transmute(vpaddusw(a.as_u16x32(), b.as_u16x32(), src.as_u16x32(), k))
+}
+
+/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_adds_epu16&expand=199)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+pub unsafe fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddusw(
+ a.as_u16x32(),
+ b.as_u16x32(),
+ _mm512_setzero_si512().as_u16x32(),
+ k,
+ ))
+}
+
+/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_adds_epu16&expand=195)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+pub unsafe fn _mm256_mask_adds_epu16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ transmute(vpaddusw256(
+ a.as_u16x16(),
+ b.as_u16x16(),
+ src.as_u16x16(),
+ k,
+ ))
+}
+
+/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_adds_epu16&expand=196)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+pub unsafe fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpaddusw256(
+ a.as_u16x16(),
+ b.as_u16x16(),
+ _mm256_setzero_si256().as_u16x16(),
+ k,
+ ))
+}
+
+/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_adds_epu16&expand=192)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+pub unsafe fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpaddusw128(a.as_u16x8(), b.as_u16x8(), src.as_u16x8(), k))
+}
+
+/// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_adds_epu16&expand=193)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+pub unsafe fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpaddusw128(
+ a.as_u16x8(),
+ b.as_u16x8(),
+ _mm_setzero_si128().as_u16x8(),
+ k,
+ ))
+}
+
+/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_adds_epu8&expand=206)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+pub unsafe fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddusb(
+ a.as_u8x64(),
+ b.as_u8x64(),
+ _mm512_setzero_si512().as_u8x64(),
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ ))
+}
+
+/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_adds_epu8&expand=207)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+pub unsafe fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddusb(a.as_u8x64(), b.as_u8x64(), src.as_u8x64(), k))
+}
+
+/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_adds_epu8&expand=208)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+pub unsafe fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddusb(
+ a.as_u8x64(),
+ b.as_u8x64(),
+ _mm512_setzero_si512().as_u8x64(),
+ k,
+ ))
+}
+
+/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_adds_epu8&expand=204)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+pub unsafe fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpaddusb256(a.as_u8x32(), b.as_u8x32(), src.as_u8x32(), k))
+}
+
+/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_adds_epu8&expand=205)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+pub unsafe fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpaddusb256(
+ a.as_u8x32(),
+ b.as_u8x32(),
+ _mm256_setzero_si256().as_u8x32(),
+ k,
+ ))
+}
+
+/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_adds_epu8&expand=201)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+pub unsafe fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpaddusb128(a.as_u8x16(), b.as_u8x16(), src.as_u8x16(), k))
+}
+
+/// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_adds_epu8&expand=202)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+pub unsafe fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpaddusb128(
+ a.as_u8x16(),
+ b.as_u8x16(),
+ _mm_setzero_si128().as_u8x16(),
+ k,
+ ))
+}
+
+/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_adds_epi16&expand=179)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+pub unsafe fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddsw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_setzero_si512().as_i16x32(),
+ 0b11111111_11111111_11111111_11111111,
+ ))
+}
+
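+// Signed saturation clamps to the i16 range instead of wrapping; a scalar
+// sketch of the lanewise behavior (helper name hypothetical):
+#[allow(dead_code)]
+fn adds_epi16_lane_model(a: i16, b: i16) -> i16 {
+    // e.g. i16::MAX.saturating_add(1) == i16::MAX
+    a.saturating_add(b)
+}
+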
+/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_adds_epi16&expand=180)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+pub unsafe fn _mm512_mask_adds_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ transmute(vpaddsw(a.as_i16x32(), b.as_i16x32(), src.as_i16x32(), k))
+}
+
+/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_adds_epi16&expand=181)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+pub unsafe fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddsw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_setzero_si512().as_i16x32(),
+ k,
+ ))
+}
+
+/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_adds_epi16&expand=177)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+pub unsafe fn _mm256_mask_adds_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ transmute(vpaddsw256(a.as_i16x16(), b.as_i16x16(), src.as_i16x16(), k))
+}
+
+/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_adds_epi16&expand=178)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+pub unsafe fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpaddsw256(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ _mm256_setzero_si256().as_i16x16(),
+ k,
+ ))
+}
+
+/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_adds_epi16&expand=174)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+pub unsafe fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpaddsw128(a.as_i16x8(), b.as_i16x8(), src.as_i16x8(), k))
+}
+
+/// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_adds_epi16&expand=175)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+pub unsafe fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpaddsw128(
+ a.as_i16x8(),
+ b.as_i16x8(),
+ _mm_setzero_si128().as_i16x8(),
+ k,
+ ))
+}
+
+/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_adds_epi8&expand=188)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+pub unsafe fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddsb(
+ a.as_i8x64(),
+ b.as_i8x64(),
+ _mm512_setzero_si512().as_i8x64(),
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ ))
+}
+
+/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_adds_epi8&expand=189)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+pub unsafe fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddsb(a.as_i8x64(), b.as_i8x64(), src.as_i8x64(), k))
+}
+
+/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_adds_epi8&expand=190)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+pub unsafe fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpaddsb(
+ a.as_i8x64(),
+ b.as_i8x64(),
+ _mm512_setzero_si512().as_i8x64(),
+ k,
+ ))
+}
+
+/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_adds_epi8&expand=186)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+pub unsafe fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpaddsb256(a.as_i8x32(), b.as_i8x32(), src.as_i8x32(), k))
+}
+
+/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_adds_epi8&expand=187)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+pub unsafe fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpaddsb256(
+ a.as_i8x32(),
+ b.as_i8x32(),
+ _mm256_setzero_si256().as_i8x32(),
+ k,
+ ))
+}
+
+/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_adds_epi8&expand=183)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+pub unsafe fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpaddsb128(a.as_i8x16(), b.as_i8x16(), src.as_i8x16(), k))
+}
+
+/// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_adds_epi8&expand=184)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+pub unsafe fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpaddsb128(
+ a.as_i8x16(),
+ b.as_i8x16(),
+ _mm_setzero_si128().as_i8x16(),
+ k,
+ ))
+}
+
+/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi16&expand=5685)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+pub unsafe fn _mm512_sub_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_sub(a.as_i16x32(), b.as_i16x32()))
+}
+
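+// As with the packed adds, plain packed subtraction wraps on overflow; a
+// scalar sketch of one lane (helper name hypothetical, illustration only):
+#[allow(dead_code)]
+fn sub_epi16_lane_model(a: i16, b: i16) -> i16 {
+    // e.g. i16::MIN.wrapping_sub(1) == i16::MAX
+    a.wrapping_sub(b)
+}
+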
+/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_epi16&expand=5683)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+pub unsafe fn _mm512_mask_sub_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let sub = _mm512_sub_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, sub, src.as_i16x32()))
+}
+
+/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_epi16&expand=5684)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+pub unsafe fn _mm512_maskz_sub_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let sub = _mm512_sub_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_epi16&expand=5680)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+pub unsafe fn _mm256_mask_sub_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let sub = _mm256_sub_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, sub, src.as_i16x16()))
+}
+
+/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_epi16&expand=5681)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+pub unsafe fn _mm256_maskz_sub_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let sub = _mm256_sub_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_epi16&expand=5677)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+pub unsafe fn _mm_mask_sub_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let sub = _mm_sub_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, sub, src.as_i16x8()))
+}
+
+/// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_epi16&expand=5678)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+pub unsafe fn _mm_maskz_sub_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let sub = _mm_sub_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi8&expand=5712)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+pub unsafe fn _mm512_sub_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_sub(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_epi8&expand=5710)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+pub unsafe fn _mm512_mask_sub_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let sub = _mm512_sub_epi8(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, sub, src.as_i8x64()))
+}
+
+/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_epi8&expand=5711)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+pub unsafe fn _mm512_maskz_sub_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let sub = _mm512_sub_epi8(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_epi8&expand=5707)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+pub unsafe fn _mm256_mask_sub_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let sub = _mm256_sub_epi8(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, sub, src.as_i8x32()))
+}
+
+/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_epi8&expand=5708)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+pub unsafe fn _mm256_maskz_sub_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let sub = _mm256_sub_epi8(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_epi8&expand=5704)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+pub unsafe fn _mm_mask_sub_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let sub = _mm_sub_epi8(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, sub, src.as_i8x16()))
+}
+
+/// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_epi8&expand=5705)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+pub unsafe fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let sub = _mm_sub_epi8(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_subs_epu16&expand=5793)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+pub unsafe fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubusw(
+ a.as_u16x32(),
+ b.as_u16x32(),
+ _mm512_setzero_si512().as_u16x32(),
+ 0b11111111_11111111_11111111_11111111,
+ ))
+}
+
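+// Unsigned saturating subtraction clamps at zero; a scalar sketch of the
+// lanewise behavior (helper name hypothetical):
+#[allow(dead_code)]
+fn subs_epu16_lane_model(a: u16, b: u16) -> u16 {
+    // e.g. 1_u16.saturating_sub(2) == 0
+    a.saturating_sub(b)
+}
+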
+/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_subs_epu16&expand=5791)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+pub unsafe fn _mm512_mask_subs_epu16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ transmute(vpsubusw(a.as_u16x32(), b.as_u16x32(), src.as_u16x32(), k))
+}
+
+/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_subs_epu16&expand=5792)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+pub unsafe fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubusw(
+ a.as_u16x32(),
+ b.as_u16x32(),
+ _mm512_setzero_si512().as_u16x32(),
+ k,
+ ))
+}
+
+/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_subs_epu16&expand=5788)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+pub unsafe fn _mm256_mask_subs_epu16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ transmute(vpsubusw256(
+ a.as_u16x16(),
+ b.as_u16x16(),
+ src.as_u16x16(),
+ k,
+ ))
+}
+
+/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_subs_epu16&expand=5789)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+pub unsafe fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpsubusw256(
+ a.as_u16x16(),
+ b.as_u16x16(),
+ _mm256_setzero_si256().as_u16x16(),
+ k,
+ ))
+}
+
+/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_subs_epu16&expand=5785)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+pub unsafe fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpsubusw128(a.as_u16x8(), b.as_u16x8(), src.as_u16x8(), k))
+}
+
+/// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_subs_epu16&expand=5786)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+pub unsafe fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpsubusw128(
+ a.as_u16x8(),
+ b.as_u16x8(),
+ _mm_setzero_si128().as_u16x8(),
+ k,
+ ))
+}
+
+/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_subs_epu8&expand=5802)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+pub unsafe fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubusb(
+ a.as_u8x64(),
+ b.as_u8x64(),
+ _mm512_setzero_si512().as_u8x64(),
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ ))
+}
+
+/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_subs_epu8&expand=5800)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+pub unsafe fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubusb(a.as_u8x64(), b.as_u8x64(), src.as_u8x64(), k))
+}
+
+/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_subs_epu8&expand=5801)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+pub unsafe fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubusb(
+ a.as_u8x64(),
+ b.as_u8x64(),
+ _mm512_setzero_si512().as_u8x64(),
+ k,
+ ))
+}
+
+/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_subs_epu8&expand=5797)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+pub unsafe fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpsubusb256(a.as_u8x32(), b.as_u8x32(), src.as_u8x32(), k))
+}
+
+/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_subs_epu8&expand=5798)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+pub unsafe fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpsubusb256(
+ a.as_u8x32(),
+ b.as_u8x32(),
+ _mm256_setzero_si256().as_u8x32(),
+ k,
+ ))
+}
+
+/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_subs_epu8&expand=5794)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+pub unsafe fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpsubusb128(a.as_u8x16(), b.as_u8x16(), src.as_u8x16(), k))
+}
+
+/// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_subs_epu8&expand=5795)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+pub unsafe fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpsubusb128(
+ a.as_u8x16(),
+ b.as_u8x16(),
+ _mm_setzero_si128().as_u8x16(),
+ k,
+ ))
+}
+
+/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_subs_epi16&expand=5775)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+pub unsafe fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubsw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_setzero_si512().as_i16x32(),
+ 0b11111111_11111111_11111111_11111111,
+ ))
+}
+
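+// Signed saturating subtraction clamps to the i16 range; a scalar sketch
+// (helper name hypothetical, illustration only):
+#[allow(dead_code)]
+fn subs_epi16_lane_model(a: i16, b: i16) -> i16 {
+    // e.g. i16::MIN.saturating_sub(1) == i16::MIN
+    a.saturating_sub(b)
+}
+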
+/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_subs_epi16&expand=5773)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+pub unsafe fn _mm512_mask_subs_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ transmute(vpsubsw(a.as_i16x32(), b.as_i16x32(), src.as_i16x32(), k))
+}
+
+/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_subs_epi16&expand=5774)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+pub unsafe fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubsw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_setzero_si512().as_i16x32(),
+ k,
+ ))
+}
+
+/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_subs_epi16&expand=5770)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+pub unsafe fn _mm256_mask_subs_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ transmute(vpsubsw256(a.as_i16x16(), b.as_i16x16(), src.as_i16x16(), k))
+}
+
+/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_subs_epi16&expand=5771)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+pub unsafe fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpsubsw256(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ _mm256_setzero_si256().as_i16x16(),
+ k,
+ ))
+}
+
+/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_subs_epi16&expand=5767)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+pub unsafe fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpsubsw128(a.as_i16x8(), b.as_i16x8(), src.as_i16x8(), k))
+}
+
+/// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_subs_epi16&expand=5768)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+pub unsafe fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpsubsw128(
+ a.as_i16x8(),
+ b.as_i16x8(),
+ _mm_setzero_si128().as_i16x8(),
+ k,
+ ))
+}
+
+/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_subs_epi8&expand=5784)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub unsafe fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubsb(
+ a.as_i8x64(),
+ b.as_i8x64(),
+ _mm512_setzero_si512().as_i8x64(),
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ ))
+}
+
+/// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_subs_epi8&expand=5782)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub unsafe fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubsb(a.as_i8x64(), b.as_i8x64(), src.as_i8x64(), k))
+}
+
+/// Subtract packed signed 8-bit integers in b from packed signed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_subs_epi8&expand=5783)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub unsafe fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsubsb(
+ a.as_i8x64(),
+ b.as_i8x64(),
+ _mm512_setzero_si512().as_i8x64(),
+ k,
+ ))
+}
+
+/// Subtract packed signed 8-bit integers in b from packed signed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_subs_epi8&expand=5779)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub unsafe fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpsubsb256(a.as_i8x32(), b.as_i8x32(), src.as_i8x32(), k))
+}
+
+/// Subtract packed signed 8-bit integers in b from packed signed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_subs_epi8&expand=5780)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub unsafe fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpsubsb256(
+ a.as_i8x32(),
+ b.as_i8x32(),
+ _mm256_setzero_si256().as_i8x32(),
+ k,
+ ))
+}
+
+/// Subtract packed signed 8-bit integers in b from packed signed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_subs_epi8&expand=5776)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub unsafe fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpsubsb128(a.as_i8x16(), b.as_i8x16(), src.as_i8x16(), k))
+}
+
+/// Subtract packed signed 8-bit integers in b from packed signed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_subs_epi8&expand=5777)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+pub unsafe fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpsubsb128(
+ a.as_i8x16(),
+ b.as_i8x16(),
+ _mm_setzero_si128().as_i8x16(),
+ k,
+ ))
+}
+
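+// A minimal sketch of the zeromask variant (illustrative test-only helper),
+// assuming AVX-512BW support: masked-off lanes become 0, not copies of a src.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn subs_epi8_zeromask_sketch() {
+ let a = _mm512_set1_epi8(i8::MIN);
+ let b = _mm512_set1_epi8(1);
+ // All mask bits set: i8::MIN - 1 saturates to i8::MIN in every lane.
+ let all = _mm512_maskz_subs_epi8(!0, a, b);
+ assert_eq!(_mm512_cmpeq_epi8_mask(all, a), !0);
+ // No mask bits set: every lane is zeroed.
+ let none = _mm512_maskz_subs_epi8(0, a, b);
+ assert_eq!(_mm512_cmpeq_epi8_mask(none, _mm512_setzero_si512()), !0);
+}
+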
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mulhi_epu16&expand=3973)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub unsafe fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmulhuw(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mulhi_epu16&expand=3971)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub unsafe fn _mm512_mask_mulhi_epu16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let mul = _mm512_mulhi_epu16(a, b).as_u16x32();
+ transmute(simd_select_bitmask(k, mul, src.as_u16x32()))
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mulhi_epu16&expand=3972)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub unsafe fn _mm512_maskz_mulhi_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mulhi_epu16(a, b).as_u16x32();
+ let zero = _mm512_setzero_si512().as_u16x32();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mulhi_epu16&expand=3968)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub unsafe fn _mm256_mask_mulhi_epu16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let mul = _mm256_mulhi_epu16(a, b).as_u16x16();
+ transmute(simd_select_bitmask(k, mul, src.as_u16x16()))
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mulhi_epu16&expand=3969)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub unsafe fn _mm256_maskz_mulhi_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mulhi_epu16(a, b).as_u16x16();
+ let zero = _mm256_setzero_si256().as_u16x16();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mulhi_epu16&expand=3965)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub unsafe fn _mm_mask_mulhi_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mulhi_epu16(a, b).as_u16x8();
+ transmute(simd_select_bitmask(k, mul, src.as_u16x8()))
+}
+
+/// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mulhi_epu16&expand=3966)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+pub unsafe fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mulhi_epu16(a, b).as_u16x8();
+ let zero = _mm_setzero_si128().as_u16x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
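+// A worked example of the unsigned high multiply (illustrative test-only
+// helper), assuming AVX-512BW support:
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn mulhi_epu16_sketch() {
+ // 0xFFFF * 0xFFFF = 0xFFFE_0001, so the stored high half is 0xFFFE (-2 as i16).
+ let a = _mm512_set1_epi16(-1); // all lanes 0xFFFF = 65535 unsigned
+ let r = _mm512_mulhi_epu16(a, a);
+ assert_eq!(_mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(-2)), !0);
+}
+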
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mulhi_epi16&expand=3962)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub unsafe fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmulhw(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mulhi_epi16&expand=3960)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub unsafe fn _mm512_mask_mulhi_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let mul = _mm512_mulhi_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x32()))
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mulhi_epi16&expand=3961)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub unsafe fn _mm512_maskz_mulhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mulhi_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mulhi_epi16&expand=3957)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub unsafe fn _mm256_mask_mulhi_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let mul = _mm256_mulhi_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x16()))
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mulhi_epi16&expand=3958)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub unsafe fn _mm256_maskz_mulhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mulhi_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mulhi_epi16&expand=3954)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub unsafe fn _mm_mask_mulhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mulhi_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x8()))
+}
+
+/// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mulhi_epi16&expand=3955)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+pub unsafe fn _mm_maskz_mulhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mulhi_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
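+// The signed high multiply differs from the unsigned one on the same bit
+// patterns; a small contrast (illustrative test-only helper):
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn mulhi_signedness_sketch() {
+ let a = _mm512_set1_epi16(i16::MIN); // 0x8000 per lane
+ let b = _mm512_set1_epi16(2);
+ // Signed: -32768 * 2 = 0xFFFF_0000, so the high half is 0xFFFF = -1.
+ let s = _mm512_mulhi_epi16(a, b);
+ assert_eq!(_mm512_cmpeq_epi16_mask(s, _mm512_set1_epi16(-1)), !0);
+ // Unsigned view of the same bits: 32768 * 2 = 0x0001_0000, high half 1.
+ let u = _mm512_mulhi_epu16(a, b);
+ assert_eq!(_mm512_cmpeq_epi16_mask(u, _mm512_set1_epi16(1)), !0);
+}
+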
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mulhrs_epi16&expand=3986)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+pub unsafe fn _mm512_mulhrs_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmulhrsw(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mulhrs_epi16&expand=3984)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+pub unsafe fn _mm512_mask_mulhrs_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let mul = _mm512_mulhrs_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x32()))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mulhrs_epi16&expand=3985)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+pub unsafe fn _mm512_maskz_mulhrs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mulhrs_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mulhrs_epi16&expand=3981)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+pub unsafe fn _mm256_mask_mulhrs_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let mul = _mm256_mulhrs_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x16()))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mulhrs_epi16&expand=3982)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+pub unsafe fn _mm256_maskz_mulhrs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mulhrs_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mulhrs_epi16&expand=3978)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+pub unsafe fn _mm_mask_mulhrs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mulhrs_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x8()))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mulhrs_epi16&expand=3979)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+pub unsafe fn _mm_maskz_mulhrs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mulhrs_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
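+// The truncate-and-round sequence above is the usual Q15 fixed-point multiply;
+// a worked example (illustrative test-only helper):
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn mulhrs_epi16_q15_sketch() {
+ // 0.5 * 0.5 in Q15: 0x4000 * 0x4000 = 0x1000_0000; >>14 gives 0x4000,
+ // +1 then taking bits [16:1] gives 0x2000, i.e. 0.25 in Q15.
+ let half = _mm512_set1_epi16(0x4000);
+ let r = _mm512_mulhrs_epi16(half, half);
+ assert_eq!(_mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(0x2000)), !0);
+}
+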
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mullo_epi16&expand=3996)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub unsafe fn _mm512_mullo_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_mul(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mullo_epi16&expand=3994)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub unsafe fn _mm512_mask_mullo_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let mul = _mm512_mullo_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x32()))
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mullo_epi16&expand=3995)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub unsafe fn _mm512_maskz_mullo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mullo_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mullo_epi16&expand=3991)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub unsafe fn _mm256_mask_mullo_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let mul = _mm256_mullo_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x16()))
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mullo_epi16&expand=3992)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub unsafe fn _mm256_maskz_mullo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mullo_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mullo_epi16&expand=3988)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub unsafe fn _mm_mask_mullo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mullo_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, mul, src.as_i16x8()))
+}
+
+/// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mullo_epi16&expand=3989)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+pub unsafe fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mullo_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
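+// The low half simply wraps modulo 2^16; a quick demonstration (illustrative
+// test-only helper):
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn mullo_epi16_wrap_sketch() {
+ // 300 * 300 = 90_000 = 0x0001_5F90; only the low 0x5F90 = 24_464 is kept.
+ let a = _mm512_set1_epi16(300);
+ let r = _mm512_mullo_epi16(a, a);
+ assert_eq!(_mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(24_464)), !0);
+}
+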
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epu16&expand=3609)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub unsafe fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaxuw(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epu16&expand=3607)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub unsafe fn _mm512_mask_max_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epu16(a, b).as_u16x32();
+ transmute(simd_select_bitmask(k, max, src.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epu16&expand=3608)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub unsafe fn _mm512_maskz_max_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epu16(a, b).as_u16x32();
+ let zero = _mm512_setzero_si512().as_u16x32();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epu16&expand=3604)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub unsafe fn _mm256_mask_max_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epu16(a, b).as_u16x16();
+ transmute(simd_select_bitmask(k, max, src.as_u16x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epu16&expand=3605)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub unsafe fn _mm256_maskz_max_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epu16(a, b).as_u16x16();
+ let zero = _mm256_setzero_si256().as_u16x16();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epu16&expand=3601)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub unsafe fn _mm_mask_max_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epu16(a, b).as_u16x8();
+ transmute(simd_select_bitmask(k, max, src.as_u16x8()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epu16&expand=3602)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+pub unsafe fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epu16(a, b).as_u16x8();
+ let zero = _mm_setzero_si128().as_u16x8();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
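+// Signedness changes which lane value "wins"; a small contrast between the
+// unsigned and signed maximum (illustrative test-only helper):
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn max_epu16_signedness_sketch() {
+ let a = _mm512_set1_epi16(-1); // 0xFFFF = 65535 unsigned
+ let b = _mm512_setzero_si512();
+ // Unsigned compare: 0xFFFF is the largest value, so `a` wins every lane.
+ let u = _mm512_max_epu16(a, b);
+ assert_eq!(_mm512_cmpeq_epi16_mask(u, a), !0);
+ // Signed compare of the same bits: -1 < 0, so zero wins.
+ let s = _mm512_max_epi16(a, b);
+ assert_eq!(_mm512_cmpeq_epi16_mask(s, b), !0);
+}
+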
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epu8&expand=3636)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub unsafe fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaxub(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epu8&expand=3634)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub unsafe fn _mm512_mask_max_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epu8(a, b).as_u8x64();
+ transmute(simd_select_bitmask(k, max, src.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epu8&expand=3635)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub unsafe fn _mm512_maskz_max_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epu8(a, b).as_u8x64();
+ let zero = _mm512_setzero_si512().as_u8x64();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epu8&expand=3631)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub unsafe fn _mm256_mask_max_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epu8(a, b).as_u8x32();
+ transmute(simd_select_bitmask(k, max, src.as_u8x32()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epu8&expand=3632)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub unsafe fn _mm256_maskz_max_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epu8(a, b).as_u8x32();
+ let zero = _mm256_setzero_si256().as_u8x32();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epu8&expand=3628)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub unsafe fn _mm_mask_max_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epu8(a, b).as_u8x16();
+ transmute(simd_select_bitmask(k, max, src.as_u8x16()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epu8&expand=3629)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+pub unsafe fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epu8(a, b).as_u8x16();
+ let zero = _mm_setzero_si128().as_u8x16();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epi16&expand=3573)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub unsafe fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaxsw(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epi16&expand=3571)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub unsafe fn _mm512_mask_max_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, max, src.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epi16&expand=3572)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub unsafe fn _mm512_maskz_max_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epi16&expand=3568)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub unsafe fn _mm256_mask_max_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, max, src.as_i16x16()))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epi16&expand=3569)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub unsafe fn _mm256_maskz_max_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epi16&expand=3565)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub unsafe fn _mm_mask_max_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, max, src.as_i16x8()))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epi16&expand=3566)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+pub unsafe fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epi8&expand=3600)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub unsafe fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaxsb(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epi8&expand=3598)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub unsafe fn _mm512_mask_max_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epi8(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, max, src.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epi8&expand=3599)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub unsafe fn _mm512_maskz_max_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epi8(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epi8&expand=3595)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub unsafe fn _mm256_mask_max_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epi8(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, max, src.as_i8x32()))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epi8&expand=3596)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub unsafe fn _mm256_maskz_max_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epi8(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epi8&expand=3592)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub unsafe fn _mm_mask_max_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epi8(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, max, src.as_i8x16()))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epi8&expand=3593)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+pub unsafe fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epi8(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
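+// The 128-bit variants need AVX512VL in addition to AVX512BW; a zeromask
+// sketch on the smallest width (illustrative test-only helper):
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw,avx512vl")]
+unsafe fn maskz_max_epi8_sketch() {
+ let a = _mm_set1_epi8(-5);
+ let b = _mm_set1_epi8(3);
+ // An all-zero mask zeroes every lane; no source operand is consulted.
+ let r = _mm_maskz_max_epi8(0, a, b);
+ assert_eq!(_mm_cmpeq_epi8_mask(r, _mm_setzero_si128()), !0);
+}
+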
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epu16&expand=3723)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub unsafe fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpminuw(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epu16&expand=3721)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub unsafe fn _mm512_mask_min_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epu16(a, b).as_u16x32();
+ transmute(simd_select_bitmask(k, min, src.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epu16&expand=3722)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub unsafe fn _mm512_maskz_min_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epu16(a, b).as_u16x32();
+ let zero = _mm512_setzero_si512().as_u16x32();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epu16&expand=3718)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub unsafe fn _mm256_mask_min_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epu16(a, b).as_u16x16();
+ transmute(simd_select_bitmask(k, min, src.as_u16x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epu16&expand=3719)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub unsafe fn _mm256_maskz_min_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epu16(a, b).as_u16x16();
+ let zero = _mm256_setzero_si256().as_u16x16();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epu16&expand=3715)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub unsafe fn _mm_mask_min_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epu16(a, b).as_u16x8();
+ transmute(simd_select_bitmask(k, min, src.as_u16x8()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epu16&expand=3716)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+pub unsafe fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epu16(a, b).as_u16x8();
+ let zero = _mm_setzero_si128().as_u16x8();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epu8&expand=3750)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub unsafe fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpminub(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epu8&expand=3748)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub unsafe fn _mm512_mask_min_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epu8(a, b).as_u8x64();
+ transmute(simd_select_bitmask(k, min, src.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epu8&expand=3749)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub unsafe fn _mm512_maskz_min_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epu8(a, b).as_u8x64();
+ let zero = _mm512_setzero_si512().as_u8x64();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epu8&expand=3745)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub unsafe fn _mm256_mask_min_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epu8(a, b).as_u8x32();
+ transmute(simd_select_bitmask(k, min, src.as_u8x32()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epu8&expand=3746)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub unsafe fn _mm256_maskz_min_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epu8(a, b).as_u8x32();
+ let zero = _mm256_setzero_si256().as_u8x32();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epu8&expand=3742)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub unsafe fn _mm_mask_min_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epu8(a, b).as_u8x16();
+ transmute(simd_select_bitmask(k, min, src.as_u8x16()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epu8&expand=3743)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminub))]
+pub unsafe fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epu8(a, b).as_u8x16();
+ let zero = _mm_setzero_si128().as_u8x16();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epi16&expand=3687)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub unsafe fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpminsw(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epi16&expand=3685)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub unsafe fn _mm512_mask_min_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, min, src.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epi16&expand=3686)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub unsafe fn _mm512_maskz_min_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epi16&expand=3682)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub unsafe fn _mm256_mask_min_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, min, src.as_i16x16()))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epi16&expand=3683)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub unsafe fn _mm256_maskz_min_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epi16&expand=3679)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub unsafe fn _mm_mask_min_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, min, src.as_i16x8()))
+}
+
+/// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epi16&expand=3680)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+pub unsafe fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epi8&expand=3714)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub unsafe fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpminsb(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epi8&expand=3712)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub unsafe fn _mm512_mask_min_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epi8(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, min, src.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epi8&expand=3713)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub unsafe fn _mm512_maskz_min_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epi8(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epi8&expand=3709)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub unsafe fn _mm256_mask_min_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epi8(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, min, src.as_i8x32()))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epi8&expand=3710)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub unsafe fn _mm256_maskz_min_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epi8(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epi8&expand=3706)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub unsafe fn _mm_mask_min_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epi8(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, min, src.as_i8x16()))
+}
+
+/// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epi8&expand=3707)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+pub unsafe fn _mm_maskz_min_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epi8(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epu16_mask&expand=1050)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<u16x32, _>(simd_lt(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epu16_mask&expand=1051)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmplt_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epu16_mask&expand=1048)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<u16x16, _>(simd_lt(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epu16_mask&expand=1049)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmplt_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epu16_mask&expand=1046)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u16x8, _>(simd_lt(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epu16_mask&expand=1047)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmplt_epu16_mask(a, b) & k1
+}
+
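+// Editor's note: an illustrative sketch, not upstream code; the helper name is
+// hypothetical. Each lane that compares true sets the corresponding bit of the
+// returned mask, and the masked form simply ANDs that result with `k1`.
+#[cfg(test)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn cmplt_epu16_demo() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ // 1 < 2 holds in all 32 lanes, so every mask bit is set.
+ assert_eq!(_mm512_cmplt_epu16_mask(a, b), u32::MAX);
+ // k1 restricts the result: only bits set in both survive.
+ assert_eq!(_mm512_mask_cmplt_epu16_mask(0b11, a, b), 0b11);
+}
+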
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epu8_mask&expand=1068)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<u8x64, _>(simd_lt(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epu8_mask&expand=1069)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmplt_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epu8_mask&expand=1066)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<u8x32, _>(simd_lt(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epu8_mask&expand=1067)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmplt_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epu8_mask&expand=1064)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<u8x16, _>(simd_lt(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epu8_mask&expand=1065)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmplt_epu8_mask(a, b) & k1
+}
+
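+// Editor's note: an illustrative sketch (hypothetical helper name). The 128-
+// and 256-bit variants additionally require AVX-512VL and return masks sized
+// to their lane count.
+#[cfg(test)]
+#[target_feature(enable = "avx512bw,avx512vl")]
+unsafe fn cmplt_epu8_vl_demo() {
+ let a = _mm_set1_epi8(0);
+ let b = _mm_set1_epi8(1);
+ // 16 byte lanes in a __m128i, so the result is a 16-bit mask.
+ let k: __mmask16 = _mm_cmplt_epu8_mask(a, b);
+ assert_eq!(k, u16::MAX);
+}
+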
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epi16_mask&expand=1022)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<i16x32, _>(simd_lt(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epi16_mask&expand=1023)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmplt_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epi16_mask&expand=1020)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<i16x16, _>(simd_lt(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epi16_mask&expand=1021)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmplt_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi16_mask&expand=1018)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epi16_mask&expand=1019)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmplt_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epi8_mask&expand=1044)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<i8x64, _>(simd_lt(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epi8_mask&expand=1045)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmplt_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epi8_mask&expand=1042)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<i8x32, _>(simd_lt(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epi8_mask&expand=1043)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmplt_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi8_mask&expand=1040)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epi8_mask&expand=1041)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmplt_epi8_mask(a, b) & k1
+}
+
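+// Editor's note: an illustrative sketch (hypothetical helper name) showing why
+// the epi8 and epu8 families are distinct: the same bit pattern orders
+// differently, since 0xFF is -1 as i8 but 255 as u8.
+#[cfg(test)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn signed_vs_unsigned_lt_demo() {
+ let a = _mm512_set1_epi8(-1); // 0xFF in every lane
+ let b = _mm512_set1_epi8(0);
+ // Signed: -1 < 0, so all 64 mask bits are set.
+ assert_eq!(_mm512_cmplt_epi8_mask(a, b), u64::MAX);
+ // Unsigned: 255 < 0 is false, so no mask bit is set.
+ assert_eq!(_mm512_cmplt_epu8_mask(a, b), 0);
+}
+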
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epu16_mask&expand=927)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<u16x32, _>(simd_gt(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epu16_mask&expand=928)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmpgt_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epu16_mask&expand=925)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<u16x16, _>(simd_gt(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epu16_mask&expand=926)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmpgt_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epu16_mask&expand=923)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u16x8, _>(simd_gt(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epu16_mask&expand=924)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpgt_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epu8_mask&expand=945)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<u8x64, _>(simd_gt(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epu8_mask&expand=946)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmpgt_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epu8_mask&expand=943)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<u8x32, _>(simd_gt(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epu8_mask&expand=944)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmpgt_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epu8_mask&expand=941)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<u8x16, _>(simd_gt(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epu8_mask&expand=942)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmpgt_epu8_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epi16_mask&expand=897)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<i16x32, _>(simd_gt(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epi16_mask&expand=898)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmpgt_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16_mask&expand=895)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epi16_mask&expand=896)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmpgt_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi16_mask&expand=893)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epi16_mask&expand=894)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpgt_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epi8_mask&expand=921)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<i8x64, _>(simd_gt(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epi8_mask&expand=922)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmpgt_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8_mask&expand=919)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epi8_mask&expand=920)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmpgt_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi8_mask&expand=917)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epi8_mask&expand=918)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmpgt_epi8_mask(a, b) & k1
+}
+
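+// Editor's note: an illustrative sketch (hypothetical helper name). Greater-
+// than is strict, so equal lanes contribute no bits, and swapping the operands
+// of the less-than comparison yields the same mask.
+#[cfg(test)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn cmpgt_demo() {
+ let a = _mm512_set1_epi16(5);
+ let b = _mm512_set1_epi16(5);
+ assert_eq!(_mm512_cmpgt_epi16_mask(a, b), 0);
+ assert_eq!(_mm512_cmpgt_epi16_mask(a, b), _mm512_cmplt_epi16_mask(b, a));
+}
+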
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epu16_mask&expand=989)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<u16x32, _>(simd_le(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epu16_mask&expand=990)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmple_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epu16_mask&expand=987)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<u16x16, _>(simd_le(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epu16_mask&expand=988)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmple_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epu16_mask&expand=985)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u16x8, _>(simd_le(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epu16_mask&expand=986)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmple_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epu8_mask&expand=1007)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<u8x64, _>(simd_le(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epu8_mask&expand=1008)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmple_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epu8_mask&expand=1005)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<u8x32, _>(simd_le(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epu8_mask&expand=1006)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmple_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epu8_mask&expand=1003)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<u8x16, _>(simd_le(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epu8_mask&expand=1004)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmple_epu8_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epi16_mask&expand=965)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<i16x32, _>(simd_le(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epi16_mask&expand=966)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmple_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epi16_mask&expand=963)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<i16x16, _>(simd_le(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epi16_mask&expand=964)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmple_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epi16_mask&expand=961)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i16x8, _>(simd_le(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epi16_mask&expand=962)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmple_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epi8_mask&expand=983)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<i8x64, _>(simd_le(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epi8_mask&expand=984)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmple_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epi8_mask&expand=981)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<i8x32, _>(simd_le(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epi8_mask&expand=982)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmple_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epi8_mask&expand=979)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<i8x16, _>(simd_le(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epi8_mask&expand=980)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmple_epi8_mask(a, b) & k1
+}
+
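+// Editor's note: an illustrative sketch (hypothetical helper name). Integer
+// lanes are totally ordered, so less-than-or-equal is the bitwise complement
+// of the strict greater-than mask.
+#[cfg(test)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn cmple_demo() {
+ let a = _mm512_set1_epi16(7);
+ let b = _mm512_set1_epi16(7);
+ assert_eq!(_mm512_cmple_epi16_mask(a, b), !_mm512_cmpgt_epi16_mask(a, b));
+}
+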
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epu16_mask&expand=867)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<u16x32, _>(simd_ge(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epu16_mask&expand=868)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmpge_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epu16_mask&expand=865)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<u16x16, _>(simd_ge(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epu16_mask&expand=866)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmpge_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epu16_mask&expand=863)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u16x8, _>(simd_ge(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epu16_mask&expand=864)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpge_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epu8_mask&expand=885)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<u8x64, _>(simd_ge(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epu8_mask&expand=886)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmpge_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epu8_mask&expand=883)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<u8x32, _>(simd_ge(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epu8_mask&expand=884)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmpge_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epu8_mask&expand=881)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<u8x16, _>(simd_ge(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epu8_mask&expand=882)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmpge_epu8_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epi16_mask&expand=843)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<i16x32, _>(simd_ge(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epi16_mask&expand=844)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmpge_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epi16_mask&expand=841)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<i16x16, _>(simd_ge(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epi16_mask&expand=842)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmpge_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epi16_mask&expand=839)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i16x8, _>(simd_ge(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epi16_mask&expand=840)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpge_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epi8_mask&expand=861)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<i8x64, _>(simd_ge(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epi8_mask&expand=862)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmpge_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epi8_mask&expand=859)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<i8x32, _>(simd_ge(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epi8_mask&expand=860)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmpge_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epi8_mask&expand=857)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<i8x16, _>(simd_ge(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epi8_mask&expand=858)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmpge_epi8_mask(a, b) & k1
+}
+
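+// Editor's note: an illustrative sketch (hypothetical helper name).
+// Greater-than-or-equal with swapped operands is the same mask as
+// less-than-or-equal.
+#[cfg(test)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn cmpge_demo() {
+ let a = _mm512_set1_epi8(3);
+ let b = _mm512_set1_epi8(9);
+ assert_eq!(_mm512_cmpge_epi8_mask(a, b), _mm512_cmple_epi8_mask(b, a));
+}
+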
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epu16_mask&expand=801)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<u16x32, _>(simd_eq(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epu16_mask&expand=802)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmpeq_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epu16_mask&expand=799)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<u16x16, _>(simd_eq(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epu16_mask&expand=800)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmpeq_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epu16_mask&expand=797)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u16x8, _>(simd_eq(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epu16_mask&expand=798)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpeq_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epu8_mask&expand=819)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<u8x64, _>(simd_eq(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epu8_mask&expand=820)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmpeq_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epu8_mask&expand=817)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<u8x32, _>(simd_eq(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epu8_mask&expand=818)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmpeq_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epu8_mask&expand=815)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<u8x16, _>(simd_eq(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epu8_mask&expand=816)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmpeq_epu8_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epi16_mask&expand=771)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<i16x32, _>(simd_eq(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epi16_mask&expand=772)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmpeq_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16_mask&expand=769)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epi16_mask&expand=770)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmpeq_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi16_mask&expand=767)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epi16_mask&expand=768)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpeq_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epi8_mask&expand=795)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<i8x64, _>(simd_eq(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epi8_mask&expand=796)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmpeq_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8_mask&expand=793)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epi8_mask&expand=794)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmpeq_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi8_mask&expand=791)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epi8_mask&expand=792)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmpeq_epi8_mask(a, b) & k1
+}
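+
+// Editor's illustrative sketch, not part of the upstream source: every masked
+// cmpeq variant above is just `full-width compare, then AND with k1`. A caller
+// might compose them like this (`bytes_equal_in_lanes` is a hypothetical name):
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn bytes_equal_in_lanes(a: __m512i, b: __m512i, lanes: __mmask64) -> __mmask64 {
+ // Bit i of the result is set iff byte i of `a` equals byte i of `b`
+ // and bit i of `lanes` is set.
+ _mm512_mask_cmpeq_epu8_mask(lanes, a, b)
+}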
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epu16_mask&expand=1106)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<u16x32, _>(simd_ne(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epu16_mask&expand=1107)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmpneq_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epu16_mask&expand=1104)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<u16x16, _>(simd_ne(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epu16_mask&expand=1105)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmpneq_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epu16_mask&expand=1102)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u16x8, _>(simd_ne(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epu16_mask&expand=1103)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpneq_epu16_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epu8_mask&expand=1124)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<u8x64, _>(simd_ne(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epu8_mask&expand=1125)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmpneq_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epu8_mask&expand=1122)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<u8x32, _>(simd_ne(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epu8_mask&expand=1123)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmpneq_epu8_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epu8_mask&expand=1120)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<u8x16, _>(simd_ne(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epu8_mask&expand=1121)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmpneq_epu8_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epi16_mask&expand=1082)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ simd_bitmask::<i16x32, _>(simd_ne(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epi16_mask&expand=1083)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ _mm512_cmpneq_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epi16_mask&expand=1080)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ simd_bitmask::<i16x16, _>(simd_ne(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epi16_mask&expand=1081)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ _mm256_cmpneq_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epi16_mask&expand=1078)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i16x8, _>(simd_ne(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epi16_mask&expand=1079)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpneq_epi16_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epi8_mask&expand=1100)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ simd_bitmask::<i8x64, _>(simd_ne(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epi8_mask&expand=1101)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ _mm512_cmpneq_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epi8_mask&expand=1098)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ simd_bitmask::<i8x32, _>(simd_ne(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epi8_mask&expand=1099)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ _mm256_cmpneq_epi8_mask(a, b) & k1
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epi8_mask&expand=1096)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ simd_bitmask::<i8x16, _>(simd_ne(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epi8_mask&expand=1097)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ _mm_cmpneq_epi8_mask(a, b) & k1
+}
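+
+// Editor's illustrative sketch, not part of the upstream source: because the
+// cmpneq intrinsics return one mask bit per lane, counting differing elements
+// is a single popcount (`count_mismatched_bytes` is a hypothetical name):
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn count_mismatched_bytes(a: __m512i, b: __m512i) -> u32 {
+ // __mmask64 is a plain u64, so count_ones() counts the differing lanes.
+ _mm512_cmpneq_epi8_mask(a, b).count_ones()
+}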
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu16_mask&expand=715)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask32 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u16x32();
+ let b = b.as_u16x32();
+ let r = vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
+ transmute(r)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu16_mask&expand=716)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm512_mask_cmp_epu16_mask<const IMM8: i32>(
+ k1: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __mmask32 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u16x32();
+ let b = b.as_u16x32();
+ let r = vpcmpuw(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epu16_mask&expand=713)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm256_cmp_epu16_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask16 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u16x16();
+ let b = b.as_u16x16();
+ let r = vpcmpuw256(a, b, IMM8, 0b11111111_11111111);
+ transmute(r)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epu16_mask&expand=714)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm256_mask_cmp_epu16_mask<const IMM8: i32>(
+ k1: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __mmask16 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u16x16();
+ let b = b.as_u16x16();
+ let r = vpcmpuw256(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epu16_mask&expand=711)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm_cmp_epu16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask8 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u16x8();
+ let b = b.as_u16x8();
+ let r = vpcmpuw128(a, b, IMM8, 0b11111111);
+ transmute(r)
+}
+
+/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epu16_mask&expand=712)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm_mask_cmp_epu16_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u16x8();
+ let b = b.as_u16x8();
+ let r = vpcmpuw128(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu8_mask&expand=733)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm512_cmp_epu8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask64 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u8x64();
+ let b = b.as_u8x64();
+ let r = vpcmpub(
+ a,
+ b,
+ IMM8,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ );
+ transmute(r)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu8_mask&expand=734)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm512_mask_cmp_epu8_mask<const IMM8: i32>(
+ k1: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __mmask64 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u8x64();
+ let b = b.as_u8x64();
+ let r = vpcmpub(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epu8_mask&expand=731)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm256_cmp_epu8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask32 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u8x32();
+ let b = b.as_u8x32();
+ let r = vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
+ transmute(r)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epu8_mask&expand=732)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm256_mask_cmp_epu8_mask<const IMM8: i32>(
+ k1: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __mmask32 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u8x32();
+ let b = b.as_u8x32();
+ let r = vpcmpub256(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epu8_mask&expand=729)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm_cmp_epu8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask16 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u8x16();
+ let b = b.as_u8x16();
+ let r = vpcmpub128(a, b, IMM8, 0b11111111_11111111);
+ transmute(r)
+}
+
+/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epu8_mask&expand=730)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm_mask_cmp_epu8_mask<const IMM8: i32>(
+ k1: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __mmask16 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_u8x16();
+ let b = b.as_u8x16();
+ let r = vpcmpub128(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi16_mask&expand=691)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm512_cmp_epi16_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask32 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i16x32();
+ let b = b.as_i16x32();
+ let r = vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
+ transmute(r)
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi16_mask&expand=692)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm512_mask_cmp_epi16_mask<const IMM8: i32>(
+ k1: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __mmask32 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i16x32();
+ let b = b.as_i16x32();
+ let r = vpcmpw(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epi16_mask&expand=689)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm256_cmp_epi16_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask16 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i16x16();
+ let b = b.as_i16x16();
+ let r = vpcmpw256(a, b, IMM8, 0b11111111_11111111);
+ transmute(r)
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epi16_mask&expand=690)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm256_mask_cmp_epi16_mask<const IMM8: i32>(
+ k1: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __mmask16 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i16x16();
+ let b = b.as_i16x16();
+ let r = vpcmpw256(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epi16_mask&expand=687)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm_cmp_epi16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask8 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i16x8();
+ let b = b.as_i16x8();
+ let r = vpcmpw128(a, b, IMM8, 0b11111111);
+ transmute(r)
+}
+
+/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epi16_mask&expand=688)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm_mask_cmp_epi16_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i16x8();
+ let b = b.as_i16x8();
+ let r = vpcmpw128(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi8_mask&expand=709)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm512_cmp_epi8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask64 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i8x64();
+ let b = b.as_i8x64();
+ let r = vpcmpb(
+ a,
+ b,
+ IMM8,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ );
+ transmute(r)
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi8_mask&expand=710)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm512_mask_cmp_epi8_mask<const IMM8: i32>(
+ k1: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __mmask64 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i8x64();
+ let b = b.as_i8x64();
+ let r = vpcmpb(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epi8_mask&expand=707)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm256_cmp_epi8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> __mmask32 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i8x32();
+ let b = b.as_i8x32();
+ let r = vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
+ transmute(r)
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epi8_mask&expand=708)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm256_mask_cmp_epi8_mask<const IMM8: i32>(
+ k1: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __mmask32 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i8x32();
+ let b = b.as_i8x32();
+ let r = vpcmpb256(a, b, IMM8, k1);
+ transmute(r)
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epi8_mask&expand=705)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm_cmp_epi8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mmask16 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i8x16();
+ let b = b.as_i8x16();
+ let r = vpcmpb128(a, b, IMM8, 0b11111111_11111111);
+ transmute(r)
+}
+
+/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epi8_mask&expand=706)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
+pub unsafe fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(
+ k1: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __mmask16 {
+ static_assert_imm3!(IMM8);
+ let a = a.as_i8x16();
+ let b = b.as_i8x16();
+ let r = vpcmpb128(a, b, IMM8, k1);
+ transmute(r)
+}
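+
+// Editor's illustrative sketch, not part of the upstream source: the IMM8
+// operand selects the predicate (0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE,
+// 5 = NLT, 6 = NLE, 7 = TRUE). Assuming the `_MM_CMPINT_*` constants from the
+// avx512f module are in scope, an unsigned less-than compare reads:
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn bytes_below(a: __m512i, thresholds: __m512i) -> __mmask64 {
+ // Bit i is set iff unsigned byte i of `a` is less than byte i of `thresholds`.
+ _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, thresholds)
+}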
+
+/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_epi16&expand=3368)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu16; the compiler currently emits vmovups
+pub unsafe fn _mm512_loadu_epi16(mem_addr: *const i16) -> __m512i {
+ ptr::read_unaligned(mem_addr as *const __m512i)
+}
+
+/// Load 256-bits (composed of 16 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_epi16&expand=3365)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu16; the compiler currently emits vmovups
+pub unsafe fn _mm256_loadu_epi16(mem_addr: *const i16) -> __m256i {
+ ptr::read_unaligned(mem_addr as *const __m256i)
+}
+
+/// Load 128-bits (composed of 8 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_epi16&expand=3362)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu16; the compiler currently emits vmovups
+pub unsafe fn _mm_loadu_epi16(mem_addr: *const i16) -> __m128i {
+ ptr::read_unaligned(mem_addr as *const __m128i)
+}
+
+/// Load 512-bits (composed of 64 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_epi8&expand=3395)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu8; the compiler currently emits vmovups
+pub unsafe fn _mm512_loadu_epi8(mem_addr: *const i8) -> __m512i {
+ ptr::read_unaligned(mem_addr as *const __m512i)
+}
+
+/// Load 256-bits (composed of 32 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_epi8&expand=3392)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu8; the compiler currently emits vmovups
+pub unsafe fn _mm256_loadu_epi8(mem_addr: *const i8) -> __m256i {
+ ptr::read_unaligned(mem_addr as *const __m256i)
+}
+
+/// Load 128-bits (composed of 16 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_epi8&expand=3389)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu8; the compiler currently emits vmovups
+pub unsafe fn _mm_loadu_epi8(mem_addr: *const i8) -> __m128i {
+ ptr::read_unaligned(mem_addr as *const __m128i)
+}
+
+/// Store 512-bits (composed of 32 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_epi16&expand=5622)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu16; the compiler currently emits vmovups
+pub unsafe fn _mm512_storeu_epi16(mem_addr: *mut i16, a: __m512i) {
+ ptr::write_unaligned(mem_addr as *mut __m512i, a);
+}
+
+/// Store 256-bits (composed of 16 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_epi16&expand=5620)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu16; the compiler currently emits vmovups
+pub unsafe fn _mm256_storeu_epi16(mem_addr: *mut i16, a: __m256i) {
+ ptr::write_unaligned(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 8 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_epi16&expand=5618)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu16; the compiler currently emits vmovups
+pub unsafe fn _mm_storeu_epi16(mem_addr: *mut i16, a: __m128i) {
+ ptr::write_unaligned(mem_addr as *mut __m128i, a);
+}
+
+/// Store 512-bits (composed of 64 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_epi8&expand=5640)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu8; the compiler currently emits vmovups
+pub unsafe fn _mm512_storeu_epi8(mem_addr: *mut i8, a: __m512i) {
+ ptr::write_unaligned(mem_addr as *mut __m512i, a);
+}
+
+/// Store 256-bits (composed of 32 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_epi8&expand=5638)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu8; the compiler currently emits vmovups
+pub unsafe fn _mm256_storeu_epi8(mem_addr: *mut i8, a: __m256i) {
+ ptr::write_unaligned(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 16 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_epi8&expand=5636)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // should be vmovdqu8; the compiler currently emits vmovups
+pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) {
+ ptr::write_unaligned(mem_addr as *mut __m128i, a);
+}
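+
+// Editor's illustrative sketch, not part of the upstream source: the unmasked
+// loadu/storeu pairs above are plain unaligned moves, so copying a vector's
+// worth of elements is one load plus one store (`copy_32_words` is a
+// hypothetical name; both pointers must reference 32 valid i16s):
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn copy_32_words(src: *const i16, dst: *mut i16) {
+ let v = _mm512_loadu_epi16(src); // unaligned 512-bit load
+ _mm512_storeu_epi16(dst, v); // unaligned 512-bit store
+}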
+
+/// Load packed 16-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vmovdqu16 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 16-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 8-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vmovdqu8 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 8-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 16-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
+pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vmovdqu16 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 16-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 8-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
+pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vmovdqu8 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 8-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 16-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vmovdqu16 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 16-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 8-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vmovdqu8 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 8-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
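+
+// Editor's illustrative sketch, not part of the upstream source: a zeromask
+// load is the usual way to read a partial tail without touching memory past
+// the end. `load_prefix_words` is a hypothetical name; `n` must be <= 32 and
+// `p` must point at at least `n` valid i16s.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn load_prefix_words(p: *const i16, n: u32) -> __m512i {
+ // With the low `n` mask bits set, lanes 0..n are loaded, the rest are
+ // zeroed, and the masked-off lanes never touch memory.
+ let k = (1u64 << n).wrapping_sub(1) as __mmask32;
+ _mm512_maskz_loadu_epi16(k, p)
+}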
+
+/// Store packed 16-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) {
+ asm!(
+ vps!("vmovdqu16", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 8-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) {
+ asm!(
+ vps!("vmovdqu8", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 16-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
+pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) {
+ asm!(
+ vps!("vmovdqu16", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 8-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
+pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) {
+ asm!(
+ vps!("vmovdqu8", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 16-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) {
+ asm!(
+ vps!("vmovdqu16", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 8-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) {
+ asm!(
+ vps!("vmovdqu8", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
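+
+// Editor's illustrative sketch, not part of the upstream source: the masked
+// store is the twin of the tail load sketched earlier; only the selected
+// lanes are written. `store_prefix_words` is a hypothetical name; `n` must be
+// <= 32 and `p` must point at at least `n` writable i16s.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn store_prefix_words(p: *mut i16, n: u32, v: __m512i) {
+ let k = (1u64 << n).wrapping_sub(1) as __mmask32;
+ _mm512_mask_storeu_epi16(p, k, v); // writes only lanes 0..n
+}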
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_madd_epi16&expand=3511)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+pub unsafe fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_madd_epi16&expand=3512)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+pub unsafe fn _mm512_mask_madd_epi16(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let madd = _mm512_madd_epi16(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, madd, src.as_i32x16()))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_madd_epi16&expand=3513)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+pub unsafe fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let madd = _mm512_madd_epi16(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, madd, zero))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_madd_epi16&expand=3509)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+pub unsafe fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let madd = _mm256_madd_epi16(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, madd, src.as_i32x8()))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_madd_epi16&expand=3510)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+pub unsafe fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let madd = _mm256_madd_epi16(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, madd, zero))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_madd_epi16&expand=3506)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+pub unsafe fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let madd = _mm_madd_epi16(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, madd, src.as_i32x4()))
+}
+
+/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_madd_epi16&expand=3507)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+pub unsafe fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let madd = _mm_madd_epi16(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, madd, zero))
+}
+
+/// Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maddubs_epi16&expand=3539)
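+///
+/// A small sketch of the unsigned-by-signed multiply-add (illustrative only):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi8(10);  // lanes of a are read as unsigned
+/// let b = _mm512_set1_epi8(-1);  // lanes of b are read as signed
+/// // each 16-bit lane of r is 10*(-1) + 10*(-1) = -20 (with signed saturation)
+/// let r = _mm512_maddubs_epi16(a, b);
+/// ```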
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+pub unsafe fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaddubsw(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_maddubs_epi16&expand=3540)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+pub unsafe fn _mm512_mask_maddubs_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let madd = _mm512_maddubs_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, madd, src.as_i16x32()))
+}
+
+/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_maddubs_epi16&expand=3541)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+pub unsafe fn _mm512_maskz_maddubs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let madd = _mm512_maddubs_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, madd, zero))
+}
+
+/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_maddubs_epi16&expand=3537)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+pub unsafe fn _mm256_mask_maddubs_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let madd = _mm256_maddubs_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, madd, src.as_i16x16()))
+}
+
+/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_maddubs_epi16&expand=3538)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+pub unsafe fn _mm256_maskz_maddubs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let madd = _mm256_maddubs_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, madd, zero))
+}
+
+/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_maddubs_epi16&expand=3534)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+pub unsafe fn _mm_mask_maddubs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let madd = _mm_maddubs_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, madd, src.as_i16x8()))
+}
+
+/// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_maddubs_epi16&expand=3535)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+pub unsafe fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let madd = _mm_maddubs_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, madd, zero))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_packs_epi32&expand=4091)
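+///
+/// A minimal saturation sketch (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(70_000);   // above i16::MAX, saturates to 32767
+/// let b = _mm512_set1_epi32(-70_000);  // below i16::MIN, saturates to -32768
+/// // each 128-bit lane of r holds four saturated words from a, then four from b
+/// let r = _mm512_packs_epi32(a, b);
+/// ```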
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+pub unsafe fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpackssdw(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_packs_epi32&expand=4089)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+pub unsafe fn _mm512_mask_packs_epi32(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let pack = _mm512_packs_epi32(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, pack, src.as_i16x32()))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_packs_epi32&expand=4090)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+pub unsafe fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let pack = _mm512_packs_epi32(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_packs_epi32&expand=4086)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+pub unsafe fn _mm256_mask_packs_epi32(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let pack = _mm256_packs_epi32(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, pack, src.as_i16x16()))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_packs_epi32&expand=4087)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+pub unsafe fn _mm256_maskz_packs_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let pack = _mm256_packs_epi32(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_packs_epi32&expand=4083)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+pub unsafe fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let pack = _mm_packs_epi32(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, pack, src.as_i16x8()))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_packs_epi32&expand=4084)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+pub unsafe fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let pack = _mm_packs_epi32(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_packs_epi16&expand=4082)
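+///
+/// A minimal saturation sketch (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(300);   // above i8::MAX, saturates to 127
+/// let b = _mm512_set1_epi16(-300);  // below i8::MIN, saturates to -128
+/// // each 128-bit lane of r holds eight saturated bytes from a, then eight from b
+/// let r = _mm512_packs_epi16(a, b);
+/// ```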
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub unsafe fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpacksswb(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_packs_epi16&expand=4080)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub unsafe fn _mm512_mask_packs_epi16(
+ src: __m512i,
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let pack = _mm512_packs_epi16(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, pack, src.as_i8x64()))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_packs_epi16&expand=4081)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub unsafe fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let pack = _mm512_packs_epi16(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_packs_epi16&expand=4077)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub unsafe fn _mm256_mask_packs_epi16(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let pack = _mm256_packs_epi16(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, pack, src.as_i8x32()))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_packs_epi16&expand=4078)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub unsafe fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let pack = _mm256_packs_epi16(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_packs_epi16&expand=4074)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub unsafe fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let pack = _mm_packs_epi16(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, pack, src.as_i8x16()))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_packs_epi16&expand=4075)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+pub unsafe fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let pack = _mm_packs_epi16(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_packus_epi32&expand=4130)
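+///
+/// A minimal sketch of the unsigned saturation (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(-1);       // negative inputs clamp to 0
+/// let b = _mm512_set1_epi32(100_000);  // above u16::MAX, clamps to 65535
+/// let r = _mm512_packus_epi32(a, b);
+/// ```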
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+pub unsafe fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpackusdw(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_packus_epi32&expand=4128)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+pub unsafe fn _mm512_mask_packus_epi32(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let pack = _mm512_packus_epi32(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, pack, src.as_i16x32()))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_packus_epi32&expand=4129)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+pub unsafe fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let pack = _mm512_packus_epi32(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_packus_epi32&expand=4125)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+pub unsafe fn _mm256_mask_packus_epi32(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let pack = _mm256_packus_epi32(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, pack, src.as_i16x16()))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_packus_epi32&expand=4126)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+pub unsafe fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let pack = _mm256_packus_epi32(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_packus_epi32&expand=4122)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+pub unsafe fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let pack = _mm_packus_epi32(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, pack, src.as_i16x8()))
+}
+
+/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_packus_epi32&expand=4123)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+pub unsafe fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let pack = _mm_packus_epi32(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_packus_epi16&expand=4121)
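+///
+/// A minimal sketch of the unsigned saturation (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(-5);   // negative inputs clamp to 0
+/// let b = _mm512_set1_epi16(300);  // above u8::MAX, clamps to 255
+/// let r = _mm512_packus_epi16(a, b);
+/// ```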
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+pub unsafe fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpackuswb(a.as_i16x32(), b.as_i16x32()))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_packus_epi16&expand=4119)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+pub unsafe fn _mm512_mask_packus_epi16(
+ src: __m512i,
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let pack = _mm512_packus_epi16(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, pack, src.as_i8x64()))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_packus_epi16&expand=4120)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+pub unsafe fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let pack = _mm512_packus_epi16(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_packus_epi16&expand=4116)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+pub unsafe fn _mm256_mask_packus_epi16(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let pack = _mm256_packus_epi16(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, pack, src.as_i8x32()))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_packus_epi16&expand=4117)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+pub unsafe fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let pack = _mm256_packus_epi16(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_packus_epi16&expand=4113)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+pub unsafe fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let pack = _mm_packus_epi16(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, pack, src.as_i8x16()))
+}
+
+/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_packus_epi16&expand=4114)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+pub unsafe fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let pack = _mm_packus_epi16(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, pack, zero))
+}
+
+/// Average packed unsigned 16-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_avg_epu16&expand=388)
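+///
+/// The average rounds up: each lane of r is (a + b + 1) >> 1, computed without
+/// overflow. A short sketch (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(1);
+/// let b = _mm512_set1_epi16(2);
+/// // every lane of r is (1 + 2 + 1) >> 1 = 2
+/// let r = _mm512_avg_epu16(a, b);
+/// ```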
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+pub unsafe fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpavgw(a.as_u16x32(), b.as_u16x32()))
+}
+
+/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_avg_epu16&expand=389)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+pub unsafe fn _mm512_mask_avg_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let avg = _mm512_avg_epu16(a, b).as_u16x32();
+ transmute(simd_select_bitmask(k, avg, src.as_u16x32()))
+}
+
+/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_avg_epu16&expand=390)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+pub unsafe fn _mm512_maskz_avg_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let avg = _mm512_avg_epu16(a, b).as_u16x32();
+ let zero = _mm512_setzero_si512().as_u16x32();
+ transmute(simd_select_bitmask(k, avg, zero))
+}
+
+/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_avg_epu16&expand=386)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+pub unsafe fn _mm256_mask_avg_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let avg = _mm256_avg_epu16(a, b).as_u16x16();
+ transmute(simd_select_bitmask(k, avg, src.as_u16x16()))
+}
+
+/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_avg_epu16&expand=387)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+pub unsafe fn _mm256_maskz_avg_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let avg = _mm256_avg_epu16(a, b).as_u16x16();
+ let zero = _mm256_setzero_si256().as_u16x16();
+ transmute(simd_select_bitmask(k, avg, zero))
+}
+
+/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_avg_epu16&expand=383)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+pub unsafe fn _mm_mask_avg_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let avg = _mm_avg_epu16(a, b).as_u16x8();
+ transmute(simd_select_bitmask(k, avg, src.as_u16x8()))
+}
+
+/// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_avg_epu16&expand=384)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+pub unsafe fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let avg = _mm_avg_epu16(a, b).as_u16x8();
+ let zero = _mm_setzero_si128().as_u16x8();
+ transmute(simd_select_bitmask(k, avg, zero))
+}
+
+/// Average packed unsigned 8-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_avg_epu8&expand=397)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+pub unsafe fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpavgb(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_avg_epu8&expand=398)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+pub unsafe fn _mm512_mask_avg_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let avg = _mm512_avg_epu8(a, b).as_u8x64();
+ transmute(simd_select_bitmask(k, avg, src.as_u8x64()))
+}
+
+/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_avg_epu8&expand=399)
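+///
+/// A short zeromask sketch (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi8(10);
+/// let b = _mm512_set1_epi8(20);
+/// // even-indexed lanes hold (10 + 20 + 1) >> 1 = 15; odd lanes are zeroed
+/// let r = _mm512_maskz_avg_epu8(0x5555_5555_5555_5555, a, b);
+/// ```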
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+pub unsafe fn _mm512_maskz_avg_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let avg = _mm512_avg_epu8(a, b).as_u8x64();
+ let zero = _mm512_setzero_si512().as_u8x64();
+ transmute(simd_select_bitmask(k, avg, zero))
+}
+
+/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_avg_epu8&expand=395)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+pub unsafe fn _mm256_mask_avg_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let avg = _mm256_avg_epu8(a, b).as_u8x32();
+ transmute(simd_select_bitmask(k, avg, src.as_u8x32()))
+}
+
+/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_avg_epu8&expand=396)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+pub unsafe fn _mm256_maskz_avg_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let avg = _mm256_avg_epu8(a, b).as_u8x32();
+ let zero = _mm256_setzero_si256().as_u8x32();
+ transmute(simd_select_bitmask(k, avg, zero))
+}
+
+/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_avg_epu8&expand=392)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+pub unsafe fn _mm_mask_avg_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let avg = _mm_avg_epu8(a, b).as_u8x16();
+ transmute(simd_select_bitmask(k, avg, src.as_u8x16()))
+}
+
+/// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_avg_epu8&expand=393)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+pub unsafe fn _mm_maskz_avg_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let avg = _mm_avg_epu8(a, b).as_u8x16();
+ let zero = _mm_setzero_si128().as_u8x16();
+ transmute(simd_select_bitmask(k, avg, zero))
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sll_epi16&expand=5271)
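+///
+/// The shift amount is taken from the low 64 bits of count. A short sketch
+/// (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(1);
+/// let count = _mm_set_epi64x(0, 4);
+/// // every lane of r is 1 << 4 = 16; a count of 16 or more zeroes the lanes
+/// let r = _mm512_sll_epi16(a, count);
+/// ```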
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub unsafe fn _mm512_sll_epi16(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpsllw(a.as_i16x32(), count.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sll_epi16&expand=5269)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub unsafe fn _mm512_mask_sll_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_sll_epi16(a, count).as_i16x32();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sll_epi16&expand=5270)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub unsafe fn _mm512_maskz_sll_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_sll_epi16(a, count).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sll_epi16&expand=5266)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub unsafe fn _mm256_mask_sll_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_sll_epi16(a, count).as_i16x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sll_epi16&expand=5267)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub unsafe fn _mm256_maskz_sll_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_sll_epi16(a, count).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sll_epi16&expand=5263)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub unsafe fn _mm_mask_sll_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sll_epi16(a, count).as_i16x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sll_epi16&expand=5264)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub unsafe fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sll_epi16(a, count).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_slli_epi16&expand=5301)
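+///
+/// In Rust, imm8 is passed as the const generic IMM8. A short sketch
+/// (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(1);
+/// // every lane of r is 1 << 4 = 16; IMM8 >= 16 yields all-zero lanes
+/// let r = _mm512_slli_epi16::<4>(a);
+/// ```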
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_slli_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i16x32();
+ let r = vpslliw(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_slli_epi16&expand=5299)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_slli_epi16<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i16x32();
+ let shf = vpslliw(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_slli_epi16&expand=5300)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_slli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i16x32();
+ let shf = vpslliw(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_slli_epi16&expand=5296)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_slli_epi16<const IMM8: u32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = pslliw256(a.as_i16x16(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_slli_epi16&expand=5297)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_slli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = pslliw256(a.as_i16x16(), imm8);
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_slli_epi16&expand=5293)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_slli_epi16<const IMM8: u32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = pslliw128(a.as_i16x8(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_slli_epi16&expand=5294)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_slli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = pslliw128(a.as_i16x8(), imm8);
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sllv_epi16&expand=5333)
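+///
+/// Each lane shifts by its own per-lane count. A short sketch (illustrative,
+/// not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(1);
+/// let count = _mm512_set1_epi16(3);
+/// // every lane of r is 1 << 3 = 8; a per-lane count >= 16 yields 0
+/// let r = _mm512_sllv_epi16(a, count);
+/// ```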
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsllvw(a.as_i16x32(), count.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sllv_epi16&expand=5331)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm512_mask_sllv_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_sllv_epi16(a, count).as_i16x32();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sllv_epi16&expand=5332)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_sllv_epi16(a, count).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi16&expand=5330)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i {
+ transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sllv_epi16&expand=5328)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm256_mask_sllv_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_sllv_epi16(a, count).as_i16x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sllv_epi16&expand=5329)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_sllv_epi16(a, count).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi16&expand=5327)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i {
+ transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sllv_epi16&expand=5325)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm_mask_sllv_epi16(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_sllv_epi16(a, count).as_i16x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sllv_epi16&expand=5326)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvw))]
+pub unsafe fn _mm_maskz_sllv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sllv_epi16(a, count).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srl_epi16&expand=5483)
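+///
+/// The shift amount is taken from the low 64 bits of count. A short sketch
+/// (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(16);
+/// let count = _mm_set_epi64x(0, 2);
+/// // logical shift: every lane of r is 16 >> 2 = 4
+/// let r = _mm512_srl_epi16(a, count);
+/// ```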
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+pub unsafe fn _mm512_srl_epi16(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpsrlw(a.as_i16x32(), count.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srl_epi16&expand=5481)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+pub unsafe fn _mm512_mask_srl_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_srl_epi16(a, count).as_i16x32();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srl_epi16&expand=5482)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+pub unsafe fn _mm512_maskz_srl_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_srl_epi16(a, count).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srl_epi16&expand=5478)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+pub unsafe fn _mm256_mask_srl_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_srl_epi16(a, count).as_i16x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srl_epi16&expand=5479)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+pub unsafe fn _mm256_maskz_srl_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_srl_epi16(a, count).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srl_epi16&expand=5475)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+pub unsafe fn _mm_mask_srl_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srl_epi16(a, count).as_i16x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srl_epi16&expand=5476)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+pub unsafe fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srl_epi16(a, count).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srli_epi16&expand=5513)
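+///
+/// The shift is logical (zero-filling), even though the lanes are typed as
+/// signed. A short sketch (illustrative, not a doctest):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(-1);
+/// // every lane of r is 0xFFFF >> 8 = 0x00FF
+/// let r = _mm512_srli_epi16::<8>(a);
+/// ```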
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_srli_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i16x32();
+ let r = vpsrliw(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srli_epi16&expand=5511)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_srli_epi16<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i16x32();
+ let shf = vpsrliw(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srli_epi16&expand=5512)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_srli_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+    // IMM8 should logically be u32, but Intel's documentation (against which these
+    // signatures are verified) declares i32 and appears to be incorrect here.
+ let a = a.as_i16x32();
+ let shf = vpsrliw(a, IMM8 as u32);
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srli_epi16&expand=5508)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_srli_epi16<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shf = _mm256_srli_epi16::<IMM8>(a);
+ transmute(simd_select_bitmask(k, shf.as_i16x16(), src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srli_epi16&expand=5509)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_srli_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shf = _mm256_srli_epi16::<IMM8>(a);
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf.as_i16x16(), zero))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srli_epi16&expand=5505)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_srli_epi16<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shf = _mm_srli_epi16::<IMM8>(a);
+ transmute(simd_select_bitmask(k, shf.as_i16x8(), src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srli_epi16&expand=5506)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_srli_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shf = _mm_srli_epi16::<IMM8>(a);
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf.as_i16x8(), zero))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srlv_epi16&expand=5545)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32()))
+}
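+
+// A minimal usage sketch (not part of the upstream patch): unlike srl/srli, each
+// lane shifts by the count in the matching lane of `count`; values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_srlv_epi16() {
+    let a = _mm512_set1_epi16(16);
+    let count = _mm512_set1_epi16(3); // lane i of `count` drives lane i of `a`
+    let r = _mm512_srlv_epi16(a, count); // every lane: 16 >> 3 == 2
+    let _ = r;
+}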
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srlv_epi16&expand=5543)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm512_mask_srlv_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_srlv_epi16(a, count).as_i16x32();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srlv_epi16&expand=5544)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_srlv_epi16(a, count).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi16&expand=5542)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i {
+ transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srlv_epi16&expand=5540)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm256_mask_srlv_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_srlv_epi16(a, count).as_i16x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srlv_epi16&expand=5541)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_srlv_epi16(a, count).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi16&expand=5539)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i {
+ transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srlv_epi16&expand=5537)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm_mask_srlv_epi16(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_srlv_epi16(a, count).as_i16x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srlv_epi16&expand=5538)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvw))]
+pub unsafe fn _mm_maskz_srlv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srlv_epi16(a, count).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sra_epi16&expand=5398)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub unsafe fn _mm512_sra_epi16(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpsraw(a.as_i16x32(), count.as_i16x8()))
+}
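+
+// A minimal usage sketch (not part of the upstream patch) contrasting the
+// arithmetic shift with the logical srl above: the sign bit is replicated
+// instead of shifting in zeros; values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_sra_epi16() {
+    let a = _mm512_set1_epi16(-16);
+    let count = _mm_set_epi64x(0, 2);
+    let r = _mm512_sra_epi16(a, count); // every lane: -16 >> 2 == -4, sign preserved
+    let _ = r;
+}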
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sra_epi16&expand=5396)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub unsafe fn _mm512_mask_sra_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_sra_epi16(a, count).as_i16x32();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sra_epi16&expand=5397)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub unsafe fn _mm512_maskz_sra_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_sra_epi16(a, count).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sra_epi16&expand=5393)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub unsafe fn _mm256_mask_sra_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_sra_epi16(a, count).as_i16x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sra_epi16&expand=5394)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub unsafe fn _mm256_maskz_sra_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_sra_epi16(a, count).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sra_epi16&expand=5390)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub unsafe fn _mm_mask_sra_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sra_epi16(a, count).as_i16x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sra_epi16&expand=5391)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub unsafe fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sra_epi16(a, count).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srai_epi16&expand=5427)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_srai_epi16<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i16x32();
+ let r = vpsraiw(a, IMM8);
+ transmute(r)
+}
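+
+// A minimal usage sketch (not part of the upstream patch): the immediate form of
+// the arithmetic shift, with the count as a const generic; values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_srai_epi16() {
+    let a = _mm512_set1_epi16(-16);
+    let r = _mm512_srai_epi16::<2>(a); // every lane: -16 >> 2 == -4
+    let _ = r;
+}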
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srai_epi16&expand=5425)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_srai_epi16<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i16x32();
+ let shf = vpsraiw(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srai_epi16&expand=5426)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_srai_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i16x32();
+ let shf = vpsraiw(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srai_epi16&expand=5422)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_srai_epi16<const IMM8: u32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psraiw256(a.as_i16x16(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srai_epi16&expand=5423)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_srai_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psraiw256(a.as_i16x16(), imm8);
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srai_epi16&expand=5419)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_srai_epi16<const IMM8: u32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psraiw128(a.as_i16x8(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srai_epi16&expand=5420)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psraiw128(a.as_i16x8(), imm8);
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srav_epi16&expand=5456)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsravw(a.as_i16x32(), count.as_i16x32()))
+}
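+
+// A minimal usage sketch (not part of the upstream patch): per-lane arithmetic
+// shift counts, the signed counterpart of srlv; values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_srav_epi16() {
+    let a = _mm512_set1_epi16(-16);
+    let count = _mm512_set1_epi16(2); // lane i of `count` drives lane i of `a`
+    let r = _mm512_srav_epi16(a, count); // every lane: -16 >> 2 == -4
+    let _ = r;
+}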
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srav_epi16&expand=5454)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm512_mask_srav_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_srav_epi16(a, count).as_i16x32();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srav_epi16&expand=5455)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_srav_epi16(a, count).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi16&expand=5453)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i {
+ transmute(vpsravw256(a.as_i16x16(), count.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srav_epi16&expand=5451)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm256_mask_srav_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_srav_epi16(a, count).as_i16x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srav_epi16&expand=5452)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_srav_epi16(a, count).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi16&expand=5450)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i {
+ transmute(vpsravw128(a.as_i16x8(), count.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srav_epi16&expand=5448)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm_mask_srav_epi16(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_srav_epi16(a, count).as_i16x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srav_epi16&expand=5449)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravw))]
+pub unsafe fn _mm_maskz_srav_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srav_epi16(a, count).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi16&expand=4226)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vperm))] // the compiler may emit either vpermi2w or vpermt2w
+pub unsafe fn _mm512_permutex2var_epi16(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
+ transmute(vpermi2w(a.as_i16x32(), idx.as_i16x32(), b.as_i16x32()))
+}
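+
+// A minimal usage sketch (not part of the upstream patch): with 32 lanes, bits
+// 4:0 of each idx lane select a source lane and bit 5 selects the table
+// (0 = a, 1 = b); values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_permutex2var_epi16() {
+    let a = _mm512_set1_epi16(1);
+    let b = _mm512_set1_epi16(2);
+    let idx = _mm512_set1_epi16(32); // bit 5 set: every lane takes lane 0 of `b`
+    let r = _mm512_permutex2var_epi16(a, idx, b); // every lane == 2
+    let _ = r;
+}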
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi16&expand=4223)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpermt2w))]
+pub unsafe fn _mm512_mask_permutex2var_epi16(
+ a: __m512i,
+ k: __mmask32,
+ idx: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32();
+ transmute(simd_select_bitmask(k, permute, a.as_i16x32()))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi16&expand=4225)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vperm))] // the compiler may emit either vpermi2w or vpermt2w
+pub unsafe fn _mm512_maskz_permutex2var_epi16(
+ k: __mmask32,
+ a: __m512i,
+ idx: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi16&expand=4224)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpermi2w))]
+pub unsafe fn _mm512_mask2_permutex2var_epi16(
+ a: __m512i,
+ idx: __m512i,
+ k: __mmask32,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32();
+ transmute(simd_select_bitmask(k, permute, idx.as_i16x32()))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_epi16&expand=4222)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // the compiler may emit either vpermi2w or vpermt2w
+pub unsafe fn _mm256_permutex2var_epi16(a: __m256i, idx: __m256i, b: __m256i) -> __m256i {
+ transmute(vpermi2w256(a.as_i16x16(), idx.as_i16x16(), b.as_i16x16()))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_epi16&expand=4219)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2w))]
+pub unsafe fn _mm256_mask_permutex2var_epi16(
+ a: __m256i,
+ k: __mmask16,
+ idx: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16();
+ transmute(simd_select_bitmask(k, permute, a.as_i16x16()))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_epi16&expand=4221)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // the compiler may emit either vpermi2w or vpermt2w
+pub unsafe fn _mm256_maskz_permutex2var_epi16(
+ k: __mmask16,
+ a: __m256i,
+ idx: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_epi16&expand=4220)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermi2w))]
+pub unsafe fn _mm256_mask2_permutex2var_epi16(
+ a: __m256i,
+ idx: __m256i,
+ k: __mmask16,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16();
+ transmute(simd_select_bitmask(k, permute, idx.as_i16x16()))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_epi16&expand=4218)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // the compiler may emit either vpermi2w or vpermt2w
+pub unsafe fn _mm_permutex2var_epi16(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
+ transmute(vpermi2w128(a.as_i16x8(), idx.as_i16x8(), b.as_i16x8()))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_epi16&expand=4215)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2w))]
+pub unsafe fn _mm_mask_permutex2var_epi16(
+ a: __m128i,
+ k: __mmask8,
+ idx: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8();
+ transmute(simd_select_bitmask(k, permute, a.as_i16x8()))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_epi16&expand=4217)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // the compiler may emit either vpermi2w or vpermt2w
+pub unsafe fn _mm_maskz_permutex2var_epi16(
+ k: __mmask8,
+ a: __m128i,
+ idx: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_epi16&expand=4216)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermi2w))]
+pub unsafe fn _mm_mask2_permutex2var_epi16(
+ a: __m128i,
+ idx: __m128i,
+ k: __mmask8,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8();
+ transmute(simd_select_bitmask(k, permute, idx.as_i16x8()))
+}
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi16&expand=4295)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm512_permutexvar_epi16(idx: __m512i, a: __m512i) -> __m512i {
+ transmute(vpermw(a.as_i16x32(), idx.as_i16x32()))
+}
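+
+// A minimal usage sketch (not part of the upstream patch): note the argument
+// order, `idx` first and `a` second; values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_permutexvar_epi16() {
+    let a = _mm512_set1_epi16(7);
+    let idx = _mm512_setzero_si512(); // every lane selects lane 0 of `a`
+    let r = _mm512_permutexvar_epi16(idx, a); // every lane == 7
+    let _ = r;
+}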
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi16&expand=4293)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm512_mask_permutexvar_epi16(
+ src: __m512i,
+ k: __mmask32,
+ idx: __m512i,
+ a: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32();
+ transmute(simd_select_bitmask(k, permute, src.as_i16x32()))
+}
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi16&expand=4294)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm512_maskz_permutexvar_epi16(k: __mmask32, idx: __m512i, a: __m512i) -> __m512i {
+ let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_epi16&expand=4292)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm256_permutexvar_epi16(idx: __m256i, a: __m256i) -> __m256i {
+ transmute(vpermw256(a.as_i16x16(), idx.as_i16x16()))
+}
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_epi16&expand=4290)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm256_mask_permutexvar_epi16(
+ src: __m256i,
+ k: __mmask16,
+ idx: __m256i,
+ a: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16();
+ transmute(simd_select_bitmask(k, permute, src.as_i16x16()))
+}
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_epi16&expand=4291)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm256_maskz_permutexvar_epi16(k: __mmask16, idx: __m256i, a: __m256i) -> __m256i {
+ let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_epi16&expand=4289)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm_permutexvar_epi16(idx: __m128i, a: __m128i) -> __m128i {
+ transmute(vpermw128(a.as_i16x8(), idx.as_i16x8()))
+}
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutexvar_epi16&expand=4287)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm_mask_permutexvar_epi16(
+ src: __m128i,
+ k: __mmask8,
+ idx: __m128i,
+ a: __m128i,
+) -> __m128i {
+ let permute = _mm_permutexvar_epi16(idx, a).as_i16x8();
+ transmute(simd_select_bitmask(k, permute, src.as_i16x8()))
+}
+
+/// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutexvar_epi16&expand=4288)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermw))]
+pub unsafe fn _mm_maskz_permutexvar_epi16(k: __mmask8, idx: __m128i, a: __m128i) -> __m128i {
+ let permute = _mm_permutexvar_epi16(idx, a).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi16&expand=430)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovdqu16))] // ideally vpblendmw, but the select currently lowers to a masked move
+pub unsafe fn _mm512_mask_blend_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_select_bitmask(k, b.as_i16x32(), a.as_i16x32()))
+}
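+
+// A minimal usage sketch (not part of the upstream patch): for each lane, a set
+// mask bit selects `b` and a clear bit selects `a`; values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_mask_blend_epi16() {
+    let a = _mm512_set1_epi16(1);
+    let b = _mm512_set1_epi16(2);
+    // Alternating mask: even lanes take b (2), odd lanes keep a (1).
+    let r = _mm512_mask_blend_epi16(0b0101_0101_0101_0101_0101_0101_0101_0101, a, b);
+    let _ = r;
+}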
+
+/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_epi16&expand=429)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu16))] // ideally vpblendmw, but the select currently lowers to a masked move
+pub unsafe fn _mm256_mask_blend_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_select_bitmask(k, b.as_i16x16(), a.as_i16x16()))
+}
+
+/// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_epi16&expand=427)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu16))] // ideally vpblendmw, but the select currently lowers to a masked move
+pub unsafe fn _mm_mask_blend_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_select_bitmask(k, b.as_i16x8(), a.as_i16x8()))
+}
+
+/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi8&expand=441)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovdqu8))] // ideally vpblendmb, but the select currently lowers to a masked move
+pub unsafe fn _mm512_mask_blend_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_select_bitmask(k, b.as_i8x64(), a.as_i8x64()))
+}
+
+/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_epi8&expand=440)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu8))] // ideally vpblendmb, but the select currently lowers to a masked move
+pub unsafe fn _mm256_mask_blend_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_select_bitmask(k, b.as_i8x32(), a.as_i8x32()))
+}
+
+/// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_epi8&expand=439)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu8))] // ideally vpblendmb, but the select currently lowers to a masked move
+pub unsafe fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_select_bitmask(k, b.as_i8x16(), a.as_i8x16()))
+}
+
+/// Broadcast the low packed 16-bit integer from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastw_epi16&expand=587)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i {
+ let a = _mm512_castsi128_si512(a).as_i16x32();
+ let ret: i16x32 = simd_shuffle32!(
+ a,
+ a,
+ [
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0,
+ ],
+ );
+ transmute(ret)
+}
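+
+// A minimal usage sketch (not part of the upstream patch): only element 0 of the
+// 128-bit source is used; it is copied to all 32 lanes. Values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_broadcastw_epi16() {
+    // _mm_set_epi16 lists elements high-to-low, so the last argument is element 0.
+    let a = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 9);
+    let r = _mm512_broadcastw_epi16(a); // every lane == 9
+    let _ = r;
+}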
+
+/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastw_epi16&expand=588)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm512_mask_broadcastw_epi16(src: __m512i, k: __mmask32, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcastw_epi16(a).as_i16x32();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i16x32()))
+}
+
+/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastw_epi16&expand=589)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm512_maskz_broadcastw_epi16(k: __mmask32, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcastw_epi16(a).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastw_epi16&expand=585)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm256_mask_broadcastw_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcastw_epi16(a).as_i16x16();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i16x16()))
+}
+
+/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastw_epi16&expand=586)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm256_maskz_broadcastw_epi16(k: __mmask16, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcastw_epi16(a).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastw_epi16&expand=582)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm_mask_broadcastw_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let broadcast = _mm_broadcastw_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i16x8()))
+}
+
+/// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastw_epi16&expand=583)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ let broadcast = _mm_broadcastw_epi16(a).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 8-bit integer from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastb_epi8&expand=536)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i {
+ let a = _mm512_castsi128_si512(a).as_i8x64();
+ let ret: i8x64 = simd_shuffle64!(
+ a,
+ a,
+ [
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0,
+ ],
+ );
+ transmute(ret)
+}
+
+/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastb_epi8&expand=537)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm512_mask_broadcastb_epi8(src: __m512i, k: __mmask64, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcastb_epi8(a).as_i8x64();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i8x64()))
+}
+
+/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastb_epi8&expand=538)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm512_maskz_broadcastb_epi8(k: __mmask64, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcastb_epi8(a).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastb_epi8&expand=534)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm256_mask_broadcastb_epi8(src: __m256i, k: __mmask32, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcastb_epi8(a).as_i8x32();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i8x32()))
+}
+
+/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastb_epi8&expand=535)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm256_maskz_broadcastb_epi8(k: __mmask32, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcastb_epi8(a).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastb_epi8&expand=531)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm_mask_broadcastb_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+ let broadcast = _mm_broadcastb_epi8(a).as_i8x16();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i8x16()))
+}
+
+/// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastb_epi8&expand=532)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm_maskz_broadcastb_epi8(k: __mmask16, a: __m128i) -> __m128i {
+ let broadcast = _mm_broadcastb_epi8(a).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi16&expand=6012)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+pub unsafe fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i {
+ let a = a.as_i16x32();
+ let b = b.as_i16x32();
+ #[rustfmt::skip]
+ let r: i16x32 = simd_shuffle32!(
+ a,
+ b,
+ [
+ 4, 32 + 4, 5, 32 + 5,
+ 6, 32 + 6, 7, 32 + 7,
+ 12, 32 + 12, 13, 32 + 13,
+ 14, 32 + 14, 15, 32 + 15,
+ 20, 32 + 20, 21, 32 + 21,
+ 22, 32 + 22, 23, 32 + 23,
+ 28, 32 + 28, 29, 32 + 29,
+ 30, 32 + 30, 31, 32 + 31,
+ ],
+ );
+ transmute(r)
+}
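+
+// A minimal usage sketch (not part of the upstream patch): the interleave runs
+// independently within each 128-bit lane, pairing the upper four 16-bit elements
+// of each lane of `a` with those of `b`; values are illustrative.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn sketch_unpackhi_epi16() {
+    let a = _mm512_set1_epi16(1);
+    let b = _mm512_set1_epi16(2);
+    let r = _mm512_unpackhi_epi16(a, b); // elements alternate 1, 2, 1, 2, ...
+    let _ = r;
+}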
+
+/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi16&expand=6010)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+pub unsafe fn _mm512_mask_unpackhi_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i16x32()))
+}
+
+/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi16&expand=6011)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+pub unsafe fn _mm512_maskz_unpackhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_epi16&expand=6007)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+pub unsafe fn _mm256_mask_unpackhi_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i16x16()))
+}
+
+/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_epi16&expand=6008)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+pub unsafe fn _mm256_maskz_unpackhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_epi16&expand=6004)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+pub unsafe fn _mm_mask_unpackhi_epi16(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i16x8()))
+}
+
+/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_epi16&expand=6005)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+pub unsafe fn _mm_maskz_unpackhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
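+
+// Editor's sketch (not part of the upstream patch): the writemask/zeromask
+// contract shared by all the masked unpack intrinsics here. Where a mask bit
+// is set, the interleaved lane is taken; where it is clear, the `mask_` form
+// falls back to `src` and the `maskz_` form writes zero. The function name is
+// illustrative only.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_unpackhi_epi16_masking() -> (__m512i, __m512i) {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ let src = _mm512_set1_epi16(-1);
+ // Even mask bits set: even result lanes hold the interleaved value, odd
+ // lanes come from `src` (-1) in the first result and are 0 in the second.
+ let k: __mmask32 = 0x5555_5555;
+ (
+ _mm512_mask_unpackhi_epi16(src, k, a, b),
+ _mm512_maskz_unpackhi_epi16(k, a, b),
+ )
+}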
+
+/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi8&expand=6039)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+pub unsafe fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i {
+ let a = a.as_i8x64();
+ let b = b.as_i8x64();
+ #[rustfmt::skip]
+ let r: i8x64 = simd_shuffle64!(
+ a,
+ b,
+ [
+ 8, 64+8, 9, 64+9,
+ 10, 64+10, 11, 64+11,
+ 12, 64+12, 13, 64+13,
+ 14, 64+14, 15, 64+15,
+ 24, 64+24, 25, 64+25,
+ 26, 64+26, 27, 64+27,
+ 28, 64+28, 29, 64+29,
+ 30, 64+30, 31, 64+31,
+ 40, 64+40, 41, 64+41,
+ 42, 64+42, 43, 64+43,
+ 44, 64+44, 45, 64+45,
+ 46, 64+46, 47, 64+47,
+ 56, 64+56, 57, 64+57,
+ 58, 64+58, 59, 64+59,
+ 60, 64+60, 61, 64+61,
+ 62, 64+62, 63, 64+63,
+ ],
+ );
+ transmute(r)
+}
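+
+// Editor's sketch (not part of the upstream patch): the unpack is lane-local,
+// so bytes 8..=15 of each 16-byte lane of `a` and `b` are interleaved and
+// bytes never cross a 128-bit lane. With `a` holding 0..=63 and `b` holding
+// 64..=127, lane 0 of the result begins 8, 72, 9, 73, ...
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_unpackhi_epi8_lane_order() -> __m512i {
+ let mut bytes = [0i8; 64];
+ for (i, e) in bytes.iter_mut().enumerate() {
+ *e = i as i8;
+ }
+ let a = _mm512_loadu_epi8(bytes.as_ptr());
+ for e in bytes.iter_mut() {
+ *e += 64; // 63 + 64 = 127 still fits in i8
+ }
+ let b = _mm512_loadu_epi8(bytes.as_ptr());
+ _mm512_unpackhi_epi8(a, b)
+}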
+
+/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi8&expand=6037)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+pub unsafe fn _mm512_mask_unpackhi_epi8(
+ src: __m512i,
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i8x64()))
+}
+
+/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi8&expand=6038)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+pub unsafe fn _mm512_maskz_unpackhi_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_epi8&expand=6034)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+pub unsafe fn _mm256_mask_unpackhi_epi8(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i8x32()))
+}
+
+/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_epi8&expand=6035)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+pub unsafe fn _mm256_maskz_unpackhi_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_epi8&expand=6031)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+pub unsafe fn _mm_mask_unpackhi_epi8(
+ src: __m128i,
+ k: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i8x16()))
+}
+
+/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_epi8&expand=6032)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+pub unsafe fn _mm_maskz_unpackhi_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi16&expand=6069)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+pub unsafe fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i {
+ let a = a.as_i16x32();
+ let b = b.as_i16x32();
+ #[rustfmt::skip]
+ let r: i16x32 = simd_shuffle32!(
+ a,
+ b,
+ [
+ 0, 32+0, 1, 32+1,
+ 2, 32+2, 3, 32+3,
+ 8, 32+8, 9, 32+9,
+ 10, 32+10, 11, 32+11,
+ 16, 32+16, 17, 32+17,
+ 18, 32+18, 19, 32+19,
+ 24, 32+24, 25, 32+25,
+ 26, 32+26, 27, 32+27
+ ],
+ );
+ transmute(r)
+}
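+
+// Editor's sketch (not part of the upstream patch): the `unpacklo` family
+// mirrors `unpackhi` but draws from the low half of each 128-bit lane, so
+// with `a` = 0..=31 and `b` = 100..=131 lane 0 of the result is
+// 0, 100, 1, 101, 2, 102, 3, 103.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_unpacklo_epi16_lane_order() -> __m512i {
+ let mut words = [0i16; 32];
+ for (i, e) in words.iter_mut().enumerate() {
+ *e = i as i16;
+ }
+ let a = _mm512_loadu_epi16(words.as_ptr());
+ for e in words.iter_mut() {
+ *e += 100;
+ }
+ let b = _mm512_loadu_epi16(words.as_ptr());
+ _mm512_unpacklo_epi16(a, b)
+}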
+
+/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi16&expand=6067)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+pub unsafe fn _mm512_mask_unpacklo_epi16(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i16x32()))
+}
+
+/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi16&expand=6068)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+pub unsafe fn _mm512_maskz_unpacklo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+ let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_epi16&expand=6064)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+pub unsafe fn _mm256_mask_unpacklo_epi16(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i16x16()))
+}
+
+/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_epi16&expand=6065)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+pub unsafe fn _mm256_maskz_unpacklo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+ let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_epi16&expand=6061)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+pub unsafe fn _mm_mask_unpacklo_epi16(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i16x8()))
+}
+
+/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_epi16&expand=6062)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+pub unsafe fn _mm_maskz_unpacklo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi8&expand=6096)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+pub unsafe fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i {
+ let a = a.as_i8x64();
+ let b = b.as_i8x64();
+ #[rustfmt::skip]
+ let r: i8x64 = simd_shuffle64!(
+ a,
+ b,
+ [
+ 0, 64+0, 1, 64+1,
+ 2, 64+2, 3, 64+3,
+ 4, 64+4, 5, 64+5,
+ 6, 64+6, 7, 64+7,
+ 16, 64+16, 17, 64+17,
+ 18, 64+18, 19, 64+19,
+ 20, 64+20, 21, 64+21,
+ 22, 64+22, 23, 64+23,
+ 32, 64+32, 33, 64+33,
+ 34, 64+34, 35, 64+35,
+ 36, 64+36, 37, 64+37,
+ 38, 64+38, 39, 64+39,
+ 48, 64+48, 49, 64+49,
+ 50, 64+50, 51, 64+51,
+ 52, 64+52, 53, 64+53,
+ 54, 64+54, 55, 64+55,
+ ],
+ );
+ transmute(r)
+}
+
+/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi8&expand=6094)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+pub unsafe fn _mm512_mask_unpacklo_epi8(
+ src: __m512i,
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i8x64()))
+}
+
+/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi8&expand=6095)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+pub unsafe fn _mm512_maskz_unpacklo_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_epi8&expand=6091)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+pub unsafe fn _mm256_mask_unpacklo_epi8(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i8x32()))
+}
+
+/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_epi8&expand=6092)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+pub unsafe fn _mm256_maskz_unpacklo_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_epi8&expand=6088)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+pub unsafe fn _mm_mask_unpacklo_epi8(
+ src: __m128i,
+ k: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i8x16()))
+}
+
+/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_epi8&expand=6089)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+pub unsafe fn _mm_maskz_unpacklo_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
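+
+// Editor's sketch (not part of the upstream patch): for byte elements the
+// mask carries one bit per byte, hence `__mmask64` for the 512-bit forms.
+// Setting only the low 32 bits keeps the first 32 interleaved bytes and
+// zeroes the rest.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_unpacklo_epi8_mask_width() -> __m512i {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ _mm512_maskz_unpacklo_epi8(0x0000_0000_ffff_ffff, a, b)
+}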
+
+/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi16&expand=3795)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovdqu16))]
+pub unsafe fn _mm512_mask_mov_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+ let mov = a.as_i16x32();
+ transmute(simd_select_bitmask(k, mov, src.as_i16x32()))
+}
+
+/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi16&expand=3796)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovdqu16))]
+pub unsafe fn _mm512_maskz_mov_epi16(k: __mmask32, a: __m512i) -> __m512i {
+ let mov = a.as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_epi16&expand=3793)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu16))]
+pub unsafe fn _mm256_mask_mov_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+ let mov = a.as_i16x16();
+ transmute(simd_select_bitmask(k, mov, src.as_i16x16()))
+}
+
+/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_epi16&expand=3794)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu16))]
+pub unsafe fn _mm256_maskz_mov_epi16(k: __mmask16, a: __m256i) -> __m256i {
+ let mov = a.as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_epi16&expand=3791)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu16))]
+pub unsafe fn _mm_mask_mov_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let mov = a.as_i16x8();
+ transmute(simd_select_bitmask(k, mov, src.as_i16x8()))
+}
+
+/// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_epi16&expand=3792)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu16))]
+pub unsafe fn _mm_maskz_mov_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ let mov = a.as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi8&expand=3813)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovdqu8))]
+pub unsafe fn _mm512_mask_mov_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
+ let mov = a.as_i8x64();
+ transmute(simd_select_bitmask(k, mov, src.as_i8x64()))
+}
+
+/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi8&expand=3814)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vmovdqu8))]
+pub unsafe fn _mm512_maskz_mov_epi8(k: __mmask64, a: __m512i) -> __m512i {
+ let mov = a.as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_epi8&expand=3811)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu8))]
+pub unsafe fn _mm256_mask_mov_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
+ let mov = a.as_i8x32();
+ transmute(simd_select_bitmask(k, mov, src.as_i8x32()))
+}
+
+/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_epi8&expand=3812)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu8))]
+pub unsafe fn _mm256_maskz_mov_epi8(k: __mmask32, a: __m256i) -> __m256i {
+ let mov = a.as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_epi8&expand=3809)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu8))]
+pub unsafe fn _mm_mask_mov_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+ let mov = a.as_i8x16();
+ transmute(simd_select_bitmask(k, mov, src.as_i8x16()))
+}
+
+/// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_epi8&expand=3810)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu8))]
+pub unsafe fn _mm_maskz_mov_epi8(k: __mmask16, a: __m128i) -> __m128i {
+ let mov = a.as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
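+
+// Editor's sketch (not part of the upstream patch): `mask_mov` is a pure
+// per-element blend, so selecting between two vectors needs no arithmetic.
+// Bit i of `k` picks the lane of `a` when set and of `src` when clear.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_blend_epi16(src: __m512i, a: __m512i, k: __mmask32) -> __m512i {
+ _mm512_mask_mov_epi16(src, k, a)
+}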
+
+/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi16&expand=4942)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm512_mask_set1_epi16(src: __m512i, k: __mmask32, a: i16) -> __m512i {
+ let r = _mm512_set1_epi16(a).as_i16x32();
+ transmute(simd_select_bitmask(k, r, src.as_i16x32()))
+}
+
+/// Broadcast 16-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi16&expand=4943)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm512_maskz_set1_epi16(k: __mmask32, a: i16) -> __m512i {
+ let r = _mm512_set1_epi16(a).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_set1_epi16&expand=4939)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm256_mask_set1_epi16(src: __m256i, k: __mmask16, a: i16) -> __m256i {
+ let r = _mm256_set1_epi16(a).as_i16x16();
+ transmute(simd_select_bitmask(k, r, src.as_i16x16()))
+}
+
+/// Broadcast 16-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_set1_epi16&expand=4940)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm256_maskz_set1_epi16(k: __mmask16, a: i16) -> __m256i {
+ let r = _mm256_set1_epi16(a).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_set1_epi16&expand=4936)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm_mask_set1_epi16(src: __m128i, k: __mmask8, a: i16) -> __m128i {
+ let r = _mm_set1_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(k, r, src.as_i16x8()))
+}
+
+/// Broadcast 16-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_set1_epi16&expand=4937)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+pub unsafe fn _mm_maskz_set1_epi16(k: __mmask8, a: i16) -> __m128i {
+ let r = _mm_set1_epi16(a).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi8&expand=4970)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm512_mask_set1_epi8(src: __m512i, k: __mmask64, a: i8) -> __m512i {
+ let r = _mm512_set1_epi8(a).as_i8x64();
+ transmute(simd_select_bitmask(k, r, src.as_i8x64()))
+}
+
+/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi8&expand=4971)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm512_maskz_set1_epi8(k: __mmask64, a: i8) -> __m512i {
+ let r = _mm512_set1_epi8(a).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_set1_epi8&expand=4967)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm256_mask_set1_epi8(src: __m256i, k: __mmask32, a: i8) -> __m256i {
+ let r = _mm256_set1_epi8(a).as_i8x32();
+ transmute(simd_select_bitmask(k, r, src.as_i8x32()))
+}
+
+/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_set1_epi8&expand=4968)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm256_maskz_set1_epi8(k: __mmask32, a: i8) -> __m256i {
+ let r = _mm256_set1_epi8(a).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_set1_epi8&expand=4964)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm_mask_set1_epi8(src: __m128i, k: __mmask16, a: i8) -> __m128i {
+ let r = _mm_set1_epi8(a).as_i8x16();
+ transmute(simd_select_bitmask(k, r, src.as_i8x16()))
+}
+
+/// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_set1_epi8&expand=4965)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+pub unsafe fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i {
+ let r = _mm_set1_epi8(a).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
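+
+// Editor's sketch (not part of the upstream patch): masked broadcast splats a
+// scalar into just the selected lanes, which patches constants into an
+// existing vector without a separate blend step.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_patch_low_words(v: __m512i) -> __m512i {
+ // Overwrite the low four 16-bit lanes of `v` with 7; keep the other 28.
+ _mm512_mask_set1_epi16(v, 0b1111, 7)
+}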
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shufflelo_epi16&expand=5221)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_shufflelo_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x32();
+ let r: i16x32 = simd_shuffle32!(
+ a,
+ a,
+ <const IMM8: i32> [
+ IMM8 as u32 & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ 4,
+ 5,
+ 6,
+ 7,
+ (IMM8 as u32 & 0b11) + 8,
+ ((IMM8 as u32 >> 2) & 0b11) + 8,
+ ((IMM8 as u32 >> 4) & 0b11) + 8,
+ ((IMM8 as u32 >> 6) & 0b11) + 8,
+ 12,
+ 13,
+ 14,
+ 15,
+ (IMM8 as u32 & 0b11) + 16,
+ ((IMM8 as u32 >> 2) & 0b11) + 16,
+ ((IMM8 as u32 >> 4) & 0b11) + 16,
+ ((IMM8 as u32 >> 6) & 0b11) + 16,
+ 20,
+ 21,
+ 22,
+ 23,
+ (IMM8 as u32 & 0b11) + 24,
+ ((IMM8 as u32 >> 2) & 0b11) + 24,
+ ((IMM8 as u32 >> 4) & 0b11) + 24,
+ ((IMM8 as u32 >> 6) & 0b11) + 24,
+ 28,
+ 29,
+ 30,
+ 31,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shufflelo_epi16&expand=5219)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_shufflelo_epi16<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_shufflelo_epi16::<IMM8>(a);
+ transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shufflelo_epi16&expand=5220)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_shufflelo_epi16::<IMM8>(a);
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, r.as_i16x32(), zero))
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shufflelo_epi16&expand=5216)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_shufflelo_epi16<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shuffle = _mm256_shufflelo_epi16::<IMM8>(a);
+ transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shufflelo_epi16&expand=5217)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shuffle = _mm256_shufflelo_epi16::<IMM8>(a);
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shuffle.as_i16x16(), zero))
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shufflelo_epi16&expand=5213)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_shufflelo_epi16<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shuffle = _mm_shufflelo_epi16::<IMM8>(a);
+ transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shufflelo_epi16&expand=5214)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shuffle = _mm_shufflelo_epi16::<IMM8>(a);
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shuffle.as_i16x8(), zero))
+}
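+
+// Editor's sketch (not part of the upstream patch): IMM8 packs four 2-bit
+// source indices, the least significant pair steering destination word 0.
+// The control 0b00_01_10_11 therefore reverses the low four words of every
+// 128-bit lane while the high four words pass through unchanged.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_reverse_low_words(a: __m512i) -> __m512i {
+ _mm512_shufflelo_epi16::<0b00_01_10_11>(a)
+}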
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shufflehi_epi16&expand=5212)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_shufflehi_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x32();
+ let r: i16x32 = simd_shuffle32!(
+ a,
+ a,
+ <const IMM8: i32> [
+ 0,
+ 1,
+ 2,
+ 3,
+ (IMM8 as u32 & 0b11) + 4,
+ ((IMM8 as u32 >> 2) & 0b11) + 4,
+ ((IMM8 as u32 >> 4) & 0b11) + 4,
+ ((IMM8 as u32 >> 6) & 0b11) + 4,
+ 8,
+ 9,
+ 10,
+ 11,
+ (IMM8 as u32 & 0b11) + 12,
+ ((IMM8 as u32 >> 2) & 0b11) + 12,
+ ((IMM8 as u32 >> 4) & 0b11) + 12,
+ ((IMM8 as u32 >> 6) & 0b11) + 12,
+ 16,
+ 17,
+ 18,
+ 19,
+ (IMM8 as u32 & 0b11) + 20,
+ ((IMM8 as u32 >> 2) & 0b11) + 20,
+ ((IMM8 as u32 >> 4) & 0b11) + 20,
+ ((IMM8 as u32 >> 6) & 0b11) + 20,
+ 24,
+ 25,
+ 26,
+ 27,
+ (IMM8 as u32 & 0b11) + 28,
+ ((IMM8 as u32 >> 2) & 0b11) + 28,
+ ((IMM8 as u32 >> 4) & 0b11) + 28,
+ ((IMM8 as u32 >> 6) & 0b11) + 28,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shufflehi_epi16&expand=5210)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_shufflehi_epi16<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_shufflehi_epi16::<IMM8>(a);
+ transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shufflehi_epi16&expand=5211)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_shufflehi_epi16::<IMM8>(a);
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, r.as_i16x32(), zero))
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shufflehi_epi16&expand=5207)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_shufflehi_epi16<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shuffle = _mm256_shufflehi_epi16::<IMM8>(a);
+ transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16()))
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shufflehi_epi16&expand=5208)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask16, a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shuffle = _mm256_shufflehi_epi16::<IMM8>(a);
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shuffle.as_i16x16(), zero))
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shufflehi_epi16&expand=5204)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_shufflehi_epi16<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shuffle = _mm_shufflehi_epi16::<IMM8>(a);
+ transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8()))
+}
+
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shufflehi_epi16&expand=5205)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_shufflehi_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shuffle = _mm_shufflehi_epi16::<IMM8>(a);
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shuffle.as_i16x8(), zero))
+}
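+
+// Editor's sketch (not part of the upstream patch): `shufflehi` uses the same
+// IMM8 encoding but addresses words 4..=7 of each lane, so chaining the two
+// reverses each 64-bit half of every lane independently.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_reverse_both_quads(a: __m512i) -> __m512i {
+ let low = _mm512_shufflelo_epi16::<0b00_01_10_11>(a);
+ _mm512_shufflehi_epi16::<0b00_01_10_11>(low)
+}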
+
+/// Shuffle packed 8-bit integers in a within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_epi8&expand=5159)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub unsafe fn _mm512_shuffle_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpshufb(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi8&expand=5157)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub unsafe fn _mm512_mask_shuffle_epi8(
+ src: __m512i,
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, shuffle, src.as_i8x64()))
+}
+
+/// Shuffle packed 8-bit integers in a within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi8&expand=5158)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub unsafe fn _mm512_maskz_shuffle_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, shuffle, zero))
+}
+
+/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_epi8&expand=5154)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub unsafe fn _mm256_mask_shuffle_epi8(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, shuffle, src.as_i8x32()))
+}
+
+/// Shuffle packed 8-bit integers in a within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_epi8&expand=5155)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub unsafe fn _mm256_maskz_shuffle_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, shuffle, zero))
+}
+
+/// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_epi8&expand=5151)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub unsafe fn _mm_mask_shuffle_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let shuffle = _mm_shuffle_epi8(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, shuffle, src.as_i8x16()))
+}
+
+/// Shuffle packed 8-bit integers in a within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_epi8&expand=5152)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+pub unsafe fn _mm_maskz_shuffle_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let shuffle = _mm_shuffle_epi8(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, shuffle, zero))
+}
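+
+// Editor's sketch (not part of the upstream patch): each control byte of `b`
+// steers one result byte. Its low four bits index into the same 16-byte lane
+// of `a`, and a set high bit (0x80) forces that result byte to zero.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_broadcast_lane_byte0(a: __m512i) -> __m512i {
+ // Control byte 0 in every position: each lane is filled with its byte 0.
+ _mm512_shuffle_epi8(a, _mm512_set1_epi8(0))
+}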
+
+/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi16_mask&expand=5884)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vptestmw))]
+pub unsafe fn _mm512_test_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ let and = _mm512_and_si512(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpneq_epi16_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi16_mask&expand=5883)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vptestmw))]
+pub unsafe fn _mm512_mask_test_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ let and = _mm512_and_si512(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpneq_epi16_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_test_epi16_mask&expand=5882)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmw))]
+pub unsafe fn _mm256_test_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_cmpneq_epi16_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_test_epi16_mask&expand=5881)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmw))]
+pub unsafe fn _mm256_mask_test_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_mask_cmpneq_epi16_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_epi16_mask&expand=5880)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmw))]
+pub unsafe fn _mm_test_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_cmpneq_epi16_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_test_epi16_mask&expand=5879)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmw))]
+pub unsafe fn _mm_mask_test_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_mask_cmpneq_epi16_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi8_mask&expand=5902)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vptestmb))]
+pub unsafe fn _mm512_test_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ let and = _mm512_and_si512(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpneq_epi8_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi8_mask&expand=5901)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vptestmb))]
+pub unsafe fn _mm512_mask_test_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ let and = _mm512_and_si512(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpneq_epi8_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_test_epi8_mask&expand=5900)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmb))]
+pub unsafe fn _mm256_test_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_cmpneq_epi8_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_test_epi8_mask&expand=5899)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmb))]
+pub unsafe fn _mm256_mask_test_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_mask_cmpneq_epi8_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_epi8_mask&expand=5898)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmb))]
+pub unsafe fn _mm_test_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_cmpneq_epi8_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_test_epi8_mask&expand=5897)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmb))]
+pub unsafe fn _mm_mask_test_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_mask_cmpneq_epi8_mask(k, and, zero)
+}
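+
+// Editor's sketch (not part of the upstream patch): `test_*` sets mask bit i
+// exactly when `a & b` is non-zero in element i, i.e. a per-element "any
+// shared bits?" probe. Every byte below shares bit 0, so all 64 bits of the
+// returned mask are set.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512bw")]
+unsafe fn _sketch_test_epi8() -> __mmask64 {
+ let a = _mm512_set1_epi8(0b0000_0011);
+ let b = _mm512_set1_epi8(0b0000_0001);
+ _mm512_test_epi8_mask(a, b) // == !0
+}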
+
+/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi16_mask&expand=5915)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vptestnmw))]
+pub unsafe fn _mm512_testn_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
+ let and = _mm512_and_si512(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpeq_epi16_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi16_mask&expand=5914)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vptestnmw))]
+pub unsafe fn _mm512_mask_testn_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+ let and = _mm512_and_si512(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpeq_epi16_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testn_epi16_mask&expand=5913)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmw))]
+pub unsafe fn _mm256_testn_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_cmpeq_epi16_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_testn_epi16_mask&expand=5912)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmw))]
+pub unsafe fn _mm256_mask_testn_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_mask_cmpeq_epi16_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testn_epi16_mask&expand=5911)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmw))]
+pub unsafe fn _mm_testn_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_cmpeq_epi16_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_testn_epi16_mask&expand=5910)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmw))]
+pub unsafe fn _mm_mask_testn_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_mask_cmpeq_epi16_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi8_mask&expand=5933)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vptestnmb))]
+pub unsafe fn _mm512_testn_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
+ let and = _mm512_and_si512(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpeq_epi8_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi8_mask&expand=5932)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vptestnmb))]
+pub unsafe fn _mm512_mask_testn_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
+ let and = _mm512_and_si512(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpeq_epi8_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testn_epi8_mask&expand=5931)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmb))]
+pub unsafe fn _mm256_testn_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_cmpeq_epi8_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_testn_epi8_mask&expand=5930)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmb))]
+pub unsafe fn _mm256_mask_testn_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_mask_cmpeq_epi8_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testn_epi8_mask&expand=5929)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmb))]
+pub unsafe fn _mm_testn_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_cmpeq_epi8_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_testn_epi8_mask&expand=5928)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmb))]
+pub unsafe fn _mm_mask_testn_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_mask_cmpeq_epi8_mask(k, and, zero)
+}
+
+/// Store 64-bit mask from a into memory.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_store_mask64&expand=5578)
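+///
+/// # Examples
+///
+/// A minimal round-trip sketch together with `_load_mask64` (not part of the
+/// original docs; marked `ignore` since it requires an AVX512BW-capable CPU):
+///
+/// ```ignore
+/// let k: __mmask64 = 0xDEAD_BEEF_0000_FFFF;
+/// let mut mem = 0u64;
+/// _store_mask64(&mut mem, k);
+/// assert_eq!(_load_mask64(&mem), k);
+/// ```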
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(mov))] // should be kmovq
+pub unsafe fn _store_mask64(mem_addr: *mut u64, a: __mmask64) {
+ ptr::write(mem_addr as *mut __mmask64, a);
+}
+
+/// Store 32-bit mask from a into memory.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_store_mask32&expand=5577)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(mov))] // should be kmovd
+pub unsafe fn _store_mask32(mem_addr: *mut u32, a: __mmask32) {
+ ptr::write(mem_addr as *mut __mmask32, a);
+}
+
+/// Load 64-bit mask from memory into k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_load_mask64&expand=3318)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(mov))] // should be kmovq
+pub unsafe fn _load_mask64(mem_addr: *const u64) -> __mmask64 {
+ ptr::read(mem_addr as *const __mmask64)
+}
+
+/// Load 32-bit mask from memory into k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_load_mask32&expand=3317)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(mov))] // should be kmovd
+pub unsafe fn _load_mask32(mem_addr: *const u32) -> __mmask32 {
+ ptr::read(mem_addr as *const __mmask32)
+}
+
+/// Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sad_epu8&expand=4855)
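+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi8(2);
+/// let b = _mm512_set1_epi8(5);
+/// // |2 - 5| = 3 for each byte; each group of 8 bytes sums to 24, so every
+/// // 64-bit element of the result holds 24
+/// let r = _mm512_sad_epu8(a, b);
+/// ```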
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsadbw))]
+pub unsafe fn _mm512_sad_epu8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpsadbw(a.as_u8x64(), b.as_u8x64()))
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dbsad_epu8&expand=2114)
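+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU). With `IMM8 = 0` every
+/// SAD compares against the lowest quadruplet of each 128-bit lane of `b`:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi8(4);
+/// let b = _mm512_set1_epi8(1);
+/// // every quadruplet SAD is 4 * |4 - 1| = 12, so each 16-bit result is 12
+/// let r = _mm512_dbsad_epu8::<0>(a, b);
+/// ```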
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm512_dbsad_epu8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x64();
+ let b = b.as_u8x64();
+ let r = vdbpsadbw(a, b, IMM8);
+ transmute(r)
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dbsad_epu8&expand=2115)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm512_mask_dbsad_epu8<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x64();
+ let b = b.as_u8x64();
+ let r = vdbpsadbw(a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_u16x32()))
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dbsad_epu8&expand=2116)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm512_maskz_dbsad_epu8<const IMM8: i32>(
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x64();
+ let b = b.as_u8x64();
+ let r = vdbpsadbw(a, b, IMM8);
+ transmute(simd_select_bitmask(
+ k,
+ r,
+ _mm512_setzero_si512().as_u16x32(),
+ ))
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dbsad_epu8&expand=2111)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm256_dbsad_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x32();
+ let b = b.as_u8x32();
+ let r = vdbpsadbw256(a, b, IMM8);
+ transmute(r)
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dbsad_epu8&expand=2112)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm256_mask_dbsad_epu8<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x32();
+ let b = b.as_u8x32();
+ let r = vdbpsadbw256(a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_u16x16()))
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dbsad_epu8&expand=2113)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm256_maskz_dbsad_epu8<const IMM8: i32>(
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x32();
+ let b = b.as_u8x32();
+ let r = vdbpsadbw256(a, b, IMM8);
+ transmute(simd_select_bitmask(
+ k,
+ r,
+ _mm256_setzero_si256().as_u16x16(),
+ ))
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dbsad_epu8&expand=2108)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm_dbsad_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x16();
+ let b = b.as_u8x16();
+ let r = vdbpsadbw128(a, b, IMM8);
+ transmute(r)
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dbsad_epu8&expand=2109)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm_mask_dbsad_epu8<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x16();
+ let b = b.as_u8x16();
+ let r = vdbpsadbw128(a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_u16x8()))
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dbsad_epu8&expand=2110)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))]
+pub unsafe fn _mm_maskz_dbsad_epu8<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_u8x16();
+ let b = b.as_u8x16();
+ let r = vdbpsadbw128(a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, _mm_setzero_si128().as_u16x8()))
+}
+
+/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movepi16_mask&expand=3873)
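+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(-1); // sign bit set in every element
+/// assert_eq!(_mm512_movepi16_mask(a), u32::MAX);
+/// ```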
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovw2m))]
+pub unsafe fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 {
+ let filter = _mm512_set1_epi16(1 << 15);
+ let a = _mm512_and_si512(a, filter);
+ _mm512_cmpeq_epi16_mask(a, filter)
+}
+
+/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movepi16_mask&expand=3872)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovw2m))]
+pub unsafe fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 {
+ let filter = _mm256_set1_epi16(1 << 15);
+ let a = _mm256_and_si256(a, filter);
+ _mm256_cmpeq_epi16_mask(a, filter)
+}
+
+/// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi16_mask&expand=3871)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovw2m))]
+pub unsafe fn _mm_movepi16_mask(a: __m128i) -> __mmask8 {
+ let filter = _mm_set1_epi16(1 << 15);
+ let a = _mm_and_si128(a, filter);
+ _mm_cmpeq_epi16_mask(a, filter)
+}
+
+/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movepi8_mask&expand=3883)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovb2m))]
+pub unsafe fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 {
+ let filter = _mm512_set1_epi8(1 << 7);
+ let a = _mm512_and_si512(a, filter);
+ _mm512_cmpeq_epi8_mask(a, filter)
+}
+
+/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movepi8_mask&expand=3882)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes fewer cycles than
+ // using vpmovb2m plus converting the mask register to a standard register.
+pub unsafe fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 {
+ let filter = _mm256_set1_epi8(1 << 7);
+ let a = _mm256_and_si256(a, filter);
+ _mm256_cmpeq_epi8_mask(a, filter)
+}
+
+/// Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi8_mask&expand=3881)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes fewer cycles than
+ // using vpmovb2m plus converting the mask register to a standard register.
+pub unsafe fn _mm_movepi8_mask(a: __m128i) -> __mmask16 {
+ let filter = _mm_set1_epi8(1 << 7);
+ let a = _mm_and_si128(a, filter);
+ _mm_cmpeq_epi8_mask(a, filter)
+}
+
+/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movm_epi16&expand=3886)
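+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// // element 0 becomes -1 (all bits set), the remaining 31 elements become 0
+/// let r = _mm512_movm_epi16(0b1);
+/// ```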
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovm2w))]
+pub unsafe fn _mm512_movm_epi16(k: __mmask32) -> __m512i {
+ let one = _mm512_set1_epi16(-1).as_i16x32(); // all bits of each element set
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, one, zero))
+}
+
+/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movm_epi16&expand=3885)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovm2w))]
+pub unsafe fn _mm256_movm_epi16(k: __mmask16) -> __m256i {
+ let one = _mm256_set1_epi16(-1).as_i16x16(); // all bits of each element set
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, one, zero))
+}
+
+/// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movm_epi16&expand=3884)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovm2w))]
+pub unsafe fn _mm_movm_epi16(k: __mmask8) -> __m128i {
+ let one = _mm_set1_epi16(-1).as_i16x8(); // all bits of each element set
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, one, zero))
+}
+
+/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movm_epi8&expand=3895)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovm2b))]
+pub unsafe fn _mm512_movm_epi8(k: __mmask64) -> __m512i {
+ let one = _mm512_set1_epi8(-1).as_i8x64(); // all bits of each element set
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, one, zero))
+}
+
+/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movm_epi8&expand=3894)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovm2b))]
+pub unsafe fn _mm256_movm_epi8(k: __mmask32) -> __m256i {
+ let one = _mm256_set1_epi8(-1).as_i8x32(); // all bits of each element set
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, one, zero))
+}
+
+/// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movm_epi8&expand=3893)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovm2b))]
+pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i {
+ let one = _mm_set1_epi8(-1).as_i8x16(); // all bits of each element set
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, one, zero))
+}
+
+/// Add 32-bit masks in a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kadd_mask32&expand=3207)
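+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU). Mask addition is
+/// plain integer addition of the two mask registers:
+///
+/// ```ignore
+/// assert_eq!(_kadd_mask32(0b1010, 0b0001), 0b1011);
+/// ```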
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(all(test, target_arch = "x86"), assert_instr(add))]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(lea))] // generate normal lea/add code instead of kaddd
+ //llvm.x86.avx512.kadd.d
+pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+ transmute(a + b)
+}
+
+/// Add 64-bit masks in a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kadd_mask64&expand=3208)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(all(test, target_arch = "x86"), assert_instr(add))]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(lea))] // generate normal lea/add code instead of kaddq
+ //llvm.x86.avx512.kadd.q
+pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+ transmute(a + b)
+}
+
+/// Compute the bitwise AND of 32-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kand_mask32&expand=3213)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandd
+pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+ transmute(a & b)
+}
+
+/// Compute the bitwise AND of 64-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kand_mask64&expand=3214)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandq
+pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+ transmute(a & b)
+}
+
+/// Compute the bitwise NOT of 32-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_knot_mask32&expand=3234)
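+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// assert_eq!(_knot_mask32(0), u32::MAX);
+/// assert_eq!(_knot_mask32(u32::MAX), 0);
+/// ```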
+#[inline]
+#[target_feature(enable = "avx512bw")]
+pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
+ transmute(a ^ 0b11111111_11111111_11111111_11111111)
+}
+
+/// Compute the bitwise NOT of 64-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_knot_mask64&expand=3235)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
+ transmute(a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111)
+}
+
+/// Compute the bitwise NOT of 32-bit mask a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kandn_mask32&expand=3219)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(not))] // generate normal not/and code instead of kandnd
+pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+ transmute(_knot_mask32(a) & b)
+}
+
+/// Compute the bitwise NOT of 64-bit mask a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kandn_mask64&expand=3220)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(not))] // generate normal not/and code instead of kandnq
+pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+ transmute(_knot_mask64(a) & b)
+}
+
+/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kor_mask32&expand=3240)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of kord
+pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+ transmute(a | b)
+}
+
+/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kor_mask64&expand=3241)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korq
+pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+ transmute(a | b)
+}
+
+/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kxor_mask32&expand=3292)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxord
+pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+ transmute(a ^ b)
+}
+
+/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kxor_mask64&expand=3293)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorq
+pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+ transmute(a ^ b)
+}
+
+/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kxnor_mask32&expand=3286)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor/not code instead of kxnord
+pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+ transmute(_knot_mask32(a ^ b))
+}
+
+/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kxnor_mask64&expand=3287)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor/not code instead of kxnorq
+pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+ transmute(_knot_mask64(a ^ b))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_epi8&expand=1407)
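+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(0x0102);
+/// // truncation keeps only the low 8 bits of each element
+/// let r = _mm512_cvtepi16_epi8(a); // every byte of r is 0x02
+/// ```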
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm512_cvtepi16_epi8(a: __m512i) -> __m256i {
+ let a = a.as_i16x32();
+ transmute::<i8x32, _>(simd_cast(a))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_epi8&expand=1408)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm512_mask_cvtepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i {
+ let convert = _mm512_cvtepi16_epi8(a).as_i8x32();
+ transmute(simd_select_bitmask(k, convert, src.as_i8x32()))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_epi8&expand=1409)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm512_maskz_cvtepi16_epi8(k: __mmask32, a: __m512i) -> __m256i {
+ let convert = _mm512_cvtepi16_epi8(a).as_i8x32();
+ transmute(simd_select_bitmask(
+ k,
+ convert,
+ _mm256_setzero_si256().as_i8x32(),
+ ))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi8&expand=1404)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm256_cvtepi16_epi8(a: __m256i) -> __m128i {
+ let a = a.as_i16x16();
+ transmute::<i8x16, _>(simd_cast(a))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_epi8&expand=1405)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm256_mask_cvtepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i {
+ let convert = _mm256_cvtepi16_epi8(a).as_i8x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_epi8&expand=1406)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
+ let convert = _mm256_cvtepi16_epi8(a).as_i8x16();
+ transmute(simd_select_bitmask(
+ k,
+ convert,
+ _mm_setzero_si128().as_i8x16(),
+ ))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi8&expand=1401)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i {
+ let a = a.as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ let v256: i16x16 = simd_shuffle16!(a, zero, [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]);
+ transmute::<i8x16, _>(simd_cast(v256))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_epi8&expand=1402)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm_mask_cvtepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi16_epi8(a).as_i8x16();
+ let k: __mmask16 = 0b11111111_11111111 & k as __mmask16;
+ transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_epi8&expand=1403)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi16_epi8(a).as_i8x16();
+ let k: __mmask16 = 0b11111111_11111111 & k as __mmask16;
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi16_epi8&expand=1807)
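+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(300);
+/// // 300 does not fit in an i8, so it saturates to i8::MAX
+/// let r = _mm512_cvtsepi16_epi8(a); // every byte of r is 127
+/// ```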
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i {
+ transmute(vpmovswb(
+ a.as_i16x32(),
+ _mm256_setzero_si256().as_i8x32(),
+ 0b11111111_11111111_11111111_11111111,
+ ))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi16_epi8&expand=1808)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i {
+ transmute(vpmovswb(a.as_i16x32(), src.as_i8x32(), k))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi16_epi8&expand=1809)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i {
+ transmute(vpmovswb(
+ a.as_i16x32(),
+ _mm256_setzero_si256().as_i8x32(),
+ k,
+ ))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi16_epi8&expand=1804)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i {
+ transmute(vpmovswb256(
+ a.as_i16x16(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111_11111111,
+ ))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi16_epi8&expand=1805)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i {
+ transmute(vpmovswb256(a.as_i16x16(), src.as_i8x16(), k))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi16_epi8&expand=1806)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
+ transmute(vpmovswb256(
+ a.as_i16x16(),
+ _mm_setzero_si128().as_i8x16(),
+ k,
+ ))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi16_epi8&expand=1801)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm_cvtsepi16_epi8(a: __m128i) -> __m128i {
+ transmute(vpmovswb128(
+ a.as_i16x8(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi16_epi8&expand=1802)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm_mask_cvtsepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovswb128(a.as_i16x8(), src.as_i8x16(), k))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi16_epi8&expand=1803)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovswb128(a.as_i16x8(), _mm_setzero_si128().as_i8x16(), k))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi16_epi8&expand=2042)
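+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi16(300);
+/// // 300 does not fit in a u8, so it saturates to u8::MAX
+/// let r = _mm512_cvtusepi16_epi8(a); // every byte of r is 255
+/// ```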
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i {
+ transmute(vpmovuswb(
+ a.as_u16x32(),
+ _mm256_setzero_si256().as_u8x32(),
+ 0b11111111_11111111_11111111_11111111,
+ ))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi16_epi8&expand=2043)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i {
+ transmute(vpmovuswb(a.as_u16x32(), src.as_u8x32(), k))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi16_epi8&expand=2044)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i {
+ transmute(vpmovuswb(
+ a.as_u16x32(),
+ _mm256_setzero_si256().as_u8x32(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi16_epi8&expand=2039)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i {
+ transmute(vpmovuswb256(
+ a.as_u16x16(),
+ _mm_setzero_si128().as_u8x16(),
+ 0b11111111_11111111,
+ ))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi16_epi8&expand=2040)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i {
+ transmute(vpmovuswb256(a.as_u16x16(), src.as_u8x16(), k))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi16_epi8&expand=2041)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
+ transmute(vpmovuswb256(
+ a.as_u16x16(),
+ _mm_setzero_si128().as_u8x16(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi16_epi8&expand=2036)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm_cvtusepi16_epi8(a: __m128i) -> __m128i {
+ transmute(vpmovuswb128(
+ a.as_u16x8(),
+ _mm_setzero_si128().as_u8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi16_epi8&expand=2037)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm_mask_cvtusepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovuswb128(a.as_u16x8(), src.as_u8x16(), k))
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi16_epi8&expand=2038)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm_maskz_cvtusepi16_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovuswb128(
+ a.as_u16x8(),
+ _mm_setzero_si128().as_u8x16(),
+ k,
+ ))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi8_epi16&expand=1526)
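+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// let a = _mm256_set1_epi8(-1);
+/// // sign extension: every 16-bit element of r is -1
+/// let r = _mm512_cvtepi8_epi16(a);
+/// ```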
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub unsafe fn _mm512_cvtepi8_epi16(a: __m256i) -> __m512i {
+ let a = a.as_i8x32();
+ transmute::<i16x32, _>(simd_cast(a))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi8_epi16&expand=1527)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub unsafe fn _mm512_mask_cvtepi8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepi8_epi16(a).as_i16x32();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x32()))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi8_epi16&expand=1528)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub unsafe fn _mm512_maskz_cvtepi8_epi16(k: __mmask32, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepi8_epi16(a).as_i16x32();
+ transmute(simd_select_bitmask(
+ k,
+ convert,
+ _mm512_setzero_si512().as_i16x32(),
+ ))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi8_epi16&expand=1524)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub unsafe fn _mm256_mask_cvtepi8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi8_epi16(a).as_i16x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x16()))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi8_epi16&expand=1525)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub unsafe fn _mm256_maskz_cvtepi8_epi16(k: __mmask16, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi8_epi16(a).as_i16x16();
+ transmute(simd_select_bitmask(
+ k,
+ convert,
+ _mm256_setzero_si256().as_i16x16(),
+ ))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi8_epi16&expand=1521)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub unsafe fn _mm_mask_cvtepi8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi8_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi8_epi16&expand=1522)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+pub unsafe fn _mm_maskz_cvtepi8_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi8_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(
+ k,
+ convert,
+ _mm_setzero_si128().as_i16x8(),
+ ))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi16&expand=1612)
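+///
+/// # Examples
+///
+/// A minimal illustrative sketch (not part of the original docs; marked
+/// `ignore` since it requires an AVX512BW-capable CPU at runtime):
+///
+/// ```ignore
+/// let a = _mm256_set1_epi8(-1); // every byte is 0xFF
+/// // zero extension: every 16-bit element of r is 255
+/// let r = _mm512_cvtepu8_epi16(a);
+/// ```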
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm512_cvtepu8_epi16(a: __m256i) -> __m512i {
+ let a = a.as_u8x32();
+ transmute::<i16x32, _>(simd_cast(a))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu8_epi16&expand=1613)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm512_mask_cvtepu8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepu8_epi16(a).as_i16x32();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x32()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu8_epi16&expand=1614)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm512_maskz_cvtepu8_epi16(k: __mmask32, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepu8_epi16(a).as_i16x32();
+ transmute(simd_select_bitmask(
+ k,
+ convert,
+ _mm512_setzero_si512().as_i16x32(),
+ ))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu8_epi16&expand=1610)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm256_mask_cvtepu8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu8_epi16(a).as_i16x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x16()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi16&expand=1611)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm256_maskz_cvtepu8_epi16(k: __mmask16, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu8_epi16(a).as_i16x16();
+ transmute(simd_select_bitmask(
+ k,
+ convert,
+ _mm256_setzero_si256().as_i16x16(),
+ ))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu8_epi16&expand=1607)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm_mask_cvtepu8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu8_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi16&expand=1608)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu8_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(
+ k,
+ convert,
+ _mm_setzero_si128().as_i16x8(),
+ ))
+}
+
+/// Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_bslli_epi128&expand=591)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_bslli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ // Shuffle-index helper for `simd_shuffle64!(zero, a, ...)` below:
+ // indices 0..=63 select from `zero` and 64..=127 select from `a`.
+ // Positions that shift in zeroes (or any shift above 15, which Intel
+ // clamps to a full-lane shift) pick byte 0 of `zero`; every other
+ // position picks byte `i - shift` from the same 128-bit lane of `a`.
+ const fn mask(shift: i32, i: u32) -> u32 {
+ let shift = shift as u32 & 0xff;
+ if shift > 15 || i % 16 < shift {
+ 0
+ } else {
+ 64 + (i - shift)
+ }
+ }
+ let a = a.as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ let r: i8x64 = simd_shuffle64!(
+ zero,
+ a,
+ <const IMM8: i32> [
+ mask(IMM8, 0),
+ mask(IMM8, 1),
+ mask(IMM8, 2),
+ mask(IMM8, 3),
+ mask(IMM8, 4),
+ mask(IMM8, 5),
+ mask(IMM8, 6),
+ mask(IMM8, 7),
+ mask(IMM8, 8),
+ mask(IMM8, 9),
+ mask(IMM8, 10),
+ mask(IMM8, 11),
+ mask(IMM8, 12),
+ mask(IMM8, 13),
+ mask(IMM8, 14),
+ mask(IMM8, 15),
+ mask(IMM8, 16),
+ mask(IMM8, 17),
+ mask(IMM8, 18),
+ mask(IMM8, 19),
+ mask(IMM8, 20),
+ mask(IMM8, 21),
+ mask(IMM8, 22),
+ mask(IMM8, 23),
+ mask(IMM8, 24),
+ mask(IMM8, 25),
+ mask(IMM8, 26),
+ mask(IMM8, 27),
+ mask(IMM8, 28),
+ mask(IMM8, 29),
+ mask(IMM8, 30),
+ mask(IMM8, 31),
+ mask(IMM8, 32),
+ mask(IMM8, 33),
+ mask(IMM8, 34),
+ mask(IMM8, 35),
+ mask(IMM8, 36),
+ mask(IMM8, 37),
+ mask(IMM8, 38),
+ mask(IMM8, 39),
+ mask(IMM8, 40),
+ mask(IMM8, 41),
+ mask(IMM8, 42),
+ mask(IMM8, 43),
+ mask(IMM8, 44),
+ mask(IMM8, 45),
+ mask(IMM8, 46),
+ mask(IMM8, 47),
+ mask(IMM8, 48),
+ mask(IMM8, 49),
+ mask(IMM8, 50),
+ mask(IMM8, 51),
+ mask(IMM8, 52),
+ mask(IMM8, 53),
+ mask(IMM8, 54),
+ mask(IMM8, 55),
+ mask(IMM8, 56),
+ mask(IMM8, 57),
+ mask(IMM8, 58),
+ mask(IMM8, 59),
+ mask(IMM8, 60),
+ mask(IMM8, 61),
+ mask(IMM8, 62),
+ mask(IMM8, 63),
+ ],
+ );
+ transmute(r)
+}
+
+/// Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_bsrli_epi128&expand=594)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ // Per Intel's pseudocode, shifts of more than 15 bytes are clamped to
+ // 16, emptying each 128-bit lane entirely, so the result is all zeros.
+ if IMM8 > 15 {
+ return _mm512_setzero_si512();
+ }
+ let a = a.as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ let r: i8x64 = match IMM8 % 16 {
+ 0 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ ],
+ ),
+ 1 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
+ ],
+ ),
+ 2 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
+ ],
+ ),
+ 3 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
+ 114,
+ ],
+ ),
+ 4 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114,
+ 115,
+ ],
+ ),
+ 5 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114,
+ 115, 116,
+ ],
+ ),
+ 6 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96,
+ 97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+ 116, 117,
+ ],
+ ),
+ 7 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96,
+ 97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+ 116, 117, 118,
+ ],
+ ),
+ 8 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, 27, 28,
+ 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, 46, 47, 96, 97,
+ 98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+ 116, 117, 118, 119,
+ ],
+ ),
+ 9 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, 28, 29,
+ 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, 47, 96, 97, 98,
+ 99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120,
+ ],
+ ),
+ 10 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, 29, 30,
+ 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, 96, 97, 98, 99,
+ 100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117,
+ 118, 119, 120, 121,
+ ],
+ ),
+ 11 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, 30, 31,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, 97, 98, 99,
+ 100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122,
+ ],
+ ),
+ 12 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, 31, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, 98, 99, 100,
+ 101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117,
+ 118, 119, 120, 121, 122, 123,
+ ],
+ ),
+ 13 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, 99, 100, 101,
+ 102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, 115, 116, 117, 118,
+ 119, 120, 121, 122, 123, 124,
+ ],
+ ),
+ 14 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, 81, 82,
+ 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, 100, 101, 102,
+ 103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125,
+ ],
+ ),
+ 15 => simd_shuffle64!(
+ a,
+ zero,
+ [
+ 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+ 121, 122, 123, 124, 125, 126,
+ ],
+ ),
+ _ => zero,
+ };
+ transmute(r)
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_alignr_epi8&expand=263)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ // If palignr shifts the concatenated pair by two or more full lanes
+ // (imm8 >= 32), every byte is shifted out and the result is zero.
+ if IMM8 >= 32 {
+ return _mm512_set1_epi8(0);
+ }
+ // If palignr shifts by at least one full lane but less than two
+ // (16 <= imm8 < 32), only `a` contributes bytes; convert to shifting
+ // `a` with zeroes shifted in.
+ let (a, b) = if IMM8 >= 16 {
+ (_mm512_set1_epi8(0), a)
+ } else {
+ (a, b)
+ };
+ let a = a.as_i8x64();
+ let b = b.as_i8x64();
+
+ let r: i8x64 = match IMM8 % 16 {
+ 0 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ ],
+ ),
+ 1 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
+ ],
+ ),
+ 2 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
+ ],
+ ),
+ 3 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
+ 114,
+ ],
+ ),
+ 4 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114,
+ 115,
+ ],
+ ),
+ 5 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114,
+ 115, 116,
+ ],
+ ),
+ 6 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96,
+ 97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+ 116, 117,
+ ],
+ ),
+ 7 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96,
+ 97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+ 116, 117, 118,
+ ],
+ ),
+ 8 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, 27, 28,
+ 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, 46, 47, 96, 97,
+ 98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+ 116, 117, 118, 119,
+ ],
+ ),
+ 9 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, 28, 29,
+ 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, 47, 96, 97, 98,
+ 99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120,
+ ],
+ ),
+ 10 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, 29, 30,
+ 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, 96, 97, 98, 99,
+ 100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117,
+ 118, 119, 120, 121,
+ ],
+ ),
+ 11 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, 30, 31,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, 97, 98, 99,
+ 100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122,
+ ],
+ ),
+ 12 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, 31, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, 98, 99, 100,
+ 101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117,
+ 118, 119, 120, 121, 122, 123,
+ ],
+ ),
+ 13 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, 99, 100, 101,
+ 102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, 115, 116, 117, 118,
+ 119, 120, 121, 122, 123, 124,
+ ],
+ ),
+ 14 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, 81, 82,
+ 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, 100, 101, 102,
+ 103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125,
+ ],
+ ),
+ 15 => simd_shuffle64!(
+ b,
+ a,
+ [
+ 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+ 121, 122, 123, 124, 125, 126,
+ ],
+ ),
+ _ => b,
+ };
+ transmute(r)
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_alignr_epi8&expand=264)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_alignr_epi8<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i8x64(), src.as_i8x64()))
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_alignr_epi8&expand=265)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_alignr_epi8<const IMM8: i32>(
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_alignr_epi8::<IMM8>(a, b);
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, r.as_i8x64(), zero))
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_alignr_epi8&expand=261)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub unsafe fn _mm256_mask_alignr_epi8<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let r = _mm256_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i8x32(), src.as_i8x32()))
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_alignr_epi8&expand=262)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub unsafe fn _mm256_maskz_alignr_epi8<const IMM8: i32>(
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let r = _mm256_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(
+ k,
+ r.as_i8x32(),
+ _mm256_setzero_si256().as_i8x32(),
+ ))
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_alignr_epi8&expand=258)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(4)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub unsafe fn _mm_mask_alignr_epi8<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let r = _mm_alignr_epi8::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i8x16(), src.as_i8x16()))
+}
+
+/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_alignr_epi8&expand=259)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))]
+pub unsafe fn _mm_maskz_alignr_epi8<const IMM8: i32>(
+ k: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let r = _mm_alignr_epi8::<IMM8>(a, b);
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, r.as_i8x16(), zero))
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi16_storeu_epi8&expand=1812)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm512_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
+ vpmovswbmem(mem_addr, a.as_i16x32(), k);
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi16_storeu_epi8&expand=1811)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm256_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
+ vpmovswbmem256(mem_addr, a.as_i16x16(), k);
+}
+
+/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi16_storeu_epi8&expand=1810)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovswb))]
+pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovswbmem128(mem_addr, a.as_i16x8(), k);
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_storeu_epi8&expand=1412)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
+ vpmovwbmem(mem_addr, a.as_i16x32(), k);
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_storeu_epi8&expand=1411)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
+ vpmovwbmem256(mem_addr, a.as_i16x16(), k);
+}
+
+/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_storeu_epi8&expand=1410)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovwb))]
+pub unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovwbmem128(mem_addr, a.as_i16x8(), k);
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi16_storeu_epi8&expand=2047)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm512_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
+ vpmovuswbmem(mem_addr, a.as_i16x32(), k);
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi16_storeu_epi8&expand=2046)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm256_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
+ vpmovuswbmem256(mem_addr, a.as_i16x16(), k);
+}
+
+/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi16_storeu_epi8&expand=2045)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovuswb))]
+pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovuswbmem128(mem_addr, a.as_i16x8(), k);
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx512.mask.paddus.w.512"]
+ fn vpaddusw(a: u16x32, b: u16x32, src: u16x32, mask: u32) -> u16x32;
+ #[link_name = "llvm.x86.avx512.mask.paddus.w.256"]
+ fn vpaddusw256(a: u16x16, b: u16x16, src: u16x16, mask: u16) -> u16x16;
+ #[link_name = "llvm.x86.avx512.mask.paddus.w.128"]
+ fn vpaddusw128(a: u16x8, b: u16x8, src: u16x8, mask: u8) -> u16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.paddus.b.512"]
+ fn vpaddusb(a: u8x64, b: u8x64, src: u8x64, mask: u64) -> u8x64;
+ #[link_name = "llvm.x86.avx512.mask.paddus.b.256"]
+ fn vpaddusb256(a: u8x32, b: u8x32, src: u8x32, mask: u32) -> u8x32;
+ #[link_name = "llvm.x86.avx512.mask.paddus.b.128"]
+ fn vpaddusb128(a: u8x16, b: u8x16, src: u8x16, mask: u16) -> u8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.padds.w.512"]
+ fn vpaddsw(a: i16x32, b: i16x32, src: i16x32, mask: u32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.mask.padds.w.256"]
+ fn vpaddsw256(a: i16x16, b: i16x16, src: i16x16, mask: u16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.mask.padds.w.128"]
+ fn vpaddsw128(a: i16x8, b: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.padds.b.512"]
+ fn vpaddsb(a: i8x64, b: i8x64, src: i8x64, mask: u64) -> i8x64;
+ #[link_name = "llvm.x86.avx512.mask.padds.b.256"]
+ fn vpaddsb256(a: i8x32, b: i8x32, src: i8x32, mask: u32) -> i8x32;
+ #[link_name = "llvm.x86.avx512.mask.padds.b.128"]
+ fn vpaddsb128(a: i8x16, b: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.psubus.w.512"]
+ fn vpsubusw(a: u16x32, b: u16x32, src: u16x32, mask: u32) -> u16x32;
+ #[link_name = "llvm.x86.avx512.mask.psubus.w.256"]
+ fn vpsubusw256(a: u16x16, b: u16x16, src: u16x16, mask: u16) -> u16x16;
+ #[link_name = "llvm.x86.avx512.mask.psubus.w.128"]
+ fn vpsubusw128(a: u16x8, b: u16x8, src: u16x8, mask: u8) -> u16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.psubus.b.512"]
+ fn vpsubusb(a: u8x64, b: u8x64, src: u8x64, mask: u64) -> u8x64;
+ #[link_name = "llvm.x86.avx512.mask.psubus.b.256"]
+ fn vpsubusb256(a: u8x32, b: u8x32, src: u8x32, mask: u32) -> u8x32;
+ #[link_name = "llvm.x86.avx512.mask.psubus.b.128"]
+ fn vpsubusb128(a: u8x16, b: u8x16, src: u8x16, mask: u16) -> u8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.psubs.w.512"]
+ fn vpsubsw(a: i16x32, b: i16x32, src: i16x32, mask: u32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.mask.psubs.w.256"]
+ fn vpsubsw256(a: i16x16, b: i16x16, src: i16x16, mask: u16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.mask.psubs.w.128"]
+ fn vpsubsw128(a: i16x8, b: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.psubs.b.512"]
+ fn vpsubsb(a: i8x64, b: i8x64, src: i8x64, mask: u64) -> i8x64;
+ #[link_name = "llvm.x86.avx512.mask.psubs.b.256"]
+ fn vpsubsb256(a: i8x32, b: i8x32, src: i8x32, mask: u32) -> i8x32;
+ #[link_name = "llvm.x86.avx512.mask.psubs.b.128"]
+ fn vpsubsb128(a: i8x16, b: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.pmulhu.w.512"]
+ fn vpmulhuw(a: u16x32, b: u16x32) -> u16x32;
+ #[link_name = "llvm.x86.avx512.pmulh.w.512"]
+ fn vpmulhw(a: i16x32, b: i16x32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
+ fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
+
+ #[link_name = "llvm.x86.avx512.mask.ucmp.w.512"]
+ fn vpcmpuw(a: u16x32, b: u16x32, op: i32, mask: u32) -> u32;
+ #[link_name = "llvm.x86.avx512.mask.ucmp.w.256"]
+ fn vpcmpuw256(a: u16x16, b: u16x16, op: i32, mask: u16) -> u16;
+ #[link_name = "llvm.x86.avx512.mask.ucmp.w.128"]
+ fn vpcmpuw128(a: u16x8, b: u16x8, op: i32, mask: u8) -> u8;
+
+ #[link_name = "llvm.x86.avx512.mask.ucmp.b.512"]
+ fn vpcmpub(a: u8x64, b: u8x64, op: i32, mask: u64) -> u64;
+ #[link_name = "llvm.x86.avx512.mask.ucmp.b.256"]
+ fn vpcmpub256(a: u8x32, b: u8x32, op: i32, mask: u32) -> u32;
+ #[link_name = "llvm.x86.avx512.mask.ucmp.b.128"]
+ fn vpcmpub128(a: u8x16, b: u8x16, op: i32, mask: u16) -> u16;
+
+ #[link_name = "llvm.x86.avx512.mask.cmp.w.512"]
+ fn vpcmpw(a: i16x32, b: i16x32, op: i32, mask: u32) -> u32;
+ #[link_name = "llvm.x86.avx512.mask.cmp.w.256"]
+ fn vpcmpw256(a: i16x16, b: i16x16, op: i32, mask: u16) -> u16;
+ #[link_name = "llvm.x86.avx512.mask.cmp.w.128"]
+ fn vpcmpw128(a: i16x8, b: i16x8, op: i32, mask: u8) -> u8;
+
+ #[link_name = "llvm.x86.avx512.mask.cmp.b.512"]
+ fn vpcmpb(a: i8x64, b: i8x64, op: i32, mask: u64) -> u64;
+ #[link_name = "llvm.x86.avx512.mask.cmp.b.256"]
+ fn vpcmpb256(a: i8x32, b: i8x32, op: i32, mask: u32) -> u32;
+ #[link_name = "llvm.x86.avx512.mask.cmp.b.128"]
+ fn vpcmpb128(a: i8x16, b: i8x16, op: i32, mask: u16) -> u16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmaxu.w.512"]
+ fn vpmaxuw(a: u16x32, b: u16x32) -> u16x32;
+ #[link_name = "llvm.x86.avx512.mask.pmaxu.b.512"]
+ fn vpmaxub(a: u8x64, b: u8x64) -> u8x64;
+ #[link_name = "llvm.x86.avx512.mask.pmaxs.w.512"]
+ fn vpmaxsw(a: i16x32, b: i16x32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.mask.pmaxs.b.512"]
+ fn vpmaxsb(a: i8x64, b: i8x64) -> i8x64;
+
+ #[link_name = "llvm.x86.avx512.mask.pminu.w.512"]
+ fn vpminuw(a: u16x32, b: u16x32) -> u16x32;
+ #[link_name = "llvm.x86.avx512.mask.pminu.b.512"]
+ fn vpminub(a: u8x64, b: u8x64) -> u8x64;
+ #[link_name = "llvm.x86.avx512.mask.pmins.w.512"]
+ fn vpminsw(a: i16x32, b: i16x32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.mask.pmins.b.512"]
+ fn vpminsb(a: i8x64, b: i8x64) -> i8x64;
+
+ #[link_name = "llvm.x86.avx512.pmaddw.d.512"]
+ fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
+ #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
+ fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32;
+
+ #[link_name = "llvm.x86.avx512.packssdw.512"]
+ fn vpackssdw(a: i32x16, b: i32x16) -> i16x32;
+ #[link_name = "llvm.x86.avx512.packsswb.512"]
+ fn vpacksswb(a: i16x32, b: i16x32) -> i8x64;
+ #[link_name = "llvm.x86.avx512.packusdw.512"]
+ fn vpackusdw(a: i32x16, b: i32x16) -> u16x32;
+ #[link_name = "llvm.x86.avx512.packuswb.512"]
+ fn vpackuswb(a: i16x32, b: i16x32) -> u8x64;
+
+ #[link_name = "llvm.x86.avx512.pavg.w.512"]
+ fn vpavgw(a: u16x32, b: u16x32) -> u16x32;
+ #[link_name = "llvm.x86.avx512.pavg.b.512"]
+ fn vpavgb(a: u8x64, b: u8x64) -> u8x64;
+
+ #[link_name = "llvm.x86.avx512.psll.w.512"]
+ fn vpsllw(a: i16x32, count: i16x8) -> i16x32;
+ #[link_name = "llvm.x86.avx512.pslli.w.512"]
+ fn vpslliw(a: i16x32, imm8: u32) -> i16x32;
+
+ #[link_name = "llvm.x86.avx2.pslli.w"]
+ fn pslliw256(a: i16x16, imm8: i32) -> i16x16;
+ #[link_name = "llvm.x86.sse2.pslli.w"]
+ fn pslliw128(a: i16x8, imm8: i32) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.psllv.w.512"]
+ fn vpsllvw(a: i16x32, b: i16x32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.psllv.w.256"]
+ fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.psllv.w.128"]
+ fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.psrl.w.512"]
+ fn vpsrlw(a: i16x32, count: i16x8) -> i16x32;
+ #[link_name = "llvm.x86.avx512.psrli.w.512"]
+ fn vpsrliw(a: i16x32, imm8: u32) -> i16x32;
+
+ #[link_name = "llvm.x86.avx512.psrlv.w.512"]
+ fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.psrlv.w.256"]
+ fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.psrlv.w.128"]
+ fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.psra.w.512"]
+ fn vpsraw(a: i16x32, count: i16x8) -> i16x32;
+ #[link_name = "llvm.x86.avx512.psrai.w.512"]
+ fn vpsraiw(a: i16x32, imm8: u32) -> i16x32;
+
+ #[link_name = "llvm.x86.avx2.psrai.w"]
+ fn psraiw256(a: i16x16, imm8: i32) -> i16x16;
+ #[link_name = "llvm.x86.sse2.psrai.w"]
+ fn psraiw128(a: i16x8, imm8: i32) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.psrav.w.512"]
+ fn vpsravw(a: i16x32, count: i16x32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.psrav.w.256"]
+ fn vpsravw256(a: i16x16, count: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.psrav.w.128"]
+ fn vpsravw128(a: i16x8, count: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"]
+ fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"]
+ fn vpermi2w256(a: i16x16, idx: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.vpermi2var.hi.128"]
+ fn vpermi2w128(a: i16x8, idx: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.permvar.hi.512"]
+ fn vpermw(a: i16x32, idx: i16x32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.permvar.hi.256"]
+ fn vpermw256(a: i16x16, idx: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.permvar.hi.128"]
+ fn vpermw128(a: i16x8, idx: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.pshuf.b.512"]
+ fn vpshufb(a: i8x64, b: i8x64) -> i8x64;
+
+ #[link_name = "llvm.x86.avx512.psad.bw.512"]
+ fn vpsadbw(a: u8x64, b: u8x64) -> u64x8;
+
+ #[link_name = "llvm.x86.avx512.dbpsadbw.512"]
+ fn vdbpsadbw(a: u8x64, b: u8x64, imm8: i32) -> u16x32;
+ #[link_name = "llvm.x86.avx512.dbpsadbw.256"]
+ fn vdbpsadbw256(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
+ #[link_name = "llvm.x86.avx512.dbpsadbw.128"]
+ fn vdbpsadbw128(a: u8x16, b: u8x16, imm8: i32) -> u16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.wb.512"]
+ fn vpmovswb(a: i16x32, src: i8x32, mask: u32) -> i8x32;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.wb.256"]
+ fn vpmovswb256(a: i16x16, src: i8x16, mask: u16) -> i8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.wb.128"]
+ fn vpmovswb128(a: i16x8, src: i8x16, mask: u8) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.wb.512"]
+ fn vpmovuswb(a: u16x32, src: u8x32, mask: u32) -> u8x32;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.wb.256"]
+ fn vpmovuswb256(a: u16x16, src: u8x16, mask: u16) -> u8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.wb.128"]
+ fn vpmovuswb128(a: u16x8, src: u8x16, mask: u8) -> u8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.512"]
+ fn vpmovswbmem(mem_addr: *mut i8, a: i16x32, mask: u32);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.256"]
+ fn vpmovswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.128"]
+ fn vpmovswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.512"]
+ fn vpmovwbmem(mem_addr: *mut i8, a: i16x32, mask: u32);
+ #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.256"]
+ fn vpmovwbmem256(mem_addr: *mut i8, a: i16x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.128"]
+ fn vpmovwbmem128(mem_addr: *mut i8, a: i16x8, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.512"]
+ fn vpmovuswbmem(mem_addr: *mut i8, a: i16x32, mask: u32);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.256"]
+ fn vpmovuswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.128"]
+ fn vpmovuswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8);
+}
+
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+ use crate::hint::black_box;
+ use crate::mem;
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_abs_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let r = _mm512_abs_epi16(a);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
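+
+ // A minimal illustrative sketch (ours, not part of the upstream diff;
+ // the test name is our own): _mm512_cvtepu8_epi16 zero-extends, so a
+ // 0xff byte widens to 0x00ff rather than sign-extending to 0xffff.
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cvtepu8_epi16_zero_extends() {
+ let a = _mm256_set1_epi8(-1); // every byte is 0xff
+ let r = _mm512_cvtepu8_epi16(a);
+ let e = _mm512_set1_epi16(0x00ff); // zero-extended, not -1
+ assert_eq_m512i(r, e);
+ }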
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_abs_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let r = _mm512_mask_abs_epi16(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_abs_epi16(a, 0b00000000_11111111_00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1,
+ -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_abs_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let r = _mm512_maskz_abs_epi16(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_abs_epi16(0b00000000_11111111_00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_abs_epi16() {
+ let a = _mm256_set1_epi16(-1);
+ let r = _mm256_mask_abs_epi16(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_abs_epi16(a, 0b00000000_11111111, a);
+ let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_abs_epi16() {
+ let a = _mm256_set1_epi16(-1);
+ let r = _mm256_maskz_abs_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_abs_epi16(0b00000000_11111111, a);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_abs_epi16() {
+ let a = _mm_set1_epi16(-1);
+ let r = _mm_mask_abs_epi16(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_abs_epi16(a, 0b00001111, a);
+ let e = _mm_set_epi16(-1, -1, -1, -1, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_abs_epi16() {
+ let a = _mm_set1_epi16(-1);
+ let r = _mm_maskz_abs_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_abs_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_abs_epi8() {
+ let a = _mm512_set1_epi8(-1);
+ let r = _mm512_abs_epi8(a);
+ let e = _mm512_set1_epi8(1);
+ assert_eq_m512i(r, e);
+ }
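+
+ // Illustrative sketch (ours): a byte shift of 16 or more empties every
+ // 128-bit lane of _mm512_bslli_epi128, so the result is all zeros.
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_bslli_epi128_shift_out() {
+ let a = _mm512_set1_epi8(0x7f);
+ let r = _mm512_bslli_epi128::<16>(a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ }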
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_abs_epi8() {
+ let a = _mm512_set1_epi8(-1);
+ let r = _mm512_mask_abs_epi8(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_abs_epi8(
+ a,
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1,
+ -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1,
+ -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1,
+ -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_abs_epi8() {
+ let a = _mm512_set1_epi8(-1);
+ let r = _mm512_maskz_abs_epi8(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_abs_epi8(
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_abs_epi8() {
+ let a = _mm256_set1_epi8(-1);
+ let r = _mm256_mask_abs_epi8(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_abs_epi8(a, 0b00000000_11111111_00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1,
+ -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_abs_epi8() {
+ let a = _mm256_set1_epi8(-1);
+ let r = _mm256_maskz_abs_epi8(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_abs_epi8(0b00000000_11111111_00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_abs_epi8() {
+ let a = _mm_set1_epi8(-1);
+ let r = _mm_mask_abs_epi8(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_abs_epi8(a, 0b00000000_11111111, a);
+ let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_abs_epi8() {
+ let a = _mm_set1_epi8(-1);
+ let r = _mm_maskz_abs_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_abs_epi8(0b00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_add_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_add_epi16(a, b);
+ let e = _mm512_set1_epi16(3);
+ assert_eq_m512i(r, e);
+ }
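+
+ // Illustrative sketch (ours): per Intel's pseudocode, a right byte
+ // shift of 16 or more also empties every 128-bit lane, so
+ // _mm512_bsrli_epi128 must return zero rather than echo the input.
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_bsrli_epi128_shift_out() {
+ let a = _mm512_set1_epi8(0x7f);
+ let r = _mm512_bsrli_epi128::<16>(a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ }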
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_add_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_mask_add_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_add_epi16(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_add_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_add_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_add_epi16(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3,
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_add_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_mask_add_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_add_epi16(a, 0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_add_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_add_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_add_epi16(0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_add_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(2);
+ let r = _mm_mask_add_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_add_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 3, 3, 3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_add_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(2);
+ let r = _mm_maskz_add_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_add_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 3, 3, 3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_add_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ let r = _mm512_add_epi8(a, b);
+ let e = _mm512_set1_epi8(3);
+ assert_eq_m512i(r, e);
+ }
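+
+ // Illustrative sketch (ours) of the alignr boundary semantics from
+ // Intel's pseudocode: imm8 == 16 yields `a` unchanged, and imm8 >= 32
+ // shifts everything out, yielding zero.
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_alignr_epi8_boundaries() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ let r = _mm512_alignr_epi8::<16>(a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_alignr_epi8::<32>(a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ }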
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_add_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ let r = _mm512_mask_add_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_add_epi8(
+ a,
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_add_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ let r = _mm512_maskz_add_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_add_epi8(
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3,
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3,
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3,
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_add_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_mask_add_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_add_epi8(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_add_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_maskz_add_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_add_epi8(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3,
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_add_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(2);
+ let r = _mm_mask_add_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_add_epi8(a, 0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_add_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(2);
+ let r = _mm_maskz_add_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_add_epi8(0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_adds_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(u16::MAX as i16);
+ let r = _mm512_adds_epu16(a, b);
+ let e = _mm512_set1_epi16(u16::MAX as i16);
+ assert_eq_m512i(r, e);
+ }
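+
+ // Illustrative sketch (ours): the truncating vpmovwb store keeps only
+ // the low byte of each word, so i16::MAX (0x7fff) is stored as 0xff
+ // (-1 as i8), unlike the saturating vpmovswb store, which yields 0x7f.
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtepi16_storeu_epi8_sketch() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let mut r = [0i8; 32];
+ _mm512_mask_cvtepi16_storeu_epi8(r.as_mut_ptr(), u32::MAX, a);
+ assert_eq!(r, [-1i8; 32]);
+ }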
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_adds_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(u16::MAX as i16);
+ let r = _mm512_mask_adds_epu16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_adds_epu16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_adds_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(u16::MAX as i16);
+ let r = _mm512_maskz_adds_epu16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_adds_epu16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_adds_epu16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(u16::MAX as i16);
+ let r = _mm256_mask_adds_epu16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_adds_epu16(a, 0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_adds_epu16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(u16::MAX as i16);
+ let r = _mm256_maskz_adds_epu16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_adds_epu16(0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_adds_epu16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(u16::MAX as i16);
+ let r = _mm_mask_adds_epu16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_adds_epu16(a, 0b00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi16(1, 1, 1, 1, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_adds_epu16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(u16::MAX as i16);
+ let r = _mm_maskz_adds_epu16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_adds_epu16(0b00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi16(0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_adds_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(u8::MAX as i8);
+ let r = _mm512_adds_epu8(a, b);
+ let e = _mm512_set1_epi8(u8::MAX as i8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_adds_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(u8::MAX as i8);
+ let r = _mm512_mask_adds_epu8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_adds_epu8(
+ a,
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_adds_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(u8::MAX as i8);
+ let r = _mm512_maskz_adds_epu8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_adds_epu8(
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_adds_epu8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(u8::MAX as i8);
+ let r = _mm256_mask_adds_epu8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_adds_epu8(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_adds_epu8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(u8::MAX as i8);
+ let r = _mm256_maskz_adds_epu8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_adds_epu8(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_adds_epu8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(u8::MAX as i8);
+ let r = _mm_mask_adds_epu8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_adds_epu8(a, 0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_adds_epu8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(u8::MAX as i8);
+ let r = _mm_maskz_adds_epu8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_adds_epu8(0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_adds_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_adds_epi16(a, b);
+ let e = _mm512_set1_epi16(i16::MAX);
+ assert_eq_m512i(r, e);
+ }
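+
+ // Illustrative sketch (ours): the masked saturating store narrows with
+ // signed saturation and writes only the bytes whose mask bit is set;
+ // the rest of the buffer is left untouched.
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtsepi16_storeu_epi8_sketch() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let mut r = [0i8; 32];
+ _mm512_mask_cvtsepi16_storeu_epi8(r.as_mut_ptr(), 0b11111111_11111111_00000000_00000000, a);
+ let mut e = [0i8; 32]; // low 16 mask bits clear: bytes stay zero
+ e[16..].fill(i8::MAX); // high 16 bits set: i16::MAX saturates to 0x7f
+ assert_eq!(r, e);
+ }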
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_adds_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_mask_adds_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_adds_epi16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_adds_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_maskz_adds_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_adds_epi16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_adds_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(i16::MAX);
+ let r = _mm256_mask_adds_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_adds_epi16(a, 0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_adds_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(i16::MAX);
+ let r = _mm256_maskz_adds_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_adds_epi16(0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_adds_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(i16::MAX);
+ let r = _mm_mask_adds_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_adds_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_adds_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(i16::MAX);
+ let r = _mm_maskz_adds_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_adds_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
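+ // Signed saturating 8-bit add: 1 + i8::MAX saturates to i8::MAX.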
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_adds_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(i8::MAX);
+ let r = _mm512_adds_epi8(a, b);
+ let e = _mm512_set1_epi8(i8::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_adds_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(i8::MAX);
+ let r = _mm512_mask_adds_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_adds_epi8(
+ a,
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_adds_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(i8::MAX);
+ let r = _mm512_maskz_adds_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_adds_epi8(
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_adds_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(i8::MAX);
+ let r = _mm256_mask_adds_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_adds_epi8(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_adds_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(i8::MAX);
+ let r = _mm256_maskz_adds_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_adds_epi8(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_adds_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(i8::MAX);
+ let r = _mm_mask_adds_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_adds_epi8(a, 0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_adds_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(i8::MAX);
+ let r = _mm_maskz_adds_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_adds_epi8(0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
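+ // Plain (wrapping) 16-bit subtract: 1 - 2 == -1 in every lane.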
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_sub_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_sub_epi16(a, b);
+ let e = _mm512_set1_epi16(-1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_sub_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_mask_sub_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sub_epi16(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_sub_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_sub_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sub_epi16(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_sub_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_mask_sub_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sub_epi16(a, 0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_sub_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_sub_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sub_epi16(0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_sub_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(2);
+ let r = _mm_mask_sub_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sub_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, -1, -1, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_sub_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(2);
+ let r = _mm_maskz_sub_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sub_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, -1, -1, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
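+ // Plain (wrapping) 8-bit subtract: 1 - 2 == -1 in every lane.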
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_sub_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ let r = _mm512_sub_epi8(a, b);
+ let e = _mm512_set1_epi8(-1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_sub_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ let r = _mm512_mask_sub_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sub_epi8(
+ a,
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_sub_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ let r = _mm512_maskz_sub_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sub_epi8(
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_sub_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_mask_sub_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sub_epi8(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_sub_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_maskz_sub_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sub_epi8(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_sub_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(2);
+ let r = _mm_mask_sub_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sub_epi8(a, 0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_sub_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(2);
+ let r = _mm_maskz_sub_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sub_epi8(0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
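+ // Unsigned saturating 16-bit subtract: 1 - u16::MAX saturates to 0.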
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_subs_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(u16::MAX as i16);
+ let r = _mm512_subs_epu16(a, b);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_subs_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(u16::MAX as i16);
+ let r = _mm512_mask_subs_epu16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_subs_epu16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_subs_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(u16::MAX as i16);
+ let r = _mm512_maskz_subs_epu16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_subs_epu16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_subs_epu16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(u16::MAX as i16);
+ let r = _mm256_mask_subs_epu16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_subs_epu16(a, 0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_subs_epu16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(u16::MAX as i16);
+ let r = _mm256_maskz_subs_epu16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_subs_epu16(0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_subs_epu16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(u16::MAX as i16);
+ let r = _mm_mask_subs_epu16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_subs_epu16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_subs_epu16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(u16::MAX as i16);
+ let r = _mm_maskz_subs_epu16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_subs_epu16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
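+ // Unsigned saturating 8-bit subtract: 1 - u8::MAX saturates to 0.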
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_subs_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(u8::MAX as i8);
+ let r = _mm512_subs_epu8(a, b);
+ let e = _mm512_set1_epi8(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_subs_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(u8::MAX as i8);
+ let r = _mm512_mask_subs_epu8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_subs_epu8(
+ a,
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_subs_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(u8::MAX as i8);
+ let r = _mm512_maskz_subs_epu8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_subs_epu8(
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_subs_epu8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(u8::MAX as i8);
+ let r = _mm256_mask_subs_epu8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_subs_epu8(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_subs_epu8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(u8::MAX as i8);
+ let r = _mm256_maskz_subs_epu8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_subs_epu8(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_subs_epu8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(u8::MAX as i8);
+ let r = _mm_mask_subs_epu8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_subs_epu8(a, 0b00000000_00001111, a, b);
+ let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_subs_epu8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(u8::MAX as i8);
+ let r = _mm_maskz_subs_epu8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_subs_epu8(0b00000000_00001111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
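+ // Signed saturating 16-bit subtract: -1 - i16::MAX saturates to i16::MIN.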
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_subs_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_subs_epi16(a, b);
+ let e = _mm512_set1_epi16(i16::MIN);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_subs_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_mask_subs_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_subs_epi16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_subs_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_maskz_subs_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_subs_epi16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_subs_epi16() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(i16::MAX);
+ let r = _mm256_mask_subs_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_subs_epi16(a, 0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_subs_epi16() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(i16::MAX);
+ let r = _mm256_maskz_subs_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_subs_epi16(0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_subs_epi16() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(i16::MAX);
+ let r = _mm_mask_subs_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_subs_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(-1, -1, -1, -1, i16::MIN, i16::MIN, i16::MIN, i16::MIN);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_subs_epi16() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(i16::MAX);
+ let r = _mm_maskz_subs_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_subs_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, i16::MIN, i16::MIN, i16::MIN, i16::MIN);
+ assert_eq_m128i(r, e);
+ }
+
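+ // Signed saturating 8-bit subtract: -1 - i8::MAX saturates to i8::MIN.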
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_subs_epi8() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(i8::MAX);
+ let r = _mm512_subs_epi8(a, b);
+ let e = _mm512_set1_epi8(i8::MIN);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_subs_epi8() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(i8::MAX);
+ let r = _mm512_mask_subs_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_subs_epi8(
+ a,
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_subs_epi8() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(i8::MAX);
+ let r = _mm512_maskz_subs_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_subs_epi8(
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_subs_epi8() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(i8::MAX);
+ let r = _mm256_mask_subs_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_subs_epi8(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_subs_epi8() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(i8::MAX);
+ let r = _mm256_maskz_subs_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_subs_epi8(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_subs_epi8() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(i8::MAX);
+ let r = _mm_mask_subs_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_subs_epi8(a, 0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, i8::MIN, i8::MIN, i8::MIN, i8::MIN);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_subs_epi8() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(i8::MAX);
+ let r = _mm_maskz_subs_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_subs_epi8(0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MIN, i8::MIN, i8::MIN);
+ assert_eq_m128i(r, e);
+ }
+
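+ // Unsigned high multiply: each lane holds the upper 16 bits of the 32-bit
+ // product, so 1 * 1 yields 0.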
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mulhi_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mulhi_epu16(a, b);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_mulhi_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mask_mulhi_epu16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_mulhi_epu16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_mulhi_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_mulhi_epu16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mulhi_epu16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_mulhi_epu16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_mask_mulhi_epu16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_mulhi_epu16(a, 0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_mulhi_epu16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_mulhi_epu16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mulhi_epu16(0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_mulhi_epu16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_mask_mulhi_epu16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_mulhi_epu16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_mulhi_epu16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_maskz_mulhi_epu16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mulhi_epu16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
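+ // Signed high multiply: upper 16 bits of the 32-bit product; 1 * 1 yields 0.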
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mulhi_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mulhi_epi16(a, b);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_mulhi_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mask_mulhi_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_mulhi_epi16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_mulhi_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_mulhi_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mulhi_epi16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_mulhi_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_mask_mulhi_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_mulhi_epi16(a, 0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_mulhi_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_mulhi_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mulhi_epi16(0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_mulhi_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_mask_mulhi_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_mulhi_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_mulhi_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_maskz_mulhi_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mulhi_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
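+ // Fixed-point rounding multiply: each lane is (((a * b) >> 14) + 1) >> 1,
+ // so 1 * 1 rounds down to 0.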
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mulhrs_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mulhrs_epi16(a, b);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_mulhrs_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mask_mulhrs_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_mulhrs_epi16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_mulhrs_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_mulhrs_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mulhrs_epi16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_mulhrs_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_mask_mulhrs_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_mulhrs_epi16(a, 0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_mulhrs_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_mulhrs_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mulhrs_epi16(0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_mulhrs_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_mask_mulhrs_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_mulhrs_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_mulhrs_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_maskz_mulhrs_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mulhrs_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
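+ // Low multiply: each lane keeps the lower 16 bits of the product; 1 * 1 == 1.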
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mullo_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mullo_epi16(a, b);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_mullo_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mask_mullo_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_mullo_epi16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_mullo_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_mullo_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mullo_epi16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_mullo_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_mask_mullo_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_mullo_epi16(a, 0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_mullo_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_mullo_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mullo_epi16(0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_mullo_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_mask_mullo_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_mullo_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_mullo_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_maskz_mullo_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mullo_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
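+ // Element-wise unsigned 16-bit maximum of 0..=15 against its reverse.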
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_max_epu16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_max_epu16(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15,
+ 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_max_epu16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_max_epu16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_max_epu16(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_max_epu16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_max_epu16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_max_epu16(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_max_epu16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_max_epu16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_max_epu16(a, 0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_max_epu16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_max_epu16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_max_epu16(0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_max_epu16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_mask_max_epu16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_max_epu16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_max_epu16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_maskz_max_epu16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_max_epu16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
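+ // Element-wise unsigned 8-bit maximum.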
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_max_epu8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_max_epu8(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15,
+ 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15,
+ 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15,
+ 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_max_epu8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_max_epu8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_max_epu8(
+ a,
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_max_epu8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_max_epu8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_max_epu8(
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_max_epu8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_max_epu8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_max_epu8(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_max_epu8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_max_epu8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_max_epu8(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_max_epu8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_mask_max_epu8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_max_epu8(a, 0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_max_epu8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_maskz_max_epu8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_max_epu8(0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
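+ // Element-wise signed 16-bit maximum.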
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_max_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_max_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15,
+ 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_max_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_max_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_max_epi16(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_max_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_max_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_max_epi16(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_max_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_max_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_max_epi16(a, 0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_max_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_max_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_max_epi16(0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_max_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_mask_max_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_max_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_max_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_maskz_max_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_max_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
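+ // Element-wise signed 8-bit maximum.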
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_max_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_max_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15,
+ 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15,
+ 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15,
+ 15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_max_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_max_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_max_epi8(
+ a,
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_max_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_max_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_max_epi8(
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_max_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_max_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_max_epi8(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_max_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_max_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_max_epi8(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_max_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_mask_max_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_max_epi8(a, 0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_max_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_maskz_max_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_max_epi8(0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
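+ // Element-wise unsigned 16-bit minimum of 0..=15 against its reverse.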
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_min_epu16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_min_epu16(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_min_epu16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_min_epu16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_min_epu16(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_min_epu16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_min_epu16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_min_epu16(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_min_epu16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_min_epu16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_min_epu16(a, 0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_min_epu16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_min_epu16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_min_epu16(0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_min_epu16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_mask_min_epu16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_min_epu16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(0, 1, 2, 3, 3, 2, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_min_epu16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_maskz_min_epu16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_min_epu16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
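+ // Element-wise unsigned 8-bit minimum.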
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_min_epu8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_min_epu8(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_min_epu8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_min_epu8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_min_epu8(
+ a,
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_min_epu8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_min_epu8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_min_epu8(
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_min_epu8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_min_epu8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_min_epu8(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_min_epu8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_min_epu8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_min_epu8(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_min_epu8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_mask_min_epu8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_min_epu8(a, 0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_min_epu8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_maskz_min_epu8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_min_epu8(0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_min_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_min_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_min_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_min_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_min_epi16(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_min_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_min_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_min_epi16(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_min_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_min_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_min_epi16(a, 0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_min_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_min_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_min_epi16(0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_min_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_mask_min_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_min_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(0, 1, 2, 3, 3, 2, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_min_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_maskz_min_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_min_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_min_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_min_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_min_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_min_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_min_epi8(
+ a,
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_min_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_min_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_min_epi8(
+ 0b00000000_11111111_00000000_11111111_00000000_11111111_00000000_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_min_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_min_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_min_epi8(a, 0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_min_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_min_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_min_epi8(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_min_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_mask_min_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_min_epi8(a, 0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_min_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_maskz_min_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_min_epi8(0b00000000_11111111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
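+    // The `cmplt_epu*` comparisons are unsigned: reinterpreted as `u16`/`u8`,
+    // -2 is 0xFF..FE and -1 is 0xFF..FF, so every lane compares less-than and
+    // the all-ones mask is returned. The `mask_` variants AND the comparison
+    // result with the supplied mask.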
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmplt_epu16_mask() {
+ let a = _mm512_set1_epi16(-2);
+ let b = _mm512_set1_epi16(-1);
+ let m = _mm512_cmplt_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmplt_epu16_mask() {
+ let a = _mm512_set1_epi16(-2);
+ let b = _mm512_set1_epi16(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmplt_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmplt_epu16_mask() {
+ let a = _mm256_set1_epi16(-2);
+ let b = _mm256_set1_epi16(-1);
+ let m = _mm256_cmplt_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmplt_epu16_mask() {
+ let a = _mm256_set1_epi16(-2);
+ let b = _mm256_set1_epi16(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmplt_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmplt_epu16_mask() {
+ let a = _mm_set1_epi16(-2);
+ let b = _mm_set1_epi16(-1);
+ let m = _mm_cmplt_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmplt_epu16_mask() {
+ let a = _mm_set1_epi16(-2);
+ let b = _mm_set1_epi16(-1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmplt_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmplt_epu8_mask() {
+ let a = _mm512_set1_epi8(-2);
+ let b = _mm512_set1_epi8(-1);
+ let m = _mm512_cmplt_epu8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmplt_epu8_mask() {
+ let a = _mm512_set1_epi8(-2);
+ let b = _mm512_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmplt_epu8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmplt_epu8_mask() {
+ let a = _mm256_set1_epi8(-2);
+ let b = _mm256_set1_epi8(-1);
+ let m = _mm256_cmplt_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmplt_epu8_mask() {
+ let a = _mm256_set1_epi8(-2);
+ let b = _mm256_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmplt_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmplt_epu8_mask() {
+ let a = _mm_set1_epi8(-2);
+ let b = _mm_set1_epi8(-1);
+ let m = _mm_cmplt_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmplt_epu8_mask() {
+ let a = _mm_set1_epi8(-2);
+ let b = _mm_set1_epi8(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmplt_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmplt_epi16_mask() {
+ let a = _mm512_set1_epi16(-2);
+ let b = _mm512_set1_epi16(-1);
+ let m = _mm512_cmplt_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmplt_epi16_mask() {
+ let a = _mm512_set1_epi16(-2);
+ let b = _mm512_set1_epi16(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmplt_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmplt_epi16_mask() {
+ let a = _mm256_set1_epi16(-2);
+ let b = _mm256_set1_epi16(-1);
+ let m = _mm256_cmplt_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmplt_epi16_mask() {
+ let a = _mm256_set1_epi16(-2);
+ let b = _mm256_set1_epi16(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmplt_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmplt_epi16_mask() {
+ let a = _mm_set1_epi16(-2);
+ let b = _mm_set1_epi16(-1);
+ let m = _mm_cmplt_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmplt_epi16_mask() {
+ let a = _mm_set1_epi16(-2);
+ let b = _mm_set1_epi16(-1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmplt_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmplt_epi8_mask() {
+ let a = _mm512_set1_epi8(-2);
+ let b = _mm512_set1_epi8(-1);
+ let m = _mm512_cmplt_epi8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmplt_epi8_mask() {
+ let a = _mm512_set1_epi8(-2);
+ let b = _mm512_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmplt_epi8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmplt_epi8_mask() {
+ let a = _mm256_set1_epi8(-2);
+ let b = _mm256_set1_epi8(-1);
+ let m = _mm256_cmplt_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmplt_epi8_mask() {
+ let a = _mm256_set1_epi8(-2);
+ let b = _mm256_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmplt_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmplt_epi8_mask() {
+ let a = _mm_set1_epi8(-2);
+ let b = _mm_set1_epi8(-1);
+ let m = _mm_cmplt_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmplt_epi8_mask() {
+ let a = _mm_set1_epi8(-2);
+ let b = _mm_set1_epi8(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmplt_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
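+    // `cmpgt_epu*` is likewise unsigned (2 > 1 in every lane); the signed
+    // `cmpgt_epi*` tests further below check 2 > -1 instead.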
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpgt_epu16_mask() {
+ let a = _mm512_set1_epi16(2);
+ let b = _mm512_set1_epi16(1);
+ let m = _mm512_cmpgt_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpgt_epu16_mask() {
+ let a = _mm512_set1_epi16(2);
+ let b = _mm512_set1_epi16(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpgt_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpgt_epu16_mask() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(1);
+ let m = _mm256_cmpgt_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpgt_epu16_mask() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmpgt_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpgt_epu16_mask() {
+ let a = _mm_set1_epi16(2);
+ let b = _mm_set1_epi16(1);
+ let m = _mm_cmpgt_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpgt_epu16_mask() {
+ let a = _mm_set1_epi16(2);
+ let b = _mm_set1_epi16(1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmpgt_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpgt_epu8_mask() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(1);
+ let m = _mm512_cmpgt_epu8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpgt_epu8_mask() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpgt_epu8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpgt_epu8_mask() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(1);
+ let m = _mm256_cmpgt_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpgt_epu8_mask() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmpgt_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpgt_epu8_mask() {
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(1);
+ let m = _mm_cmpgt_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpgt_epu8_mask() {
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmpgt_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpgt_epi16_mask() {
+ let a = _mm512_set1_epi16(2);
+ let b = _mm512_set1_epi16(-1);
+ let m = _mm512_cmpgt_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpgt_epi16_mask() {
+ let a = _mm512_set1_epi16(2);
+ let b = _mm512_set1_epi16(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpgt_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpgt_epi16_mask() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(-1);
+ let m = _mm256_cmpgt_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpgt_epi16_mask() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(-1);
+        let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmpgt_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpgt_epi16_mask() {
+ let a = _mm_set1_epi16(2);
+ let b = _mm_set1_epi16(-1);
+ let m = _mm_cmpgt_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpgt_epi16_mask() {
+ let a = _mm_set1_epi16(2);
+ let b = _mm_set1_epi16(-1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmpgt_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpgt_epi8_mask() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(-1);
+ let m = _mm512_cmpgt_epi8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpgt_epi8_mask() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpgt_epi8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpgt_epi8_mask() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(-1);
+ let m = _mm256_cmpgt_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpgt_epi8_mask() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmpgt_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpgt_epi8_mask() {
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(-1);
+ let m = _mm_cmpgt_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpgt_epi8_mask() {
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmpgt_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
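+    // Equal operands satisfy less-than-or-equal, so comparing a vector with
+    // itself yields the all-ones mask.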
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmple_epu16_mask() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(-1);
+ let m = _mm512_cmple_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmple_epu16_mask() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmple_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmple_epu16_mask() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(-1);
+ let m = _mm256_cmple_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmple_epu16_mask() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmple_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmple_epu16_mask() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(-1);
+ let m = _mm_cmple_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmple_epu16_mask() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(-1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmple_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmple_epu8_mask() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(-1);
+ let m = _mm512_cmple_epu8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmple_epu8_mask() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmple_epu8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmple_epu8_mask() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(-1);
+ let m = _mm256_cmple_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmple_epu8_mask() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmple_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmple_epu8_mask() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(-1);
+ let m = _mm_cmple_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmple_epu8_mask() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmple_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmple_epi16_mask() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(-1);
+ let m = _mm512_cmple_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmple_epi16_mask() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmple_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmple_epi16_mask() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(-1);
+ let m = _mm256_cmple_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmple_epi16_mask() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmple_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmple_epi16_mask() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(-1);
+ let m = _mm_cmple_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmple_epi16_mask() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(-1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmple_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmple_epi8_mask() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(-1);
+ let m = _mm512_cmple_epi8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmple_epi8_mask() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmple_epi8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmple_epi8_mask() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(-1);
+ let m = _mm256_cmple_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmple_epi8_mask() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmple_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmple_epi8_mask() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(-1);
+ let m = _mm_cmple_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmple_epi8_mask() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmple_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
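+    // Likewise, equal operands satisfy greater-than-or-equal.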
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpge_epu16_mask() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let m = _mm512_cmpge_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpge_epu16_mask() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpge_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpge_epu16_mask() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let m = _mm256_cmpge_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpge_epu16_mask() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmpge_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpge_epu16_mask() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let m = _mm_cmpge_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpge_epu16_mask() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmpge_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpge_epu8_mask() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let m = _mm512_cmpge_epu8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpge_epu8_mask() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpge_epu8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpge_epu8_mask() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let m = _mm256_cmpge_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpge_epu8_mask() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmpge_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpge_epu8_mask() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let m = _mm_cmpge_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpge_epu8_mask() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmpge_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpge_epi16_mask() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(-1);
+ let m = _mm512_cmpge_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpge_epi16_mask() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpge_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpge_epi16_mask() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(-1);
+ let m = _mm256_cmpge_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpge_epi16_mask() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmpge_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpge_epi16_mask() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(-1);
+ let m = _mm_cmpge_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpge_epi16_mask() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(-1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmpge_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpge_epi8_mask() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(-1);
+ let m = _mm512_cmpge_epi8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpge_epi8_mask() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpge_epi8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpge_epi8_mask() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(-1);
+ let m = _mm256_cmpge_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpge_epi8_mask() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmpge_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpge_epi8_mask() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(-1);
+ let m = _mm_cmpge_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpge_epi8_mask() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmpge_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
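+    // Equality does not depend on signedness: the `epu` and `epi` variants of
+    // `cmpeq` produce identical masks for the same bit patterns.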
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpeq_epu16_mask() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let m = _mm512_cmpeq_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpeq_epu16_mask() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpeq_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpeq_epu16_mask() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let m = _mm256_cmpeq_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpeq_epu16_mask() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmpeq_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpeq_epu16_mask() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let m = _mm_cmpeq_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpeq_epu16_mask() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmpeq_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpeq_epu8_mask() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let m = _mm512_cmpeq_epu8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpeq_epu8_mask() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpeq_epu8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpeq_epu8_mask() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let m = _mm256_cmpeq_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpeq_epu8_mask() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmpeq_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpeq_epu8_mask() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let m = _mm_cmpeq_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpeq_epu8_mask() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmpeq_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpeq_epi16_mask() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(-1);
+ let m = _mm512_cmpeq_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpeq_epi16_mask() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpeq_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpeq_epi16_mask() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(-1);
+ let m = _mm256_cmpeq_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpeq_epi16_mask() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmpeq_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpeq_epi16_mask() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(-1);
+ let m = _mm_cmpeq_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpeq_epi16_mask() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(-1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmpeq_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpeq_epi8_mask() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(-1);
+ let m = _mm512_cmpeq_epi8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpeq_epi8_mask() {
+ let a = _mm512_set1_epi8(-1);
+ let b = _mm512_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpeq_epi8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpeq_epi8_mask() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(-1);
+ let m = _mm256_cmpeq_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpeq_epi8_mask() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmpeq_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpeq_epi8_mask() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(-1);
+ let m = _mm_cmpeq_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpeq_epi8_mask() {
+ let a = _mm_set1_epi8(-1);
+ let b = _mm_set1_epi8(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmpeq_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
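+    // `cmpneq` is the complement of `cmpeq`; unequal inputs yield the
+    // all-ones mask.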
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpneq_epu16_mask() {
+ let a = _mm512_set1_epi16(2);
+ let b = _mm512_set1_epi16(1);
+ let m = _mm512_cmpneq_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpneq_epu16_mask() {
+ let a = _mm512_set1_epi16(2);
+ let b = _mm512_set1_epi16(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpneq_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpneq_epu16_mask() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(1);
+ let m = _mm256_cmpneq_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpneq_epu16_mask() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmpneq_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpneq_epu16_mask() {
+ let a = _mm_set1_epi16(2);
+ let b = _mm_set1_epi16(1);
+ let m = _mm_cmpneq_epu16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpneq_epu16_mask() {
+ let a = _mm_set1_epi16(2);
+ let b = _mm_set1_epi16(1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmpneq_epu16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpneq_epu8_mask() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(1);
+ let m = _mm512_cmpneq_epu8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpneq_epu8_mask() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpneq_epu8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpneq_epu8_mask() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(1);
+ let m = _mm256_cmpneq_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpneq_epu8_mask() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmpneq_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpneq_epu8_mask() {
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(1);
+ let m = _mm_cmpneq_epu8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpneq_epu8_mask() {
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmpneq_epu8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpneq_epi16_mask() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(-1);
+ let m = _mm512_cmpneq_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpneq_epi16_mask() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpneq_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpneq_epi16_mask() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(-1);
+ let m = _mm256_cmpneq_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpneq_epi16_mask() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmpneq_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpneq_epi16_mask() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(-1);
+ let m = _mm_cmpneq_epi16_mask(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpneq_epi16_mask() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(-1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmpneq_epi16_mask(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmpneq_epi8_mask() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(-1);
+ let m = _mm512_cmpneq_epi8_mask(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmpneq_epi8_mask() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmpneq_epi8_mask(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmpneq_epi8_mask() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(-1);
+ let m = _mm256_cmpneq_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmpneq_epi8_mask() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(-1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmpneq_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmpneq_epi8_mask() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(-1);
+ let m = _mm_cmpneq_epi8_mask(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmpneq_epi8_mask() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(-1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmpneq_epi8_mask(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
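+    // The generic `cmp_*` intrinsics take the comparison predicate as a const
+    // generic; `_MM_CMPINT_LT` selects less-than, interpreted as unsigned or
+    // signed according to the `epu`/`epi` variant.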
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmp_epu16_mask() {
+ let a = _mm512_set1_epi16(0);
+ let b = _mm512_set1_epi16(1);
+ let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmp_epu16_mask() {
+ let a = _mm512_set1_epi16(0);
+ let b = _mm512_set1_epi16(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmp_epu16_mask() {
+ let a = _mm256_set1_epi16(0);
+ let b = _mm256_set1_epi16(1);
+ let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_epu16_mask() {
+ let a = _mm256_set1_epi16(0);
+ let b = _mm256_set1_epi16(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmp_epu16_mask() {
+ let a = _mm_set1_epi16(0);
+ let b = _mm_set1_epi16(1);
+ let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmp_epu16_mask() {
+ let a = _mm_set1_epi16(0);
+ let b = _mm_set1_epi16(1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmp_epu8_mask() {
+ let a = _mm512_set1_epi8(0);
+ let b = _mm512_set1_epi8(1);
+ let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmp_epu8_mask() {
+ let a = _mm512_set1_epi8(0);
+ let b = _mm512_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmp_epu8_mask() {
+ let a = _mm256_set1_epi8(0);
+ let b = _mm256_set1_epi8(1);
+ let m = _mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_epu8_mask() {
+ let a = _mm256_set1_epi8(0);
+ let b = _mm256_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmp_epu8_mask() {
+ let a = _mm_set1_epi8(0);
+ let b = _mm_set1_epi8(1);
+ let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmp_epu8_mask() {
+ let a = _mm_set1_epi8(0);
+ let b = _mm_set1_epi8(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmp_epi16_mask() {
+ let a = _mm512_set1_epi16(0);
+ let b = _mm512_set1_epi16(1);
+ let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmp_epi16_mask() {
+ let a = _mm512_set1_epi16(0);
+ let b = _mm512_set1_epi16(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmp_epi16_mask() {
+ let a = _mm256_set1_epi16(0);
+ let b = _mm256_set1_epi16(1);
+ let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_epi16_mask() {
+ let a = _mm256_set1_epi16(0);
+ let b = _mm256_set1_epi16(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmp_epi16_mask() {
+ let a = _mm_set1_epi16(0);
+ let b = _mm_set1_epi16(1);
+ let m = _mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmp_epi16_mask() {
+ let a = _mm_set1_epi16(0);
+ let b = _mm_set1_epi16(1);
+ let mask = 0b01010101;
+ let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cmp_epi8_mask() {
+ let a = _mm512_set1_epi8(0);
+ let b = _mm512_set1_epi8(1);
+ let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(
+ m,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+ );
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cmp_epi8_mask() {
+ let a = _mm512_set1_epi8(0);
+ let b = _mm512_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+ let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(
+ r,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+ );
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cmp_epi8_mask() {
+ let a = _mm256_set1_epi8(0);
+ let b = _mm256_set1_epi8(1);
+ let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_epi8_mask() {
+ let a = _mm256_set1_epi8(0);
+ let b = _mm256_set1_epi8(1);
+ let mask = 0b01010101_01010101_01010101_01010101;
+ let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cmp_epi8_mask() {
+ let a = _mm_set1_epi8(0);
+ let b = _mm_set1_epi8(1);
+ let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11111111_11111111);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cmp_epi8_mask() {
+ let a = _mm_set1_epi8(0);
+ let b = _mm_set1_epi8(1);
+ let mask = 0b01010101_01010101;
+ let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01010101_01010101);
+ }
+
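+ // `_mm*_loadu_*` performs an unaligned load. Note that `_mm*_set_epi*`
+ // lists elements from the highest index down to index 0, so the expected
+ // vectors below enumerate the source arrays in reverse order.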
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_loadu_epi16() {
+ #[rustfmt::skip]
+ let a: [i16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let r = _mm512_loadu_epi16(&a[0]);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_loadu_epi16() {
+ let a: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let r = _mm256_loadu_epi16(&a[0]);
+ let e = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_loadu_epi16() {
+ let a: [i16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let r = _mm_loadu_epi16(&a[0]);
+ let e = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_loadu_epi8() {
+ #[rustfmt::skip]
+ let a: [i8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let r = _mm512_loadu_epi8(&a[0]);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_loadu_epi8() {
+ #[rustfmt::skip]
+ let a: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
+ let r = _mm256_loadu_epi8(&a[0]);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_loadu_epi8() {
+ let a: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let r = _mm_loadu_epi8(&a[0]);
+ let e = _mm_set_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_storeu_epi16() {
+ let a = _mm512_set1_epi16(9);
+ let mut r = _mm512_undefined_epi32();
+ _mm512_storeu_epi16(&mut r as *mut _ as *mut i16, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_storeu_epi16() {
+ let a = _mm256_set1_epi16(9);
+ let mut r = _mm256_set1_epi32(0);
+ _mm256_storeu_epi16(&mut r as *mut _ as *mut i16, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_storeu_epi16() {
+ let a = _mm_set1_epi16(9);
+ let mut r = _mm_set1_epi32(0);
+ _mm_storeu_epi16(&mut r as *mut _ as *mut i16, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_storeu_epi8() {
+ let a = _mm512_set1_epi8(9);
+ let mut r = _mm512_undefined_epi32();
+ _mm512_storeu_epi8(&mut r as *mut _ as *mut i8, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_storeu_epi8() {
+ let a = _mm256_set1_epi8(9);
+ let mut r = _mm256_set1_epi32(0);
+ _mm256_storeu_epi8(&mut r as *mut _ as *mut i8, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_storeu_epi8() {
+ let a = _mm_set1_epi8(9);
+ let mut r = _mm_set1_epi32(0);
+ _mm_storeu_epi8(&mut r as *mut _ as *mut i8, a);
+ assert_eq_m128i(r, a);
+ }
+
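+ // Masked loads and stores touch only the elements whose mask bit is set:
+ // bit i of the mask selects element i (the lowest-addressed element
+ // first). `_mask_` variants merge unselected elements from `src`,
+ // `_maskz_` variants zero them, and masked stores leave unselected
+ // memory untouched.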
+ #[simd_test(enable = "avx512f,avx512bw")]
+ unsafe fn test_mm512_mask_loadu_epi16() {
+ let src = _mm512_set1_epi16(42);
+ let a = &[
+ 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let p = a.as_ptr();
+ let m = 0b10101010_11001100_11101000_11001010;
+ let r = _mm512_mask_loadu_epi16(src, m, black_box(p));
+ let e = &[
+ 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42,
+ 23, 24, 42, 26, 42, 28, 42, 30, 42, 32,
+ ];
+ let e = _mm512_loadu_epi16(e.as_ptr());
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw")]
+ unsafe fn test_mm512_maskz_loadu_epi16() {
+ let a = &[
+ 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let p = a.as_ptr();
+ let m = 0b10101010_11001100_11101000_11001010;
+ let r = _mm512_maskz_loadu_epi16(m, black_box(p));
+ let e = &[
+ 0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0,
+ 26, 0, 28, 0, 30, 0, 32,
+ ];
+ let e = _mm512_loadu_epi16(e.as_ptr());
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw")]
+ unsafe fn test_mm512_mask_storeu_epi16() {
+ let mut r = [42_i16; 32];
+ let a = &[
+ 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let a = _mm512_loadu_epi16(a.as_ptr());
+ let m = 0b10101010_11001100_11101000_11001010;
+ _mm512_mask_storeu_epi16(r.as_mut_ptr(), m, a);
+ let e = &[
+ 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42,
+ 23, 24, 42, 26, 42, 28, 42, 30, 42, 32,
+ ];
+ let e = _mm512_loadu_epi16(e.as_ptr());
+ assert_eq_m512i(_mm512_loadu_epi16(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw")]
+ unsafe fn test_mm512_mask_loadu_epi8() {
+ let src = _mm512_set1_epi8(42);
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ ];
+ let p = a.as_ptr();
+ let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010;
+ let r = _mm512_mask_loadu_epi8(src, m, black_box(p));
+ let e = &[
+ 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42,
+ 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42,
+ ];
+ let e = _mm512_loadu_epi8(e.as_ptr());
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw")]
+ unsafe fn test_mm512_maskz_loadu_epi8() {
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ ];
+ let p = a.as_ptr();
+ let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010;
+ let r = _mm512_maskz_loadu_epi8(m, black_box(p));
+ let e = &[
+ 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0,
+ 26, 0, 28, 0, 30, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0,
+ ];
+ let e = _mm512_loadu_epi8(e.as_ptr());
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw")]
+ unsafe fn test_mm512_mask_storeu_epi8() {
+ let mut r = [42_i8; 64];
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ ];
+ let a = _mm512_loadu_epi8(a.as_ptr());
+ let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010;
+ _mm512_mask_storeu_epi8(r.as_mut_ptr(), m, a);
+ let e = &[
+ 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42,
+ 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42,
+ ];
+ let e = _mm512_loadu_epi8(e.as_ptr());
+ assert_eq_m512i(_mm512_loadu_epi8(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_loadu_epi16() {
+ let src = _mm256_set1_epi16(42);
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm256_mask_loadu_epi16(src, m, black_box(p));
+ let e = &[
+ 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16,
+ ];
+ let e = _mm256_loadu_epi16(e.as_ptr());
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_loadu_epi16() {
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm256_maskz_loadu_epi16(m, black_box(p));
+ let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16];
+ let e = _mm256_loadu_epi16(e.as_ptr());
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_storeu_epi16() {
+ let mut r = [42_i16; 16];
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let a = _mm256_loadu_epi16(a.as_ptr());
+ let m = 0b11101000_11001010;
+ _mm256_mask_storeu_epi16(r.as_mut_ptr(), m, a);
+ let e = &[
+ 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16,
+ ];
+ let e = _mm256_loadu_epi16(e.as_ptr());
+ assert_eq_m256i(_mm256_loadu_epi16(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_loadu_epi8() {
+ let src = _mm256_set1_epi8(42);
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let p = a.as_ptr();
+ let m = 0b10101010_11001100_11101000_11001010;
+ let r = _mm256_mask_loadu_epi8(src, m, black_box(p));
+ let e = &[
+ 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42,
+ 23, 24, 42, 26, 42, 28, 42, 30, 42, 32,
+ ];
+ let e = _mm256_loadu_epi8(e.as_ptr());
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_loadu_epi8() {
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let p = a.as_ptr();
+ let m = 0b10101010_11001100_11101000_11001010;
+ let r = _mm256_maskz_loadu_epi8(m, black_box(p));
+ let e = &[
+ 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0,
+ 26, 0, 28, 0, 30, 0, 32,
+ ];
+ let e = _mm256_loadu_epi8(e.as_ptr());
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_storeu_epi8() {
+ let mut r = [42_i8; 32];
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let a = _mm256_loadu_epi8(a.as_ptr());
+ let m = 0b10101010_11001100_11101000_11001010;
+ _mm256_mask_storeu_epi8(r.as_mut_ptr(), m, a);
+ let e = &[
+ 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42,
+ 23, 24, 42, 26, 42, 28, 42, 30, 42, 32,
+ ];
+ let e = _mm256_loadu_epi8(e.as_ptr());
+ assert_eq_m256i(_mm256_loadu_epi8(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_loadu_epi16() {
+ let src = _mm_set1_epi16(42);
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm_mask_loadu_epi16(src, m, black_box(p));
+ let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8];
+ let e = _mm_loadu_epi16(e.as_ptr());
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_loadu_epi16() {
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm_maskz_loadu_epi16(m, black_box(p));
+ let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8];
+ let e = _mm_loadu_epi16(e.as_ptr());
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_storeu_epi16() {
+ let mut r = [42_i16; 8];
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8];
+ let a = _mm_loadu_epi16(a.as_ptr());
+ let m = 0b11001010;
+ _mm_mask_storeu_epi16(r.as_mut_ptr(), m, a);
+ let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8];
+ let e = _mm_loadu_epi16(e.as_ptr());
+ assert_eq_m128i(_mm_loadu_epi16(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_loadu_epi8() {
+ let src = _mm_set1_epi8(42);
+ let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm_mask_loadu_epi8(src, m, black_box(p));
+ let e = &[
+ 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16,
+ ];
+ let e = _mm_loadu_epi8(e.as_ptr());
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_loadu_epi8() {
+ let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm_maskz_loadu_epi8(m, black_box(p));
+ let e = &[0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16];
+ let e = _mm_loadu_epi8(e.as_ptr());
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_storeu_epi8() {
+ let mut r = [42_i8; 16];
+ let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let a = _mm_loadu_epi8(a.as_ptr());
+ let m = 0b11101000_11001010;
+ _mm_mask_storeu_epi8(r.as_mut_ptr(), m, a);
+ let e = &[
+ 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16,
+ ];
+ let e = _mm_loadu_epi8(e.as_ptr());
+ assert_eq_m128i(_mm_loadu_epi8(r.as_ptr()), e);
+ }
+
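+ // `madd_epi16` multiplies corresponding signed 16-bit elements and
+ // horizontally adds adjacent pairs of products into 32-bit lanes, so
+ // all-ones inputs yield 1*1 + 1*1 = 2 per lane. Unselected 32-bit lanes
+ // keep `src`, whose all-ones epi16 pattern reads as 1 << 16 | 1.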
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_madd_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_madd_epi16(a, b);
+ let e = _mm512_set1_epi32(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_madd_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mask_madd_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_madd_epi16(a, 0b00000000_00001111, a, b);
+ let e = _mm512_set_epi32(
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 2,
+ 2,
+ 2,
+ 2,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_madd_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_madd_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_madd_epi16(0b00000000_00001111, a, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_madd_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_mask_madd_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_madd_epi16(a, 0b00001111, a, b);
+ let e = _mm256_set_epi32(
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 1 << 16 | 1,
+ 2,
+ 2,
+ 2,
+ 2,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_madd_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_madd_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_madd_epi16(0b00001111, a, b);
+ let e = _mm256_set_epi32(0, 0, 0, 0, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_madd_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_mask_madd_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_madd_epi16(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(2, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_madd_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_maskz_madd_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_madd_epi16(0b00001111, a, b);
+ let e = _mm_set_epi32(2, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
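+ // `maddubs_epi16` multiplies unsigned bytes from `a` by signed bytes
+ // from `b` and adds adjacent products with signed saturation into 16-bit
+ // lanes; all-ones inputs again produce 2 per lane.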
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maddubs_epi16() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_maddubs_epi16(a, b);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_maddubs_epi16() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let src = _mm512_set1_epi16(1);
+ let r = _mm512_mask_maddubs_epi16(src, 0, a, b);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_maddubs_epi16(src, 0b00000000_00000000_00000000_00000001, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_maddubs_epi16() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_maskz_maddubs_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_maddubs_epi16(0b00000000_11111111_00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_maddubs_epi16() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let src = _mm256_set1_epi16(1);
+ let r = _mm256_mask_maddubs_epi16(src, 0, a, b);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_maddubs_epi16(src, 0b00000000_00000001, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_maddubs_epi16() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_maskz_maddubs_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_maddubs_epi16(0b00000000_11111111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_maddubs_epi16() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let src = _mm_set1_epi16(1);
+ let r = _mm_mask_maddubs_epi16(src, 0, a, b);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_maddubs_epi16(src, 0b00000001, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_maddubs_epi16() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_maskz_maddubs_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_maddubs_epi16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 2, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
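+ // `packs_epi32` narrows 32-bit elements to 16 bits with signed
+ // saturation (i32::MAX -> i16::MAX) and interleaves `a` and `b` per
+ // 128-bit lane: within each lane the low half comes from `a` and the
+ // high half from `b`.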
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_packs_epi32() {
+ let a = _mm512_set1_epi32(i32::MAX);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_packs_epi32(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX,
+ 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_packs_epi32() {
+ let a = _mm512_set1_epi32(i32::MAX);
+ let b = _mm512_set1_epi32(1 << 16 | 1);
+ let r = _mm512_mask_packs_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_packs_epi32(b, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_packs_epi32() {
+ let a = _mm512_set1_epi32(i32::MAX);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_maskz_packs_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_packs_epi32(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_packs_epi32() {
+ let a = _mm256_set1_epi32(i32::MAX);
+ let b = _mm256_set1_epi32(1 << 16 | 1);
+ let r = _mm256_mask_packs_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_packs_epi32(b, 0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_packs_epi32() {
+ let a = _mm256_set1_epi32(i32::MAX);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_packs_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_packs_epi32(0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_packs_epi32() {
+ let a = _mm_set1_epi32(i32::MAX);
+ let b = _mm_set1_epi32(1 << 16 | 1);
+ let r = _mm_mask_packs_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_packs_epi32(b, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_packs_epi32() {
+ let a = _mm_set1_epi32(i32::MAX);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_maskz_packs_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_packs_epi32(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
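+ // `packs_epi16` is the 16-to-8-bit analogue, saturating i16::MAX down
+ // to i8::MAX.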
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_packs_epi16() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_packs_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_packs_epi16() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let b = _mm512_set1_epi16(1 << 8 | 1);
+ let r = _mm512_mask_packs_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_packs_epi16(
+ b,
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_packs_epi16() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_packs_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_packs_epi16(
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_packs_epi16() {
+ let a = _mm256_set1_epi16(i16::MAX);
+ let b = _mm256_set1_epi16(1 << 8 | 1);
+ let r = _mm256_mask_packs_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_packs_epi16(b, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_packs_epi16() {
+ let a = _mm256_set1_epi16(i16::MAX);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_packs_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_packs_epi16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_packs_epi16() {
+ let a = _mm_set1_epi16(i16::MAX);
+ let b = _mm_set1_epi16(1 << 8 | 1);
+ let r = _mm_mask_packs_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_packs_epi16(b, 0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_packs_epi16() {
+ let a = _mm_set1_epi16(i16::MAX);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_maskz_packs_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_packs_epi16(0b00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
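+ // `packus_epi32` narrows with *unsigned* saturation, so the all -1
+ // input clamps to 0 while small positive values pass through.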
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_packus_epi32() {
+ let a = _mm512_set1_epi32(-1);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_packus_epi32(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_packus_epi32() {
+ let a = _mm512_set1_epi32(-1);
+ let b = _mm512_set1_epi32(1 << 16 | 1);
+ let r = _mm512_mask_packus_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_packus_epi32(b, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_packus_epi32() {
+ let a = _mm512_set1_epi32(-1);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_maskz_packus_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_packus_epi32(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_packus_epi32() {
+ let a = _mm256_set1_epi32(-1);
+ let b = _mm256_set1_epi32(1 << 16 | 1);
+ let r = _mm256_mask_packus_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_packus_epi32(b, 0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_packus_epi32() {
+ let a = _mm256_set1_epi32(-1);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_packus_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_packus_epi32(0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_packus_epi32() {
+ let a = _mm_set1_epi32(-1);
+ let b = _mm_set1_epi32(1 << 16 | 1);
+ let r = _mm_mask_packus_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_packus_epi32(b, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_packus_epi32() {
+ let a = _mm_set1_epi32(-1);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_maskz_packus_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_packus_epi32(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
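+ // `packus_epi16` likewise clamps signed 16-bit inputs into the u8
+ // range, so -1 becomes 0.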
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_packus_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_packus_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_packus_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(1 << 8 | 1);
+ let r = _mm512_mask_packus_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_packus_epi16(
+ b,
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_packus_epi16() {
+ let a = _mm512_set1_epi16(-1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_packus_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_packus_epi16(
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_packus_epi16() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(1 << 8 | 1);
+ let r = _mm256_mask_packus_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_packus_epi16(b, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_packus_epi16() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_packus_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_packus_epi16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_packus_epi16() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(1 << 8 | 1);
+ let r = _mm_mask_packus_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_packus_epi16(b, 0b00000000_00001111, a, b);
+ let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_packus_epi16() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_maskz_packus_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_packus_epi16(0b00000000_00001111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
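+ // `avg_epu*` computes the rounded unsigned average (a + b + 1) >> 1,
+ // hence avg(1, 1) == 1.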
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_avg_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_avg_epu16(a, b);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_avg_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_mask_avg_epu16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_avg_epu16(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_avg_epu16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_avg_epu16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_avg_epu16(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_avg_epu16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_mask_avg_epu16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_avg_epu16(a, 0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_avg_epu16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_avg_epu16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_avg_epu16(0b00000000_00001111, a, b);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_avg_epu16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_mask_avg_epu16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_avg_epu16(a, 0b00001111, a, b);
+ let e = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_avg_epu16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_maskz_avg_epu16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_avg_epu16(0b00001111, a, b);
+ let e = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_avg_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_avg_epu8(a, b);
+ let e = _mm512_set1_epi8(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_avg_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_mask_avg_epu8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_avg_epu8(
+ a,
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_avg_epu8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_maskz_avg_epu8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_avg_epu8(
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_avg_epu8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_mask_avg_epu8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_avg_epu8(a, 0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_avg_epu8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_maskz_avg_epu8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_avg_epu8(0b00000000_00000000_00000000_00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_avg_epu8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_mask_avg_epu8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_avg_epu8(a, 0b00000000_00001111, a, b);
+ let e = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_avg_epu8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_maskz_avg_epu8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_avg_epu8(0b00000000_00001111, a, b);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
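+ // `sll_epi16` shifts every element left by a single count taken from
+ // the low 64 bits of the `__m128i` argument; `_mm_set1_epi16(2)` puts
+ // 0x0002_0002_0002_0002 there, which exceeds 15, so every element is
+ // zeroed. `slli` takes the count as a const generic instead, and `sllv`
+ // shifts each element by the corresponding element of `count`.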
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_sll_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm512_sll_epi16(a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_sll_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm512_mask_sll_epi16(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sll_epi16(a, 0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_sll_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm512_maskz_sll_epi16(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sll_epi16(0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_sll_epi16() {
+ let a = _mm256_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm256_mask_sll_epi16(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sll_epi16(a, 0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_sll_epi16() {
+ let a = _mm256_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm256_maskz_sll_epi16(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sll_epi16(0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_sll_epi16() {
+ let a = _mm_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_mask_sll_epi16(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sll_epi16(a, 0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_sll_epi16() {
+ let a = _mm_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_maskz_sll_epi16(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sll_epi16(0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_slli_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let r = _mm512_slli_epi16::<1>(a);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_slli_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let r = _mm512_mask_slli_epi16::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_slli_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_slli_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let r = _mm512_maskz_slli_epi16::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_slli_epi16::<1>(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_slli_epi16() {
+ let a = _mm256_set1_epi16(1 << 15);
+ let r = _mm256_mask_slli_epi16::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_slli_epi16::<1>(a, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_slli_epi16() {
+ let a = _mm256_set1_epi16(1 << 15);
+ let r = _mm256_maskz_slli_epi16::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_slli_epi16::<1>(0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_slli_epi16() {
+ let a = _mm_set1_epi16(1 << 15);
+ let r = _mm_mask_slli_epi16::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_slli_epi16::<1>(a, 0b11111111, a);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_slli_epi16() {
+ let a = _mm_set1_epi16(1 << 15);
+ let r = _mm_maskz_slli_epi16::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_slli_epi16::<1>(0b11111111, a);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_sllv_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_sllv_epi16(a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_sllv_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_mask_sllv_epi16(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sllv_epi16(a, 0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_sllv_epi16() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_sllv_epi16(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sllv_epi16(0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_sllv_epi16() {
+ let a = _mm256_set1_epi16(1 << 15);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_sllv_epi16(a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_sllv_epi16() {
+ let a = _mm256_set1_epi16(1 << 15);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_mask_sllv_epi16(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sllv_epi16(a, 0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_sllv_epi16() {
+ let a = _mm256_set1_epi16(1 << 15);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_sllv_epi16(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sllv_epi16(0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_sllv_epi16() {
+ let a = _mm_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_sllv_epi16(a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_sllv_epi16() {
+ let a = _mm_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_mask_sllv_epi16(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sllv_epi16(a, 0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_sllv_epi16() {
+ let a = _mm_set1_epi16(1 << 15);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_maskz_sllv_epi16(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sllv_epi16(0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
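+ // `srl_epi16` is the logical right shift, with the same low-64-bit
+ // count semantics as `sll_epi16`, so a count built from set1_epi16(2)
+ // again shifts everything out to zero.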
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_srl_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm512_srl_epi16(a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_srl_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm512_mask_srl_epi16(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srl_epi16(a, 0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_srl_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm512_maskz_srl_epi16(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srl_epi16(0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_srl_epi16() {
+ let a = _mm256_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm256_mask_srl_epi16(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srl_epi16(a, 0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_srl_epi16() {
+ let a = _mm256_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm256_maskz_srl_epi16(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srl_epi16(0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_srl_epi16() {
+ let a = _mm_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_mask_srl_epi16(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srl_epi16(a, 0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_srl_epi16() {
+ let a = _mm_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_maskz_srl_epi16(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srl_epi16(0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_srli_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let r = _mm512_srli_epi16::<2>(a);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_srli_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let r = _mm512_mask_srli_epi16::<2>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srli_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_srli_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let r = _mm512_maskz_srli_epi16::<2>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srli_epi16::<2>(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_srli_epi16() {
+ let a = _mm256_set1_epi16(1 << 1);
+ let r = _mm256_mask_srli_epi16::<2>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srli_epi16::<2>(a, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_srli_epi16() {
+ let a = _mm256_set1_epi16(1 << 1);
+ let r = _mm256_maskz_srli_epi16::<2>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srli_epi16::<2>(0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_srli_epi16() {
+ let a = _mm_set1_epi16(1 << 1);
+ let r = _mm_mask_srli_epi16::<2>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srli_epi16::<2>(a, 0b11111111, a);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_srli_epi16() {
+ let a = _mm_set1_epi16(1 << 1);
+ let r = _mm_maskz_srli_epi16::<2>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srli_epi16::<2>(0b11111111, a);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_srlv_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_srlv_epi16(a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_srlv_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_mask_srlv_epi16(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srlv_epi16(a, 0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_srlv_epi16() {
+ let a = _mm512_set1_epi16(1 << 1);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_srlv_epi16(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srlv_epi16(0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_srlv_epi16() {
+ let a = _mm256_set1_epi16(1 << 1);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_srlv_epi16(a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_srlv_epi16() {
+ let a = _mm256_set1_epi16(1 << 1);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_mask_srlv_epi16(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srlv_epi16(a, 0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_srlv_epi16() {
+ let a = _mm256_set1_epi16(1 << 1);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_srlv_epi16(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srlv_epi16(0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_srlv_epi16() {
+ let a = _mm_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_srlv_epi16(a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_srlv_epi16() {
+ let a = _mm_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_mask_srlv_epi16(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srlv_epi16(a, 0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_srlv_epi16() {
+ let a = _mm_set1_epi16(1 << 1);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_maskz_srlv_epi16(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srlv_epi16(0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
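+    // `sra` takes a single shift count from the low 64 bits of the `__m128i`
+    // `count`. `_mm_set1_epi16(1)` makes that value 0x0001000100010001, which
+    // exceeds 15, so every lane is filled with its sign bit: the positive 8
+    // becomes 0, not 4.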
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_sra_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let count = _mm_set1_epi16(1);
+ let r = _mm512_sra_epi16(a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_sra_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let count = _mm_set1_epi16(1);
+ let r = _mm512_mask_sra_epi16(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sra_epi16(a, 0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_sra_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let count = _mm_set1_epi16(1);
+ let r = _mm512_maskz_sra_epi16(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sra_epi16(0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_sra_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let count = _mm_set1_epi16(1);
+ let r = _mm256_mask_sra_epi16(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sra_epi16(a, 0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_sra_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let count = _mm_set1_epi16(1);
+ let r = _mm256_maskz_sra_epi16(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sra_epi16(0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_sra_epi16() {
+ let a = _mm_set1_epi16(8);
+ let count = _mm_set1_epi16(1);
+ let r = _mm_mask_sra_epi16(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sra_epi16(a, 0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_sra_epi16() {
+ let a = _mm_set1_epi16(8);
+ let count = _mm_set1_epi16(1);
+ let r = _mm_maskz_sra_epi16(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sra_epi16(0b11111111, a, count);
+ let e = _mm_set1_epi16(0);
+ assert_eq_m128i(r, e);
+ }
+
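+    // `srai` shifts each lane right by an immediate count, replicating the
+    // sign bit: 8 >> 2 == 2.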
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_srai_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let r = _mm512_srai_epi16::<2>(a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_srai_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let r = _mm512_mask_srai_epi16::<2>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srai_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_srai_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let r = _mm512_maskz_srai_epi16::<2>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srai_epi16::<2>(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_srai_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let r = _mm256_mask_srai_epi16::<2>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srai_epi16::<2>(a, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_srai_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let r = _mm256_maskz_srai_epi16::<2>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srai_epi16::<2>(0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_srai_epi16() {
+ let a = _mm_set1_epi16(8);
+ let r = _mm_mask_srai_epi16::<2>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srai_epi16::<2>(a, 0b11111111, a);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_srai_epi16() {
+ let a = _mm_set1_epi16(8);
+ let r = _mm_maskz_srai_epi16::<2>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srai_epi16::<2>(0b11111111, a);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
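+    // `srav` is the per-lane arithmetic shift: each lane of `a` is shifted by
+    // the count in the corresponding lane of `count` (8 >> 2 == 2).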
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_srav_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_srav_epi16(a, count);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_srav_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_mask_srav_epi16(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srav_epi16(a, 0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_srav_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let count = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_srav_epi16(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srav_epi16(0b11111111_11111111_11111111_11111111, a, count);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_srav_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_srav_epi16(a, count);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_srav_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_mask_srav_epi16(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srav_epi16(a, 0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_srav_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let count = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_srav_epi16(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srav_epi16(0b11111111_11111111, a, count);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_srav_epi16() {
+ let a = _mm_set1_epi16(8);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_srav_epi16(a, count);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_srav_epi16() {
+ let a = _mm_set1_epi16(8);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_mask_srav_epi16(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srav_epi16(a, 0b11111111, a, count);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_srav_epi16() {
+ let a = _mm_set1_epi16(8);
+ let count = _mm_set1_epi16(2);
+ let r = _mm_maskz_srav_epi16(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srav_epi16(0b11111111, a, count);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
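+    // `permutex2var` indexes the concatenation of `a` and `b`: in the 512-bit
+    // form, idx bits [4:0] select the lane and bit 5 (the `1<<5` entries)
+    // selects `b` instead of `a`; the 256- and 128-bit forms use bit 4 and
+    // bit 3 respectively. `_mm512_set_epi16` lists lanes from highest to
+    // lowest, so index 1 into `a` below yields the value 30.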
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_permutex2var_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5,
+ 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5);
+ let b = _mm512_set1_epi16(100);
+ let r = _mm512_permutex2var_epi16(a, idx, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100,
+ 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_permutex2var_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5,
+ 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5);
+ let b = _mm512_set1_epi16(100);
+ let r = _mm512_mask_permutex2var_epi16(a, 0, idx, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutex2var_epi16(a, 0b11111111_11111111_11111111_11111111, idx, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100,
+ 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_permutex2var_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5,
+ 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5);
+ let b = _mm512_set1_epi16(100);
+ let r = _mm512_maskz_permutex2var_epi16(0, a, idx, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutex2var_epi16(0b11111111_11111111_11111111_11111111, a, idx, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100,
+ 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask2_permutex2var_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi16(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5,
+ 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5);
+ let b = _mm512_set1_epi16(100);
+ let r = _mm512_mask2_permutex2var_epi16(a, idx, 0, b);
+ assert_eq_m512i(r, idx);
+ let r = _mm512_mask2_permutex2var_epi16(a, idx, 0b11111111_11111111_11111111_11111111, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100,
+ 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_permutex2var_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4);
+ let b = _mm256_set1_epi16(100);
+ let r = _mm256_permutex2var_epi16(a, idx, b);
+ let e = _mm256_set_epi16(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_permutex2var_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4);
+ let b = _mm256_set1_epi16(100);
+ let r = _mm256_mask_permutex2var_epi16(a, 0, idx, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutex2var_epi16(a, 0b11111111_11111111, idx, b);
+ let e = _mm256_set_epi16(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_permutex2var_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4);
+ let b = _mm256_set1_epi16(100);
+ let r = _mm256_maskz_permutex2var_epi16(0, a, idx, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutex2var_epi16(0b11111111_11111111, a, idx, b);
+ let e = _mm256_set_epi16(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask2_permutex2var_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm256_set_epi16(1, 1<<4, 2, 1<<4, 3, 1<<4, 4, 1<<4, 5, 1<<4, 6, 1<<4, 7, 1<<4, 8, 1<<4);
+ let b = _mm256_set1_epi16(100);
+ let r = _mm256_mask2_permutex2var_epi16(a, idx, 0, b);
+ assert_eq_m256i(r, idx);
+ let r = _mm256_mask2_permutex2var_epi16(a, idx, 0b11111111_11111111, b);
+        let e = _mm256_set_epi16(
+            14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+        );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_permutex2var_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm_set1_epi16(100);
+ let r = _mm_permutex2var_epi16(a, idx, b);
+ let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_permutex2var_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm_set1_epi16(100);
+ let r = _mm_mask_permutex2var_epi16(a, 0, idx, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_permutex2var_epi16(a, 0b11111111, idx, b);
+ let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_permutex2var_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm_set1_epi16(100);
+ let r = _mm_maskz_permutex2var_epi16(0, a, idx, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_permutex2var_epi16(0b11111111, a, idx, b);
+ let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask2_permutex2var_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm_set_epi16(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm_set1_epi16(100);
+ let r = _mm_mask2_permutex2var_epi16(a, idx, 0, b);
+ assert_eq_m128i(r, idx);
+ let r = _mm_mask2_permutex2var_epi16(a, idx, 0b11111111, b);
+ let e = _mm_set_epi16(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m128i(r, e);
+ }
+
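+    // `permutexvar` writes a[idx[i]] to every output lane; with all indices
+    // equal to 1, each lane receives element 1 of `a` (30, 14, and 6 for the
+    // 512-, 256-, and 128-bit vectors below).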
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_permutexvar_epi16() {
+ let idx = _mm512_set1_epi16(1);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm512_permutexvar_epi16(idx, a);
+ let e = _mm512_set1_epi16(30);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_permutexvar_epi16() {
+ let idx = _mm512_set1_epi16(1);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm512_mask_permutexvar_epi16(a, 0, idx, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutexvar_epi16(a, 0b11111111_11111111_11111111_11111111, idx, a);
+ let e = _mm512_set1_epi16(30);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_permutexvar_epi16() {
+ let idx = _mm512_set1_epi16(1);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm512_maskz_permutexvar_epi16(0, idx, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutexvar_epi16(0b11111111_11111111_11111111_11111111, idx, a);
+ let e = _mm512_set1_epi16(30);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_permutexvar_epi16() {
+ let idx = _mm256_set1_epi16(1);
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_permutexvar_epi16(idx, a);
+ let e = _mm256_set1_epi16(14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_permutexvar_epi16() {
+ let idx = _mm256_set1_epi16(1);
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_mask_permutexvar_epi16(a, 0, idx, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutexvar_epi16(a, 0b11111111_11111111, idx, a);
+ let e = _mm256_set1_epi16(14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_permutexvar_epi16() {
+ let idx = _mm256_set1_epi16(1);
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_permutexvar_epi16(0, idx, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutexvar_epi16(0b11111111_11111111, idx, a);
+ let e = _mm256_set1_epi16(14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_permutexvar_epi16() {
+ let idx = _mm_set1_epi16(1);
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_permutexvar_epi16(idx, a);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_permutexvar_epi16() {
+ let idx = _mm_set1_epi16(1);
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_mask_permutexvar_epi16(a, 0, idx, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_permutexvar_epi16(a, 0b11111111, idx, a);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_permutexvar_epi16() {
+ let idx = _mm_set1_epi16(1);
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_maskz_permutexvar_epi16(0, idx, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_permutexvar_epi16(0b11111111, idx, a);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
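+    // `mask_blend` picks each lane from `b` where the mask bit is 1 and from
+    // `a` where it is 0.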
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_blend_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_mask_blend_epi16(0b11111111_00000000_11111111_00000000, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_blend_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_mask_blend_epi16(0b11111111_00000000, a, b);
+ let e = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_blend_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(2);
+ let r = _mm_mask_blend_epi16(0b11110000, a, b);
+ let e = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_blend_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(2);
+ let r = _mm512_mask_blend_epi8(
+ 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_blend_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_mask_blend_epi8(0b11111111_00000000_11111111_00000000, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_blend_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(2);
+ let r = _mm_mask_blend_epi8(0b11111111_00000000, a, b);
+ let e = _mm_set_epi8(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
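+    // `broadcastw` replicates the lowest 16-bit lane of `a` (the last
+    // argument to `_mm_set_epi16`, i.e. 24) across the whole vector.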
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_broadcastw_epi16() {
+ let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_broadcastw_epi16(a);
+ let e = _mm512_set1_epi16(24);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_broadcastw_epi16() {
+ let src = _mm512_set1_epi16(1);
+ let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_mask_broadcastw_epi16(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_broadcastw_epi16(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(24);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_broadcastw_epi16() {
+ let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_maskz_broadcastw_epi16(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_broadcastw_epi16(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(24);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_broadcastw_epi16() {
+ let src = _mm256_set1_epi16(1);
+ let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm256_mask_broadcastw_epi16(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_broadcastw_epi16(src, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(24);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_broadcastw_epi16() {
+ let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm256_maskz_broadcastw_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_broadcastw_epi16(0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(24);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_broadcastw_epi16() {
+ let src = _mm_set1_epi16(1);
+ let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm_mask_broadcastw_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_broadcastw_epi16(src, 0b11111111, a);
+ let e = _mm_set1_epi16(24);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_broadcastw_epi16() {
+ let a = _mm_set_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm_maskz_broadcastw_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_broadcastw_epi16(0b11111111, a);
+ let e = _mm_set1_epi16(24);
+ assert_eq_m128i(r, e);
+ }
+
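+    // `broadcastb` likewise replicates the lowest byte of `a` (32).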
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_broadcastb_epi8() {
+ let a = _mm_set_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_broadcastb_epi8(a);
+ let e = _mm512_set1_epi8(32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_broadcastb_epi8() {
+ let src = _mm512_set1_epi8(1);
+ let a = _mm_set_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_mask_broadcastb_epi8(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_broadcastb_epi8(
+ src,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ );
+ let e = _mm512_set1_epi8(32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_broadcastb_epi8() {
+ let a = _mm_set_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_maskz_broadcastb_epi8(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_broadcastb_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ );
+ let e = _mm512_set1_epi8(32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_broadcastb_epi8() {
+ let src = _mm256_set1_epi8(1);
+ let a = _mm_set_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm256_mask_broadcastb_epi8(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_broadcastb_epi8(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(32);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_broadcastb_epi8() {
+ let a = _mm_set_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm256_maskz_broadcastb_epi8(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_broadcastb_epi8(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(32);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_broadcastb_epi8() {
+ let src = _mm_set1_epi8(1);
+ let a = _mm_set_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm_mask_broadcastb_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_broadcastb_epi8(src, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_broadcastb_epi8() {
+ let a = _mm_set_epi8(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm_maskz_broadcastb_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_broadcastb_epi8(0b11111111_11111111, a);
+ let e = _mm_set1_epi8(32);
+ assert_eq_m128i(r, e);
+ }
+
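+    // `unpackhi_epi16`/`unpackhi_epi8` interleave the upper halves of each
+    // 128-bit lane of `a` and `b` independently, which is why the expected
+    // vectors mix values within each 128-bit lane rather than across the
+    // full register.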
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_unpackhi_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ let r = _mm512_unpackhi_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12,
+ 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_unpackhi_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ let r = _mm512_mask_unpackhi_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_unpackhi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12,
+ 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_unpackhi_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ let r = _mm512_maskz_unpackhi_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_unpackhi_epi16(0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12,
+ 49, 17, 50, 18, 51, 19, 52, 20, 57, 25, 58, 26, 59, 27, 60, 28);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_unpackhi_epi16() {
+ let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm256_set_epi16(
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ );
+ let r = _mm256_mask_unpackhi_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_unpackhi_epi16(a, 0b11111111_11111111, a, b);
+ let e = _mm256_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_unpackhi_epi16() {
+ let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm256_set_epi16(
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ );
+ let r = _mm256_maskz_unpackhi_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_unpackhi_epi16(0b11111111_11111111, a, b);
+ let e = _mm256_set_epi16(33, 1, 34, 2, 35, 3, 36, 4, 41, 9, 42, 10, 43, 11, 44, 12);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_unpackhi_epi16() {
+ let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40);
+ let r = _mm_mask_unpackhi_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_unpackhi_epi16(a, 0b11111111, a, b);
+ let e = _mm_set_epi16(33, 1, 34, 2, 35, 3, 36, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_unpackhi_epi16() {
+ let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40);
+ let r = _mm_maskz_unpackhi_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_unpackhi_epi16(0b11111111, a, b);
+ let e = _mm_set_epi16(33, 1, 34, 2, 35, 3, 36, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_unpackhi_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0);
+ let r = _mm512_unpackhi_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8,
+ 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24,
+ 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40,
+ 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_unpackhi_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0);
+ let r = _mm512_mask_unpackhi_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_unpackhi_epi8(
+ a,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8,
+ 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24,
+ 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40,
+ 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_unpackhi_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0);
+ let r = _mm512_maskz_unpackhi_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_unpackhi_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8,
+ 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24,
+ 97, 33, 98, 34, 99, 35, 100, 36, 101, 37, 102, 38, 103, 39, 104, 40,
+ 113, 49, 114, 50, 115, 51, 116, 52, 117, 53, 118, 54, 119, 55, 120, 56);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_unpackhi_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96);
+ let r = _mm256_mask_unpackhi_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_unpackhi_epi8(a, 0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8,
+ 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_unpackhi_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96);
+ let r = _mm256_maskz_unpackhi_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_unpackhi_epi8(0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8,
+ 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_unpackhi_epi8() {
+ let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm_set_epi8(
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ );
+ let r = _mm_mask_unpackhi_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_unpackhi_epi8(a, 0b11111111_11111111, a, b);
+ let e = _mm_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_unpackhi_epi8() {
+ let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm_set_epi8(
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ );
+ let r = _mm_maskz_unpackhi_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_unpackhi_epi8(0b11111111_11111111, a, b);
+ let e = _mm_set_epi8(65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8);
+ assert_eq_m128i(r, e);
+ }
+
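+    // `unpacklo_epi16`/`unpacklo_epi8` do the same with the lower half of
+    // each 128-bit lane.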
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_unpacklo_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ let r = _mm512_unpacklo_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16,
+ 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_unpacklo_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ let r = _mm512_mask_unpacklo_epi16(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_unpacklo_epi16(a, 0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16,
+ 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_unpacklo_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ let r = _mm512_maskz_unpacklo_epi16(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_unpacklo_epi16(0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16,
+ 53, 21, 54, 22, 55, 23, 56, 24, 61, 29, 62, 30, 63, 31, 64, 32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_unpacklo_epi16() {
+ let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm256_set_epi16(
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ );
+ let r = _mm256_mask_unpacklo_epi16(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_unpacklo_epi16(a, 0b11111111_11111111, a, b);
+ let e = _mm256_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_unpacklo_epi16() {
+ let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm256_set_epi16(
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ );
+ let r = _mm256_maskz_unpacklo_epi16(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_unpacklo_epi16(0b11111111_11111111, a, b);
+ let e = _mm256_set_epi16(37, 5, 38, 6, 39, 7, 40, 8, 45, 13, 46, 14, 47, 15, 48, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_unpacklo_epi16() {
+ let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40);
+ let r = _mm_mask_unpacklo_epi16(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_unpacklo_epi16(a, 0b11111111, a, b);
+ let e = _mm_set_epi16(37, 5, 38, 6, 39, 7, 40, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_unpacklo_epi16() {
+ let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_set_epi16(33, 34, 35, 36, 37, 38, 39, 40);
+ let r = _mm_maskz_unpacklo_epi16(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_unpacklo_epi16(0b11111111, a, b);
+ let e = _mm_set_epi16(37, 5, 38, 6, 39, 7, 40, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_unpacklo_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0);
+ let r = _mm512_unpacklo_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16,
+ 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32,
+ 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48,
+ 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_unpacklo_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0);
+ let r = _mm512_mask_unpacklo_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_unpacklo_epi8(
+ a,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16,
+ 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32,
+ 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48,
+ 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_unpacklo_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0);
+ let r = _mm512_maskz_unpacklo_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_unpacklo_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16,
+ 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32,
+ 105, 41, 106, 42, 107, 43, 108, 44, 109, 45, 110, 46, 111, 47, 112, 48,
+ 121, 57, 122, 58, 123, 59, 124, 60, 125, 61, 126, 62, 127, 63, 0, 64);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_unpacklo_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96);
+ let r = _mm256_mask_unpacklo_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_unpacklo_epi8(a, 0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16,
+ 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_unpacklo_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+ #[rustfmt::skip]
+ let b = _mm256_set_epi8(65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96);
+ let r = _mm256_maskz_unpacklo_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_unpacklo_epi8(0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16,
+ 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31, 96, 32);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_unpacklo_epi8() {
+ let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm_set_epi8(
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ );
+ let r = _mm_mask_unpacklo_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_unpacklo_epi8(a, 0b11111111_11111111, a, b);
+ let e = _mm_set_epi8(
+ 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_unpacklo_epi8() {
+ let a = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm_set_epi8(
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ );
+ let r = _mm_maskz_unpacklo_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_unpacklo_epi8(0b11111111_11111111, a, b);
+ let e = _mm_set_epi8(
+ 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
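+    // `mask_mov`/`maskz_mov` are the pure masked copies: no arithmetic, just
+    // per-lane selection between the operand and `src`/zero.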
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_mov_epi16() {
+ let src = _mm512_set1_epi16(1);
+ let a = _mm512_set1_epi16(2);
+ let r = _mm512_mask_mov_epi16(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_mov_epi16(src, 0b11111111_11111111_11111111_11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_mov_epi16() {
+ let a = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_mov_epi16(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mov_epi16(0b11111111_11111111_11111111_11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_mov_epi16() {
+ let src = _mm256_set1_epi16(1);
+ let a = _mm256_set1_epi16(2);
+ let r = _mm256_mask_mov_epi16(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_mov_epi16(src, 0b11111111_11111111, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_mov_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_mov_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mov_epi16(0b11111111_11111111, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_mov_epi16() {
+ let src = _mm_set1_epi16(1);
+ let a = _mm_set1_epi16(2);
+ let r = _mm_mask_mov_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_mov_epi16(src, 0b11111111, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_mov_epi16() {
+ let a = _mm_set1_epi16(2);
+ let r = _mm_maskz_mov_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mov_epi16(0b11111111, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_mov_epi8() {
+ let src = _mm512_set1_epi8(1);
+ let a = _mm512_set1_epi8(2);
+ let r = _mm512_mask_mov_epi8(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_mov_epi8(
+ src,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ );
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_mov_epi8() {
+ let a = _mm512_set1_epi8(2);
+ let r = _mm512_maskz_mov_epi8(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mov_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ );
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_mov_epi8() {
+ let src = _mm256_set1_epi8(1);
+ let a = _mm256_set1_epi8(2);
+ let r = _mm256_mask_mov_epi8(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_mov_epi8(src, 0b11111111_11111111_11111111_11111111, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_mov_epi8() {
+ let a = _mm256_set1_epi8(2);
+ let r = _mm256_maskz_mov_epi8(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mov_epi8(0b11111111_11111111_11111111_11111111, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_mov_epi8() {
+ let src = _mm_set1_epi8(1);
+ let a = _mm_set1_epi8(2);
+ let r = _mm_mask_mov_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_mov_epi8(src, 0b11111111_11111111, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_mov_epi8() {
+ let a = _mm_set1_epi8(2);
+ let r = _mm_maskz_mov_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mov_epi8(0b11111111_11111111, a);
+ assert_eq_m128i(r, a);
+ }
+
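+    // Masked `set1` broadcasts the scalar only into the lanes selected by
+    // the mask.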
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_set1_epi16() {
+ let src = _mm512_set1_epi16(2);
+ let a: i16 = 11;
+ let r = _mm512_mask_set1_epi16(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_set1_epi16(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_set1_epi16() {
+ let a: i16 = 11;
+ let r = _mm512_maskz_set1_epi16(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_set1_epi16(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_set1_epi16() {
+ let src = _mm256_set1_epi16(2);
+ let a: i16 = 11;
+ let r = _mm256_mask_set1_epi16(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_set1_epi16(src, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_set1_epi16() {
+ let a: i16 = 11;
+ let r = _mm256_maskz_set1_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_set1_epi16(0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_set1_epi16() {
+ let src = _mm_set1_epi16(2);
+ let a: i16 = 11;
+ let r = _mm_mask_set1_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_set1_epi16(src, 0b11111111, a);
+ let e = _mm_set1_epi16(11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_set1_epi16() {
+ let a: i16 = 11;
+ let r = _mm_maskz_set1_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_set1_epi16(0b11111111, a);
+ let e = _mm_set1_epi16(11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_set1_epi8() {
+ let src = _mm512_set1_epi8(2);
+ let a: i8 = 11;
+ let r = _mm512_mask_set1_epi8(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_set1_epi8(
+ src,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ );
+ let e = _mm512_set1_epi8(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_set1_epi8() {
+ let a: i8 = 11;
+ let r = _mm512_maskz_set1_epi8(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_set1_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ );
+ let e = _mm512_set1_epi8(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_set1_epi8() {
+ let src = _mm256_set1_epi8(2);
+ let a: i8 = 11;
+ let r = _mm256_mask_set1_epi8(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_set1_epi8(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_set1_epi8() {
+ let a: i8 = 11;
+ let r = _mm256_maskz_set1_epi8(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_set1_epi8(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_set1_epi8() {
+ let src = _mm_set1_epi8(2);
+ let a: i8 = 11;
+ let r = _mm_mask_set1_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_set1_epi8(src, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_set1_epi8() {
+ let a: i8 = 11;
+ let r = _mm_maskz_set1_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_set1_epi8(0b11111111_11111111, a);
+ let e = _mm_set1_epi8(11);
+ assert_eq_m128i(r, e);
+ }
+
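+    // `shufflelo` reorders the four low words of each 128-bit lane according
+    // to the immediate (two bits per destination word); the four high words
+    // pass through unchanged.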
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_shufflelo_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
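+ // IMM8 = 0b00_01_01_11: reading the control in bit pairs from low to high,
+ // dst words 0..4 of each 128-bit lane take a-words 3, 1, 1, 0; the high
+ // four words pass through unchanged.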
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12,
+ 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28,
+ );
+ let r = _mm512_shufflelo_epi16::<0b00_01_01_11>(a);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_shufflelo_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>(
+ a,
+ 0b11111111_11111111_11111111_11111111,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12,
+ 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_shufflelo_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r =
+ _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12,
+ 16, 17, 18, 19, 23, 22, 22, 20, 24, 25, 26, 27, 31, 30, 30, 28,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_shufflelo_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a);
+ let e = _mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_shufflelo_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111, a);
+ let e = _mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_shufflelo_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111, a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_shufflelo_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111, a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_shufflehi_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
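+ // Same control word, but shufflehi rearranges the high four words of each
+ // 128-bit lane (dst words 4..8 take a-words 7, 5, 5, 4); the low four pass
+ // through unchanged.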
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15,
+ 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31,
+ );
+ let r = _mm512_shufflehi_epi16::<0b00_01_01_11>(a);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_shufflehi_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>(
+ a,
+ 0b11111111_11111111_11111111_11111111,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15,
+ 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_shufflehi_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r =
+ _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15,
+ 19, 18, 18, 16, 20, 21, 22, 23, 27, 26, 26, 24, 28, 29, 30, 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_shufflehi_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a);
+ let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_shufflehi_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111, a);
+ let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_shufflehi_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111, a);
+ let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_shufflehi_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111, a);
+ let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_shuffle_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_mask_shuffle_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shuffle_epi8(
+ a,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_maskz_shuffle_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shuffle_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_mask_shuffle_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shuffle_epi8(a, 0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_maskz_shuffle_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shuffle_epi8(0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_shuffle_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_mask_shuffle_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shuffle_epi8(a, 0b11111111_11111111, a, b);
+ let e = _mm_set_epi8(
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_maskz_shuffle_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shuffle_epi8(0b11111111_11111111, a, b);
+ let e = _mm_set_epi8(
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_test_epi16_mask() {
+ let a = _mm512_set1_epi16(1 << 0);
+ let b = _mm512_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm512_test_epi16_mask(a, b);
+ let e: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_test_epi16_mask() {
+ let a = _mm512_set1_epi16(1 << 0);
+ let b = _mm512_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm512_mask_test_epi16_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_test_epi16_mask(0b11111111_11111111_11111111_11111111, a, b);
+ let e: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_test_epi16_mask() {
+ let a = _mm256_set1_epi16(1 << 0);
+ let b = _mm256_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm256_test_epi16_mask(a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_test_epi16_mask() {
+ let a = _mm256_set1_epi16(1 << 0);
+ let b = _mm256_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm256_mask_test_epi16_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm256_mask_test_epi16_mask(0b11111111_11111111, a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_test_epi16_mask() {
+ let a = _mm_set1_epi16(1 << 0);
+ let b = _mm_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm_test_epi16_mask(a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_test_epi16_mask() {
+ let a = _mm_set1_epi16(1 << 0);
+ let b = _mm_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm_mask_test_epi16_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm_mask_test_epi16_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_test_epi8_mask() {
+ let a = _mm512_set1_epi8(1 << 0);
+ let b = _mm512_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm512_test_epi8_mask(a, b);
+ let e: __mmask64 =
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_test_epi8_mask() {
+ let a = _mm512_set1_epi8(1 << 0);
+ let b = _mm512_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm512_mask_test_epi8_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_test_epi8_mask(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ let e: __mmask64 =
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_test_epi8_mask() {
+ let a = _mm256_set1_epi8(1 << 0);
+ let b = _mm256_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm256_test_epi8_mask(a, b);
+ let e: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_test_epi8_mask() {
+ let a = _mm256_set1_epi8(1 << 0);
+ let b = _mm256_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm256_mask_test_epi8_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm256_mask_test_epi8_mask(0b11111111_11111111_11111111_11111111, a, b);
+ let e: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_test_epi8_mask() {
+ let a = _mm_set1_epi8(1 << 0);
+ let b = _mm_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm_test_epi8_mask(a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_test_epi8_mask() {
+ let a = _mm_set1_epi8(1 << 0);
+ let b = _mm_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm_mask_test_epi8_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm_mask_test_epi8_mask(0b11111111_11111111, a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_testn_epi16_mask() {
+ let a = _mm512_set1_epi16(1 << 0);
+ let b = _mm512_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm512_testn_epi16_mask(a, b);
+ let e: __mmask32 = 0b00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_testn_epi16_mask() {
+ let a = _mm512_set1_epi16(1 << 0);
+ let b = _mm512_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm512_mask_testn_epi16_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_testn_epi16_mask(0b11111111_11111111_11111111_11111111, a, b);
+ let e: __mmask32 = 0b00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_testn_epi16_mask() {
+ let a = _mm256_set1_epi16(1 << 0);
+ let b = _mm256_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm256_testn_epi16_mask(a, b);
+ let e: __mmask16 = 0b00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_testn_epi16_mask() {
+ let a = _mm256_set1_epi16(1 << 0);
+ let b = _mm256_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm256_mask_testn_epi16_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm256_mask_testn_epi16_mask(0b11111111_11111111, a, b);
+ let e: __mmask16 = 0b00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_testn_epi16_mask() {
+ let a = _mm_set1_epi16(1 << 0);
+ let b = _mm_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm_testn_epi16_mask(a, b);
+ let e: __mmask8 = 0b00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_testn_epi16_mask() {
+ let a = _mm_set1_epi16(1 << 0);
+ let b = _mm_set1_epi16(1 << 0 | 1 << 1);
+ let r = _mm_mask_testn_epi16_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm_mask_testn_epi16_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_testn_epi8_mask() {
+ let a = _mm512_set1_epi8(1 << 0);
+ let b = _mm512_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm512_testn_epi8_mask(a, b);
+ let e: __mmask64 =
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_testn_epi8_mask() {
+ let a = _mm512_set1_epi8(1 << 0);
+ let b = _mm512_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm512_mask_testn_epi8_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_testn_epi8_mask(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ let e: __mmask64 =
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_testn_epi8_mask() {
+ let a = _mm256_set1_epi8(1 << 0);
+ let b = _mm256_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm256_testn_epi8_mask(a, b);
+ let e: __mmask32 = 0b00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_testn_epi8_mask() {
+ let a = _mm256_set1_epi8(1 << 0);
+ let b = _mm256_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm256_mask_testn_epi8_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm256_mask_testn_epi8_mask(0b11111111_11111111_11111111_11111111, a, b);
+ let e: __mmask32 = 0b00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_testn_epi8_mask() {
+ let a = _mm_set1_epi8(1 << 0);
+ let b = _mm_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm_testn_epi8_mask(a, b);
+ let e: __mmask16 = 0b00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_testn_epi8_mask() {
+ let a = _mm_set1_epi8(1 << 0);
+ let b = _mm_set1_epi8(1 << 0 | 1 << 1);
+ let r = _mm_mask_testn_epi8_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm_mask_testn_epi8_mask(0b11111111_11111111, a, b);
+ let e: __mmask16 = 0b00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_store_mask64() {
+ let a: __mmask64 =
+ 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000;
+ let mut r = 0;
+ _store_mask64(&mut r as *mut _ as *mut u64, a);
+ assert_eq!(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_store_mask32() {
+ let a: __mmask32 = 0b11111111_00000000_11111111_00000000;
+ let mut r = 0;
+ _store_mask32(&mut r as *mut _ as *mut u32, a);
+ assert_eq!(r, a);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_load_mask64() {
+ let p: __mmask64 =
+ 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000;
+ let r = _load_mask64(&p);
+ let e: __mmask64 =
+ 0b11111111_00000000_11111111_00000000_11111111_00000000_11111111_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_load_mask32() {
+ let p: __mmask32 = 0b11111111_00000000_11111111_00000000;
+ let r = _load_mask32(&p);
+ let e: __mmask32 = 0b11111111_00000000_11111111_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_sad_epu8() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(4);
+ let r = _mm512_sad_epu8(a, b);
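+ // each 64-bit lane sums |2 - 4| over eight byte pairs: 2 * 8 = 16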
+ let e = _mm512_set1_epi64(16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_dbsad_epu8() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(4);
+ let r = _mm512_dbsad_epu8::<0>(a, b);
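+ // with uniform inputs every output word is |2 - 4| summed over four byte pairs: 2 * 4 = 8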
+ let e = _mm512_set1_epi16(8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_dbsad_epu8() {
+ let src = _mm512_set1_epi16(1);
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(4);
+ let r = _mm512_mask_dbsad_epu8::<0>(src, 0, a, b);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_dbsad_epu8::<0>(src, 0b11111111_11111111_11111111_11111111, a, b);
+ let e = _mm512_set1_epi16(8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_dbsad_epu8() {
+ let a = _mm512_set1_epi8(2);
+ let b = _mm512_set1_epi8(4);
+ let r = _mm512_maskz_dbsad_epu8::<0>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_dbsad_epu8::<0>(0b11111111_11111111_11111111_11111111, a, b);
+ let e = _mm512_set1_epi16(8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_dbsad_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_dbsad_epu8::<0>(a, b);
+ let e = _mm256_set1_epi16(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_dbsad_epu8() {
+ let src = _mm256_set1_epi16(1);
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_mask_dbsad_epu8::<0>(src, 0, a, b);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_dbsad_epu8::<0>(src, 0b11111111_11111111, a, b);
+ let e = _mm256_set1_epi16(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_dbsad_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_maskz_dbsad_epu8::<0>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_dbsad_epu8::<0>(0b11111111_11111111, a, b);
+ let e = _mm256_set1_epi16(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_dbsad_epu8() {
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(4);
+ let r = _mm_dbsad_epu8::<0>(a, b);
+ let e = _mm_set1_epi16(8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_dbsad_epu8() {
+ let src = _mm_set1_epi16(1);
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(4);
+ let r = _mm_mask_dbsad_epu8::<0>(src, 0, a, b);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_dbsad_epu8::<0>(src, 0b11111111, a, b);
+ let e = _mm_set1_epi16(8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_dbsad_epu8() {
+ let a = _mm_set1_epi8(2);
+ let b = _mm_set1_epi8(4);
+ let r = _mm_maskz_dbsad_epu8::<0>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_dbsad_epu8::<0>(0b11111111, a, b);
+ let e = _mm_set1_epi16(8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_movepi16_mask() {
+ let a = _mm512_set1_epi16(1 << 15);
+ let r = _mm512_movepi16_mask(a);
+ let e: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_movepi16_mask() {
+ let a = _mm256_set1_epi16(1 << 15);
+ let r = _mm256_movepi16_mask(a);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_movepi16_mask() {
+ let a = _mm_set1_epi16(1 << 15);
+ let r = _mm_movepi16_mask(a);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_movepi8_mask() {
+ let a = _mm512_set1_epi8(1 << 7);
+ let r = _mm512_movepi8_mask(a);
+ let e: __mmask64 =
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_movepi8_mask() {
+ let a = _mm256_set1_epi8(1 << 7);
+ let r = _mm256_movepi8_mask(a);
+ let e: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_movepi8_mask() {
+ let a = _mm_set1_epi8(1 << 7);
+ let r = _mm_movepi8_mask(a);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_movm_epi16() {
+ let a: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ let r = _mm512_movm_epi16(a);
+ let e = _mm512_set1_epi16(
+ 1 << 15
+ | 1 << 14
+ | 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_movm_epi16() {
+ let a: __mmask16 = 0b11111111_11111111;
+ let r = _mm256_movm_epi16(a);
+ let e = _mm256_set1_epi16(
+ 1 << 15
+ | 1 << 14
+ | 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_movm_epi16() {
+ let a: __mmask8 = 0b11111111;
+ let r = _mm_movm_epi16(a);
+ let e = _mm_set1_epi16(
+ 1 << 15
+ | 1 << 14
+ | 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_movm_epi8() {
+ let a: __mmask64 =
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111;
+ let r = _mm512_movm_epi8(a);
+ let e =
+ _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_movm_epi8() {
+ let a: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ let r = _mm256_movm_epi8(a);
+ let e =
+ _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_movm_epi8() {
+ let a: __mmask16 = 0b11111111_11111111;
+ let r = _mm_movm_epi8(a);
+ let e =
+ _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kadd_mask32() {
+ let a: __mmask32 = 11;
+ let b: __mmask32 = 22;
+ let r = _kadd_mask32(a, b);
+ let e: __mmask32 = 33;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kadd_mask64() {
+ let a: __mmask64 = 11;
+ let b: __mmask64 = 22;
+ let r = _kadd_mask64(a, b);
+ let e: __mmask64 = 33;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kand_mask32() {
+ let a: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ let b: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ let r = _kand_mask32(a, b);
+ let e: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kand_mask64() {
+ let a: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ let b: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ let r = _kand_mask64(a, b);
+ let e: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_knot_mask32() {
+ let a: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ let r = _knot_mask32(a);
+ let e: __mmask32 = 0b00110011_11001100_00110011_11001100;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_knot_mask64() {
+ let a: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ let r = _knot_mask64(a);
+ let e: __mmask64 =
+ 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kandn_mask32() {
+ let a: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ let b: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ let r = _kandn_mask32(a, b);
+ let e: __mmask32 = 0b00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kandn_mask64() {
+ let a: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ let b: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ let r = _kandn_mask64(a, b);
+ let e: __mmask64 =
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kor_mask32() {
+ let a: __mmask32 = 0b00110011_11001100_00110011_11001100;
+ let b: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ let r = _kor_mask32(a, b);
+ let e: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kor_mask64() {
+ let a: __mmask64 =
+ 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100;
+ let b: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ let r = _kor_mask64(a, b);
+ let e: __mmask64 =
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kxor_mask32() {
+ let a: __mmask32 = 0b00110011_11001100_00110011_11001100;
+ let b: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ let r = _kxor_mask32(a, b);
+ let e: __mmask32 = 0b11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kxor_mask64() {
+ let a: __mmask64 =
+ 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100;
+ let b: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ let r = _kxor_mask64(a, b);
+ let e: __mmask64 =
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kxnor_mask32() {
+ let a: __mmask32 = 0b00110011_11001100_00110011_11001100;
+ let b: __mmask32 = 0b11001100_00110011_11001100_00110011;
+ let r = _kxnor_mask32(a, b);
+ let e: __mmask32 = 0b00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_kxnor_mask64() {
+ let a: __mmask64 =
+ 0b00110011_11001100_00110011_11001100_00110011_11001100_00110011_11001100;
+ let b: __mmask64 =
+ 0b11001100_00110011_11001100_00110011_11001100_00110011_11001100_00110011;
+ let r = _kxnor_mask64(a, b);
+ let e: __mmask64 =
+ 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cvtepi16_epi8() {
+ let a = _mm512_set1_epi16(2);
+ let r = _mm512_cvtepi16_epi8(a);
+ let e = _mm256_set1_epi8(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtepi16_epi8() {
+ let src = _mm256_set1_epi8(1);
+ let a = _mm512_set1_epi16(2);
+ let r = _mm512_mask_cvtepi16_epi8(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_cvtepi16_epi8() {
+ let a = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_cvtepi16_epi8(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtepi16_epi8(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cvtepi16_epi8() {
+ let a = _mm256_set1_epi16(2);
+ let r = _mm256_cvtepi16_epi8(a);
+ let e = _mm_set1_epi8(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi16_epi8() {
+ let src = _mm_set1_epi8(1);
+ let a = _mm256_set1_epi16(2);
+ let r = _mm256_mask_cvtepi16_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtepi16_epi8(src, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi16_epi8() {
+ let a = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_cvtepi16_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtepi16_epi8(0b11111111_11111111, a);
+ let e = _mm_set1_epi8(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cvtepi16_epi8() {
+ let a = _mm_set1_epi16(2);
+ let r = _mm_cvtepi16_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi16_epi8() {
+ let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+ let a = _mm_set1_epi16(2);
+ let r = _mm_mask_cvtepi16_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi16_epi8(src, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi16_epi8() {
+ let a = _mm_set1_epi16(2);
+ let r = _mm_maskz_cvtepi16_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi16_epi8(0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cvtsepi16_epi8() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_cvtsepi16_epi8(a);
+ let e = _mm256_set1_epi8(i8::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtsepi16_epi8() {
+ let src = _mm256_set1_epi8(1);
+ let a = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_mask_cvtsepi16_epi8(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtsepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(i8::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cvtsepi16_epi8() {
+ let a = _mm256_set1_epi16(i16::MAX);
+ let r = _mm256_cvtsepi16_epi8(a);
+ let e = _mm_set1_epi8(i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi16_epi8() {
+ let src = _mm_set1_epi8(1);
+ let a = _mm256_set1_epi16(i16::MAX);
+ let r = _mm256_mask_cvtsepi16_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtsepi16_epi8(src, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtsepi16_epi8() {
+ let a = _mm256_set1_epi16(i16::MAX);
+ let r = _mm256_maskz_cvtsepi16_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtsepi16_epi8(0b11111111_11111111, a);
+ let e = _mm_set1_epi8(i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cvtsepi16_epi8() {
+ let a = _mm_set1_epi16(i16::MAX);
+ let r = _mm_cvtsepi16_epi8(a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi16_epi8() {
+ let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+ let a = _mm_set1_epi16(i16::MAX);
+ let r = _mm_mask_cvtsepi16_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtsepi16_epi8(src, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_cvtsepi16_epi8() {
+ let a = _mm_set1_epi16(i16::MAX);
+ let r = _mm_maskz_cvtsepi16_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtsepi16_epi8(0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_cvtsepi16_epi8() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let r = _mm512_maskz_cvtsepi16_epi8(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtsepi16_epi8(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(i8::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cvtusepi16_epi8() {
+ let a = _mm512_set1_epi16(i16::MIN);
+ let r = _mm512_cvtusepi16_epi8(a);
+ let e = _mm256_set1_epi8(-1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtusepi16_epi8() {
+ let src = _mm256_set1_epi8(1);
+ let a = _mm512_set1_epi16(i16::MIN);
+ let r = _mm512_mask_cvtusepi16_epi8(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtusepi16_epi8(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(-1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_cvtusepi16_epi8() {
+ let a = _mm512_set1_epi16(i16::MIN);
+ let r = _mm512_maskz_cvtusepi16_epi8(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtusepi16_epi8(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm256_set1_epi8(-1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_cvtusepi16_epi8() {
+ let a = _mm256_set1_epi16(i16::MIN);
+ let r = _mm256_cvtusepi16_epi8(a);
+ let e = _mm_set1_epi8(-1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi16_epi8() {
+ let src = _mm_set1_epi8(1);
+ let a = _mm256_set1_epi16(i16::MIN);
+ let r = _mm256_mask_cvtusepi16_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtusepi16_epi8(src, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(-1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtusepi16_epi8() {
+ let a = _mm256_set1_epi16(i16::MIN);
+ let r = _mm256_maskz_cvtusepi16_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtusepi16_epi8(0b11111111_11111111, a);
+ let e = _mm_set1_epi8(-1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_cvtusepi16_epi8() {
+ let a = _mm_set1_epi16(i16::MIN);
+ let r = _mm_cvtusepi16_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi16_epi8() {
+ let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+ let a = _mm_set1_epi16(i16::MIN);
+ let r = _mm_mask_cvtusepi16_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtusepi16_epi8(src, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_cvtusepi16_epi8() {
+ let a = _mm_set1_epi16(i16::MIN);
+ let r = _mm_maskz_cvtusepi16_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtusepi16_epi8(0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cvtepi8_epi16() {
+ let a = _mm256_set1_epi8(2);
+ let r = _mm512_cvtepi8_epi16(a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtepi8_epi16() {
+ let src = _mm512_set1_epi16(1);
+ let a = _mm256_set1_epi8(2);
+ let r = _mm512_mask_cvtepi8_epi16(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepi8_epi16(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_cvtepi8_epi16() {
+ let a = _mm256_set1_epi8(2);
+ let r = _mm512_maskz_cvtepi8_epi16(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepi8_epi16(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi8_epi16() {
+ let src = _mm256_set1_epi16(1);
+ let a = _mm_set1_epi8(2);
+ let r = _mm256_mask_cvtepi8_epi16(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepi8_epi16(src, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi8_epi16() {
+ let a = _mm_set1_epi8(2);
+ let r = _mm256_maskz_cvtepi8_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepi8_epi16(0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi8_epi16() {
+ let src = _mm_set1_epi16(1);
+ let a = _mm_set1_epi8(2);
+ let r = _mm_mask_cvtepi8_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi8_epi16(src, 0b11111111, a);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi8_epi16() {
+ let a = _mm_set1_epi8(2);
+ let r = _mm_maskz_cvtepi8_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi8_epi16(0b11111111, a);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_cvtepu8_epi16() {
+ let a = _mm256_set1_epi8(2);
+ let r = _mm512_cvtepu8_epi16(a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtepu8_epi16() {
+ let src = _mm512_set1_epi16(1);
+ let a = _mm256_set1_epi8(2);
+ let r = _mm512_mask_cvtepu8_epi16(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepu8_epi16(src, 0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_cvtepu8_epi16() {
+ let a = _mm256_set1_epi8(2);
+ let r = _mm512_maskz_cvtepu8_epi16(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepu8_epi16(0b11111111_11111111_11111111_11111111, a);
+ let e = _mm512_set1_epi16(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepu8_epi16() {
+ let src = _mm256_set1_epi16(1);
+ let a = _mm_set1_epi8(2);
+ let r = _mm256_mask_cvtepu8_epi16(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepu8_epi16(src, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepu8_epi16() {
+ let a = _mm_set1_epi8(2);
+ let r = _mm256_maskz_cvtepu8_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepu8_epi16(0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cvtepu8_epi16() {
+ let src = _mm_set1_epi16(1);
+ let a = _mm_set1_epi8(2);
+ let r = _mm_mask_cvtepu8_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepu8_epi16(src, 0b11111111, a);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepu8_epi16() {
+ let a = _mm_set1_epi8(2);
+ let r = _mm_maskz_cvtepu8_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepu8_epi16(0b11111111, a);
+ let e = _mm_set1_epi16(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_bslli_epi128() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ );
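+ // shift each 128-bit lane left by 9 bytes: of a's nonzero bytes (3, 7, 11,
+ // 15 per lane) only byte 3 stays in range, landing in byte 12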
+ let r = _mm512_bslli_epi128::<9>(a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_bsrli_epi128() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ );
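+ // shift each 128-bit lane right by 3 bytes: the low three bytes drop out
+ // and zeros fill in from the top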
+ let r = _mm512_bsrli_epi128::<3>(a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 0, 0, 0, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 0, 0, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_alignr_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ );
+ let b = _mm512_set1_epi8(1);
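+ // per 128-bit lane, alignr::<14> right-shifts the 32-byte concatenation a:b
+ // by 14 bytes: result bytes 0-1 are b's bytes 14-15, bytes 2-15 are a's bytes 0-13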
+ let r = _mm512_alignr_epi8::<14>(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_alignr_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ );
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_mask_alignr_epi8::<14>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_alignr_epi8::<14>(
+ a,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_maskz_alignr_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ );
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_maskz_alignr_epi8::<14>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_alignr_epi8::<14>(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_alignr_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ );
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_mask_alignr_epi8::<14>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_alignr_epi8::<14>(a, 0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_alignr_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ );
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_maskz_alignr_epi8::<14>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_alignr_epi8::<14>(0b11111111_11111111_11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_alignr_epi8() {
+ let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_mask_alignr_epi8::<14>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_alignr_epi8::<14>(a, 0b11111111_11111111, a, b);
+ let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_alignr_epi8() {
+ let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_maskz_alignr_epi8::<14>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_alignr_epi8::<14>(0b11111111_11111111, a, b);
+ let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtsepi16_storeu_epi8() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtsepi16_storeu_epi8(
+ &mut r as *mut _ as *mut i8,
+ 0b11111111_11111111_11111111_11111111,
+ a,
+ );
+ let e = _mm256_set1_epi8(i8::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi16_storeu_epi8() {
+ let a = _mm256_set1_epi16(i16::MAX);
+ let mut r = _mm_undefined_si128();
+ _mm256_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi16_storeu_epi8() {
+ let a = _mm_set1_epi16(i16::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtepi16_storeu_epi8() {
+ let a = _mm512_set1_epi16(8);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtepi16_storeu_epi8(
+ &mut r as *mut _ as *mut i8,
+ 0b11111111_11111111_11111111_11111111,
+ a,
+ );
+ let e = _mm256_set1_epi8(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi16_storeu_epi8() {
+ let a = _mm256_set1_epi16(8);
+ let mut r = _mm_undefined_si128();
+ _mm256_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi16_storeu_epi8() {
+ let a = _mm_set1_epi16(8);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw")]
+ unsafe fn test_mm512_mask_cvtusepi16_storeu_epi8() {
+ let a = _mm512_set1_epi16(i16::MAX);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtusepi16_storeu_epi8(
+ &mut r as *mut _ as *mut i8,
+ 0b11111111_11111111_11111111_11111111,
+ a,
+ );
+ let e = _mm256_set1_epi8(u8::MAX as i8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi16_storeu_epi8() {
+ let a = _mm256_set1_epi16(i16::MAX);
+ let mut r = _mm_undefined_si128();
+ _mm256_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi16_storeu_epi8() {
+ let a = _mm_set1_epi16(i16::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
+ u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
+ );
+ assert_eq_m128i(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512cd.rs b/library/stdarch/crates/core_arch/src/x86/avx512cd.rs
new file mode 100644
index 000000000..ac9d3aed3
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512cd.rs
@@ -0,0 +1,1170 @@
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Broadcast the low 16 bits from input mask k to all 32-bit elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastmw_epi32&expand=553)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d
+pub unsafe fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i {
+ _mm512_set1_epi32(k as i32)
+}
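+
+// Illustration (an editorial sketch, not part of the upstream patch): since the
+// implementation zero-extends `k` to i32 and splats it, a mask such as
+// 0b10101010_10101010 (0xAAAA) produces a vector whose sixteen lanes all hold
+// 0x0000AAAA:
+//
+//     let v = _mm512_broadcastmw_epi32(0b10101010_10101010);
+//     // every 32-bit lane of v == 0x0000AAAA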
+
+/// Broadcast the low 16 bits from input mask k to all 32-bit elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastmw_epi32&expand=552)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d
+pub unsafe fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i {
+ _mm256_set1_epi32(k as i32)
+}
+
+/// Broadcast the low 16 bits from input mask k to all 32-bit elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastmw_epi32&expand=551)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d
+pub unsafe fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i {
+ _mm_set1_epi32(k as i32)
+}
+
+/// Broadcast the low 8 bits from input mask k to all 64-bit elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastmb_epi64&expand=550)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q
+pub unsafe fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i {
+ _mm512_set1_epi64(k as i64)
+}
+
+/// Broadcast the low 8 bits from input mask k to all 64-bit elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastmb_epi64&expand=549)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q
+pub unsafe fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i {
+ _mm256_set1_epi64x(k as i64)
+}
+
+/// Broadcast the low 8 bits from input mask k to all 64-bit elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastmb_epi64&expand=548)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q
+pub unsafe fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i {
+ _mm_set1_epi64x(k as i64)
+}
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit. Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conflict_epi32&expand=1248)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm512_conflict_epi32(a: __m512i) -> __m512i {
+ transmute(vpconflictd(a.as_i32x16()))
+}
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit, using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conflict_epi32&expand=1249)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+ let conflict = _mm512_conflict_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, conflict, src.as_i32x16()))
+}
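+
+// Editorial note (not in upstream stdarch): `simd_select_bitmask(k, x, y)`
+// yields lane i of `x` when bit i of `k` is set and lane i of `y` otherwise;
+// every mask/maskz wrapper in this file is a thin application of it, e.g.
+// `_mm512_mask_conflict_epi32(src, 0, a)` simply returns `src`.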
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conflict_epi32&expand=1250)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i {
+ let conflict = _mm512_conflict_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, conflict, zero))
+}
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit. Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conflict_epi32&expand=1245)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm256_conflict_epi32(a: __m256i) -> __m256i {
+ transmute(vpconflictd256(a.as_i32x8()))
+}
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit, using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conflict_epi32&expand=1246)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ let conflict = _mm256_conflict_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, conflict, src.as_i32x8()))
+}
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conflict_epi32&expand=1247)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i {
+ let conflict = _mm256_conflict_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, conflict, zero))
+}
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit. Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conflict_epi32&expand=1242)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm_conflict_epi32(a: __m128i) -> __m128i {
+ transmute(vpconflictd128(a.as_i32x4()))
+}
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit, using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conflict_epi32&expand=1243)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let conflict = _mm_conflict_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, conflict, src.as_i32x4()))
+}
+
+/// Test each 32-bit element of a for equality with all other elements in a that are closer to the least significant bit, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conflict_epi32&expand=1244)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictd))]
+pub unsafe fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let conflict = _mm_conflict_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, conflict, zero))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit. Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conflict_epi64&expand=1257)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm512_conflict_epi64(a: __m512i) -> __m512i {
+ transmute(vpconflictq(a.as_i64x8()))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit, using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conflict_epi64&expand=1258)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+ let conflict = _mm512_conflict_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, conflict, src.as_i64x8()))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conflict_epi64&expand=1259)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i {
+ let conflict = _mm512_conflict_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, conflict, zero))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit. Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conflict_epi64&expand=1254)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm256_conflict_epi64(a: __m256i) -> __m256i {
+ transmute(vpconflictq256(a.as_i64x4()))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit, using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conflict_epi64&expand=1255)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ let conflict = _mm256_conflict_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, conflict, src.as_i64x4()))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conflict_epi64&expand=1256)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i {
+ let conflict = _mm256_conflict_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, conflict, zero))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit. Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conflict_epi64&expand=1251)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm_conflict_epi64(a: __m128i) -> __m128i {
+ transmute(vpconflictq128(a.as_i64x2()))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit, using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conflict_epi64&expand=1252)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let conflict = _mm_conflict_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, conflict, src.as_i64x2()))
+}
+
+/// Test each 64-bit element of a for equality with all other elements in a that are closer to the least significant bit, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conflict_epi64&expand=1253)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vpconflictq))]
+pub unsafe fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let conflict = _mm_conflict_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, conflict, zero))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_lzcnt_epi32&expand=3491)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i {
+ transmute(vplzcntd(a.as_i32x16(), false))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_lzcnt_epi32&expand=3492)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+ let zerocount = _mm512_lzcnt_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, zerocount, src.as_i32x16()))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_lzcnt_epi32&expand=3493)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
+ let zerocount = _mm512_lzcnt_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, zerocount, zero))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_lzcnt_epi32&expand=3488)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i {
+ transmute(vplzcntd256(a.as_i32x8(), false))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_lzcnt_epi32&expand=3489)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ let zerocount = _mm256_lzcnt_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, zerocount, src.as_i32x8()))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_lzcnt_epi32&expand=3490)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
+ let zerocount = _mm256_lzcnt_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, zerocount, zero))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lzcnt_epi32&expand=3485)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm_lzcnt_epi32(a: __m128i) -> __m128i {
+ transmute(vplzcntd128(a.as_i32x4(), false))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_lzcnt_epi32&expand=3486)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let zerocount = _mm_lzcnt_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, zerocount, src.as_i32x4()))
+}
+
+/// Count the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_lzcnt_epi32&expand=3487)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntd))]
+pub unsafe fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let zerocount = _mm_lzcnt_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, zerocount, zero))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_lzcnt_epi64&expand=3500)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i {
+ transmute(vplzcntq(a.as_i64x8(), false))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_lzcnt_epi64&expand=3501)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+ let zerocount = _mm512_lzcnt_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, zerocount, src.as_i64x8()))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_lzcnt_epi64&expand=3502)
+#[inline]
+#[target_feature(enable = "avx512cd")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
+ let zerocount = _mm512_lzcnt_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, zerocount, zero))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_lzcnt_epi64&expand=3497)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i {
+ transmute(vplzcntq256(a.as_i64x4(), false))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_lzcnt_epi64&expand=3498)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ let zerocount = _mm256_lzcnt_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, zerocount, src.as_i64x4()))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_lzcnt_epi64&expand=3499)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
+ let zerocount = _mm256_lzcnt_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, zerocount, zero))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lzcnt_epi64&expand=3494)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm_lzcnt_epi64(a: __m128i) -> __m128i {
+ transmute(vplzcntq128(a.as_i64x2(), false))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_lzcnt_epi64&expand=3495)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let zerocount = _mm_lzcnt_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, zerocount, src.as_i64x2()))
+}
+
+/// Count the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_lzcnt_epi64&expand=3496)
+#[inline]
+#[target_feature(enable = "avx512cd,avx512vl")]
+#[cfg_attr(test, assert_instr(vplzcntq))]
+pub unsafe fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let zerocount = _mm_lzcnt_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, zerocount, zero))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx512.conflict.d.512"]
+ fn vpconflictd(a: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.conflict.d.256"]
+ fn vpconflictd256(a: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.conflict.d.128"]
+ fn vpconflictd128(a: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.conflict.q.512"]
+ fn vpconflictq(a: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.conflict.q.256"]
+ fn vpconflictq256(a: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx512.conflict.q.128"]
+ fn vpconflictq128(a: i64x2) -> i64x2;
+
+ #[link_name = "llvm.ctlz.v16i32"]
+ fn vplzcntd(a: i32x16, nonzero: bool) -> i32x16;
+ #[link_name = "llvm.ctlz.v8i32"]
+ fn vplzcntd256(a: i32x8, nonzero: bool) -> i32x8;
+ #[link_name = "llvm.ctlz.v4i32"]
+ fn vplzcntd128(a: i32x4, nonzero: bool) -> i32x4;
+
+ #[link_name = "llvm.ctlz.v8i64"]
+ fn vplzcntq(a: i64x8, nonzero: bool) -> i64x8;
+ #[link_name = "llvm.ctlz.v4i64"]
+ fn vplzcntq256(a: i64x4, nonzero: bool) -> i64x4;
+ #[link_name = "llvm.ctlz.v2i64"]
+ fn vplzcntq128(a: i64x2, nonzero: bool) -> i64x2;
+}
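+
+// Editorial note (not in upstream stdarch): the trailing `bool` on the
+// `llvm.ctlz.*` bindings above is LLVM's "zero is poison" flag. It is passed
+// as `false` at every call site, so a zero element is well defined and
+// reports a full element width of leading zeros.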
+
+#[cfg(test)]
+mod tests {
+
+ use crate::core_arch::x86::*;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_broadcastmw_epi32() {
+ let a: __mmask16 = 2;
+ let r = _mm512_broadcastmw_epi32(a);
+ let e = _mm512_set1_epi32(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_broadcastmw_epi32() {
+ let a: __mmask16 = 2;
+ let r = _mm256_broadcastmw_epi32(a);
+ let e = _mm256_set1_epi32(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_broadcastmw_epi32() {
+ let a: __mmask16 = 2;
+ let r = _mm_broadcastmw_epi32(a);
+ let e = _mm_set1_epi32(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_broadcastmb_epi64() {
+ let a: __mmask8 = 2;
+ let r = _mm512_broadcastmb_epi64(a);
+ let e = _mm512_set1_epi64(2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_broadcastmb_epi64() {
+ let a: __mmask8 = 2;
+ let r = _mm256_broadcastmb_epi64(a);
+ let e = _mm256_set1_epi64x(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_broadcastmb_epi64() {
+ let a: __mmask8 = 2;
+ let r = _mm_broadcastmb_epi64(a);
+ let e = _mm_set1_epi64x(2);
+ assert_eq_m128i(r, e);
+ }
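+
+ // Editorial sketch (not in upstream stdarch): the mask value is
+ // zero-extended into each lane, so a multi-bit mask such as 0b101 shows up
+ // as the plain integer 5. Assumes the same harness and helpers as the
+ // surrounding tests.
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_broadcastmw_epi32_multibit() {
+ let k: __mmask16 = 0b101;
+ let r = _mm512_broadcastmw_epi32(k);
+ assert_eq_m512i(r, _mm512_set1_epi32(5));
+ }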
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_conflict_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let r = _mm512_conflict_epi32(a);
+ let e = _mm512_set_epi32(
+ 1 << 14
+ | 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_mask_conflict_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let r = _mm512_mask_conflict_epi32(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_conflict_epi32(a, 0b11111111_11111111, a);
+ let e = _mm512_set_epi32(
+ 1 << 14
+ | 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_maskz_conflict_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let r = _mm512_maskz_conflict_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_conflict_epi32(0b11111111_11111111, a);
+ let e = _mm512_set_epi32(
+ 1 << 14
+ | 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 13
+ | 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 12
+ | 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 11
+ | 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 10
+ | 1 << 9
+ | 1 << 8
+ | 1 << 7
+ | 1 << 6
+ | 1 << 5
+ | 1 << 4
+ | 1 << 3
+ | 1 << 2
+ | 1 << 1
+ | 1 << 0,
+ 1 << 9 | 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 8 | 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_conflict_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let r = _mm256_conflict_epi32(a);
+ let e = _mm256_set_epi32(
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_mask_conflict_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let r = _mm256_mask_conflict_epi32(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_conflict_epi32(a, 0b11111111, a);
+ let e = _mm256_set_epi32(
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_maskz_conflict_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_conflict_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_conflict_epi32(0b11111111, a);
+ let e = _mm256_set_epi32(
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_conflict_epi32() {
+ let a = _mm_set1_epi32(1);
+ let r = _mm_conflict_epi32(a);
+ let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_mask_conflict_epi32() {
+ let a = _mm_set1_epi32(1);
+ let r = _mm_mask_conflict_epi32(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_conflict_epi32(a, 0b00001111, a);
+ let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_maskz_conflict_epi32() {
+ let a = _mm_set1_epi32(1);
+ let r = _mm_maskz_conflict_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_conflict_epi32(0b00001111, a);
+ let e = _mm_set_epi32(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0);
+ assert_eq_m128i(r, e);
+ }
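+
+ // Editorial sketch (not in upstream stdarch): with all-distinct lane
+ // values no lane can match an earlier one, so every conflict bit vector
+ // is zero. Assumes the same harness as the surrounding tests.
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_conflict_epi32_distinct() {
+ let a = _mm_set_epi32(4, 3, 2, 1);
+ let r = _mm_conflict_epi32(a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ }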
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_conflict_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let r = _mm512_conflict_epi64(a);
+ let e = _mm512_set_epi64(
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_mask_conflict_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let r = _mm512_mask_conflict_epi64(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_conflict_epi64(a, 0b11111111, a);
+ let e = _mm512_set_epi64(
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_maskz_conflict_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let r = _mm512_maskz_conflict_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_conflict_epi64(0b11111111, a);
+ let e = _mm512_set_epi64(
+ 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 2 | 1 << 1 | 1 << 0,
+ 1 << 1 | 1 << 0,
+ 1 << 0,
+ 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_conflict_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let r = _mm256_conflict_epi64(a);
+ let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_mask_conflict_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let r = _mm256_mask_conflict_epi64(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_conflict_epi64(a, 0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_maskz_conflict_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let r = _mm256_maskz_conflict_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_conflict_epi64(0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 2 | 1 << 1 | 1 << 0, 1 << 1 | 1 << 0, 1 << 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_conflict_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let r = _mm_conflict_epi64(a);
+ let e = _mm_set_epi64x(1 << 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_mask_conflict_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let r = _mm_mask_conflict_epi64(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_conflict_epi64(a, 0b00000011, a);
+ let e = _mm_set_epi64x(1 << 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_maskz_conflict_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let r = _mm_maskz_conflict_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_conflict_epi64(0b00000011, a);
+ let e = _mm_set_epi64x(1 << 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_lzcnt_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let r = _mm512_lzcnt_epi32(a);
+ let e = _mm512_set1_epi32(31);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_mask_lzcnt_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let r = _mm512_mask_lzcnt_epi32(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_lzcnt_epi32(a, 0b11111111_11111111, a);
+ let e = _mm512_set1_epi32(31);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_maskz_lzcnt_epi32() {
+ let a = _mm512_set1_epi32(2);
+ let r = _mm512_maskz_lzcnt_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_lzcnt_epi32(0b11111111_11111111, a);
+ let e = _mm512_set1_epi32(30);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_lzcnt_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let r = _mm256_lzcnt_epi32(a);
+ let e = _mm256_set1_epi32(31);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_mask_lzcnt_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let r = _mm256_mask_lzcnt_epi32(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_lzcnt_epi32(a, 0b11111111, a);
+ let e = _mm256_set1_epi32(31);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_maskz_lzcnt_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_lzcnt_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_lzcnt_epi32(0b11111111, a);
+ let e = _mm256_set1_epi32(31);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_lzcnt_epi32() {
+ let a = _mm_set1_epi32(1);
+ let r = _mm_lzcnt_epi32(a);
+ let e = _mm_set1_epi32(31);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_mask_lzcnt_epi32() {
+ let a = _mm_set1_epi32(1);
+ let r = _mm_mask_lzcnt_epi32(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_lzcnt_epi32(a, 0b00001111, a);
+ let e = _mm_set1_epi32(31);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_maskz_lzcnt_epi32() {
+ let a = _mm_set1_epi32(1);
+ let r = _mm_maskz_lzcnt_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_lzcnt_epi32(0b00001111, a);
+ let e = _mm_set1_epi32(31);
+ assert_eq_m128i(r, e);
+ }
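+
+ // Editorial sketch (not in upstream stdarch): unlike scalar ctlz, a zero
+ // element is well defined here and yields the full element width, 32.
+ // Assumes the same harness as the surrounding tests.
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_lzcnt_epi32_zero() {
+ let r = _mm_lzcnt_epi32(_mm_setzero_si128());
+ assert_eq_m128i(r, _mm_set1_epi32(32));
+ }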
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_lzcnt_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let r = _mm512_lzcnt_epi64(a);
+ let e = _mm512_set1_epi64(63);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_mask_lzcnt_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let r = _mm512_mask_lzcnt_epi64(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_lzcnt_epi64(a, 0b11111111, a);
+ let e = _mm512_set1_epi64(63);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd")]
+ unsafe fn test_mm512_maskz_lzcnt_epi64() {
+ let a = _mm512_set1_epi64(2);
+ let r = _mm512_maskz_lzcnt_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_lzcnt_epi64(0b11111111, a);
+ let e = _mm512_set1_epi64(62);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_lzcnt_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let r = _mm256_lzcnt_epi64(a);
+ let e = _mm256_set1_epi64x(63);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_mask_lzcnt_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let r = _mm256_mask_lzcnt_epi64(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_lzcnt_epi64(a, 0b00001111, a);
+ let e = _mm256_set1_epi64x(63);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm256_maskz_lzcnt_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let r = _mm256_maskz_lzcnt_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_lzcnt_epi64(0b00001111, a);
+ let e = _mm256_set1_epi64x(63);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_lzcnt_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let r = _mm_lzcnt_epi64(a);
+ let e = _mm_set1_epi64x(63);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_mask_lzcnt_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let r = _mm_mask_lzcnt_epi64(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_lzcnt_epi64(a, 0b00001111, a);
+ let e = _mm_set1_epi64x(63);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512cd,avx512vl")]
+ unsafe fn test_mm_maskz_lzcnt_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let r = _mm_maskz_lzcnt_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_lzcnt_epi64(0b00001111, a);
+ let e = _mm_set1_epi64x(63);
+ assert_eq_m128i(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
new file mode 100644
index 000000000..f70a28466
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -0,0 +1,55830 @@
+use crate::{
+ arch::asm,
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::{self, transmute},
+ ptr,
+};
+
+// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
+// register name (e.g. rax). We have to explicitly override the placeholder to
+// use the 32-bit register name in that case.
+
+#[cfg(target_pointer_width = "32")]
+macro_rules! vpl {
+ ($inst:expr) => {
+ concat!($inst, ", [{p:e}]")
+ };
+}
+#[cfg(target_pointer_width = "64")]
+macro_rules! vpl {
+ ($inst:expr) => {
+ concat!($inst, ", [{p}]")
+ };
+}
+#[cfg(target_pointer_width = "32")]
+macro_rules! vps {
+ ($inst1:expr, $inst2:expr) => {
+ concat!($inst1, " [{p:e}]", $inst2)
+ };
+}
+#[cfg(target_pointer_width = "64")]
+macro_rules! vps {
+ ($inst1:expr, $inst2:expr) => {
+ concat!($inst1, " [{p}]", $inst2)
+ };
+}
+
+pub(crate) use {vpl, vps};
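+
+// Editorial note (not in upstream stdarch; the instruction string below is
+// only illustrative): on a 64-bit target `vpl!("vmovdqu32 {2}")` expands to
+// `"vmovdqu32 {2}, [{p}]"`, while on a 32-bit target the `:e` modifier
+// renders the pointer operand as its 32-bit register name, giving
+// `"vmovdqu32 {2}, [{p:e}]"`.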
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Computes the absolute values of packed 32-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_epi32&expand=39)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i {
+ let a = a.as_i32x16();
+ // all-0 is a properly initialized i32x16
+ let zero: i32x16 = mem::zeroed();
+ let sub = simd_sub(zero, a);
+ let cmp: i32x16 = simd_gt(a, zero);
+ transmute(simd_select(cmp, a, sub))
+}
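+
+// Editorial note (not in upstream stdarch): the select above computes
+// `if a > 0 { a } else { 0 - a }` per lane with wrapping arithmetic, so, as
+// with the hardware vpabsd, i32::MIN maps to itself (it has no positive
+// counterpart in two's complement).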
+
+/// Computes the absolute value of packed 32-bit integers in `a`, and stores the
+/// unsigned results in `dst` using writemask `k` (elements are copied from
+/// `src` when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_epi32&expand=40)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+ let abs = _mm512_abs_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, abs, src.as_i32x16()))
+}
+
+/// Computes the absolute value of packed 32-bit integers in `a`, and stores the
+/// unsigned results in `dst` using zeromask `k` (elements are zeroed out when
+/// the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_abs_epi32&expand=41)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i {
+ let abs = _mm512_abs_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_abs_epi32&expand=37)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm256_mask_abs_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ let abs = _mm256_abs_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, abs, src.as_i32x8()))
+}
+
+/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_abs_epi32&expand=38)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm256_maskz_abs_epi32(k: __mmask8, a: __m256i) -> __m256i {
+ let abs = _mm256_abs_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_abs_epi32&expand=34)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm_mask_abs_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let abs = _mm_abs_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, abs, src.as_i32x4()))
+}
+
+/// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_abs_epi32&expand=35)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+pub unsafe fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let abs = _mm_abs_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_epi64&expand=48)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsq))]
+pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i {
+ let a = a.as_i64x8();
+ // all-0 is a properly initialized i64x8
+ let zero: i64x8 = mem::zeroed();
+ let sub = simd_sub(zero, a);
+ let cmp: i64x8 = simd_gt(a, zero);
+ transmute(simd_select(cmp, a, sub))
+}
+
+/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_epi64&expand=49)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsq))]
+pub unsafe fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+ let abs = _mm512_abs_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, abs, src.as_i64x8()))
+}
+
+/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_abs_epi64&expand=50)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpabsq))]
+pub unsafe fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i {
+ let abs = _mm512_abs_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi64&expand=45)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsq))]
+pub unsafe fn _mm256_abs_epi64(a: __m256i) -> __m256i {
+ let a = a.as_i64x4();
+ // all-0 is a properly initialized i64x4
+ let zero: i64x4 = mem::zeroed();
+ let sub = simd_sub(zero, a);
+ let cmp: i64x4 = simd_gt(a, zero);
+ transmute(simd_select(cmp, a, sub))
+}
+
+/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_abs_epi64&expand=46)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsq))]
+pub unsafe fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ let abs = _mm256_abs_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, abs, src.as_i64x4()))
+}
+
+/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_abs_epi64&expand=47)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpabsq))]
+pub unsafe fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i {
+ let abs = _mm256_abs_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, abs, zero))
+}
+
+/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ps&expand=65)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 {
+ let a = _mm512_set1_epi32(0x7FFFFFFF); // from LLVM code
+ let b = transmute::<f32x16, __m512i>(v2.as_f32x16());
+ let abs = _mm512_and_epi32(a, b);
+ transmute(abs)
+}
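+
+// Editorial note (not in upstream stdarch): AND-ing with 0x7FFF_FFFF clears
+// only the IEEE-754 sign bit of each f32 lane, which is exactly |x| for every
+// input, including -0.0, infinities, and NaNs (whose payloads pass through).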
+
+/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_ps&expand=66)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 {
+ let abs = _mm512_abs_ps(v2).as_f32x16();
+ transmute(simd_select_bitmask(k, abs, src.as_f32x16()))
+}
+
+/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_pd&expand=60)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d {
+ let a = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF); // from LLVM code
+ let b = transmute::<f64x8, __m512i>(v2.as_f64x8());
+ let abs = _mm512_and_epi64(a, b);
+ transmute(abs)
+}
+
+/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_pd&expand=61)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d {
+ let abs = _mm512_abs_pd(v2).as_f64x8();
+ transmute(simd_select_bitmask(k, abs, src.as_f64x8()))
+}
+
+/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi32&expand=3801)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa32))]
+pub unsafe fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+ let mov = a.as_i32x16();
+ transmute(simd_select_bitmask(k, mov, src.as_i32x16()))
+}
+
+/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi32&expand=3802)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa32))]
+pub unsafe fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i {
+ let mov = a.as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_epi32&expand=3799)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa32))]
+pub unsafe fn _mm256_mask_mov_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ let mov = a.as_i32x8();
+ transmute(simd_select_bitmask(k, mov, src.as_i32x8()))
+}
+
+/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_epi32&expand=3800)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa32))]
+pub unsafe fn _mm256_maskz_mov_epi32(k: __mmask8, a: __m256i) -> __m256i {
+ let mov = a.as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_epi32&expand=3797)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa32))]
+pub unsafe fn _mm_mask_mov_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let mov = a.as_i32x4();
+ transmute(simd_select_bitmask(k, mov, src.as_i32x4()))
+}
+
+/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_epi32&expand=3798)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa32))]
+pub unsafe fn _mm_maskz_mov_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let mov = a.as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=3807)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa64))]
+pub unsafe fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+ let mov = a.as_i64x8();
+ transmute(simd_select_bitmask(k, mov, src.as_i64x8()))
+}
+
+/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=3808)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa64))]
+pub unsafe fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i {
+ let mov = a.as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_epi64&expand=3805)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa64))]
+pub unsafe fn _mm256_mask_mov_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ let mov = a.as_i64x4();
+ transmute(simd_select_bitmask(k, mov, src.as_i64x4()))
+}
+
+/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_epi64&expand=3806)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa64))]
+pub unsafe fn _mm256_maskz_mov_epi64(k: __mmask8, a: __m256i) -> __m256i {
+ let mov = a.as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_epi64&expand=3803)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa64))]
+pub unsafe fn _mm_mask_mov_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let mov = a.as_i64x2();
+ transmute(simd_select_bitmask(k, mov, src.as_i64x2()))
+}
+
+/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_epi64&expand=3804)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa64))]
+pub unsafe fn _mm_maskz_mov_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let mov = a.as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=3825)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ let mov = a.as_f32x16();
+ transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
+}
+
+/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=3826)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 {
+ let mov = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_ps&expand=3823)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm256_mask_mov_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ let mov = a.as_f32x8();
+ transmute(simd_select_bitmask(k, mov, src.as_f32x8()))
+}
+
+/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_ps&expand=3824)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm256_maskz_mov_ps(k: __mmask8, a: __m256) -> __m256 {
+ let mov = a.as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_ps&expand=3821)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm_mask_mov_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ let mov = a.as_f32x4();
+ transmute(simd_select_bitmask(k, mov, src.as_f32x4()))
+}
+
+/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_ps&expand=3822)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm_maskz_mov_ps(k: __mmask8, a: __m128) -> __m128 {
+ let mov = a.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=3819)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovapd))]
+pub unsafe fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ let mov = a.as_f64x8();
+ transmute(simd_select_bitmask(k, mov, src.as_f64x8()))
+}
+
+/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=3820)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovapd))]
+pub unsafe fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d {
+ let mov = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_pd&expand=3817)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovapd))]
+pub unsafe fn _mm256_mask_mov_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+ let mov = a.as_f64x4();
+ transmute(simd_select_bitmask(k, mov, src.as_f64x4()))
+}
+
+/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_pd&expand=3818)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovapd))]
+pub unsafe fn _mm256_maskz_mov_pd(k: __mmask8, a: __m256d) -> __m256d {
+ let mov = a.as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_pd&expand=3815)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovapd))]
+pub unsafe fn _mm_mask_mov_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+ let mov = a.as_f64x2();
+ transmute(simd_select_bitmask(k, mov, src.as_f64x2()))
+}
+
+/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_pd&expand=3816)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovapd))]
+pub unsafe fn _mm_maskz_mov_pd(k: __mmask8, a: __m128d) -> __m128d {
+ let mov = a.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Add packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_epi32&expand=100)
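+///
+/// A minimal sketch (illustrative; like the underlying `vpaddd` instruction,
+/// the addition wraps on overflow):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(i32::MAX);
+/// let b = _mm512_set1_epi32(1);
+/// let r = _mm512_add_epi32(a, b); // every lane wraps to i32::MIN
+/// ```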
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+pub unsafe fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_add(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_epi32&expand=101)
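+///
+/// A minimal sketch of the writemask form (illustrative):
+///
+/// ```ignore
+/// let src = _mm512_setzero_si512();
+/// let a = _mm512_set1_epi32(2);
+/// let b = _mm512_set1_epi32(3);
+/// let r = _mm512_mask_add_epi32(src, 0b11111111_00000000, a, b);
+/// // Lanes 8..16 hold 2 + 3 = 5; lanes 0..8 are copied from `src` (0).
+/// ```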
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+pub unsafe fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let add = _mm512_add_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, add, src.as_i32x16()))
+}
+
+/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_epi32&expand=102)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+pub unsafe fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let add = _mm512_add_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_epi32&expand=98)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+pub unsafe fn _mm256_mask_add_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let add = _mm256_add_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, add, src.as_i32x8()))
+}
+
+/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_epi32&expand=99)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+pub unsafe fn _mm256_maskz_add_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let add = _mm256_add_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_epi32&expand=95)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+pub unsafe fn _mm_mask_add_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let add = _mm_add_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, add, src.as_i32x4()))
+}
+
+/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_epi32&expand=96)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+pub unsafe fn _mm_maskz_add_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let add = _mm_add_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 64-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_epi64&expand=109)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+pub unsafe fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_add(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_epi64&expand=110)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+pub unsafe fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let add = _mm512_add_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, add, src.as_i64x8()))
+}
+
+/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_epi64&expand=111)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+pub unsafe fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let add = _mm512_add_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_epi64&expand=107)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+pub unsafe fn _mm256_mask_add_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let add = _mm256_add_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, add, src.as_i64x4()))
+}
+
+/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_epi64&expand=108)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+pub unsafe fn _mm256_maskz_add_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let add = _mm256_add_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_epi64&expand=104)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+pub unsafe fn _mm_mask_add_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let add = _mm_add_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, add, src.as_i64x2()))
+}
+
+/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_epi64&expand=105)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+pub unsafe fn _mm_maskz_add_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let add = _mm_add_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ps&expand=139)
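+///
+/// A minimal sketch (illustrative; ordinary IEEE-754 single-precision
+/// addition, lane by lane):
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(1.25);
+/// let b = _mm512_set1_ps(2.5);
+/// let r = _mm512_add_ps(a, b); // every lane holds 3.75
+/// ```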
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddps))]
+pub unsafe fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 {
+ transmute(simd_add(a.as_f32x16(), b.as_f32x16()))
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ps&expand=140)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddps))]
+pub unsafe fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let add = _mm512_add_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, add, src.as_f32x16()))
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ps&expand=141)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddps))]
+pub unsafe fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let add = _mm512_add_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ps&expand=137)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vaddps))]
+pub unsafe fn _mm256_mask_add_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let add = _mm256_add_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, add, src.as_f32x8()))
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ps&expand=138)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vaddps))]
+pub unsafe fn _mm256_maskz_add_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let add = _mm256_add_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ps&expand=134)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vaddps))]
+pub unsafe fn _mm_mask_add_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let add = _mm_add_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, add, src.as_f32x4()))
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ps&expand=135)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vaddps))]
+pub unsafe fn _mm_maskz_add_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let add = _mm_add_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_pd&expand=127)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+pub unsafe fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d {
+ transmute(simd_add(a.as_f64x8(), b.as_f64x8()))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_pd&expand=128)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+pub unsafe fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let add = _mm512_add_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, add, src.as_f64x8()))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_pd&expand=129)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+pub unsafe fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let add = _mm512_add_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_pd&expand=125)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+pub unsafe fn _mm256_mask_add_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let add = _mm256_add_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, add, src.as_f64x4()))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_pd&expand=126)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+pub unsafe fn _mm256_maskz_add_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let add = _mm256_add_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_pd&expand=122)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+pub unsafe fn _mm_mask_add_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let add = _mm_add_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, add, src.as_f64x2()))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_pd&expand=123)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vaddpd))]
+pub unsafe fn _mm_maskz_add_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let add = _mm_add_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, add, zero))
+}
+
+/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi32&expand=5694)
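+///
+/// A minimal sketch (illustrative; like `vpsubd`, the subtraction wraps on
+/// overflow):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(10);
+/// let b = _mm512_set1_epi32(4);
+/// let r = _mm512_sub_epi32(a, b); // every lane holds 6
+/// ```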
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+pub unsafe fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_sub(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_epi32&expand=5692)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+pub unsafe fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let sub = _mm512_sub_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, sub, src.as_i32x16()))
+}
+
+/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_epi32&expand=5693)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+pub unsafe fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let sub = _mm512_sub_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_epi32&expand=5689)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+pub unsafe fn _mm256_mask_sub_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let sub = _mm256_sub_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, sub, src.as_i32x8()))
+}
+
+/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_epi32&expand=5690)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+pub unsafe fn _mm256_maskz_sub_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let sub = _mm256_sub_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_epi32&expand=5686)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+pub unsafe fn _mm_mask_sub_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let sub = _mm_sub_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, sub, src.as_i32x4()))
+}
+
+/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_epi32&expand=5687)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+pub unsafe fn _mm_maskz_sub_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let sub = _mm_sub_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi64&expand=5703)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+pub unsafe fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_sub(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_epi64&expand=5701)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+pub unsafe fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let sub = _mm512_sub_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, sub, src.as_i64x8()))
+}
+
+/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_epi64&expand=5702)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+pub unsafe fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let sub = _mm512_sub_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_epi64&expand=5698)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+pub unsafe fn _mm256_mask_sub_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let sub = _mm256_sub_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, sub, src.as_i64x4()))
+}
+
+/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_epi64&expand=5699)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+pub unsafe fn _mm256_maskz_sub_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let sub = _mm256_sub_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_epi64&expand=5695)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+pub unsafe fn _mm_mask_sub_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let sub = _mm_sub_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, sub, src.as_i64x2()))
+}
+
+/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_epi64&expand=5696)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+pub unsafe fn _mm_maskz_sub_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let sub = _mm_sub_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ps&expand=5733)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubps))]
+pub unsafe fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 {
+ transmute(simd_sub(a.as_f32x16(), b.as_f32x16()))
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ps&expand=5731)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubps))]
+pub unsafe fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let sub = _mm512_sub_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, sub, src.as_f32x16()))
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ps&expand=5732)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubps))]
+pub unsafe fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let sub = _mm512_sub_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ps&expand=5728)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsubps))]
+pub unsafe fn _mm256_mask_sub_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let sub = _mm256_sub_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, sub, src.as_f32x8()))
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ps&expand=5729)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsubps))]
+pub unsafe fn _mm256_maskz_sub_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let sub = _mm256_sub_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ps&expand=5725)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsubps))]
+pub unsafe fn _mm_mask_sub_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let sub = _mm_sub_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, sub, src.as_f32x4()))
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ps&expand=5726)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsubps))]
+pub unsafe fn _mm_maskz_sub_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let sub = _mm_sub_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_pd&expand=5721)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+pub unsafe fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d {
+ transmute(simd_sub(a.as_f64x8(), b.as_f64x8()))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_pd&expand=5719)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+pub unsafe fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let sub = _mm512_sub_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, sub, src.as_f64x8()))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_pd&expand=5720)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+pub unsafe fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let sub = _mm512_sub_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_pd&expand=5716)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+pub unsafe fn _mm256_mask_sub_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let sub = _mm256_sub_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, sub, src.as_f64x4()))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_pd&expand=5717)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+pub unsafe fn _mm256_maskz_sub_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let sub = _mm256_sub_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_pd&expand=5713)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+pub unsafe fn _mm_mask_sub_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let sub = _mm_sub_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, sub, src.as_f64x2()))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_pd&expand=5714)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsubpd))]
+pub unsafe fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let sub = _mm_sub_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, sub, zero))
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_epi32&expand=3907)
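+///
+/// A minimal sketch (illustrative). As with `vpmuldq`, only the low 32 bits
+/// of each 64-bit lane (the even 32-bit lanes) take part in the multiply:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(-2);
+/// let b = _mm512_set1_epi32(3);
+/// let r = _mm512_mul_epi32(a, b);
+/// // Each of the eight 64-bit lanes of `r` holds -6i64.
+/// ```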
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+pub unsafe fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmuldq(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_epi32&expand=3905)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+pub unsafe fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mul_epi32(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, mul, src.as_i64x8()))
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_epi32&expand=3906)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+pub unsafe fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mul_epi32(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_epi32&expand=3902)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+pub unsafe fn _mm256_mask_mul_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mul_epi32(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, mul, src.as_i64x4()))
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_epi32&expand=3903)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+pub unsafe fn _mm256_maskz_mul_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mul_epi32(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_epi32&expand=3899)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+pub unsafe fn _mm_mask_mul_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mul_epi32(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, mul, src.as_i64x2()))
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_epi32&expand=3900)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+pub unsafe fn _mm_maskz_mul_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mul_epi32(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mullo_epi32&expand=4005)
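+///
+/// A minimal sketch (illustrative; because only the low half of each product
+/// is kept, overflow is silently truncated):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(1 << 16);
+/// let b = _mm512_set1_epi32(1 << 16);
+/// let r = _mm512_mullo_epi32(a, b);
+/// // The full product is 2^32; its low 32 bits are 0, so every lane is 0.
+/// ```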
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+pub unsafe fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_mul(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mullo_epi32&expand=4003)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+pub unsafe fn _mm512_mask_mullo_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let mul = _mm512_mullo_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, mul, src.as_i32x16()))
+}
+
+/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mullo_epi32&expand=4004)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+pub unsafe fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mullo_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mullo_epi32&expand=4000)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+pub unsafe fn _mm256_mask_mullo_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let mul = _mm256_mullo_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, mul, src.as_i32x8()))
+}
+
+/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mullo_epi32&expand=4001)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+pub unsafe fn _mm256_maskz_mullo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mullo_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mullo_epi32&expand=3997)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+pub unsafe fn _mm_mask_mullo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mullo_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, mul, src.as_i32x4()))
+}
+
+/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mullo_epi32&expand=3998)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+pub unsafe fn _mm_maskz_mullo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mullo_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the packed 64-bit integers in a and b, and store the low 64 bits of the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mullox_epi64&expand=4017)
+///
+/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic.
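+///
+/// A minimal sketch (illustrative; this is the full 64 x 64 -> low-64-bit
+/// multiply, emulated with a short instruction sequence):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(1i64 << 40);
+/// let b = _mm512_set1_epi64(1i64 << 10);
+/// let r = _mm512_mullox_epi64(a, b); // every lane holds 1i64 << 50
+/// ```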
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_mul(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Multiply the packed 64-bit integers in a and b, and store the low 64 bits of the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mullox_epi64&expand=4016)
+///
+/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic.
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_mullox_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let mul = _mm512_mullox_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, mul, src.as_i64x8()))
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_epu32&expand=3916)
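+///
+/// A minimal sketch (illustrative; the even unsigned 32-bit lanes are
+/// multiplied into full 64-bit results):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(u32::MAX as i32);
+/// let r = _mm512_mul_epu32(a, a);
+/// // Each 64-bit lane holds (2^32 - 1)^2, which needs all 64 bits.
+/// ```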
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+pub unsafe fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmuludq(a.as_u32x16(), b.as_u32x16()))
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mul_epu32&expand=3914)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+pub unsafe fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mul_epu32(a, b).as_u64x8();
+ transmute(simd_select_bitmask(k, mul, src.as_u64x8()))
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mul_epu32&expand=3915)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+pub unsafe fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let mul = _mm512_mul_epu32(a, b).as_u64x8();
+ let zero = _mm512_setzero_si512().as_u64x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_epu32&expand=3911)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+pub unsafe fn _mm256_mask_mul_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mul_epu32(a, b).as_u64x4();
+ transmute(simd_select_bitmask(k, mul, src.as_u64x4()))
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_epu32&expand=3912)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+pub unsafe fn _mm256_maskz_mul_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let mul = _mm256_mul_epu32(a, b).as_u64x4();
+ let zero = _mm256_setzero_si256().as_u64x4();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_epu32&expand=3908)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+pub unsafe fn _mm_mask_mul_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mul_epu32(a, b).as_u64x2();
+ transmute(simd_select_bitmask(k, mul, src.as_u64x2()))
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_epu32&expand=3909)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+pub unsafe fn _mm_maskz_mul_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let mul = _mm_mul_epu32(a, b).as_u64x2();
+ let zero = _mm_setzero_si128().as_u64x2();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
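+// Illustrative usage sketch: the epu32 multiply reads only the low unsigned
+// 32 bits of each 64-bit lane and widens the product to 64 bits, so the
+// high halves of the inputs are ignored. Assumes avx512f support.
+#[cfg(test)]
+#[test]
+fn mul_epu32_widening_sketch() {
+ unsafe {
+ // High 32 bits deliberately non-zero to show they do not participate.
+ let a = _mm512_set1_epi64(((0xdead_beef_u64 << 32) | 0xffff_ffff) as i64);
+ let b = _mm512_set1_epi64(2);
+ let r: [u64; 8] = transmute(_mm512_mul_epu32(a, b));
+ // 0xffff_ffff * 2, widened to 64 bits rather than truncated to 32.
+ assert_eq!(r, [0x1_ffff_fffe_u64; 8]);
+ }
+}
+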
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ps&expand=3934)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulps))]
+pub unsafe fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 {
+ transmute(simd_mul(a.as_f32x16(), b.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ps&expand=3932)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulps))]
+pub unsafe fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let mul = _mm512_mul_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, mul, src.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ps&expand=3933)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulps))]
+pub unsafe fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let mul = _mm512_mul_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ps&expand=3929)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmulps))]
+pub unsafe fn _mm256_mask_mul_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let mul = _mm256_mul_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, mul, src.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ps&expand=3930)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmulps))]
+pub unsafe fn _mm256_maskz_mul_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let mul = _mm256_mul_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ps&expand=3926)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmulps))]
+pub unsafe fn _mm_mask_mul_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mul = _mm_mul_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, mul, src.as_f32x4()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ps&expand=3927)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmulps))]
+pub unsafe fn _mm_maskz_mul_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mul = _mm_mul_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
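+// Illustrative usage sketch: masked float multiply. The implementations
+// above compute the full product and then select lanes, which matches the
+// intrinsic's value-level semantics. Assumes avx512f support.
+#[cfg(test)]
+#[test]
+fn mask_mul_ps_sketch() {
+ unsafe {
+ let src = _mm512_set1_ps(f32::NAN);
+ let a = _mm512_set1_ps(1.5);
+ let b = _mm512_set1_ps(4.0);
+ let r: [f32; 16] = transmute(_mm512_mask_mul_ps(src, 0b0000_0000_1111_1111, a, b));
+ assert!(r[..8].iter().all(|&x| x == 6.0)); // selected lanes: 1.5 * 4.0
+ assert!(r[8..].iter().all(|x| x.is_nan())); // unselected lanes from src
+ }
+}
+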
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pd&expand=3925)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+pub unsafe fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d {
+ transmute(simd_mul(a.as_f64x8(), b.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pd&expand=3923)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+pub unsafe fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let mul = _mm512_mul_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, mul, src.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pd&expand=3924)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+pub unsafe fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let mul = _mm512_mul_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pd&expand=3920)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+pub unsafe fn _mm256_mask_mul_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let mul = _mm256_mul_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, mul, src.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pd&expand=3921)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+pub unsafe fn _mm256_maskz_mul_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let mul = _mm256_mul_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pd&expand=3917)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+pub unsafe fn _mm_mask_mul_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mul = _mm_mul_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, mul, src.as_f64x2()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pd&expand=3918)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmulpd))]
+pub unsafe fn _mm_maskz_mul_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mul = _mm_mul_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, mul, zero))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ps&expand=2162)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivps))]
+pub unsafe fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 {
+ transmute(simd_div(a.as_f32x16(), b.as_f32x16()))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ps&expand=2163)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivps))]
+pub unsafe fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let div = _mm512_div_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, div, src.as_f32x16()))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ps&expand=2164)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivps))]
+pub unsafe fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let div = _mm512_div_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, div, zero))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ps&expand=2160)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vdivps))]
+pub unsafe fn _mm256_mask_div_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let div = _mm256_div_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, div, src.as_f32x8()))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ps&expand=2161)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vdivps))]
+pub unsafe fn _mm256_maskz_div_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let div = _mm256_div_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, div, zero))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ps&expand=2157)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vdivps))]
+pub unsafe fn _mm_mask_div_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let div = _mm_div_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, div, src.as_f32x4()))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ps&expand=2158)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vdivps))]
+pub unsafe fn _mm_maskz_div_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let div = _mm_div_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, div, zero))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_pd&expand=2153)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub unsafe fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d {
+ transmute(simd_div(a.as_f64x8(), b.as_f64x8()))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_pd&expand=2154)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub unsafe fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let div = _mm512_div_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, div, src.as_f64x8()))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_pd&expand=2155)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub unsafe fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let div = _mm512_div_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, div, zero))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_pd&expand=2151)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub unsafe fn _mm256_mask_div_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let div = _mm256_div_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, div, src.as_f64x4()))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_pd&expand=2152)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub unsafe fn _mm256_maskz_div_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let div = _mm256_div_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, div, zero))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_pd&expand=2148)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub unsafe fn _mm_mask_div_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let div = _mm_div_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, div, src.as_f64x2()))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_pd&expand=2149)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vdivpd))]
+pub unsafe fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let div = _mm_div_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, div, zero))
+}
+
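+// Illustrative usage sketch: as written above, the masked divides compute
+// every lane first and select afterwards, so a masked-off divisor of 0.0
+// still produces (and then discards) an infinity; with default MXCSR
+// settings SIMD float division does not trap. Assumes avx512f support.
+#[cfg(test)]
+#[test]
+fn maskz_div_pd_sketch() {
+ unsafe {
+ let a = _mm512_set1_pd(1.0);
+ let b = _mm512_set_pd(2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0);
+ // Keep only the lanes with a non-zero divisor (lanes 4..7).
+ let r: [f64; 8] = transmute(_mm512_maskz_div_pd(0b1111_0000, a, b));
+ assert_eq!(r, [0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5]);
+ }
+}
+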
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epi32&expand=3582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub unsafe fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaxsd(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epi32&expand=3580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub unsafe fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, max, src.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epi32&expand=3581)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub unsafe fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epi32&expand=3577)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub unsafe fn _mm256_mask_max_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, max, src.as_i32x8()))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epi32&expand=3578)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub unsafe fn _mm256_maskz_max_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epi32&expand=3574)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub unsafe fn _mm_mask_max_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, max, src.as_i32x4()))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epi32&expand=3575)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+pub unsafe fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
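+// Illustrative usage sketch: masked signed 32-bit max. Assumes avx512f
+// support.
+#[cfg(test)]
+#[test]
+fn mask_max_epi32_sketch() {
+ unsafe {
+ let src = _mm512_set1_epi32(7);
+ let a = _mm512_set1_epi32(-1);
+ let b = _mm512_set1_epi32(1);
+ // Even-numbered lanes take max(-1, 1) = 1; odd lanes keep src = 7.
+ let r: [i32; 16] = transmute(_mm512_mask_max_epi32(src, 0b0101_0101_0101_0101, a, b));
+ assert_eq!(r, [1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7]);
+ }
+}
+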
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epi64&expand=3591)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaxsq(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epi64&expand=3589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, max, src.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epi64&expand=3590)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi64&expand=3588)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpmaxsq256(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epi64&expand=3586)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm256_mask_max_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, max, src.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epi64&expand=3587)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi64&expand=3585)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpmaxsq128(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epi64&expand=3583)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm_mask_max_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, max, src.as_i64x2()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epi64&expand=3584)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxsq))]
+pub unsafe fn _mm_maskz_max_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
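+// Illustrative usage sketch: unlike the 32-bit forms, the 64-bit integer
+// max has no SSE/AVX2 predecessor (vpmaxsq is an AVX-512 instruction),
+// which is why the unmasked _mm256_/_mm_ variants are introduced here
+// alongside the masked ones. Assumes avx512f and avx512vl support.
+#[cfg(test)]
+#[test]
+fn max_epi64_sketch() {
+ unsafe {
+ let a = _mm_set_epi64x(i64::MIN, 100);
+ let b = _mm_set_epi64x(-1, i64::MAX);
+ let r: [i64; 2] = transmute(_mm_max_epi64(a, b));
+ assert_eq!(r, [i64::MAX, -1]); // signed comparison per 64-bit lane
+ }
+}
+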
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ps&expand=3655)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub unsafe fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 {
+ transmute(vmaxps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ps&expand=3653)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub unsafe fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let max = _mm512_max_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, max, src.as_f32x16()))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ps&expand=3654)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub unsafe fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let max = _mm512_max_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ps&expand=3650)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub unsafe fn _mm256_mask_max_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let max = _mm256_max_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, max, src.as_f32x8()))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ps&expand=3651)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub unsafe fn _mm256_maskz_max_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let max = _mm256_max_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ps&expand=3647)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub unsafe fn _mm_mask_max_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let max = _mm_max_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, max, src.as_f32x4()))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ps&expand=3648)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxps))]
+pub unsafe fn _mm_maskz_max_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let max = _mm_max_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
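+// Illustrative usage sketch: per Intel's documentation, vmaxps returns the
+// second source operand when either operand is NaN, so _mm512_max_ps(a, b)
+// is not symmetric in the presence of NaN. Assumes avx512f support.
+#[cfg(test)]
+#[test]
+fn max_ps_nan_sketch() {
+ unsafe {
+ let nan = _mm512_set1_ps(f32::NAN);
+ let one = _mm512_set1_ps(1.0);
+ let r1: [f32; 16] = transmute(_mm512_max_ps(nan, one));
+ let r2: [f32; 16] = transmute(_mm512_max_ps(one, nan));
+ assert!(r1.iter().all(|&x| x == 1.0)); // NaN in a: b is returned
+ assert!(r2.iter().all(|x| x.is_nan())); // NaN in b: b is returned
+ }
+}
+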
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_pd&expand=3645)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub unsafe fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d {
+ transmute(vmaxpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_pd&expand=3643)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub unsafe fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let max = _mm512_max_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, max, src.as_f64x8()))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_pd&expand=3644)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub unsafe fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let max = _mm512_max_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_pd&expand=3640)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub unsafe fn _mm256_mask_max_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let max = _mm256_max_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, max, src.as_f64x4()))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_pd&expand=3641)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub unsafe fn _mm256_maskz_max_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let max = _mm256_max_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_pd&expand=3637)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub unsafe fn _mm_mask_max_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let max = _mm_max_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, max, src.as_f64x2()))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_pd&expand=3638)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmaxpd))]
+pub unsafe fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let max = _mm_max_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epu32&expand=3618)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub unsafe fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaxud(a.as_u32x16(), b.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epu32&expand=3616)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub unsafe fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epu32(a, b).as_u32x16();
+ transmute(simd_select_bitmask(k, max, src.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epu32&expand=3617)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub unsafe fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epu32(a, b).as_u32x16();
+ let zero = _mm512_setzero_si512().as_u32x16();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epu32&expand=3613)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub unsafe fn _mm256_mask_max_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epu32(a, b).as_u32x8();
+ transmute(simd_select_bitmask(k, max, src.as_u32x8()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epu32&expand=3614)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub unsafe fn _mm256_maskz_max_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epu32(a, b).as_u32x8();
+ let zero = _mm256_setzero_si256().as_u32x8();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epu32&expand=3610)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub unsafe fn _mm_mask_max_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epu32(a, b).as_u32x4();
+ transmute(simd_select_bitmask(k, max, src.as_u32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epu32&expand=3611)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+pub unsafe fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epu32(a, b).as_u32x4();
+ let zero = _mm_setzero_si128().as_u32x4();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
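+// Illustrative usage sketch: the epu32 forms compare lanes as unsigned, so
+// a bit pattern that reads as -1 when signed is the largest possible
+// unsigned value. Assumes avx512f support.
+#[cfg(test)]
+#[test]
+fn max_epu32_vs_epi32_sketch() {
+ unsafe {
+ let a = _mm512_set1_epi32(-1); // 0xffff_ffff
+ let b = _mm512_set1_epi32(1);
+ let unsigned: [u32; 16] = transmute(_mm512_max_epu32(a, b));
+ let signed: [i32; 16] = transmute(_mm512_max_epi32(a, b));
+ assert_eq!(unsigned, [u32::MAX; 16]); // 0xffff_ffff > 1 unsigned
+ assert_eq!(signed, [1; 16]); // -1 < 1 signed
+ }
+}
+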
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epu64&expand=3627)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmaxuq(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epu64&expand=3625)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epu64(a, b).as_u64x8();
+ transmute(simd_select_bitmask(k, max, src.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epu64&expand=3626)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let max = _mm512_max_epu64(a, b).as_u64x8();
+ let zero = _mm512_setzero_si512().as_u64x8();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu64&expand=3624)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpmaxuq256(a.as_u64x4(), b.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epu64&expand=3622)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm256_mask_max_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epu64(a, b).as_u64x4();
+ transmute(simd_select_bitmask(k, max, src.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epu64&expand=3623)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let max = _mm256_max_epu64(a, b).as_u64x4();
+ let zero = _mm256_setzero_si256().as_u64x4();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu64&expand=3621)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpmaxuq128(a.as_u64x2(), b.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epu64&expand=3619)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm_mask_max_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epu64(a, b).as_u64x2();
+ transmute(simd_select_bitmask(k, max, src.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epu64&expand=3620)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmaxuq))]
+pub unsafe fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let max = _mm_max_epu64(a, b).as_u64x2();
+ let zero = _mm_setzero_si128().as_u64x2();
+ transmute(simd_select_bitmask(k, max, zero))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epi32&expand=3696)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub unsafe fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpminsd(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epi32&expand=3694)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub unsafe fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, min, src.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epi32&expand=3695)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub unsafe fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epi32&expand=3691)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub unsafe fn _mm256_mask_min_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, min, src.as_i32x8()))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epi32&expand=3692)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub unsafe fn _mm256_maskz_min_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epi32&expand=3688)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub unsafe fn _mm_mask_min_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, min, src.as_i32x4()))
+}
+
+/// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epi32&expand=3689)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+pub unsafe fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epi64&expand=3705)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub unsafe fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpminsq(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epi64&expand=3703)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub unsafe fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, min, src.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epi64&expand=3704)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub unsafe fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi64&expand=3702)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub unsafe fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpminsq256(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epi64&expand=3700)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub unsafe fn _mm256_mask_min_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, min, src.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epi64&expand=3701)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminsq))]
+pub unsafe fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
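+// Illustrative usage sketch: min and max compose into the usual branchless
+// clamp, pinning each signed 32-bit lane to [lo, hi]. Assumes avx512f
+// support.
+#[cfg(test)]
+#[test]
+fn clamp_epi32_sketch() {
+ unsafe {
+ let lo = _mm512_set1_epi32(0);
+ let hi = _mm512_set1_epi32(255);
+ let v = _mm512_set_epi32(
+ -5, 0, 3, 99, 255, 256, 1000, -1, 7, 300, 128, -128, 42, 512, 0, 64,
+ );
+ let clamped = _mm512_min_epi32(_mm512_max_epi32(v, lo), hi);
+ let r: [i32; 16] = transmute(clamped);
+ assert!(r.iter().all(|&x| (0..=255).contains(&x)));
+ }
+}
+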
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ps&expand=3769)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminps))]
+pub unsafe fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 {
+ transmute(vminps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ps&expand=3767)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminps))]
+pub unsafe fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let min = _mm512_min_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, min, src.as_f32x16()))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ps&expand=3768)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminps))]
+pub unsafe fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let min = _mm512_min_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, min, zero))
+}
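+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): the masked float minimum merges unselected lanes from `src`.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mask_min_ps() {
+ let a = _mm512_set1_ps(1.0);
+ let b = _mm512_set1_ps(-1.0);
+ let src = _mm512_set1_ps(9.0);
+ // Lanes 0..8 hold min(1.0, -1.0) == -1.0; lanes 8..16 keep 9.0 from `src`.
+ let merged = _mm512_mask_min_ps(src, 0x00ff, a, b);
+ let _ = merged;
+}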
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ps&expand=3764)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vminps))]
+pub unsafe fn _mm256_mask_min_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let min = _mm256_min_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, min, src.as_f32x8()))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ps&expand=3765)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vminps))]
+pub unsafe fn _mm256_maskz_min_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let min = _mm256_min_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ps&expand=3761)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vminps))]
+pub unsafe fn _mm_mask_min_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let min = _mm_min_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, min, src.as_f32x4()))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ps&expand=3762)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vminps))]
+pub unsafe fn _mm_maskz_min_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let min = _mm_min_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_pd&expand=3759)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub unsafe fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d {
+ transmute(vminpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_pd&expand=3757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub unsafe fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let min = _mm512_min_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, min, src.as_f64x8()))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_pd&expand=3758)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub unsafe fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let min = _mm512_min_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, min, zero))
+}
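+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): the zeromask variant of the packed double minimum.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_maskz_min_pd() {
+ let a = _mm512_set1_pd(3.0);
+ let b = _mm512_set1_pd(2.0);
+ // Even lanes (mask 0b0101_0101) hold min(3.0, 2.0) == 2.0; odd lanes are zeroed.
+ let r = _mm512_maskz_min_pd(0b0101_0101, a, b);
+ let _ = r;
+}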
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_pd&expand=3754)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub unsafe fn _mm256_mask_min_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let min = _mm256_min_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, min, src.as_f64x4()))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_pd&expand=3755)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub unsafe fn _mm256_maskz_min_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let min = _mm256_min_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_pd&expand=3751)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub unsafe fn _mm_mask_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let min = _mm_min_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, min, src.as_f64x2()))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_pd&expand=3752)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vminpd))]
+pub unsafe fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let min = _mm_min_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epu32&expand=3732)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub unsafe fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpminud(a.as_u32x16(), b.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epu32&expand=3730)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub unsafe fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epu32(a, b).as_u32x16();
+ transmute(simd_select_bitmask(k, min, src.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epu32&expand=3731)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub unsafe fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epu32(a, b).as_u32x16();
+ let zero = _mm512_setzero_si512().as_u32x16();
+ transmute(simd_select_bitmask(k, min, zero))
+}
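+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): the unsigned comparison matters here, since -1i32
+// reinterpreted as u32 is u32::MAX and therefore never the minimum.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_min_epu32() {
+ let a = _mm512_set1_epi32(-1); // all bits set: u32::MAX when viewed unsigned
+ let b = _mm512_set1_epi32(1);
+ // Unsigned min picks 1 in every lane, unlike the signed epi32 variant.
+ let r = _mm512_min_epu32(a, b);
+ let _ = r;
+}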
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epu32&expand=3727)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub unsafe fn _mm256_mask_min_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epu32(a, b).as_u32x8();
+ transmute(simd_select_bitmask(k, min, src.as_u32x8()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epu32&expand=3728)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub unsafe fn _mm256_maskz_min_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epu32(a, b).as_u32x8();
+ let zero = _mm256_setzero_si256().as_u32x8();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epu32&expand=3724)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub unsafe fn _mm_mask_min_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epu32(a, b).as_u32x4();
+ transmute(simd_select_bitmask(k, min, src.as_u32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epu32&expand=3725)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminud))]
+pub unsafe fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epu32(a, b).as_u32x4();
+ let zero = _mm_setzero_si128().as_u32x4();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epu64&expand=3741)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpminuq(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epu64&expand=3739)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epu64(a, b).as_u64x8();
+ transmute(simd_select_bitmask(k, min, src.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epu64&expand=3740)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let min = _mm512_min_epu64(a, b).as_u64x8();
+ let zero = _mm512_setzero_si512().as_u64x8();
+ transmute(simd_select_bitmask(k, min, zero))
+}
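+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): 64-bit unsigned minimum with a writemask.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mask_min_epu64() {
+ let a = _mm512_set1_epi64(5);
+ let b = _mm512_set1_epi64(7);
+ let src = _mm512_set1_epi64(0);
+ // The low four lanes hold min(5, 7) == 5; the high four fall back to `src`.
+ let r = _mm512_mask_min_epu64(src, 0b0000_1111, a, b);
+ let _ = r;
+}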
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu64&expand=3738)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpminuq256(a.as_u64x4(), b.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epu64&expand=3736)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm256_mask_min_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epu64(a, b).as_u64x4();
+ transmute(simd_select_bitmask(k, min, src.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epu64&expand=3737)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let min = _mm256_min_epu64(a, b).as_u64x4();
+ let zero = _mm256_setzero_si256().as_u64x4();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu64&expand=3735)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpminuq128(a.as_u64x2(), b.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epu64&expand=3733)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm_mask_min_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epu64(a, b).as_u64x2();
+ transmute(simd_select_bitmask(k, min, src.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epu64&expand=3734)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpminuq))]
+pub unsafe fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let min = _mm_min_epu64(a, b).as_u64x2();
+ let zero = _mm_setzero_si128().as_u64x2();
+ transmute(simd_select_bitmask(k, min, zero))
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ps&expand=5371)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 {
+ transmute(vsqrtps(a.as_f32x16(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ps&expand=5369)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ let sqrt = _mm512_sqrt_ps(a).as_f32x16();
+ transmute(simd_select_bitmask(k, sqrt, src.as_f32x16()))
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ps&expand=5370)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 {
+ let sqrt = _mm512_sqrt_ps(a).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, sqrt, zero))
+}
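+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): lanes with a clear mask bit never receive the sqrt result.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mask_sqrt_ps() {
+ let a = _mm512_set1_ps(16.0);
+ let src = _mm512_set1_ps(-1.0);
+ // The low eight lanes become 4.0; the high eight keep -1.0 from `src`.
+ let r = _mm512_mask_sqrt_ps(src, 0x00ff, a);
+ let _ = r;
+}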
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ps&expand=5366)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+pub unsafe fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ let sqrt = _mm256_sqrt_ps(a).as_f32x8();
+ transmute(simd_select_bitmask(k, sqrt, src.as_f32x8()))
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ps&expand=5367)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+pub unsafe fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 {
+ let sqrt = _mm256_sqrt_ps(a).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, sqrt, zero))
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ps&expand=5363)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+pub unsafe fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ let sqrt = _mm_sqrt_ps(a).as_f32x4();
+ transmute(simd_select_bitmask(k, sqrt, src.as_f32x4()))
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ps&expand=5364)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsqrtps))]
+pub unsafe fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 {
+ let sqrt = _mm_sqrt_ps(a).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, sqrt, zero))
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_pd&expand=5362)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d {
+ transmute(vsqrtpd(a.as_f64x8(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_pd&expand=5360)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ let sqrt = _mm512_sqrt_pd(a).as_f64x8();
+ transmute(simd_select_bitmask(k, sqrt, src.as_f64x8()))
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_pd&expand=5361)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d {
+ let sqrt = _mm512_sqrt_pd(a).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, sqrt, zero))
+}
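+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): the zeromasked double-precision square root.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_maskz_sqrt_pd() {
+ let a = _mm512_set1_pd(9.0);
+ // Even lanes hold 3.0; odd lanes are zeroed by the mask.
+ let r = _mm512_maskz_sqrt_pd(0b0101_0101, a);
+ let _ = r;
+}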
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_pd&expand=5357)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+pub unsafe fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+ let sqrt = _mm256_sqrt_pd(a).as_f64x4();
+ transmute(simd_select_bitmask(k, sqrt, src.as_f64x4()))
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_pd&expand=5358)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+pub unsafe fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d {
+ let sqrt = _mm256_sqrt_pd(a).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, sqrt, zero))
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_pd&expand=5354)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+pub unsafe fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+ let sqrt = _mm_sqrt_pd(a).as_f64x2();
+ transmute(simd_select_bitmask(k, sqrt, src.as_f64x2()))
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_pd&expand=5355)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vsqrtpd))]
+pub unsafe fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d {
+ let sqrt = _mm_sqrt_pd(a).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, sqrt, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ps&expand=2557)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
+ transmute(vfmadd132ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ps&expand=2558)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
+ let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fmadd, a.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ps&expand=2560)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
+ let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, fmadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ps&expand=2559)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
+ let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fmadd, c.as_f32x16()))
+}
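+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): the three masked fmadd forms differ only in which operand
+// supplies the unselected lanes (`a` for mask, zero for maskz, `c` for mask3).
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_fmadd_ps_masks() {
+ let a = _mm512_set1_ps(2.0);
+ let b = _mm512_set1_ps(3.0);
+ let c = _mm512_set1_ps(1.0);
+ let k: __mmask16 = 0x0f0f;
+ // Selected lanes compute 2.0 * 3.0 + 1.0 == 7.0 in all three forms.
+ let merged_a = _mm512_mask_fmadd_ps(a, k, b, c); // unselected lanes keep 2.0 from `a`
+ let zeroed = _mm512_maskz_fmadd_ps(k, a, b, c); // unselected lanes become 0.0
+ let merged_c = _mm512_mask3_fmadd_ps(a, b, c, k); // unselected lanes keep 1.0 from `c`
+ let _ = (merged_a, zeroed, merged_c);
+}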
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ps&expand=2554)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
+ let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fmadd, a.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ps&expand=2556)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
+ let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, fmadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ps&expand=2555)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
+ let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fmadd, c.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ps&expand=2550)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fmadd, a.as_f32x4()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ps&expand=2552)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, fmadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ps&expand=2551)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+pub unsafe fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fmadd, c.as_f32x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pd&expand=2545)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ transmute(vfmadd132pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pd&expand=2546)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
+ let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fmadd, a.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pd&expand=2548)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, fmadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pd&expand=2547)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
+ let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fmadd, c.as_f64x8()))
+}
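+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): double-precision fmadd with a writemask, mirroring the
+// single-precision family above.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mask_fmadd_pd() {
+ let a = _mm512_set1_pd(2.0);
+ let b = _mm512_set1_pd(4.0);
+ let c = _mm512_set1_pd(-8.0);
+ // Selected lanes compute 2.0 * 4.0 + (-8.0) == 0.0; the rest keep `a`.
+ let r = _mm512_mask_fmadd_pd(a, 0b1111_0000, b, c);
+ let _ = r;
+}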
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pd&expand=2542)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
+ let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fmadd, a.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pd&expand=2544)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, fmadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pd&expand=2543)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
+ let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fmadd, c.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pd&expand=2538)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fmadd, a.as_f64x2()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pd&expand=2540)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, fmadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pd&expand=2539)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fmadd, c.as_f64x2()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ps&expand=2643)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f32x16());
+ transmute(vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ps&expand=2644)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
+ let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fmsub, a.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ps&expand=2646)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
+ let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, fmsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ps&expand=2645)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
+ let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fmsub, c.as_f32x16()))
+}
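+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): fmsub is implemented above as fmadd with a negated `c`,
+// so each lane computes a * b - c under the same masking rules.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_fmsub_ps() {
+ let a = _mm512_set1_ps(3.0);
+ let b = _mm512_set1_ps(3.0);
+ let c = _mm512_set1_ps(1.0);
+ // Every lane holds 3.0 * 3.0 - 1.0 == 8.0.
+ let r = _mm512_fmsub_ps(a, b, c);
+ let _ = r;
+}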
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ps&expand=2640)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
+ let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fmsub, a.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ps&expand=2642)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
+ let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, fmsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ps&expand=2641)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
+ let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fmsub, c.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ps&expand=2636)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fmsub, a.as_f32x4()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ps&expand=2638)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, fmsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ps&expand=2637)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps; clang generates vfmadd, gcc generates vfmsub
+pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fmsub, c.as_f32x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_pd&expand=2631)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f64x8());
+ transmute(vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_pd&expand=2632)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
+ let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fmsub, a.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_pd&expand=2634)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, fmsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_pd&expand=2633)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
+ let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fmsub, c.as_f64x8()))
+}
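+
+// Illustrative sketch (not part of upstream stdarch; the helper name is
+// hypothetical): mask3 keeps `c` in the unselected lanes.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mask3_fmsub_pd() {
+ let a = _mm512_set1_pd(1.5);
+ let b = _mm512_set1_pd(2.0);
+ let c = _mm512_set1_pd(1.0);
+ // Lanes 0 and 1 compute 1.5 * 2.0 - 1.0 == 2.0; the rest keep 1.0 from `c`.
+ let r = _mm512_mask3_fmsub_pd(a, b, c, 0b0000_0011);
+ let _ = r;
+}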
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_pd&expand=2628)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
+ let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fmsub, a.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_pd&expand=2630)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, fmsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_pd&expand=2629)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
+ let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fmsub, c.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_pd&expand=2624)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fmsub, a.as_f64x2()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_pd&expand=2626)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, fmsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_pd&expand=2625)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd; clang emits fmadd, gcc emits fmsub
+pub unsafe fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fmsub, c.as_f64x2()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ps&expand=2611)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
+ transmute(vfmaddsub213ps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ c.as_f32x16(),
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
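+// fmaddsub subtracts c on even-indexed lanes and adds it on odd-indexed
+// lanes: dst[i] = a[i]*b[i] - c[i] for even i, a[i]*b[i] + c[i] for odd i.
+// _MM_FROUND_CUR_DIRECTION selects the current MXCSR rounding mode. A
+// minimal sketch, assuming runtime support for avx512f:
+//
+//     let a = _mm512_set1_ps(2.0);
+//     let b = _mm512_set1_ps(3.0);
+//     let c = _mm512_set1_ps(1.0);
+//     // even lanes: 2.0*3.0 - 1.0 = 5.0, odd lanes: 2.0*3.0 + 1.0 = 7.0
+//     let r = _mm512_fmaddsub_ps(a, b, c);
+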
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ps&expand=2612)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
+ let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ps&expand=2614)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
+ let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, fmaddsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ps&expand=2613)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
+ let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ps&expand=2608)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
+ let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ps&expand=2610)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
+ let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, fmaddsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ps&expand=2609)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
+ let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ps&expand=2604)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x4()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ps&expand=2606)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, fmaddsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ps&expand=2605)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+pub unsafe fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_pd&expand=2599)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ transmute(vfmaddsub213pd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ c.as_f64x8(),
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_pd&expand=2600)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
+ let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_pd&expand=2602)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, fmaddsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_pd&expand=2601)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
+ let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_pd&expand=2596)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
+ let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_pd&expand=2598)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, fmaddsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_pd&expand=2597)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
+ let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_pd&expand=2592)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x2()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_pd&expand=2594)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, fmaddsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_pd&expand=2593)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x2()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ps&expand=2691)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f32x16());
+ transmute(vfmaddsub213ps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ sub,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
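+// fmsubadd is the mirror image of fmaddsub: it adds c on even-indexed lanes
+// and subtracts it on odd-indexed lanes. Negating c and reusing
+// vfmaddsub213ps flips each lane's operation, since a*b - (-c) = a*b + c.
+// A minimal sketch, assuming runtime support for avx512f:
+//
+//     let a = _mm512_set1_ps(2.0);
+//     let b = _mm512_set1_ps(3.0);
+//     let c = _mm512_set1_ps(1.0);
+//     // even lanes: 2.0*3.0 + 1.0 = 7.0, odd lanes: 2.0*3.0 - 1.0 = 5.0
+//     let r = _mm512_fmsubadd_ps(a, b, c);
+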
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ps&expand=2692)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
+ let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ps&expand=2694)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
+ let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, fmsubadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ps&expand=2693)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
+ let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ps&expand=2688)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
+ let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ps&expand=2690)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
+ let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, fmsubadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ps&expand=2689)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
+ let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ps&expand=2684)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x4()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ps&expand=2686)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, fmsubadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ps&expand=2685)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_pd&expand=2679)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f64x8());
+ transmute(vfmaddsub213pd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ sub,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_pd&expand=2680)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
+ let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_pd&expand=2682)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, fmsubadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_pd&expand=2681)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
+ let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_pd&expand=2676)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
+ let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_pd&expand=2678)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, fmsubadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_pd&expand=2677)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
+ let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_pd&expand=2672)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x2()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_pd&expand=2674)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, fmsubadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_pd&expand=2673)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x2()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ps&expand=2723)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f32x16());
+ transmute(vfmadd132ps(sub, b.as_f32x16(), c.as_f32x16()))
+}
+
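+// fnmadd computes -(a*b) + c. Negating a before the plain fmadd yields
+// (-a)*b + c, which is the same value, so no dedicated negated-multiply
+// intrinsic is needed. A minimal sketch, assuming runtime support for
+// avx512f:
+//
+//     let a = _mm512_set1_ps(2.0);
+//     let b = _mm512_set1_ps(3.0);
+//     let c = _mm512_set1_ps(1.0);
+//     let r = _mm512_fnmadd_ps(a, b, c); // every lane: -(2.0*3.0) + 1.0 = -5.0
+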
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ps&expand=2724)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
+ let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fnmadd, a.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ps&expand=2726)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
+ let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, fnmadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ps&expand=2725)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
+ let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fnmadd, c.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ps&expand=2720)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
+ let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fnmadd, a.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ps&expand=2722)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
+ let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, fnmadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ps&expand=2721)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
+ let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fnmadd, c.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ps&expand=2716)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fnmadd, a.as_f32x4()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ps&expand=2718)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, fnmadd, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ps&expand=2717)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fnmadd, c.as_f32x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_pd&expand=2711)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f64x8());
+ transmute(vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_pd&expand=2712)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
+ let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fnmadd, a.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_pd&expand=2714)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, fnmadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_pd&expand=2713)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
+ let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fnmadd, c.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_pd&expand=2708)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
+ let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fnmadd, a.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_pd&expand=2710)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, fnmadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_pd&expand=2709)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
+ let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fnmadd, c.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_pd&expand=2704)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fnmadd, a.as_f64x2()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_pd&expand=2706)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, fnmadd, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_pd&expand=2705)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fnmadd, c.as_f64x2()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ps&expand=2771)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
+ let zero: f32x16 = mem::zeroed();
+ let suba = simd_sub(zero, a.as_f32x16());
+ let subc = simd_sub(zero, c.as_f32x16());
+ transmute(vfmadd132ps(suba, b.as_f32x16(), subc))
+}
+
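+// fnmsub computes -(a*b) - c. Negating both a and c reduces it to a plain
+// fmadd, since (-a)*b + (-c) = -(a*b) - c. A minimal sketch, assuming
+// runtime support for avx512f:
+//
+//     let a = _mm512_set1_ps(2.0);
+//     let b = _mm512_set1_ps(3.0);
+//     let c = _mm512_set1_ps(1.0);
+//     let r = _mm512_fnmsub_ps(a, b, c); // every lane: -(2.0*3.0) - 1.0 = -7.0
+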
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ps&expand=2772)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
+ let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fnmsub, a.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ps&expand=2774)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
+ let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, fnmsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ps&expand=2773)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
+ let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
+ transmute(simd_select_bitmask(k, fnmsub, c.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ps&expand=2768)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
+ let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fnmsub, a.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ps&expand=2770)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
+ let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, fnmsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ps&expand=2769)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
+ let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8();
+ transmute(simd_select_bitmask(k, fnmsub, c.as_f32x8()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ps&expand=2764)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fnmsub, a.as_f32x4()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ps&expand=2766)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, fnmsub, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ps&expand=2765)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4();
+ transmute(simd_select_bitmask(k, fnmsub, c.as_f32x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_pd&expand=2759)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let zero: f64x8 = mem::zeroed();
+ let suba = simd_sub(zero, a.as_f64x8());
+ let subc = simd_sub(zero, c.as_f64x8());
+ transmute(vfmadd132pd(suba, b.as_f64x8(), subc))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_pd&expand=2760)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
+ let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fnmsub, a.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_pd&expand=2762)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] // canonically vfnmsub132pd, vfnmsub213pd, or vfnmsub231pd; codegen may emit the negations separately plus a plain vfmadd, hence the vfmadd assertion
+pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
+ let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, fnmsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_pd&expand=2761)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd))] // canonically vfnmsub132pd, vfnmsub213pd, or vfnmsub231pd; codegen may emit the negations separately plus a plain vfmadd, hence the vfmadd assertion
+pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
+ let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
+ transmute(simd_select_bitmask(k, fnmsub, c.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_pd&expand=2756)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+pub unsafe fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
+ let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fnmsub, a.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_pd&expand=2758)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+pub unsafe fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, fnmsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_pd&expand=2757)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+pub unsafe fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
+ let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4();
+ transmute(simd_select_bitmask(k, fnmsub, c.as_f64x4()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_pd&expand=2752)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+pub unsafe fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fnmsub, a.as_f64x2()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_pd&expand=2754)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+pub unsafe fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, fnmsub, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_pd&expand=2753)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+pub unsafe fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2();
+ transmute(simd_select_bitmask(k, fnmsub, c.as_f64x2()))
+}
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_ps&expand=4502)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm512_rcp14_ps(a: __m512) -> __m512 {
+ transmute(vrcp14ps(
+ a.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ ))
+}
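+
+// Illustrative usage (editorial, not part of the upstream source): the result
+// is an approximation, so lanes are close to, but not necessarily exactly,
+// the true reciprocal:
+//
+//     let r = _mm512_rcp14_ps(_mm512_set1_ps(4.0));
+//     // every lane of r is within a relative error of 2^-14 of 0.25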
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_ps&expand=4500)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ transmute(vrcp14ps(a.as_f32x16(), src.as_f32x16(), k))
+}
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp14_ps&expand=4501)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 {
+ transmute(vrcp14ps(a.as_f32x16(), _mm512_setzero_ps().as_f32x16(), k))
+}
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp14_ps&expand=4499)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm256_rcp14_ps(a: __m256) -> __m256 {
+ transmute(vrcp14ps256(
+ a.as_f32x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ 0b11111111,
+ ))
+}
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp14_ps&expand=4497)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm256_mask_rcp14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ transmute(vrcp14ps256(a.as_f32x8(), src.as_f32x8(), k))
+}
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp14_ps&expand=4498)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm256_maskz_rcp14_ps(k: __mmask8, a: __m256) -> __m256 {
+ transmute(vrcp14ps256(a.as_f32x8(), _mm256_setzero_ps().as_f32x8(), k))
+}
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ps&expand=4496)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm_rcp14_ps(a: __m128) -> __m128 {
+ transmute(vrcp14ps128(
+ a.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b00001111,
+ ))
+}
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ps&expand=4494)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm_mask_rcp14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ transmute(vrcp14ps128(a.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ps&expand=4495)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14ps))]
+pub unsafe fn _mm_maskz_rcp14_ps(k: __mmask8, a: __m128) -> __m128 {
+ transmute(vrcp14ps128(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), k))
+}
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_pd&expand=4493)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm512_rcp14_pd(a: __m512d) -> __m512d {
+ transmute(vrcp14pd(
+ a.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ ))
+}
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_pd&expand=4491)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vrcp14pd(a.as_f64x8(), src.as_f64x8(), k))
+}
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp14_pd&expand=4492)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vrcp14pd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k))
+}
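+
+// Illustrative usage (editorial, not part of the upstream source): with a
+// zeromask, inactive lanes become 0.0 instead of keeping a source value:
+//
+//     let r = _mm512_maskz_rcp14_pd(0b00001111, _mm512_set1_pd(2.0));
+//     // lanes 0..4 are approximately 0.5; lanes 4..8 are exactly 0.0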
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp14_pd&expand=4490)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm256_rcp14_pd(a: __m256d) -> __m256d {
+ transmute(vrcp14pd256(
+ a.as_f64x4(),
+ _mm256_setzero_pd().as_f64x4(),
+ 0b00001111,
+ ))
+}
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp14_pd&expand=4488)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm256_mask_rcp14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vrcp14pd256(a.as_f64x4(), src.as_f64x4(), k))
+}
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp14_pd&expand=4489)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm256_maskz_rcp14_pd(k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vrcp14pd256(a.as_f64x4(), _mm256_setzero_pd().as_f64x4(), k))
+}
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_pd&expand=4487)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm_rcp14_pd(a: __m128d) -> __m128d {
+ transmute(vrcp14pd128(
+ a.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b00000011,
+ ))
+}
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_pd&expand=4485)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm_mask_rcp14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vrcp14pd128(a.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_pd&expand=4486)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrcp14pd))]
+pub unsafe fn _mm_maskz_rcp14_pd(k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vrcp14pd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_ps&expand=4819)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ps))]
+pub unsafe fn _mm512_rsqrt14_ps(a: __m512) -> __m512 {
+ transmute(vrsqrt14ps(
+ a.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ ))
+}
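+
+// Illustrative usage (editorial, not part of the upstream source): a common
+// application is fast normalization, where a 2^-14 relative error suffices:
+//
+//     let len_sq = _mm512_set1_ps(16.0);       // squared vector lengths
+//     let inv_len = _mm512_rsqrt14_ps(len_sq); // ~0.25 per lane
+//     // scale components by inv_len instead of dividing by a sqrt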
+
+/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_ps&expand=4817)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ps))]
+pub unsafe fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ transmute(vrsqrt14ps(a.as_f32x16(), src.as_f32x16(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=4818)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ps))]
+pub unsafe fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 {
+ transmute(vrsqrt14ps(
+ a.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt14_ps&expand=4815)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrsqrt14ps))]
+pub unsafe fn _mm256_mask_rsqrt14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ transmute(vrsqrt14ps256(a.as_f32x8(), src.as_f32x8(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt14_ps&expand=4816)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrsqrt14ps))]
+pub unsafe fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 {
+ transmute(vrsqrt14ps256(
+ a.as_f32x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ k,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ps&expand=4813)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrsqrt14ps))]
+pub unsafe fn _mm_mask_rsqrt14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ transmute(vrsqrt14ps128(a.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ps&expand=4814)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrsqrt14ps))]
+pub unsafe fn _mm_maskz_rsqrt14_ps(k: __mmask8, a: __m128) -> __m128 {
+ transmute(vrsqrt14ps128(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=4812)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14pd))]
+pub unsafe fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d {
+ transmute(vrsqrt14pd(
+ a.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=4810)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14pd))]
+pub unsafe fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vrsqrt14pd(a.as_f64x8(), src.as_f64x8(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=4811)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14pd))]
+pub unsafe fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vrsqrt14pd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt14_pd&expand=4808)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrsqrt14pd))]
+pub unsafe fn _mm256_mask_rsqrt14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vrsqrt14pd256(a.as_f64x4(), src.as_f64x4(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt14_pd&expand=4809)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrsqrt14pd))]
+pub unsafe fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vrsqrt14pd256(
+ a.as_f64x4(),
+ _mm256_setzero_pd().as_f64x4(),
+ k,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_pd&expand=4806)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrsqrt14pd))]
+pub unsafe fn _mm_mask_rsqrt14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vrsqrt14pd128(a.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_pd&expand=4807)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrsqrt14pd))]
+pub unsafe fn _mm_maskz_rsqrt14_pd(k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vrsqrt14pd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
+}
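+
+// Editorial note: when more than ~14 bits of accuracy are needed, the
+// approximation can be refined with one Newton-Raphson step,
+// y' = y * (1.5 - 0.5 * x * y * y), which roughly doubles the number of
+// correct bits. A minimal sketch (not part of the upstream source):
+//
+//     let y0 = _mm512_rsqrt14_pd(x);
+//     let half_x = _mm512_mul_pd(x, _mm512_set1_pd(0.5));
+//     let y0_sq = _mm512_mul_pd(y0, y0);
+//     let y1 = _mm512_mul_pd(
+//         y0,
+//         _mm512_sub_pd(_mm512_set1_pd(1.5), _mm512_mul_pd(half_x, y0_sq)),
+//     );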
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ps&expand=2844)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm512_getexp_ps(a: __m512) -> __m512 {
+ transmute(vgetexpps(
+ a.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
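+
+// Illustrative values (editorial, not part of the upstream source): getexp
+// returns floor(log2(|x|)) as a float, i.e. the unbiased exponent:
+//
+//     _mm512_getexp_ps(_mm512_set1_ps(8.0));  // every lane: 3.0
+//     _mm512_getexp_ps(_mm512_set1_ps(0.75)); // every lane: -1.0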
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ps&expand=2845)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ transmute(vgetexpps(
+ a.as_f32x16(),
+ src.as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ps&expand=2846)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 {
+ transmute(vgetexpps(
+ a.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ps&expand=2841)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm256_getexp_ps(a: __m256) -> __m256 {
+ transmute(vgetexpps256(
+ a.as_f32x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ps&expand=2842)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm256_mask_getexp_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ transmute(vgetexpps256(a.as_f32x8(), src.as_f32x8(), k))
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ps&expand=2843)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm256_maskz_getexp_ps(k: __mmask8, a: __m256) -> __m256 {
+ transmute(vgetexpps256(
+ a.as_f32x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ k,
+ ))
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ps&expand=2838)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm_getexp_ps(a: __m128) -> __m128 {
+ transmute(vgetexpps128(
+ a.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b00001111,
+ ))
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ps&expand=2839)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm_mask_getexp_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ transmute(vgetexpps128(a.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ps&expand=2840)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpps))]
+pub unsafe fn _mm_maskz_getexp_ps(k: __mmask8, a: __m128) -> __m128 {
+ transmute(vgetexpps128(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), k))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_pd&expand=2835)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm512_getexp_pd(a: __m512d) -> __m512d {
+ transmute(vgetexppd(
+ a.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_pd&expand=2836)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vgetexppd(
+ a.as_f64x8(),
+ src.as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_pd&expand=2837)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vgetexppd(
+ a.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_pd&expand=2832)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm256_getexp_pd(a: __m256d) -> __m256d {
+ transmute(vgetexppd256(
+ a.as_f64x4(),
+ _mm256_setzero_pd().as_f64x4(),
+ 0b00001111,
+ ))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_pd&expand=2833)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm256_mask_getexp_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vgetexppd256(a.as_f64x4(), src.as_f64x4(), k))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_pd&expand=2834)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm256_maskz_getexp_pd(k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vgetexppd256(
+ a.as_f64x4(),
+ _mm256_setzero_pd().as_f64x4(),
+ k,
+ ))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_pd&expand=2829)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm_getexp_pd(a: __m128d) -> __m128d {
+ transmute(vgetexppd128(
+ a.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b00000011,
+ ))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_pd&expand=2830)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm_mask_getexp_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vgetexppd128(a.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_pd&expand=2831)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexppd))]
+pub unsafe fn _mm_maskz_getexp_pd(k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vgetexppd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
+}
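+
+// Editorial note: getexp pairs naturally with the mantissa-extraction
+// intrinsics (vgetmant) to decompose x as mant * 2^exp, the usual starting
+// point for polynomial log/exp kernels. For example, getexp(6.0) == 2.0 and
+// the normalized mantissa is 1.5, since 6.0 == 1.5 * 2^2.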
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=4784)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_roundscale_ps<const IMM8: i32>(a: __m512) -> __m512 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vrndscaleps(a, IMM8, zero, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
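+
+// Illustrative usage (editorial, not part of the upstream source): imm8[7:4]
+// holds the number of fraction bits M to keep, so the operation computes
+// 2^-M * round(2^M * x). With M = 1 and truncation:
+//
+//     const IMM8: i32 = (1 << 4) | _MM_FROUND_TO_ZERO; // keep 1 fraction bit
+//     let r = _mm512_roundscale_ps::<IMM8>(_mm512_set1_ps(2.75));
+//     // every lane: 2.5 (2.75 * 2 == 5.5, truncated to 5, halved to 2.5)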
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=4782)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_roundscale_ps<const IMM8: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x16();
+ let src = src.as_f32x16();
+ let r = vrndscaleps(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=4783)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_roundscale_ps<const IMM8: i32>(k: __mmask16, a: __m512) -> __m512 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vrndscaleps(a, IMM8, zero, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ps&expand=4781)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))] // editorial note: the nonzero scale bits are presumably chosen so codegen cannot lower to vroundps
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_roundscale_ps<const IMM8: i32>(a: __m256) -> __m256 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ let r = vrndscaleps256(a, IMM8, zero, 0b11111111);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ps&expand=4779)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_roundscale_ps<const IMM8: i32>(
+ src: __m256,
+ k: __mmask8,
+ a: __m256,
+) -> __m256 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let src = src.as_f32x8();
+ let r = vrndscaleps256(a, IMM8, src, k);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ps&expand=4780)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_roundscale_ps<const IMM8: i32>(k: __mmask8, a: __m256) -> __m256 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ let r = vrndscaleps256(a, IMM8, zero, k);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ps&expand=4778)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))] // editorial note: the nonzero scale bits are presumably chosen so codegen cannot lower to vroundps
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_roundscale_ps<const IMM8: i32>(a: __m128) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vrndscaleps128(a, IMM8, zero, 0b00001111);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ps&expand=4776)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_roundscale_ps<const IMM8: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vrndscaleps128(a, IMM8, src, k);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ps&expand=4777)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_roundscale_ps<const IMM8: i32>(k: __mmask8, a: __m128) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vrndscaleps128(a, IMM8, zero, k);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=4775)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_roundscale_pd<const IMM8: i32>(a: __m512d) -> __m512d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vrndscalepd(a, IMM8, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=4773)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_roundscale_pd<const IMM8: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x8();
+ let src = src.as_f64x8();
+ let r = vrndscalepd(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=4774)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m512d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vrndscalepd(a, IMM8, zero, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_pd&expand=4772)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_roundscale_pd<const IMM8: i32>(a: __m256d) -> __m256d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ let r = vrndscalepd256(a, IMM8, zero, 0b00001111);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_pd&expand=4770)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_roundscale_pd<const IMM8: i32>(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+) -> __m256d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x4();
+ let src = src.as_f64x4();
+ let r = vrndscalepd256(a, IMM8, src, k);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_pd&expand=4771)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m256d) -> __m256d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ let r = vrndscalepd256(a, IMM8, zero, k);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_pd&expand=4769)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_roundscale_pd<const IMM8: i32>(a: __m128d) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vrndscalepd128(a, IMM8, zero, 0b00000011);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_pd&expand=4767)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_roundscale_pd<const IMM8: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vrndscalepd128(a, IMM8, src, k);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_pd&expand=4768)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m128d) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vrndscalepd128(a, IMM8, zero, k);
+ transmute(r)
+}
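+
+// A minimal usage sketch, assuming an AVX-512F/AVX-512VL-capable CPU: bits
+// [7:4] of IMM8 select the fraction-bit count M and bits [2:0] the rounding
+// mode, so roundscale computes round(x * 2^M) / 2^M per lane.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn roundscale_pd_sketch() {
+    let a = _mm_set1_pd(1.125);
+    // IMM8 = 0b0001_0011: M = 1 fraction bit, round toward zero.
+    // 1.125 * 2 = 2.25 truncates to 2, giving 2 / 2 = 1.0.
+    let r = _mm_roundscale_pd::<0b0001_0011>(a);
+    assert_eq!(_mm_cvtsd_f64(r), 1.0);
+}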
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ps&expand=4883)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 {
+ transmute(vscalefps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=4881)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ transmute(vscalefps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ src.as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=4882)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ transmute(vscalefps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ps&expand=4880)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm256_scalef_ps(a: __m256, b: __m256) -> __m256 {
+ transmute(vscalefps256(
+ a.as_f32x8(),
+ b.as_f32x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ 0b11111111,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ps&expand=4878)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm256_mask_scalef_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), src.as_f32x8(), k))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ps&expand=4879)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm256_maskz_scalef_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ transmute(vscalefps256(
+ a.as_f32x8(),
+ b.as_f32x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ k,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ps&expand=4877)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm_scalef_ps(a: __m128, b: __m128) -> __m128 {
+ transmute(vscalefps128(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b00001111,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ps&expand=4875)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm_mask_scalef_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ps&expand=4876)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm_maskz_scalef_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vscalefps128(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ ))
+}
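+
+// A minimal usage sketch, assuming an AVX-512F/AVX-512VL-capable CPU: scalef
+// computes a * 2^floor(b) per lane, i.e. a vectorized ldexp with the
+// exponent taken from b.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn scalef_ps_sketch() {
+    let a = _mm_set1_ps(3.0);
+    let b = _mm_set1_ps(2.7);
+    // floor(2.7) = 2, so every lane becomes 3.0 * 2^2 = 12.0.
+    let r = _mm_scalef_ps(a, b);
+    assert_eq!(_mm_cvtss_f32(r), 12.0);
+}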
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=4874)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d {
+ transmute(vscalefpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=4872)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ transmute(vscalefpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ src.as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=4873)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ transmute(vscalefpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_pd&expand=4871)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm256_scalef_pd(a: __m256d, b: __m256d) -> __m256d {
+ transmute(vscalefpd256(
+ a.as_f64x4(),
+ b.as_f64x4(),
+ _mm256_setzero_pd().as_f64x4(),
+ 0b00001111,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_pd&expand=4869)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm256_mask_scalef_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), src.as_f64x4(), k))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_pd&expand=4870)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm256_maskz_scalef_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ transmute(vscalefpd256(
+ a.as_f64x4(),
+ b.as_f64x4(),
+ _mm256_setzero_pd().as_f64x4(),
+ k,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_pd&expand=4868)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm_scalef_pd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefpd128(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b00000011,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_pd&expand=4866)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm_mask_scalef_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_pd&expand=4867)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm_maskz_scalef_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefpd128(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ ))
+}
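+
+// A minimal sketch of the zero-masked form, assuming an AVX-512F/AVX-512VL
+// CPU: lanes whose mask bit is clear are zeroed instead of scaled.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn scalef_pd_maskz_sketch() {
+    let a = _mm_set1_pd(1.5);
+    let b = _mm_set1_pd(3.0);
+    // Mask 0b01: lane 0 becomes 1.5 * 2^3 = 12.0, lane 1 is zeroed.
+    let r = _mm_maskz_scalef_pd(0b01, a, b);
+    assert_eq!(_mm_cvtsd_f64(r), 12.0);
+}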
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=2499)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fixupimm_ps<const IMM8: i32>(a: __m512, b: __m512, c: __m512i) -> __m512 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_i32x16();
+ let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_ps&expand=2500)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fixupimm_ps<const IMM8: i32>(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512i,
+) -> __m512 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_i32x16();
+ let r = vfixupimmps(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=2501)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fixupimm_ps<const IMM8: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512i,
+) -> __m512 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_i32x16();
+ let r = vfixupimmpsz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fixupimm_ps&expand=2496)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_fixupimm_ps<const IMM8: i32>(a: __m256, b: __m256, c: __m256i) -> __m256 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let c = c.as_i32x8();
+ let r = vfixupimmps256(a, b, c, IMM8, 0b11111111);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fixupimm_ps&expand=2497)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_fixupimm_ps<const IMM8: i32>(
+ a: __m256,
+ k: __mmask8,
+ b: __m256,
+ c: __m256i,
+) -> __m256 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let c = c.as_i32x8();
+ let r = vfixupimmps256(a, b, c, IMM8, k);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fixupimm_ps&expand=2498)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_maskz_fixupimm_ps<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256,
+ b: __m256,
+ c: __m256i,
+) -> __m256 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let c = c.as_i32x8();
+ let r = vfixupimmpsz256(a, b, c, IMM8, k);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_ps&expand=2493)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fixupimm_ps<const IMM8: i32>(a: __m128, b: __m128, c: __m128i) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let r = vfixupimmps128(a, b, c, IMM8, 0b00001111);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_ps&expand=2494)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fixupimm_ps<const IMM8: i32>(
+ a: __m128,
+ k: __mmask8,
+ b: __m128,
+ c: __m128i,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let r = vfixupimmps128(a, b, c, IMM8, k);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_ps&expand=2495)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fixupimm_ps<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ c: __m128i,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let r = vfixupimmpsz128(a, b, c, IMM8, k);
+ transmute(r)
+}
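+
+// A minimal usage sketch, assuming an AVX-512F/AVX-512VL-capable CPU: each
+// element of c is a table of 4-bit tokens indexed by the class of the
+// matching element of b; an all-zero table selects "keep dst (= a)" for
+// every class, so the call passes a through unchanged.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn fixupimm_ps_sketch() {
+    let a = _mm_set1_ps(7.0);
+    let b = _mm_set1_ps(-0.0);
+    let r = _mm_fixupimm_ps::<0>(a, b, _mm_setzero_si128());
+    assert_eq!(_mm_cvtss_f32(r), 7.0);
+}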
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=2490)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fixupimm_pd<const IMM8: i32>(a: __m512d, b: __m512d, c: __m512i) -> __m512d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_i64x8();
+ let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_pd&expand=2491)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fixupimm_pd<const IMM8: i32>(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512i,
+) -> __m512d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_i64x8();
+ let r = vfixupimmpd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=2492)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fixupimm_pd<const IMM8: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512i,
+) -> __m512d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_i64x8();
+ let r = vfixupimmpdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fixupimm_pd&expand=2487)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_fixupimm_pd<const IMM8: i32>(a: __m256d, b: __m256d, c: __m256i) -> __m256d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let c = c.as_i64x4();
+ let r = vfixupimmpd256(a, b, c, IMM8, 0b00001111);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fixupimm_pd&expand=2488)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_fixupimm_pd<const IMM8: i32>(
+ a: __m256d,
+ k: __mmask8,
+ b: __m256d,
+ c: __m256i,
+) -> __m256d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let c = c.as_i64x4();
+ let r = vfixupimmpd256(a, b, c, IMM8, k);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fixupimm_pd&expand=2489)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_maskz_fixupimm_pd<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256d,
+ b: __m256d,
+ c: __m256i,
+) -> __m256d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let c = c.as_i64x4();
+ let r = vfixupimmpdz256(a, b, c, IMM8, k);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_pd&expand=2484)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fixupimm_pd<const IMM8: i32>(a: __m128d, b: __m128d, c: __m128i) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let r = vfixupimmpd128(a, b, c, IMM8, 0b00000011);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_pd&expand=2485)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fixupimm_pd<const IMM8: i32>(
+ a: __m128d,
+ k: __mmask8,
+ b: __m128d,
+ c: __m128i,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let r = vfixupimmpd128(a, b, c, IMM8, k);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_pd&expand=2486)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fixupimm_pd<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ c: __m128i,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let r = vfixupimmpdz128(a, b, c, IMM8, k);
+ transmute(r)
+}
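+
+// A sketch of a non-trivial fix-up table, assuming an AVX-512F/AVX-512VL CPU
+// and the token layout from Intel's VFIXUPIMM documentation: the ZERO input
+// class reads its token from bits [11:8] of c, and token 0b1010 substitutes
+// +1.0, so zeros in b become ones while other lanes keep a.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn fixupimm_pd_sketch() {
+    let a = _mm_set1_pd(5.0);
+    let b = _mm_set_pd(2.0, 0.0); // lane 0 is zero, lane 1 is a normal value
+    let c = _mm_set1_epi64x(0xA00);
+    let r = _mm_fixupimm_pd::<0>(a, b, c);
+    assert_eq!(_mm_cvtsd_f64(r), 1.0); // the zero lane was fixed up to +1.0
+}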
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5867)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_ternarylogic_epi32<const IMM8: i32>(
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let c = c.as_i32x16();
+ let r = vpternlogd(a, b, c, IMM8);
+ transmute(r)
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from src, a, and b are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5865)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_ternarylogic_epi32<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let src = src.as_i32x16();
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let r = vpternlogd(src, a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5866)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_ternarylogic_epi32<const IMM8: i32>(
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let c = c.as_i32x16();
+ let r = vpternlogd(a, b, c, IMM8);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ternarylogic_epi32&expand=5864)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_ternarylogic_epi32<const IMM8: i32>(
+ a: __m256i,
+ b: __m256i,
+ c: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let c = c.as_i32x8();
+ let r = vpternlogd256(a, b, c, IMM8);
+ transmute(r)
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from src, a, and b are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ternarylogic_epi32&expand=5862)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_ternarylogic_epi32<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let src = src.as_i32x8();
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let r = vpternlogd256(src, a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ternarylogic_epi32&expand=5863)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_maskz_ternarylogic_epi32<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+ c: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let c = c.as_i32x8();
+ let r = vpternlogd256(a, b, c, IMM8);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ternarylogic_epi32&expand=5861)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_ternarylogic_epi32<const IMM8: i32>(
+ a: __m128i,
+ b: __m128i,
+ c: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let c = c.as_i32x4();
+ let r = vpternlogd128(a, b, c, IMM8);
+ transmute(r)
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from src, a, and b are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ternarylogic_epi32&expand=5859)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_ternarylogic_epi32<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let src = src.as_i32x4();
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let r = vpternlogd128(src, a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 32-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ternarylogic_epi32&expand=5860)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_ternarylogic_epi32<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+ c: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let c = c.as_i32x4();
+ let r = vpternlogd128(a, b, c, IMM8);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
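+
+// A minimal usage sketch, assuming an AVX-512F/AVX-512VL-capable CPU: IMM8
+// is the eight-entry truth table of the desired function, indexed by
+// (a_bit << 2) | (b_bit << 1) | c_bit. 0xE8 encodes bitwise majority.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn ternarylogic_epi32_sketch() {
+    let a = _mm_set1_epi32(0b1100);
+    let b = _mm_set1_epi32(0b1010);
+    let c = _mm_set1_epi32(0b1001);
+    // Per-bit majority of (1100, 1010, 1001) is 1000.
+    let r = _mm_ternarylogic_epi32::<0xE8>(a, b, c);
+    assert_eq!(_mm_cvtsi128_si32(r), 0b1000);
+}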
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5876)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_ternarylogic_epi64<const IMM8: i32>(
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x8();
+ let b = b.as_i64x8();
+ let c = c.as_i64x8();
+ let r = vpternlogq(a, b, c, IMM8);
+ transmute(r)
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from src, a, and b are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5874)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_ternarylogic_epi64<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let src = src.as_i64x8();
+ let a = a.as_i64x8();
+ let b = b.as_i64x8();
+ let r = vpternlogq(src, a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5875)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_ternarylogic_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x8();
+ let b = b.as_i64x8();
+ let c = c.as_i64x8();
+ let r = vpternlogq(a, b, c, IMM8);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ternarylogic_epi64&expand=5873)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_ternarylogic_epi64<const IMM8: i32>(
+ a: __m256i,
+ b: __m256i,
+ c: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x4();
+ let b = b.as_i64x4();
+ let c = c.as_i64x4();
+ let r = vpternlogq256(a, b, c, IMM8);
+ transmute(r)
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from src, a, and b are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ternarylogic_epi64&expand=5871)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_ternarylogic_epi64<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let src = src.as_i64x4();
+ let a = a.as_i64x4();
+ let b = b.as_i64x4();
+ let r = vpternlogq256(src, a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ternarylogic_epi64&expand=5872)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_maskz_ternarylogic_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+ c: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x4();
+ let b = b.as_i64x4();
+ let c = c.as_i64x4();
+ let r = vpternlogq256(a, b, c, IMM8);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ternarylogic_epi64&expand=5870)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_ternarylogic_epi64<const IMM8: i32>(
+ a: __m128i,
+ b: __m128i,
+ c: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x2();
+ let b = b.as_i64x2();
+ let c = c.as_i64x2();
+ let r = vpternlogq128(a, b, c, IMM8);
+ transmute(r)
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from src, a, and b are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ternarylogic_epi64&expand=5868)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_ternarylogic_epi64<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let src = src.as_i64x2();
+ let a = a.as_i64x2();
+ let b = b.as_i64x2();
+ let r = vpternlogq128(src, a, b, IMM8);
+ transmute(simd_select_bitmask(k, r, src))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in imm8. For each bit in each packed 64-bit integer, the corresponding bits from a, b, and c are used to form a 3-bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ternarylogic_epi64&expand=5869)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_ternarylogic_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+ c: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x2();
+ let b = b.as_i64x2();
+ let c = c.as_i64x2();
+ let r = vpternlogq128(a, b, c, IMM8);
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, r, zero))
+}
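+
+// A companion sketch for the 64-bit form, same assumptions as above: 0x96 is
+// the truth table of a three-way XOR (the carry-free sum of adder networks).
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn ternarylogic_epi64_sketch() {
+    let a = _mm_set1_epi64x(0b1100);
+    let b = _mm_set1_epi64x(0b1010);
+    let c = _mm_set1_epi64x(0b1001);
+    // 0b1100 ^ 0b1010 ^ 0b1001 = 0b1111; read back the low 32 bits of lane 0.
+    let r = _mm_ternarylogic_epi64::<0x96>(a, b, c);
+    assert_eq!(_mm_cvtsi128_si32(r), 0b1111);
+}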
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ps&expand=2880)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+pub unsafe fn _mm512_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m512,
+) -> __m512 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vgetmantps(
+ a,
+ SIGN << 2 | NORM,
+ zero,
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ps&expand=2881)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm512_mask_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x16();
+ let src = src.as_f32x16();
+ let r = vgetmantps(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ps&expand=2882)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm512_maskz_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vgetmantps(a, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ps&expand=2877)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+pub unsafe fn _mm256_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m256,
+) -> __m256 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ let r = vgetmantps256(a, SIGN << 2 | NORM, zero, 0b11111111);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ps&expand=2878)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm256_mask_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m256,
+ k: __mmask8,
+ a: __m256,
+) -> __m256 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x8();
+ let src = src.as_f32x8();
+ let r = vgetmantps256(a, SIGN << 2 | NORM, src, k);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ps&expand=2879)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm256_maskz_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m256,
+) -> __m256 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ let r = vgetmantps256(a, SIGN << 2 | NORM, zero, k);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ps&expand=2874)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+pub unsafe fn _mm_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vgetmantps128(a, SIGN << 2 | NORM, zero, 0b00001111);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ps&expand=2875)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_mask_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vgetmantps128(a, SIGN << 2 | NORM, src, k);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ps&expand=2876)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_maskz_getmant_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vgetmantps128(a, SIGN << 2 | NORM, zero, k);
+ transmute(r)
+}
+
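+// A minimal illustrative usage sketch (editorial addition, not part of the
+// upstream diff): normalizing mantissas to [1, 2) while keeping the source
+// sign. -24.0 is -1.5 * 2^4, so every lane comes back as -1.5. Assumes the
+// caller has verified avx512f and avx512vl support at runtime.
+#[cfg(test)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn getmant_ps_usage_sketch() {
+    let a = _mm_set1_ps(-24.0);
+    let m = _mm_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_src>(a);
+    let mut out = [0.0f32; 4];
+    _mm_storeu_ps(out.as_mut_ptr(), m);
+    assert_eq!(out, [-1.5; 4]);
+}
+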
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_pd&expand=2871)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+pub unsafe fn _mm512_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vgetmantpd(
+ a,
+ SIGN << 2 | NORM,
+ zero,
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_pd&expand=2872)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm512_mask_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x8();
+ let src = src.as_f64x8();
+ let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_pd&expand=2873)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm512_maskz_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vgetmantpd(a, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_pd&expand=2868)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+pub unsafe fn _mm256_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m256d,
+) -> __m256d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ let r = vgetmantpd256(a, SIGN << 2 | NORM, zero, 0b00001111);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_pd&expand=2869)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm256_mask_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+) -> __m256d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x4();
+ let src = src.as_f64x4();
+ let r = vgetmantpd256(a, SIGN << 2 | NORM, src, k);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_pd&expand=2870)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm256_maskz_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m256d,
+) -> __m256d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ let r = vgetmantpd256(a, SIGN << 2 | NORM, zero, k);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_pd&expand=2865)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+pub unsafe fn _mm_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vgetmantpd128(a, SIGN << 2 | NORM, zero, 0b00000011);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_pd&expand=2866)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_mask_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vgetmantpd128(a, SIGN << 2 | NORM, src, k);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_pd&expand=2867)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_maskz_getmant_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vgetmantpd128(a, SIGN << 2 | NORM, zero, k);
+ transmute(r)
+}
+
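+// A minimal illustrative usage sketch (editorial addition, not part of the
+// upstream diff): the 512-bit double-precision variant with the sign forced
+// to zero, so the result is the magnitude of the mantissa. -6.0 is
+// -1.5 * 2^2; with _MM_MANT_SIGN_zero each lane becomes +1.5.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn getmant_pd_usage_sketch() {
+    let a = _mm512_set1_pd(-6.0);
+    let m = _mm512_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero>(a);
+    let mut out = [0.0f64; 8];
+    _mm512_storeu_pd(out.as_mut_ptr(), m);
+    assert_eq!(out, [1.5; 8]);
+}
+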
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ps&expand=145)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_add_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vaddps(a, b, ROUNDING);
+ transmute(r)
+}
+
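+// A minimal illustrative usage sketch (editorial addition, not part of the
+// upstream diff): the explicit rounding mode is observable when the exact
+// sum is not representable. 1.0 + 2^-25 rounds to 1.0 under
+// round-to-nearest, but to the next float up (1.0 + 2^-23) under
+// round-toward-positive-infinity.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn add_round_usage_sketch() {
+    let a = _mm512_set1_ps(1.0);
+    let b = _mm512_set1_ps(1.0 / 33554432.0); // 2^-25
+    let near = _mm512_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+    let up = _mm512_add_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b);
+    assert_eq!(_mm512_cvtss_f32(near), 1.0);
+    assert_eq!(_mm512_cvtss_f32(up), 1.0 + f32::EPSILON);
+}
+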
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ps&expand=146)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_add_round_ps<const ROUNDING: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vaddps(a, b, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ps&expand=147)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_add_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vaddps(a, b, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_pd&expand=142)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_add_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vaddpd(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_pd&expand=143)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_add_round_pd<const ROUNDING: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vaddpd(a, b, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f64x8()))
+}
+
+/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_pd&expand=144)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_add_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vaddpd(a, b, ROUNDING);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
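+// A minimal illustrative usage sketch (editorial addition, not part of the
+// upstream diff): the zeromask behaviour. Only lanes whose mask bit is set
+// receive the sum; the rest are zeroed rather than merged from a source
+// vector.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn maskz_add_round_usage_sketch() {
+    let a = _mm512_set1_pd(1.0);
+    let b = _mm512_set1_pd(2.0);
+    let r = _mm512_maskz_add_round_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a, b);
+    let mut out = [0.0f64; 8];
+    _mm512_storeu_pd(out.as_mut_ptr(), r);
+    assert_eq!(out, [3.0, 3.0, 3.0, 3.0, 0.0, 0.0, 0.0, 0.0]);
+}
+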
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ps&expand=5739)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_sub_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vsubps(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5737)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_sub_round_ps<const ROUNDING: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vsubps(a, b, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5738)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_sub_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vsubps(a, b, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_pd&expand=5736)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_sub_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vsubpd(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_pd&expand=5734)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_sub_round_pd<const ROUNDING: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vsubpd(a, b, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f64x8()))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_pd&expand=5735)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_sub_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vsubpd(a, b, ROUNDING);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
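+// A minimal illustrative usage sketch (editorial addition, not part of the
+// upstream diff): the writemask (merge) behaviour. Lanes with a clear mask
+// bit keep the value from `src` instead of taking the subtraction result.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn mask_sub_round_usage_sketch() {
+    let src = _mm512_set1_ps(9.0);
+    let a = _mm512_set1_ps(5.0);
+    let b = _mm512_set1_ps(3.0);
+    let r = _mm512_mask_sub_round_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0b0000_0000_1111_1111, a, b);
+    let mut out = [0.0f32; 16];
+    _mm512_storeu_ps(out.as_mut_ptr(), r);
+    assert_eq!(out[0], 2.0); // computed lane: 5.0 - 3.0
+    assert_eq!(out[15], 9.0); // masked-off lane keeps src
+}
+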
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ps&expand=3940)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_mul_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vmulps(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ps&expand=3938)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_mul_round_ps<const ROUNDING: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vmulps(a, b, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ps&expand=3939)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_mul_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vmulps(a, b, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pd&expand=3937)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_mul_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vmulpd(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pd&expand=3935)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_mul_round_pd<const ROUNDING: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vmulpd(a, b, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f64x8()))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pd&expand=3936)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_mul_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vmulpd(a, b, ROUNDING);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ps&expand=2168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_div_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vdivps(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ps&expand=2169)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_div_round_ps<const ROUNDING: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vdivps(a, b, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ps&expand=2170)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_div_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vdivps(a, b, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_pd&expand=2165)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_div_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vdivpd(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_pd&expand=2166)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_div_round_pd<const ROUNDING: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vdivpd(a, b, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f64x8()))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_pd&expand=2167)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_div_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vdivpd(a, b, ROUNDING);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
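+// A minimal illustrative usage sketch (editorial addition, not part of the
+// upstream diff): 1.0 / 3.0 has no exact binary representation, so rounding
+// down and rounding up produce adjacent floats that differ by one ulp.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn div_round_usage_sketch() {
+    let a = _mm512_set1_pd(1.0);
+    let b = _mm512_set1_pd(3.0);
+    let down = _mm512_div_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b);
+    let up = _mm512_div_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b);
+    assert!(_mm512_cvtsd_f64(down) < _mm512_cvtsd_f64(up));
+}
+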
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ps&expand=5377)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_sqrt_round_ps<const ROUNDING: i32>(a: __m512) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let r = vsqrtps(a, ROUNDING);
+ transmute(r)
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ps&expand=5375)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_sqrt_round_ps<const ROUNDING: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let r = vsqrtps(a, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ps&expand=5376)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_sqrt_round_ps<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let r = vsqrtps(a, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_pd&expand=5374)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_sqrt_round_pd<const ROUNDING: i32>(a: __m512d) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let r = vsqrtpd(a, ROUNDING);
+ transmute(r)
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_pd&expand=5372)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_sqrt_round_pd<const ROUNDING: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let r = vsqrtpd(a, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f64x8()))
+}
+
+/// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_pd&expand=5373)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_sqrt_round_pd<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let r = vsqrtpd(a, ROUNDING);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
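+// A minimal illustrative usage sketch (editorial addition, not part of the
+// upstream diff): for an exact square root the rounding mode is irrelevant;
+// it only matters when the true result falls between two representable
+// floats.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn sqrt_round_usage_sketch() {
+    let a = _mm512_set1_pd(9.0);
+    let r = _mm512_sqrt_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+    assert_eq!(_mm512_cvtsd_f64(r), 3.0);
+}
+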
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ps&expand=2565)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fmadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmadd132psround(a, b, c, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ps&expand=2566)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fmadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmadd132psround(a, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ps&expand=2568)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fmadd_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmadd132psround(a, b, c, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ps&expand=2567)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fmadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+ k: __mmask16,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmadd132psround(a, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
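+
+// A minimal sketch of how the four `fmadd_round_ps` variants treat unselected
+// lanes, assuming AVX-512F support has been verified at runtime:
+//
+//     const R: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+//     let (a, b, c) = (_mm512_set1_ps(2.), _mm512_set1_ps(3.), _mm512_set1_ps(1.));
+//     let full   = _mm512_fmadd_round_ps::<R>(a, b, c);               // all lanes 2*3 + 1 = 7.0
+//     let merged = _mm512_mask_fmadd_round_ps::<R>(a, 0xFF00, b, c);  // low 8 lanes copied from `a`
+//     let zeroed = _mm512_maskz_fmadd_round_ps::<R>(0xFF00, a, b, c); // low 8 lanes zeroed
+//     let from_c = _mm512_mask3_fmadd_round_ps::<R>(a, b, c, 0xFF00); // low 8 lanes copied from `c`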
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pd&expand=2561)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fmadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmadd132pdround(a, b, c, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pd&expand=2562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fmadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmadd132pdround(a, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pd&expand=2564)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fmadd_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmadd132pdround(a, b, c, ROUNDING);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pd&expand=2563)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fmadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+ k: __mmask8,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmadd132pdround(a, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
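+
+// Because the multiply-add is fused, a single rounding step is applied to
+// a*b + c, and the ROUNDING parameter selects it. A sketch, assuming
+// AVX-512F is available:
+//
+//     let (a, b, c) = (_mm512_set1_pd(2.0), _mm512_set1_pd(3.0), _mm512_set1_pd(1.0));
+//     // Round the fused result toward negative infinity, suppressing exceptions.
+//     let r = _mm512_fmadd_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b, c);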
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ps&expand=2651)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fmsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f32x16());
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vfmadd132psround(a, b, sub, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ps&expand=2652)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fmsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f32x16());
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vfmadd132psround(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ps&expand=2654)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fmsub_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f32x16());
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vfmadd132psround(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ps&expand=2653)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fmsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+ k: __mmask16,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let c = c.as_f32x16();
+ let sub = simd_sub(zero, c);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vfmadd132psround(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
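+
+// As the bodies above show, fmsub is expressed here as fmadd with a negated
+// `c` (a*b - c == a*b + (-c)), which is why the test assertion accepts
+// `vfmadd`. A sketch, assuming AVX-512F is available:
+//
+//     let (a, b, c) = (_mm512_set1_ps(2.), _mm512_set1_ps(3.), _mm512_set1_ps(1.));
+//     // Every lane holds 2*3 - 1 = 5.0.
+//     let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);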
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_pd&expand=2647)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fmsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f64x8());
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vfmadd132pdround(a, b, sub, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_pd&expand=2648)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fmsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f64x8());
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vfmadd132pdround(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_pd&expand=2650)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fmsub_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f64x8());
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vfmadd132pdround(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_pd&expand=2649)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fmsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+ k: __mmask8,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let c = c.as_f64x8();
+ let sub = simd_sub(zero, c);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vfmadd132pdround(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
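+
+// `_MM_FROUND_CUR_DIRECTION` defers to the rounding mode currently held in
+// MXCSR.RC rather than encoding one statically. A sketch, assuming AVX-512F
+// is available:
+//
+//     let (a, b, c) = (_mm512_set1_pd(2.0), _mm512_set1_pd(3.0), _mm512_set1_pd(1.0));
+//     let r = _mm512_fmsub_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, b, c);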
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ps&expand=2619)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fmaddsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmaddsub213ps(a, b, c, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ps&expand=2620)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fmaddsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmaddsub213ps(a, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ps&expand=2622)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fmaddsub_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmaddsub213ps(a, b, c, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ps&expand=2621)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fmaddsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+ k: __mmask16,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmaddsub213ps(a, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
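+
+// fmaddsub alternates per lane index: even lanes compute a*b - c and odd
+// lanes compute a*b + c. A sketch, assuming AVX-512F is available:
+//
+//     let (a, b, c) = (_mm512_set1_ps(2.), _mm512_set1_ps(3.), _mm512_set1_ps(1.));
+//     // Even lanes hold 6 - 1 = 5.0, odd lanes hold 6 + 1 = 7.0.
+//     let r = _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);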
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_pd&expand=2615)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fmaddsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmaddsub213pd(a, b, c, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_pd&expand=2616)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fmaddsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmaddsub213pd(a, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_pd&expand=2618)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fmaddsub_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmaddsub213pd(a, b, c, ROUNDING);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_pd&expand=2617)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fmaddsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+ k: __mmask8,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmaddsub213pd(a, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
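+
+// A merge-masked sketch for the double-precision fmaddsub family, assuming
+// AVX-512F is available; lanes whose mask bit is clear keep the value from `a`:
+//
+//     let (a, b, c) = (_mm512_set1_pd(2.0), _mm512_set1_pd(3.0), _mm512_set1_pd(1.0));
+//     let r = _mm512_mask_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 0b0101_0101, b, c);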
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ps&expand=2699)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fmsubadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f32x16());
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vfmaddsub213ps(a, b, sub, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ps&expand=2700)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fmsubadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f32x16());
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vfmaddsub213ps(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ps&expand=2702)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fmsubadd_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f32x16());
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vfmaddsub213ps(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ps&expand=2701)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fmsubadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+ k: __mmask16,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let c = c.as_f32x16();
+ let sub = simd_sub(zero, c);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vfmaddsub213ps(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
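+
+// fmsubadd mirrors fmaddsub (even lanes add, odd lanes subtract); negating
+// `c` before calling the fmaddsub intrinsic, as the bodies above do, flips
+// the pattern. A sketch, assuming AVX-512F is available:
+//
+//     let (a, b, c) = (_mm512_set1_ps(2.), _mm512_set1_ps(3.), _mm512_set1_ps(1.));
+//     // Even lanes hold 6 + 1 = 7.0, odd lanes hold 6 - 1 = 5.0.
+//     let r = _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);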
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_pd&expand=2695)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fmsubadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f64x8());
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vfmaddsub213pd(a, b, sub, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_pd&expand=2696)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fmsubadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f64x8());
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vfmaddsub213pd(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_pd&expand=2698)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fmsubadd_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, c.as_f64x8());
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vfmaddsub213pd(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_pd&expand=2697)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fmsubadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+ k: __mmask8,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let c = c.as_f64x8();
+ let sub = simd_sub(zero, c);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vfmaddsub213pd(a, b, sub, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
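+
+// A zero-masked sketch for the double-precision fmsubadd family, assuming
+// AVX-512F is available; lanes whose mask bit is clear become 0.0:
+//
+//     let (a, b, c) = (_mm512_set1_pd(2.0), _mm512_set1_pd(3.0), _mm512_set1_pd(1.0));
+//     let r = _mm512_maskz_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0xF0, a, b, c);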
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ps&expand=2731)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fnmadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f32x16());
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmadd132psround(sub, b, c, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ps&expand=2732)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fnmadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f32x16());
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmadd132psround(sub, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a.as_f32x16()))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ps&expand=2734)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fnmadd_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f32x16());
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmadd132psround(sub, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ps&expand=2733)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fnmadd_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+ k: __mmask16,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f32x16());
+ let b = b.as_f32x16();
+ let c = c.as_f32x16();
+ let r = vfmadd132psround(sub, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
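+
+// fnmadd computes -(a*b) + c; the bodies above obtain it by negating `a`
+// before the fused multiply-add, since (-a)*b + c == -(a*b) + c. A sketch,
+// assuming AVX-512F is available:
+//
+//     let (a, b, c) = (_mm512_set1_ps(2.), _mm512_set1_ps(3.), _mm512_set1_ps(1.));
+//     // Every lane holds -(2*3) + 1 = -5.0.
+//     let r = _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);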
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_pd&expand=2711)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fnmadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f64x8());
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmadd132pdround(sub, b, c, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_pd&expand=2728)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fnmadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let a = a.as_f64x8();
+ let sub = simd_sub(zero, a);
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmadd132pdround(sub, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_pd&expand=2730)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fnmadd_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f64x8());
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmadd132pdround(sub, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_pd&expand=2729)
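+///
+/// # Examples
+///
+/// A sketch of the `mask3` masking flavor (illustrative only, not from Intel's
+/// documentation): lanes whose mask bit is clear keep the value from `c`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_pd(2.0);
+///             let b = _mm512_set1_pd(3.0);
+///             let c = _mm512_set1_pd(1.0);
+///             let r = _mm512_mask3_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+///                 a, b, c, 0b00001111,
+///             );
+///             let mut out = [0.0f64; 8];
+///             _mm512_storeu_pd(out.as_mut_ptr(), r);
+///             // Lanes 0-3: -(2.0 * 3.0) + 1.0 = -5.0; lanes 4-7 keep c = 1.0.
+///             assert_eq!(out, [-5.0, -5.0, -5.0, -5.0, 1.0, 1.0, 1.0, 1.0]);
+///         }
+///     }
+/// }
+/// ```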
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fnmadd_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+ k: __mmask8,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let sub = simd_sub(zero, a.as_f64x8());
+ let b = b.as_f64x8();
+ let c = c.as_f64x8();
+ let r = vfmadd132pdround(sub, b, c, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ps&expand=2779)
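+///
+/// # Examples
+///
+/// A minimal sketch (illustrative addition), assuming runtime AVX-512F support:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_ps(2.0);
+///             let b = _mm512_set1_ps(3.0);
+///             let c = _mm512_set1_ps(1.0);
+///             // Each lane computes -(2.0 * 3.0) - 1.0 = -7.0.
+///             let r = _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out, [-7.0; 16]);
+///         }
+///     }
+/// }
+/// ```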
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fnmsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let suba = simd_sub(zero, a.as_f32x16());
+ let subc = simd_sub(zero, c.as_f32x16());
+ let b = b.as_f32x16();
+ let r = vfmadd132psround(suba, b, subc, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ps&expand=2780)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fnmsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let a = a.as_f32x16();
+ let suba = simd_sub(zero, a);
+ let subc = simd_sub(zero, c.as_f32x16());
+ let b = b.as_f32x16();
+ let r = vfmadd132psround(suba, b, subc, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ps&expand=2782)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fnmsub_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let suba = simd_sub(zero, a.as_f32x16());
+ let subc = simd_sub(zero, c.as_f32x16());
+ let b = b.as_f32x16();
+ let r = vfmadd132psround(suba, b, subc, ROUNDING);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ps&expand=2781)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fnmsub_round_ps<const ROUNDING: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512,
+ k: __mmask16,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let zero: f32x16 = mem::zeroed();
+ let suba = simd_sub(zero, a.as_f32x16());
+ let c = c.as_f32x16();
+ let subc = simd_sub(zero, c);
+ let b = b.as_f32x16();
+ let r = vfmadd132psround(suba, b, subc, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_pd&expand=2775)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_fnmsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let suba = simd_sub(zero, a.as_f64x8());
+ let subc = simd_sub(zero, c.as_f64x8());
+ let b = b.as_f64x8();
+ let r = vfmadd132pdround(suba, b, subc, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_pd&expand=2776)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_fnmsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let a = a.as_f64x8();
+ let suba = simd_sub(zero, a);
+ let subc = simd_sub(zero, c.as_f64x8());
+ let b = b.as_f64x8();
+ let r = vfmadd132pdround(suba, b, subc, ROUNDING);
+ transmute(simd_select_bitmask(k, r, a))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_pd&expand=2778)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_maskz_fnmsub_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let suba = simd_sub(zero, a.as_f64x8());
+ let subc = simd_sub(zero, c.as_f64x8());
+ let b = b.as_f64x8();
+ let r = vfmadd132pdround(suba, b, subc, ROUNDING);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_pd&expand=2777)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask3_fnmsub_round_pd<const ROUNDING: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512d,
+ k: __mmask8,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let zero: f64x8 = mem::zeroed();
+ let suba = simd_sub(zero, a.as_f64x8());
+ let c = c.as_f64x8();
+ let subc = simd_sub(zero, c);
+ let b = b.as_f64x8();
+ let r = vfmadd132pdround(suba, b, subc, ROUNDING);
+ transmute(simd_select_bitmask(k, r, c))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ps&expand=3662)
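+///
+/// # Examples
+///
+/// A minimal sketch (illustrative addition); passing `_MM_FROUND_NO_EXC` suppresses
+/// exceptions:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_ps(1.0);
+///             let b = _mm512_set1_ps(2.0);
+///             let r = _mm512_max_round_ps::<_MM_FROUND_NO_EXC>(a, b);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out, [2.0; 16]);
+///         }
+///     }
+/// }
+/// ```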
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_max_round_ps<const SAE: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vmaxps(a, b, SAE);
+ transmute(r)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ps&expand=3660)
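+///
+/// # Examples
+///
+/// A sketch of writemask behavior (illustrative only): lanes with a cleared mask bit are
+/// copied from `src`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let src = _mm512_set1_ps(9.0);
+///             let a = _mm512_set1_ps(1.0);
+///             let b = _mm512_set1_ps(2.0);
+///             // Only the low eight lanes take the maximum; the rest keep 9.0 from src.
+///             let r = _mm512_mask_max_round_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a, b);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out[0], 2.0);
+///             assert_eq!(out[15], 9.0);
+///         }
+///     }
+/// }
+/// ```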
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_max_round_ps<const SAE: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vmaxps(a, b, SAE);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ps&expand=3661)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_max_round_ps<const SAE: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vmaxps(a, b, SAE);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_pd&expand=3659)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_max_round_pd<const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vmaxpd(a, b, SAE);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_pd&expand=3657)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_max_round_pd<const SAE: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vmaxpd(a, b, SAE);
+ transmute(simd_select_bitmask(k, r, src.as_f64x8()))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_pd&expand=3658)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_max_round_pd<const SAE: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vmaxpd(a, b, SAE);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ps&expand=3776)
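+///
+/// # Examples
+///
+/// A minimal sketch (illustrative addition):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_ps(-0.5);
+///             let b = _mm512_set1_ps(0.5);
+///             let r = _mm512_min_round_ps::<_MM_FROUND_NO_EXC>(a, b);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out, [-0.5; 16]);
+///         }
+///     }
+/// }
+/// ```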
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_min_round_ps<const SAE: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vminps(a, b, SAE);
+ transmute(r)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ps&expand=3774)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_min_round_ps<const SAE: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vminps(a, b, SAE);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ps&expand=3775)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_min_round_ps<const SAE: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vminps(a, b, SAE);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_pd&expand=3773)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_min_round_pd<const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vminpd(a, b, SAE);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_pd&expand=3771)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_min_round_pd<const SAE: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vminpd(a, b, SAE);
+ transmute(simd_select_bitmask(k, r, src.as_f64x8()))
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_pd&expand=3772)
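+///
+/// # Examples
+///
+/// A sketch of zeromask behavior (illustrative only): lanes with a cleared mask bit become
+/// 0.0:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_pd(3.0);
+///             let b = _mm512_set1_pd(4.0);
+///             let r = _mm512_maskz_min_round_pd::<_MM_FROUND_NO_EXC>(0b00001111, a, b);
+///             let mut out = [0.0f64; 8];
+///             _mm512_storeu_pd(out.as_mut_ptr(), r);
+///             // Lanes 0-3 hold min(3.0, 4.0) = 3.0; lanes 4-7 are zeroed.
+///             assert_eq!(out, [3.0, 3.0, 3.0, 3.0, 0.0, 0.0, 0.0, 0.0]);
+///         }
+///     }
+/// }
+/// ```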
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_min_round_pd<const SAE: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vminpd(a, b, SAE);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ps&expand=2850)
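+///
+/// # Examples
+///
+/// A minimal sketch (illustrative addition): 8.0 = 1.0 * 2^3, so its extracted exponent is
+/// 3.0:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_ps(8.0);
+///             let r = _mm512_getexp_round_ps::<_MM_FROUND_NO_EXC>(a);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out, [3.0; 16]);
+///         }
+///     }
+/// }
+/// ```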
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_getexp_round_ps<const SAE: i32>(a: __m512) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vgetexpps(a, zero, 0b11111111_11111111, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ps&expand=2851)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_getexp_round_ps<const SAE: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let src = src.as_f32x16();
+ let r = vgetexpps(a, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ps&expand=2852)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_getexp_round_ps<const SAE: i32>(k: __mmask16, a: __m512) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vgetexpps(a, zero, k, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_pd&expand=2847)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_getexp_round_pd<const SAE: i32>(a: __m512d) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vgetexppd(a, zero, 0b11111111, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_pd&expand=2848)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_getexp_round_pd<const SAE: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let src = src.as_f64x8();
+ let r = vgetexppd(a, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_pd&expand=2849)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_getexp_round_pd<const SAE: i32>(k: __mmask8, a: __m512d) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vgetexppd(a, zero, k, SAE);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=4790)
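+///
+/// # Examples
+///
+/// A minimal sketch (illustrative addition): `IMM8 = 0` keeps zero fraction bits and rounds
+/// to the nearest integer:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_ps(1.25);
+///             let r = _mm512_roundscale_round_ps::<0, _MM_FROUND_NO_EXC>(a);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out, [1.0; 16]);
+///         }
+///     }
+/// }
+/// ```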
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+pub unsafe fn _mm512_roundscale_round_ps<const IMM8: i32, const SAE: i32>(a: __m512) -> __m512 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vrndscaleps(a, IMM8, zero, 0b11111111_11111111, SAE);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=4788)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm512_mask_roundscale_round_ps<const IMM8: i32, const SAE: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let src = src.as_f32x16();
+ let r = vrndscaleps(a, IMM8, src, k, SAE);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=4789)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm512_maskz_roundscale_round_ps<const IMM8: i32, const SAE: i32>(
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vrndscaleps(a, IMM8, zero, k, SAE);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=4787)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+pub unsafe fn _mm512_roundscale_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d) -> __m512d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vrndscalepd(a, IMM8, zero, 0b11111111, SAE);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=4785)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm512_mask_roundscale_round_pd<const IMM8: i32, const SAE: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let src = src.as_f64x8();
+ let r = vrndscalepd(a, IMM8, src, k, SAE);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=4786)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm512_maskz_roundscale_round_pd<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vrndscalepd(a, IMM8, zero, k, SAE);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ps&expand=4889)
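+///
+/// # Examples
+///
+/// A minimal sketch (illustrative addition): each lane computes a * 2^floor(b):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_ps(3.0);
+///             let b = _mm512_set1_ps(2.0);
+///             // 3.0 * 2^2 = 12.0 in every lane.
+///             let r = _mm512_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out, [12.0; 16]);
+///         }
+///     }
+/// }
+/// ```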
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_scalef_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vscalefps(a, b, zero, 0b11111111_11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=4887)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_scalef_round_ps<const ROUNDING: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let src = src.as_f32x16();
+ let r = vscalefps(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=4888)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_scalef_round_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vscalefps(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_pd&expand=4886)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_scalef_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vscalefpd(a, b, zero, 0b11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=4884)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_scalef_round_pd<const ROUNDING: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let src = src.as_f64x8();
+ let r = vscalefpd(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=4885)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_scalef_round_pd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vscalefpd(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=2505)
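+///
+/// # Examples
+///
+/// A call-shape sketch (illustrative only; see Intel's fixup token table for real fixups).
+/// With an all-zero table in `c`, every lane takes token 0 and passes `a` through unchanged:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_ps(5.0);
+///             let b = _mm512_set1_ps(1.0);
+///             let c = _mm512_setzero_si512();
+///             let r = _mm512_fixupimm_round_ps::<0, _MM_FROUND_NO_EXC>(a, b, c);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out, [5.0; 16]);
+///         }
+///     }
+/// }
+/// ```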
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm512_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+ a: __m512,
+ b: __m512,
+ c: __m512i,
+) -> __m512 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_i32x16();
+ let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, SAE);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_ps&expand=2506)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm512_mask_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512i,
+) -> __m512 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_i32x16();
+ let r = vfixupimmps(a, b, c, IMM8, k, SAE);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=2507)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm512_maskz_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512i,
+) -> __m512 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let c = c.as_i32x16();
+ let r = vfixupimmpsz(a, b, c, IMM8, k, SAE);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=2502)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm512_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+ a: __m512d,
+ b: __m512d,
+ c: __m512i,
+) -> __m512d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_i64x8();
+ let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, SAE);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_pd&expand=2503)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm512_mask_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512i,
+) -> __m512d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_i64x8();
+ let r = vfixupimmpd(a, b, c, IMM8, k, SAE);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=2504)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm512_maskz_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512i,
+) -> __m512d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let c = c.as_i64x8();
+ let r = vfixupimmpdz(a, b, c, IMM8, k, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ps&expand=2886)
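+///
+/// # Examples
+///
+/// A minimal sketch (illustrative addition) using the `_MM_MANT_*` constants from this
+/// module: 12.0 = 1.5 * 2^3, so its mantissa normalized to [1, 2) is 1.5:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     if is_x86_feature_detected!("avx512f") {
+///         unsafe {
+///             let a = _mm512_set1_ps(12.0);
+///             let r = _mm512_getmant_round_ps::<
+///                 _MM_MANT_NORM_1_2,
+///                 _MM_MANT_SIGN_SRC,
+///                 _MM_FROUND_CUR_DIRECTION,
+///             >(a);
+///             let mut out = [0.0f32; 16];
+///             _mm512_storeu_ps(out.as_mut_ptr(), r);
+///             assert_eq!(out, [1.5; 16]);
+///         }
+///     }
+/// }
+/// ```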
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(1, 2, 3)]
+pub unsafe fn _mm512_getmant_round_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ a: __m512,
+) -> __m512 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vgetmantps(a, SIGN << 2 | NORM, zero, 0b11111111_11111111, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ps&expand=2887)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4, 5)]
+pub unsafe fn _mm512_mask_getmant_round_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let src = src.as_f32x16();
+ let r = vgetmantps(a, SIGN << 2 | NORM, src, k, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ps&expand=2888)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3, 4)]
+pub unsafe fn _mm512_maskz_getmant_round_ps<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vgetmantps(a, SIGN << 2 | NORM, zero, k, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_pd&expand=2883)
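+///
+/// A minimal sketch of how the const parameters are supplied, assuming
+/// `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let a = _mm512_set1_pd(10.0);
+/// // 10.0 == 1.25 * 2^3, so normalizing to [1, 2) yields 1.25 in each lane.
+/// let r = _mm512_getmant_round_pd::<
+///     _MM_MANT_NORM_1_2,
+///     _MM_MANT_SIGN_src,
+///     _MM_FROUND_CUR_DIRECTION,
+/// >(a);
+/// ```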
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(1, 2, 3)]
+pub unsafe fn _mm512_getmant_round_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vgetmantpd(a, SIGN << 2 | NORM, zero, 0b11111111, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_pd&expand=2884)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4, 5)]
+pub unsafe fn _mm512_mask_getmant_round_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let src = src.as_f64x8();
+ let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_pd&expand=2885)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3, 4)]
+pub unsafe fn _mm512_maskz_getmant_round_pd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vgetmantpd(a, SIGN << 2 | NORM, zero, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epi32&expand=1737)
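+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(1.5);
+/// // The default rounding mode is round-to-nearest-even, so 1.5 becomes 2.
+/// let r = _mm512_cvtps_epi32(a); // every i32 lane holds 2
+/// ```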
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+pub unsafe fn _mm512_cvtps_epi32(a: __m512) -> __m512i {
+ transmute(vcvtps2dq(
+ a.as_f32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_epi32&expand=1738)
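+///
+/// A minimal sketch of the writemask semantics (the zeromask variant below
+/// zeroes instead of copying), assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let src = _mm512_set1_epi32(-1);
+/// let a = _mm512_set1_ps(3.0);
+/// // Only the low four lanes are converted; the other twelve keep `src`.
+/// let r = _mm512_mask_cvtps_epi32(src, 0b0000_0000_0000_1111, a);
+/// ```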
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+pub unsafe fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+ transmute(vcvtps2dq(
+ a.as_f32x16(),
+ src.as_i32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_epi32&expand=1739)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+pub unsafe fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i {
+ transmute(vcvtps2dq(
+ a.as_f32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_epi32&expand=1735)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+pub unsafe fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+ let convert = _mm256_cvtps_epi32(a);
+ transmute(simd_select_bitmask(k, convert.as_i32x8(), src.as_i32x8()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_epi32&expand=1736)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+pub unsafe fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i {
+ let convert = _mm256_cvtps_epi32(a);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, convert.as_i32x8(), zero))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_epi32&expand=1732)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+pub unsafe fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+ let convert = _mm_cvtps_epi32(a);
+ transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4()))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_epi32&expand=1733)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+pub unsafe fn _mm_maskz_cvtps_epi32(k: __mmask8, a: __m128) -> __m128i {
+ let convert = _mm_cvtps_epi32(a);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, convert.as_i32x4(), zero))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epu32&expand=1755)
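+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(3.7);
+/// // 3.7 rounds to 4 under the default rounding mode. Negative inputs are
+/// // out of range for an unsigned result and, per Intel's documentation,
+/// // produce the integer indefinite value 0xFFFFFFFF.
+/// let r = _mm512_cvtps_epu32(a);
+/// ```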
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm512_cvtps_epu32(a: __m512) -> __m512i {
+ transmute(vcvtps2udq(
+ a.as_f32x16(),
+ _mm512_setzero_si512().as_u32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_epu32&expand=1756)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm512_mask_cvtps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+ transmute(vcvtps2udq(
+ a.as_f32x16(),
+ src.as_u32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_epu32&expand=1757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i {
+ transmute(vcvtps2udq(
+ a.as_f32x16(),
+ _mm512_setzero_si512().as_u32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_epu32&expand=1752)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm256_cvtps_epu32(a: __m256) -> __m256i {
+ transmute(vcvtps2udq256(
+ a.as_f32x8(),
+ _mm256_setzero_si256().as_u32x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_epu32&expand=1753)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm256_mask_cvtps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+ transmute(vcvtps2udq256(a.as_f32x8(), src.as_u32x8(), k))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_epu32&expand=1754)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm256_maskz_cvtps_epu32(k: __mmask8, a: __m256) -> __m256i {
+ transmute(vcvtps2udq256(
+ a.as_f32x8(),
+ _mm256_setzero_si256().as_u32x8(),
+ k,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_epu32&expand=1749)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm_cvtps_epu32(a: __m128) -> __m128i {
+ transmute(vcvtps2udq128(
+ a.as_f32x4(),
+ _mm_setzero_si128().as_u32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_epu32&expand=1750)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm_mask_cvtps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+ transmute(vcvtps2udq128(a.as_f32x4(), src.as_u32x4(), k))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_epu32&expand=1751)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2udq))]
+pub unsafe fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i {
+ transmute(vcvtps2udq128(
+ a.as_f32x4(),
+ _mm_setzero_si128().as_u32x4(),
+ k,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_pd&expand=1769)
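+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let a = _mm256_set1_ps(0.1);
+/// // Widening f32 -> f64 is exact: each lane holds the f64 value of the
+/// // original f32 bit pattern, which is close to but not equal to 0.1f64.
+/// let r = _mm512_cvtps_pd(a);
+/// ```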
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2pd))]
+pub unsafe fn _mm512_cvtps_pd(a: __m256) -> __m512d {
+ transmute(vcvtps2pd(
+ a.as_f32x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_pd&expand=1770)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2pd))]
+pub unsafe fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d {
+ transmute(vcvtps2pd(
+ a.as_f32x8(),
+ src.as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_pd&expand=1771)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2pd))]
+pub unsafe fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d {
+ transmute(vcvtps2pd(
+ a.as_f32x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpslo_pd&expand=1784)
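+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let v2 = _mm512_setr_ps(
+///     0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+/// );
+/// // Only the lower eight f32 lanes (0.0 through 7.0) are widened to f64.
+/// let r = _mm512_cvtpslo_pd(v2);
+/// ```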
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2pd))]
+pub unsafe fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d {
+ transmute(vcvtps2pd(
+ _mm512_castps512_ps256(v2).as_f32x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpslo_pd&expand=1785)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2pd))]
+pub unsafe fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> __m512d {
+ transmute(vcvtps2pd(
+ _mm512_castps512_ps256(v2).as_f32x8(),
+ src.as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ps&expand=1712)
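+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let a = _mm512_set1_pd(0.1);
+/// // Narrowing f64 -> f32 rounds under the current rounding mode, so each
+/// // f32 lane holds 0.1f64 rounded to the nearest representable f32.
+/// let r = _mm512_cvtpd_ps(a);
+/// ```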
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm512_cvtpd_ps(a: __m512d) -> __m256 {
+ transmute(vcvtpd2ps(
+ a.as_f64x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ps&expand=1713)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m256 {
+ transmute(vcvtpd2ps(
+ a.as_f64x8(),
+ src.as_f32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ps&expand=1714)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 {
+ transmute(vcvtpd2ps(
+ a.as_f64x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ps&expand=1710)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m128 {
+ let convert = _mm256_cvtpd_ps(a);
+ transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4()))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ps&expand=1711)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 {
+ let convert = _mm256_cvtpd_ps(a);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, convert.as_f32x4(), zero))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ps&expand=1707)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 {
+ let convert = _mm_cvtpd_ps(a);
+ transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4()))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ps&expand=1708)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 {
+ let convert = _mm_cvtpd_ps(a);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, convert.as_f32x4(), zero))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epi32&expand=1675)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+pub unsafe fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i {
+ transmute(vcvtpd2dq(
+ a.as_f64x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epi32&expand=1676)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+pub unsafe fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
+ transmute(vcvtpd2dq(
+ a.as_f64x8(),
+ src.as_i32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epi32&expand=1677)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+pub unsafe fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i {
+ transmute(vcvtpd2dq(
+ a.as_f64x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_epi32&expand=1673)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+pub unsafe fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i {
+ let convert = _mm256_cvtpd_epi32(a);
+ transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4()))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_epi32&expand=1674)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+pub unsafe fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i {
+ let convert = _mm256_cvtpd_epi32(a);
+ transmute(simd_select_bitmask(
+ k,
+ convert.as_i32x4(),
+ _mm_setzero_si128().as_i32x4(),
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_epi32&expand=1670)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+pub unsafe fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
+ let convert = _mm_cvtpd_epi32(a);
+ transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4()))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_epi32&expand=1671)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+pub unsafe fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i {
+ let convert = _mm_cvtpd_epi32(a);
+ transmute(simd_select_bitmask(
+ k,
+ convert.as_i32x4(),
+ _mm_setzero_si128().as_i32x4(),
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epu32&expand=1693)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i {
+ transmute(vcvtpd2udq(
+ a.as_f64x8(),
+ _mm256_setzero_si256().as_u32x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epu32&expand=1694)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
+ transmute(vcvtpd2udq(
+ a.as_f64x8(),
+ src.as_u32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epu32&expand=1695)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i {
+ transmute(vcvtpd2udq(
+ a.as_f64x8(),
+ _mm256_setzero_si256().as_u32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_epu32&expand=1690)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i {
+ transmute(vcvtpd2udq256(
+ a.as_f64x4(),
+ _mm_setzero_si128().as_u32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_epu32&expand=1691)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i {
+ transmute(vcvtpd2udq256(a.as_f64x4(), src.as_u32x4(), k))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_epu32&expand=1692)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i {
+ transmute(vcvtpd2udq256(
+ a.as_f64x4(),
+ _mm_setzero_si128().as_u32x4(),
+ k,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epu32&expand=1687)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm_cvtpd_epu32(a: __m128d) -> __m128i {
+ transmute(vcvtpd2udq128(
+ a.as_f64x2(),
+ _mm_setzero_si128().as_u32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_epu32&expand=1688)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
+ transmute(vcvtpd2udq128(a.as_f64x2(), src.as_u32x4(), k))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_epu32&expand=1689)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq))]
+pub unsafe fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i {
+ transmute(vcvtpd2udq128(
+ a.as_f64x2(),
+ _mm_setzero_si128().as_u32x4(),
+ k,
+ ))
+}
+
+/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_pslo&expand=1715)
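+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let v2 = _mm512_set1_pd(2.5);
+/// // Lanes 0..8 of the __m512 result hold 2.5f32; lanes 8..16 are zero.
+/// let r = _mm512_cvtpd_pslo(v2);
+/// ```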
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 {
+ let r: f32x8 = vcvtpd2ps(
+ v2.as_f64x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ simd_shuffle16!(
+ r,
+ _mm256_setzero_ps().as_f32x8(),
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
+ )
+}
+
+/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_pslo&expand=1716)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> __m512 {
+ let r: f32x8 = vcvtpd2ps(
+ v2.as_f64x8(),
+ _mm512_castps512_ps256(src).as_f32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ simd_shuffle16!(
+ r,
+ _mm256_setzero_ps().as_f32x8(),
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
+ )
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi8_epi32&expand=1535)
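+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let a = _mm_set1_epi8(-1);
+/// // Sign extension preserves the value: every i32 lane holds -1.
+/// let r = _mm512_cvtepi8_epi32(a);
+/// ```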
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub unsafe fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i {
+ let a = a.as_i8x16();
+ transmute::<i32x16, _>(simd_cast(a))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi8_epi32&expand=1536)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub unsafe fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi8_epi32&expand=1537)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub unsafe fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi8_epi32&expand=1533)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub unsafe fn _mm256_mask_cvtepi8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi8_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi8_epi32&expand=1534)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub unsafe fn _mm256_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi8_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi8_epi32&expand=1530)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub unsafe fn _mm_mask_cvtepi8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi8_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+}
+
+/// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi8_epi32&expand=1531)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+pub unsafe fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi8_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi8_epi64&expand=1544)
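+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 99, 99, 99, 99, 99, 99, 99, 99);
+/// // Only the low 8 bytes participate: the i64 lanes are 0 through 7.
+/// let r = _mm512_cvtepi8_epi64(a);
+/// ```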
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub unsafe fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i {
+ let a = a.as_i8x16();
+ let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute::<i64x8, _>(simd_cast(v64))
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi8_epi64&expand=1545)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub unsafe fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepi8_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi8_epi64&expand=1546)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub unsafe fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepi8_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi8_epi64&expand=1542)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub unsafe fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi8_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+}
+
+/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi8_epi64&expand=1543)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub unsafe fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi8_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi8_epi64&expand=1539)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub unsafe fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi8_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+}
+
+/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi8_epi64&expand=1540)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+pub unsafe fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi8_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi32&expand=1621)
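+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// let a = _mm_set1_epi8(-1);
+/// // Zero extension reinterprets each byte as unsigned: every lane is 255.
+/// let r = _mm512_cvtepu8_epi32(a);
+/// ```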
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i {
+ let a = a.as_u8x16();
+ transmute::<i32x16, _>(simd_cast(a))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu8_epi32&expand=1622)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepu8_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu8_epi32&expand=1623)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepu8_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu8_epi32&expand=1619)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu8_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi32&expand=1620)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu8_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu8_epi32&expand=1616)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu8_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi32&expand=1617)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu8_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi64&expand=1630)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
+ let a = a.as_u8x16();
+ let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute::<i64x8, _>(simd_cast(v64))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu8_epi64&expand=1631)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepu8_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu8_epi64&expand=1632)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepu8_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu8_epi64&expand=1628)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu8_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi64&expand=1629)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu8_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu8_epi64&expand=1625)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu8_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi64&expand=1626)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu8_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_epi32&expand=1389)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub unsafe fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i {
+ let a = a.as_i16x16();
+ transmute::<i32x16, _>(simd_cast(a))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_epi32&expand=1390)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub unsafe fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepi16_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_epi32&expand=1391)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub unsafe fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepi16_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_epi32&expand=1387)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub unsafe fn _mm256_mask_cvtepi16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi16_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_epi32&expand=1388)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub unsafe fn _mm256_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi16_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_epi32&expand=1384)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub unsafe fn _mm_mask_cvtepi16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi16_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_epi32&expand=1385)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+pub unsafe fn _mm_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi16_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_epi64&expand=1398)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub unsafe fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i {
+ let a = a.as_i16x8();
+ transmute::<i64x8, _>(simd_cast(a))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_epi64&expand=1399)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub unsafe fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepi16_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+}
+
+/// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_epi64&expand=1400)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub unsafe fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepi16_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_epi64&expand=1396)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub unsafe fn _mm256_mask_cvtepi16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi16_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+}
+
+/// Sign extend packed 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_epi64&expand=1397)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub unsafe fn _mm256_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi16_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_epi64&expand=1393)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub unsafe fn _mm_mask_cvtepi16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi16_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+}
+
+/// Sign extend packed 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_epi64&expand=1394)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+pub unsafe fn _mm_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi16_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_epi32&expand=1553)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i {
+ let a = a.as_u16x16();
+ transmute::<i32x16, _>(simd_cast(a))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_epi32&expand=1554)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepu16_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_epi32&expand=1555)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepu16_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_epi32&expand=1551)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu16_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_epi32&expand=1552)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu16_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_epi32&expand=1548)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu16_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_epi32&expand=1549)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu16_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
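+// Editor's note (illustrative sketch, not upstream code): the epu16 family
+// reinterprets lanes as unsigned before widening, which matters for negative
+// bit patterns:
+//
+//     let a = _mm256_set1_epi16(-1);     // every lane is 0xFFFF
+//     let s = _mm512_cvtepi16_epi32(a);  // sign extend: every i32 lane is -1
+//     let u = _mm512_cvtepu16_epi32(a);  // zero extend: every i32 lane is 65535
+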
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_epi64&expand=1562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i {
+ let a = a.as_u16x8();
+ transmute::<i64x8, _>(simd_cast(a))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_epi64&expand=1563)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepu16_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+}
+
+/// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_epi64&expand=1564)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i {
+ let convert = _mm512_cvtepu16_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_epi64&expand=1560)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu16_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_epi64&expand=1561)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu16_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_epi64&expand=1557)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu16_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+}
+
+/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_epi64&expand=1558)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu16_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_epi64&expand=1428)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub unsafe fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i {
+ let a = a.as_i32x8();
+ transmute::<i64x8, _>(simd_cast(a))
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_epi64&expand=1429)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub unsafe fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepi32_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_epi64&expand=1430)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub unsafe fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepi32_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_epi64&expand=1426)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub unsafe fn _mm256_mask_cvtepi32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi32_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_epi64&expand=1427)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub unsafe fn _mm256_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepi32_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_epi64&expand=1423)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub unsafe fn _mm_mask_cvtepi32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi32_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+}
+
+/// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_epi64&expand=1424)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+pub unsafe fn _mm_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepi32_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_epi64&expand=1571)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i {
+ let a = a.as_u32x8();
+ transmute::<i64x8, _>(simd_cast(a))
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_epi64&expand=1572)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepu32_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_epi64&expand=1573)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i {
+ let convert = _mm512_cvtepu32_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_epi64&expand=1569)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu32_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x4()))
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_epi64&expand=1570)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i {
+ let convert = _mm256_cvtepu32_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_epi64&expand=1566)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu32_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
+}
+
+/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_epi64&expand=1567)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let convert = _mm_cvtepu32_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
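+// Editor's note (illustrative sketch, not upstream code): as with the 16-bit
+// widenings above, cvtepi32_epi64 and cvtepu32_epi64 diverge on negative bit
+// patterns:
+//
+//     let a = _mm256_set1_epi32(-5);
+//     let s = _mm512_cvtepi32_epi64(a);  // every i64 lane is -5
+//     let u = _mm512_cvtepu32_epi64(a);  // every i64 lane is 4294967291 (u32 view of -5)
+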
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ps&expand=1455)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub unsafe fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 {
+ let a = a.as_i32x16();
+ transmute::<f32x16, _>(simd_cast(a))
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ps&expand=1456)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub unsafe fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
+ let convert = _mm512_cvtepi32_ps(a).as_f32x16();
+ transmute(simd_select_bitmask(k, convert, src.as_f32x16()))
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ps&expand=1457)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub unsafe fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 {
+ let convert = _mm512_cvtepi32_ps(a).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ps&expand=1453)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub unsafe fn _mm256_mask_cvtepi32_ps(src: __m256, k: __mmask8, a: __m256i) -> __m256 {
+ let convert = _mm256_cvtepi32_ps(a).as_f32x8();
+ transmute(simd_select_bitmask(k, convert, src.as_f32x8()))
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ps&expand=1454)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub unsafe fn _mm256_maskz_cvtepi32_ps(k: __mmask8, a: __m256i) -> __m256 {
+ let convert = _mm256_cvtepi32_ps(a).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ps&expand=1450)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub unsafe fn _mm_mask_cvtepi32_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 {
+ let convert = _mm_cvtepi32_ps(a).as_f32x4();
+ transmute(simd_select_bitmask(k, convert, src.as_f32x4()))
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ps&expand=1451)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub unsafe fn _mm_maskz_cvtepi32_ps(k: __mmask8, a: __m128i) -> __m128 {
+ let convert = _mm_cvtepi32_ps(a).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
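+// Editor's note (illustrative sketch, not upstream code): vcvtdq2ps converts
+// small integers exactly, but an i32 above 2^24 in magnitude may not be
+// representable in f32 and is rounded to the nearest representable value:
+//
+//     let a = _mm512_set1_epi32(-42);
+//     let r = _mm512_cvtepi32_ps(a);           // every f32 lane is -42.0
+//     let big = _mm512_set1_epi32(16_777_217); // 2^24 + 1: not representable
+//     let rb = _mm512_cvtepi32_ps(big);        // rounds to 16_777_216.0
+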
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_pd&expand=1446)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d {
+ let a = a.as_i32x8();
+ transmute::<f64x8, _>(simd_cast(a))
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_pd&expand=1447)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
+ let convert = _mm512_cvtepi32_pd(a).as_f64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_pd&expand=1448)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d {
+ let convert = _mm512_cvtepi32_pd(a).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_pd&expand=1444)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm256_mask_cvtepi32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d {
+ let convert = _mm256_cvtepi32_pd(a).as_f64x4();
+ transmute(simd_select_bitmask(k, convert, src.as_f64x4()))
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_pd&expand=1445)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm256_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m256d {
+ let convert = _mm256_cvtepi32_pd(a).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_pd&expand=1441)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm_mask_cvtepi32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
+ let convert = _mm_cvtepi32_pd(a).as_f64x2();
+ transmute(simd_select_bitmask(k, convert, src.as_f64x2()))
+}
+
+/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_pd&expand=1442)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m128d {
+ let convert = _mm_cvtepi32_pd(a).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ps&expand=1583)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub unsafe fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 {
+ let a = a.as_u32x16();
+ transmute::<f32x16, _>(simd_cast(a))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ps&expand=1584)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub unsafe fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
+ let convert = _mm512_cvtepu32_ps(a).as_f32x16();
+ transmute(simd_select_bitmask(k, convert, src.as_f32x16()))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ps&expand=1585)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps))]
+pub unsafe fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 {
+ let convert = _mm512_cvtepu32_ps(a).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
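+// Editor's note (illustrative sketch, not upstream code): vcvtudq2ps treats
+// the lanes as unsigned, so an all-ones pattern becomes u32::MAX, which then
+// rounds to the nearest f32:
+//
+//     let a = _mm512_set1_epi32(-1);  // bit pattern 0xFFFF_FFFF
+//     let r = _mm512_cvtepu32_ps(a);  // u32::MAX = 4294967295 rounds to 4294967296.0
+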
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_pd&expand=1580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d {
+ let a = a.as_u32x8();
+ transmute::<f64x8, _>(simd_cast(a))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_pd&expand=1581)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
+ let convert = _mm512_cvtepu32_pd(a).as_f64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_pd&expand=1582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d {
+ let convert = _mm512_cvtepu32_pd(a).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_pd&expand=1577)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d {
+ let a = a.as_u32x4();
+ transmute::<f64x4, _>(simd_cast(a))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_pd&expand=1578)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d {
+ let convert = _mm256_cvtepu32_pd(a).as_f64x4();
+ transmute(simd_select_bitmask(k, convert, src.as_f64x4()))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_pd&expand=1579)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d {
+ let convert = _mm256_cvtepu32_pd(a).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_pd&expand=1574)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm_cvtepu32_pd(a: __m128i) -> __m128d {
+ let a = a.as_u32x4();
+ // Only the two low u32 lanes fit in the f64x2 result, so extract them first.
+ let lo: u32x2 = simd_shuffle2!(a, a, [0, 1]);
+ transmute::<f64x2, _>(simd_cast(lo))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_pd&expand=1575)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
+ let convert = _mm_cvtepu32_pd(a).as_f64x2();
+ transmute(simd_select_bitmask(k, convert, src.as_f64x2()))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_pd&expand=1576)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d {
+ let convert = _mm_cvtepu32_pd(a).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
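+// Editor's note (illustrative sketch, not upstream code): _mm_cvtepu32_pd only
+// reads the two low u32 lanes of its input (hence the simd_shuffle2! above):
+//
+//     let a = _mm_set_epi32(9, 8, 7, 6);  // lanes [6, 7, 8, 9], low to high
+//     let r = _mm_cvtepu32_pd(a);         // [6.0, 7.0]; lanes 8 and 9 are ignored
+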
+/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32lo_pd&expand=1464)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d {
+ let v2 = v2.as_i32x16();
+ let v256: i32x8 = simd_shuffle8!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute::<f64x8, _>(simd_cast(v256))
+}
+
+/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32lo_pd&expand=1465)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d {
+ let convert = _mm512_cvtepi32lo_pd(v2).as_f64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+}
+
+/// Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32lo_pd&expand=1586)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d {
+ let v2 = v2.as_u32x16();
+ let v256: u32x8 = simd_shuffle8!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute::<f64x8, _>(simd_cast(v256))
+}
+
+/// Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32lo_pd&expand=1587)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2pd))]
+pub unsafe fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d {
+ let convert = _mm512_cvtepu32lo_pd(v2).as_f64x8();
+ transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
+}
+
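+// Editor's note (illustrative sketch, not upstream code): the *lo_pd variants
+// take a full __m512i but convert only its lower eight 32-bit lanes, since
+// eight f64 results exactly fill the __m512d destination:
+//
+//     let v2 = _mm512_set1_epi32(3);
+//     let r = _mm512_cvtepi32lo_pd(v2);  // eight 3.0 values from lanes 0..8
+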
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_epi16&expand=1419)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i {
+ let a = a.as_i32x16();
+ transmute::<i16x16, _>(simd_cast(a))
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_epi16&expand=1420)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
+ let convert = _mm512_cvtepi32_epi16(a).as_i16x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x16()))
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_epi16&expand=1421)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
+ let convert = _mm512_cvtepi32_epi16(a).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi16&expand=1416)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm256_cvtepi32_epi16(a: __m256i) -> __m128i {
+ let a = a.as_i32x8();
+ transmute::<i16x8, _>(simd_cast(a))
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_epi16&expand=1417)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm256_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ let convert = _mm256_cvtepi32_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_epi16&expand=1418)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm256_maskz_cvtepi32_epi16(k: __mmask8, a: __m256i) -> __m128i {
+ let convert = _mm256_cvtepi32_epi16(a).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi16&expand=1413)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm_cvtepi32_epi16(a: __m128i) -> __m128i {
+ transmute(vpmovdw128(
+ a.as_i32x4(),
+ _mm_setzero_si128().as_i16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_epi16&expand=1414)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovdw128(a.as_i32x4(), src.as_i16x8(), k))
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_epi16&expand=1415)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm_maskz_cvtepi32_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovdw128(a.as_i32x4(), _mm_setzero_si128().as_i16x8(), k))
+}
+
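+// Editor's note (illustrative sketch, not upstream code): "with truncation"
+// means each i32 simply keeps its low 16 bits; out-of-range values wrap rather
+// than saturate:
+//
+//     let a = _mm512_set1_epi32(0x0001_FFFF);  // 131071, too large for i16
+//     let r = _mm512_cvtepi32_epi16(a);        // low 16 bits survive: every i16 lane is -1
+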
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_epi8&expand=1437)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i {
+ let a = a.as_i32x16();
+ transmute::<i8x16, _>(simd_cast(a))
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_epi8&expand=1438)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
+ let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
+ transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_epi8&expand=1439)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
+ let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi8&expand=1434)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm256_cvtepi32_epi8(a: __m256i) -> __m128i {
+ transmute(vpmovdb256(
+ a.as_i32x8(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_epi8&expand=1435)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm256_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovdb256(a.as_i32x8(), src.as_i8x16(), k))
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_epi8&expand=1436)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm256_maskz_cvtepi32_epi8(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovdb256(a.as_i32x8(), _mm_setzero_si128().as_i8x16(), k))
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi8&expand=1431)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm_cvtepi32_epi8(a: __m128i) -> __m128i {
+ transmute(vpmovdb128(
+ a.as_i32x4(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_epi8&expand=1432)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovdb128(a.as_i32x4(), src.as_i8x16(), k))
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_epi8&expand=1433)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm_maskz_cvtepi32_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovdb128(a.as_i32x4(), _mm_setzero_si128().as_i8x16(), k))
+}
+
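+// Editor's note (illustrative sketch, not upstream code): the same wrapping
+// truncation applies when narrowing all the way to 8 bits:
+//
+//     let a = _mm512_set1_epi32(257);   // 0x101
+//     let r = _mm512_cvtepi32_epi8(a);  // low 8 bits survive: every i8 lane is 1
+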
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_epi32&expand=1481)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i {
+ let a = a.as_i64x8();
+ transmute::<i32x8, _>(simd_cast(a))
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_epi32&expand=1482)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
+ let convert = _mm512_cvtepi64_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_epi32&expand=1483)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
+ let convert = _mm512_cvtepi64_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_epi32&expand=1478)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm256_cvtepi64_epi32(a: __m256i) -> __m128i {
+ let a = a.as_i64x4();
+ transmute::<i32x4, _>(simd_cast(a))
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_epi32&expand=1479)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm256_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ let convert = _mm256_cvtepi64_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, convert, src.as_i32x4()))
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_epi32&expand=1480)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm256_maskz_cvtepi64_epi32(k: __mmask8, a: __m256i) -> __m128i {
+ let convert = _mm256_cvtepi64_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_epi32&expand=1475)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm_cvtepi64_epi32(a: __m128i) -> __m128i {
+ transmute(vpmovqd128(
+ a.as_i64x2(),
+ _mm_setzero_si128().as_i32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_epi32&expand=1476)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovqd128(a.as_i64x2(), src.as_i32x4(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_epi32&expand=1477)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm_maskz_cvtepi64_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovqd128(a.as_i64x2(), _mm_setzero_si128().as_i32x4(), k))
+}
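+
+// An illustrative sketch (hypothetical): the 128-bit variant only produces
+// two doublewords, and vpmovqd zeroes the upper half of the destination.
+// Assumes AVX-512F and AVX-512VL at runtime.
+#[cfg(test)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn sketch_mm_cvtepi64_epi32() {
+ let a = _mm_set1_epi64x(7);
+ let r = _mm_cvtepi64_epi32(a); // lanes 0..1 == 7, lanes 2..3 == 0
+ assert_eq!(_mm_cvtsi128_si32(r), 7);
+ assert_eq!(_mm_extract_epi32::<2>(r), 0);
+}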
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_epi16&expand=1472)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i {
+ let a = a.as_i64x8();
+ transmute::<i16x8, _>(simd_cast(a))
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_epi16&expand=1473)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
+ let convert = _mm512_cvtepi64_epi16(a).as_i16x8();
+ transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_epi16&expand=1474)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
+ let convert = _mm512_cvtepi64_epi16(a).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, convert, zero))
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_epi16&expand=1469)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm256_cvtepi64_epi16(a: __m256i) -> __m128i {
+ transmute(vpmovqw256(
+ a.as_i64x4(),
+ _mm_setzero_si128().as_i16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_epi16&expand=1470)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm256_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovqw256(a.as_i64x4(), src.as_i16x8(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_epi16&expand=1471)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm256_maskz_cvtepi64_epi16(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovqw256(a.as_i64x4(), _mm_setzero_si128().as_i16x8(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_epi16&expand=1466)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm_cvtepi64_epi16(a: __m128i) -> __m128i {
+ transmute(vpmovqw128(
+ a.as_i64x2(),
+ _mm_setzero_si128().as_i16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_epi16&expand=1467)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovqw128(a.as_i64x2(), src.as_i16x8(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_epi16&expand=1468)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm_maskz_cvtepi64_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovqw128(a.as_i64x2(), _mm_setzero_si128().as_i16x8(), k))
+}
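+
+// An illustrative sketch (hypothetical): the 256-bit quadword-to-word form
+// yields four words in the low 64 bits of the result, and only the low four
+// bits of k select lanes. Assumes AVX-512F and AVX-512VL.
+#[cfg(test)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn sketch_mm256_mask_cvtepi64_epi16() {
+ let a = _mm256_set1_epi64x(0x1_0003); // low word of each quadword is 3
+ let src = _mm_set1_epi16(-1);
+ let r = _mm256_mask_cvtepi64_epi16(src, 0b0000_0101, a); // words 0 and 2 == 3
+ assert_eq!(_mm_extract_epi16::<0>(r), 3);
+ assert_eq!(_mm_extract_epi16::<1>(r), 0xFFFF); // from src; pextrw zero-extends
+}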
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_epi8&expand=1490)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i {
+ transmute(vpmovqb(
+ a.as_i64x8(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_epi8&expand=1491)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovqb(a.as_i64x8(), src.as_i8x16(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_epi8&expand=1492)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovqb(a.as_i64x8(), _mm_setzero_si128().as_i8x16(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_epi8&expand=1487)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm256_cvtepi64_epi8(a: __m256i) -> __m128i {
+ transmute(vpmovqb256(
+ a.as_i64x4(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_epi8&expand=1488)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm256_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovqb256(a.as_i64x4(), src.as_i8x16(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_epi8&expand=1489)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm256_maskz_cvtepi64_epi8(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovqb256(a.as_i64x4(), _mm_setzero_si128().as_i8x16(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_epi8&expand=1484)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm_cvtepi64_epi8(a: __m128i) -> __m128i {
+ transmute(vpmovqb128(
+ a.as_i64x2(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_epi8&expand=1485)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovqb128(a.as_i64x2(), src.as_i8x16(), k))
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_epi8&expand=1486)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm_maskz_cvtepi64_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovqb128(a.as_i64x2(), _mm_setzero_si128().as_i8x16(), k))
+}
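+
+// An illustrative sketch (hypothetical): byte truncation keeps only bits 7:0
+// of each quadword, so 0x1FF becomes 0xFF; the eight bytes beyond the
+// converted lanes are zeroed. Assumes AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn sketch_cvtepi64_epi8() {
+ let a = _mm512_set1_epi64(0x1FF);
+ let r = _mm512_cvtepi64_epi8(a); // bytes 0..7 == 0xFF, bytes 8..15 == 0
+ assert_eq!(_mm_extract_epi8::<0>(r), 0xFF);
+ assert_eq!(_mm_extract_epi8::<8>(r), 0);
+}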
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi32_epi16&expand=1819)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i {
+ transmute(vpmovsdw(
+ a.as_i32x16(),
+ _mm256_setzero_si256().as_i16x16(),
+ 0b11111111_11111111,
+ ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_epi16&expand=1820)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
+ transmute(vpmovsdw(a.as_i32x16(), src.as_i16x16(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi32_epi16&expand=1821)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
+ transmute(vpmovsdw(
+ a.as_i32x16(),
+ _mm256_setzero_si256().as_i16x16(),
+ k,
+ ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi32_epi16&expand=1816)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i {
+ transmute(vpmovsdw256(
+ a.as_i32x8(),
+ _mm_setzero_si128().as_i16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_epi16&expand=1817)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsdw256(a.as_i32x8(), src.as_i16x8(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi32_epi16&expand=1818)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsdw256(a.as_i32x8(), _mm_setzero_si128().as_i16x8(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi32_epi16&expand=1813)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i {
+ transmute(vpmovsdw128(
+ a.as_i32x4(),
+ _mm_setzero_si128().as_i16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_epi16&expand=1814)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsdw128(a.as_i32x4(), src.as_i16x8(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi32_epi16&expand=1815)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsdw128(a.as_i32x4(), _mm_setzero_si128().as_i16x8(), k))
+}
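+
+// A minimal sketch (hypothetical): signed saturation clamps out-of-range
+// doublewords to i16::MAX / i16::MIN instead of truncating them.
+// Assumes AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn sketch_cvtsepi32_epi16() {
+ let hi = _mm512_set1_epi32(i32::MAX);
+ let lo = _mm512_set1_epi32(i32::MIN);
+ let h = _mm512_cvtsepi32_epi16(hi); // every word == i16::MAX (0x7FFF)
+ let l = _mm512_cvtsepi32_epi16(lo); // every word == i16::MIN (0x8000)
+ assert_eq!(_mm_extract_epi16::<0>(_mm256_castsi256_si128(h)), 0x7FFF);
+ assert_eq!(_mm_extract_epi16::<0>(_mm256_castsi256_si128(l)), 0x8000); // zero-extended
+}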
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi32_epi8&expand=1828)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i {
+ transmute(vpmovsdb(
+ a.as_i32x16(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111_11111111,
+ ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_epi8&expand=1829)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
+ transmute(vpmovsdb(a.as_i32x16(), src.as_i8x16(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi32_epi8&expand=1830)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
+ transmute(vpmovsdb(a.as_i32x16(), _mm_setzero_si128().as_i8x16(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi32_epi8&expand=1825)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i {
+ transmute(vpmovsdb256(
+ a.as_i32x8(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_epi8&expand=1826)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsdb256(a.as_i32x8(), src.as_i8x16(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi32_epi8&expand=1827)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsdb256(a.as_i32x8(), _mm_setzero_si128().as_i8x16(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi32_epi8&expand=1822)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i {
+ transmute(vpmovsdb128(
+ a.as_i32x4(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_epi8&expand=1823)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsdb128(a.as_i32x4(), src.as_i8x16(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi32_epi8&expand=1824)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsdb128(a.as_i32x4(), _mm_setzero_si128().as_i8x16(), k))
+}
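+
+// An illustrative sketch (hypothetical): the zero-masked byte form combines
+// signed saturation with per-lane zeroing. Assumes AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn sketch_maskz_cvtsepi32_epi8() {
+ let a = _mm512_set1_epi32(1000); // > i8::MAX, saturates to 127
+ let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_00000001, a);
+ assert_eq!(_mm_extract_epi8::<0>(r), 127); // selected lane, saturated
+ assert_eq!(_mm_extract_epi8::<1>(r), 0); // unselected lane, zeroed
+}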
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi32&expand=1852)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i {
+ transmute(vpmovsqd(
+ a.as_i64x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_epi32&expand=1853)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
+ transmute(vpmovsqd(a.as_i64x8(), src.as_i32x8(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi32&expand=1854)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
+ transmute(vpmovsqd(a.as_i64x8(), _mm256_setzero_si256().as_i32x8(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi32&expand=1849)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i {
+ transmute(vpmovsqd256(
+ a.as_i64x4(),
+ _mm_setzero_si128().as_i32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi32&expand=1850)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsqd256(a.as_i64x4(), src.as_i32x4(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi32&expand=1851)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsqd256(a.as_i64x4(), _mm_setzero_si128().as_i32x4(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi32&expand=1846)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i {
+ transmute(vpmovsqd128(
+ a.as_i64x2(),
+ _mm_setzero_si128().as_i32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi32&expand=1847)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsqd128(a.as_i64x2(), src.as_i32x4(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi32&expand=1848)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsqd128(a.as_i64x2(), _mm_setzero_si128().as_i32x4(), k))
+}
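+
+// A minimal sketch (hypothetical) contrasting the saturating form with plain
+// truncation: a quadword above i32::MAX clamps to i32::MAX instead of
+// wrapping. Assumes AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn sketch_cvtsepi64_epi32() {
+ let a = _mm512_set1_epi64(i64::MAX);
+ let sat = _mm512_cvtsepi64_epi32(a); // every lane == i32::MAX
+ let trunc = _mm512_cvtepi64_epi32(a); // every lane == -1 (low 32 bits)
+ assert_eq!(_mm_cvtsi128_si32(_mm256_castsi256_si128(sat)), i32::MAX);
+ assert_eq!(_mm_cvtsi128_si32(_mm256_castsi256_si128(trunc)), -1);
+}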
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi16&expand=1843)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i {
+ transmute(vpmovsqw(
+ a.as_i64x8(),
+ _mm_setzero_si128().as_i16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_epi16&expand=1844)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovsqw(a.as_i64x8(), src.as_i16x8(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi16&expand=1845)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovsqw(a.as_i64x8(), _mm_setzero_si128().as_i16x8(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi16&expand=1840)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i {
+ transmute(vpmovsqw256(
+ a.as_i64x4(),
+ _mm_setzero_si128().as_i16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi16&expand=1841)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsqw256(a.as_i64x4(), src.as_i16x8(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi16&expand=1842)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsqw256(a.as_i64x4(), _mm_setzero_si128().as_i16x8(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi16&expand=1837)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i {
+ transmute(vpmovsqw128(
+ a.as_i64x2(),
+ _mm_setzero_si128().as_i16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi16&expand=1838)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsqw128(a.as_i64x2(), src.as_i16x8(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi16&expand=1839)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsqw128(a.as_i64x2(), _mm_setzero_si128().as_i16x8(), k))
+}
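+
+// An illustrative sketch (hypothetical): the 128-bit saturating form
+// produces two words and zeroes the remaining six. Assumes AVX-512F and
+// AVX-512VL.
+#[cfg(test)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn sketch_mm_cvtsepi64_epi16() {
+ let a = _mm_set1_epi64x(-100_000); // < i16::MIN, saturates to -32768
+ let r = _mm_cvtsepi64_epi16(a);
+ assert_eq!(_mm_extract_epi16::<0>(r), 0x8000); // pextrw zero-extends
+ assert_eq!(_mm_extract_epi16::<2>(r), 0);
+}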
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi8&expand=1861)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i {
+ transmute(vpmovsqb(
+ a.as_i64x8(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_epi8&expand=1862)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovsqb(a.as_i64x8(), src.as_i8x16(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi8&expand=1863)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovsqb(a.as_i64x8(), _mm_setzero_si128().as_i8x16(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi8&expand=1858)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i {
+ transmute(vpmovsqb256(
+ a.as_i64x4(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi8&expand=1859)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsqb256(a.as_i64x4(), src.as_i8x16(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi8&expand=1860)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovsqb256(a.as_i64x4(), _mm_setzero_si128().as_i8x16(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi8&expand=1855)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i {
+ transmute(vpmovsqb128(
+ a.as_i64x2(),
+ _mm_setzero_si128().as_i8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi8&expand=1856)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsqb128(a.as_i64x2(), src.as_i8x16(), k))
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi8&expand=1857)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovsqb128(a.as_i64x2(), _mm_setzero_si128().as_i8x16(), k))
+}
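+
+// An illustrative sketch (hypothetical): the write-masked byte form
+// saturates the selected lanes and copies the rest from src. Assumes
+// AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn sketch_mask_cvtsepi64_epi8() {
+ let a = _mm512_set1_epi64(-1000); // < i8::MIN, saturates to -128
+ let src = _mm_set1_epi8(5);
+ let r = _mm512_mask_cvtsepi64_epi8(src, 0b0000_0001, a);
+ assert_eq!(_mm_extract_epi8::<0>(r), 0x80); // -128, zero-extended by pextrb
+ assert_eq!(_mm_extract_epi8::<1>(r), 5); // copied from src
+}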
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi32_epi16&expand=2054)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i {
+ transmute(vpmovusdw(
+ a.as_u32x16(),
+ _mm256_setzero_si256().as_u16x16(),
+ 0b11111111_11111111,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_epi16&expand=2055)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
+ transmute(vpmovusdw(a.as_u32x16(), src.as_u16x16(), k))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi32_epi16&expand=2056)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
+ transmute(vpmovusdw(
+ a.as_u32x16(),
+ _mm256_setzero_si256().as_u16x16(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi32_epi16&expand=2051)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i {
+ transmute(vpmovusdw256(
+ a.as_u32x8(),
+ _mm_setzero_si128().as_u16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_epi16&expand=2052)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusdw256(a.as_u32x8(), src.as_u16x8(), k))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi32_epi16&expand=2053)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusdw256(
+ a.as_u32x8(),
+ _mm_setzero_si128().as_u16x8(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi32_epi16&expand=2048)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i {
+ transmute(vpmovusdw128(
+ a.as_u32x4(),
+ _mm_setzero_si128().as_u16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_epi16&expand=2049)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusdw128(a.as_u32x4(), src.as_u16x8(), k))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi32_epi16&expand=2050)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusdw128(
+ a.as_u32x4(),
+ _mm_setzero_si128().as_u16x8(),
+ k,
+ ))
+}
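+
+// A minimal sketch (hypothetical): the lanes are treated as unsigned, so a
+// doubleword above u16::MAX clamps to 0xFFFF rather than wrapping. Assumes
+// AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn sketch_cvtusepi32_epi16() {
+ let a = _mm512_set1_epi32(0x1_0000); // 65536 > u16::MAX
+ let r = _mm512_cvtusepi32_epi16(a); // every word clamps to 0xFFFF
+ assert_eq!(_mm_extract_epi16::<0>(_mm256_castsi256_si128(r)), 0xFFFF);
+}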
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi32_epi8&expand=2063)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i {
+ transmute(vpmovusdb(
+ a.as_u32x16(),
+ _mm_setzero_si128().as_u8x16(),
+ 0b11111111_11111111,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_epi8&expand=2064)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
+ transmute(vpmovusdb(a.as_u32x16(), src.as_u8x16(), k))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi32_epi8&expand=2065)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
+ transmute(vpmovusdb(a.as_u32x16(), _mm_setzero_si128().as_u8x16(), k))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi32_epi8&expand=2060)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i {
+ transmute(vpmovusdb256(
+ a.as_u32x8(),
+ _mm_setzero_si128().as_u8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_epi8&expand=2061)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusdb256(a.as_u32x8(), src.as_u8x16(), k))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi32_epi8&expand=2062)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusdb256(
+ a.as_u32x8(),
+ _mm_setzero_si128().as_u8x16(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi32_epi8&expand=2057)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i {
+ transmute(vpmovusdb128(
+ a.as_u32x4(),
+ _mm_setzero_si128().as_u8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_epi8&expand=2058)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusdb128(a.as_u32x4(), src.as_u8x16(), k))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi32_epi8&expand=2059)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusdb128(
+ a.as_u32x4(),
+ _mm_setzero_si128().as_u8x16(),
+ k,
+ ))
+}
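+
+// An illustrative sketch (hypothetical): the 128-bit zero-masked byte form;
+// four bytes are produced and the rest of the result is zero. Assumes
+// AVX-512F and AVX-512VL.
+#[cfg(test)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn sketch_mm_maskz_cvtusepi32_epi8() {
+ let a = _mm_set1_epi32(300); // > u8::MAX, saturates to 255
+ let r = _mm_maskz_cvtusepi32_epi8(0b0000_0011, a);
+ assert_eq!(_mm_extract_epi8::<0>(r), 255);
+ assert_eq!(_mm_extract_epi8::<2>(r), 0); // masked-off lane
+}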
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi32&expand=2087)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i {
+ transmute(vpmovusqd(
+ a.as_u64x8(),
+ _mm256_setzero_si256().as_u32x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_epi32&expand=2088)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
+ transmute(vpmovusqd(a.as_u64x8(), src.as_u32x8(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi64_epi32&expand=2089)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
+ transmute(vpmovusqd(
+ a.as_u64x8(),
+ _mm256_setzero_si256().as_u32x8(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi32&expand=2084)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i {
+ transmute(vpmovusqd256(
+ a.as_u64x4(),
+ _mm_setzero_si128().as_u32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi32&expand=2085)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusqd256(a.as_u64x4(), src.as_u32x4(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi32&expand=2086)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusqd256(
+ a.as_u64x4(),
+ _mm_setzero_si128().as_u32x4(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi32&expand=2081)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i {
+ transmute(vpmovusqd128(
+ a.as_u64x2(),
+ _mm_setzero_si128().as_u32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi32&expand=2082)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusqd128(a.as_u64x2(), src.as_u32x4(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi32&expand=2083)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusqd128(
+ a.as_u64x2(),
+ _mm_setzero_si128().as_u32x4(),
+ k,
+ ))
+}
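+
+// A minimal sketch (hypothetical): unsigned quadword-to-doubleword
+// saturation clamps anything above u32::MAX. Assumes AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn sketch_cvtusepi64_epi32() {
+ let a = _mm512_set1_epi64(u32::MAX as i64 + 1); // 2^32 > u32::MAX
+ let r = _mm512_cvtusepi64_epi32(a); // every lane == 0xFFFF_FFFF
+ assert_eq!(_mm_cvtsi128_si32(_mm256_castsi256_si128(r)), -1); // 0xFFFF_FFFF
+}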
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi16&expand=2078)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i {
+ transmute(vpmovusqw(
+ a.as_u64x8(),
+ _mm_setzero_si128().as_u16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_epi16&expand=2079)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovusqw(a.as_u64x8(), src.as_u16x8(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi64_epi16&expand=2080)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovusqw(a.as_u64x8(), _mm_setzero_si128().as_u16x8(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi16&expand=2075)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i {
+ transmute(vpmovusqw256(
+ a.as_u64x4(),
+ _mm_setzero_si128().as_u16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi16&expand=2076)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusqw256(a.as_u64x4(), src.as_u16x8(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi16&expand=2077)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusqw256(
+ a.as_u64x4(),
+ _mm_setzero_si128().as_u16x8(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi16&expand=2072)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i {
+ transmute(vpmovusqw128(
+ a.as_u64x2(),
+ _mm_setzero_si128().as_u16x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi16&expand=2073)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusqw128(a.as_u64x2(), src.as_u16x8(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi16&expand=2074)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusqw128(
+ a.as_u64x2(),
+ _mm_setzero_si128().as_u16x8(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi8&expand=2096)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i {
+ transmute(vpmovusqb(
+ a.as_u64x8(),
+ _mm_setzero_si128().as_u8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_epi8&expand=2097)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovusqb(a.as_u64x8(), src.as_u8x16(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi64_epi8&expand=2098)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
+ transmute(vpmovusqb(a.as_u64x8(), _mm_setzero_si128().as_u8x16(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi8&expand=2093)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i {
+ transmute(vpmovusqb256(
+ a.as_u64x4(),
+ _mm_setzero_si128().as_u8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi8&expand=2094)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusqb256(a.as_u64x4(), src.as_u8x16(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi8&expand=2095)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i {
+ transmute(vpmovusqb256(
+ a.as_u64x4(),
+ _mm_setzero_si128().as_u8x16(),
+ k,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi8&expand=2090)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i {
+ transmute(vpmovusqb128(
+ a.as_u64x2(),
+ _mm_setzero_si128().as_u8x16(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi8&expand=2091)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusqb128(a.as_u64x2(), src.as_u8x16(), k))
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi8&expand=2092)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpmovusqb128(
+ a.as_u64x2(),
+ _mm_setzero_si128().as_u8x16(),
+ k,
+ ))
+}
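+
+// Illustrative, hypothetical sketch for the zero-masked u64 -> u8 narrowing above:
+// lanes whose mask bit is clear come back as zero rather than from a source vector.
+#[target_feature(enable = "avx512f")]
+unsafe fn example_maskz_cvtusepi64_epi8() -> __m128i {
+    let a = _mm512_set1_epi64(300); // 300 > u8::MAX, so selected lanes saturate to 255
+    // Even lanes selected, odd lanes zeroed; the upper 8 bytes of the result are zero.
+    _mm512_maskz_cvtusepi64_epi8(0b0101_0101, a)
+}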
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_epi32&expand=1335)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundps_epi32<const ROUNDING: i32>(a: __m512) -> __m512i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ let r = vcvtps2dq(a, zero, 0b11111111_11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epi32&expand=1336)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundps_epi32<const ROUNDING: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512,
+) -> __m512i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let src = src.as_i32x16();
+ let r = vcvtps2dq(a, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epi32&expand=1337)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundps_epi32<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+) -> __m512i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ let r = vcvtps2dq(a, zero, k, ROUNDING);
+ transmute(r)
+}
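+
+// Illustrative, hypothetical sketch: the ROUNDING const generic picks the rounding
+// mode at compile time, and static_assert_rounding! rejects anything outside the
+// documented values.
+#[target_feature(enable = "avx512f")]
+unsafe fn example_cvt_roundps_epi32(a: __m512) -> __m512i {
+    // Round to nearest (even), suppressing floating-point exceptions.
+    _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
+}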
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_epu32&expand=1341)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundps_epu32<const ROUNDING: i32>(a: __m512) -> __m512i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_si512().as_u32x16();
+ let r = vcvtps2udq(a, zero, 0b11111111_11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epu32&expand=1342)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundps_epu32<const ROUNDING: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512,
+) -> __m512i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let src = src.as_u32x16();
+ let r = vcvtps2udq(a, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epu32&expand=1343)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundps_epu32<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512,
+) -> __m512i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_si512().as_u32x16();
+ let r = vcvtps2udq(a, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_pd&expand=1347)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundps_pd<const SAE: i32>(a: __m256) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vcvtps2pd(a, zero, 0b11111111, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_pd&expand=1336)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundps_pd<const SAE: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m256,
+) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x8();
+ let src = src.as_f64x8();
+ let r = vcvtps2pd(a, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_pd&expand=1337)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundps_pd<const SAE: i32>(k: __mmask8, a: __m256) -> __m512d {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let r = vcvtps2pd(a, zero, k, SAE);
+ transmute(r)
+}
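+
+// Illustrative, hypothetical sketch: for this widening conversion there is no
+// rounding to choose (every f32 is exactly representable as f64), so only the SAE
+// (suppress-all-exceptions) behavior is parameterised.
+#[target_feature(enable = "avx512f")]
+unsafe fn example_cvt_roundps_pd(a: __m256) -> __m512d {
+    _mm512_cvt_roundps_pd::<{ _MM_FROUND_NO_EXC }>(a)
+}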
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_epi32&expand=1315)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundpd_epi32<const ROUNDING: i32>(a: __m512d) -> __m256i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ let r = vcvtpd2dq(a, zero, 0b11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_epi32&expand=1316)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundpd_epi32<const ROUNDING: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m512d,
+) -> __m256i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let src = src.as_i32x8();
+ let r = vcvtpd2dq(a, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epi32&expand=1317)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundpd_epi32<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+) -> __m256i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ let r = vcvtpd2dq(a, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_epu32&expand=1321)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundpd_epu32<const ROUNDING: i32>(a: __m512d) -> __m256i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_si256().as_u32x8();
+ let r = vcvtpd2udq(a, zero, 0b11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_epu32&expand=1322)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundpd_epu32<const ROUNDING: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m512d,
+) -> __m256i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let src = src.as_u32x8();
+ let r = vcvtpd2udq(a, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epu32&expand=1323)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundpd_epu32<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m512d,
+) -> __m256i {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_si256().as_u32x8();
+ let r = vcvtpd2udq(a, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ps&expand=1327)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundpd_ps<const ROUNDING: i32>(a: __m512d) -> __m256 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ let r = vcvtpd2ps(a, zero, 0b11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ps&expand=1328)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundpd_ps<const ROUNDING: i32>(
+ src: __m256,
+ k: __mmask8,
+ a: __m512d,
+) -> __m256 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let src = src.as_f32x8();
+ let r = vcvtpd2ps(a, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ps&expand=1329)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundpd_ps<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m256 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ let r = vcvtpd2ps(a, zero, k, ROUNDING);
+ transmute(r)
+}
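+
+// Illustrative, hypothetical sketch: narrowing f64 -> f32 with an explicit
+// directed rounding mode (here round-toward-zero), exceptions suppressed.
+#[target_feature(enable = "avx512f")]
+unsafe fn example_cvt_roundpd_ps(a: __m512d) -> __m256 {
+    _mm512_cvt_roundpd_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a)
+}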
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ps&expand=1294)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundepi32_ps<const ROUNDING: i32>(a: __m512i) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_i32x16();
+ let r = vcvtdq2ps(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ps&expand=1295)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundepi32_ps<const ROUNDING: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512i,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_i32x16();
+ let r = vcvtdq2ps(a, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ps&expand=1296)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundepi32_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512i,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_i32x16();
+ let r = vcvtdq2ps(a, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
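+
+// Illustrative, hypothetical sketch: integer -> float conversion under a writemask.
+// Note that the implementations above convert every lane and then blend against
+// `src` (or zero) with simd_select_bitmask, rather than masking inside the convert.
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mask_cvt_roundepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
+    _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(src, k, a)
+}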
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ps&expand=1303)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundepu32_ps<const ROUNDING: i32>(a: __m512i) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_u32x16();
+ let r = vcvtudq2ps(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ps&expand=1304)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundepu32_ps<const ROUNDING: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512i,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_u32x16();
+ let r = vcvtudq2ps(a, ROUNDING);
+ transmute(simd_select_bitmask(k, r, src.as_f32x16()))
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ps&expand=1305)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundepu32_ps<const ROUNDING: i32>(
+ k: __mmask16,
+ a: __m512i,
+) -> __m512 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_u32x16();
+ let r = vcvtudq2ps(a, ROUNDING);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_ph&expand=1354)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundps_ph<const SAE: i32>(a: __m512) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ let r = vcvtps2ph(a, SAE, zero, 0b11111111_11111111);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_ph&expand=1355)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundps_ph<const SAE: i32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m512,
+) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let src = src.as_i16x16();
+ let r = vcvtps2ph(a, SAE, src, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_ph&expand=1356)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundps_ph<const SAE: i32>(k: __mmask16, a: __m512) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ let r = vcvtps2ph(a, SAE, zero, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvt_roundps_ph&expand=1352)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_cvt_roundps_ph<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m256,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let src = src.as_i16x8();
+ let r = vcvtps2ph256(a, IMM8, src, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvt_roundps_ph&expand=1353)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_cvt_roundps_ph<const IMM8: i32>(k: __mmask8, a: __m256) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ let r = vcvtps2ph256(a, IMM8, zero, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundps_ph&expand=1350)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_cvt_roundps_ph<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let src = src.as_i16x8();
+ let r = vcvtps2ph128(a, IMM8, src, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundps_ph&expand=1351)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_cvt_roundps_ph<const IMM8: i32>(k: __mmask8, a: __m128) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let zero = _mm_setzero_si128().as_i16x8();
+ let r = vcvtps2ph128(a, IMM8, zero, k);
+ transmute(r)
+}
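+
+// Illustrative, hypothetical sketch: the 128-/256-bit f32 -> f16 conversions take
+// a plain IMM8 rounding selector (checked with static_assert_imm8!) instead of the
+// 512-bit SAE parameter, and additionally require avx512vl.
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn example_maskz_cvt_roundps_ph(k: __mmask8, a: __m128) -> __m128i {
+    // Round toward zero; lanes with a clear mask bit are zeroed.
+    _mm_maskz_cvt_roundps_ph::<{ _MM_FROUND_TO_ZERO }>(k, a)
+}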
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_ph&expand=1778)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvtps_ph<const SAE: i32>(a: __m512) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ let r = vcvtps2ph(a, SAE, zero, 0b11111111_11111111);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_ph&expand=1779)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvtps_ph<const SAE: i32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m512,
+) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let src = src.as_i16x16();
+ let r = vcvtps2ph(a, SAE, src, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_ph&expand=1780)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvtps_ph<const SAE: i32>(k: __mmask16, a: __m512) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ let r = vcvtps2ph(a, SAE, zero, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_ph&expand=1776)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_cvtps_ph<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m256,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let src = src.as_i16x8();
+ let r = vcvtps2ph256(a, IMM8, src, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_ph&expand=1777)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_cvtps_ph<const IMM8: i32>(k: __mmask8, a: __m256) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ let r = vcvtps2ph256(a, IMM8, zero, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_ph&expand=1773)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_cvtps_ph<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let src = src.as_i16x8();
+ let r = vcvtps2ph128(a, IMM8, src, k);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_ph&expand=1774)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_cvtps_ph<const IMM8: i32>(k: __mmask8, a: __m128) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let zero = _mm_setzero_si128().as_i16x8();
+ let r = vcvtps2ph128(a, IMM8, zero, k);
+ transmute(r)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_ps&expand=1332)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvt_roundph_ps<const SAE: i32>(a: __m256i) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_i16x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vcvtph2ps(a, zero, 0b11111111_11111111, SAE);
+ transmute(r)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_ps&expand=1333)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvt_roundph_ps<const SAE: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m256i,
+) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_i16x16();
+ let src = src.as_f32x16();
+ let r = vcvtph2ps(a, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_ps&expand=1334)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvt_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256i) -> __m512 {
+ static_assert_sae!(SAE);
+ let a = a.as_i16x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let r = vcvtph2ps(a, zero, k, SAE);
+ transmute(r)
+}
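+
+// Illustrative, hypothetical sketch: widening f16 -> f32 under a writemask with
+// exceptions suppressed; lanes with a clear mask bit are taken from `src`.
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mask_cvt_roundph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 {
+    _mm512_mask_cvt_roundph_ps::<{ _MM_FROUND_NO_EXC }>(src, k, a)
+}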
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_ps&expand=1723)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtph2ps))]
+pub unsafe fn _mm512_cvtph_ps(a: __m256i) -> __m512 {
+ transmute(vcvtph2ps(
+ a.as_i16x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_ps&expand=1724)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtph2ps))]
+pub unsafe fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 {
+ transmute(vcvtph2ps(
+ a.as_i16x16(),
+ src.as_f32x16(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_ps&expand=1725)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtph2ps))]
+pub unsafe fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 {
+ transmute(vcvtph2ps(
+ a.as_i16x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_ps&expand=1721)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2ps))]
+pub unsafe fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m256 {
+ let convert = _mm256_cvtph_ps(a);
+ transmute(simd_select_bitmask(k, convert.as_f32x8(), src.as_f32x8()))
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_ps&expand=1722)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2ps))]
+pub unsafe fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 {
+ let convert = _mm256_cvtph_ps(a);
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, convert.as_f32x8(), zero))
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_ps&expand=1718)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2ps))]
+pub unsafe fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 {
+ let convert = _mm_cvtph_ps(a);
+ transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4()))
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_ps&expand=1719)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvtph2ps))]
+pub unsafe fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 {
+ let convert = _mm_cvtph_ps(a);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, convert.as_f32x4(), zero))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epi32&expand=1916)
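+///
+/// # Examples
+///
+/// An illustrative sketch (assumes `avx512f` support on the running CPU):
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(1.9);
+/// // Suppress floating-point exceptions while truncating toward zero.
+/// let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a);
+/// // every i32 lane is 1
+/// ```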
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvtt_roundps_epi32<const SAE: i32>(a: __m512) -> __m512i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ let r = vcvttps2dq(a, zero, 0b11111111_11111111, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epi32&expand=1917)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvtt_roundps_epi32<const SAE: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512,
+) -> __m512i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let src = src.as_i32x16();
+ let r = vcvttps2dq(a, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvtt_roundps_epi32<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ let r = vcvttps2dq(a, zero, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epu32&expand=1922)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvtt_roundps_epu32<const SAE: i32>(a: __m512) -> __m512i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_si512().as_u32x16();
+ let r = vcvttps2udq(a, zero, 0b11111111_11111111, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epu32&expand=1923)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvtt_roundps_epu32<const SAE: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512,
+) -> __m512i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let src = src.as_u32x16();
+ let r = vcvttps2udq(a, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epu32&expand=1924)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvtt_roundps_epu32<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x16();
+ let zero = _mm512_setzero_si512().as_u32x16();
+ let r = vcvttps2udq(a, zero, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epi32&expand=1904)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvtt_roundpd_epi32<const SAE: i32>(a: __m512d) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ let r = vcvttpd2dq(a, zero, 0b11111111, SAE);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epi32&expand=1905)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvtt_roundpd_epi32<const SAE: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m512d,
+) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let src = src.as_i32x8();
+ let r = vcvttpd2dq(a, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epi32&expand=1906)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ let r = vcvttpd2dq(a, zero, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epu32&expand=1910)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_cvtt_roundpd_epu32<const SAE: i32>(a: __m512d) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ let r = vcvttpd2udq(a, zero, 0b11111111, SAE);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epu32&expand=1911)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_cvtt_roundpd_epu32<const SAE: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m512d,
+) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let src = src.as_i32x8();
+ let r = vcvttpd2udq(a, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epi32&expand=1984)
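+///
+/// # Examples
+///
+/// A minimal sketch (assumes `avx512f` support on the running CPU):
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(-1.9);
+/// let r = _mm512_cvttps_epi32(a);
+/// // truncation rounds toward zero, so every lane is -1, not -2
+/// ```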
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub unsafe fn _mm512_cvttps_epi32(a: __m512) -> __m512i {
+ transmute(vcvttps2dq(
+ a.as_f32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttps_epi32&expand=1985)
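+///
+/// # Examples
+///
+/// A sketch of the writemask semantics (assumes `avx512f` support):
+///
+/// ```ignore
+/// let src = _mm512_set1_epi32(7);
+/// let a = _mm512_set1_ps(3.9);
+/// // Only the low four lanes are converted; the rest are copied from `src`.
+/// let r = _mm512_mask_cvttps_epi32(src, 0b00000000_00001111, a);
+/// // lanes 0..4 hold 3, lanes 4..16 hold 7
+/// ```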
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub unsafe fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+ transmute(vcvttps2dq(
+ a.as_f32x16(),
+ src.as_i32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttps_epi32&expand=1986)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub unsafe fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i {
+ transmute(vcvttps2dq(
+ a.as_f32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttps_epi32&expand=1982)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub unsafe fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+ transmute(vcvttps2dq256(a.as_f32x8(), src.as_i32x8(), k))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttps_epi32&expand=1983)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub unsafe fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i {
+ transmute(vcvttps2dq256(
+ a.as_f32x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ k,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttps_epi32&expand=1979)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub unsafe fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+ transmute(vcvttps2dq128(a.as_f32x4(), src.as_i32x4(), k))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttps_epi32&expand=1980)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub unsafe fn _mm_maskz_cvttps_epi32(k: __mmask8, a: __m128) -> __m128i {
+ transmute(vcvttps2dq128(
+ a.as_f32x4(),
+ _mm_setzero_si128().as_i32x4(),
+ k,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epu32&expand=2002)
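+///
+/// # Examples
+///
+/// A minimal sketch (assumes `avx512f` support):
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(3.7);
+/// let r = _mm512_cvttps_epu32(a);
+/// // every unsigned lane is 3; negative inputs would be out of range
+/// ```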
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm512_cvttps_epu32(a: __m512) -> __m512i {
+ transmute(vcvttps2udq(
+ a.as_f32x16(),
+ _mm512_setzero_si512().as_u32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttps_epu32&expand=2003)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+ transmute(vcvttps2udq(
+ a.as_f32x16(),
+ src.as_u32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttps_epu32&expand=2004)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i {
+ transmute(vcvttps2udq(
+ a.as_f32x16(),
+ _mm512_setzero_si512().as_u32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttps_epu32&expand=1999)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm256_cvttps_epu32(a: __m256) -> __m256i {
+ transmute(vcvttps2udq256(
+ a.as_f32x8(),
+ _mm256_setzero_si256().as_u32x8(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttps_epu32&expand=2000)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+ transmute(vcvttps2udq256(a.as_f32x8(), src.as_u32x8(), k))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttps_epu32&expand=2001)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm256_maskz_cvttps_epu32(k: __mmask8, a: __m256) -> __m256i {
+ transmute(vcvttps2udq256(
+ a.as_f32x8(),
+ _mm256_setzero_si256().as_u32x8(),
+ k,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_epu32&expand=1996)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm_cvttps_epu32(a: __m128) -> __m128i {
+ transmute(vcvttps2udq128(
+ a.as_f32x4(),
+ _mm_setzero_si128().as_u32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttps_epu32&expand=1997)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+ transmute(vcvttps2udq128(a.as_f32x4(), src.as_u32x4(), k))
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttps_epu32&expand=1998)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttps2udq))]
+pub unsafe fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i {
+ transmute(vcvttps2udq128(
+ a.as_f32x4(),
+ _mm_setzero_si128().as_u32x4(),
+ k,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epu32&expand=1912)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ let r = vcvttpd2udq(a, zero, k, SAE);
+ transmute(r)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epi32&expand=1947)
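+///
+/// # Examples
+///
+/// A minimal sketch (assumes `avx512f` support). Eight f64 lanes narrow to
+/// eight i32 lanes in a `__m256i`:
+///
+/// ```ignore
+/// let a = _mm512_set1_pd(-2.5);
+/// let r = _mm512_cvttpd_epi32(a);
+/// // every i32 lane is -2 (truncation toward zero)
+/// ```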
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub unsafe fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i {
+ transmute(vcvttpd2dq(
+ a.as_f64x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttpd_epi32&expand=1948)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub unsafe fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
+ transmute(vcvttpd2dq(
+ a.as_f64x8(),
+ src.as_i32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttpd_epi32&expand=1949)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub unsafe fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i {
+ transmute(vcvttpd2dq(
+ a.as_f64x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttpd_epi32&expand=1945)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub unsafe fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i {
+ transmute(vcvttpd2dq256(a.as_f64x4(), src.as_i32x4(), k))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttpd_epi32&expand=1946)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub unsafe fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i {
+ transmute(vcvttpd2dq256(
+ a.as_f64x4(),
+ _mm_setzero_si128().as_i32x4(),
+ k,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttpd_epi32&expand=1942)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub unsafe fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
+ transmute(vcvttpd2dq128(a.as_f64x2(), src.as_i32x4(), k))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttpd_epi32&expand=1943)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub unsafe fn _mm_maskz_cvttpd_epi32(k: __mmask8, a: __m128d) -> __m128i {
+ transmute(vcvttpd2dq128(
+ a.as_f64x2(),
+ _mm_setzero_si128().as_i32x4(),
+ k,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epu32&expand=1965)
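+///
+/// # Examples
+///
+/// A minimal sketch (assumes `avx512f` support):
+///
+/// ```ignore
+/// let a = _mm512_set1_pd(4.9);
+/// let r = _mm512_cvttpd_epu32(a);
+/// // every u32 lane is 4
+/// ```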
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i {
+ transmute(vcvttpd2udq(
+ a.as_f64x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttpd_epu32&expand=1966)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
+ transmute(vcvttpd2udq(
+ a.as_f64x8(),
+ src.as_i32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttpd_epu32&expand=1967)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i {
+ transmute(vcvttpd2udq(
+ a.as_f64x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttpd_epu32&expand=1962)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i {
+ transmute(vcvttpd2udq256(
+ a.as_f64x4(),
+ _mm_setzero_si128().as_i32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttpd_epu32&expand=1963)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i {
+ transmute(vcvttpd2udq256(a.as_f64x4(), src.as_i32x4(), k))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttpd_epu32&expand=1964)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm256_maskz_cvttpd_epu32(k: __mmask8, a: __m256d) -> __m128i {
+ transmute(vcvttpd2udq256(
+ a.as_f64x4(),
+ _mm_setzero_si128().as_i32x4(),
+ k,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epu32&expand=1959)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm_cvttpd_epu32(a: __m128d) -> __m128i {
+ transmute(vcvttpd2udq128(
+ a.as_f64x2(),
+ _mm_setzero_si128().as_i32x4(),
+ 0b11111111,
+ ))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttpd_epu32&expand=1960)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
+ transmute(vcvttpd2udq128(a.as_f64x2(), src.as_i32x4(), k))
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttpd_epu32&expand=1961)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcvttpd2udq))]
+pub unsafe fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i {
+ transmute(vcvttpd2udq128(
+ a.as_f64x2(),
+ _mm_setzero_si128().as_i32x4(),
+ k,
+ ))
+}
+
+/// Returns vector of type `__m512d` with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_pd&expand=5018)
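+///
+/// # Examples
+///
+/// A trivial sketch (assumes `avx512f` support):
+///
+/// ```ignore
+/// let z = _mm512_setzero_pd(); // equivalent to _mm512_set1_pd(0.0)
+/// ```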
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_setzero_pd() -> __m512d {
+ // All-0 is a properly initialized __m512d
+ mem::zeroed()
+}
+
+/// Returns vector of type `__m512` with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ps&expand=5021)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_setzero_ps() -> __m512 {
+ // All-0 is a properly initialized __m512
+ mem::zeroed()
+}
+
+/// Returns vector of type `__m512` with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero&expand=5014)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_setzero() -> __m512 {
+ // All-0 is a properly initialized __m512
+ mem::zeroed()
+}
+
+/// Returns vector of type `__m512i` with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_si512&expand=5024)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_setzero_si512() -> __m512i {
+ // All-0 is a properly initialized __m512i
+ mem::zeroed()
+}
+
+/// Returns vector of type `__m512i` with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_epi32&expand=5015)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_setzero_epi32() -> __m512i {
+ // All-0 is a properly initialized __m512i
+ mem::zeroed()
+}
+
+/// Sets packed 32-bit integers in `dst` with the supplied values in reverse
+/// order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_epi32&expand=4991)
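+///
+/// # Examples
+///
+/// A sketch of the "reverse" ordering (assumes `avx512f` support):
+///
+/// ```ignore
+/// let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// // element 0 of `r` is 0 and element 15 is 15, the opposite of _mm512_set_epi32
+/// ```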
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr_epi32(
+ e15: i32,
+ e14: i32,
+ e13: i32,
+ e12: i32,
+ e11: i32,
+ e10: i32,
+ e9: i32,
+ e8: i32,
+ e7: i32,
+ e6: i32,
+ e5: i32,
+ e4: i32,
+ e3: i32,
+ e2: i32,
+ e1: i32,
+ e0: i32,
+) -> __m512i {
+ let r = i32x16(
+ e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
+ );
+ transmute(r)
+}
+
+/// Set packed 8-bit integers in dst with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi8&expand=4915)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set_epi8(
+ e63: i8,
+ e62: i8,
+ e61: i8,
+ e60: i8,
+ e59: i8,
+ e58: i8,
+ e57: i8,
+ e56: i8,
+ e55: i8,
+ e54: i8,
+ e53: i8,
+ e52: i8,
+ e51: i8,
+ e50: i8,
+ e49: i8,
+ e48: i8,
+ e47: i8,
+ e46: i8,
+ e45: i8,
+ e44: i8,
+ e43: i8,
+ e42: i8,
+ e41: i8,
+ e40: i8,
+ e39: i8,
+ e38: i8,
+ e37: i8,
+ e36: i8,
+ e35: i8,
+ e34: i8,
+ e33: i8,
+ e32: i8,
+ e31: i8,
+ e30: i8,
+ e29: i8,
+ e28: i8,
+ e27: i8,
+ e26: i8,
+ e25: i8,
+ e24: i8,
+ e23: i8,
+ e22: i8,
+ e21: i8,
+ e20: i8,
+ e19: i8,
+ e18: i8,
+ e17: i8,
+ e16: i8,
+ e15: i8,
+ e14: i8,
+ e13: i8,
+ e12: i8,
+ e11: i8,
+ e10: i8,
+ e9: i8,
+ e8: i8,
+ e7: i8,
+ e6: i8,
+ e5: i8,
+ e4: i8,
+ e3: i8,
+ e2: i8,
+ e1: i8,
+ e0: i8,
+) -> __m512i {
+ let r = i8x64(
+ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
+ e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, e32, e33, e34, e35, e36, e37,
+ e38, e39, e40, e41, e42, e43, e44, e45, e46, e47, e48, e49, e50, e51, e52, e53, e54, e55,
+ e56, e57, e58, e59, e60, e61, e62, e63,
+ );
+ transmute(r)
+}
+
+/// Set packed 16-bit integers in dst with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi16&expand=4905)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set_epi16(
+ e31: i16,
+ e30: i16,
+ e29: i16,
+ e28: i16,
+ e27: i16,
+ e26: i16,
+ e25: i16,
+ e24: i16,
+ e23: i16,
+ e22: i16,
+ e21: i16,
+ e20: i16,
+ e19: i16,
+ e18: i16,
+ e17: i16,
+ e16: i16,
+ e15: i16,
+ e14: i16,
+ e13: i16,
+ e12: i16,
+ e11: i16,
+ e10: i16,
+ e9: i16,
+ e8: i16,
+ e7: i16,
+ e6: i16,
+ e5: i16,
+ e4: i16,
+ e3: i16,
+ e2: i16,
+ e1: i16,
+ e0: i16,
+) -> __m512i {
+ let r = i16x32(
+ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
+ e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
+ );
+ transmute(r)
+}
+
+/// Set packed 32-bit integers in dst with the repeated 4 element sequence.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set4_epi32&expand=4982)
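+///
+/// # Examples
+///
+/// A sketch of the repeating pattern (assumes `avx512f` support):
+///
+/// ```ignore
+/// let r = _mm512_set4_epi32(3, 2, 1, 0);
+/// // lane order from element 0 upward: 0, 1, 2, 3, 0, 1, 2, 3, ...
+/// ```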
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i {
+ _mm512_set_epi32(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a)
+}
+
+/// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set4_ps&expand=4985)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 {
+ _mm512_set_ps(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a)
+}
+
+/// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set4_pd&expand=4984)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d {
+ _mm512_set_pd(d, c, b, a, d, c, b, a)
+}
+
+/// Set packed 32-bit integers in dst with the repeated 4 element sequence in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_epi32&expand=5009)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i {
+ _mm512_set_epi32(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d)
+}
+
+/// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_ps&expand=5012)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 {
+ _mm512_set_ps(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d)
+}
+
+/// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_pd&expand=5011)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d {
+ _mm512_set_pd(a, b, c, d, a, b, c, d)
+}
+
+/// Set packed 64-bit integers in dst with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi64&expand=4910)
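+///
+/// # Examples
+///
+/// A sketch of the ordering (assumes `avx512f` support):
+///
+/// ```ignore
+/// let r = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+/// // the last argument lands in element 0, so element 0 is 0 and element 7 is 7
+/// ```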
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set_epi64(
+ e0: i64,
+ e1: i64,
+ e2: i64,
+ e3: i64,
+ e4: i64,
+ e5: i64,
+ e6: i64,
+ e7: i64,
+) -> __m512i {
+ _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0)
+}
+
+/// Set packed 64-bit integers in dst with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_epi64&expand=4993)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr_epi64(
+ e0: i64,
+ e1: i64,
+ e2: i64,
+ e3: i64,
+ e4: i64,
+ e5: i64,
+ e6: i64,
+ e7: i64,
+) -> __m512i {
+ let r = i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
+ transmute(r)
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd&expand=3002)
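+///
+/// # Examples
+///
+/// An illustrative sketch (assumes `avx512f` support); the buffer and indices
+/// are hypothetical. `SCALE` is 8 because each `f64` element is 8 bytes:
+///
+/// ```ignore
+/// let data: [f64; 16] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.];
+/// let idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+/// let r = _mm512_i32gather_pd::<8>(idx, data.as_ptr() as *const u8);
+/// // r holds data[0], data[2], ..., data[14]
+/// ```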
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i32gather_pd<const SCALE: i32>(offsets: __m256i, slice: *const u8) -> __m512d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let neg_one = -1;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i32x8();
+ let r = vgatherdpd(zero, slice, offsets, neg_one, SCALE);
+ transmute(r)
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd&expand=3003)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32gather_pd<const SCALE: i32>(
+ src: __m512d,
+ mask: __mmask8,
+ offsets: __m256i,
+ slice: *const u8,
+) -> __m512d {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f64x8();
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i32x8();
+ let r = vgatherdpd(src, slice, offsets, mask as i8, SCALE);
+ transmute(r)
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_pd&expand=3092)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i64gather_pd<const SCALE: i32>(offsets: __m512i, slice: *const u8) -> __m512d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ let neg_one = -1;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x8();
+ let r = vgatherqpd(zero, slice, offsets, neg_one, SCALE);
+ transmute(r)
+}
+
+/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_pd&expand=3093)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64gather_pd<const SCALE: i32>(
+ src: __m512d,
+ mask: __mmask8,
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m512d {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f64x8();
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x8();
+ let r = vgatherqpd(src, slice, offsets, mask as i8, SCALE);
+ transmute(r)
+}
+
+/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_ps&expand=3100)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i64gather_ps<const SCALE: i32>(offsets: __m512i, slice: *const u8) -> __m256 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_ps().as_f32x8();
+ let neg_one = -1;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x8();
+ let r = vgatherqps(zero, slice, offsets, neg_one, SCALE);
+ transmute(r)
+}
+
+/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_ps&expand=3101)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64gather_ps<const SCALE: i32>(
+ src: __m256,
+ mask: __mmask8,
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m256 {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f32x8();
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x8();
+ let r = vgatherqps(src, slice, offsets, mask as i8, SCALE);
+ transmute(r)
+}
+
+/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_ps&expand=3010)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i32gather_ps<const SCALE: i32>(offsets: __m512i, slice: *const u8) -> __m512 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ let neg_one = -1;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i32x16();
+ let r = vgatherdps(zero, slice, offsets, neg_one, SCALE);
+ transmute(r)
+}
+
+/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_ps&expand=3011)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32gather_ps<const SCALE: i32>(
+ src: __m512,
+ mask: __mmask16,
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m512 {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f32x16();
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i32x16();
+ let r = vgatherdps(src, slice, offsets, mask as i16, SCALE);
+ transmute(r)
+}
+
+/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi32&expand=2986)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i32gather_epi32<const SCALE: i32>(
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m512i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ let neg_one = -1;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i32x16();
+ let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE);
+ transmute(r)
+}
+
+/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi32&expand=2987)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32gather_epi32<const SCALE: i32>(
+ src: __m512i,
+ mask: __mmask16,
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m512i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x16();
+ let mask = mask as i16;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i32x16();
+ let r = vpgatherdd(src, slice, offsets, mask, SCALE);
+ transmute(r)
+}
+
+/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64&expand=2994)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i32gather_epi64<const SCALE: i32>(
+ offsets: __m256i,
+ slice: *const u8,
+) -> __m512i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ let neg_one = -1;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i32x8();
+ let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE);
+ transmute(r)
+}
+
+/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi64&expand=2995)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32gather_epi64<const SCALE: i32>(
+ src: __m512i,
+ mask: __mmask8,
+ offsets: __m256i,
+ slice: *const u8,
+) -> __m512i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x8();
+ let mask = mask as i8;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i32x8();
+ let r = vpgatherdq(src, slice, offsets, mask, SCALE);
+ transmute(r)
+}
+
+/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64&expand=3084)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i64gather_epi64<const SCALE: i32>(
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m512i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ let neg_one = -1;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x8();
+ let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE);
+ transmute(r)
+}
+
+/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64&expand=3085)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64gather_epi64<const SCALE: i32>(
+ src: __m512i,
+ mask: __mmask8,
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m512i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x8();
+ let mask = mask as i8;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x8();
+ let r = vpgatherqq(src, slice, offsets, mask, SCALE);
+ transmute(r)
+}
+
+/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi32&expand=3074)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_i64gather_epi32<const SCALE: i32>(
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let zeros = _mm256_setzero_si256().as_i32x8();
+ let neg_one = -1;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x8();
+ let r = vpgatherqd(zeros, slice, offsets, neg_one, SCALE);
+ transmute(r)
+}
+
+/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi32&expand=3075)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64gather_epi32<const SCALE: i32>(
+ src: __m256i,
+ mask: __mmask8,
+ offsets: __m512i,
+ slice: *const u8,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x8();
+ let mask = mask as i8;
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x8();
+ let r = vpgatherqd(src, slice, offsets, mask, SCALE);
+ transmute(r)
+}
+
+/// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_pd&expand=3044)
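+///
+/// # Examples
+///
+/// An illustrative sketch (assumes `avx512f` support); the buffer and indices
+/// are hypothetical. `SCALE` is 8 (bytes per `f64`):
+///
+/// ```ignore
+/// let mut out = [0.0f64; 16];
+/// let idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+/// let src = _mm512_set1_pd(1.0);
+/// _mm512_i32scatter_pd::<8>(out.as_mut_ptr() as *mut u8, idx, src);
+/// // out[0], out[2], ..., out[14] are now 1.0
+/// ```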
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i32scatter_pd<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m256i,
+ src: __m512d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f64x8();
+ let neg_one = -1;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x8();
+ vscatterdpd(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_pd&expand=3045)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32scatter_pd<const SCALE: i32>(
+ slice: *mut u8,
+ mask: __mmask8,
+ offsets: __m256i,
+ src: __m512d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f64x8();
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x8();
+ vscatterdpd(slice, mask as i8, offsets, src, SCALE);
+}
+
+/// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_pd&expand=3122)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i64scatter_pd<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m512i,
+ src: __m512d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f64x8();
+ let neg_one = -1;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i64x8();
+ vscatterqpd(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_pd&expand=3123)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64scatter_pd<const SCALE: i32>(
+ slice: *mut u8,
+ mask: __mmask8,
+ offsets: __m512i,
+ src: __m512d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f64x8();
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i64x8();
+ vscatterqpd(slice, mask as i8, offsets, src, SCALE);
+}
+
+/// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_ps&expand=3050)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i32scatter_ps<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m512i,
+ src: __m512,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f32x16();
+ let neg_one = -1;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x16();
+ vscatterdps(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_ps&expand=3051)
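+///
+/// # Example
+///
+/// An illustrative sketch (not from the upstream docs), assuming AVX-512F at
+/// runtime; `SCALE = 4` addresses f32 slots, and only the lanes selected by
+/// the mask are written.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let mut out = [0.0f32; 16];
+///     let offsets = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     let src = _mm512_set1_ps(2.0);
+///     // Only the low eight mask bits are set, so out[8..] stays untouched.
+///     _mm512_mask_i32scatter_ps::<4>(out.as_mut_ptr() as *mut u8, 0b0000_0000_1111_1111, offsets, src);
+///     // out[0..8] is now 2.0; out[8..] is still 0.0.
+/// }
+/// ```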
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32scatter_ps<const SCALE: i32>(
+ slice: *mut u8,
+ mask: __mmask16,
+ offsets: __m512i,
+ src: __m512,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f32x16();
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x16();
+ vscatterdps(slice, mask as i16, offsets, src, SCALE);
+}
+
+/// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps&expand=3128)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i64scatter_ps<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m512i,
+ src: __m256,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f32x8();
+ let neg_one = -1;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i64x8();
+ vscatterqps(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps&expand=3129)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64scatter_ps<const SCALE: i32>(
+ slice: *mut u8,
+ mask: __mmask8,
+ offsets: __m512i,
+ src: __m256,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_f32x8();
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i64x8();
+ vscatterqps(slice, mask as i8, offsets, src, SCALE);
+}
+
+/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_epi64&expand=3038)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i32scatter_epi64<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m256i,
+ src: __m512i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x8();
+ let neg_one = -1;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x8();
+ vpscatterdq(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi64&expand=3039)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32scatter_epi64<const SCALE: i32>(
+ slice: *mut u8,
+ mask: __mmask8,
+ offsets: __m256i,
+ src: __m512i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x8();
+ let mask = mask as i8;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x8();
+ vpscatterdq(slice, mask, offsets, src, SCALE);
+}
+
+/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi64&expand=3116)
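+///
+/// # Example
+///
+/// A minimal sketch (illustrative; not from the upstream docs), assuming
+/// AVX-512F at runtime; with `SCALE = 8` each 64-bit index selects one i64
+/// slot, here in reverse order.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let mut out = [0i64; 8];
+///     let offsets = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+///     let src = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+///     _mm512_i64scatter_epi64::<8>(out.as_mut_ptr() as *mut u8, offsets, src);
+///     // `out` is now [7, 6, 5, 4, 3, 2, 1, 0].
+/// }
+/// ```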
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i64scatter_epi64<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m512i,
+ src: __m512i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x8();
+ let neg_one = -1;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i64x8();
+ vpscatterqq(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi64&expand=3117)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64scatter_epi64<const SCALE: i32>(
+ slice: *mut u8,
+ mask: __mmask8,
+ offsets: __m512i,
+ src: __m512i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x8();
+ let mask = mask as i8;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i64x8();
+ vpscatterqq(slice, mask, offsets, src, SCALE);
+}
+
+/// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_epi32&expand=3032)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i32scatter_epi32<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m512i,
+ src: __m512i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x16();
+ let neg_one = -1;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x16();
+ vpscatterdd(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi32&expand=3033)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i32scatter_epi32<const SCALE: i32>(
+ slice: *mut u8,
+ mask: __mmask16,
+ offsets: __m512i,
+ src: __m512i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x16();
+ let mask = mask as i16;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x16();
+ vpscatterdd(slice, mask, offsets, src, SCALE);
+}
+
+/// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32&expand=3108)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_i64scatter_epi32<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m512i,
+ src: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x8();
+ let neg_one = -1;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i64x8();
+ vpscatterqd(slice, neg_one, offsets, src, SCALE);
+}
+
+/// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi32&expand=3109)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_i64scatter_epi32<const SCALE: i32>(
+ slice: *mut u8,
+ mask: __mmask8,
+ offsets: __m512i,
+ src: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x8();
+ let mask = mask as i8;
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i64x8();
+ vpscatterqd(slice, mask, offsets, src, SCALE);
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi32&expand=1198)
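+///
+/// # Example
+///
+/// An illustrative sketch (not from the upstream docs), assuming AVX-512F at
+/// runtime: the even-indexed lanes are packed down to the low half of the
+/// result, and the remaining lanes are filled from `src`.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     let src = _mm512_set1_epi32(-1);
+///     let r = _mm512_mask_compress_epi32(src, 0b0101_0101_0101_0101, a);
+///     // `r` holds [0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1].
+/// }
+/// ```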
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+ transmute(vpcompressd(a.as_i32x16(), src.as_i32x16(), k))
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi32&expand=1199)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i {
+ transmute(vpcompressd(
+ a.as_i32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi32&expand=1196)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm256_mask_compress_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ transmute(vpcompressd256(a.as_i32x8(), src.as_i32x8(), k))
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi32&expand=1197)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm256_maskz_compress_epi32(k: __mmask8, a: __m256i) -> __m256i {
+ transmute(vpcompressd256(
+ a.as_i32x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi32&expand=1194)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm_mask_compress_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpcompressd128(a.as_i32x4(), src.as_i32x4(), k))
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi32&expand=1195)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm_maskz_compress_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpcompressd128(
+ a.as_i32x4(),
+ _mm_setzero_si128().as_i32x4(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi64&expand=1204)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+ transmute(vpcompressq(a.as_i64x8(), src.as_i64x8(), k))
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi64&expand=1205)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i {
+ transmute(vpcompressq(
+ a.as_i64x8(),
+ _mm512_setzero_si512().as_i64x8(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi64&expand=1202)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm256_mask_compress_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ transmute(vpcompressq256(a.as_i64x4(), src.as_i64x4(), k))
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi64&expand=1203)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm256_maskz_compress_epi64(k: __mmask8, a: __m256i) -> __m256i {
+ transmute(vpcompressq256(
+ a.as_i64x4(),
+ _mm256_setzero_si256().as_i64x4(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi64&expand=1200)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm_mask_compress_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpcompressq128(a.as_i64x2(), src.as_i64x2(), k))
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi64&expand=1201)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm_maskz_compress_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpcompressq128(
+ a.as_i64x2(),
+ _mm_setzero_si128().as_i64x2(),
+ k,
+ ))
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_ps&expand=1222)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ transmute(vcompressps(a.as_f32x16(), src.as_f32x16(), k))
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_ps&expand=1223)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 {
+ transmute(vcompressps(
+ a.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ ))
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_ps&expand=1220)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm256_mask_compress_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ transmute(vcompressps256(a.as_f32x8(), src.as_f32x8(), k))
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_ps&expand=1221)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm256_maskz_compress_ps(k: __mmask8, a: __m256) -> __m256 {
+ transmute(vcompressps256(
+ a.as_f32x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ k,
+ ))
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_ps&expand=1218)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm_mask_compress_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ transmute(vcompressps128(a.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_ps&expand=1219)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm_maskz_compress_ps(k: __mmask8, a: __m128) -> __m128 {
+ transmute(vcompressps128(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), k))
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_pd&expand=1216)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vcompresspd(a.as_f64x8(), src.as_f64x8(), k))
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_pd&expand=1217)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vcompresspd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k))
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_pd&expand=1214)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm256_mask_compress_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vcompresspd256(a.as_f64x4(), src.as_f64x4(), k))
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_pd&expand=1215)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm256_maskz_compress_pd(k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vcompresspd256(
+ a.as_f64x4(),
+ _mm256_setzero_pd().as_f64x4(),
+ k,
+ ))
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_pd&expand=1212)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm_mask_compress_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vcompresspd128(a.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_pd&expand=1213)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vcompresspd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32)
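+///
+/// # Example
+///
+/// A minimal sketch (illustrative; not from the upstream docs), assuming
+/// AVX-512F at runtime: with four mask bits set, exactly four elements are
+/// written contiguously, so a four-element buffer suffices.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     let mut out = [0i32; 4];
+///     // Mask bits 0, 5, 10 and 15 are set.
+///     _mm512_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut u8, 0b1000_0100_0010_0001, a);
+///     // `out` is now [0, 5, 10, 15].
+/// }
+/// ```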
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask16, a: __m512i) {
+ vcompressstored(base_addr as *mut _, a.as_i32x16(), k)
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m256i) {
+ vcompressstored256(base_addr as *mut _, a.as_i32x8(), k)
+}
+
+/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressd))]
+pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m128i) {
+ vcompressstored128(base_addr as *mut _, a.as_i32x4(), k)
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m512i) {
+ vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k)
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m256i) {
+ vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k)
+}
+
+/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressq))]
+pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m128i) {
+ vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k)
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask16, a: __m512) {
+ vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k)
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m256) {
+ vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k)
+}
+
+/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompressps))]
+pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m128) {
+ vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k)
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m512d) {
+ vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k)
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m256d) {
+ vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k)
+}
+
+/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vcompresspd))]
+pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m128d) {
+ vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k)
+}
+
+/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi32&expand=2316)
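+///
+/// # Example
+///
+/// An illustrative sketch (not from the upstream docs), assuming AVX-512F at
+/// runtime: the lowest elements of `a` are spread out to the set mask bits,
+/// the inverse of a compress.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     let src = _mm512_set1_epi32(-1);
+///     // Lanes 1 and 3 receive a[0] and a[1]; all other lanes keep `src`.
+///     let r = _mm512_mask_expand_epi32(src, 0b0000_0000_0000_1010, a);
+///     // `r` starts with [-1, 0, -1, 1, -1, ...].
+/// }
+/// ```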
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpexpandd))]
+pub unsafe fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+ transmute(vpexpandd(a.as_i32x16(), src.as_i32x16(), k))
+}
+
+/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi32&expand=2317)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpexpandd))]
+pub unsafe fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i {
+ transmute(vpexpandd(
+ a.as_i32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ k,
+ ))
+}
+
+/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi32&expand=2314)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandd))]
+pub unsafe fn _mm256_mask_expand_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ transmute(vpexpandd256(a.as_i32x8(), src.as_i32x8(), k))
+}
+
+/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi32&expand=2315)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandd))]
+pub unsafe fn _mm256_maskz_expand_epi32(k: __mmask8, a: __m256i) -> __m256i {
+ transmute(vpexpandd256(
+ a.as_i32x8(),
+ _mm256_setzero_si256().as_i32x8(),
+ k,
+ ))
+}
+
+/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi32&expand=2312)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandd))]
+pub unsafe fn _mm_mask_expand_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpexpandd128(a.as_i32x4(), src.as_i32x4(), k))
+}
+
+/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi32&expand=2313)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandd))]
+pub unsafe fn _mm_maskz_expand_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpexpandd128(
+ a.as_i32x4(),
+ _mm_setzero_si128().as_i32x4(),
+ k,
+ ))
+}
+
+/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi64&expand=2322)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpexpandq))]
+pub unsafe fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+ transmute(vpexpandq(a.as_i64x8(), src.as_i64x8(), k))
+}
+
+/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi64&expand=2323)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpexpandq))]
+pub unsafe fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i {
+ transmute(vpexpandq(
+ a.as_i64x8(),
+ _mm512_setzero_si512().as_i64x8(),
+ k,
+ ))
+}
+
+/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi64&expand=2320)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandq))]
+pub unsafe fn _mm256_mask_expand_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ transmute(vpexpandq256(a.as_i64x4(), src.as_i64x4(), k))
+}
+
+/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi64&expand=2321)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandq))]
+pub unsafe fn _mm256_maskz_expand_epi64(k: __mmask8, a: __m256i) -> __m256i {
+ transmute(vpexpandq256(
+ a.as_i64x4(),
+ _mm256_setzero_si256().as_i64x4(),
+ k,
+ ))
+}
+
+/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi64&expand=2318)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandq))]
+pub unsafe fn _mm_mask_expand_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpexpandq128(a.as_i64x2(), src.as_i64x2(), k))
+}
+
+/// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi64&expand=2319)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandq))]
+pub unsafe fn _mm_maskz_expand_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpexpandq128(
+ a.as_i64x2(),
+ _mm_setzero_si128().as_i64x2(),
+ k,
+ ))
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_ps&expand=2340)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vexpandps))]
+pub unsafe fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ transmute(vexpandps(a.as_f32x16(), src.as_f32x16(), k))
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_ps&expand=2341)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vexpandps))]
+pub unsafe fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 {
+ transmute(vexpandps(a.as_f32x16(), _mm512_setzero_ps().as_f32x16(), k))
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_ps&expand=2338)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vexpandps))]
+pub unsafe fn _mm256_mask_expand_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ transmute(vexpandps256(a.as_f32x8(), src.as_f32x8(), k))
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_ps&expand=2339)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vexpandps))]
+pub unsafe fn _mm256_maskz_expand_ps(k: __mmask8, a: __m256) -> __m256 {
+ transmute(vexpandps256(
+ a.as_f32x8(),
+ _mm256_setzero_ps().as_f32x8(),
+ k,
+ ))
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_ps&expand=2336)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vexpandps))]
+pub unsafe fn _mm_mask_expand_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ transmute(vexpandps128(a.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_ps&expand=2337)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vexpandps))]
+pub unsafe fn _mm_maskz_expand_ps(k: __mmask8, a: __m128) -> __m128 {
+ transmute(vexpandps128(a.as_f32x4(), _mm_setzero_ps().as_f32x4(), k))
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_pd&expand=2334)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vexpandpd))]
+pub unsafe fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vexpandpd(a.as_f64x8(), src.as_f64x8(), k))
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_pd&expand=2335)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vexpandpd))]
+pub unsafe fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d {
+ transmute(vexpandpd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k))
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_pd&expand=2332)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vexpandpd))]
+pub unsafe fn _mm256_mask_expand_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vexpandpd256(a.as_f64x4(), src.as_f64x4(), k))
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_pd&expand=2333)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vexpandpd))]
+pub unsafe fn _mm256_maskz_expand_pd(k: __mmask8, a: __m256d) -> __m256d {
+ transmute(vexpandpd256(
+ a.as_f64x4(),
+ _mm256_setzero_pd().as_f64x4(),
+ k,
+ ))
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_pd&expand=2330)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vexpandpd))]
+pub unsafe fn _mm_mask_expand_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vexpandpd128(a.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_pd&expand=2331)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vexpandpd))]
+pub unsafe fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d {
+ transmute(vexpandpd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi32&expand=4685)
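+///
+/// # Example
+///
+/// A minimal sketch (illustrative; not from the upstream docs), assuming
+/// AVX-512F at runtime: rotating left by one wraps the sign bit around to
+/// bit 0.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_epi32(i32::MIN); // only bit 31 set in each lane
+///     let r = _mm512_rol_epi32::<1>(a);
+///     // Every lane of `r` is now 1.
+/// }
+/// ```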
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vprold(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi32&expand=4683)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_rol_epi32<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vprold(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi32&expand=4684)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vprold(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rol_epi32&expand=4682)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let r = vprold256(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rol_epi32&expand=4680)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_rol_epi32<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let r = vprold256(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rol_epi32&expand=4681)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let r = vprold256(a, IMM8);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rol_epi32&expand=4679)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let r = vprold128(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rol_epi32&expand=4677)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_rol_epi32<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let r = vprold128(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rol_epi32&expand=4678)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let r = vprold128(a, IMM8);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi32&expand=4721)
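+///
+/// # Example
+///
+/// A minimal sketch (illustrative; not from the upstream docs), assuming
+/// AVX-512F at runtime: rotating right by one wraps bit 0 around to the
+/// sign bit.
+///
+/// ```ignore
+/// use core::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_epi32(1); // only bit 0 set in each lane
+///     let r = _mm512_ror_epi32::<1>(a);
+///     // Every lane of `r` is now i32::MIN.
+/// }
+/// ```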
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vprord(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi32&expand=4719)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_ror_epi32<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vprord(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi32&expand=4720)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vprord(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ror_epi32&expand=4718)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let r = vprord256(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ror_epi32&expand=4716)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_ror_epi32<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let r = vprord256(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ror_epi32&expand=4717)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let r = vprord256(a, IMM8);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ror_epi32&expand=4715)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let r = vprord128(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ror_epi32&expand=4713)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_ror_epi32<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let r = vprord128(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ror_epi32&expand=4714)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let r = vprord128(a, IMM8);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
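+// Sketch of the left/right relationship (not upstream code; same target
+// assumptions as above): a right-rotate by N equals a left-rotate by
+// `lane_width - N`, so for 32-bit lanes:
+//
+//     let a = _mm_set1_epi32(3);
+//     let r1 = _mm_ror_epi32::<1>(a);   // 0b11 -> 0x8000_0001 per lane
+//     let r2 = _mm_rol_epi32::<31>(a);  // identical result
+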
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi64&expand=4694)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vprolq(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi64&expand=4692)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_rol_epi64<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vprolq(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x8()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi64&expand=4693)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vprolq(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rol_epi64&expand=4691)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x4();
+ let r = vprolq256(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rol_epi64&expand=4689)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_rol_epi64<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x4();
+ let r = vprolq256(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rol_epi64&expand=4690)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x4();
+ let r = vprolq256(a, IMM8);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rol_epi64&expand=4688)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x2();
+ let r = vprolq128(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rol_epi64&expand=4686)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_rol_epi64<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x2();
+ let r = vprolq128(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x2()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rol_epi64&expand=4687)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x2();
+ let r = vprolq128(a, IMM8);
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
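+// Writemask sketch (not upstream code): in the `mask_` variants, lanes whose
+// mask bit is set receive the rotated value; the rest are copied from `src`
+// unchanged:
+//
+//     let a = _mm_set1_epi64x(1);
+//     let src = _mm_set1_epi64x(-1);
+//     // mask 0b01: lane 0 becomes 1 << 4 == 16, lane 1 stays -1
+//     let r = _mm_mask_rol_epi64::<4>(src, 0b01, a);
+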
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi64&expand=4730)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vprorq(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi64&expand=4728)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_ror_epi64<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vprorq(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x8()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi64&expand=4729)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vprorq(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ror_epi64&expand=4727)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x4();
+ let r = vprorq256(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ror_epi64&expand=4725)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_ror_epi64<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x4();
+ let r = vprorq256(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ror_epi64&expand=4726)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x4();
+ let r = vprorq256(a, IMM8);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ror_epi64&expand=4724)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x2();
+ let r = vprorq128(a, IMM8);
+ transmute(r)
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ror_epi64&expand=4722)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_ror_epi64<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x2();
+ let r = vprorq128(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x2()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ror_epi64&expand=4723)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i64x2();
+ let r = vprorq128(a, IMM8);
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
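+// Zeromask sketch (not upstream code): the `maskz_` variants behave like the
+// `mask_` ones, except that unselected lanes are zeroed rather than copied
+// from a `src` operand:
+//
+//     let a = _mm_set1_epi64x(1);
+//     // mask 0b10: lane 1 becomes ror(1, 1) == 1 << 63, lane 0 becomes 0
+//     let r = _mm_maskz_ror_epi64::<1>(0b10, a);
+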
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_slli_epi32&expand=5310)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_slli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vpsllid(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_slli_epi32&expand=5308)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_slli_epi32<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let shf = vpsllid(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_slli_epi32&expand=5309)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_slli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let shf = vpsllid(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_slli_epi32&expand=5305)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_slli_epi32<const IMM8: u32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psllid256(a.as_i32x8(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_slli_epi32&expand=5306)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psllid256(a.as_i32x8(), imm8);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_slli_epi32&expand=5302)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_slli_epi32<const IMM8: u32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psllid128(a.as_i32x4(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_slli_epi32&expand=5303)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psllid128(a.as_i32x4(), imm8);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
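+// Shift-count sketch (not upstream code): `slli` shifts in zeros from the
+// right, and per Intel's pseudocode a count larger than 31 zeroes the whole
+// 32-bit lane instead of wrapping:
+//
+//     let a = _mm512_set1_epi32(1);
+//     let r1 = _mm512_slli_epi32::<4>(a);   // each lane == 16
+//     let r2 = _mm512_slli_epi32::<32>(a);  // each lane == 0, not 1
+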
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srli_epi32&expand=5522)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_srli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vpsrlid(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srli_epi32&expand=5520)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_srli_epi32<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let shf = vpsrlid(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srli_epi32&expand=5521)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_srli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let shf = vpsrlid(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srli_epi32&expand=5517)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_srli_epi32<const IMM8: u32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psrlid256(a.as_i32x8(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srli_epi32&expand=5518)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psrlid256(a.as_i32x8(), imm8);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srli_epi32&expand=5514)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_srli_epi32<const IMM8: u32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psrlid128(a.as_i32x4(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srli_epi32&expand=5515)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psrlid128(a.as_i32x4(), imm8);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
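+// Logical-shift sketch (not upstream code): `srli` shifts in zeros from the
+// left, so a negative lane loses its sign (contrast with the sign-extending
+// `sra` family further down):
+//
+//     let a = _mm512_set1_epi32(-2);          // 0xFFFF_FFFE per lane
+//     let r = _mm512_srli_epi32::<1>(a);      // each lane == 0x7FFF_FFFF
+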
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_slli_epi64&expand=5319)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_slli_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vpslliq(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_slli_epi64&expand=5317)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_slli_epi64<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let shf = vpslliq(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_slli_epi64&expand=5318)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let shf = vpslliq(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_slli_epi64&expand=5314)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_slli_epi64<const IMM8: u32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = pslliq256(a.as_i64x4(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_slli_epi64&expand=5315)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = pslliq256(a.as_i64x4(), imm8);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_slli_epi64&expand=5311)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_slli_epi64<const IMM8: u32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = pslliq128(a.as_i64x2(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_slli_epi64&expand=5312)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = pslliq128(a.as_i64x2(), imm8);
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srli_epi64&expand=5531)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_srli_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vpsrliq(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srli_epi64&expand=5529)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_srli_epi64<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let shf = vpsrliq(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srli_epi64&expand=5530)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let shf = vpsrliq(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srli_epi64&expand=5526)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_srli_epi64<const IMM8: u32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psrliq256(a.as_i64x4(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srli_epi64&expand=5527)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psrliq256(a.as_i64x4(), imm8);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srli_epi64&expand=5523)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_srli_epi64<const IMM8: u32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psrliq128(a.as_i64x2(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srli_epi64&expand=5524)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psrliq128(a.as_i64x2(), imm8);
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
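+// 64-bit variant sketch (not upstream code): the `_epi64` forms behave the
+// same way on 64-bit lanes, so the zeroing threshold for the immediate is 63
+// instead of 31:
+//
+//     let a = _mm512_set1_epi64(1);
+//     let r = _mm512_slli_epi64::<63>(a);   // each lane == 1 << 63
+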
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sll_epi32&expand=5280)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub unsafe fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpslld(a.as_i32x16(), count.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sll_epi32&expand=5278)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub unsafe fn _mm512_mask_sll_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_sll_epi32(a, count).as_i32x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sll_epi32&expand=5279)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub unsafe fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_sll_epi32(a, count).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sll_epi32&expand=5275)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub unsafe fn _mm256_mask_sll_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_sll_epi32(a, count).as_i32x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sll_epi32&expand=5276)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub unsafe fn _mm256_maskz_sll_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_sll_epi32(a, count).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sll_epi32&expand=5272)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub unsafe fn _mm_mask_sll_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sll_epi32(a, count).as_i32x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sll_epi32&expand=5273)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpslld))]
+pub unsafe fn _mm_maskz_sll_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sll_epi32(a, count).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
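+// Vector-count sketch (not upstream code): in the `sll`/`srl`/`sra` forms the
+// shift amount is a runtime value taken from the low 64 bits of `count` and
+// applied uniformly to every lane; counts at or above the lane width zero the
+// lanes (or sign-fill them, for `sra`):
+//
+//     let a = _mm512_set1_epi32(1);
+//     let count = _mm_cvtsi32_si128(4);      // shift amount 4 in the low bits
+//     let r = _mm512_sll_epi32(a, count);    // each lane == 16
+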
+/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srl_epi32&expand=5492)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+pub unsafe fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpsrld(a.as_i32x16(), count.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srl_epi32&expand=5490)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+pub unsafe fn _mm512_mask_srl_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_srl_epi32(a, count).as_i32x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srl_epi32&expand=5491)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+pub unsafe fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_srl_epi32(a, count).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srl_epi32&expand=5487)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+pub unsafe fn _mm256_mask_srl_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_srl_epi32(a, count).as_i32x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srl_epi32&expand=5488)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+pub unsafe fn _mm256_maskz_srl_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_srl_epi32(a, count).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srl_epi32&expand=5484)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+pub unsafe fn _mm_mask_srl_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srl_epi32(a, count).as_i32x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srl_epi32&expand=5485)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+pub unsafe fn _mm_maskz_srl_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srl_epi32(a, count).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sll_epi64&expand=5289)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+pub unsafe fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpsllq(a.as_i64x8(), count.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sll_epi64&expand=5287)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+pub unsafe fn _mm512_mask_sll_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_sll_epi64(a, count).as_i64x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sll_epi64&expand=5288)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+pub unsafe fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_sll_epi64(a, count).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sll_epi64&expand=5284)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+pub unsafe fn _mm256_mask_sll_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_sll_epi64(a, count).as_i64x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sll_epi64&expand=5285)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+pub unsafe fn _mm256_maskz_sll_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_sll_epi64(a, count).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sll_epi64&expand=5281)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+pub unsafe fn _mm_mask_sll_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sll_epi64(a, count).as_i64x2();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sll_epi64&expand=5282)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+pub unsafe fn _mm_maskz_sll_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sll_epi64(a, count).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srl_epi64&expand=5501)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+pub unsafe fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpsrlq(a.as_i64x8(), count.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srl_epi64&expand=5499)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+pub unsafe fn _mm512_mask_srl_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_srl_epi64(a, count).as_i64x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srl_epi64&expand=5500)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+pub unsafe fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_srl_epi64(a, count).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srl_epi64&expand=5496)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+pub unsafe fn _mm256_mask_srl_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_srl_epi64(a, count).as_i64x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srl_epi64&expand=5497)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+pub unsafe fn _mm256_maskz_srl_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_srl_epi64(a, count).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srl_epi64&expand=5493)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+pub unsafe fn _mm_mask_srl_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srl_epi64(a, count).as_i64x2();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srl_epi64&expand=5494)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+pub unsafe fn _mm_maskz_srl_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srl_epi64(a, count).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sra_epi32&expand=5407)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+pub unsafe fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpsrad(a.as_i32x16(), count.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sra_epi32&expand=5405)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+pub unsafe fn _mm512_mask_sra_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_sra_epi32(a, count).as_i32x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sra_epi32&expand=5406)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+pub unsafe fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_sra_epi32(a, count).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sra_epi32&expand=5402)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+pub unsafe fn _mm256_mask_sra_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_sra_epi32(a, count).as_i32x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sra_epi32&expand=5403)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+pub unsafe fn _mm256_maskz_sra_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_sra_epi32(a, count).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sra_epi32&expand=5399)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+pub unsafe fn _mm_mask_sra_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sra_epi32(a, count).as_i32x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sra_epi32&expand=5400)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+pub unsafe fn _mm_maskz_sra_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sra_epi32(a, count).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
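+// Editor's usage sketch (not upstream code; values are assumptions): the
+// arithmetic shift replicates the sign bit, so negative lanes stay negative.
+//
+//     let a = _mm512_set1_epi32(-16);
+//     let count = _mm_set_epi32(0, 0, 0, 2); // low 64 bits hold the count
+//     let r = _mm512_sra_epi32(a, count);    // every lane: -16 >> 2 = -4
+//     let m = _mm512_mask_sra_epi32(a, 0b00000000_11111111, a, count);
+//     // lanes 0..8 become -4; lanes 8..16 keep the `src` value -16
+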
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sra_epi64&expand=5416)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i {
+ transmute(vpsraq(a.as_i64x8(), count.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sra_epi64&expand=5414)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm512_mask_sra_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ count: __m128i,
+) -> __m512i {
+ let shf = _mm512_sra_epi64(a, count).as_i64x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sra_epi64&expand=5415)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
+ let shf = _mm512_sra_epi64(a, count).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi64&expand=5413)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm256_sra_epi64(a: __m256i, count: __m128i) -> __m256i {
+ transmute(vpsraq256(a.as_i64x4(), count.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sra_epi64&expand=5411)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm256_mask_sra_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m128i,
+) -> __m256i {
+ let shf = _mm256_sra_epi64(a, count).as_i64x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sra_epi64&expand=5412)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm256_maskz_sra_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
+ let shf = _mm256_sra_epi64(a, count).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi64&expand=5410)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm_sra_epi64(a: __m128i, count: __m128i) -> __m128i {
+ transmute(vpsraq128(a.as_i64x2(), count.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sra_epi64&expand=5408)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm_mask_sra_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sra_epi64(a, count).as_i64x2();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sra_epi64&expand=5409)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq))]
+pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sra_epi64(a, count).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
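+// Editor's note with an illustrative sketch (not upstream code; values are
+// assumptions): unlike the 32-bit case, a 64-bit arithmetic right shift only
+// exists under AVX-512 (there is no SSE/AVX `psraq`), which is why even the
+// unmasked 256/128-bit forms above require avx512f+avx512vl.
+//
+//     let a = _mm256_set1_epi64x(i64::MIN);
+//     let count = _mm_set_epi64x(0, 63);
+//     let r = _mm256_sra_epi64(a, count);   // sign fill: every lane is -1
+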
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srai_epi32&expand=5436)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_srai_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vpsraid512(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srai_epi32&expand=5434)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_srai_epi32<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vpsraid512(a, IMM8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srai_epi32&expand=5435)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_srai_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i32x16();
+ let r = vpsraid512(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srai_epi32&expand=5431)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_srai_epi32<const IMM8: u32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psraid256(a.as_i32x8(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srai_epi32&expand=5432)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psraid256(a.as_i32x8(), imm8);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srai_epi32&expand=5428)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_srai_epi32<const IMM8: u32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psraid128(a.as_i32x4(), imm8);
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srai_epi32&expand=5429)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let imm8 = IMM8 as i32;
+ let r = psraid128(a.as_i32x4(), imm8);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
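+// Editor's usage sketch (not upstream code; values are assumptions): the
+// `srai` forms take the shift count as a const generic, so it is fixed at
+// compile time and encoded as an immediate in the instruction.
+//
+//     let a = _mm512_set1_epi32(-8);
+//     let r = _mm512_srai_epi32::<3>(a);    // every lane: -8 >> 3 = -1
+//     let z = _mm512_maskz_srai_epi32::<3>(0b11111111_00000000, a);
+//     // lanes 8..16 are -1, lanes 0..8 are zeroed
+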
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srai_epi64&expand=5445)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_srai_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let r = vpsraiq(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srai_epi64&expand=5443)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_srai_epi64<const IMM8: u32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let shf = vpsraiq(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srai_epi64&expand=5444)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x8();
+ let shf = vpsraiq(a, IMM8);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi64&expand=5442)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_srai_epi64<const IMM8: u32>(a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x4();
+ let r = vpsraiq256(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srai_epi64&expand=5440)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_srai_epi64<const IMM8: u32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x4();
+ let shf = vpsraiq256(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srai_epi64&expand=5441)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x4();
+ let shf = vpsraiq256(a, IMM8);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi64&expand=5439)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_srai_epi64<const IMM8: u32>(a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x2();
+ let r = vpsraiq128(a, IMM8);
+ transmute(r)
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srai_epi64&expand=5437)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_srai_epi64<const IMM8: u32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x2();
+ let shf = vpsraiq128(a, IMM8);
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srai_epi64&expand=5438)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
+ static_assert_imm_u8!(IMM8);
+ let a = a.as_i64x2();
+ let shf = vpsraiq128(a, IMM8);
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
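+// Editor's usage sketch (not upstream code; values are assumptions): the same
+// immediate-count pattern for 64-bit lanes; `static_assert_imm_u8!` rejects
+// counts that do not fit in an 8-bit immediate at compile time.
+//
+//     let a = _mm_set1_epi64x(-256);
+//     let r = _mm_srai_epi64::<4>(a);       // both lanes: -256 >> 4 = -16
+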
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srav_epi32&expand=5465)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub unsafe fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsravd(a.as_i32x16(), count.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srav_epi32&expand=5463)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub unsafe fn _mm512_mask_srav_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_srav_epi32(a, count).as_i32x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srav_epi32&expand=5464)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub unsafe fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_srav_epi32(a, count).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srav_epi32&expand=5460)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub unsafe fn _mm256_mask_srav_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_srav_epi32(a, count).as_i32x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srav_epi32&expand=5461)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub unsafe fn _mm256_maskz_srav_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_srav_epi32(a, count).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srav_epi32&expand=5457)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub unsafe fn _mm_mask_srav_epi32(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_srav_epi32(a, count).as_i32x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srav_epi32&expand=5458)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+pub unsafe fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srav_epi32(a, count).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
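+// Editor's usage sketch (not upstream code; values are assumptions): the
+// `srav` forms take a per-lane count vector instead of one shared count.
+//
+//     let a = _mm_set1_epi32(-16);
+//     let counts = _mm_set_epi32(3, 2, 1, 0); // one count per lane
+//     let r = _mm_srav_epi32(a, counts);      // lanes 0..4: -16, -8, -4, -2
+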
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srav_epi64&expand=5474)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsravq(a.as_i64x8(), count.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srav_epi64&expand=5472)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm512_mask_srav_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_srav_epi64(a, count).as_i64x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srav_epi64&expand=5473)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_srav_epi64(a, count).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi64&expand=5471)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i {
+ transmute(vpsravq256(a.as_i64x4(), count.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srav_epi64&expand=5469)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm256_mask_srav_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_srav_epi64(a, count).as_i64x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srav_epi64&expand=5470)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_srav_epi64(a, count).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi64&expand=5468)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i {
+ transmute(vpsravq128(a.as_i64x2(), count.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srav_epi64&expand=5466)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm_mask_srav_epi64(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_srav_epi64(a, count).as_i64x2();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srav_epi64&expand=5467)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsravq))]
+pub unsafe fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srav_epi64(a, count).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
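+// Editor's usage sketch (not upstream code; values are assumptions): per-lane
+// 64-bit arithmetic shifts are again AVX-512-only, including the plain
+// `_mm256_srav_epi64`/`_mm_srav_epi64` defined above.
+//
+//     let a = _mm_set_epi64x(-2, -64);
+//     let counts = _mm_set_epi64x(1, 5);
+//     let r = _mm_srav_epi64(a, counts);
+//     // lane 0: -64 >> 5 = -2, lane 1: -2 >> 1 = -1
+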
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rolv_epi32&expand=4703)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vprolvd(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rolv_epi32&expand=4701)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm512_mask_rolv_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let rol = _mm512_rolv_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, rol, src.as_i32x16()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rolv_epi32&expand=4702)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let rol = _mm512_rolv_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, rol, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rolv_epi32&expand=4700)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vprolvd256(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rolv_epi32&expand=4698)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm256_mask_rolv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let rol = _mm256_rolv_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, rol, src.as_i32x8()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rolv_epi32&expand=4699)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let rol = _mm256_rolv_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, rol, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rolv_epi32&expand=4697)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vprolvd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rolv_epi32&expand=4695)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm_mask_rolv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let rol = _mm_rolv_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, rol, src.as_i32x4()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rolv_epi32&expand=4696)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvd))]
+pub unsafe fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let rol = _mm_rolv_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, rol, zero))
+}
+
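+// Editor's usage sketch (not upstream code; values are assumptions): rotation
+// wraps the bits that fall off the left edge back in on the right, so no
+// information is lost (rotating by 32 is a no-op).
+//
+//     let a = _mm_set1_epi32(0x8000_0001u32 as i32);
+//     let b = _mm_set1_epi32(1);            // rotate each lane left by 1
+//     let r = _mm_rolv_epi32(a, b);         // every lane: 0x0000_0003
+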
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi32&expand=4739)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vprorvd(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi32&expand=4737)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm512_mask_rorv_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let ror = _mm512_rorv_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, ror, src.as_i32x16()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi32&expand=4738)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let ror = _mm512_rorv_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, ror, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rorv_epi32&expand=4736)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vprorvd256(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rorv_epi32&expand=4734)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm256_mask_rorv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let ror = _mm256_rorv_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, ror, src.as_i32x8()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rorv_epi32&expand=4735)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let ror = _mm256_rorv_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, ror, zero))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rorv_epi32&expand=4733)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vprorvd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rorv_epi32&expand=4731)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm_mask_rorv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let ror = _mm_rorv_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, ror, src.as_i32x4()))
+}
+
+/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rorv_epi32&expand=4732)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvd))]
+pub unsafe fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let ror = _mm_rorv_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, ror, zero))
+}
+
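+// Editor's usage sketch (not upstream code; values are assumptions): `rorv`
+// mirrors `rolv`, so rotating right by n equals rotating left by 32 - n.
+//
+//     let a = _mm_set1_epi32(0x0000_0003);
+//     let b = _mm_set1_epi32(1);
+//     let r = _mm_rorv_epi32(a, b);         // every lane: 0x8000_0001
+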
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rolv_epi64&expand=4712)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vprolvq(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rolv_epi64&expand=4710)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let rol = _mm512_rolv_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, rol, src.as_i64x8()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rolv_epi64&expand=4711)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let rol = _mm512_rolv_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, rol, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rolv_epi64&expand=4709)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vprolvq256(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rolv_epi64&expand=4707)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm256_mask_rolv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let rol = _mm256_rolv_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, rol, src.as_i64x4()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rolv_epi64&expand=4708)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let rol = _mm256_rolv_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, rol, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rolv_epi64&expand=4706)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vprolvq128(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rolv_epi64&expand=4704)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm_mask_rolv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let rol = _mm_rolv_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, rol, src.as_i64x2()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rolv_epi64&expand=4705)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprolvq))]
+pub unsafe fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let rol = _mm_rolv_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, rol, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi64&expand=4748)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vprorvq(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi64&expand=4746)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let ror = _mm512_rorv_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, ror, src.as_i64x8()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi64&expand=4747)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let ror = _mm512_rorv_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, ror, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rorv_epi64&expand=4745)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vprorvq256(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rorv_epi64&expand=4743)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm256_mask_rorv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let ror = _mm256_rorv_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, ror, src.as_i64x4()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rorv_epi64&expand=4744)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let ror = _mm256_rorv_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, ror, zero))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rorv_epi64&expand=4742)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vprorvq128(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rorv_epi64&expand=4740)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm_mask_rorv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let ror = _mm_rorv_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, ror, src.as_i64x2()))
+}
+
+/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rorv_epi64&expand=4741)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vprorvq))]
+pub unsafe fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let ror = _mm_rorv_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, ror, zero))
+}
+
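+// Editor's usage sketch covering both 64-bit rotate directions above (not
+// upstream code; values are assumptions): they behave like the 32-bit
+// rotates with a 64-bit lane width, so rotating left by n and right by
+// 64 - n coincide.
+//
+//     let a = _mm_set1_epi64x(1);
+//     let r = _mm_rolv_epi64(a, _mm_set1_epi64x(63)); // i64::MIN in each lane
+//     let s = _mm_rorv_epi64(a, _mm_set1_epi64x(1));  // same result
+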
+/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sllv_epi32&expand=5342)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+pub unsafe fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsllvd(a.as_i32x16(), count.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sllv_epi32&expand=5340)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+pub unsafe fn _mm512_mask_sllv_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_sllv_epi32(a, count).as_i32x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sllv_epi32&expand=5341)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+pub unsafe fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_sllv_epi32(a, count).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sllv_epi32&expand=5337)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+pub unsafe fn _mm256_mask_sllv_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_sllv_epi32(a, count).as_i32x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sllv_epi32&expand=5338)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+pub unsafe fn _mm256_maskz_sllv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_sllv_epi32(a, count).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sllv_epi32&expand=5334)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+pub unsafe fn _mm_mask_sllv_epi32(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_sllv_epi32(a, count).as_i32x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sllv_epi32&expand=5335)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+pub unsafe fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sllv_epi32(a, count).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
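+// Illustrative sketch, not part of the upstream commit (`src` stands for any
+// previously computed vector): the shift count is taken per lane, and a count
+// of 32 or more zeroes the lane rather than being reduced modulo 32:
+//
+//     let a = _mm512_set1_epi32(1);
+//     let count = _mm512_set1_epi32(3);
+//     let r = _mm512_mask_sllv_epi32(src, 0b00000000_11111111, a, count);
+//     // low 8 lanes == 1 << 3 == 8, high 8 lanes copied from `src`
+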
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srlv_epi32&expand=5554)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+pub unsafe fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srlv_epi32&expand=5552)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+pub unsafe fn _mm512_mask_srlv_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_srlv_epi32(a, count).as_i32x16();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srlv_epi32&expand=5553)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+pub unsafe fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_srlv_epi32(a, count).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srlv_epi32&expand=5549)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+pub unsafe fn _mm256_mask_srlv_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_srlv_epi32(a, count).as_i32x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srlv_epi32&expand=5550)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+pub unsafe fn _mm256_maskz_srlv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_srlv_epi32(a, count).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srlv_epi32&expand=5546)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+pub unsafe fn _mm_mask_srlv_epi32(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_srlv_epi32(a, count).as_i32x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srlv_epi32&expand=5547)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+pub unsafe fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srlv_epi32(a, count).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
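+// Illustrative sketch, not part of the upstream commit: `srlv` is a logical
+// shift, so the vacated high bits fill with zeros even for negative inputs:
+//
+//     let a = _mm_set1_epi32(-8);
+//     let count = _mm_set1_epi32(1);
+//     let r = _mm_maskz_srlv_epi32(0b0011, a, count);
+//     // lanes 0-1 == 0x7FFF_FFFC (not -4), lanes 2-3 zeroed by the mask
+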
+/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sllv_epi64&expand=5351)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+pub unsafe fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsllvq(a.as_i64x8(), count.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sllv_epi64&expand=5349)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+pub unsafe fn _mm512_mask_sllv_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_sllv_epi64(a, count).as_i64x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sllv_epi64&expand=5350)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+pub unsafe fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_sllv_epi64(a, count).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sllv_epi64&expand=5346)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+pub unsafe fn _mm256_mask_sllv_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_sllv_epi64(a, count).as_i64x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sllv_epi64&expand=5347)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+pub unsafe fn _mm256_maskz_sllv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_sllv_epi64(a, count).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sllv_epi64&expand=5343)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+pub unsafe fn _mm_mask_sllv_epi64(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_sllv_epi64(a, count).as_i64x2();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sllv_epi64&expand=5344)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+pub unsafe fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_sllv_epi64(a, count).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
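+// Illustrative sketch, not part of the upstream commit: the 64-bit form
+// behaves like the 32-bit one, with counts of 64 or more zeroing the lane:
+//
+//     let a = _mm512_set1_epi64(1);
+//     let count = _mm512_set1_epi64(64);
+//     let r = _mm512_sllv_epi64(a, count); // every lane == 0
+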
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srlv_epi64&expand=5563)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub unsafe fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i {
+ transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srlv_epi64&expand=5561)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub unsafe fn _mm512_mask_srlv_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ count: __m512i,
+) -> __m512i {
+ let shf = _mm512_srlv_epi64(a, count).as_i64x8();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srlv_epi64&expand=5562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub unsafe fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
+ let shf = _mm512_srlv_epi64(a, count).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srlv_epi64&expand=5558)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub unsafe fn _mm256_mask_srlv_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ count: __m256i,
+) -> __m256i {
+ let shf = _mm256_srlv_epi64(a, count).as_i64x4();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srlv_epi64&expand=5559)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub unsafe fn _mm256_maskz_srlv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
+ let shf = _mm256_srlv_epi64(a, count).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srlv_epi64&expand=5555)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub unsafe fn _mm_mask_srlv_epi64(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ count: __m128i,
+) -> __m128i {
+ let shf = _mm_srlv_epi64(a, count).as_i64x2();
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srlv_epi64&expand=5556)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
+ let shf = _mm_srlv_epi64(a, count).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
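+// Illustrative sketch, not part of the upstream commit (`src` is hypothetical):
+// per-lane logical right shift of 64-bit lanes, with masked-off lanes copied
+// from `src`:
+//
+//     let a = _mm512_set1_epi64(16);
+//     let count = _mm512_set1_epi64(4);
+//     let r = _mm512_mask_srlv_epi64(src, 0b0000_1111, a, count);
+//     // lanes 0-3 == 1, lanes 4-7 == src
+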
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_ps&expand=4170)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
+ static_assert_imm8!(MASK);
+ simd_shuffle16!(
+ a,
+ a,
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ ((MASK as u32 >> 4) & 0b11),
+ ((MASK as u32 >> 6) & 0b11),
+ (MASK as u32 & 0b11) + 4,
+ ((MASK as u32 >> 2) & 0b11) + 4,
+ ((MASK as u32 >> 4) & 0b11) + 4,
+ ((MASK as u32 >> 6) & 0b11) + 4,
+ (MASK as u32 & 0b11) + 8,
+ ((MASK as u32 >> 2) & 0b11) + 8,
+ ((MASK as u32 >> 4) & 0b11) + 8,
+ ((MASK as u32 >> 6) & 0b11) + 8,
+ (MASK as u32 & 0b11) + 12,
+ ((MASK as u32 >> 2) & 0b11) + 12,
+ ((MASK as u32 >> 4) & 0b11) + 12,
+ ((MASK as u32 >> 6) & 0b11) + 12,
+ ],
+ )
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_ps&expand=4168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_permute_ps<const MASK: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_imm8!(MASK);
+ let r = _mm512_permute_ps::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_ps&expand=4169)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) -> __m512 {
+ static_assert_imm8!(MASK);
+ let r = _mm512_permute_ps::<MASK>(a);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r.as_f32x16(), zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permute_ps&expand=4165)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_permute_ps<const MASK: i32>(
+ src: __m256,
+ k: __mmask8,
+ a: __m256,
+) -> __m256 {
+ static_assert_imm8!(MASK);
+ let r = _mm256_permute_ps::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permute_ps&expand=4166)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -> __m256 {
+ static_assert_imm8!(MASK);
+ let r = _mm256_permute_ps::<MASK>(a);
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, r.as_f32x8(), zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permute_ps&expand=4162)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ static_assert_imm8!(MASK);
+ let r = _mm_permute_ps::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permute_ps&expand=4163)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> __m128 {
+ static_assert_imm8!(MASK);
+ let r = _mm_permute_ps::<MASK>(a);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, r.as_f32x4(), zero))
+}
+
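+// Illustrative sketch, not part of the upstream commit: MASK packs four 2-bit
+// source indices that are applied to every 128-bit lane independently, i.e.
+// `_mm_permute_ps` repeated four times:
+//
+//     let r = _mm512_permute_ps::<0b00_00_00_00>(a);
+//     // broadcasts element 0 of each lane:
+//     // [a0, a0, a0, a0, a4, a4, a4, a4, a8, ..., a12, a12, a12, a12]
+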
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_pd&expand=4161)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
+ static_assert_imm8!(MASK);
+ simd_shuffle8!(
+ a,
+ a,
+ <const MASK: i32> [
+ MASK as u32 & 0b1,
+ ((MASK as u32 >> 1) & 0b1),
+ ((MASK as u32 >> 2) & 0b1) + 2,
+ ((MASK as u32 >> 3) & 0b1) + 2,
+ ((MASK as u32 >> 4) & 0b1) + 4,
+ ((MASK as u32 >> 5) & 0b1) + 4,
+ ((MASK as u32 >> 6) & 0b1) + 6,
+ ((MASK as u32 >> 7) & 0b1) + 6,
+ ],
+ )
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_pd&expand=4159)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_permute_pd<const MASK: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm8!(MASK);
+ let r = _mm512_permute_pd::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_pd&expand=4160)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
+ static_assert_imm8!(MASK);
+ let r = _mm512_permute_pd::<MASK>(a);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permute_pd&expand=4156)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_permute_pd<const MASK: i32>(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+) -> __m256d {
+ static_assert_imm4!(MASK);
+ let r = _mm256_permute_pd::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permute_pd&expand=4157)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
+ static_assert_imm4!(MASK);
+ let r = _mm256_permute_pd::<MASK>(a);
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permute_pd&expand=4153)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0b01))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_permute_pd<const IMM2: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+) -> __m128d {
+ static_assert_imm2!(IMM2);
+ let r = _mm_permute_pd::<IMM2>(a);
+ transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permute_pd&expand=4154)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0b01))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d {
+ static_assert_imm2!(IMM2);
+ let r = _mm_permute_pd::<IMM2>(a);
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, r.as_f64x2(), zero))
+}
+
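+// Illustrative sketch, not part of the upstream commit: each 128-bit lane
+// consumes two control bits, one per double, so `0b01` repeated swaps the
+// pair in every lane:
+//
+//     let r = _mm512_permute_pd::<0b01_01_01_01>(a);
+//     // [a1, a0, a3, a2, a5, a4, a7, a6]
+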
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_epi64&expand=4208)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermq
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
+ static_assert_imm8!(MASK);
+ simd_shuffle8!(
+ a,
+ a,
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ ((MASK as u32 >> 4) & 0b11),
+ ((MASK as u32 >> 6) & 0b11),
+ (MASK as u32 & 0b11) + 4,
+ ((MASK as u32 >> 2) & 0b11) + 4,
+ ((MASK as u32 >> 4) & 0b11) + 4,
+ ((MASK as u32 >> 6) & 0b11) + 4,
+ ],
+ )
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_epi64&expand=4206)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermq
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_permutex_epi64<const MASK: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r = _mm512_permutex_epi64::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_epi64&expand=4207)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermq
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m512i) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r = _mm512_permutex_epi64::<MASK>(a);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r.as_i64x8(), zero))
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex_epi64&expand=4205)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermq
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(MASK);
+ simd_shuffle4!(
+ a,
+ a,
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ ((MASK as u32 >> 4) & 0b11),
+ ((MASK as u32 >> 6) & 0b11),
+ ],
+ )
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex_epi64&expand=4203)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermq
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_permutex_epi64<const MASK: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r = _mm256_permutex_epi64::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+}
+
+/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex_epi64&expand=4204)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermq
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m256i) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r = _mm256_permutex_epi64::<MASK>(a);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r.as_i64x4(), zero))
+}
+
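+// Illustrative sketch, not part of the upstream commit: `permutex` selects
+// 64-bit elements with 2-bit indices inside each 256-bit half, so
+// `0b00_01_10_11` reverses both halves:
+//
+//     let r = _mm512_permutex_epi64::<0b00_01_10_11>(a);
+//     // [a3, a2, a1, a0, a7, a6, a5, a4]
+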
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_pd&expand=4214)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermpd
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
+ static_assert_imm8!(MASK);
+ simd_shuffle8!(
+ a,
+ a,
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ ((MASK as u32 >> 4) & 0b11),
+ ((MASK as u32 >> 6) & 0b11),
+ (MASK as u32 & 0b11) + 4,
+ ((MASK as u32 >> 2) & 0b11) + 4,
+ ((MASK as u32 >> 4) & 0b11) + 4,
+ ((MASK as u32 >> 6) & 0b11) + 4,
+ ],
+ )
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_pd&expand=4212)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermpd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_permutex_pd<const MASK: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_imm8!(MASK);
+ let r = _mm512_permutex_pd::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_pd&expand=4213)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermpd
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
+ static_assert_imm8!(MASK);
+ let r = _mm512_permutex_pd::<MASK>(a);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex_pd&expand=4211)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermpd
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
+ static_assert_imm8!(MASK);
+ simd_shuffle4!(
+ a,
+ a,
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ ((MASK as u32 >> 4) & 0b11),
+ ((MASK as u32 >> 6) & 0b11),
+ ],
+ )
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex_pd&expand=4209)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermpd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_permutex_pd<const MASK: i32>(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+) -> __m256d {
+ static_assert_imm8!(MASK);
+ let r = _mm256_permutex_pd::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex_pd&expand=4210)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermpd
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
+ static_assert_imm8!(MASK);
+ let r = _mm256_permutex_pd::<MASK>(a);
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
+}
+
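+// Illustrative sketch, not part of the upstream commit: the double-precision
+// form uses the same control encoding, e.g. a MASK of zero broadcasts
+// element 0 of each 256-bit half:
+//
+//     let r = _mm512_permutex_pd::<0b00_00_00_00>(a);
+//     // [a0, a0, a0, a0, a4, a4, a4, a4]
+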
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_epi32&expand=4182)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermd
+pub unsafe fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i {
+ transmute(vpermd(a.as_i32x16(), idx.as_i32x16()))
+}
+
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_epi32&expand=4181)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermd))]
+pub unsafe fn _mm512_mask_permutevar_epi32(
+ src: __m512i,
+ k: __mmask16,
+ idx: __m512i,
+ a: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutevar_epi32(idx, a).as_i32x16();
+ transmute(simd_select_bitmask(k, permute, src.as_i32x16()))
+}
+
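+// Illustrative sketch, not part of the upstream commit: despite the
+// `permutevar` name this is a full cross-lane permute (only the low 4 bits of
+// each index element are used), and `idx` is the first argument:
+//
+//     let idx = _mm512_set1_epi32(0);
+//     let r = _mm512_permutevar_epi32(idx, a); // broadcasts lane 0 of `a`
+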
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_ps&expand=4200)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 {
+ transmute(vpermilps(a.as_f32x16(), b.as_i32x16()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_ps&expand=4198)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm512_mask_permutevar_ps(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512i,
+) -> __m512 {
+ let permute = _mm512_permutevar_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, permute, src.as_f32x16()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_ps&expand=4199)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 {
+ let permute = _mm512_permutevar_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutevar_ps&expand=4195)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm256_mask_permutevar_ps(src: __m256, k: __mmask8, a: __m256, b: __m256i) -> __m256 {
+ let permute = _mm256_permutevar_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, permute, src.as_f32x8()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutevar_ps&expand=4196)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm256_maskz_permutevar_ps(k: __mmask8, a: __m256, b: __m256i) -> __m256 {
+ let permute = _mm256_permutevar_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutevar_ps&expand=4192)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm_mask_permutevar_ps(src: __m128, k: __mmask8, a: __m128, b: __m128i) -> __m128 {
+ let permute = _mm_permutevar_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, permute, src.as_f32x4()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutevar_ps&expand=4193)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm_maskz_permutevar_ps(k: __mmask8, a: __m128, b: __m128i) -> __m128 {
+ let permute = _mm_permutevar_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
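+// Illustrative sketch, not part of the upstream commit: here the control is a
+// vector rather than an immediate, and only the low 2 bits of each element of
+// `b` select within that element's own 128-bit lane:
+//
+//     let b = _mm512_set1_epi32(3);
+//     let r = _mm512_permutevar_ps(a, b);
+//     // [a3, a3, a3, a3, a7, a7, a7, a7, a11, ..., a15]
+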
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_pd&expand=4191)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d {
+ transmute(vpermilpd(a.as_f64x8(), b.as_i64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_pd&expand=4189)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm512_mask_permutevar_pd(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512i,
+) -> __m512d {
+ let permute = _mm512_permutevar_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_pd&expand=4190)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d {
+ let permute = _mm512_permutevar_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutevar_pd&expand=4186)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm256_mask_permutevar_pd(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+ b: __m256i,
+) -> __m256d {
+ let permute = _mm256_permutevar_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, permute, src.as_f64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutevar_pd&expand=4187)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm256_maskz_permutevar_pd(k: __mmask8, a: __m256d, b: __m256i) -> __m256d {
+ let permute = _mm256_permutevar_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutevar_pd&expand=4183)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm_mask_permutevar_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128i) -> __m128d {
+ let permute = _mm_permutevar_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, permute, src.as_f64x2()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutevar_pd&expand=4184)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm_maskz_permutevar_pd(k: __mmask8, a: __m128d, b: __m128i) -> __m128d {
+ let permute = _mm_permutevar_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
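+// Illustrative sketch, not part of the upstream commit: in the
+// double-precision variable form the selector is bit 1 (not bit 0) of each
+// 64-bit control element, matching `vpermilpd`:
+//
+//     let b = _mm512_set1_epi64(1 << 1);
+//     let r = _mm512_permutevar_pd(a, b);
+//     // [a1, a1, a3, a3, a5, a5, a7, a7]
+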
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi32&expand=4301)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermd
+pub unsafe fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i {
+ transmute(vpermd(a.as_i32x16(), idx.as_i32x16()))
+}
+
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi32&expand=4299)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermd))]
+pub unsafe fn _mm512_mask_permutexvar_epi32(
+ src: __m512i,
+ k: __mmask16,
+ idx: __m512i,
+ a: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16();
+ transmute(simd_select_bitmask(k, permute, src.as_i32x16()))
+}
+
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi32&expand=4300)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermd))]
+pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i {
+ let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_epi32&expand=4298)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermd
+pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i {
+ transmute(_mm256_permutevar8x32_epi32(a, idx)) // LLVM uses llvm.x86.avx2.permd
+}
+
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_epi32&expand=4296)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermd))]
+pub unsafe fn _mm256_mask_permutexvar_epi32(
+ src: __m256i,
+ k: __mmask8,
+ idx: __m256i,
+ a: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8();
+ transmute(simd_select_bitmask(k, permute, src.as_i32x8()))
+}
+
+/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_epi32&expand=4297)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermd))]
+pub unsafe fn _mm256_maskz_permutexvar_epi32(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i {
+ let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
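+// Illustrative sketch, not part of the upstream commit: `permutexvar` indexes
+// across the whole register, so a descending index vector reverses it:
+//
+//     let idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+//     let r = _mm512_permutexvar_epi32(idx, a); // [a15, a14, ..., a0]
+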
+/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi64&expand=4307)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermq
+pub unsafe fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i {
+ transmute(vpermq(a.as_i64x8(), idx.as_i64x8()))
+}
+
+/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi64&expand=4305)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermq))]
+pub unsafe fn _mm512_mask_permutexvar_epi64(
+ src: __m512i,
+ k: __mmask8,
+ idx: __m512i,
+ a: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8();
+ transmute(simd_select_bitmask(k, permute, src.as_i64x8()))
+}
+
+/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi64&expand=4306)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermq))]
+pub unsafe fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i {
+ let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_epi64&expand=4304)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermq
+pub unsafe fn _mm256_permutexvar_epi64(idx: __m256i, a: __m256i) -> __m256i {
+ transmute(vpermq256(a.as_i64x4(), idx.as_i64x4()))
+}
+
+/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_epi64&expand=4302)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermq))]
+pub unsafe fn _mm256_mask_permutexvar_epi64(
+ src: __m256i,
+ k: __mmask8,
+ idx: __m256i,
+ a: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4();
+ transmute(simd_select_bitmask(k, permute, src.as_i64x4()))
+}
+
+/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_epi64&expand=4303)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermq))]
+pub unsafe fn _mm256_maskz_permutexvar_epi64(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i {
+ let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
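+// Illustrative sketch, not part of the upstream commit: only the low 3 bits of
+// each 64-bit index element are used in the eight-lane form:
+//
+//     let idx = _mm512_set1_epi64(7);
+//     let r = _mm512_permutexvar_epi64(idx, a); // broadcasts lane 7 of `a`
+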
+/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ps&expand=4200)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 {
+ transmute(vpermps(a.as_f32x16(), idx.as_i32x16()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_ps&expand=4326)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm512_mask_permutexvar_ps(
+ src: __m512,
+ k: __mmask16,
+ idx: __m512i,
+ a: __m512,
+) -> __m512 {
+ let permute = _mm512_permutexvar_ps(idx, a).as_f32x16();
+ transmute(simd_select_bitmask(k, permute, src.as_f32x16()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_ps&expand=4327)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 {
+ let permute = _mm512_permutexvar_ps(idx, a).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
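+
+// A minimal usage sketch (illustrative only; assumes AVX-512F). With a
+// splatted index every result lane reads the same source element, and the
+// write-masked form merges the untouched lanes from src:
+//
+// unsafe fn permutexvar_ps_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm512_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.,
+//                            8., 9., 10., 11., 12., 13., 14., 15.);
+//     let idx = _mm512_set1_epi32(3);
+//     let r = _mm512_permutexvar_ps(idx, a); // every lane == 3.0
+//     let src = _mm512_set1_ps(-1.0);
+//     let m = _mm512_mask_permutexvar_ps(src, 0b00000000_11111111, idx, a);
+//     // m: low eight lanes == 3.0, high eight lanes == -1.0
+// }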
+
+/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ps&expand=4325)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 {
+ transmute(_mm256_permutevar8x32_ps(a, idx)) //llvm.x86.avx2.permps
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_ps&expand=4323)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm256_mask_permutexvar_ps(
+ src: __m256,
+ k: __mmask8,
+ idx: __m256i,
+ a: __m256,
+) -> __m256 {
+ let permute = _mm256_permutexvar_ps(idx, a).as_f32x8();
+ transmute(simd_select_bitmask(k, permute, src.as_f32x8()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_ps&expand=4324)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm256_maskz_permutexvar_ps(k: __mmask8, idx: __m256i, a: __m256) -> __m256 {
+ let permute = _mm256_permutexvar_ps(idx, a).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_pd&expand=4322)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d {
+ transmute(vpermpd(a.as_f64x8(), idx.as_i64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_pd&expand=4320)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm512_mask_permutexvar_pd(
+ src: __m512d,
+ k: __mmask8,
+ idx: __m512i,
+ a: __m512d,
+) -> __m512d {
+ let permute = _mm512_permutexvar_pd(idx, a).as_f64x8();
+ transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_pd&expand=4321)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d {
+ let permute = _mm512_permutexvar_pd(idx, a).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_pd&expand=4319)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm256_permutexvar_pd(idx: __m256i, a: __m256d) -> __m256d {
+ transmute(vpermpd256(a.as_f64x4(), idx.as_i64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_pd&expand=4317)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm256_mask_permutexvar_pd(
+ src: __m256d,
+ k: __mmask8,
+ idx: __m256i,
+ a: __m256d,
+) -> __m256d {
+ let permute = _mm256_permutexvar_pd(idx, a).as_f64x4();
+ transmute(simd_select_bitmask(k, permute, src.as_f64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_pd&expand=4318)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm256_maskz_permutexvar_pd(k: __mmask8, idx: __m256i, a: __m256d) -> __m256d {
+ let permute = _mm256_permutexvar_pd(idx, a).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
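+
+// A minimal usage sketch for the 256-bit form (illustrative only; assumes
+// AVX-512F + AVX-512VL). Unlike the AVX2 vpermpd form, whose lane order is a
+// compile-time immediate, the order here comes from a vector and can be
+// computed at runtime:
+//
+// unsafe fn permutexvar_pd_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm256_setr_pd(10.0, 11.0, 12.0, 13.0);
+//     let idx = _mm256_setr_epi64x(3, 2, 1, 0);
+//     let r = _mm256_permutexvar_pd(idx, a);
+//     // r == [13.0, 12.0, 11.0, 10.0]
+// }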
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi32&expand=4238)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d
+pub unsafe fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
+ transmute(vpermi2d(a.as_i32x16(), idx.as_i32x16(), b.as_i32x16()))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi32&expand=4235)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermt2d))]
+pub unsafe fn _mm512_mask_permutex2var_epi32(
+ a: __m512i,
+ k: __mmask16,
+ idx: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
+ transmute(simd_select_bitmask(k, permute, a.as_i32x16()))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi32&expand=4237)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d
+pub unsafe fn _mm512_maskz_permutex2var_epi32(
+ k: __mmask16,
+ a: __m512i,
+ idx: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi32&expand=4236)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermi2d))]
+pub unsafe fn _mm512_mask2_permutex2var_epi32(
+ a: __m512i,
+ idx: __m512i,
+ k: __mmask16,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
+ transmute(simd_select_bitmask(k, permute, idx.as_i32x16()))
+}
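+
+// A minimal usage sketch (illustrative only; assumes AVX-512F). For the
+// 16-lane form, bits 3:0 of each index select a lane and bit 4 selects the
+// source: indices 0..=15 read from a, 16..=31 read from b. Interleaving the
+// low halves of two vectors:
+//
+// unsafe fn permutex2var_epi32_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm512_set1_epi32(100);
+//     let b = _mm512_set1_epi32(200);
+//     let idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19,
+//                                 4, 20, 5, 21, 6, 22, 7, 23);
+//     let r = _mm512_permutex2var_epi32(a, idx, b);
+//     // r alternates 100, 200, 100, 200, ...
+// }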
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_epi32&expand=4234)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d
+pub unsafe fn _mm256_permutex2var_epi32(a: __m256i, idx: __m256i, b: __m256i) -> __m256i {
+ transmute(vpermi2d256(a.as_i32x8(), idx.as_i32x8(), b.as_i32x8()))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_epi32&expand=4231)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2d))]
+pub unsafe fn _mm256_mask_permutex2var_epi32(
+ a: __m256i,
+ k: __mmask8,
+ idx: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8();
+ transmute(simd_select_bitmask(k, permute, a.as_i32x8()))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_epi32&expand=4233)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d
+pub unsafe fn _mm256_maskz_permutex2var_epi32(
+ k: __mmask8,
+ a: __m256i,
+ idx: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_epi32&expand=4232)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermi2d))]
+pub unsafe fn _mm256_mask2_permutex2var_epi32(
+ a: __m256i,
+ idx: __m256i,
+ k: __mmask8,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8();
+ transmute(simd_select_bitmask(k, permute, idx.as_i32x8()))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_epi32&expand=4230)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d
+pub unsafe fn _mm_permutex2var_epi32(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
+ transmute(vpermi2d128(a.as_i32x4(), idx.as_i32x4(), b.as_i32x4()))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_epi32&expand=4227)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2d))]
+pub unsafe fn _mm_mask_permutex2var_epi32(
+ a: __m128i,
+ k: __mmask8,
+ idx: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4();
+ transmute(simd_select_bitmask(k, permute, a.as_i32x4()))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_epi32&expand=4229)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d
+pub unsafe fn _mm_maskz_permutex2var_epi32(
+ k: __mmask8,
+ a: __m128i,
+ idx: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_epi32&expand=4228)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermi2d))]
+pub unsafe fn _mm_mask2_permutex2var_epi32(
+ a: __m128i,
+ idx: __m128i,
+ k: __mmask8,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4();
+ transmute(simd_select_bitmask(k, permute, idx.as_i32x4()))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi64&expand=4250)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q
+pub unsafe fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
+ transmute(vpermi2q(a.as_i64x8(), idx.as_i64x8(), b.as_i64x8()))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi64&expand=4247)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermt2q))]
+pub unsafe fn _mm512_mask_permutex2var_epi64(
+ a: __m512i,
+ k: __mmask8,
+ idx: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
+ transmute(simd_select_bitmask(k, permute, a.as_i64x8()))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi64&expand=4249)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q
+pub unsafe fn _mm512_maskz_permutex2var_epi64(
+ k: __mmask8,
+ a: __m512i,
+ idx: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi64&expand=4248)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermi2q))]
+pub unsafe fn _mm512_mask2_permutex2var_epi64(
+ a: __m512i,
+ idx: __m512i,
+ k: __mmask8,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
+ transmute(simd_select_bitmask(k, permute, idx.as_i64x8()))
+}
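+
+// A minimal usage sketch (illustrative only; assumes AVX-512F). Treating a:b
+// as one 16-lane sequence, indices 4..=11 extract the high half of a followed
+// by the low half of b, i.e. a lane-granular concatenate-and-shift:
+//
+// unsafe fn permutex2var_epi64_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+//     let b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+//     let idx = _mm512_setr_epi64(4, 5, 6, 7, 8, 9, 10, 11);
+//     let r = _mm512_permutex2var_epi64(a, idx, b);
+//     // r == [4, 5, 6, 7, 8, 9, 10, 11]
+// }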
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_epi64&expand=4246)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q
+pub unsafe fn _mm256_permutex2var_epi64(a: __m256i, idx: __m256i, b: __m256i) -> __m256i {
+ transmute(vpermi2q256(a.as_i64x4(), idx.as_i64x4(), b.as_i64x4()))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_epi64&expand=4243)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2q))]
+pub unsafe fn _mm256_mask_permutex2var_epi64(
+ a: __m256i,
+ k: __mmask8,
+ idx: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4();
+ transmute(simd_select_bitmask(k, permute, a.as_i64x4()))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_epi64&expand=4245)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q
+pub unsafe fn _mm256_maskz_permutex2var_epi64(
+ k: __mmask8,
+ a: __m256i,
+ idx: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_epi64&expand=4244)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermi2q))]
+pub unsafe fn _mm256_mask2_permutex2var_epi64(
+ a: __m256i,
+ idx: __m256i,
+ k: __mmask8,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4();
+ transmute(simd_select_bitmask(k, permute, idx.as_i64x4()))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_epi64&expand=4242)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q
+pub unsafe fn _mm_permutex2var_epi64(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
+ transmute(vpermi2q128(a.as_i64x2(), idx.as_i64x2(), b.as_i64x2()))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_epi64&expand=4239)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2q))]
+pub unsafe fn _mm_mask_permutex2var_epi64(
+ a: __m128i,
+ k: __mmask8,
+ idx: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2();
+ transmute(simd_select_bitmask(k, permute, a.as_i64x2()))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_epi64&expand=4241)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q
+pub unsafe fn _mm_maskz_permutex2var_epi64(
+ k: __mmask8,
+ a: __m128i,
+ idx: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_epi64&expand=4240)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermi2q))]
+pub unsafe fn _mm_mask2_permutex2var_epi64(
+ a: __m128i,
+ idx: __m128i,
+ k: __mmask8,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2();
+ transmute(simd_select_bitmask(k, permute, idx.as_i64x2()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ps&expand=4286)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps
+pub unsafe fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 {
+ transmute(vpermi2ps(a.as_f32x16(), idx.as_i32x16(), b.as_f32x16()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_ps&expand=4283)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermt2ps))]
+pub unsafe fn _mm512_mask_permutex2var_ps(
+ a: __m512,
+ k: __mmask16,
+ idx: __m512i,
+ b: __m512,
+) -> __m512 {
+ let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
+ transmute(simd_select_bitmask(k, permute, a.as_f32x16()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_ps&expand=4285)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps
+pub unsafe fn _mm512_maskz_permutex2var_ps(
+ k: __mmask16,
+ a: __m512,
+ idx: __m512i,
+ b: __m512,
+) -> __m512 {
+ let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=4284)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but the compiler emits vpermt2ps
+pub unsafe fn _mm512_mask2_permutex2var_ps(
+ a: __m512,
+ idx: __m512i,
+ k: __mmask16,
+ b: __m512,
+) -> __m512 {
+ let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
+ let idx = _mm512_castsi512_ps(idx).as_f32x16();
+ transmute(simd_select_bitmask(k, permute, idx))
+}
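+
+// A minimal usage sketch (illustrative only; assumes AVX-512F). Note that in
+// the mask2 form above, lanes with a clear mask bit keep the index bits
+// reinterpreted as floats, which is rarely what numeric code wants:
+//
+// unsafe fn permutex2var_ps_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm512_set1_ps(1.0);
+//     let b = _mm512_set1_ps(2.0);
+//     let idx = _mm512_set1_epi32(16); // bit 4 set -> element 0 of b
+//     let r = _mm512_permutex2var_ps(a, idx, b);
+//     // r: every lane == 2.0
+// }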
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ps&expand=4282)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps
+pub unsafe fn _mm256_permutex2var_ps(a: __m256, idx: __m256i, b: __m256) -> __m256 {
+ transmute(vpermi2ps256(a.as_f32x8(), idx.as_i32x8(), b.as_f32x8()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_ps&expand=4279)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2ps))]
+pub unsafe fn _mm256_mask_permutex2var_ps(
+ a: __m256,
+ k: __mmask8,
+ idx: __m256i,
+ b: __m256,
+) -> __m256 {
+ let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8();
+ transmute(simd_select_bitmask(k, permute, a.as_f32x8()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_ps&expand=4281)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps
+pub unsafe fn _mm256_maskz_permutex2var_ps(
+ k: __mmask8,
+ a: __m256,
+ idx: __m256i,
+ b: __m256,
+) -> __m256 {
+ let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_ps&expand=4280)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but the compiler emits vpermt2ps
+pub unsafe fn _mm256_mask2_permutex2var_ps(
+ a: __m256,
+ idx: __m256i,
+ k: __mmask8,
+ b: __m256,
+) -> __m256 {
+ let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8();
+ let idx = _mm256_castsi256_ps(idx).as_f32x8();
+ transmute(simd_select_bitmask(k, permute, idx))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ps&expand=4278)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps
+pub unsafe fn _mm_permutex2var_ps(a: __m128, idx: __m128i, b: __m128) -> __m128 {
+ transmute(vpermi2ps128(a.as_f32x4(), idx.as_i32x4(), b.as_f32x4()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_ps&expand=4275)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2ps))]
+pub unsafe fn _mm_mask_permutex2var_ps(a: __m128, k: __mmask8, idx: __m128i, b: __m128) -> __m128 {
+ let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4();
+ transmute(simd_select_bitmask(k, permute, a.as_f32x4()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_ps&expand=4277)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps
+pub unsafe fn _mm_maskz_permutex2var_ps(k: __mmask8, a: __m128, idx: __m128i, b: __m128) -> __m128 {
+ let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_ps&expand=4276)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but the compiler emits vpermt2ps
+pub unsafe fn _mm_mask2_permutex2var_ps(a: __m128, idx: __m128i, k: __mmask8, b: __m128) -> __m128 {
+ let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4();
+ let idx = _mm_castsi128_ps(idx).as_f32x4();
+ transmute(simd_select_bitmask(k, permute, idx))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_pd&expand=4274)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
+pub unsafe fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d {
+ transmute(vpermi2pd(a.as_f64x8(), idx.as_i64x8(), b.as_f64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_pd&expand=4271)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermt2pd))]
+pub unsafe fn _mm512_mask_permutex2var_pd(
+ a: __m512d,
+ k: __mmask8,
+ idx: __m512i,
+ b: __m512d,
+) -> __m512d {
+ let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
+ transmute(simd_select_bitmask(k, permute, a.as_f64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_pd&expand=4273)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
+pub unsafe fn _mm512_maskz_permutex2var_pd(
+ k: __mmask8,
+ a: __m512d,
+ idx: __m512i,
+ b: __m512d,
+) -> __m512d {
+ let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=4272)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but the compiler emits vpermt2pd
+pub unsafe fn _mm512_mask2_permutex2var_pd(
+ a: __m512d,
+ idx: __m512i,
+ k: __mmask8,
+ b: __m512d,
+) -> __m512d {
+ let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
+ let idx = _mm512_castsi512_pd(idx).as_f64x8();
+ transmute(simd_select_bitmask(k, permute, idx))
+}
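+
+// A minimal usage sketch of the write-masked form (illustrative only; assumes
+// AVX-512F). A clear mask bit falls back to the corresponding lane of a:
+//
+// unsafe fn mask_permutex2var_pd_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm512_set1_pd(1.0);
+//     let b = _mm512_set1_pd(2.0);
+//     let idx = _mm512_set1_epi64(8); // bit 3 set -> element 0 of b
+//     let r = _mm512_mask_permutex2var_pd(a, 0b0000_1111, idx, b);
+//     // r == [2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0]
+// }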
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_pd&expand=4270)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
+pub unsafe fn _mm256_permutex2var_pd(a: __m256d, idx: __m256i, b: __m256d) -> __m256d {
+ transmute(vpermi2pd256(a.as_f64x4(), idx.as_i64x4(), b.as_f64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_pd&expand=4267)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2pd))]
+pub unsafe fn _mm256_mask_permutex2var_pd(
+ a: __m256d,
+ k: __mmask8,
+ idx: __m256i,
+ b: __m256d,
+) -> __m256d {
+ let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4();
+ transmute(simd_select_bitmask(k, permute, a.as_f64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_pd&expand=4269)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
+pub unsafe fn _mm256_maskz_permutex2var_pd(
+ k: __mmask8,
+ a: __m256d,
+ idx: __m256i,
+ b: __m256d,
+) -> __m256d {
+ let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_pd&expand=4268)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but the compiler emits vpermt2pd
+pub unsafe fn _mm256_mask2_permutex2var_pd(
+ a: __m256d,
+ idx: __m256i,
+ k: __mmask8,
+ b: __m256d,
+) -> __m256d {
+ let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4();
+ let idx = _mm256_castsi256_pd(idx).as_f64x4();
+ transmute(simd_select_bitmask(k, permute, idx))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_pd&expand=4266)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
+pub unsafe fn _mm_permutex2var_pd(a: __m128d, idx: __m128i, b: __m128d) -> __m128d {
+ transmute(vpermi2pd128(a.as_f64x2(), idx.as_i64x2(), b.as_f64x2()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_pd&expand=4263)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2pd))]
+pub unsafe fn _mm_mask_permutex2var_pd(
+ a: __m128d,
+ k: __mmask8,
+ idx: __m128i,
+ b: __m128d,
+) -> __m128d {
+ let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
+ transmute(simd_select_bitmask(k, permute, a.as_f64x2()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_pd&expand=4265)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
+pub unsafe fn _mm_maskz_permutex2var_pd(
+ k: __mmask8,
+ a: __m128d,
+ idx: __m128i,
+ b: __m128d,
+) -> __m128d {
+ let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_pd&expand=4264)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but the compiler emits vpermt2pd
+pub unsafe fn _mm_mask2_permutex2var_pd(
+ a: __m128d,
+ idx: __m128i,
+ k: __mmask8,
+ b: __m128d,
+) -> __m128d {
+ let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2();
+ let idx = _mm_castsi128_pd(idx).as_f64x2();
+ transmute(simd_select_bitmask(k, permute, idx))
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_epi32&expand=5150)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))] //should be vpshufd
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r: i32x16 = simd_shuffle16!(
+ a.as_i32x16(),
+ a.as_i32x16(),
+ <const MASK: _MM_PERM_ENUM> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ (MASK as u32 >> 4) & 0b11,
+ (MASK as u32 >> 6) & 0b11,
+ (MASK as u32 & 0b11) + 4,
+ ((MASK as u32 >> 2) & 0b11) + 4,
+ ((MASK as u32 >> 4) & 0b11) + 4,
+ ((MASK as u32 >> 6) & 0b11) + 4,
+ (MASK as u32 & 0b11) + 8,
+ ((MASK as u32 >> 2) & 0b11) + 8,
+ ((MASK as u32 >> 4) & 0b11) + 8,
+ ((MASK as u32 >> 6) & 0b11) + 8,
+ (MASK as u32 & 0b11) + 12,
+ ((MASK as u32 >> 2) & 0b11) + 12,
+ ((MASK as u32 >> 4) & 0b11) + 12,
+ ((MASK as u32 >> 6) & 0b11) + 12,
+ ],
+ );
+ transmute(r)
+}
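+
+// A minimal usage sketch (illustrative only; assumes AVX-512F, and that MASK
+// is passed as a plain integer literal via the _MM_PERM_ENUM alias). MASK is
+// read as four 2-bit fields, low field first, and the same selection is
+// applied to every 128-bit lane, so 0b00_01_10_11 reverses each lane:
+//
+// unsafe fn shuffle_epi32_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
+//                               8, 9, 10, 11, 12, 13, 14, 15);
+//     let r = _mm512_shuffle_epi32::<0b00_01_10_11>(a);
+//     // r == [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]
+// }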
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5148)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_epi32::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5149)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+ k: __mmask16,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_epi32::<MASK>(a);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r.as_i32x16(), zero))
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_epi32&expand=5145)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_epi32::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_epi32&expand=5146)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+ k: __mmask8,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_epi32::<MASK>(a);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r.as_i32x8(), zero))
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_epi32&expand=5142)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(MASK);
+ let r = _mm_shuffle_epi32::<MASK>(a);
+ transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_epi32&expand=5143)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
+ k: __mmask8,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(MASK);
+ let r = _mm_shuffle_epi32::<MASK>(a);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_ps&expand=5203)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shuffle_ps<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_imm8!(MASK);
+ simd_shuffle16!(
+ a,
+ b,
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ ((MASK as u32 >> 4) & 0b11) + 16,
+ ((MASK as u32 >> 6) & 0b11) + 16,
+ (MASK as u32 & 0b11) + 4,
+ ((MASK as u32 >> 2) & 0b11) + 4,
+ ((MASK as u32 >> 4) & 0b11) + 20,
+ ((MASK as u32 >> 6) & 0b11) + 20,
+ (MASK as u32 & 0b11) + 8,
+ ((MASK as u32 >> 2) & 0b11) + 8,
+ ((MASK as u32 >> 4) & 0b11) + 24,
+ ((MASK as u32 >> 6) & 0b11) + 24,
+ (MASK as u32 & 0b11) + 12,
+ ((MASK as u32 >> 2) & 0b11) + 12,
+ ((MASK as u32 >> 4) & 0b11) + 28,
+ ((MASK as u32 >> 6) & 0b11) + 28,
+ ],
+ )
+}
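+
+// A minimal usage sketch (illustrative only; assumes AVX-512F). Per 128-bit
+// lane, the two low 2-bit fields of MASK pick elements from a and the two
+// high fields pick from b:
+//
+// unsafe fn shuffle_ps_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm512_set1_ps(1.0);
+//     let b = _mm512_set1_ps(2.0);
+//     let r = _mm512_shuffle_ps::<0b00_00_01_00>(a, b);
+//     // per lane: [a0, a1, b0, b0] == [1.0, 1.0, 2.0, 2.0]
+// }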
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_ps&expand=5201)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shuffle_ps<const MASK: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_ps::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_ps&expand=5202)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shuffle_ps<const MASK: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_ps::<MASK>(a, b);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r.as_f32x16(), zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_ps&expand=5198)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shuffle_ps<const MASK: i32>(
+ src: __m256,
+ k: __mmask8,
+ a: __m256,
+ b: __m256,
+) -> __m256 {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_ps::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_ps&expand=5199)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shuffle_ps<const MASK: i32>(
+ k: __mmask8,
+ a: __m256,
+ b: __m256,
+) -> __m256 {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_ps::<MASK>(a, b);
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, r.as_f32x8(), zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_ps&expand=5195)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_shuffle_ps<const MASK: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm8!(MASK);
+ let r = _mm_shuffle_ps::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_ps&expand=5196)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ static_assert_imm8!(MASK);
+ let r = _mm_shuffle_ps::<MASK>(a, b);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, r.as_f32x4(), zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_pd&expand=5192)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shuffle_pd<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_imm8!(MASK);
+ simd_shuffle8!(
+ a,
+ b,
+ <const MASK: i32> [
+ MASK as u32 & 0b1,
+ ((MASK as u32 >> 1) & 0b1) + 8,
+ ((MASK as u32 >> 2) & 0b1) + 2,
+ ((MASK as u32 >> 3) & 0b1) + 10,
+ ((MASK as u32 >> 4) & 0b1) + 4,
+ ((MASK as u32 >> 5) & 0b1) + 12,
+ ((MASK as u32 >> 6) & 0b1) + 6,
+ ((MASK as u32 >> 7) & 0b1) + 14,
+ ],
+ )
+}
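+
+// A minimal usage sketch (illustrative only; assumes AVX-512F). Bit i of MASK
+// selects the low (0) or high (1) element for result slot i; within each
+// 128-bit lane, even slots read from a and odd slots from b:
+//
+// unsafe fn shuffle_pd_sketch() {
+//     use core::arch::x86_64::*;
+//     let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+//     let b = _mm512_setr_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
+//     let r = _mm512_shuffle_pd::<0b1111_1111>(a, b);
+//     // r == [1.0, 11.0, 3.0, 13.0, 5.0, 15.0, 7.0, 17.0]
+// }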
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_pd&expand=5190)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shuffle_pd<const MASK: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_pd::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_pd&expand=5191)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shuffle_pd<const MASK: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_pd::<MASK>(a, b);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_pd&expand=5187)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shuffle_pd<const MASK: i32>(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+ b: __m256d,
+) -> __m256d {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_pd::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_pd&expand=5188)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shuffle_pd<const MASK: i32>(
+ k: __mmask8,
+ a: __m256d,
+ b: __m256d,
+) -> __m256d {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_pd::<MASK>(a, b);
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_pd&expand=5184)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_shuffle_pd<const MASK: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm8!(MASK);
+ let r = _mm_shuffle_pd::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_pd&expand=5185)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_shuffle_pd<const MASK: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm8!(MASK);
+ let r = _mm_shuffle_pd::<MASK>(a, b);
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, r.as_f64x2(), zero))
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32x4&expand=5177)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_01_01_01))] //should be vshufi32x4
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shuffle_i32x4<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(MASK);
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let r: i32x16 = simd_shuffle16!(
+ a,
+ b,
+ <const MASK: i32> [
+ (MASK as u32 & 0b11) * 4 + 0,
+ (MASK as u32 & 0b11) * 4 + 1,
+ (MASK as u32 & 0b11) * 4 + 2,
+ (MASK as u32 & 0b11) * 4 + 3,
+ ((MASK as u32 >> 2) & 0b11) * 4 + 0,
+ ((MASK as u32 >> 2) & 0b11) * 4 + 1,
+ ((MASK as u32 >> 2) & 0b11) * 4 + 2,
+ ((MASK as u32 >> 2) & 0b11) * 4 + 3,
+ ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16,
+ ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16,
+ ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16,
+ ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16,
+ ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16,
+ ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16,
+ ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16,
+ ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16,
+ ],
+ );
+ transmute(r)
+}
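+
+// Illustrative sketch, not part of the upstream source: the four 2-bit fields
+// of MASK each select a whole 128-bit lane -- the two low fields index into
+// `a`, the two high fields into `b`. Test-only helper, assuming AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mm512_shuffle_i32x4() -> __m512i {
+    let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    let b = _mm512_setr_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+    // Fields (low to high) 3, 2, 1, 0 yield lanes [a.3, a.2, b.1, b.0], i.e.
+    // [12, 13, 14, 15, 8, 9, 10, 11, 20, 21, 22, 23, 16, 17, 18, 19].
+    _mm512_shuffle_i32x4::<0b00_01_10_11>(a, b)
+}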
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x4&expand=5175)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shuffle_i32x4<const MASK: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_i32x4::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32x4&expand=5176)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shuffle_i32x4<const MASK: i32>(
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_i32x4::<MASK>(a, b);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r.as_i32x16(), zero))
+}
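+
+// Illustrative sketch, not part of the upstream source: the same shuffle under
+// a writemask keeps `src` elements where the mask bit is clear, whereas the
+// zeromask variant zeroes them. Test-only helper, assuming AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_shuffle_i32x4_masking(
+    src: __m512i,
+    a: __m512i,
+    b: __m512i,
+) -> (__m512i, __m512i) {
+    // Mask 0x00ff: only the low eight dwords of the shuffle survive; the high
+    // eight come from `src` (mask form) or become zero (maskz form).
+    let merged = _mm512_mask_shuffle_i32x4::<0b11_10_01_00>(src, 0x00ff, a, b);
+    let zeroed = _mm512_maskz_shuffle_i32x4::<0b11_10_01_00>(0x00ff, a, b);
+    (merged, zeroed)
+}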
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i32x4&expand=5174)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b11))] //should be vshufi32x4
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shuffle_i32x4<const MASK: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(MASK);
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let r: i32x8 = simd_shuffle8!(
+ a,
+ b,
+ <const MASK: i32> [
+ (MASK as u32 & 0b1) * 4 + 0,
+ (MASK as u32 & 0b1) * 4 + 1,
+ (MASK as u32 & 0b1) * 4 + 2,
+ (MASK as u32 & 0b1) * 4 + 3,
+ ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8,
+ ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8,
+ ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8,
+ ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i32x4&expand=5172)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shuffle_i32x4<const MASK: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_i32x4::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i32x4&expand=5173)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shuffle_i32x4<const MASK: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_i32x4::<MASK>(a, b);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r.as_i32x8(), zero))
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5183)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shuffle_i64x2<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(MASK);
+ let a = a.as_i64x8();
+ let b = b.as_i64x8();
+ let r: i64x8 = simd_shuffle8!(
+ a,
+ b,
+ <const MASK: i32> [
+ (MASK as u32 & 0b11) * 2 + 0,
+ (MASK as u32 & 0b11) * 2 + 1,
+ ((MASK as u32 >> 2) & 0b11) * 2 + 0,
+ ((MASK as u32 >> 2) & 0b11) * 2 + 1,
+ ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8,
+ ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8,
+ ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8,
+ ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8,
+ ],
+ );
+ transmute(r)
+}
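+
+// Illustrative sketch, not part of the upstream source: same lane selection as
+// _mm512_shuffle_i32x4, only each 128-bit lane is now viewed as a pair of
+// 64-bit integers. Test-only helper, assuming AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mm512_shuffle_i64x2(a: __m512i, b: __m512i) -> __m512i {
+    // Fields (low to high) 2, 3, 0, 1: the upper 256 bits of `a` form the low
+    // half of the result and the lower 256 bits of `b` form the high half.
+    _mm512_shuffle_i64x2::<0b01_00_11_10>(a, b)
+}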
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x2&expand=5181)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shuffle_i64x2<const MASK: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_i64x2::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64x2&expand=5182)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shuffle_i64x2<const MASK: i32>(
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_i64x2::<MASK>(a, b);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r.as_i64x8(), zero))
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i64x2&expand=5180)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshufi64x2
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shuffle_i64x2<const MASK: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(MASK);
+ let a = a.as_i64x4();
+ let b = b.as_i64x4();
+ let r: i64x4 = simd_shuffle4!(
+ a,
+ b,
+ <const MASK: i32> [
+ (MASK as u32 & 0b1) * 2 + 0,
+ (MASK as u32 & 0b1) * 2 + 1,
+ ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
+ ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i64x2&expand=5178)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shuffle_i64x2<const MASK: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_i64x2::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i64x2&expand=5179)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shuffle_i64x2<const MASK: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_i64x2::<MASK>(a, b);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r.as_i64x4(), zero))
+}
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f32x4&expand=5165)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b1011))] //should be vshuff32x4, but generates vshuff64x2
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shuffle_f32x4<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
+ static_assert_imm8!(MASK);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r: f32x16 = simd_shuffle16!(
+ a,
+ b,
+ <const MASK: i32> [
+ (MASK as u32 & 0b11) * 4 + 0,
+ (MASK as u32 & 0b11) * 4 + 1,
+ (MASK as u32 & 0b11) * 4 + 2,
+ (MASK as u32 & 0b11) * 4 + 3,
+ ((MASK as u32 >> 2) & 0b11) * 4 + 0,
+ ((MASK as u32 >> 2) & 0b11) * 4 + 1,
+ ((MASK as u32 >> 2) & 0b11) * 4 + 2,
+ ((MASK as u32 >> 2) & 0b11) * 4 + 3,
+ ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16,
+ ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16,
+ ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16,
+ ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16,
+ ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16,
+ ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16,
+ ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16,
+ ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16,
+ ],
+ );
+ transmute(r)
+}
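+
+// Illustrative sketch, not part of the upstream source: with MASK = 0 every
+// 2-bit field selects lane 0, so the result repeats the lowest 128-bit lane of
+// `a` in its low half and the lowest lane of `b` in its high half. Test-only
+// helper, assuming AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m512 {
+    _mm512_shuffle_f32x4::<0>(a, b)
+}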
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5163)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shuffle_f32x4<const MASK: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_f32x4::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+}
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32x4&expand=5164)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shuffle_f32x4<const MASK: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __m512 {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_f32x4::<MASK>(a, b);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r.as_f32x16(), zero))
+}
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f32x4&expand=5162)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff32x4
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shuffle_f32x4<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
+ static_assert_imm8!(MASK);
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let r: f32x8 = simd_shuffle8!(
+ a,
+ b,
+ <const MASK: i32> [
+ (MASK as u32 & 0b1) * 4 + 0,
+ (MASK as u32 & 0b1) * 4 + 1,
+ (MASK as u32 & 0b1) * 4 + 2,
+ (MASK as u32 & 0b1) * 4 + 3,
+ ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8,
+ ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8,
+ ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8,
+ ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f32x4&expand=5160)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shuffle_f32x4<const MASK: i32>(
+ src: __m256,
+ k: __mmask8,
+ a: __m256,
+ b: __m256,
+) -> __m256 {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_f32x4::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+}
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f32x4&expand=5161)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shuffle_f32x4<const MASK: i32>(
+ k: __mmask8,
+ a: __m256,
+ b: __m256,
+) -> __m256 {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_f32x4::<MASK>(a, b);
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, r.as_f32x8(), zero))
+}
+
+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5171)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shuffle_f64x2<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
+ static_assert_imm8!(MASK);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r: f64x8 = simd_shuffle8!(
+ a,
+ b,
+ <const MASK: i32> [
+ (MASK as u32 & 0b11) * 2 + 0,
+ (MASK as u32 & 0b11) * 2 + 1,
+ ((MASK as u32 >> 2) & 0b11) * 2 + 0,
+ ((MASK as u32 >> 2) & 0b11) * 2 + 1,
+ ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8,
+ ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8,
+ ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8,
+ ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5169)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shuffle_f64x2<const MASK: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_f64x2::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+}
+
+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5170)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shuffle_f64x2<const MASK: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ static_assert_imm8!(MASK);
+ let r = _mm512_shuffle_f64x2::<MASK>(a, b);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
+}
+
+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f64x2&expand=5168)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff64x2
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shuffle_f64x2<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
+ static_assert_imm8!(MASK);
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let r: f64x4 = simd_shuffle4!(
+ a,
+ b,
+ <const MASK: i32> [
+ (MASK as u32 & 0b1) * 2 + 0,
+ (MASK as u32 & 0b1) * 2 + 1,
+ ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
+ ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f64x2&expand=5166)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shuffle_f64x2<const MASK: i32>(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+ b: __m256d,
+) -> __m256d {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_f64x2::<MASK>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+}
+
+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f64x2&expand=5167)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shuffle_f64x2<const MASK: i32>(
+ k: __mmask8,
+ a: __m256d,
+ b: __m256d,
+) -> __m256d {
+ static_assert_imm8!(MASK);
+ let r = _mm256_shuffle_f64x2::<MASK>(a, b);
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf32x4_ps&expand=2442)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf32x4, IMM8 = 3)
+)]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_extractf32x4_ps<const IMM8: i32>(a: __m512) -> __m128 {
+ static_assert_imm2!(IMM8);
+ match IMM8 & 0x3 {
+ 0 => simd_shuffle4!(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
+ 2 => simd_shuffle4!(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
+ _ => simd_shuffle4!(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
+ }
+}
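+
+// Illustrative sketch, not part of the upstream source: IMM8 simply indexes
+// one of the four 128-bit lanes. Test-only helper, assuming AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mm512_extractf32x4_ps() -> __m128 {
+    let a = _mm512_setr_ps(
+        0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+    );
+    // Lane 2 of the sixteen packed floats: [8.0, 9.0, 10.0, 11.0]
+    _mm512_extractf32x4_ps::<2>(a)
+}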
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extractf32x4_ps&expand=2443)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf32x4, IMM8 = 3)
+)]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_extractf32x4_ps<const IMM8: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m512,
+) -> __m128 {
+ static_assert_imm2!(IMM8);
+ let r = _mm512_extractf32x4_ps::<IMM8>(a);
+ transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extractf32x4_ps&expand=2444)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf32x4, IMM8 = 3)
+)]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m512) -> __m128 {
+ static_assert_imm2!(IMM8);
+ let r = _mm512_extractf32x4_ps::<IMM8>(a);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, r.as_f32x4(), zero))
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf32x4_ps&expand=2439)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextract, IMM8 = 1) //should be vextractf32x4
+)]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_extractf32x4_ps<const IMM8: i32>(a: __m256) -> __m128 {
+ static_assert_imm1!(IMM8);
+ match IMM8 & 0x1 {
+ 0 => simd_shuffle4!(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
+ _ => simd_shuffle4!(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
+ }
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_extractf32x4_ps&expand=2440)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf32x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_extractf32x4_ps<const IMM8: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m256,
+) -> __m128 {
+ static_assert_imm1!(IMM8);
+ let r = _mm256_extractf32x4_ps::<IMM8>(a);
+ transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_extractf32x4_ps&expand=2441)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf32x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m256) -> __m128 {
+ static_assert_imm1!(IMM8);
+ let r = _mm256_extractf32x4_ps::<IMM8>(a);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, r.as_f32x4(), zero))
+}
+
+/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti64x4_epi64&expand=2473)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf64x4, IMM1 = 1) //should be vextracti64x4
+)]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_extracti64x4_epi64<const IMM1: i32>(a: __m512i) -> __m256i {
+ static_assert_imm1!(IMM1);
+ match IMM1 {
+ 0 => simd_shuffle4!(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
+ _ => simd_shuffle4!(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
+ }
+}
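+
+// Illustrative sketch, not part of the upstream source: IMM1 selects the low
+// (0) or high (1) 256-bit half of the vector. Test-only helper, assuming
+// AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mm512_extracti64x4_epi64(a: __m512i) -> __m256i {
+    // The high half: packed 64-bit elements 4..=7 of `a`.
+    _mm512_extracti64x4_epi64::<1>(a)
+}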
+
+/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti64x4_epi64&expand=2474)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextracti64x4, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_extracti64x4_epi64<const IMM1: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m512i,
+) -> __m256i {
+ static_assert_imm1!(IMM1);
+ let r = _mm512_extracti64x4_epi64::<IMM1>(a);
+ transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+}
+
+/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti64x4_epi64&expand=2475)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextracti64x4, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_extracti64x4_epi64<const IMM1: i32>(k: __mmask8, a: __m512i) -> __m256i {
+ static_assert_imm1!(IMM1);
+ let r = _mm512_extracti64x4_epi64::<IMM1>(a);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r.as_i64x4(), zero))
+}
+
+/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf64x4_pd&expand=2454)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf64x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_extractf64x4_pd<const IMM8: i32>(a: __m512d) -> __m256d {
+ static_assert_imm1!(IMM8);
+ match IMM8 & 0x1 {
+ 0 => simd_shuffle4!(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
+ _ => simd_shuffle4!(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
+ }
+}
+
+/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extractf64x4_pd&expand=2455)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf64x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_extractf64x4_pd<const IMM8: i32>(
+ src: __m256d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m256d {
+ static_assert_imm1!(IMM8);
+ let r = _mm512_extractf64x4_pd::<IMM8>(a);
+ transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+}
+
+/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extractf64x4_pd&expand=2456)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf64x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_extractf64x4_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m256d {
+ static_assert_imm1!(IMM8);
+ let r = _mm512_extractf64x4_pd::<IMM8>(a);
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
+}
+
+/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti32x4_epi32&expand=2461)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf32x4, IMM2 = 3) //should be vextracti32x4
+)]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm512_extracti32x4_epi32<const IMM2: i32>(a: __m512i) -> __m128i {
+ static_assert_imm2!(IMM2);
+ let a = a.as_i32x16();
+ let undefined = _mm512_undefined_epi32().as_i32x16();
+ let extract: i32x4 = match IMM2 {
+ 0 => simd_shuffle4!(a, undefined, [0, 1, 2, 3]),
+ 1 => simd_shuffle4!(a, undefined, [4, 5, 6, 7]),
+ 2 => simd_shuffle4!(a, undefined, [8, 9, 10, 11]),
+ _ => simd_shuffle4!(a, undefined, [12, 13, 14, 15]),
+ };
+ transmute(extract)
+}
+
+/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti32x4_epi32&expand=2462)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextracti32x4, IMM2 = 3)
+)]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_mask_extracti32x4_epi32<const IMM2: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m512i,
+) -> __m128i {
+ static_assert_imm2!(IMM2);
+ let r = _mm512_extracti32x4_epi32::<IMM2>(a);
+ transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
+}
+
+/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti32x4_epi32&expand=2463)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextracti32x4, IMM2 = 3)
+)]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_maskz_extracti32x4_epi32<const IMM2: i32>(k: __mmask8, a: __m512i) -> __m128i {
+ static_assert_imm2!(IMM2);
+ let r = _mm512_extracti32x4_epi32::<IMM2>(a);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
+}
+
+/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti32x4_epi32&expand=2458)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextract, IMM1 = 1) //should be vextracti32x4
+)]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_extracti32x4_epi32<const IMM1: i32>(a: __m256i) -> __m128i {
+ static_assert_imm1!(IMM1);
+ let a = a.as_i32x8();
+ let undefined = _mm256_undefined_si256().as_i32x8();
+ let extract: i32x4 = match IMM1 {
+ 0 => simd_shuffle4!(a, undefined, [0, 1, 2, 3]),
+ _ => simd_shuffle4!(a, undefined, [4, 5, 6, 7]),
+ };
+ transmute(extract)
+}
+
+/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_extracti32x4_epi32&expand=2459)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextracti32x4, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_mask_extracti32x4_epi32<const IMM1: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m256i,
+) -> __m128i {
+ static_assert_imm1!(IMM1);
+ let r = _mm256_extracti32x4_epi32::<IMM1>(a);
+ transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
+}
+
+/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_extracti32x4_epi32&expand=2460)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextracti32x4, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_maskz_extracti32x4_epi32<const IMM1: i32>(k: __mmask8, a: __m256i) -> __m128i {
+ static_assert_imm1!(IMM1);
+ let r = _mm256_extracti32x4_epi32::<IMM1>(a);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_moveldup_ps&expand=3862)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
+ let r: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
+ transmute(r)
+}
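+
+// Illustrative sketch, not part of the upstream source: each odd-indexed float
+// is replaced by its even-indexed neighbour. Test-only helper, assuming
+// AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mm512_moveldup_ps() -> __m512 {
+    let a = _mm512_setr_ps(
+        0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+    );
+    // [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]
+    _mm512_moveldup_ps(a)
+}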
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_moveldup_ps&expand=3860)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ let mov: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
+ transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_moveldup_ps&expand=3861)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 {
+ let mov: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_moveldup_ps&expand=3857)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+pub unsafe fn _mm256_mask_moveldup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ let mov = _mm256_moveldup_ps(a);
+ transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8()))
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_moveldup_ps&expand=3858)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+pub unsafe fn _mm256_maskz_moveldup_ps(k: __mmask8, a: __m256) -> __m256 {
+ let mov = _mm256_moveldup_ps(a);
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, mov.as_f32x8(), zero))
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_moveldup_ps&expand=3854)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+pub unsafe fn _mm_mask_moveldup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ let mov = _mm_moveldup_ps(a);
+ transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4()))
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_moveldup_ps&expand=3855)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovsldup))]
+pub unsafe fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 {
+ let mov = _mm_moveldup_ps(a);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, mov.as_f32x4(), zero))
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movehdup_ps&expand=3852)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
+ let r: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
+ transmute(r)
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movehdup_ps&expand=3850)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ let mov: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
+ transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movehdup_ps&expand=3851)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 {
+ let mov: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_movehdup_ps&expand=3847)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+pub unsafe fn _mm256_mask_movehdup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+ let mov = _mm256_movehdup_ps(a);
+ transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8()))
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_movehdup_ps&expand=3848)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+pub unsafe fn _mm256_maskz_movehdup_ps(k: __mmask8, a: __m256) -> __m256 {
+ let mov = _mm256_movehdup_ps(a);
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, mov.as_f32x8(), zero))
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_movehdup_ps&expand=3844)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+pub unsafe fn _mm_mask_movehdup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ let mov = _mm_movehdup_ps(a);
+ transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4()))
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_movehdup_ps&expand=3845)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovshdup))]
+pub unsafe fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 {
+ let mov = _mm_movehdup_ps(a);
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, mov.as_f32x4(), zero))
+}
+
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movedup_pd&expand=3843)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
+ let r: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
+ transmute(r)
+}
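+
+// Illustrative sketch, not part of the upstream source: the double-precision
+// analogue of vmovsldup -- each even-indexed double is copied into the odd
+// slot above it, giving [a0, a0, a2, a2, a4, a4, a6, a6]. Test-only helper,
+// assuming AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mm512_movedup_pd(a: __m512d) -> __m512d {
+    _mm512_movedup_pd(a)
+}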
+
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movedup_pd&expand=3841)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ let mov: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
+ transmute(simd_select_bitmask(k, mov, src.as_f64x8()))
+}
+
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movedup_pd&expand=3842)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d {
+ let mov: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_movedup_pd&expand=3838)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+pub unsafe fn _mm256_mask_movedup_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+ let mov = _mm256_movedup_pd(a);
+ transmute(simd_select_bitmask(k, mov.as_f64x4(), src.as_f64x4()))
+}
+
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_movedup_pd&expand=3839)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+pub unsafe fn _mm256_maskz_movedup_pd(k: __mmask8, a: __m256d) -> __m256d {
+ let mov = _mm256_movedup_pd(a);
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, mov.as_f64x4(), zero))
+}
+
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_movedup_pd&expand=3835)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+pub unsafe fn _mm_mask_movedup_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+ let mov = _mm_movedup_pd(a);
+ transmute(simd_select_bitmask(k, mov.as_f64x2(), src.as_f64x2()))
+}
+
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_movedup_pd&expand=3836)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+pub unsafe fn _mm_maskz_movedup_pd(k: __mmask8, a: __m128d) -> __m128d {
+ let mov = _mm_movedup_pd(a);
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, mov.as_f64x2(), zero))
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=3174)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] //should be vinserti32x4
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_inserti32x4<const IMM8: i32>(a: __m512i, b: __m128i) -> __m512i {
+ static_assert_imm2!(IMM8);
+ let a = a.as_i32x16();
+ let b = _mm512_castsi128_si512(b).as_i32x16();
+ let ret: i32x16 = match IMM8 & 0b11 {
+ 0 => simd_shuffle16!(
+ a,
+ b,
+ [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ ),
+ 1 => simd_shuffle16!(
+ a,
+ b,
+ [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
+ ),
+ 2 => simd_shuffle16!(
+ a,
+ b,
+ [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
+ ),
+ _ => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
+ };
+ transmute(ret)
+}
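+
+// Illustrative sketch, not part of the upstream source: the result is `a` with
+// the 128-bit lane chosen by IMM8 overwritten by `b`. Test-only helper,
+// assuming AVX-512F.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn example_mm512_inserti32x4() -> __m512i {
+    let a = _mm512_setzero_si512();
+    let b = _mm_set1_epi32(-1);
+    // Only dwords 12..=15 (lane 3) become -1; everything else stays zero.
+    _mm512_inserti32x4::<3>(a, b)
+}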
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti32x4&expand=3175)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_inserti32x4<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m128i,
+) -> __m512i {
+ static_assert_imm2!(IMM8);
+ let r = _mm512_inserti32x4::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti32x4&expand=3176)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_inserti32x4<const IMM8: i32>(
+ k: __mmask16,
+ a: __m512i,
+ b: __m128i,
+) -> __m512i {
+ static_assert_imm2!(IMM8);
+ let r = _mm512_inserti32x4::<IMM8>(a, b);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r.as_i32x16(), zero))
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti32x4&expand=3171)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsert, IMM8 = 1) //should be vinserti32x4
+)]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_inserti32x4<const IMM8: i32>(a: __m256i, b: __m128i) -> __m256i {
+ static_assert_imm1!(IMM8);
+ let a = a.as_i32x8();
+ let b = _mm256_castsi128_si256(b).as_i32x8();
+ let ret: i32x8 = match IMM8 & 0b1 {
+ 0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+ _ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+ };
+ transmute(ret)
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_inserti32x4&expand=3172)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinserti32x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_inserti32x4<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m128i,
+) -> __m256i {
+ static_assert_imm1!(IMM8);
+ let r = _mm256_inserti32x4::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_inserti32x4&expand=3173)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinserti32x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_inserti32x4<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m128i,
+) -> __m256i {
+ static_assert_imm1!(IMM8);
+ let r = _mm256_inserti32x4::<IMM8>(a, b);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r.as_i32x8(), zero))
+}
+
+/// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=3186)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] //should be vinserti64x4
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_inserti64x4<const IMM8: i32>(a: __m512i, b: __m256i) -> __m512i {
+ static_assert_imm1!(IMM8);
+ let b = _mm512_castsi256_si512(b);
+ match IMM8 & 0b1 {
+ 0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+ _ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+ }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti64x4&expand=3187)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_inserti64x4<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m256i,
+) -> __m512i {
+ static_assert_imm1!(IMM8);
+ let r = _mm512_inserti64x4::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti64x4&expand=3188)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_inserti64x4<const IMM8: i32>(
+ k: __mmask8,
+ a: __m512i,
+ b: __m256i,
+) -> __m512i {
+ static_assert_imm1!(IMM8);
+ let r = _mm512_inserti64x4::<IMM8>(a, b);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r.as_i64x8(), zero))
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf32x4&expand=3155)
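+///
+/// # Example
+///
+/// An illustrative sketch (editorial addition, not Intel's text), assuming
+/// `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_ps(1.0);
+///     let b = _mm_set1_ps(2.0);
+///     // IMM8 = 3 replaces the highest 128-bit lane (elements 12..=15).
+///     let r = _mm512_insertf32x4::<3>(a, b);
+/// }
+/// ```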
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_insertf32x4<const IMM8: i32>(a: __m512, b: __m128) -> __m512 {
+ static_assert_imm2!(IMM8);
+ let b = _mm512_castps128_ps512(b);
+ match IMM8 & 0b11 {
+ 0 => simd_shuffle16!(
+ a,
+ b,
+ [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ ),
+ 1 => simd_shuffle16!(
+ a,
+ b,
+ [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
+ ),
+ 2 => simd_shuffle16!(
+ a,
+ b,
+ [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
+ ),
+ _ => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
+ }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf32x4&expand=3156)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_insertf32x4<const IMM8: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m128,
+) -> __m512 {
+ static_assert_imm2!(IMM8);
+ let r = _mm512_insertf32x4::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf32x4&expand=3157)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_insertf32x4<const IMM8: i32>(
+ k: __mmask16,
+ a: __m512,
+ b: __m128,
+) -> __m512 {
+ static_assert_imm2!(IMM8);
+ let r = _mm512_insertf32x4::<IMM8>(a, b);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r.as_f32x16(), zero))
+}
+
+/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf32x4&expand=3152)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsert, IMM8 = 1) // should be vinsertf32x4
+)]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_insertf32x4<const IMM8: i32>(a: __m256, b: __m128) -> __m256 {
+ static_assert_imm1!(IMM8);
+ let b = _mm256_castps128_ps256(b);
+ match IMM8 & 0b1 {
+ 0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+ _ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+ }
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_insertf32x4&expand=3153)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsertf32x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_insertf32x4<const IMM8: i32>(
+ src: __m256,
+ k: __mmask8,
+ a: __m256,
+ b: __m128,
+) -> __m256 {
+ static_assert_imm1!(IMM8);
+ let r = _mm256_insertf32x4::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+}
+
+/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_insertf32x4&expand=3154)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsertf32x4, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_insertf32x4<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256,
+ b: __m128,
+) -> __m256 {
+ static_assert_imm1!(IMM8);
+ let r = _mm256_insertf32x4::<IMM8>(a, b);
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, r.as_f32x8(), zero))
+}
+
+/// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=3167)
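+///
+/// # Example
+///
+/// A minimal sketch (editorial addition), assuming `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_set1_pd(0.0);
+///     let b = _mm256_set1_pd(3.0);
+///     // IMM8 = 1 replaces the upper four doubles:
+///     // [0.0, 0.0, 0.0, 0.0, 3.0, 3.0, 3.0, 3.0].
+///     let r = _mm512_insertf64x4::<1>(a, b);
+/// }
+/// ```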
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_insertf64x4<const IMM8: i32>(a: __m512d, b: __m256d) -> __m512d {
+ static_assert_imm1!(IMM8);
+ let b = _mm512_castpd256_pd512(b);
+ match IMM8 & 0b1 {
+ 0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+ _ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+ }
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf64x4&expand=3168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_insertf64x4<const IMM8: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m256d,
+) -> __m512d {
+ static_assert_imm1!(IMM8);
+ let r = _mm512_insertf64x4::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+}
+
+/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf64x4&expand=3169)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_insertf64x4<const IMM8: i32>(
+ k: __mmask8,
+ a: __m512d,
+ b: __m256d,
+) -> __m512d {
+ static_assert_imm1!(IMM8);
+ let r = _mm512_insertf64x4::<IMM8>(a, b);
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi32&expand=6021)
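+///
+/// # Example
+///
+/// An illustrative sketch (editorial addition, not Intel's text), assuming
+/// `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     let b = _mm512_set1_epi32(-1);
+///     // Each 128-bit lane interleaves its two high elements of `a` and `b`;
+///     // the first lane of `r` is [2, -1, 3, -1].
+///     let r = _mm512_unpackhi_epi32(a, b);
+/// }
+/// ```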
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))] // should be vpunpckhdq
+pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i {
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ #[rustfmt::skip]
+ let r: i32x16 = simd_shuffle16!(
+ a, b,
+ [ 2, 18, 3, 19,
+ 2 + 4, 18 + 4, 3 + 4, 19 + 4,
+ 2 + 8, 18 + 8, 3 + 8, 19 + 8,
+ 2 + 12, 18 + 12, 3 + 12, 19 + 12],
+ );
+ transmute(r)
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=6019)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm512_mask_unpackhi_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16()))
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=6020)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_epi32&expand=6016)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm256_mask_unpackhi_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i32x8()))
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_epi32&expand=6017)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_epi32&expand=6013)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm_mask_unpackhi_epi32(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i32x4()))
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_epi32&expand=6014)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=6030)
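+///
+/// # Example
+///
+/// A brief usage sketch (editorial addition), assuming `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+///     let b = _mm512_set1_epi64(-1);
+///     // Each 128-bit lane becomes [a_hi, b_hi], so
+///     // r = [1, -1, 3, -1, 5, -1, 7, -1].
+///     let r = _mm512_unpackhi_epi64(a, b);
+/// }
+/// ```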
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))] // should be vpunpckhqdq
+pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i {
+ simd_shuffle8!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=6028)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm512_mask_unpackhi_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8()))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=6029)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_epi64&expand=6025)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm256_mask_unpackhi_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i64x4()))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_epi64&expand=6026)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_epi64&expand=6022)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm_mask_unpackhi_epi64(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_i64x2()))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_epi64&expand=6023)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=6060)
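+///
+/// # Example
+///
+/// An illustrative sketch (editorial addition, not Intel's text), assuming
+/// `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_ps(
+///         0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+///         8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+///     );
+///     let b = _mm512_set1_ps(-1.0);
+///     // The first 128-bit lane of `r` is [2.0, -1.0, 3.0, -1.0].
+///     let r = _mm512_unpackhi_ps(a, b);
+/// }
+/// ```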
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 {
+ #[rustfmt::skip]
+ simd_shuffle16!(
+ a, b,
+ [ 2, 18, 3, 19,
+ 2 + 4, 18 + 4, 3 + 4, 19 + 4,
+ 2 + 8, 18 + 8, 3 + 8, 19 + 8,
+ 2 + 12, 18 + 12, 3 + 12, 19 + 12],
+ )
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=6058)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=6059)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_ps&expand=6055)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_f32x8()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_ps&expand=6056)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_ps&expand=6052)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_f32x4()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_ps&expand=6053)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=6048)
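+///
+/// # Example
+///
+/// A small sketch (editorial addition), assuming `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+///     let b = _mm512_set1_pd(-1.0);
+///     // r = [1.0, -1.0, 3.0, -1.0, 5.0, -1.0, 7.0, -1.0].
+///     let r = _mm512_unpackhi_pd(a, b);
+/// }
+/// ```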
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d {
+ simd_shuffle8!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=6046)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_mask_unpackhi_pd(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=6047)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_pd&expand=6043)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm256_mask_unpackhi_pd(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+ b: __m256d,
+) -> __m256d {
+ let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_f64x4()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_pd&expand=6044)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_pd&expand=6040)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, unpackhi, src.as_f64x2()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_pd&expand=6041)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=6078)
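+///
+/// # Example
+///
+/// An illustrative sketch (editorial addition, not Intel's text), assuming
+/// `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///     let b = _mm512_set1_epi32(-1);
+///     // Each 128-bit lane interleaves its two low elements of `a` and `b`;
+///     // the first lane of `r` is [0, -1, 1, -1].
+///     let r = _mm512_unpacklo_epi32(a, b);
+/// }
+/// ```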
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))] // should be vpunpckldq
+pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i {
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ #[rustfmt::skip]
+ let r: i32x16 = simd_shuffle16!(
+ a, b,
+ [ 0, 16, 1, 17,
+ 0 + 4, 16 + 4, 1 + 4, 17 + 4,
+ 0 + 8, 16 + 8, 1 + 8, 17 + 8,
+ 0 + 12, 16 + 12, 1 + 12, 17 + 12],
+ );
+ transmute(r)
+}
+
+/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=6076)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckldq))]
+pub unsafe fn _mm512_mask_unpacklo_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i32x16()))
+}
+
+/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=6077)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckldq))]
+pub unsafe fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_epi32&expand=6073)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckldq))]
+pub unsafe fn _mm256_mask_unpacklo_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i32x8()))
+}
+
+/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_epi32&expand=6074)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckldq))]
+pub unsafe fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_epi32&expand=6070)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckldq))]
+pub unsafe fn _mm_mask_unpacklo_epi32(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i32x4()))
+}
+
+/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_epi32&expand=6071)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpckldq))]
+pub unsafe fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=6087)
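+///
+/// # Example
+///
+/// A brief usage sketch (editorial addition), assuming `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+///     let b = _mm512_set1_epi64(-1);
+///     // Each 128-bit lane becomes [a_lo, b_lo], so
+///     // r = [0, -1, 2, -1, 4, -1, 6, -1].
+///     let r = _mm512_unpacklo_epi64(a, b);
+/// }
+/// ```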
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))] // should be vpunpcklqdq
+pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i {
+ simd_shuffle8!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=6085)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm512_mask_unpacklo_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8()))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=6086)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_epi64&expand=6082)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm256_mask_unpacklo_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i64x4()))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_epi64&expand=6083)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_epi64&expand=6079)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm_mask_unpacklo_epi64(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_i64x2()))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_epi64&expand=6080)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_ps&expand=6117)
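+///
+/// # Example
+///
+/// An illustrative sketch (editorial addition, not Intel's text), assuming
+/// `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_ps(
+///         0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+///         8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+///     );
+///     let b = _mm512_set1_ps(-1.0);
+///     // The first 128-bit lane of `r` is [0.0, -1.0, 1.0, -1.0].
+///     let r = _mm512_unpacklo_ps(a, b);
+/// }
+/// ```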
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 {
+ #[rustfmt::skip]
+ simd_shuffle16!(a, b,
+ [ 0, 16, 1, 17,
+ 0 + 4, 16 + 4, 1 + 4, 17 + 4,
+ 0 + 8, 16 + 8, 1 + 8, 17 + 8,
+ 0 + 12, 16 + 12, 1 + 12, 17 + 12],
+ )
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_ps&expand=6115)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_ps&expand=6116)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_ps&expand=6112)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_f32x8()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_ps&expand=6113)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_ps&expand=6109)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_f32x4()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_ps&expand=6110)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=6105)
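+///
+/// # Example
+///
+/// A small sketch (editorial addition), assuming `avx512f` support:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+///     let b = _mm512_set1_pd(-1.0);
+///     // r = [0.0, -1.0, 2.0, -1.0, 4.0, -1.0, 6.0, -1.0].
+///     let r = _mm512_unpacklo_pd(a, b);
+/// }
+/// ```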
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d {
+ simd_shuffle8!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=6103)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_mask_unpacklo_pd(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __m512d {
+ let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=6104)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_pd&expand=6100)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm256_mask_unpacklo_pd(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+ b: __m256d,
+) -> __m256d {
+ let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_f64x4()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_pd&expand=6101)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_pd&expand=6097)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2();
+ transmute(simd_select_bitmask(k, unpacklo, src.as_f64x2()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_pd&expand=6098)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=621)
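+///
+/// # Example
+///
+/// An illustrative sketch (editorial addition, not Intel's text); note that
+/// only the low 128 bits of the result carry defined values:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let small = _mm_set1_ps(2.0);
+///     let wide = _mm512_castps128_ps512(small);
+///     // Read the defined lanes back with the inverse cast rather than
+///     // relying on the undefined upper 384 bits.
+///     let low = _mm512_castps512_ps128(wide);
+/// }
+/// ```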
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
+ simd_shuffle16!(
+ a,
+ _mm_set1_ps(-1.),
+ [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
+ )
+}
+
+/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=623)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
+ simd_shuffle16!(
+ a,
+ _mm256_set1_ps(-1.),
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
+ )
+}
+
+/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextps128_ps512&expand=6196)
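+///
+/// # Example
+///
+/// A brief sketch (editorial addition); unlike the plain cast above, the
+/// upper lanes here are guaranteed to be zero:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let small = _mm_set1_ps(2.0);
+///     // wide = [2.0, 2.0, 2.0, 2.0, 0.0, 0.0, ..., 0.0]
+///     let wide = _mm512_zextps128_ps512(small);
+/// }
+/// ```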
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
+ simd_shuffle16!(
+ a,
+ _mm_set1_ps(0.),
+ [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
+ )
+}
+
+/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextps256_ps512&expand=6197)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
+ simd_shuffle16!(
+ a,
+ _mm256_set1_ps(0.),
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
+ )
+}
+
+/// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=624)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
+ simd_shuffle4!(a, a, [0, 1, 2, 3])
+}
+
+/// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=625)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 {
+ simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_pd&expand=616)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castps_pd(a: __m512) -> __m512d {
+ transmute(a.as_m512())
+}
+
+/// Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_si512&expand=619)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i {
+ transmute(a.as_m512())
+}
+
+/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=609)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
+ simd_shuffle8!(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
+}
+
+/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=611)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
+ simd_shuffle8!(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
+}
+
+/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextpd128_pd512&expand=6193)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
+ simd_shuffle8!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
+}
+
+/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextpd256_pd512&expand=6194)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
+ simd_shuffle8!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
+}
+
+/// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=612)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
+ simd_shuffle2!(a, a, [0, 1])
+}
+
+/// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=613)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d {
+ simd_shuffle4!(a, a, [0, 1, 2, 3])
+}
+
+/// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=604)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castpd_ps(a: __m512d) -> __m512 {
+ transmute(a.as_m512d())
+}
+
+/// Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=607)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i {
+ transmute(a.as_m512d())
+}
+
+/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi128_si512&expand=629)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
+ simd_shuffle8!(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
+}
+
+/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi256_si512&expand=633)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
+ simd_shuffle8!(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
+}
+
+/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextsi128_si512&expand=6199)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
+ simd_shuffle8!(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
+}
+
+/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextsi256_si512&expand=6200)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
+ simd_shuffle8!(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
+}
+
+/// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si128&expand=636)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
+ simd_shuffle2!(a, a, [0, 1])
+}
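+
+// Illustrative sketch, not upstream stdarch code: contrasts the `cast`
+// intrinsics (upper bits undefined) with the `zext` intrinsics (upper bits
+// zeroed). `demo_cast_vs_zext` is a hypothetical helper name.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_cast_vs_zext() {
+ let x = _mm_set1_epi64x(7);
+ // The upper 384 bits are guaranteed zero here; with
+ // `_mm512_castsi128_si512` they would be unspecified instead.
+ let wide = _mm512_zextsi128_si512(x);
+ // Narrowing casts simply return the low 128 bits.
+ let _narrow: __m128i = _mm512_castsi512_si128(wide);
+}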
+
+/// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si256&expand=637)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i {
+ simd_shuffle4!(a, a, [0, 1, 2, 3])
+}
+
+/// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ps&expand=635)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castsi512_ps(a: __m512i) -> __m512 {
+ transmute(a)
+}
+
+/// Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_pd&expand=634)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
+ transmute(a)
+}
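+
+// Illustrative sketch, not upstream code: the `cast*_ps`/`cast*_pd` intrinsics
+// reinterpret bits rather than convert values, so an all-ones integer vector
+// becomes NaN floats. `demo_bitcast` is a hypothetical helper name.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_bitcast() {
+ let ones = _mm512_set1_epi32(-1);
+ // Every f32 lane now holds the bit pattern 0xFFFF_FFFF, i.e. a NaN.
+ let _f = _mm512_castsi512_ps(ones);
+}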
+
+/// Copy the lower 32-bit integer in a to dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsi512_si32&expand=1882)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))]
+pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 {
+ let extract: i32 = simd_extract(a.as_i32x16(), 0);
+ transmute(extract)
+}
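+
+// Minimal usage sketch (hypothetical helper, not upstream): the lowest 32-bit
+// lane comes back as an ordinary scalar.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_cvtsi512_si32() {
+ let v = _mm512_set1_epi32(42);
+ let lo: i32 = _mm512_cvtsi512_si32(v);
+ assert_eq!(lo, 42);
+}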
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=545)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
+ let a = _mm512_castsi128_si512(a).as_i32x16();
+ let ret: i32x16 = simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ transmute(ret)
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=546)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcastd_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i32x16()))
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=547)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcastd_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
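+
+// Sketch of the writemask/zeromask pattern used throughout this module
+// (hypothetical helper, not upstream): lanes whose mask bit is clear either
+// keep `src` (`mask_` variant) or become zero (`maskz_` variant).
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_broadcastd_masks() {
+ let a = _mm_set1_epi32(9);
+ let src = _mm512_set1_epi32(-1);
+ let k: __mmask16 = 0b0000_0000_1111_1111; // bit i selects lane i
+ let _merged = _mm512_mask_broadcastd_epi32(src, k, a); // eight 9s, eight -1s
+ let _zeroed = _mm512_maskz_broadcastd_epi32(k, a); // eight 9s, eight 0s
+}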
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastd_epi32&expand=543)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcastd_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i32x8()))
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastd_epi32&expand=544)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcastd_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastd_epi32&expand=540)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let broadcast = _mm_broadcastd_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i32x4()))
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastd_epi32&expand=541)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let broadcast = _mm_broadcastd_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=560)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcas))] //should be vpbroadcastq
+pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i {
+ simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=561)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcastq_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i64x8()))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcastq_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastq_epi64&expand=558)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcastq_epi64(a).as_i64x4();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i64x4()))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastq_epi64&expand=559)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcastq_epi64(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastq_epi64&expand=555)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ let broadcast = _mm_broadcastq_epi64(a).as_i64x2();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i64x2()))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastq_epi64&expand=556)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let broadcast = _mm_broadcastq_epi64(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
+ simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+}
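+
+// Usage sketch (hypothetical helper, not upstream): vbroadcastss fans the low
+// f32 lane out to all sixteen lanes.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_broadcastss() {
+ let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // lane 0 is 1.0
+ let _v = _mm512_broadcastss_ps(a); // all 16 lanes are 1.0
+}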
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastss_ps&expand=579)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 {
+ let broadcast = _mm512_broadcastss_ps(a).as_f32x16();
+ transmute(simd_select_bitmask(k, broadcast, src.as_f32x16()))
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastss_ps&expand=580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 {
+ let broadcast = _mm512_broadcastss_ps(a).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastss_ps&expand=576)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> __m256 {
+ let broadcast = _mm256_broadcastss_ps(a).as_f32x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_f32x8()))
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastss_ps&expand=577)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 {
+ let broadcast = _mm256_broadcastss_ps(a).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastss_ps&expand=573)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ let broadcast = _mm_broadcastss_ps(a).as_f32x4();
+ transmute(simd_select_bitmask(k, broadcast, src.as_f32x4()))
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastss_ps&expand=574)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 {
+ let broadcast = _mm_broadcastss_ps(a).as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=567)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
+ simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+}
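+
+// Usage sketch (hypothetical helper, not upstream): the f64 analogue of
+// `_mm512_broadcastss_ps` above.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_broadcastsd() {
+ let a = _mm_set_pd(2.0, 1.0); // lane 0 is 1.0
+ let _v = _mm512_broadcastsd_pd(a); // all 8 lanes are 1.0
+}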
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=568)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d {
+ let broadcast = _mm512_broadcastsd_pd(a).as_f64x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_f64x8()))
+}
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=569)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
+ let broadcast = _mm512_broadcastsd_pd(a).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastsd_pd&expand=565)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) -> __m256d {
+ let broadcast = _mm256_broadcastsd_pd(a).as_f64x4();
+ transmute(simd_select_bitmask(k, broadcast, src.as_f64x4()))
+}
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastsd_pd&expand=566)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d {
+ let broadcast = _mm256_broadcastsd_pd(a).as_f64x4();
+ let zero = _mm256_setzero_pd().as_f64x4();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=510)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
+pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
+ let a = a.as_i32x4();
+ let ret: i32x16 = simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
+ transmute(ret)
+}
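+
+// Sketch (hypothetical helper, not upstream): unlike `_mm512_broadcastd_epi32`,
+// this repeats the whole 128-bit block, so the lane pattern 0,1,2,3 recurs
+// four times.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_broadcast_i32x4() {
+ let a = _mm_set_epi32(3, 2, 1, 0); // lanes [0, 1, 2, 3]
+ let _v = _mm512_broadcast_i32x4(a); // lanes [0, 1, 2, 3, 0, 1, 2, 3, ...]
+}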
+
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=511)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
+pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcast_i32x4(a).as_i32x16();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i32x16()))
+}
+
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=512)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
+pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i {
+ let broadcast = _mm512_broadcast_i32x4(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_i32x4&expand=507)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf
+pub unsafe fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i {
+ let a = a.as_i32x4();
+ let ret: i32x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]);
+ transmute(ret)
+}
+
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcast_i32x4&expand=508)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf
+pub unsafe fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcast_i32x4(a).as_i32x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i32x8()))
+}
+
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcast_i32x4&expand=509)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf
+pub unsafe fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i {
+ let broadcast = _mm256_broadcast_i32x4(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the 4 packed 64-bit integers from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=522)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
+pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i {
+ simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
+}
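+
+// Sketch (hypothetical helper, not upstream): copies the whole 256-bit source
+// into both halves of the 512-bit result.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_broadcast_i64x4() {
+ let a = _mm256_set_epi64x(3, 2, 1, 0); // lanes [0, 1, 2, 3]
+ let _v = _mm512_broadcast_i64x4(a); // lanes [0, 1, 2, 3, 0, 1, 2, 3]
+}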
+
+/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i64x4&expand=523)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
+pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
+ let broadcast = _mm512_broadcast_i64x4(a).as_i64x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_i64x8()))
+}
+
+/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i64x4&expand=524)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
+pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i {
+ let broadcast = _mm512_broadcast_i64x4(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=483)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 {
+ simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
+}
+
+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=484)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 {
+ let broadcast = _mm512_broadcast_f32x4(a).as_f32x16();
+ transmute(simd_select_bitmask(k, broadcast, src.as_f32x16()))
+}
+
+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=485)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 {
+ let broadcast = _mm512_broadcast_f32x4(a).as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_f32x4&expand=480)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm256_broadcast_f32x4(a: __m128) -> __m256 {
+ simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
+}
+
+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcast_f32x4&expand=481)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) -> __m256 {
+ let broadcast = _mm256_broadcast_f32x4(a).as_f32x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_f32x8()))
+}
+
+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcast_f32x4&expand=482)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 {
+ let broadcast = _mm256_broadcast_f32x4(a).as_f32x8();
+ let zero = _mm256_setzero_ps().as_f32x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=495)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
+pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d {
+ simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
+}
+
+/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=496)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
+pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d {
+ let broadcast = _mm512_broadcast_f64x4(a).as_f64x8();
+ transmute(simd_select_bitmask(k, broadcast, src.as_f64x8()))
+}
+
+/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=497)
+#[inline]
+#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
+pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d {
+ let broadcast = _mm512_broadcast_f64x4(a).as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=435)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd
+pub unsafe fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16()))
+}
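+
+// Blend sketch (hypothetical helper, not upstream): a set mask bit picks the
+// lane from `b`, a clear bit the lane from `a`.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_blend_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(2);
+ let k: __mmask16 = 0b1010_1010_1010_1010;
+ let _v = _mm512_mask_blend_epi32(k, a, b); // lanes alternate 1, 2, 1, 2, ...
+}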
+
+/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_epi32&expand=434)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd
+pub unsafe fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_select_bitmask(k, b.as_i32x8(), a.as_i32x8()))
+}
+
+/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_epi32&expand=432)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd
+pub unsafe fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_select_bitmask(k, b.as_i32x4(), a.as_i32x4()))
+}
+
+/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=438)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq
+pub unsafe fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8()))
+}
+
+/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_epi64&expand=437)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq
+pub unsafe fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_select_bitmask(k, b.as_i64x4(), a.as_i64x4()))
+}
+
+/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_epi64&expand=436)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq
+pub unsafe fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_select_bitmask(k, b.as_i64x2(), a.as_i64x2()))
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=451)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps
+pub unsafe fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16()))
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ps&expand=450)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps
+pub unsafe fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+ transmute(simd_select_bitmask(k, b.as_f32x8(), a.as_f32x8()))
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ps&expand=448)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps
+pub unsafe fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(simd_select_bitmask(k, b.as_f32x4(), a.as_f32x4()))
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=446)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd
+pub unsafe fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8()))
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_pd&expand=445)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd
+pub unsafe fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+ transmute(simd_select_bitmask(k, b.as_f64x4(), a.as_f64x4()))
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_pd&expand=443)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd
+pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(simd_select_bitmask(k, b.as_f64x2(), a.as_f64x2()))
+}
+
+/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_alignr_epi32&expand=245)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let imm8: i32 = IMM8 % 16;
+ let r: i32x16 = match imm8 {
+ 0 => simd_shuffle16!(
+ a,
+ b,
+ [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,],
+ ),
+ 1 => simd_shuffle16!(
+ a,
+ b,
+ [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,],
+ ),
+ 2 => simd_shuffle16!(
+ a,
+ b,
+ [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
+ ),
+ 3 => simd_shuffle16!(
+ a,
+ b,
+ [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
+ ),
+ 4 => simd_shuffle16!(
+ a,
+ b,
+ [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
+ ),
+ 5 => simd_shuffle16!(
+ a,
+ b,
+ [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
+ ),
+ 6 => simd_shuffle16!(
+ a,
+ b,
+ [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
+ ),
+ 7 => simd_shuffle16!(
+ a,
+ b,
+ [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
+ ),
+ 8 => simd_shuffle16!(
+ a,
+ b,
+ [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
+ ),
+ 9 => simd_shuffle16!(
+ a,
+ b,
+ [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
+ ),
+ 10 => simd_shuffle16!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+ 11 => simd_shuffle16!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+ 12 => simd_shuffle16!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+ 13 => simd_shuffle16!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
+ 14 => simd_shuffle16!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
+ _ => simd_shuffle16!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
+ };
+ transmute(r)
+}
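+
+// Sketch of the const-generic call syntax (hypothetical helper, not
+// upstream): with IMM8 = 1 the result is b's lanes 1..16 followed by a's
+// lane 0.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_alignr_epi32() {
+ let a = _mm512_set1_epi32(100);
+ let b = _mm512_set1_epi32(7);
+ let _v = _mm512_alignr_epi32::<1>(a, b); // fifteen 7s, then one 100
+}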
+
+/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_alignr_epi32&expand=246)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_alignr_epi32<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_alignr_epi32::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
+}
+
+/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_alignr_epi32&expand=247)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_alignr_epi32<const IMM8: i32>(
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_alignr_epi32::<IMM8>(a, b);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r.as_i32x16(), zero))
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi32&expand=242)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let imm8: i32 = IMM8 % 16;
+ let r: i32x8 = match imm8 {
+ 0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
+ 2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
+ 3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
+ 4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
+ 5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
+ 6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
+ 7 => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+ 8 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+ 9 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+ 10 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+ 11 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+ 12 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+ 13 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+ 14 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+ _ => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+ };
+ transmute(r)
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_alignr_epi32&expand=243)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_alignr_epi32<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let r = _mm256_alignr_epi32::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_alignr_epi32&expand=244)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_alignr_epi32<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let r = _mm256_alignr_epi32::<IMM8>(a, b);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r.as_i32x8(), zero))
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi32&expand=239)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignd
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let imm8: i32 = IMM8 % 8;
+ let r: i32x4 = match imm8 {
+ 0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
+ 1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
+ 2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
+ 3 => simd_shuffle4!(a, b, [7, 0, 1, 2]),
+ 4 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+ 5 => simd_shuffle4!(a, b, [1, 2, 3, 0]),
+ 6 => simd_shuffle4!(a, b, [2, 3, 0, 1]),
+ _ => simd_shuffle4!(a, b, [3, 0, 1, 2]),
+ };
+ transmute(r)
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_alignr_epi32&expand=240)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_alignr_epi32<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let r = _mm_alignr_epi32::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_alignr_epi32&expand=241)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_alignr_epi32<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let r = _mm_alignr_epi32::<IMM8>(a, b);
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
+}
+
+/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_alignr_epi64&expand=254)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8: i32 = IMM8 % 8;
+ let r: i64x8 = match imm8 {
+ 0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
+ 1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
+ 2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
+ 3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
+ 4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
+ 5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
+ 6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
+ _ => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+ };
+ transmute(r)
+}
+
+/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_alignr_epi64&expand=255)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_alignr_epi64<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_alignr_epi64::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+}
+
+/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_alignr_epi64&expand=256)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let r = _mm512_alignr_epi64::<IMM8>(a, b);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r.as_i64x8(), zero))
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi64&expand=251)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8: i32 = IMM8 % 8;
+ let r: i64x4 = match imm8 {
+ 0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
+ 1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
+ 2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
+ 3 => simd_shuffle4!(a, b, [7, 0, 1, 2]),
+ 4 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+ 5 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+ 6 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+ _ => simd_shuffle4!(a, b, [3, 4, 5, 6]),
+ };
+ transmute(r)
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_alignr_epi64&expand=252)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_alignr_epi64<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let r = _mm256_alignr_epi64::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+}
+
+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_alignr_epi64&expand=253)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let r = _mm256_alignr_epi64::<IMM8>(a, b);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r.as_i64x4(), zero))
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi64&expand=248)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignq
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8: i32 = IMM8 % 4;
+ let r: i64x2 = match imm8 {
+ 0 => simd_shuffle2!(a, b, [2, 3]),
+ 1 => simd_shuffle2!(a, b, [3, 0]),
+ 2 => simd_shuffle2!(a, b, [0, 1]),
+ _ => simd_shuffle2!(a, b, [1, 2]),
+ };
+ transmute(r)
+}
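+
+// Sketch (hypothetical helper, not upstream): the two-lane variant; with
+// IMM8 = 1 the result is [b1, a0].
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f,avx512vl")]
+unsafe fn demo_alignr_epi64_128() {
+ let a = _mm_set_epi64x(11, 10); // lanes [10, 11]
+ let b = _mm_set_epi64x(21, 20); // lanes [20, 21]
+ let _v = _mm_alignr_epi64::<1>(a, b); // lanes [21, 10]
+}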
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_alignr_epi64&expand=249)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_alignr_epi64<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let r = _mm_alignr_epi64::<IMM8>(a, b);
+ transmute(simd_select_bitmask(k, r.as_i64x2(), src.as_i64x2()))
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_alignr_epi64&expand=250)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_alignr_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let r = _mm_alignr_epi64::<IMM8>(a, b);
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, r.as_i64x2(), zero))
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi32&expand=272)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandq))] //should be vpandd, but generates vpandq
+pub unsafe fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_and(a.as_i32x16(), b.as_i32x16()))
+}
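+
+// Sketch (hypothetical helper, not upstream): lane-wise AND over the 32-bit
+// views of two vectors.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_and_epi32() {
+ let a = _mm512_set1_epi32(0b1100);
+ let b = _mm512_set1_epi32(0b1010);
+ let _v = _mm512_and_epi32(a, b); // every lane is 0b1000
+}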
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi32&expand=273)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let and = _mm512_and_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, and, src.as_i32x16()))
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi32&expand=274)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let and = _mm512_and_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_and_epi32&expand=270)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm256_mask_and_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let and = simd_and(a.as_i32x8(), b.as_i32x8());
+ transmute(simd_select_bitmask(k, and, src.as_i32x8()))
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_and_epi32&expand=271)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm256_maskz_and_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let and = simd_and(a.as_i32x8(), b.as_i32x8());
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_and_epi32&expand=268)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm_mask_and_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let and = simd_and(a.as_i32x4(), b.as_i32x4());
+ transmute(simd_select_bitmask(k, and, src.as_i32x4()))
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_and_epi32&expand=269)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm_maskz_and_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let and = simd_and(a.as_i32x4(), b.as_i32x4());
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi64&expand=279)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_and(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi64&expand=280)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let and = _mm512_and_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, and, src.as_i64x8()))
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi64&expand=281)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let and = _mm512_and_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_and_epi64&expand=277)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm256_mask_and_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let and = simd_and(a.as_i64x4(), b.as_i64x4());
+ transmute(simd_select_bitmask(k, and, src.as_i64x4()))
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_and_epi64&expand=278)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm256_maskz_and_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let and = simd_and(a.as_i64x4(), b.as_i64x4());
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_and_epi64&expand=275)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm_mask_and_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let and = simd_and(a.as_i64x2(), b.as_i64x2());
+ transmute(simd_select_bitmask(k, and, src.as_i64x2()))
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_and_epi64&expand=276)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm_maskz_and_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let and = simd_and(a.as_i64x2(), b.as_i64x2());
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_si512&expand=302)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_and(a.as_i32x16(), b.as_i32x16()))
+}
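+
+// Bitwise AND is lane-width agnostic, so the si512, epi32 and epi64 variants
+// all produce identical 512-bit results; only the masked forms differ. A
+// sketch assuming an AVX-512F-capable target:
+//
+//     unsafe fn equivalent_ands(a: __m512i, b: __m512i) {
+//         let _ = _mm512_and_si512(a, b);
+//         let _ = _mm512_and_epi32(a, b); // same bits
+//         let _ = _mm512_and_epi64(a, b); // same bits
+//     }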
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_epi32&expand=4042)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_or(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_or_epi32&expand=4040)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub unsafe fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let or = _mm512_or_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, or, src.as_i32x16()))
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_or_epi32&expand=4041)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub unsafe fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let or = _mm512_or_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, or, zero))
+}
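+
+// A sketch of merging bitfields under a lane mask, assuming an
+// AVX-512F-capable target (`merge_or_even_lanes` is a hypothetical helper):
+//
+//     unsafe fn merge_or_even_lanes(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+//         // Even lanes take a | b; odd lanes keep src.
+//         _mm512_mask_or_epi32(src, 0b0101_0101_0101_0101, a, b)
+//     }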
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_epi32&expand=4039)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vor))] // should be vpord
+pub unsafe fn _mm256_or_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_or_epi32&expand=4037)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub unsafe fn _mm256_mask_or_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let or = _mm256_or_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, or, src.as_i32x8()))
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_or_epi32&expand=4038)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub unsafe fn _mm256_maskz_or_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let or = _mm256_or_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_epi32&expand=4036)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vor))] // should be vpord
+pub unsafe fn _mm_or_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_or(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_or_epi32&expand=4034)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub unsafe fn _mm_mask_or_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let or = _mm_or_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, or, src.as_i32x4()))
+}
+
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_or_epi32&expand=4035)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpord))]
+pub unsafe fn _mm_maskz_or_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let or = _mm_or_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_epi64&expand=4051)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_or(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_or_epi64&expand=4049)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let or = _mm512_or_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, or, src.as_i64x8()))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_or_epi64&expand=4050)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let or = _mm512_or_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_epi64&expand=4048)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vor))] // should be vporq
+pub unsafe fn _mm256_or_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_or(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_or_epi64&expand=4046)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm256_mask_or_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let or = _mm256_or_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, or, src.as_i64x4()))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_or_epi64&expand=4047)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm256_maskz_or_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let or = _mm256_or_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_epi64&expand=4045)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vor))] // should be vporq
+pub unsafe fn _mm_or_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_or(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_or_epi64&expand=4043)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm_mask_or_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let or = _mm_or_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, or, src.as_i64x2()))
+}
+
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_or_epi64&expand=4044)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm_maskz_or_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let or = _mm_or_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_si512&expand=4072)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_or(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_epi32&expand=6142)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpxorq))] // should be vpxord
+pub unsafe fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_xor(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_xor_epi32&expand=6140)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpxord))]
+pub unsafe fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let xor = _mm512_xor_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, xor, src.as_i32x16()))
+}
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_xor_epi32&expand=6141)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpxord))]
+pub unsafe fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let xor = _mm512_xor_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, xor, zero))
+}
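+
+// XOR under a writemask gives selective bit-flipping; a sketch assuming an
+// AVX-512F-capable target (`flip_low_signs` is a hypothetical helper):
+//
+//     unsafe fn flip_low_signs(a: __m512i) -> __m512i {
+//         // Flip the sign bit of the low eight 32-bit lanes; the high eight
+//         // lanes pass through unchanged from `a` via the writemask.
+//         _mm512_mask_xor_epi32(a, 0x00FF, a, _mm512_set1_epi32(i32::MIN))
+//     }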
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_epi32&expand=6139)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vxor))] // should be vpxord
+pub unsafe fn _mm256_xor_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_xor(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_xor_epi32&expand=6137)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpxord))]
+pub unsafe fn _mm256_mask_xor_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let xor = _mm256_xor_epi32(a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, xor, src.as_i32x8()))
+}
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_xor_epi32&expand=6138)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpxord))]
+pub unsafe fn _mm256_maskz_xor_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let xor = _mm256_xor_epi32(a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_epi32&expand=6136)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vxor))] // should be vpxord
+pub unsafe fn _mm_xor_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_xor(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_xor_epi32&expand=6134)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpxord))]
+pub unsafe fn _mm_mask_xor_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let xor = _mm_xor_epi32(a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, xor, src.as_i32x4()))
+}
+
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_xor_epi32&expand=6135)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpxord))]
+pub unsafe fn _mm_maskz_xor_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let xor = _mm_xor_epi32(a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_epi64&expand=6151)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_xor(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_xor_epi64&expand=6149)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let xor = _mm512_xor_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, xor, src.as_i64x8()))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_xor_epi64&expand=6150)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let xor = _mm512_xor_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_epi64&expand=6148)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vxor))] // should be vpxorq
+pub unsafe fn _mm256_xor_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_xor_epi64&expand=6146)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm256_mask_xor_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let xor = _mm256_xor_epi64(a, b).as_i64x4();
+ transmute(simd_select_bitmask(k, xor, src.as_i64x4()))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_xor_epi64&expand=6147)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm256_maskz_xor_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let xor = _mm256_xor_epi64(a, b).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_epi64&expand=6145)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vxor))] // should be vpxorq
+pub unsafe fn _mm_xor_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_xor(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_xor_epi64&expand=6143)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm_mask_xor_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let xor = _mm_xor_epi64(a, b).as_i64x2();
+ transmute(simd_select_bitmask(k, xor, src.as_i64x2()))
+}
+
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_xor_epi64&expand=6144)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm_maskz_xor_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let xor = _mm_xor_epi64(a, b).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_si512&expand=6172)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i {
+ transmute(simd_xor(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi32&expand=310)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))] // should be vpandnd
+pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i {
+ _mm512_and_epi32(_mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32)), b)
+}
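+
+// ANDNOT computes `!a & b`, i.e. it clears in b every bit that is set in a.
+// The scalar analogue of what each 32-bit lane computes:
+//
+//     fn lane(a: i32, b: i32) -> i32 {
+//         !a & b
+//     }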
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi32&expand=311)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm512_mask_andnot_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let andnot = _mm512_andnot_epi32(a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, andnot, src.as_i32x16()))
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi32&expand=312)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+ let andnot = _mm512_andnot_epi32(a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_andnot_epi32&expand=308)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm256_mask_andnot_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32));
+ let andnot = simd_and(not.as_i32x8(), b.as_i32x8());
+ transmute(simd_select_bitmask(k, andnot, src.as_i32x8()))
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_andnot_epi32&expand=309)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm256_maskz_andnot_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32));
+ let andnot = simd_and(not.as_i32x8(), b.as_i32x8());
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_andnot_epi32&expand=306)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm_mask_andnot_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32));
+ let andnot = simd_and(not.as_i32x4(), b.as_i32x4());
+ transmute(simd_select_bitmask(k, andnot, src.as_i32x4()))
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_andnot_epi32&expand=307)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm_maskz_andnot_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32));
+ let andnot = simd_and(not.as_i32x4(), b.as_i32x4());
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in a and then AND with b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi64&expand=317)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i {
+ _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b)
+}
+
+/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi64&expand=318)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm512_mask_andnot_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let andnot = _mm512_andnot_epi64(a, b).as_i64x8();
+ transmute(simd_select_bitmask(k, andnot, src.as_i64x8()))
+}
+
+/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi64&expand=319)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+ let andnot = _mm512_andnot_epi64(a, b).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_andnot_epi64&expand=315)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm256_mask_andnot_epi64(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64));
+ let andnot = simd_and(not.as_i64x4(), b.as_i64x4());
+ transmute(simd_select_bitmask(k, andnot, src.as_i64x4()))
+}
+
+/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_andnot_epi64&expand=316)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm256_maskz_andnot_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+ let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64));
+ let andnot = simd_and(not.as_i64x4(), b.as_i64x4());
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_andnot_epi64&expand=313)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm_mask_andnot_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64));
+ let andnot = simd_and(not.as_i64x2(), b.as_i64x2());
+ transmute(simd_select_bitmask(k, andnot, src.as_i64x2()))
+}
+
+/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_andnot_epi64&expand=314)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm_maskz_andnot_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64));
+ let andnot = simd_and(not.as_i64x2(), b.as_i64x2());
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Compute the bitwise NOT of 512 bits (representing integer data) in a and then AND with b, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_si512&expand=340)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i {
+ _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b)
+}
+
+/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kand_mask16&expand=3212)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(and))] // generates normal and code instead of kandw
+pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+ transmute(a & b)
+}
+
+/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kand&expand=3210)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(and))] // generates normal and code instead of kandw
+pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
+ transmute(a & b)
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kor_mask16&expand=3239)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(or))] // generates normal or code instead of korw
+pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+ transmute(a | b)
+}
+
+/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kor&expand=3237)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(or))] // generates normal or code instead of korw
+pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
+ transmute(a | b)
+}
+
+/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxor_mask16&expand=3291)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(xor))] // generates normal xor code instead of kxorw
+pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+ transmute(a ^ b)
+}
+
+/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxor&expand=3289)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(xor))] // generates normal xor code instead of kxorw
+pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
+ transmute(a ^ b)
+}
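+
+// Mask registers compose like ordinary integers; a sketch assuming an
+// AVX-512F-capable target (`lanes_in_exactly_one` is a hypothetical helper):
+//
+//     unsafe fn lanes_in_exactly_one(k1: __mmask16, k2: __mmask16) -> __mmask16 {
+//         // Bits set in k1 or k2 but not in both.
+//         _mm512_kxor(k1, k2)
+//     }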
+
+/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=knot_mask16&expand=3233)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
+ transmute(a ^ 0b11111111_11111111)
+}
+
+/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_knot&expand=3231)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 {
+ transmute(a ^ 0b11111111_11111111)
+}
+
+/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kandn_mask16&expand=3218)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(not))] // generates normal not/and code instead of kandnw
+pub unsafe fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+ _mm512_kand(_mm512_knot(a), b)
+}
+
+/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kandn&expand=3216)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(not))] // generates normal not/and code instead of kandnw
+pub unsafe fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 {
+ _mm512_kand(_mm512_knot(a), b)
+}
+
+/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxnor_mask16&expand=3285)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(xor))] // generates normal xor/not code instead of kxnorw
+pub unsafe fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+ _mm512_knot(_mm512_kxor(a, b))
+}
+
+/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxnor&expand=3283)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(xor))] // generates normal xor/not code instead of kxnorw
+pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
+ _mm512_knot(_mm512_kxor(a, b))
+}
+
+/// Copy 16-bit mask a to k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm512_kmov&expand=3228)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(mov))] // generates normal mov code instead of kmovw
+pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
+ let r: u16 = a;
+ transmute(r)
+}
+
+/// Converts an integer mask into a bitmask, storing the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_int2mask&expand=3189)
+#[inline]
+#[target_feature(enable = "avx512f")] // generates normal mov code instead of kmovw
+pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
+ let r: u16 = mask as u16;
+ transmute(r)
+}
+
+/// Converts a bit mask k1 into an integer value, storing the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(mov))] // generates normal mov code instead of kmovw
+pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
+ let r: i32 = k1 as i32;
+ transmute(r)
+}
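+
+// int2mask and mask2int are plain integer conversions, so round-tripping
+// preserves the low 16 bits:
+//
+//     // _mm512_mask2int(_mm512_int2mask(0x1234)) == 0x1234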
+
+/// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kunpackb&expand=3280)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(mov))] // generates normal integer ops instead of kunpckbw
+pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
+    // Per Intel's pseudocode: dst[15:8] := a[7:0], dst[7:0] := b[7:0].
+    let a = a & 0b00000000_11111111;
+    let b = b & 0b00000000_11111111;
+    transmute((a << 8) | b)
+}
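+
+// Per Intel's pseudocode, dst[15:8] := a[7:0] and dst[7:0] := b[7:0], so:
+//
+//     // _mm512_kunpackb(0x00AB, 0x00CD) == 0xABCD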
+
+/// Performs bitwise OR between k1 and k2, storing the result in dst. The CF flag is set if dst consists of all 1's.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kortestc&expand=3247)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(cmp))] // generates normal cmp code instead of kortestw
+pub unsafe fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 {
+ let r = a | b;
+ if r == 0b11111111_11111111 {
+ 1
+ } else {
+ 0
+ }
+}
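+
+// A sketch of the "all lanes covered" check that kortestc models, assuming
+// an AVX-512F-capable target (`covers_all_lanes` is a hypothetical helper):
+//
+//     unsafe fn covers_all_lanes(k1: __mmask16, k2: __mmask16) -> bool {
+//         _mm512_kortestc(k1, k2) != 0
+//     }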
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5890)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestmd))]
+pub unsafe fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ let and = _mm512_and_epi32(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpneq_epi32_mask(and, zero)
+}
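+
+// vptestmd answers "does a & b have any bit set?" per 32-bit lane; a sketch
+// assuming an AVX-512F-capable target (`lanes_with_any_flag` is a
+// hypothetical helper):
+//
+//     unsafe fn lanes_with_any_flag(v: __m512i, flags: i32) -> __mmask16 {
+//         // Bit i of the result is set when lane i of v has any of `flags`
+//         // set.
+//         _mm512_test_epi32_mask(v, _mm512_set1_epi32(flags))
+//     }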
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5889)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestmd))]
+pub unsafe fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ let and = _mm512_and_epi32(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpneq_epi32_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_test_epi32_mask&expand=5888)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmd))]
+pub unsafe fn _mm256_test_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_cmpneq_epi32_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_test_epi32_mask&expand=5887)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmd))]
+pub unsafe fn _mm256_mask_test_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_mask_cmpneq_epi32_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_epi32_mask&expand=5886)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmd))]
+pub unsafe fn _mm_test_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_cmpneq_epi32_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_test_epi32_mask&expand=5885)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmd))]
+pub unsafe fn _mm_mask_test_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_mask_cmpneq_epi32_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5896)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestmq))]
+pub unsafe fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ let and = _mm512_and_epi64(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpneq_epi64_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5895)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestmq))]
+pub unsafe fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ let and = _mm512_and_epi64(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpneq_epi64_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_test_epi64_mask&expand=5894)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmq))]
+pub unsafe fn _mm256_test_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_cmpneq_epi64_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_test_epi64_mask&expand=5893)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmq))]
+pub unsafe fn _mm256_mask_test_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_mask_cmpneq_epi64_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_epi64_mask&expand=5892)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmq))]
+pub unsafe fn _mm_test_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_cmpneq_epi64_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_test_epi64_mask&expand=5891)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestmq))]
+pub unsafe fn _mm_mask_test_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_mask_cmpneq_epi64_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5921)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmd))]
+pub unsafe fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ let and = _mm512_and_epi32(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpeq_epi32_mask(and, zero)
+}
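+
+// testn is the complement of test: bit i is set when lane i of a & b is
+// zero, so for full 16-lane masks:
+//
+//     // _mm512_testn_epi32_mask(a, b) == _mm512_knot(_mm512_test_epi32_mask(a, b))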
+
+/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5920)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmd))]
+pub unsafe fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ let and = _mm512_and_epi32(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpeq_epi32_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testn_epi32_mask&expand=5919)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmd))]
+pub unsafe fn _mm256_testn_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_cmpeq_epi32_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_testn_epi32_mask&expand=5918)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmd))]
+pub unsafe fn _mm256_mask_testn_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_mask_cmpeq_epi32_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testn_epi32_mask&expand=5917)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmd))]
+pub unsafe fn _mm_testn_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_cmpeq_epi32_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_testn_epi32_mask&expand=5916)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmd))]
+pub unsafe fn _mm_mask_testn_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_mask_cmpeq_epi32_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi64_mask&expand=5927)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ let and = _mm512_and_epi64(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpeq_epi64_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5926)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ let and = _mm512_and_epi64(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpeq_epi64_mask(k, and, zero)
+}
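+
+// A hedged sketch (demo name hypothetical): the writemask limits which lanes
+// may set a result bit, so even though all 8 lanes AND to zero here, only the
+// lanes selected by k survive in the output mask.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_mask_testn_epi64() {
+    let a = _mm512_set1_epi64(1 << 40);
+    let b = _mm512_set1_epi64(1 << 41);
+    assert_eq!(_mm512_mask_testn_epi64_mask(0b0000_1111, a, b), 0b0000_1111);
+}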
+
+/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testn_epi64_mask&expand=5925)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm256_testn_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_cmpeq_epi64_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_testn_epi64_mask&expand=5924)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm256_mask_testn_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ let and = _mm256_and_si256(a, b);
+ let zero = _mm256_setzero_si256();
+ _mm256_mask_cmpeq_epi64_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testn_epi64_mask&expand=5923)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm_testn_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_cmpeq_epi64_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_testn_epi64_mask&expand=5922)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ let and = _mm_and_si128(a, b);
+ let zero = _mm_setzero_si128();
+ _mm_mask_cmpeq_epi64_mask(k, and, zero)
+}
+
+/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_ps&expand=5671)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovntps))]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m512, a);
+}
+
+/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_pd&expand=5667)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m512d, a);
+}
+
+/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_si512&expand=5675)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m512i, a);
+}
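+
+// A hedged sketch of satisfying the 64-byte alignment contract above (the
+// wrapper type and demo name are illustrative, not upstream API); the fence
+// after the non-temporal store is the usual idiom before other cores read.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_stream_ps() {
+    #[repr(align(64))]
+    struct Aligned([f32; 16]);
+    let mut out = Aligned([0.0; 16]);
+    _mm512_stream_ps(out.0.as_mut_ptr(), _mm512_set1_ps(1.0));
+    _mm_sfence(); // make the streamed data globally visible
+    assert_eq!(out.0, [1.0; 16]);
+}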
+
+/// Sets packed single-precision (32-bit) floating-point elements in `dst` with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ps&expand=4931)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set_ps(
+ e0: f32,
+ e1: f32,
+ e2: f32,
+ e3: f32,
+ e4: f32,
+ e5: f32,
+ e6: f32,
+ e7: f32,
+ e8: f32,
+ e9: f32,
+ e10: f32,
+ e11: f32,
+ e12: f32,
+ e13: f32,
+ e14: f32,
+ e15: f32,
+) -> __m512 {
+ _mm512_setr_ps(
+ e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
+ )
+}
+
+/// Sets packed single-precision (32-bit) floating-point elements in `dst` with
+/// the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ps&expand=5008)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr_ps(
+ e0: f32,
+ e1: f32,
+ e2: f32,
+ e3: f32,
+ e4: f32,
+ e5: f32,
+ e6: f32,
+ e7: f32,
+ e8: f32,
+ e9: f32,
+ e10: f32,
+ e11: f32,
+ e12: f32,
+ e13: f32,
+ e14: f32,
+ e15: f32,
+) -> __m512 {
+ let r = f32x16::new(
+ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
+ );
+ transmute(r)
+}
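+
+// Illustrative sketch (demo name hypothetical): `_mm512_set_ps` takes its
+// arguments highest element first while `_mm512_setr_ps` takes them in memory
+// order, so these two calls construct the same vector.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_set_vs_setr_ps() {
+    let a = _mm512_set_ps(
+        15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+    );
+    let b = _mm512_setr_ps(
+        0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+    );
+    assert_eq!(_mm512_cmpeq_ps_mask(a, b), 0xFFFF);
+}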
+
+/// Broadcast double-precision (64-bit) floating-point value `a` to all elements of `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_pd&expand=4975)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d {
+ transmute(f64x8::splat(a))
+}
+
+/// Broadcast single-precision (32-bit) floating-point value `a` to all elements of `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ps&expand=4981)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 {
+ transmute(f32x16::splat(a))
+}
+
+/// Sets packed 32-bit integers in `dst` with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi32&expand=4908)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set_epi32(
+ e15: i32,
+ e14: i32,
+ e13: i32,
+ e12: i32,
+ e11: i32,
+ e10: i32,
+ e9: i32,
+ e8: i32,
+ e7: i32,
+ e6: i32,
+ e5: i32,
+ e4: i32,
+ e3: i32,
+ e2: i32,
+ e1: i32,
+ e0: i32,
+) -> __m512i {
+ _mm512_setr_epi32(
+ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
+ )
+}
+
+/// Broadcast 8-bit integer a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi8&expand=4972)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_epi8(a: i8) -> __m512i {
+ transmute(i8x64::splat(a))
+}
+
+/// Broadcast 16-bit integer `a` to all elements of `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi16&expand=4944)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_epi16(a: i16) -> __m512i {
+ transmute(i16x32::splat(a))
+}
+
+/// Broadcast 32-bit integer `a` to all elements of `dst`.
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i {
+ transmute(i32x16::splat(a))
+}
+
+/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=4951)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcastd))]
+pub unsafe fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i {
+ let r = _mm512_set1_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=4952)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcastd))]
+pub unsafe fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i {
+ let r = _mm512_set1_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
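+
+// A small sketch contrasting the two masking modes (demo name hypothetical):
+// the writemask variant keeps unselected lanes from `src`, the zeromask
+// variant zeroes them, so the results agree only on the selected lanes.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_set1_masking() {
+    let src = _mm512_set1_epi32(-1);
+    let a = _mm512_mask_set1_epi32(src, 0x00FF, 7); // upper lanes stay -1
+    let b = _mm512_maskz_set1_epi32(0x00FF, 7); // upper lanes become 0
+    assert_eq!(_mm512_cmpeq_epi32_mask(a, b), 0x00FF);
+}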
+
+/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_set1_epi32&expand=4948)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastd))]
+pub unsafe fn _mm256_mask_set1_epi32(src: __m256i, k: __mmask8, a: i32) -> __m256i {
+ let r = _mm256_set1_epi32(a).as_i32x8();
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_set1_epi32&expand=4949)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastd))]
+pub unsafe fn _mm256_maskz_set1_epi32(k: __mmask8, a: i32) -> __m256i {
+ let r = _mm256_set1_epi32(a).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_set1_epi32&expand=4945)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastd))]
+pub unsafe fn _mm_mask_set1_epi32(src: __m128i, k: __mmask8, a: i32) -> __m128i {
+ let r = _mm_set1_epi32(a).as_i32x4();
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_set1_epi32&expand=4946)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastd))]
+pub unsafe fn _mm_maskz_set1_epi32(k: __mmask8, a: i32) -> __m128i {
+ let r = _mm_set1_epi32(a).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 64-bit integer `a` to all elements of `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi64&expand=4961)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
+ transmute(i64x8::splat(a))
+}
+
+/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=4959)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+pub unsafe fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i {
+ let r = _mm512_set1_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, r, src.as_i64x8()))
+}
+
+/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=4960)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+pub unsafe fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i {
+ let r = _mm512_set1_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_set1_epi64&expand=4957)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+pub unsafe fn _mm256_mask_set1_epi64(src: __m256i, k: __mmask8, a: i64) -> __m256i {
+ let r = _mm256_set1_epi64x(a).as_i64x4();
+ transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+}
+
+/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_set1_epi64&expand=4958)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+pub unsafe fn _mm256_maskz_set1_epi64(k: __mmask8, a: i64) -> __m256i {
+ let r = _mm256_set1_epi64x(a).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_set1_epi64&expand=4954)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+pub unsafe fn _mm_mask_set1_epi64(src: __m128i, k: __mmask8, a: i64) -> __m128i {
+ let r = _mm_set1_epi64x(a).as_i64x2();
+ transmute(simd_select_bitmask(k, r, src.as_i64x2()))
+}
+
+/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_set1_epi64&expand=4955)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+pub unsafe fn _mm_maskz_set1_epi64(k: __mmask8, a: i64) -> __m128i {
+ let r = _mm_set1_epi64x(a).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Set packed 64-bit integers in dst with the repeated 4 element sequence.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set4_epi64&expand=4983)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i {
+ let r = i64x8::new(d, c, b, a, d, c, b, a);
+ transmute(r)
+}
+
+/// Set packed 64-bit integers in dst with the repeated 4 element sequence in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_epi64&expand=5010)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i {
+ let r = i64x8::new(a, b, c, d, a, b, c, d);
+ transmute(r)
+}
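+
+// Illustrative sketch (demo name hypothetical): per the constructors above,
+// `set4` takes the repeated sequence highest element first and `setr4` takes
+// it in lane order, so these two calls build the same vector.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_set4_epi64() {
+    let a = _mm512_set4_epi64(3, 2, 1, 0);
+    let b = _mm512_setr4_epi64(0, 1, 2, 3);
+    assert_eq!(_mm512_cmpeq_epi64_mask(a, b), 0xFF);
+}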
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_ps_mask&expand=1074)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+ _mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_ps_mask&expand=1075)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+ _mm512_mask_cmp_ps_mask::<_CMP_LT_OS>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnlt_ps_mask&expand=1154)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+ _mm512_cmp_ps_mask::<_CMP_NLT_US>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnlt_ps_mask&expand=1155)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+ _mm512_mask_cmp_ps_mask::<_CMP_NLT_US>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_ps_mask&expand=1013)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+ _mm512_cmp_ps_mask::<_CMP_LE_OS>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_ps_mask&expand=1014)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+ _mm512_mask_cmp_ps_mask::<_CMP_LE_OS>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnle_ps_mask&expand=1146)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+ _mm512_cmp_ps_mask::<_CMP_NLE_US>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnle_ps_mask&expand=1147)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+ _mm512_mask_cmp_ps_mask::<_CMP_NLE_US>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_ps_mask&expand=828)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+ _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_ps_mask&expand=829)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+ _mm512_mask_cmp_ps_mask::<_CMP_EQ_OQ>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_ps_mask&expand=1130)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+ _mm512_cmp_ps_mask::<_CMP_NEQ_UQ>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_ps_mask&expand=1131)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+ _mm512_mask_cmp_ps_mask::<_CMP_NEQ_UQ>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ps_mask&expand=749)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm512_cmp_ps_mask<const IMM8: i32>(a: __m512, b: __m512) -> __mmask16 {
+ static_assert_imm5!(IMM8);
+ let neg_one = -1;
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vcmpps(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
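+
+// Sketch (demo name hypothetical): the comparison predicate is supplied as a
+// const generic, so `_CMP_LT_OS` here reproduces `_mm512_cmplt_ps_mask` above.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_cmp_ps_mask() {
+    let a = _mm512_set1_ps(1.0);
+    let b = _mm512_set1_ps(2.0);
+    assert_eq!(_mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b), 0xFFFF);
+}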
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ps_mask&expand=750)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm512_mask_cmp_ps_mask<const IMM8: i32>(
+ k1: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __mmask16 {
+ static_assert_imm5!(IMM8);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vcmpps(a, b, IMM8, k1 as i16, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ps_mask&expand=747)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm256_cmp_ps_mask<const IMM8: i32>(a: __m256, b: __m256) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let neg_one = -1;
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let r = vcmpps256(a, b, IMM8, neg_one);
+ transmute(r)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ps_mask&expand=748)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm256_mask_cmp_ps_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m256,
+ b: __m256,
+) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let r = vcmpps256(a, b, IMM8, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ps_mask&expand=745)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm_cmp_ps_mask<const IMM8: i32>(a: __m128, b: __m128) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let neg_one = -1;
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let r = vcmpps128(a, b, IMM8, neg_one);
+ transmute(r)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ps_mask&expand=746)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm_mask_cmp_ps_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let r = vcmpps128(a, b, IMM8, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ps_mask&expand=753)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm512_cmp_round_ps_mask<const IMM5: i32, const SAE: i32>(
+ a: __m512,
+ b: __m512,
+) -> __mmask16 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let neg_one = -1;
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vcmpps(a, b, IMM5, neg_one, SAE);
+ transmute(r)
+}
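+
+// Sketch (demo name hypothetical): `_MM_FROUND_NO_EXC` in the SAE position
+// suppresses exception reporting, and `_CMP_NEQ_UQ` is true when either
+// operand is NaN, so a NaN input sets every mask bit.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_cmp_round_ps_mask() {
+    let a = _mm512_set1_ps(f32::NAN);
+    let b = _mm512_set1_ps(0.0);
+    let m = _mm512_cmp_round_ps_mask::<_CMP_NEQ_UQ, _MM_FROUND_NO_EXC>(a, b);
+    assert_eq!(m, 0xFFFF);
+}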
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ps_mask&expand=754)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm512_mask_cmp_round_ps_mask<const IMM5: i32, const SAE: i32>(
+ m: __mmask16,
+ a: __m512,
+ b: __m512,
+) -> __mmask16 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x16();
+ let b = b.as_f32x16();
+ let r = vcmpps(a, b, IMM5, m as i16, SAE);
+ transmute(r)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpord_ps_mask&expand=1162)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+ _mm512_cmp_ps_mask::<_CMP_ORD_Q>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpord_ps_mask&expand=1163)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+ _mm512_mask_cmp_ps_mask::<_CMP_ORD_Q>(k1, a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpunord_ps_mask&expand=1170)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+ _mm512_cmp_ps_mask::<_CMP_UNORD_Q>(a, b)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpunord_ps_mask&expand=1171)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
+pub unsafe fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+ _mm512_mask_cmp_ps_mask::<_CMP_UNORD_Q>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_pd_mask&expand=1071)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_cmp_pd_mask::<_CMP_LT_OS>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_pd_mask&expand=1072)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_mask_cmp_pd_mask::<_CMP_LT_OS>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnlt_pd_mask&expand=1151)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_cmp_pd_mask::<_CMP_NLT_US>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnlt_pd_mask&expand=1152)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_mask_cmp_pd_mask::<_CMP_NLT_US>(m, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_pd_mask&expand=1010)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_cmp_pd_mask::<_CMP_LE_OS>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_pd_mask&expand=1011)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_mask_cmp_pd_mask::<_CMP_LE_OS>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnle_pd_mask&expand=1143)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_cmp_pd_mask::<_CMP_NLE_US>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnle_pd_mask&expand=1144)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_mask_cmp_pd_mask::<_CMP_NLE_US>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_pd_mask&expand=822)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_pd_mask&expand=823)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_mask_cmp_pd_mask::<_CMP_EQ_OQ>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_pd_mask&expand=1127)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_cmp_pd_mask::<_CMP_NEQ_UQ>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_pd_mask&expand=1128)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_mask_cmp_pd_mask::<_CMP_NEQ_UQ>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_pd_mask&expand=741)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm512_cmp_pd_mask<const IMM8: i32>(a: __m512d, b: __m512d) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let neg_one = -1;
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vcmppd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_pd_mask&expand=742)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm512_mask_cmp_pd_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vcmppd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_pd_mask&expand=739)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm256_cmp_pd_mask<const IMM8: i32>(a: __m256d, b: __m256d) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let neg_one = -1;
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let r = vcmppd256(a, b, IMM8, neg_one);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_pd_mask&expand=740)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm256_mask_cmp_pd_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m256d,
+ b: __m256d,
+) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let r = vcmppd256(a, b, IMM8, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_pd_mask&expand=737)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm_cmp_pd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let neg_one = -1;
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let r = vcmppd128(a, b, IMM8, neg_one);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_pd_mask&expand=738)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm_mask_cmp_pd_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let r = vcmppd128(a, b, IMM8, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_pd_mask&expand=751)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm512_cmp_round_pd_mask<const IMM5: i32, const SAE: i32>(
+ a: __m512d,
+ b: __m512d,
+) -> __mmask8 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let neg_one = -1;
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vcmppd(a, b, IMM5, neg_one, SAE);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_pd_mask&expand=752)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm512_mask_cmp_round_pd_mask<const IMM5: i32, const SAE: i32>(
+ k1: __mmask8,
+ a: __m512d,
+ b: __m512d,
+) -> __mmask8 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x8();
+ let b = b.as_f64x8();
+ let r = vcmppd(a, b, IMM5, k1 as i8, SAE);
+ transmute(r)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpord_pd_mask&expand=1159)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_cmp_pd_mask::<_CMP_ORD_Q>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpord_pd_mask&expand=1160)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_mask_cmp_pd_mask::<_CMP_ORD_Q>(k1, a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpunord_pd_mask&expand=1167)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_cmp_pd_mask::<_CMP_UNORD_Q>(a, b)
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpunord_pd_mask&expand=1168)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
+pub unsafe fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+ _mm512_mask_cmp_pd_mask::<_CMP_UNORD_Q>(k1, a, b)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss_mask&expand=763)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm_cmp_ss_mask<const IMM8: i32>(a: __m128, b: __m128) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let neg_one = -1;
+ let r = vcmpss(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
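+
+// Sketch (demo name hypothetical): only element 0 participates in the scalar
+// compare, so the result mask is either 0 or 1.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_cmp_ss_mask() {
+    let a = _mm_set_ss(1.0);
+    let b = _mm_set_ss(2.0);
+    assert_eq!(_mm_cmp_ss_mask::<_CMP_LT_OS>(a, b), 1);
+}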
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ss_mask&expand=764)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm_mask_cmp_ss_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let r = vcmpss(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(
+ a: __m128,
+ b: __m128,
+) -> __mmask8 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let neg_one = -1;
+ let r = vcmpss(a, b, IMM5, neg_one, SAE);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=758)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_mask_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(
+ k1: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __mmask8 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let r = vcmpss(a, b, IMM5, k1 as i8, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=760)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm_cmp_sd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let neg_one = -1;
+ let r = vcmpsd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=761)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
+pub unsafe fn _mm_mask_cmp_sd_mask<const IMM8: i32>(
+ k1: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __mmask8 {
+ static_assert_imm5!(IMM8);
+ let r = vcmpsd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=755)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_cmp_round_sd_mask<const IMM5: i32, const SAE: i32>(
+ a: __m128d,
+ b: __m128d,
+) -> __mmask8 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let neg_one = -1;
+ let r = vcmpsd(a, b, IMM5, neg_one, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=756)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_mask_cmp_round_sd_mask<const IMM5: i32, const SAE: i32>(
+ k1: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __mmask8 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let r = vcmpsd(a, b, IMM5, k1 as i8, SAE);
+ transmute(r)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epu32_mask&expand=1056)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<u32x16, _>(simd_lt(a.as_u32x16(), b.as_u32x16()))
+}
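+
+// Sketch (demo name hypothetical): the ordering is unsigned, so -1
+// reinterpreted as u32 is the maximum value rather than the minimum.
+#[cfg(test)]
+#[target_feature(enable = "avx512f")]
+unsafe fn demo_cmplt_epu32() {
+    let max = _mm512_set1_epi32(-1); // u32::MAX in every lane
+    let one = _mm512_set1_epi32(1);
+    assert_eq!(_mm512_cmplt_epu32_mask(max, one), 0);
+    assert_eq!(_mm512_cmplt_epu32_mask(one, max), 0xFFFF);
+}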
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epu32_mask&expand=1057)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmplt_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epu32_mask&expand=1054)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<u32x8, _>(simd_lt(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epu32_mask&expand=1055)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmplt_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epu32_mask&expand=1052)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u32x4, _>(simd_lt(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epu32_mask&expand=1053)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmplt_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epu32_mask&expand=933)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<u32x16, _>(simd_gt(a.as_u32x16(), b.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epu32_mask&expand=934)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpgt_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epu32_mask&expand=931)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<u32x8, _>(simd_gt(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epu32_mask&expand=932)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpgt_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epu32_mask&expand=929)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u32x4, _>(simd_gt(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epu32_mask&expand=930)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpgt_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epu32_mask&expand=995)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<u32x16, _>(simd_le(a.as_u32x16(), b.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epu32_mask&expand=996)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmple_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epu32_mask&expand=993)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<u32x8, _>(simd_le(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epu32_mask&expand=994)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmple_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epu32_mask&expand=991)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u32x4, _>(simd_le(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epu32_mask&expand=992)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmple_epu32_mask(a, b) & k1
+}
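+
+// A minimal illustrative sketch (hypothetical test, not part of the file's
+// own suite): lane-wise, less-than-or-equal is exactly less-than OR equal.
+#[cfg(test)]
+mod cmple_epu32_mask_sketch {
+    use crate::core_arch::x86::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn sketch_mm512_cmple_epu32_mask() {
+        // Lanes 0..=15 hold 0..=15; compare against a splat of 7.
+        let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let b = _mm512_set1_epi32(7);
+        let lt = _mm512_cmplt_epu32_mask(a, b); // lanes 0..=6 -> 0x007F
+        let eq = _mm512_cmpeq_epu32_mask(a, b); // lane 7      -> 0x0080
+        let le = _mm512_cmple_epu32_mask(a, b); // lanes 0..=7 -> 0x00FF
+        assert_eq!(le, lt | eq);
+        assert_eq!(le, 0x00FF);
+    }
+}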
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epu32_mask&expand=873)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<u32x16, _>(simd_ge(a.as_u32x16(), b.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epu32_mask&expand=874)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpge_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epu32_mask&expand=871)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<u32x8, _>(simd_ge(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epu32_mask&expand=872)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpge_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epu32_mask&expand=869)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u32x4, _>(simd_ge(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epu32_mask&expand=870)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpge_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epu32_mask&expand=807)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<u32x16, _>(simd_eq(a.as_u32x16(), b.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epu32_mask&expand=808)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpeq_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epu32_mask&expand=805)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<u32x8, _>(simd_eq(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epu32_mask&expand=806)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpeq_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epu32_mask&expand=803)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u32x4, _>(simd_eq(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epu32_mask&expand=804)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpeq_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epu32_mask&expand=1112)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<u32x16, _>(simd_ne(a.as_u32x16(), b.as_u32x16()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epu32_mask&expand=1113)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpneq_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epu32_mask&expand=1110)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<u32x8, _>(simd_ne(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epu32_mask&expand=1111)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpneq_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epu32_mask&expand=1108)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<u32x4, _>(simd_ne(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epu32_mask&expand=1109)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
+pub unsafe fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpneq_epu32_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu32_mask&expand=721)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm512_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ a: __m512i,
+ b: __m512i,
+) -> __mmask16 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let r = vpcmpud(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu32_mask&expand=722)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm512_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __mmask16 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let r = vpcmpud(a, b, IMM3, k1 as i16);
+ transmute(r)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epu32_mask&expand=719)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm256_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ a: __m256i,
+ b: __m256i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let r = vpcmpud256(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epu32_mask&expand=720)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm256_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let r = vpcmpud256(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epu32_mask&expand=717)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let r = vpcmpud128(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epu32_mask&expand=718)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let r = vpcmpud128(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
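+
+// A minimal illustrative sketch (hypothetical test, not part of the file's
+// own suite): the generic `cmp` form with an `_MM_CMPINT_*` operand subsumes
+// the fixed-predicate helpers above, e.g. `_MM_CMPINT_LE` reproduces
+// `_mm512_cmple_epu32_mask`.
+#[cfg(test)]
+mod cmp_epu32_mask_sketch {
+    use crate::core_arch::x86::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn sketch_mm512_cmp_epu32_mask() {
+        let a = _mm512_set1_epi32(3);
+        let b = _mm512_set1_epi32(7);
+        let generic = _mm512_cmp_epu32_mask::<_MM_CMPINT_LE>(a, b);
+        assert_eq!(generic, _mm512_cmple_epu32_mask(a, b));
+        // The zeromasked form is the same compare ANDed with k1.
+        let k1: __mmask16 = 0x00FF;
+        assert_eq!(_mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b), generic & k1);
+    }
+}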
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epi32_mask&expand=1029)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<i32x16, _>(simd_lt(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epi32_mask&expand=1031)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmplt_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epi32_mask&expand=1027)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<i32x8, _>(simd_lt(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epi32_mask&expand=1028)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmplt_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi32_mask&expand=1025)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epi32_mask&expand=1026)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmplt_epi32_mask(a, b) & k1
+}
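+
+// A minimal illustrative sketch (hypothetical test, not part of the file's
+// own suite): the epi32/epu32 pairs differ only in how lane bits are
+// interpreted; -1 is below 0 when signed but is u32::MAX when unsigned.
+#[cfg(test)]
+mod cmplt_epi32_mask_sketch {
+    use crate::core_arch::x86::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn sketch_mm512_cmplt_epi32_mask() {
+        let a = _mm512_set1_epi32(-1);
+        let b = _mm512_setzero_si512();
+        // Signed: -1 < 0 in every lane.
+        assert_eq!(_mm512_cmplt_epi32_mask(a, b), 0xFFFF);
+        // Unsigned: the same bits read as u32::MAX, which is never below 0.
+        assert_eq!(_mm512_cmplt_epu32_mask(a, b), 0);
+    }
+}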
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epi32_mask&expand=905)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<i32x16, _>(simd_gt(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epi32_mask&expand=906)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpgt_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32_mask&expand=903)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epi32_mask&expand=904)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpgt_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi32_mask&expand=901)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epi32_mask&expand=902)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpgt_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epi32_mask&expand=971)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<i32x16, _>(simd_le(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epi32_mask&expand=972)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmple_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epi32_mask&expand=969)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<i32x8, _>(simd_le(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epi32_mask&expand=970)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmple_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epi32_mask&expand=967)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i32x4, _>(simd_le(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epi32_mask&expand=968)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmple_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epi32_mask&expand=849)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<i32x16, _>(simd_ge(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epi32_mask&expand=850)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpge_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epi32_mask&expand=847)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<i32x8, _>(simd_ge(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epi32_mask&expand=848)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpge_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epi32_mask&expand=845)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i32x4, _>(simd_ge(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epi32_mask&expand=846)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpge_epi32_mask(a, b) & k1
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epi32_mask&expand=779)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<i32x16, _>(simd_eq(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epi32_mask&expand=780)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpeq_epi32_mask(a, b) & k1
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32_mask&expand=777)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epi32_mask&expand=778)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpeq_epi32_mask(a, b) & k1
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi32_mask&expand=775)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epi32_mask&expand=776)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpeq_epi32_mask(a, b) & k1
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epi32_mask&expand=1088)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::<i32x16, _>(simd_ne(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epi32_mask&expand=1089)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpneq_epi32_mask(a, b) & k1
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epi32_mask&expand=1086)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<i32x8, _>(simd_ne(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epi32_mask&expand=1087)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpneq_epi32_mask(a, b) & k1
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epi32_mask&expand=1084)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<i32x4, _>(simd_ne(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epi32_mask&expand=1085)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
+pub unsafe fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpneq_epi32_mask(a, b) & k1
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi32_mask&expand=697)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm512_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ a: __m512i,
+ b: __m512i,
+) -> __mmask16 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let r = vpcmpd(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi32_mask&expand=698)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm512_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __mmask16 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i32x16();
+ let b = b.as_i32x16();
+ let r = vpcmpd(a, b, IMM3, k1 as i16);
+ transmute(r)
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epi32_mask&expand=695)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm256_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ a: __m256i,
+ b: __m256i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let r = vpcmpd256(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epi32_mask&expand=696)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm256_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let r = vpcmpd256(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epi32_mask&expand=693)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let r = vpcmpd128(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epi32_mask&expand=694)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let r = vpcmpd128(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
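+
+// A minimal illustrative sketch (hypothetical test, not part of the file's
+// own suite): `_MM_CMPINT_NLT` ("not less-than") on the generic signed
+// compare behaves as greater-than-or-equal.
+#[cfg(test)]
+mod cmp_epi32_mask_sketch {
+    use crate::core_arch::x86::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn sketch_mm_cmp_epi32_mask() {
+        let a = _mm_setr_epi32(-2, -1, 0, 1);
+        let b = _mm_set1_epi32(0);
+        let nlt = _mm_cmp_epi32_mask::<_MM_CMPINT_NLT>(a, b);
+        assert_eq!(nlt, _mm_cmpge_epi32_mask(a, b)); // lanes 2 and 3 -> 0b1100
+        assert_eq!(nlt, 0b1100);
+    }
+}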
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epu64_mask&expand=1062)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epu64_mask&expand=1063)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmplt_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epu64_mask&expand=1060)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_lt(a.as_u64x4(), b.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epu64_mask&expand=1061)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmplt_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epu64_mask&expand=1058)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_lt(a.as_u64x2(), b.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epu64_mask&expand=1059)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmplt_epu64_mask(a, b) & k1
+}
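+
+// A minimal illustrative sketch (hypothetical test, not part of the file's
+// own suite): the 128-bit epu64 compare has only two lanes, so only bits 0
+// and 1 of the returned `__mmask8` can ever be set.
+#[cfg(test)]
+mod cmplt_epu64_mask_sketch {
+    use crate::core_arch::x86::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn sketch_mm_cmplt_epu64_mask() {
+        let a = _mm_set_epi64x(1, 5); // lane 1 = 1, lane 0 = 5
+        let b = _mm_set1_epi64x(3);
+        // Only lane 1 (value 1) is below 3, so only bit 1 is set.
+        assert_eq!(_mm_cmplt_epu64_mask(a, b), 0b10);
+        // A zeromask of 0b01 clears that bit.
+        assert_eq!(_mm_mask_cmplt_epu64_mask(0b01, a, b), 0);
+    }
+}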
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epu64_mask&expand=939)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epu64_mask&expand=940)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmpgt_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epu64_mask&expand=937)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_gt(a.as_u64x4(), b.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epu64_mask&expand=938)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpgt_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epu64_mask&expand=935)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_gt(a.as_u64x2(), b.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epu64_mask&expand=936)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpgt_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epu64_mask&expand=1001)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epu64_mask&expand=1002)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmple_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epu64_mask&expand=999)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_le(a.as_u64x4(), b.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epu64_mask&expand=1000)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmple_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epu64_mask&expand=997)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_le(a.as_u64x2(), b.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epu64_mask&expand=998)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmple_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epu64_mask&expand=879)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epu64_mask&expand=880)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmpge_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epu64_mask&expand=877)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_ge(a.as_u64x4(), b.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epu64_mask&expand=878)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpge_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epu64_mask&expand=875)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_ge(a.as_u64x2(), b.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epu64_mask&expand=876)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpge_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epu64_mask&expand=813)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epu64_mask&expand=814)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmpeq_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epu64_mask&expand=811)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_eq(a.as_u64x4(), b.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epu64_mask&expand=812)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpeq_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epu64_mask&expand=809)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_eq(a.as_u64x2(), b.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epu64_mask&expand=810)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpeq_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epu64_mask&expand=1118)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epu64_mask&expand=1119)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmpneq_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epu64_mask&expand=1116)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_ne(a.as_u64x4(), b.as_u64x4()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epu64_mask&expand=1117)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpneq_epu64_mask(a, b) & k1
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epu64_mask&expand=1114)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_ne(a.as_u64x2(), b.as_u64x2()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epu64_mask&expand=1115)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
+pub unsafe fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpneq_epu64_mask(a, b) & k1
+}
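+
+// A minimal sketch, same assumptions as the sketch after
+// `_mm_mask_cmple_epu64_mask`: the equality and not-equal masks are exact
+// complements over the 8 lanes.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod cmpeq_cmpneq_epu64_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn eq_and_neq_are_complements() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ // Arguments run e7..e0, so element i holds the value i.
+ let a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let b = _mm512_set1_epi64(3);
+ let eq = _mm512_cmpeq_epu64_mask(a, b);
+ let ne = _mm512_cmpneq_epu64_mask(a, b);
+ assert_eq!(eq, 0b0000_1000); // only element 3 equals 3
+ assert_eq!(ne, 0b1111_0111);
+ assert_eq!(eq | ne, 0b1111_1111);
+ assert_eq!(eq & ne, 0);
+ }
+ }
+}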
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu64_mask&expand=727)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm512_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ a: __m512i,
+ b: __m512i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i64x8();
+ let b = b.as_i64x8();
+ let r = vpcmpuq(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu64_mask&expand=728)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm512_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i64x8();
+ let b = b.as_i64x8();
+ let r = vpcmpuq(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epu64_mask&expand=725)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm256_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ a: __m256i,
+ b: __m256i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i64x4();
+ let b = b.as_i64x4();
+ let r = vpcmpuq256(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epu64_mask&expand=726)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm256_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i64x4();
+ let b = b.as_i64x4();
+ let r = vpcmpuq256(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epu64_mask&expand=723)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i64x2();
+ let b = b.as_i64x2();
+ let r = vpcmpuq128(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epu64_mask&expand=724)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i64x2();
+ let b = b.as_i64x2();
+ let r = vpcmpuq128(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
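+
+// A minimal sketch, same assumptions as above: the const-generic `IMM3`
+// parameter selects the predicate via the `_MM_CMPINT_*` constants defined
+// elsewhere in this module (EQ=0, LT=1, LE=2, FALSE=3, NE=4, NLT=5, NLE=6,
+// TRUE=7), so `_mm512_cmp_epu64_mask::<_MM_CMPINT_LE>` agrees with the
+// dedicated `_mm512_cmple_epu64_mask` intrinsic.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod cmp_epu64_predicate_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn imm3_selects_the_comparison() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ let a = _mm512_set1_epi64(1);
+ let b = _mm512_set1_epi64(2);
+ assert_eq!(_mm512_cmp_epu64_mask::<_MM_CMPINT_LT>(a, b), 0b1111_1111);
+ assert_eq!(_mm512_cmp_epu64_mask::<_MM_CMPINT_EQ>(a, b), 0);
+ assert_eq!(
+ _mm512_cmp_epu64_mask::<_MM_CMPINT_LE>(a, b),
+ _mm512_cmple_epu64_mask(a, b)
+ );
+ }
+ }
+}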
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epi64_mask&expand=1037)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epi64_mask&expand=1038)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmplt_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epi64_mask&expand=1035)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_lt(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epi64_mask&expand=1036)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmplt_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi64_mask&expand=1033)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_lt(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epi64_mask&expand=1034)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmplt_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epi64_mask&expand=913)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epi64_mask&expand=914)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmpgt_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64_mask&expand=911)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epi64_mask&expand=912)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpgt_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi64_mask&expand=909)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_gt(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epi64_mask&expand=910)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpgt_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epi64_mask&expand=977)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epi64_mask&expand=978)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmple_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epi64_mask&expand=975)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_le(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epi64_mask&expand=976)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmple_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epi64_mask&expand=973)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_le(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epi64_mask&expand=974)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmple_epi64_mask(a, b) & k1
+}
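+
+// A minimal sketch, same assumptions as above: the epi64 and epu64 variants
+// differ only in how the lane's bit pattern is ordered. A lane holding -1
+// reads as a negative signed value but as u64::MAX unsigned.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod signed_vs_unsigned_cmp_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn minus_one_orders_differently() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ let a = _mm512_set1_epi64(-1);
+ let b = _mm512_setzero_si512();
+ // Signed: -1 <= 0 in every lane.
+ assert_eq!(_mm512_cmple_epi64_mask(a, b), 0b1111_1111);
+ // Unsigned: the same bits read as u64::MAX, which is > 0.
+ assert_eq!(_mm512_cmple_epu64_mask(a, b), 0);
+ }
+ }
+}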
+
+/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epi64_mask&expand=855)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epi64_mask&expand=856)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmpge_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epi64_mask&expand=853)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_ge(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epi64_mask&expand=854)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpge_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epi64_mask&expand=851)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_ge(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epi64_mask&expand=852)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpge_epi64_mask(a, b) & k1
+}
+
+/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epi64_mask&expand=787)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epi64_mask&expand=788)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmpeq_epi64_mask(a, b) & k1
+}
+
+/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64_mask&expand=785)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epi64_mask&expand=786)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpeq_epi64_mask(a, b) & k1
+}
+
+/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64_mask&expand=783)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_eq(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epi64_mask&expand=784)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpeq_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epi64_mask&expand=1094)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epi64_mask&expand=1095)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ _mm512_cmpneq_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epi64_mask&expand=1092)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
+ simd_bitmask::<__m256i, _>(simd_ne(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epi64_mask&expand=1093)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+ _mm256_cmpneq_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epi64_mask&expand=1090)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
+ simd_bitmask::<__m128i, _>(simd_ne(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epi64_mask&expand=1091)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
+pub unsafe fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+ _mm_cmpneq_epi64_mask(a, b) & k1
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi64_mask&expand=703)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm512_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ a: __m512i,
+ b: __m512i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i64x8();
+ let b = b.as_i64x8();
+ let r = vpcmpq(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi64_mask&expand=704)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm512_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i64x8();
+ let b = b.as_i64x8();
+ let r = vpcmpq(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epi64_mask&expand=701)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm256_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ a: __m256i,
+ b: __m256i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i64x4();
+ let b = b.as_i64x4();
+ let r = vpcmpq256(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epi64_mask&expand=702)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm256_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i64x4();
+ let b = b.as_i64x4();
+ let r = vpcmpq256(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epi64_mask&expand=699)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(2)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let neg_one = -1;
+ let a = a.as_i64x2();
+ let b = b.as_i64x2();
+ let r = vpcmpq128(a, b, IMM3, neg_one);
+ transmute(r)
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epi64_mask&expand=700)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[rustc_legacy_const_generics(3)]
+#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
+pub unsafe fn _mm_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
+ k1: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __mmask8 {
+ static_assert_imm3!(IMM3);
+ let a = a.as_i64x2();
+ let b = b.as_i64x2();
+ let r = vpcmpq128(a, b, IMM3, k1 as i8);
+ transmute(r)
+}
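+
+// A minimal sketch, same assumptions as above: for the const-generic
+// compares, the masked variant feeds `k1` straight into the underlying
+// `vpcmpq` write-mask, which is equivalent to AND-ing the unmasked result
+// with `k1`.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod mask_cmp_epi64_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn masked_predicate_equals_and_of_unmasked() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ let a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let b = _mm512_set1_epi64(3);
+ let k1: __mmask8 = 0b0101_0101;
+ // NLT means "not less than", i.e. a >= b: elements 3..=7.
+ let full = _mm512_cmp_epi64_mask::<_MM_CMPINT_NLT>(a, b);
+ assert_eq!(full, 0b1111_1000);
+ let masked = _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b);
+ assert_eq!(masked, full & k1); // 0b0101_0000
+ }
+ }
+}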
+
+/// Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi32&expand=4556)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_add_epi32(a: __m512i) -> i32 {
+ simd_reduce_add_unordered(a.as_i32x16())
+}
+
+/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi32&expand=4555)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
+ simd_reduce_add_unordered(simd_select_bitmask(
+ k,
+ a.as_i32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ ))
+}
+
+/// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=4558)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_add_epi64(a: __m512i) -> i64 {
+ simd_reduce_add_unordered(a.as_i64x8())
+}
+
+/// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi64&expand=4557)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_add_unordered(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_setzero_si512().as_i64x8(),
+ ))
+}
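+
+// A minimal sketch, same assumptions as above: the masked reductions
+// replace inactive lanes with the operation's identity element before the
+// horizontal step, 0 in the case of addition.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod reduce_add_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn inactive_lanes_contribute_zero() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ let a = _mm512_set1_epi64(3);
+ assert_eq!(_mm512_reduce_add_epi64(a), 24); // 8 lanes * 3
+ // Only the low 4 lanes are active: 4 * 3 = 12.
+ assert_eq!(_mm512_mask_reduce_add_epi64(0b0000_1111, a), 12);
+ }
+ }
+}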
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ps&expand=4562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 {
+ simd_reduce_add_unordered(a.as_f32x16())
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_ps&expand=4561)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 {
+ simd_reduce_add_unordered(simd_select_bitmask(
+ k,
+ a.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ ))
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_pd&expand=4560)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 {
+ simd_reduce_add_unordered(a.as_f64x8())
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_pd&expand=4559)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 {
+ simd_reduce_add_unordered(simd_select_bitmask(
+ k,
+ a.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ ))
+}
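+
+// A minimal sketch, same assumptions as above. Note that the floating-point
+// reductions lower to `simd_reduce_add_unordered`, so the adds may be
+// reassociated (e.g. tree-wise); for general data the result can differ in
+// the last bits from a strict left-to-right sum, and user code comparing
+// such sums should allow a tolerance.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod reduce_add_pd_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn sum_is_exact_for_exactly_representable_inputs() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ // 8 * 0.5 is exact under any association order.
+ let sum = _mm512_reduce_add_pd(_mm512_set1_pd(0.5));
+ assert!((sum - 4.0).abs() < 1e-12);
+ }
+ }
+}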
+
+/// Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi32&expand=4600)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 {
+ simd_reduce_mul_unordered(a.as_i32x16())
+}
+
+/// Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi32&expand=4599)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 {
+ simd_reduce_mul_unordered(simd_select_bitmask(
+ k,
+ a.as_i32x16(),
+ _mm512_set1_epi32(1).as_i32x16(),
+ ))
+}
+
+/// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi64&expand=4602)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 {
+ simd_reduce_mul_unordered(a.as_i64x8())
+}
+
+/// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi64&expand=4601)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_mul_unordered(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_set1_epi64(1).as_i64x8(),
+ ))
+}
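+
+// A minimal sketch, same assumptions as above: the multiplicative
+// reductions fill inactive lanes with 1, the identity for multiplication,
+// so masked-out lanes cannot change the product.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod reduce_mul_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn inactive_lanes_contribute_one() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ let a = _mm512_set1_epi64(5);
+ // Two active lanes: 5 * 5 = 25; the other six multiply by 1.
+ assert_eq!(_mm512_mask_reduce_mul_epi64(0b0000_0011, a), 25);
+ }
+ }
+}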
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ps&expand=4606)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 {
+ simd_reduce_mul_unordered(a.as_f32x16())
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_ps&expand=4605)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 {
+ simd_reduce_mul_unordered(simd_select_bitmask(
+ k,
+ a.as_f32x16(),
+ _mm512_set1_ps(1.).as_f32x16(),
+ ))
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_pd&expand=4604)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 {
+ simd_reduce_mul_unordered(a.as_f64x8())
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_pd&expand=4603)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 {
+ simd_reduce_mul_unordered(simd_select_bitmask(
+ k,
+ a.as_f64x8(),
+ _mm512_set1_pd(1.).as_f64x8(),
+ ))
+}
+
+/// Reduce the packed signed 32-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi32&expand=4576)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_max_epi32(a: __m512i) -> i32 {
+ simd_reduce_max(a.as_i32x16())
+}
+
+/// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi32&expand=4575)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 {
+ simd_reduce_max(simd_select_bitmask(
+ k,
+ a.as_i32x16(),
+ _mm512_set1_epi32(i32::MIN).as_i32x16(), // identity for signed max, so inactive lanes cannot win
+ ))
+}
+
+/// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi64&expand=4578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_max_epi64(a: __m512i) -> i64 {
+ simd_reduce_max(a.as_i64x8())
+}
+
+/// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi64&expand=4577)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_max(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_set1_epi64(i64::MIN).as_i64x8(), // identity for signed max; a zero fill would be wrong for all-negative inputs
+ ))
+}
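+
+// A minimal sketch, same assumptions as above: with the identity fill used
+// above (i64::MIN), a masked max over all-negative data stays correct; a
+// zero fill would have returned 0 instead.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod reduce_max_identity_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn masked_max_of_negatives() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ let a = _mm512_set1_epi64(-5);
+ assert_eq!(_mm512_mask_reduce_max_epi64(0b0000_0001, a), -5);
+ }
+ }
+}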
+
+/// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu32&expand=4580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_max_epu32(a: __m512i) -> u32 {
+ simd_reduce_max(a.as_u32x16())
+}
+
+/// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu32&expand=4579)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 {
+ simd_reduce_max(simd_select_bitmask(
+ k,
+ a.as_u32x16(),
+ _mm512_setzero_si512().as_u32x16(), // 0 is the identity for unsigned max
+ ))
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu64&expand=4582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_max_epu64(a: __m512i) -> u64 {
+ simd_reduce_max(a.as_u64x8())
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu64&expand=4581)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 {
+ simd_reduce_max(simd_select_bitmask(
+ k,
+ a.as_u64x8(),
+ _mm512_set1_epi64(0).as_u64x8(), // 0 is the identity for unsigned max
+ ))
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ps&expand=4586)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 {
+ simd_reduce_max(a.as_f32x16())
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_ps&expand=4585)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 {
+ simd_reduce_max(simd_select_bitmask(
+ k,
+ a.as_f32x16(),
+ _mm512_set1_ps(f32::NEG_INFINITY).as_f32x16(), // identity for max
+ ))
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_pd&expand=4584)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
+ simd_reduce_max(a.as_f64x8())
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_pd&expand=4583)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 {
+ simd_reduce_max(simd_select_bitmask(
+ k,
+ a.as_f64x8(),
+ _mm512_set1_pd(f64::NEG_INFINITY).as_f64x8(), // identity for max
+ ))
+}
+
+/// Reduce the packed signed 32-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi32&expand=4588)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_min_epi32(a: __m512i) -> i32 {
+ simd_reduce_min(a.as_i32x16())
+}
+
+/// Reduce the packed signed 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi32&expand=4587)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 {
+ simd_reduce_min(simd_select_bitmask(
+ k,
+ a.as_i32x16(),
+ _mm512_set1_epi32(i32::MAX).as_i32x16(), // identity for signed min
+ ))
+}
+
+/// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi64&expand=4590)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_min_epi64(a: __m512i) -> i64 {
+ simd_reduce_min(a.as_i64x8())
+}
+
+/// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi64&expand=4589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_min(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_set1_epi64(i64::MAX).as_i64x8(), // identity for signed min
+ ))
+}
+
+/// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu32&expand=4592)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_min_epu32(a: __m512i) -> u32 {
+ simd_reduce_min(a.as_u32x16())
+}
+
+/// Reduce the packed unsigned 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu32&expand=4591)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 {
+ simd_reduce_min(simd_select_bitmask(
+ k,
+ a.as_u32x16(),
+ _mm512_set1_epi32(-1).as_u32x16(), // all ones (u32::MAX) is the identity for unsigned min
+ ))
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu64&expand=4594)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_min_epu64(a: __m512i) -> u64 {
+ simd_reduce_min(a.as_u64x8())
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu64&expand=4593)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
+ simd_reduce_min(simd_select_bitmask(
+ k,
+ a.as_u64x8(),
+ _mm512_set1_epi64(-1).as_u64x8(), // all ones (u64::MAX) is the identity for unsigned min
+ ))
+}
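+
+// A minimal sketch, same assumptions as above: the unsigned masked min
+// fills inactive lanes with all ones (u64::MAX), so a mask with a single
+// active lane returns that lane's value.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod reduce_min_identity_sketch {
+ use core::arch::x86_64::*;
+
+ #[test]
+ fn masked_min_with_one_active_lane() {
+ if !is_x86_feature_detected!("avx512f") {
+ return;
+ }
+ unsafe {
+ let a = _mm512_set1_epi64(7);
+ assert_eq!(_mm512_mask_reduce_min_epu64(0b0000_0001, a), 7);
+ }
+ }
+}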
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ps&expand=4598)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 {
+ simd_reduce_min(a.as_f32x16())
+}
+
+/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_ps&expand=4597)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
+ simd_reduce_min(simd_select_bitmask(
+ k,
+ a.as_f32x16(),
+ _mm512_set1_ps(f32::INFINITY).as_f32x16(), // identity for min
+ ))
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_pd&expand=4596)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
+ simd_reduce_min(a.as_f64x8())
+}
+
+/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_pd&expand=4595)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 {
+ simd_reduce_min(simd_select_bitmask(
+ k,
+ a.as_f64x8(),
+ _mm512_set1_pd(f64::INFINITY).as_f64x8(), // identity for min
+ ))
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi32&expand=4564)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_and_epi32(a: __m512i) -> i32 {
+ simd_reduce_and(a.as_i32x16())
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi32&expand=4563)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 {
+ // Fill inactive lanes with -1 (all bits set), the identity of bitwise
+ // AND; the previous fill of 0xFF would have cleared every bit above
+ // the low byte whenever a lane was masked off.
+ simd_reduce_and(simd_select_bitmask(
+ k,
+ a.as_i32x16(),
+ _mm512_set1_epi32(-1).as_i32x16(),
+ ))
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi64&expand=4566)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_and_epi64(a: __m512i) -> i64 {
+ simd_reduce_and(a.as_i64x8())
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi64&expand=4565)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 {
+ // Fill inactive lanes with -1 (all bits set), the identity of bitwise
+ // AND; the previous fill of 0xFF would have zeroed all higher bits
+ // whenever a lane was masked off.
+ simd_reduce_and(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_set1_epi64(-1).as_i64x8(),
+ ))
+}
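+
+// Editor's illustration, not part of the upstream patch: inactive lanes are
+// replaced with -1 (all bits set), the identity of bitwise AND, so they can
+// never clear a bit in the result. The function name is hypothetical.
+#[cfg(all(test, target_feature = "avx512f"))]
+unsafe fn example_mask_reduce_and_epi64() {
+    let a = _mm512_set1_epi64(0b1010);
+    // Only lanes 0 and 1 participate; the result is 0b1010 & 0b1010.
+    assert_eq!(_mm512_mask_reduce_and_epi64(0b0000_0011, a), 0b1010);
+}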
+
+/// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi32&expand=4608)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_or_epi32(a: __m512i) -> i32 {
+ simd_reduce_or(a.as_i32x16())
+}
+
+/// Reduce the packed 32-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi32&expand=4607)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 {
+ simd_reduce_or(simd_select_bitmask(
+ k,
+ a.as_i32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ ))
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi64&expand=4610)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_or_epi64(a: __m512i) -> i64 {
+ simd_reduce_or(a.as_i64x8())
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi64&expand=4609)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_or(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_setzero_si512().as_i64x8(),
+ ))
+}
+
+/// Returns vector of type `__m512d` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+// This intrinsic has no corresponding instruction.
+pub unsafe fn _mm512_undefined_pd() -> __m512d {
+ _mm512_set1_pd(0.0)
+}
+
+/// Returns vector of type `__m512` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+// This intrinsic has no corresponding instruction.
+pub unsafe fn _mm512_undefined_ps() -> __m512 {
+ _mm512_set1_ps(0.0)
+}
+
+/// Returns vector of type `__m512i` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_epi32&expand=5995)
+#[inline]
+#[target_feature(enable = "avx512f")]
+// This intrinsic has no corresponding instruction.
+pub unsafe fn _mm512_undefined_epi32() -> __m512i {
+ _mm512_set1_epi32(0)
+}
+
+/// Returns vector of type `__m512` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined&expand=5994)
+#[inline]
+#[target_feature(enable = "avx512f")]
+// This intrinsic has no corresponding instruction.
+pub unsafe fn _mm512_undefined() -> __m512 {
+ _mm512_set1_ps(0.0)
+}
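+
+// Editor's note on the four `_mm512_undefined*` helpers above: despite the
+// "undefined elements" wording inherited from Intel, this implementation
+// deliberately returns zeroed vectors, since handing out genuinely
+// uninitialized SIMD registers would be unsound in Rust.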
+
+/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_epi32&expand=3377)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
+pub unsafe fn _mm512_loadu_epi32(mem_addr: *const i32) -> __m512i {
+ ptr::read_unaligned(mem_addr as *const __m512i)
+}
+
+/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_epi32&expand=3374)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
+pub unsafe fn _mm256_loadu_epi32(mem_addr: *const i32) -> __m256i {
+ ptr::read_unaligned(mem_addr as *const __m256i)
+}
+
+/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_epi32&expand=3371)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
+pub unsafe fn _mm_loadu_epi32(mem_addr: *const i32) -> __m128i {
+ ptr::read_unaligned(mem_addr as *const __m128i)
+}
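+
+// Editor's illustration, not part of the upstream patch: the unaligned loads
+// above accept any address, so reading from an arbitrary slice offset is
+// fine. Assumes an AVX512F target; the function name is hypothetical.
+#[cfg(all(test, target_feature = "avx512f"))]
+unsafe fn example_loadu_epi32() {
+    let data: [i32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+    // Start one element in: a deliberately misaligned 64-byte read.
+    let v = _mm512_loadu_epi32(data.as_ptr().add(1));
+    assert_eq!(_mm512_reduce_add_epi32(v), (1..=16).sum::<i32>());
+}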
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_storeu_epi16&expand=1460)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm512_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
+ vpmovdwmem(mem_addr as *mut i8, a.as_i32x16(), k);
+}
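+
+// Editor's illustration, not part of the upstream patch: the masked store
+// above writes only the 16-bit results whose mask bit is set and leaves the
+// rest of the buffer untouched. The function name is hypothetical.
+#[cfg(all(test, target_feature = "avx512f"))]
+unsafe fn example_mask_cvtepi32_storeu_epi16() {
+    let a = _mm512_set1_epi32(7);
+    let mut buf = [-1i16; 16];
+    // Only lanes 0 and 1 are active: buf[0] and buf[1] become 7.
+    _mm512_mask_cvtepi32_storeu_epi16(buf.as_mut_ptr() as *mut i8, 0b11, a);
+    assert_eq!(&buf[..4], &[7, 7, -1, -1]);
+}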
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_storeu_epi16&expand=1459)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm256_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovdwmem256(mem_addr as *mut i8, a.as_i32x8(), k);
+}
+
+/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_storeu_epi16&expand=1458)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdw))]
+pub unsafe fn _mm_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovdwmem128(mem_addr as *mut i8, a.as_i32x4(), k);
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_storeu_epi16&expand=1833)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
+ vpmovsdwmem(mem_addr as *mut i8, a.as_i32x16(), k);
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_storeu_epi16&expand=1832)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovsdwmem256(mem_addr as *mut i8, a.as_i32x8(), k);
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_storeu_epi16&expand=1831)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovsdwmem128(mem_addr as *mut i8, a.as_i32x4(), k);
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_storeu_epi16&expand=2068)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
+ vpmovusdwmem(mem_addr as *mut i8, a.as_i32x16(), k);
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_storeu_epi16&expand=2067)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovusdwmem256(mem_addr as *mut i8, a.as_i32x8(), k);
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_storeu_epi16&expand=2066)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdw))]
+pub unsafe fn _mm_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovusdwmem128(mem_addr as *mut i8, a.as_i32x4(), k);
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_storeu_epi8&expand=1463)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm512_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
+ vpmovdbmem(mem_addr as *mut i8, a.as_i32x16(), k);
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_storeu_epi8&expand=1462)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm256_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovdbmem256(mem_addr as *mut i8, a.as_i32x8(), k);
+}
+
+/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_storeu_epi8&expand=1461)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovdb))]
+pub unsafe fn _mm_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovdbmem128(mem_addr as *mut i8, a.as_i32x4(), k);
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_storeu_epi8&expand=1836)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
+ vpmovsdbmem(mem_addr as *mut i8, a.as_i32x16(), k);
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_storeu_epi8&expand=1835)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovsdbmem256(mem_addr as *mut i8, a.as_i32x8(), k);
+}
+
+/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_storeu_epi8&expand=1834)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdb))]
+pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovsdbmem128(mem_addr as *mut i8, a.as_i32x4(), k);
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_storeu_epi8&expand=2071)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
+ vpmovusdbmem(mem_addr as *mut i8, a.as_i32x16(), k);
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_storeu_epi8&expand=2070)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovusdbmem256(mem_addr as *mut i8, a.as_i32x8(), k);
+}
+
+/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_storeu_epi8&expand=2069)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusdb))]
+pub unsafe fn _mm_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovusdbmem128(mem_addr as *mut i8, a.as_i32x4(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi16&expand=1513)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm512_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovqwmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_storeu_epi16&expand=1512)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm256_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovqwmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_storeu_epi16&expand=1511)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqw))]
+pub unsafe fn _mm_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovqwmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi16&expand=1866)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovsqwmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi16&expand=1865)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovsqwmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi16&expand=1864)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqw))]
+pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovsqwmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi16&expand=2101)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovusqwmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi16&expand=2100)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovusqwmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi16&expand=2099)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqw))]
+pub unsafe fn _mm_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovusqwmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi8&expand=1519)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm512_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovqbmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_storeu_epi8&expand=1518)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm256_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovqbmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_storeu_epi8&expand=1517)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqb))]
+pub unsafe fn _mm_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovqbmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi8&expand=1872)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovsqbmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi8&expand=1871)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovsqbmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi8&expand=1870)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqb))]
+pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovsqbmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi8&expand=2107)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovusqbmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi8&expand=2106)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovusqbmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi8&expand=2105)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqb))]
+pub unsafe fn _mm_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovusqbmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi32&expand=1516)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm512_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovqdmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_storeu_epi32&expand=1515)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm256_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovqdmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_storeu_epi32&expand=1514)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovqd))]
+pub unsafe fn _mm_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovqdmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi32&expand=1869)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovsqdmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi32&expand=1868)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovsqdmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi32&expand=1867)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsqd))]
+pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovsqdmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi32&expand=2104)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
+ vpmovusqdmem(mem_addr as *mut i8, a.as_i64x8(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi32&expand=2103)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
+ vpmovusqdmem256(mem_addr as *mut i8, a.as_i64x4(), k);
+}
+
+/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi32&expand=2102)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovusqd))]
+pub unsafe fn _mm_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
+ vpmovusqdmem128(mem_addr as *mut i8, a.as_i64x2(), k);
+}
+
+/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_epi32&expand=5628)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
+pub unsafe fn _mm512_storeu_epi32(mem_addr: *mut i32, a: __m512i) {
+ ptr::write_unaligned(mem_addr as *mut __m512i, a);
+}
+
+/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_epi32&expand=5626)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
+pub unsafe fn _mm256_storeu_epi32(mem_addr: *mut i32, a: __m256i) {
+ ptr::write_unaligned(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_epi32&expand=5624)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
+pub unsafe fn _mm_storeu_epi32(mem_addr: *mut i32, a: __m128i) {
+ ptr::write_unaligned(mem_addr as *mut __m128i, a);
+}
+
+/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_epi64&expand=3386)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
+pub unsafe fn _mm512_loadu_epi64(mem_addr: *const i64) -> __m512i {
+ ptr::read_unaligned(mem_addr as *const __m512i)
+}
+
+/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_epi64&expand=3383)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
+pub unsafe fn _mm256_loadu_epi64(mem_addr: *const i64) -> __m256i {
+ ptr::read_unaligned(mem_addr as *const __m256i)
+}
+
+/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_epi64&expand=3380)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
+pub unsafe fn _mm_loadu_epi64(mem_addr: *const i64) -> __m128i {
+ ptr::read_unaligned(mem_addr as *const __m128i)
+}
+
+/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_epi64&expand=5634)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
+pub unsafe fn _mm512_storeu_epi64(mem_addr: *mut i64, a: __m512i) {
+ ptr::write_unaligned(mem_addr as *mut __m512i, a);
+}
+
+/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_epi64&expand=5632)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
+pub unsafe fn _mm256_storeu_epi64(mem_addr: *mut i64, a: __m256i) {
+ ptr::write_unaligned(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_epi64&expand=5630)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
+pub unsafe fn _mm_storeu_epi64(mem_addr: *mut i64, a: __m128i) {
+ ptr::write_unaligned(mem_addr as *mut __m128i, a);
+}
+
+/// Load 512-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_si512&expand=3420)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
+pub unsafe fn _mm512_loadu_si512(mem_addr: *const i32) -> __m512i {
+ ptr::read_unaligned(mem_addr as *const __m512i)
+}
+
+/// Store 512-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_si512&expand=5657)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
+pub unsafe fn _mm512_storeu_si512(mem_addr: *mut i32, a: __m512i) {
+ ptr::write_unaligned(mem_addr as *mut __m512i, a);
+}
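+
+// Editor's illustration, not part of the upstream patch: an unaligned
+// store/load round trip through a plain buffer. Assumes an AVX512F target;
+// the function name is hypothetical.
+#[cfg(all(test, target_feature = "avx512f"))]
+unsafe fn example_storeu_loadu_si512() {
+    let mut buf = [0i32; 16];
+    _mm512_storeu_si512(buf.as_mut_ptr(), _mm512_set1_epi32(42));
+    let v = _mm512_loadu_si512(buf.as_ptr());
+    assert_eq!(_mm512_reduce_add_epi32(v), 42 * 16);
+}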
+
+/// Loads 512-bits (composed of 8 packed double-precision (64-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))]
+pub unsafe fn _mm512_loadu_pd(mem_addr: *const f64) -> __m512d {
+ ptr::read_unaligned(mem_addr as *const __m512d)
+}
+
+/// Stores 512-bits (composed of 8 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))]
+pub unsafe fn _mm512_storeu_pd(mem_addr: *mut f64, a: __m512d) {
+ ptr::write_unaligned(mem_addr as *mut __m512d, a);
+}
+
+/// Loads 512-bits (composed of 16 packed single-precision (32-bit)
+/// floating-point elements) from memory into result.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))]
+pub unsafe fn _mm512_loadu_ps(mem_addr: *const f32) -> __m512 {
+ ptr::read_unaligned(mem_addr as *const __m512)
+}
+
+/// Stores 512-bits (composed of 16 packed single-precision (32-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) {
+ ptr::write_unaligned(mem_addr as *mut __m512, a);
+}
+
+/// Load 512-bits of integer data from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_si512&expand=3345)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
+pub unsafe fn _mm512_load_si512(mem_addr: *const i32) -> __m512i {
+ ptr::read(mem_addr as *const __m512i)
+}
+
+/// Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_si512&expand=5598)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
+pub unsafe fn _mm512_store_si512(mem_addr: *mut i32, a: __m512i) {
+ ptr::write(mem_addr as *mut __m512i, a);
+}
+
+/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_epi32&expand=3304)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
+pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i {
+ ptr::read(mem_addr as *const __m512i)
+}
+
+/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_epi32&expand=3301)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
+pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i {
+ ptr::read(mem_addr as *const __m256i)
+}
+
+/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_epi32&expand=3298)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
+pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i {
+ ptr::read(mem_addr as *const __m128i)
+}
+
+/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_epi32&expand=5569)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
+pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) {
+ ptr::write(mem_addr as *mut __m512i, a);
+}
+
+/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_epi32&expand=5567)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
+pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) {
+ ptr::write(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_epi32&expand=5565)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
+pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) {
+ ptr::write(mem_addr as *mut __m128i, a);
+}
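+
+// Editor's illustration, not part of the upstream patch: the aligned forms
+// require 64-byte alignment, which a #[repr(align(64))] wrapper guarantees.
+// The type and function names are hypothetical.
+#[cfg(all(test, target_feature = "avx512f"))]
+unsafe fn example_store_epi32_aligned() {
+    #[repr(align(64))]
+    struct Aligned([i32; 16]);
+    let mut buf = Aligned([0; 16]);
+    // The aligned store/load pair is safe to use here: the buffer is
+    // 64-byte aligned by construction.
+    _mm512_store_epi32(buf.0.as_mut_ptr(), _mm512_set1_epi32(3));
+    let v = _mm512_load_epi32(buf.0.as_ptr());
+    assert_eq!(_mm512_reduce_add_epi32(v), 48);
+}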
+
+/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_epi64&expand=3313)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
+pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i {
+ ptr::read(mem_addr as *const __m512i)
+}
+
+/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_epi64&expand=3310)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
+pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i {
+ ptr::read(mem_addr as *const __m256i)
+}
+
+/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_epi64&expand=3307)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
+pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i {
+ ptr::read(mem_addr as *const __m128i)
+}
+
+/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_epi64&expand=5575)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
+pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) {
+ ptr::write(mem_addr as *mut __m512i, a);
+}
+
+/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_epi64&expand=5573)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
+pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) {
+ ptr::write(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_epi64&expand=5571)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
+pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) {
+ ptr::write(mem_addr as *mut __m128i, a);
+}
+
+/// Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ps&expand=3336)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 {
+ ptr::read(mem_addr as *const __m512)
+}
+
+/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ps&expand=5592)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) {
+ ptr::write(mem_addr as *mut __m512, a);
+}
+
+/// Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_pd&expand=3326)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd
+pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d {
+ ptr::read(mem_addr as *const __m512d)
+}
+
+/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_pd&expand=5585)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd
+pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) {
+ ptr::write(mem_addr as *mut __m512d, a);
+}
+
+/// Load packed 32-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vmovdqu32 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
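+
+// Editor's illustration, not part of the upstream patch: contrast between the
+// writemask form (inactive lanes keep `src`) and the zeromask form (inactive
+// lanes become zero). The function name is hypothetical.
+#[cfg(all(test, target_feature = "avx512f"))]
+unsafe fn example_mask_loadu_epi32() {
+    let data = [5i32; 16];
+    let src = _mm512_set1_epi32(9);
+    // Eight active lanes load 5; the other eight keep 9 from `src`.
+    let merged = _mm512_mask_loadu_epi32(src, 0x00FF, data.as_ptr());
+    assert_eq!(_mm512_reduce_add_epi32(merged), 8 * 5 + 8 * 9);
+    // With the zeromask form the inactive lanes are zeroed instead.
+    let zeroed = _mm512_maskz_loadu_epi32(0x00FF, data.as_ptr());
+    assert_eq!(_mm512_reduce_add_epi32(zeroed), 8 * 5);
+}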
+
+/// Load packed 64-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vmovdqu64 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 {
+ let mut dst: __m512 = src;
+ asm!(
+ vpl!("vmovups {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 {
+ let mut dst: __m512;
+ asm!(
+ vpl!("vmovups {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d {
+ let mut dst: __m512d = src;
+ asm!(
+ vpl!("vmovupd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d {
+ let mut dst: __m512d;
+ asm!(
+ vpl!("vmovupd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vmovdqu32 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
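+// Illustrative sketch: the 256-bit and 128-bit variants additionally require
+// AVX512VL, so a hypothetical runtime dispatch checks both features first
+// (`src: __m256i` and `ptr: *const i32` are assumed caller bindings):
+//
+//     if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+//         let v = unsafe { _mm256_mask_loadu_epi32(src, 0b0011_1111, ptr) };
+//     }
+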
+/// Load packed 32-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vmovdqu64 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 {
+ let mut dst: __m256 = src;
+ asm!(
+ vpl!("vmovups {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 {
+ let mut dst: __m256;
+ asm!(
+ vpl!("vmovups {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d {
+ let mut dst: __m256d = src;
+ asm!(
+ vpl!("vmovupd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d {
+ let mut dst: __m256d;
+ asm!(
+ vpl!("vmovupd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vmovdqu32 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vmovdqu64 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128 = src;
+ asm!(
+ vpl!("vmovups {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128;
+ asm!(
+ vpl!("vmovups {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d = src;
+ asm!(
+ vpl!("vmovupd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d;
+ asm!(
+ vpl!("vmovupd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vmovdqa32 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
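+// Illustrative sketch of meeting the 64-byte alignment contract; `Aligned64`
+// is an assumed helper type, not an upstream item:
+//
+//     #[repr(align(64))]
+//     struct Aligned64([i32; 16]);
+//
+//     let buf = Aligned64([7; 16]);
+//     let src = unsafe { _mm512_setzero_si512() };
+//     // `buf` is 64-byte aligned by construction, so the contract holds
+//     let v = unsafe { _mm512_mask_load_epi32(src, 0xFFFF, buf.0.as_ptr()) };
+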
+/// Load packed 32-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vmovdqa64 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 {
+ let mut dst: __m512 = src;
+ asm!(
+ vpl!("vmovaps {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 {
+ let mut dst: __m512;
+ asm!(
+ vpl!("vmovaps {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d {
+ let mut dst: __m512d = src;
+ asm!(
+ vpl!("vmovapd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d {
+ let mut dst: __m512d;
+ asm!(
+ vpl!("vmovapd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vmovdqa32 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vmovdqa64 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 {
+ let mut dst: __m256 = src;
+ asm!(
+ vpl!("vmovaps {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 {
+ let mut dst: __m256;
+ asm!(
+ vpl!("vmovaps {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d {
+ let mut dst: __m256d = src;
+ asm!(
+ vpl!("vmovapd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d {
+ let mut dst: __m256d;
+ asm!(
+ vpl!("vmovapd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vmovdqa32 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 32-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vmovdqa64 {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed 64-bit integers from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128 = src;
+ asm!(
+ vpl!("vmovaps {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128;
+ asm!(
+ vpl!("vmovaps {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d = src;
+ asm!(
+ vpl!("vmovapd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d;
+ asm!(
+ vpl!("vmovapd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Store packed 32-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) {
+ asm!(
+ vps!("vmovdqu32", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
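+// Illustrative sketch (assumed caller-side code): a masked store writes only
+// the selected lanes and leaves the rest of the destination untouched, which
+// makes it useful for loop epilogues:
+//
+//     let mut out = [0i32; 16];
+//     let a = unsafe { _mm512_set1_epi32(9) };
+//     // only out[0..4] are written; out[4..16] keep their old values
+//     unsafe { _mm512_mask_storeu_epi32(out.as_mut_ptr(), 0b1111, a) };
+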
+/// Store packed 64-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) {
+ asm!(
+ vps!("vmovdqu64", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) {
+ asm!(
+ vps!("vmovups", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) {
+ asm!(
+ vps!("vmovupd", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 32-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) {
+ asm!(
+ vps!("vmovdqu32", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 64-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) {
+ asm!(
+ vps!("vmovdqu64", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) {
+ asm!(
+ vps!("vmovups", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) {
+ asm!(
+ vps!("vmovupd", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 32-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) {
+ asm!(
+ vps!("vmovdqu32", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 64-bit integers from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) {
+ asm!(
+ vps!("vmovdqu64", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) {
+ asm!(
+ vps!("vmovups", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) {
+ asm!(
+ vps!("vmovupd", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 32-bit integers from a into memory using writemask k.
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) {
+ asm!(
+ vps!("vmovdqa32", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
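+// Illustrative sketch: the aligned store variants have the same 64-byte
+// contract as the aligned loads above; reusing the assumed `Aligned64`
+// wrapper and vector `a` from the earlier sketches:
+//
+//     let mut buf = Aligned64([0i32; 16]);
+//     unsafe { _mm512_mask_store_epi32(buf.0.as_mut_ptr(), 0xFFFF, a) };
+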
+/// Store packed 64-bit integers from a into memory using writemask k.
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) {
+ asm!(
+ vps!("vmovdqa64", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) {
+ asm!(
+ vps!("vmovaps", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) {
+ asm!(
+ vps!("vmovapd", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(zmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 32-bit integers from a into memory using writemask k.
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) {
+ asm!(
+ vps!("vmovdqa32", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 64-bit integers from a into memory using writemask k.
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) {
+ asm!(
+ vps!("vmovdqa64", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) {
+ asm!(
+ vps!("vmovaps", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) {
+ asm!(
+ vps!("vmovapd", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(ymm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 32-bit integers from a into memory using writemask k.
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) {
+ asm!(
+ vps!("vmovdqa32", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed 64-bit integers from a into memory using writemask k.
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) {
+ asm!(
+ vps!("vmovdqa64", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) {
+ asm!(
+ vps!("vmovaps", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) {
+ asm!(
+ vps!("vmovapd", "{{{mask}}}, {a}"),
+ p = in(reg) mem_addr,
+ mask = in(kreg) mask,
+ a = in(xmm_reg) a,
+ options(nostack)
+ );
+}
+
+/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_expandloadu_epi32(
+ src: __m512i,
+ k: __mmask16,
+ mem_addr: *const i32,
+) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vpexpandd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_expandloadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vpexpandd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
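+// Illustrative sketch of the expand-load semantics (assumed caller-side
+// code): elements are read contiguously from memory but written to the
+// masked lanes of dst, so a sparse mask still consumes a dense prefix of
+// the source:
+//
+//     let data: [i32; 16] = core::array::from_fn(|i| i as i32);
+//     // mask 0b1010 selects lanes 1 and 3; they receive data[0] and data[1]
+//     let v = unsafe { _mm512_maskz_expandloadu_epi32(0b1010, data.as_ptr()) };
+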
+/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_expandloadu_epi32(
+ src: __m256i,
+ k: __mmask8,
+ mem_addr: *const i32,
+) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vpexpandd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vpexpandd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_expandloadu_epi32(
+ src: __m128i,
+ k: __mmask8,
+ mem_addr: *const i32,
+) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vpexpandd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vpexpandd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_expandloadu_epi64(
+ src: __m512i,
+ k: __mmask8,
+ mem_addr: *const i64,
+) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vpexpandq {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vpexpandq {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_expandloadu_epi64(
+ src: __m256i,
+ k: __mmask8,
+ mem_addr: *const i64,
+) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vpexpandq {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vpexpandq {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_expandloadu_epi64(
+ src: __m128i,
+ k: __mmask8,
+ mem_addr: *const i64,
+) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vpexpandq {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vpexpandq {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_expandloadu_ps(
+ src: __m512,
+ k: __mmask16,
+ mem_addr: *const f32,
+) -> __m512 {
+ let mut dst: __m512 = src;
+ asm!(
+ vpl!("vexpandps {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_expandloadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 {
+ let mut dst: __m512;
+ asm!(
+ vpl!("vexpandps {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_expandloadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 {
+ let mut dst: __m256 = src;
+ asm!(
+ vpl!("vexpandps {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 {
+ let mut dst: __m256;
+ asm!(
+ vpl!("vexpandps {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_expandloadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128 = src;
+ asm!(
+ vpl!("vexpandps {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128;
+ asm!(
+ vpl!("vexpandps {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_expandloadu_pd(
+ src: __m512d,
+ k: __mmask8,
+ mem_addr: *const f64,
+) -> __m512d {
+ let mut dst: __m512d = src;
+ asm!(
+ vpl!("vexpandpd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d {
+ let mut dst: __m512d;
+ asm!(
+ vpl!("vexpandpd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_mask_expandloadu_pd(
+ src: __m256d,
+ k: __mmask8,
+ mem_addr: *const f64,
+) -> __m256d {
+ let mut dst: __m256d = src;
+ asm!(
+ vpl!("vexpandpd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d {
+ let mut dst: __m256d;
+ asm!(
+ vpl!("vexpandpd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_expandloadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d = src;
+ asm!(
+ vpl!("vexpandpd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d;
+ asm!(
+ vpl!("vexpandpd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5002)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr_pd(
+ e0: f64,
+ e1: f64,
+ e2: f64,
+ e3: f64,
+ e4: f64,
+ e5: f64,
+ e6: f64,
+ e7: f64,
+) -> __m512d {
+ let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
+ transmute(r)
+}
+
+/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_pd&expand=4924)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set_pd(
+ e0: f64,
+ e1: f64,
+ e2: f64,
+ e3: f64,
+ e4: f64,
+ e5: f64,
+ e6: f64,
+ e7: f64,
+) -> __m512d {
+ _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
+}
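+
+// Editorial sketch (hypothetical helper, not upstream code): `_mm512_set_pd`
+// takes its arguments from the highest lane down, while `_mm512_setr_pd`
+// takes them from lane 0 up, so the two calls below build the same vector.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn set_pd_order_sketch() {
+ let a = _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
+ let b = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+ let a: [f64; 8] = core::mem::transmute(a);
+ let b: [f64; 8] = core::mem::transmute(b);
+ assert_eq!(a, b);
+}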
+
+/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovss))]
+pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut mov: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ mov = simd_extract(b, 0);
+ }
+ let r = simd_insert(a, 0, mov);
+ transmute(r)
+}
+
+/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovss))]
+pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut mov: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ mov = simd_extract(b, 0);
+ }
+ let r = simd_insert(a, 0, mov);
+ transmute(r)
+}
+
+/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovsd))]
+pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut mov: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ mov = simd_extract(b, 0);
+ }
+ let r = simd_insert(a, 0, mov);
+ transmute(r)
+}
+
+/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovsd))]
+pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut mov: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ mov = simd_extract(b, 0);
+ }
+ let r = simd_insert(a, 0, mov);
+ transmute(r)
+}
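+
+// Editorial sketch (hypothetical helper, not upstream code): the lower lane
+// of a masked move comes from b (mask bit set), from src (writemask, bit
+// clear), or is zero (zeromask, bit clear); the upper lanes always come from a.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn mask_move_ss_sketch() {
+ let src = _mm_set_ss(9.0);
+ let a = _mm_set_ss(1.0);
+ let b = _mm_set_ss(2.0);
+ assert_eq!(_mm_cvtss_f32(_mm_mask_move_ss(src, 0b1, a, b)), 2.0);
+ assert_eq!(_mm_cvtss_f32(_mm_mask_move_ss(src, 0b0, a, b)), 9.0);
+ assert_eq!(_mm_cvtss_f32(_mm_maskz_move_ss(0b0, a, b)), 0.0);
+}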
+
+/// Add the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss))]
+pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut add: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta + extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Add the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss))]
+pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut add: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta + extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Add the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd))]
+pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut add: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta + extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Add the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd))]
+pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut add: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta + extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
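+
+// Editorial sketch (hypothetical helper, not upstream code): writemask vs
+// zeromask behavior for the scalar masked add; the sub/mul/div intrinsics
+// below follow the same pattern.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn mask_add_ss_sketch() {
+ let src = _mm_set_ss(9.0);
+ let a = _mm_set_ss(1.0);
+ let b = _mm_set_ss(2.0);
+ // Mask bit 0 set: lower lane = 1.0 + 2.0.
+ assert_eq!(_mm_cvtss_f32(_mm_mask_add_ss(src, 0b1, a, b)), 3.0);
+ // Mask bit 0 clear: the writemask keeps src, the zeromask yields 0.
+ assert_eq!(_mm_cvtss_f32(_mm_mask_add_ss(src, 0b0, a, b)), 9.0);
+ assert_eq!(_mm_cvtss_f32(_mm_maskz_add_ss(0b0, a, b)), 0.0);
+}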
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss))]
+pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut sub: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ sub = extracta - extractb;
+ }
+ let r = simd_insert(a, 0, sub);
+ transmute(r)
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss))]
+pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut sub: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ sub = extracta - extractb;
+ }
+ let r = simd_insert(a, 0, sub);
+ transmute(r)
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd))]
+pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut sub: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ sub = extracta - extractb;
+ }
+ let r = simd_insert(a, 0, sub);
+ transmute(r)
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd))]
+pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut sub: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ sub = extracta - extractb;
+ }
+ let r = simd_insert(a, 0, sub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss))]
+pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut mul: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ mul = extracta * extractb;
+ }
+ let r = simd_insert(a, 0, mul);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss))]
+pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut mul: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ mul = extracta * extractb;
+ }
+ let r = simd_insert(a, 0, mul);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd))]
+pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut mul: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ mul = extracta * extractb;
+ }
+ let r = simd_insert(a, 0, mul);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd))]
+pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut mul: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ mul = extracta * extractb;
+ }
+ let r = simd_insert(a, 0, mul);
+ transmute(r)
+}
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss))]
+pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut div: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ div = extracta / extractb;
+ }
+ let r = simd_insert(a, 0, div);
+ transmute(r)
+}
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss))]
+pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut div: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ div = extracta / extractb;
+ }
+ let r = simd_insert(a, 0, div);
+ transmute(r)
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd))]
+pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut div: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ div = extracta / extractb;
+ }
+ let r = simd_insert(a, 0, div);
+ transmute(r)
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd))]
+pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut div: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ div = extracta / extractb;
+ }
+ let r = simd_insert(a, 0, div);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss))]
+pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vmaxss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss))]
+pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vmaxss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd))]
+pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vmaxsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd))]
+pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vmaxsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss))]
+pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vminss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss))]
+pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vminss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd))]
+pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vminsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd))]
+pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vminsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss))]
+pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vsqrtss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss))]
+pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vsqrtss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd))]
+pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vsqrtsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd))]
+pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vsqrtsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
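+
+// Editorial sketch (hypothetical helper, not upstream code): masked scalar
+// square root, exercising both mask states.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn mask_sqrt_sd_sketch() {
+ let src = _mm_set_sd(9.0);
+ let a = _mm_set_sd(1.0);
+ let b = _mm_set_sd(4.0);
+ // Mask bit 0 set: lower lane = sqrt(4.0).
+ assert_eq!(_mm_cvtsd_f64(_mm_mask_sqrt_sd(src, 0b1, a, b)), 2.0);
+ // Mask bit 0 clear: lower lane copied from src.
+ assert_eq!(_mm_cvtsd_f64(_mm_mask_sqrt_sd(src, 0b0, a, b)), 9.0);
+}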
+
+/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ss))]
+pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 {
+ transmute(vrsqrt14ss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ss))]
+pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ss))]
+pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vrsqrt14ss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14sd))]
+pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrsqrt14sd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14sd))]
+pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14sd))]
+pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrsqrt14sd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ ))
+}
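+
+// Editorial sketch (hypothetical helper, not upstream code): vrsqrt14 is an
+// approximation, so compare against 1/sqrt(x) with the documented 2^-14
+// relative tolerance rather than for exact equality.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn rsqrt14_sd_sketch() {
+ let a = _mm_set_sd(0.0);
+ let b = _mm_set_sd(4.0);
+ let r = _mm_cvtsd_f64(_mm_rsqrt14_sd(a, b));
+ // Exact value is 1 / sqrt(4.0) = 0.5; allow a 2^-14 relative error band.
+ assert!(r > 0.5 - 0.5 / 16384.0 && r < 0.5 + 0.5 / 16384.0);
+}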
+
+/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ss))]
+pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 {
+ transmute(vrcp14ss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ ))
+}
+
+/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ss))]
+pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ss))]
+pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vrcp14ss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ ))
+}
+
+/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14sd))]
+pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrcp14sd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ ))
+}
+
+/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14sd))]
+pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14sd))]
+pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrcp14sd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ ))
+}
+
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss))]
+pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 {
+ transmute(vgetexpss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss))]
+pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vgetexpss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss))]
+pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vgetexpss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd))]
+pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vgetexpsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd))]
+pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vgetexpsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd))]
+pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vgetexpsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
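+
+// Editorial sketch (hypothetical helper, not upstream code): getexp returns
+// floor(log2(|x|)) of the lower element as a float; 10.0 = 1.25 * 2^3, so
+// its exponent is 3.0.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn getexp_sd_sketch() {
+ let a = _mm_set_sd(0.0);
+ let b = _mm_set_sd(10.0);
+ assert_eq!(_mm_cvtsd_f64(_mm_getexp_sd(a, b)), 3.0);
+}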
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_getmant_ss<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm_mask_getmant_ss<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_maskz_getmant_ss<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_getmant_sd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm_mask_getmant_sd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_maskz_getmant_sd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
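+
+// Editorial sketch (hypothetical helper, not upstream code): getmant
+// complements getexp by returning the normalized significand; with the
+// [1, 2) interval and the source sign (constant names as defined in this
+// module), 10.0 = 1.25 * 2^3 yields 1.25.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn getmant_sd_sketch() {
+ let a = _mm_set_sd(0.0);
+ let b = _mm_set_sd(10.0);
+ let r = _mm_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b);
+ assert_eq!(_mm_cvtsd_f64(r), 1.25);
+}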
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_roundscale_ss<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vrndscaless(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_roundscale_ss<const IMM8: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_roundscale_ss<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vrndscaless(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_roundscale_sd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vrndscalesd(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_roundscale_sd<const IMM8: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_roundscale_sd<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vrndscalesd(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION);
+ transmute(r)
+}
+
+/// Scale the lower single-precision (32-bit) floating-point element in a using the lower element of b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss))]
+pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 {
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ transmute(vscalefss(a, b, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION))
+}
+
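+// Editor's illustrative sketch (not upstream code): scalef computes
+// a * 2^floor(b) on the lower lane, which is handy for fast exponent
+// manipulation. The helper name is hypothetical.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn scalef_ss_sketch() {
+    let a = _mm_set_ss(3.0);
+    let b = _mm_set_ss(4.0);
+    // 3.0 * 2^4 = 48.0 in the lower lane; the upper lanes come from a.
+    assert_eq!(_mm_cvtss_f32(_mm_scalef_ss(a, b)), 48.0);
+}
+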
+/// Scale the lower single-precision (32-bit) floating-point element in a using the lower element of b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss))]
+pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Scale the lower single-precision (32-bit) floating-point element in a using the lower element of b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss))]
+pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vscalefss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the lower double-precision (64-bit) floating-point element in a using the lower element of b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd))]
+pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the lower double-precision (64-bit) floating-point element in a using the lower element of b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd))]
+pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the lower double-precision (64-bit) floating-point element in a using the lower element of b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd))]
+pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss))]
+pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let mut fmadd: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
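+// Editor's illustrative sketch (not upstream code): the writemask decides
+// between the fused a*b+c result and the lower element of a. The helper name
+// is hypothetical; run only after runtime AVX-512F detection.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn mask_fmadd_ss_sketch() {
+    let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(4.0));
+    // Mask bit 0 set: the lower lane is 2.0 * 3.0 + 4.0 = 10.0.
+    assert_eq!(_mm_cvtss_f32(_mm_mask_fmadd_ss(a, 0b1, b, c)), 10.0);
+    // Mask bit 0 clear: the lower lane falls back to a's lower element, 2.0.
+    assert_eq!(_mm_cvtss_f32(_mm_mask_fmadd_ss(a, 0b0, b, c)), 2.0);
+}
+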
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss))]
+pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let mut fmadd: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss))]
+pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let mut fmadd: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd))]
+pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let mut fmadd: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd))]
+pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let mut fmadd: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd))]
+pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let mut fmadd: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_ss&expand=2668)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213ss))]
+pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let mut fmsub: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
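+// Editor's illustrative sketch (not upstream code): fmsub is the fused form
+// of a*b - c, which the body above expresses as fma(a, b, -c). Hypothetical
+// helper; run only after runtime AVX-512F detection.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn mask_fmsub_ss_sketch() {
+    let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(4.0));
+    // 2.0 * 3.0 - 4.0 = 2.0 in the lower lane when mask bit 0 is set.
+    assert_eq!(_mm_cvtss_f32(_mm_mask_fmsub_ss(a, 0b1, b, c)), 2.0);
+}
+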
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_ss&expand=2670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213ss))]
+pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let mut fmsub: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_ss&expand=2669)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213ss))]
+pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let mut fmsub: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc = -fmsub;
+ fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_sd&expand=2664)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213sd))]
+pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let mut fmsub: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_sd&expand=2666)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213sd))]
+pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let mut fmsub: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_sd&expand=2665)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213sd))]
+pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let mut fmsub: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc = -fmsub;
+ fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_ss&expand=2748)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213ss))]
+pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let mut fnmadd: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta = -fnmadd;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_ss&expand=2750)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213ss))]
+pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let mut fnmadd: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
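+// Editor's illustrative sketch (not upstream code): fnmadd computes
+// -(a*b) + c, i.e. fma(-a, b, c) as in the bodies above, and the zeromask
+// variant zeroes the lower lane when mask bit 0 is clear. Hypothetical helper.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn maskz_fnmadd_ss_sketch() {
+    let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(10.0));
+    // -(2.0 * 3.0) + 10.0 = 4.0 when mask bit 0 is set ...
+    assert_eq!(_mm_cvtss_f32(_mm_maskz_fnmadd_ss(0b1, a, b, c)), 4.0);
+    // ... and the zeromask forces the lower lane to 0.0 when it is clear.
+    assert_eq!(_mm_cvtss_f32(_mm_maskz_fnmadd_ss(0b0, a, b, c)), 0.0);
+}
+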
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_ss&expand=2749)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213ss))]
+pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let mut fnmadd: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_sd&expand=2744)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213sd))]
+pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let mut fnmadd: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta = -fnmadd;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_sd&expand=2746)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213sd))]
+pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let mut fnmadd: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_sd&expand=2745)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213sd))]
+pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let mut fnmadd: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_ss&expand=2796)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213ss))]
+pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let mut fnmsub: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta = -fnmsub;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
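+// Editor's illustrative sketch (not upstream code): fnmsub computes
+// -(a*b) - c, i.e. fma(-a, b, -c) as in the body above. Hypothetical helper;
+// run only after runtime AVX-512F detection.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn mask_fnmsub_ss_sketch() {
+    let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(4.0));
+    // -(2.0 * 3.0) - 4.0 = -10.0 in the lower lane when mask bit 0 is set.
+    assert_eq!(_mm_cvtss_f32(_mm_mask_fnmsub_ss(a, 0b1, b, c)), -10.0);
+}
+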
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_ss&expand=2798)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213ss))]
+pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let mut fnmsub: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_ss&expand=2797)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213ss))]
+pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let mut fnmsub: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc = -fnmsub;
+ fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_sd&expand=2792)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213sd))]
+pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let mut fnmsub: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta = -fnmsub;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_sd&expand=2794)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213sd))]
+pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let mut fnmsub: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_sd&expand=2793)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213sd))]
+pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let mut fnmsub: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc = -fnmsub;
+ fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fnmsub);
+ transmute(r)
+}
+
+/// Add the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_add_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vaddss(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
+
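+// Editor's illustrative sketch (not upstream code): a directed-rounding
+// request must be OR'ed with _MM_FROUND_NO_EXC, as the list above shows.
+// 1.0 + 1e-10 is below half an ulp of 1.0, so round-to-nearest would return
+// exactly 1.0, but rounding toward +inf bumps the sum to the next
+// representable f32. Hypothetical helper.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn add_round_ss_sketch() {
+    let a = _mm_set_ss(1.0);
+    let b = _mm_set_ss(1e-10);
+    let r = _mm_add_round_ss::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b);
+    assert!(_mm_cvtss_f32(r) > 1.0);
+}
+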
+/// Add the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_add_round_ss<const ROUNDING: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vaddss(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Add the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_add_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vaddss(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Add the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_add_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vaddsd(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
+
+/// Add the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_sd&expand=149)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_add_round_sd<const ROUNDING: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vaddsd(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Add the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_add_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vaddsd(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_sub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vsubss(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_sub_round_ss<const ROUNDING: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vsubss(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_sub_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vsubss(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_sub_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vsubsd(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_sub_round_sd<const ROUNDING: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vsubsd(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_sub_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vsubsd(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_mul_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vmulss(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_mul_round_ss<const ROUNDING: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vmulss(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_mul_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vmulss(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_mul_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vmulsd(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_mul_round_sd<const ROUNDING: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vmulsd(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_mul_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vmulsd(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
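+
+// Illustrative sketch of the mask semantics (assumes avx512f): with mask bit 0
+// clear, the zeromask variant zeroes the lower lane while the writemask
+// variant takes it from src; the upper lane still comes from a in both cases.
+//
+//     let z = _mm_maskz_mul_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); // lower lane = 0.0
+//     let m = _mm_mask_mul_round_sd::<_MM_FROUND_CUR_DIRECTION>(src, 0, a, b); // lower lane = src[0]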
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_div_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vdivss(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
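+
+// Illustrative sketch (assumes avx512f): only lane 0 is divided; the upper
+// three lanes of the result come from a, not b.
+//
+//     let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+//     let b = _mm_set_ps(9.0, 9.0, 9.0, 2.0);
+//     // r = [0.5, 2.0, 3.0, 4.0] (lane 0 first): a[0]/b[0], then a[1..=3]
+//     let r = _mm_div_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);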
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_div_round_ss<const ROUNDING: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vdivss(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_div_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vdivss(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_div_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vdivsd(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_div_round_sd<const ROUNDING: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vdivsd(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_div_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vdivsd(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_max_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vmaxss(a, b, zero, 0b1, SAE);
+ transmute(r)
+}
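+
+// Illustrative sketch (assumes avx512f): unlike the arithmetic *_round
+// intrinsics above, max/min take only a SAE value, i.e. either
+// _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC to suppress exceptions.
+//
+//     let r = _mm_max_round_ss::<_MM_FROUND_NO_EXC>(a, b);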
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss&expand=3672)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_max_round_ss<const SAE: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vmaxss(a, b, src, k, SAE);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_max_round_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vmaxss(a, b, zero, k, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_max_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vmaxsd(a, b, zero, 0b1, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_max_round_sd<const SAE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vmaxsd(a, b, src, k, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd&expand=3670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_max_round_sd<const SAE: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vmaxsd(a, b, zero, k, SAE);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_min_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vminss(a, b, zero, 0b1, SAE);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_ss&expand=3780)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_min_round_ss<const SAE: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vminss(a, b, src, k, SAE);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_min_round_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vminss(a, b, zero, k, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_min_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vminsd(a, b, zero, 0b1, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_min_round_sd<const SAE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vminsd(a, b, src, k, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_sd&expand=3778)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_min_round_sd<const SAE: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vminsd(a, b, zero, k, SAE);
+ transmute(r)
+}
+
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_sqrt_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vsqrtss(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
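+
+// Illustrative sketch (assumes avx512f): the square root is taken from b's
+// lower lane, while the upper three lanes are copied from a.
+//
+//     let a = _mm_set_ps(7.0, 7.0, 7.0, 1.0);
+//     let b = _mm_set_ss(9.0);
+//     // r = [3.0, 7.0, 7.0, 7.0] (lane 0 first)
+//     let r = _mm_sqrt_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);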
+
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_sqrt_round_ss<const ROUNDING: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vsqrtss(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_sqrt_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vsqrtss(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_sqrt_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vsqrtsd(a, b, zero, 0b1, ROUNDING);
+ transmute(r)
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_sqrt_round_sd<const ROUNDING: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vsqrtsd(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_sqrt_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vsqrtsd(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_getexp_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vgetexpss(a, b, zero, 0b1, SAE);
+ transmute(r)
+}
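+
+// Illustrative sketch (assumes avx512f): getexp returns floor(log2(|b[0]|))
+// as a float, e.g. 8.0 -> 3.0 and 0.5 -> -1.0.
+//
+//     let a = _mm_setzero_ps();
+//     let b = _mm_set_ss(8.0);
+//     let r = _mm_getexp_round_ss::<_MM_FROUND_NO_EXC>(a, b); // lower lane = 3.0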
+
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_getexp_round_ss<const SAE: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vgetexpss(a, b, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_getexp_round_ss<const SAE: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vgetexpss(a, b, zero, k, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_getexp_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vgetexpsd(a, b, zero, 0b1, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_getexp_round_sd<const SAE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vgetexpsd(a, b, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_getexp_round_sd<const SAE: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vgetexpsd(a, b, zero, k, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_ss&expand=2892)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3, 4)]
+pub unsafe fn _mm_getmant_round_ss<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, 0b1, SAE);
+ transmute(r)
+}
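+
+// Illustrative sketch (assumes avx512f; the Rust constant names
+// _MM_MANT_NORM_1_2 and _MM_MANT_SIGN_SRC follow this module's
+// _MM_MANTISSA_*_ENUM definitions): normalizing b[0] = 12.0 into [1, 2)
+// while keeping the source sign yields 1.5 in the lower lane (12 = 1.5 * 2^3).
+//
+//     let r = _mm_getmant_round_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_NO_EXC>(a, b);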
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_ss&expand=2893)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(4, 5, 6)]
+pub unsafe fn _mm_mask_getmant_round_ss<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_ss&expand=2894)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4, 5)]
+pub unsafe fn _mm_maskz_getmant_round_ss<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, k, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(2, 3, 4)]
+pub unsafe fn _mm_getmant_round_sd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, 0b1, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(4, 5, 6)]
+pub unsafe fn _mm_mask_getmant_round_sd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, SAE);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:\
+/// _MM_MANT_NORM_1_2 // interval [1, 2)\
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
+/// The sign is determined by sc which can take the following values:\
+/// _MM_MANT_SIGN_src // sign = sign(src)\
+/// _MM_MANT_SIGN_zero // sign = 0\
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))]
+#[rustc_legacy_const_generics(3, 4, 5)]
+pub unsafe fn _mm_maskz_getmant_round_sd<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm4!(NORM);
+ static_assert_imm2!(SIGN);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, k, SAE);
+ transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_ss&expand=4796)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_roundscale_round_ss<const IMM8: i32, const SAE: i32>(
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vrndscaless(a, b, zero, 0b11111111, IMM8, SAE);
+ transmute(r)
+}
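+
+// Illustrative sketch (assumes avx512f; IMM8 layout per Intel's VRNDSCALE
+// encoding, where imm8[7:4] is the number of fraction bits M to keep, so the
+// result is a multiple of 2^-M): rounding b[0] = 1.26 to one fraction bit.
+//
+//     const IMM8: i32 = (1 << 4) | _MM_FROUND_TO_NEAREST_INT; // M = 1, round to nearest
+//     let r = _mm_roundscale_round_ss::<IMM8, _MM_FROUND_NO_EXC>(a, b); // lower lane = 1.5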
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_ss&expand=4794)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm_mask_roundscale_round_ss<const IMM8: i32, const SAE: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vrndscaless(a, b, src, k, IMM8, SAE);
+ transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_ss&expand=4795)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_maskz_roundscale_round_ss<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vrndscaless(a, b, zero, k, IMM8, SAE);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_sd&expand=4793)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_roundscale_round_sd<const IMM8: i32, const SAE: i32>(
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vrndscalesd(a, b, zero, 0b11111111, IMM8, SAE);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_sd&expand=4791)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm_mask_roundscale_round_sd<const IMM8: i32, const SAE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vrndscalesd(a, b, src, k, IMM8, SAE);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest\
+/// _MM_FROUND_TO_NEG_INF // round down\
+/// _MM_FROUND_TO_POS_INF // round up\
+/// _MM_FROUND_TO_ZERO // truncate\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_sd&expand=4792)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_maskz_roundscale_round_sd<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vrndscalesd(a, b, zero, k, IMM8, SAE);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_ss&expand=4895)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_scalef_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vscalefss(a, b, zero, 0b11111111, ROUNDING);
+ transmute(r)
+}
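+
+// Illustrative sketch (assumes avx512f; per Intel's VSCALEF definition the
+// lower lane is a[0] * 2^floor(b[0])):
+//
+//     let a = _mm_set_ss(3.0);
+//     let b = _mm_set_ss(4.0);
+//     let r = _mm_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); // lower lane = 48.0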
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_ss&expand=4893)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_scalef_round_ss<const ROUNDING: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let src = src.as_f32x4();
+ let r = vscalefss(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_ss&expand=4894)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_scalef_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vscalefss(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_sd&expand=4892)
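+///
+/// A minimal sketch of the `_MM_FROUND_CUR_DIRECTION` variant, which defers
+/// to the rounding mode currently set in MXCSR (illustrative only):
+///
+/// ```ignore
+/// let a = _mm_set_sd(1.5);
+/// let b = _mm_set_sd(3.0);
+/// // Lower lane: 1.5 * 2^3 = 12.0; the upper lane is copied from `a`.
+/// let r = _mm_scalef_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
+/// assert_eq!(_mm_cvtsd_f64(r), 12.0);
+/// ```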
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_scalef_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vscalefsd(a, b, zero, 0b11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_sd&expand=4890)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_scalef_round_sd<const ROUNDING: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let src = src.as_f64x2();
+ let r = vscalefsd(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_sd&expand=4891)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_scalef_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vscalefsd(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_ss&expand=2573)
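+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` and an
+/// `unsafe` context):
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// // Lower lane: (2.0 * 3.0) + 1.0 = 7.0, rounded to nearest.
+/// let r = _mm_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+/// assert_eq!(_mm_cvtss_f32(r), 7.0);
+/// ```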
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ let r = simd_insert(a, 0, r);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_ss&expand=2574)
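+///
+/// A minimal sketch of the writemask behaviour (illustrative only):
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// // Mask bit 0 is clear, so the lower lane is copied from `a` unchanged.
+/// let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 0, b, c);
+/// assert_eq!(_mm_cvtss_f32(r), 2.0);
+/// ```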
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fmadd_round_ss<const ROUNDING: i32>(
+ a: __m128,
+ k: __mmask8,
+ b: __m128,
+ c: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fmadd: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_ss&expand=2576)
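+///
+/// A minimal sketch of the zeromask behaviour (illustrative only):
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// // Mask bit 0 is clear, so the lower lane is zeroed instead of computed.
+/// let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b, c);
+/// assert_eq!(_mm_cvtss_f32(r), 0.0);
+/// ```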
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fmadd_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ c: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fmadd: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_ss&expand=2575)
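+///
+/// A minimal sketch of the mask3 variant, where the destination lanes come
+/// from `c` rather than `a` (illustrative only):
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// // Mask bit 0 is set, so the lower lane becomes (2.0 * 3.0) + 1.0 = 7.0;
+/// // the upper lanes are kept from `c`.
+/// let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c, 1);
+/// assert_eq!(_mm_cvtss_f32(r), 7.0);
+/// ```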
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask3_fmadd_round_ss<const ROUNDING: i32>(
+ a: __m128,
+ b: __m128,
+ c: __m128,
+ k: __mmask8,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fmadd: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING);
+ }
+ let r = simd_insert(c, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_sd&expand=2569)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fmadd_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_sd&expand=2570)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fmadd_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ k: __mmask8,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fmadd: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_sd&expand=2572)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fmadd_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fmadd: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_sd&expand=2571)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask3_fmadd_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+ k: __mmask8,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fmadd: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING);
+ }
+ let r = simd_insert(c, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_ss&expand=2659)
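+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` and an
+/// `unsafe` context):
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// // Lower lane: (2.0 * 3.0) - 1.0 = 5.0.
+/// let r = _mm_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+/// assert_eq!(_mm_cvtss_f32(r), 5.0);
+/// ```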
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_ss&expand=2660)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fmsub_round_ss<const ROUNDING: i32>(
+ a: __m128,
+ k: __mmask8,
+ b: __m128,
+ c: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fmsub: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_ss&expand=2662)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fmsub_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ c: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fmsub: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_ss&expand=2661)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask3_fmsub_round_ss<const ROUNDING: i32>(
+ a: __m128,
+ b: __m128,
+ c: __m128,
+ k: __mmask8,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fmsub: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc = -fmsub;
+ fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(c, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_sd&expand=2655)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fmsub_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_sd&expand=2656)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fmsub_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ k: __mmask8,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fmsub: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_sd&expand=2658)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fmsub_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fmsub: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_sd&expand=2657)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask3_fmsub_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+ k: __mmask8,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fmsub: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc = -fmsub;
+ fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(c, 0, fmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_ss&expand=2739)
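+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` and an
+/// `unsafe` context):
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// // Lower lane: -(2.0 * 3.0) + 1.0 = -5.0.
+/// let r = _mm_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+/// assert_eq!(_mm_cvtss_f32(r), -5.0);
+/// ```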
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fnmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_ss&expand=2740)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fnmadd_round_ss<const ROUNDING: i32>(
+ a: __m128,
+ k: __mmask8,
+ b: __m128,
+ c: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmadd: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta = -fnmadd;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_ss&expand=2742)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fnmadd_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ c: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmadd: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_ss&expand=2741)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask3_fnmadd_round_ss<const ROUNDING: i32>(
+ a: __m128,
+ b: __m128,
+ c: __m128,
+ k: __mmask8,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmadd: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING);
+ }
+ let r = simd_insert(c, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_sd&expand=2735)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fnmadd_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_sd&expand=2736)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fnmadd_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ k: __mmask8,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmadd: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta = -fnmadd;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_sd&expand=2738)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fnmadd_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmadd: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_sd&expand=2737)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask3_fnmadd_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+ k: __mmask8,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmadd: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING);
+ }
+ let r = simd_insert(c, 0, fnmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_ss&expand=2787)
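+///
+/// A minimal usage sketch (illustrative only; assumes `avx512f` and an
+/// `unsafe` context):
+///
+/// ```ignore
+/// let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(1.0));
+/// // Lower lane: -(2.0 * 3.0) - 1.0 = -7.0.
+/// let r = _mm_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+/// assert_eq!(_mm_cvtss_f32(r), -7.0);
+/// ```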
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fnmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_ss&expand=2788)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fnmsub_round_ss<const ROUNDING: i32>(
+ a: __m128,
+ k: __mmask8,
+ b: __m128,
+ c: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmsub: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta = -fnmsub;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_ss&expand=2790)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fnmsub_round_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ c: __m128,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmsub: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_ss&expand=2789)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask3_fnmsub_round_ss<const ROUNDING: i32>(
+ a: __m128,
+ b: __m128,
+ c: __m128,
+ k: __mmask8,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmsub: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc = -fnmsub;
+ fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(c, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_sd&expand=2783)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fnmsub_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_sd&expand=2784)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fnmsub_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ k: __mmask8,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmsub: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta = -fnmsub;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_sd&expand=2786)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fnmsub_round_sd<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmsub: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ let extractc = -extractc;
+ fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(a, 0, fnmsub);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_sd&expand=2785)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask3_fnmsub_round_sd<const ROUNDING: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+ k: __mmask8,
+) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let mut fnmsub: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extracta = -extracta;
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc = -fnmsub;
+ fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
+ }
+ let r = simd_insert(c, 0, fnmsub);
+ transmute(r)
+}
+
+/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_ss&expand=2517)
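+///
+/// A minimal sketch (illustrative only; it assumes the token encoding from
+/// Intel's `VFIXUPIMM` table, where token 0 means "keep the destination"):
+///
+/// ```ignore
+/// let a = _mm_set_ss(42.0);
+/// let b = _mm_set_ss(0.0);
+/// // An all-zero token table selects token 0 for every input class, so the
+/// // lower element of `a` passes through unchanged.
+/// let r = _mm_fixupimm_ss::<0>(a, b, _mm_setzero_si128());
+/// assert_eq!(_mm_cvtss_f32(r), 42.0);
+/// ```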
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fixupimm_ss<const IMM8: i32>(a: __m128, b: __m128, c: __m128i) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let r = vfixupimmss(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION);
+ let fixupimm: f32 = simd_extract(r, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_ss&expand=2518)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fixupimm_ss<const IMM8: i32>(
+ a: __m128,
+ k: __mmask8,
+ b: __m128,
+ c: __m128i,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let fixupimm = vfixupimmss(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+ let fixupimm: f32 = simd_extract(fixupimm, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_ss&expand=2519)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fixupimm_ss<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ c: __m128i,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let fixupimm = vfixupimmssz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+ let fixupimm: f32 = simd_extract(fixupimm, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
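+// Call-shape sketch for the fixup-immediate family (illustrative, not upstream
+// code): `c` packs an eight-entry, 4-bit response table indexed by the
+// classification of the lower element, while `IMM8` selects which special
+// cases report flags. Assumption, taken from Intel's VFIXUPIMMSS token table:
+// an all-zero table selects response token 0 everywhere, which leaves the
+// destination lane unchanged.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn fixupimm_ss_sketch(a: __m128, b: __m128) -> __m128 {
+ // With a zeroed table and IMM8 = 0, no lane is fixed up or flagged.
+ _mm_fixupimm_ss::<0>(a, b, _mm_setzero_si128())
+}
+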
+/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_sd&expand=2514)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_fixupimm_sd<const IMM8: i32>(a: __m128d, b: __m128d, c: __m128i) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let fixupimm = vfixupimmsd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION);
+ let fixupimm: f64 = simd_extract(fixupimm, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_sd&expand=2515)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_fixupimm_sd<const IMM8: i32>(
+ a: __m128d,
+ k: __mmask8,
+ b: __m128d,
+ c: __m128i,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let fixupimm = vfixupimmsd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+ let fixupimm: f64 = simd_extract(fixupimm, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_sd&expand=2516)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_maskz_fixupimm_sd<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ c: __m128i,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let fixupimm = vfixupimmsdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION);
+ let fixupimm: f64 = simd_extract(fixupimm, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_round_ss&expand=2511)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_fixupimm_round_ss<const IMM8: i32, const SAE: i32>(
+ a: __m128,
+ b: __m128,
+ c: __m128i,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let r = vfixupimmss(a, b, c, IMM8, 0b11111111, SAE);
+ let fixupimm: f32 = simd_extract(r, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_round_ss&expand=2512)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm_mask_fixupimm_round_ss<const IMM8: i32, const SAE: i32>(
+ a: __m128,
+ k: __mmask8,
+ b: __m128,
+ c: __m128i,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let r = vfixupimmss(a, b, c, IMM8, k, SAE);
+ let fixupimm: f32 = simd_extract(r, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_round_ss&expand=2513)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm_maskz_fixupimm_round_ss<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ c: __m128i,
+) -> __m128 {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let c = c.as_i32x4();
+ let r = vfixupimmssz(a, b, c, IMM8, k, SAE);
+ let fixupimm: f32 = simd_extract(r, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_round_sd&expand=2508)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+pub unsafe fn _mm_fixupimm_round_sd<const IMM8: i32, const SAE: i32>(
+ a: __m128d,
+ b: __m128d,
+ c: __m128i,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let r = vfixupimmsd(a, b, c, IMM8, 0b11111111, SAE);
+ let fixupimm: f64 = simd_extract(r, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_round_sd&expand=2509)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm_mask_fixupimm_round_sd<const IMM8: i32, const SAE: i32>(
+ a: __m128d,
+ k: __mmask8,
+ b: __m128d,
+ c: __m128i,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let r = vfixupimmsd(a, b, c, IMM8, k, SAE);
+ let fixupimm: f64 = simd_extract(r, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_round_sd&expand=2510)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+pub unsafe fn _mm_maskz_fixupimm_round_sd<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ c: __m128i,
+) -> __m128d {
+ static_assert_imm8!(IMM8);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let c = c.as_i64x2();
+ let r = vfixupimmsdz(a, b, c, IMM8, k, SAE);
+ let fixupimm: f64 = simd_extract(r, 0);
+ let r = simd_insert(a, 0, fixupimm);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvtss_sd&expand=1896)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2sd))]
+pub unsafe fn _mm_mask_cvtss_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128) -> __m128d {
+ transmute(vcvtss2sd(
+ a.as_f64x2(),
+ b.as_f32x4(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvtss_sd&expand=1897)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2sd))]
+pub unsafe fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d {
+ transmute(vcvtss2sd(
+ a.as_f64x2(),
+ b.as_f32x4(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
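+// Mask-behavior sketch (illustrative, not upstream code): with mask bit 0
+// clear the lower lane comes from `src` (writemask) or is zeroed (zeromask);
+// with it set, the lane is the f32 value widened to f64.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn cvtss_sd_mask_sketch() {
+ let src = _mm_set_pd(9.0, 9.0);
+ let a = _mm_set_pd(2.0, 2.0);
+ let b = _mm_set_ps(0.0, 0.0, 0.0, 1.5);
+ // Bit 0 clear: lower lane is copied from src.
+ assert_eq!(_mm_cvtsd_f64(_mm_mask_cvtss_sd(src, 0, a, b)), 9.0);
+ // Bit 0 set: lower lane is 1.5f32 converted to f64.
+ assert_eq!(_mm_cvtsd_f64(_mm_mask_cvtss_sd(src, 1, a, b)), 1.5);
+ // Zeromask with bit 0 clear: lower lane is zeroed instead.
+ assert_eq!(_mm_cvtsd_f64(_mm_maskz_cvtss_sd(0, a, b)), 0.0);
+}
+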
+/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvtsd_ss&expand=1797)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2ss))]
+pub unsafe fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) -> __m128 {
+ transmute(vcvtsd2ss(
+ a.as_f32x4(),
+ b.as_f64x2(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvtsd_ss&expand=1798)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2ss))]
+pub unsafe fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 {
+ transmute(vcvtsd2ss(
+ a.as_f32x4(),
+ b.as_f64x2(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_sd&expand=1371)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundss_sd<const SAE: i32>(a: __m128d, b: __m128) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vcvtss2sd(a, b, zero, 0b11111111, SAE);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundss_sd&expand=1372)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_cvt_roundss_sd<const SAE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128,
+) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f32x4();
+ let src = src.as_f64x2();
+ let r = vcvtss2sd(a, b, src, k, SAE);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundss_sd&expand=1373)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_cvt_roundss_sd<const SAE: i32>(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128,
+) -> __m128d {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f32x4();
+ let zero = _mm_setzero_pd().as_f64x2();
+ let r = vcvtss2sd(a, b, zero, k, SAE);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_ss&expand=1361)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundsd_ss<const ROUNDING: i32>(a: __m128, b: __m128d) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vcvtsd2ss(a, b, zero, 0b11111111, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundsd_ss&expand=1362)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_cvt_roundsd_ss<const ROUNDING: i32>(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128d,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f64x2();
+ let src = src.as_f32x4();
+ let r = vcvtsd2ss(a, b, src, k, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundsd_ss&expand=1363)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_cvt_roundsd_ss<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128,
+ b: __m128d,
+) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let b = b.as_f64x2();
+ let zero = _mm_setzero_ps().as_f32x4();
+ let r = vcvtsd2ss(a, b, zero, k, ROUNDING);
+ transmute(r)
+}
+
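+// Rounding-direction sketch (illustrative, not upstream code): an f64 lying
+// strictly between two adjacent f32 values converts differently under the
+// static rounding modes.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn cvt_roundsd_ss_sketch() {
+ let a = _mm_setzero_ps();
+ // 1 + 2^-30 sits between 1.0f32 and 1.0f32 + f32::EPSILON.
+ let b = _mm_set_sd(1.0 + 1.0f64 / 1073741824.0);
+ let up = _mm_cvt_roundsd_ss::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b);
+ assert_eq!(_mm_cvtss_f32(up), 1.0 + f32::EPSILON);
+ let down = _mm_cvt_roundsd_ss::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b);
+ assert_eq!(_mm_cvtss_f32(down), 1.0);
+}
+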
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_si32&expand=1374)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> i32 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtss2si(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_i32&expand=1369)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtss2si(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_u32&expand=1376)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtss2usi(a, ROUNDING);
+ transmute(r)
+}
+
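+// Worked example of the rounding constants (illustrative, not upstream code):
+// 3.7 becomes 4 under round-to-nearest and 3 under truncation.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn cvt_roundss_si32_sketch() {
+ let a = _mm_set_ss(3.7);
+ assert_eq!(_mm_cvt_roundss_si32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a), 4);
+ assert_eq!(_mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a), 3);
+}
+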
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtss_i32&expand=1893)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si))]
+pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
+ transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtss_u32&expand=1901)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi))]
+pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
+ transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_si32&expand=1359)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtsd2si(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_i32&expand=1357)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtsd2si(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_u32&expand=1364)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtsd2usi(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_i32&expand=1791)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si))]
+pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
+ transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_u32&expand=1799)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi))]
+pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 {
+ transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundi32_ss&expand=1312)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundi32_ss<const ROUNDING: i32>(a: __m128, b: i32) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtsi2ss(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsi32_ss&expand=1366)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundsi32_ss<const ROUNDING: i32>(a: __m128, b: i32) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtsi2ss(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundu32_ss&expand=1378)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundu32_ss<const ROUNDING: i32>(a: __m128, b: u32) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtusi2ss(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_ss&expand=1643)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss))]
+pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
+ let b = b as f32;
+ let r = simd_insert(a, 0, b);
+ transmute(r)
+}
+
+/// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_sd&expand=1642)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2sd))]
+pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
+ let b = b as f64;
+ let r = simd_insert(a, 0, b);
+ transmute(r)
+}
+
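+// Lane-placement sketch (illustrative, not upstream code): the converted
+// integer lands in lane 0 while the other lanes are carried over from `a`.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn cvti32_ss_sketch() {
+ let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+ let r = _mm_cvti32_ss(a, 7);
+ assert_eq!(_mm_cvtss_f32(r), 7.0);
+}
+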
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_si32&expand=1936)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let r = vcvttss2si(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_i32&expand=1934)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let r = vcvttss2si(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_u32&expand=1938)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let r = vcvttss2usi(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_i32&expand=2022)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttss2si))]
+pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
+ transmute(vcvttss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_u32&expand=2026)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttss2usi))]
+pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
+ transmute(vcvttss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si32&expand=1930)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let r = vcvttsd2si(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i32&expand=1928)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let r = vcvttsd2si(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_u32&expand=1932)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let r = vcvttsd2usi(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_i32&expand=2015)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttsd2si))]
+pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
+ transmute(vcvttsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_u32&expand=2020)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttsd2usi))]
+pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
+ transmute(vcvttsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
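+// Truncation sketch (illustrative, not upstream code): -3.7 truncates toward
+// zero to -3 no matter what rounding mode MXCSR currently selects.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn cvttsd_i32_sketch() {
+ assert_eq!(_mm_cvttsd_i32(_mm_set_sd(-3.7)), -3);
+}
+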
+/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_ss&expand=2032)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2ss))]
+pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
+ let b = b as f32;
+ let r = simd_insert(a, 0, b);
+ transmute(r)
+}
+
+/// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sd&expand=2031)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2sd))]
+pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d {
+ let b = b as f64;
+ let r = simd_insert(a, 0, b);
+ transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_ss&expand=1175)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomiss
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_comi_round_ss<const IMM5: i32, const SAE: i32>(a: __m128, b: __m128) -> i32 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let r = vcomiss(a, b, IMM5, SAE);
+ transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sd&expand=1174)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomisd
+#[rustc_legacy_const_generics(2, 3)]
+pub unsafe fn _mm_comi_round_sd<const IMM5: i32, const SAE: i32>(a: __m128d, b: __m128d) -> i32 {
+ static_assert_imm5!(IMM5);
+ static_assert_mantissas_sae!(SAE);
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let r = vcomisd(a, b, IMM5, SAE);
+ transmute(r)
+}
+
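+// Predicate sketch for the comi family (illustrative, not upstream code):
+// predicate 0 is the ordered-equal comparison, so equal inputs return 1.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn comi_round_ss_sketch() {
+ let a = _mm_set_ss(2.5);
+ let b = _mm_set_ss(2.5);
+ assert_eq!(_mm_comi_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b), 1);
+}
+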
+/// Equal
+pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00;
+/// Less-than
+pub const _MM_CMPINT_LT: _MM_CMPINT_ENUM = 0x01;
+/// Less-than-or-equal
+pub const _MM_CMPINT_LE: _MM_CMPINT_ENUM = 0x02;
+/// False
+pub const _MM_CMPINT_FALSE: _MM_CMPINT_ENUM = 0x03;
+/// Not-equal
+pub const _MM_CMPINT_NE: _MM_CMPINT_ENUM = 0x04;
+/// Not less-than
+pub const _MM_CMPINT_NLT: _MM_CMPINT_ENUM = 0x05;
+/// Not less-than-or-equal
+pub const _MM_CMPINT_NLE: _MM_CMPINT_ENUM = 0x06;
+/// True
+pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07;
+
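+// Usage sketch for the comparison-predicate constants (illustrative, not
+// upstream code): every lane of `a` is below `b`, so the less-than predicate
+// sets all 16 mask bits.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn cmpint_lt_sketch() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(2);
+ assert_eq!(_mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b), 0xFFFF);
+}
+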
+/// interval [1, 2)
+pub const _MM_MANT_NORM_1_2: _MM_MANTISSA_NORM_ENUM = 0x00;
+/// interval [0.5, 2)
+pub const _MM_MANT_NORM_P5_2: _MM_MANTISSA_NORM_ENUM = 0x01;
+/// interval [0.5, 1)
+pub const _MM_MANT_NORM_P5_1: _MM_MANTISSA_NORM_ENUM = 0x02;
+/// interval [0.75, 1.5)
+pub const _MM_MANT_NORM_P75_1P5: _MM_MANTISSA_NORM_ENUM = 0x03;
+
+/// sign = sign(SRC)
+pub const _MM_MANT_SIGN_SRC: _MM_MANTISSA_SIGN_ENUM = 0x00;
+/// sign = 0
+pub const _MM_MANT_SIGN_ZERO: _MM_MANTISSA_SIGN_ENUM = 0x01;
+/// DEST = NaN if sign(SRC) = 1
+pub const _MM_MANT_SIGN_NAN: _MM_MANTISSA_SIGN_ENUM = 0x02;
+
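+// Usage sketch for the mantissa constants (illustrative, not upstream code):
+// 8.0 is 1.0 * 2^3, so normalizing the mantissa into [1, 2) while keeping the
+// source sign yields 1.0 in every lane.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn getmant_sketch() {
+ let a = _mm512_set1_ps(8.0);
+ let r = _mm512_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a);
+ assert_eq!(_mm_cvtss_f32(_mm512_castps512_ps128(r)), 1.0);
+}
+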
+pub const _MM_PERM_AAAA: _MM_PERM_ENUM = 0x00;
+pub const _MM_PERM_AAAB: _MM_PERM_ENUM = 0x01;
+pub const _MM_PERM_AAAC: _MM_PERM_ENUM = 0x02;
+pub const _MM_PERM_AAAD: _MM_PERM_ENUM = 0x03;
+pub const _MM_PERM_AABA: _MM_PERM_ENUM = 0x04;
+pub const _MM_PERM_AABB: _MM_PERM_ENUM = 0x05;
+pub const _MM_PERM_AABC: _MM_PERM_ENUM = 0x06;
+pub const _MM_PERM_AABD: _MM_PERM_ENUM = 0x07;
+pub const _MM_PERM_AACA: _MM_PERM_ENUM = 0x08;
+pub const _MM_PERM_AACB: _MM_PERM_ENUM = 0x09;
+pub const _MM_PERM_AACC: _MM_PERM_ENUM = 0x0A;
+pub const _MM_PERM_AACD: _MM_PERM_ENUM = 0x0B;
+pub const _MM_PERM_AADA: _MM_PERM_ENUM = 0x0C;
+pub const _MM_PERM_AADB: _MM_PERM_ENUM = 0x0D;
+pub const _MM_PERM_AADC: _MM_PERM_ENUM = 0x0E;
+pub const _MM_PERM_AADD: _MM_PERM_ENUM = 0x0F;
+pub const _MM_PERM_ABAA: _MM_PERM_ENUM = 0x10;
+pub const _MM_PERM_ABAB: _MM_PERM_ENUM = 0x11;
+pub const _MM_PERM_ABAC: _MM_PERM_ENUM = 0x12;
+pub const _MM_PERM_ABAD: _MM_PERM_ENUM = 0x13;
+pub const _MM_PERM_ABBA: _MM_PERM_ENUM = 0x14;
+pub const _MM_PERM_ABBB: _MM_PERM_ENUM = 0x15;
+pub const _MM_PERM_ABBC: _MM_PERM_ENUM = 0x16;
+pub const _MM_PERM_ABBD: _MM_PERM_ENUM = 0x17;
+pub const _MM_PERM_ABCA: _MM_PERM_ENUM = 0x18;
+pub const _MM_PERM_ABCB: _MM_PERM_ENUM = 0x19;
+pub const _MM_PERM_ABCC: _MM_PERM_ENUM = 0x1A;
+pub const _MM_PERM_ABCD: _MM_PERM_ENUM = 0x1B;
+pub const _MM_PERM_ABDA: _MM_PERM_ENUM = 0x1C;
+pub const _MM_PERM_ABDB: _MM_PERM_ENUM = 0x1D;
+pub const _MM_PERM_ABDC: _MM_PERM_ENUM = 0x1E;
+pub const _MM_PERM_ABDD: _MM_PERM_ENUM = 0x1F;
+pub const _MM_PERM_ACAA: _MM_PERM_ENUM = 0x20;
+pub const _MM_PERM_ACAB: _MM_PERM_ENUM = 0x21;
+pub const _MM_PERM_ACAC: _MM_PERM_ENUM = 0x22;
+pub const _MM_PERM_ACAD: _MM_PERM_ENUM = 0x23;
+pub const _MM_PERM_ACBA: _MM_PERM_ENUM = 0x24;
+pub const _MM_PERM_ACBB: _MM_PERM_ENUM = 0x25;
+pub const _MM_PERM_ACBC: _MM_PERM_ENUM = 0x26;
+pub const _MM_PERM_ACBD: _MM_PERM_ENUM = 0x27;
+pub const _MM_PERM_ACCA: _MM_PERM_ENUM = 0x28;
+pub const _MM_PERM_ACCB: _MM_PERM_ENUM = 0x29;
+pub const _MM_PERM_ACCC: _MM_PERM_ENUM = 0x2A;
+pub const _MM_PERM_ACCD: _MM_PERM_ENUM = 0x2B;
+pub const _MM_PERM_ACDA: _MM_PERM_ENUM = 0x2C;
+pub const _MM_PERM_ACDB: _MM_PERM_ENUM = 0x2D;
+pub const _MM_PERM_ACDC: _MM_PERM_ENUM = 0x2E;
+pub const _MM_PERM_ACDD: _MM_PERM_ENUM = 0x2F;
+pub const _MM_PERM_ADAA: _MM_PERM_ENUM = 0x30;
+pub const _MM_PERM_ADAB: _MM_PERM_ENUM = 0x31;
+pub const _MM_PERM_ADAC: _MM_PERM_ENUM = 0x32;
+pub const _MM_PERM_ADAD: _MM_PERM_ENUM = 0x33;
+pub const _MM_PERM_ADBA: _MM_PERM_ENUM = 0x34;
+pub const _MM_PERM_ADBB: _MM_PERM_ENUM = 0x35;
+pub const _MM_PERM_ADBC: _MM_PERM_ENUM = 0x36;
+pub const _MM_PERM_ADBD: _MM_PERM_ENUM = 0x37;
+pub const _MM_PERM_ADCA: _MM_PERM_ENUM = 0x38;
+pub const _MM_PERM_ADCB: _MM_PERM_ENUM = 0x39;
+pub const _MM_PERM_ADCC: _MM_PERM_ENUM = 0x3A;
+pub const _MM_PERM_ADCD: _MM_PERM_ENUM = 0x3B;
+pub const _MM_PERM_ADDA: _MM_PERM_ENUM = 0x3C;
+pub const _MM_PERM_ADDB: _MM_PERM_ENUM = 0x3D;
+pub const _MM_PERM_ADDC: _MM_PERM_ENUM = 0x3E;
+pub const _MM_PERM_ADDD: _MM_PERM_ENUM = 0x3F;
+pub const _MM_PERM_BAAA: _MM_PERM_ENUM = 0x40;
+pub const _MM_PERM_BAAB: _MM_PERM_ENUM = 0x41;
+pub const _MM_PERM_BAAC: _MM_PERM_ENUM = 0x42;
+pub const _MM_PERM_BAAD: _MM_PERM_ENUM = 0x43;
+pub const _MM_PERM_BABA: _MM_PERM_ENUM = 0x44;
+pub const _MM_PERM_BABB: _MM_PERM_ENUM = 0x45;
+pub const _MM_PERM_BABC: _MM_PERM_ENUM = 0x46;
+pub const _MM_PERM_BABD: _MM_PERM_ENUM = 0x47;
+pub const _MM_PERM_BACA: _MM_PERM_ENUM = 0x48;
+pub const _MM_PERM_BACB: _MM_PERM_ENUM = 0x49;
+pub const _MM_PERM_BACC: _MM_PERM_ENUM = 0x4A;
+pub const _MM_PERM_BACD: _MM_PERM_ENUM = 0x4B;
+pub const _MM_PERM_BADA: _MM_PERM_ENUM = 0x4C;
+pub const _MM_PERM_BADB: _MM_PERM_ENUM = 0x4D;
+pub const _MM_PERM_BADC: _MM_PERM_ENUM = 0x4E;
+pub const _MM_PERM_BADD: _MM_PERM_ENUM = 0x4F;
+pub const _MM_PERM_BBAA: _MM_PERM_ENUM = 0x50;
+pub const _MM_PERM_BBAB: _MM_PERM_ENUM = 0x51;
+pub const _MM_PERM_BBAC: _MM_PERM_ENUM = 0x52;
+pub const _MM_PERM_BBAD: _MM_PERM_ENUM = 0x53;
+pub const _MM_PERM_BBBA: _MM_PERM_ENUM = 0x54;
+pub const _MM_PERM_BBBB: _MM_PERM_ENUM = 0x55;
+pub const _MM_PERM_BBBC: _MM_PERM_ENUM = 0x56;
+pub const _MM_PERM_BBBD: _MM_PERM_ENUM = 0x57;
+pub const _MM_PERM_BBCA: _MM_PERM_ENUM = 0x58;
+pub const _MM_PERM_BBCB: _MM_PERM_ENUM = 0x59;
+pub const _MM_PERM_BBCC: _MM_PERM_ENUM = 0x5A;
+pub const _MM_PERM_BBCD: _MM_PERM_ENUM = 0x5B;
+pub const _MM_PERM_BBDA: _MM_PERM_ENUM = 0x5C;
+pub const _MM_PERM_BBDB: _MM_PERM_ENUM = 0x5D;
+pub const _MM_PERM_BBDC: _MM_PERM_ENUM = 0x5E;
+pub const _MM_PERM_BBDD: _MM_PERM_ENUM = 0x5F;
+pub const _MM_PERM_BCAA: _MM_PERM_ENUM = 0x60;
+pub const _MM_PERM_BCAB: _MM_PERM_ENUM = 0x61;
+pub const _MM_PERM_BCAC: _MM_PERM_ENUM = 0x62;
+pub const _MM_PERM_BCAD: _MM_PERM_ENUM = 0x63;
+pub const _MM_PERM_BCBA: _MM_PERM_ENUM = 0x64;
+pub const _MM_PERM_BCBB: _MM_PERM_ENUM = 0x65;
+pub const _MM_PERM_BCBC: _MM_PERM_ENUM = 0x66;
+pub const _MM_PERM_BCBD: _MM_PERM_ENUM = 0x67;
+pub const _MM_PERM_BCCA: _MM_PERM_ENUM = 0x68;
+pub const _MM_PERM_BCCB: _MM_PERM_ENUM = 0x69;
+pub const _MM_PERM_BCCC: _MM_PERM_ENUM = 0x6A;
+pub const _MM_PERM_BCCD: _MM_PERM_ENUM = 0x6B;
+pub const _MM_PERM_BCDA: _MM_PERM_ENUM = 0x6C;
+pub const _MM_PERM_BCDB: _MM_PERM_ENUM = 0x6D;
+pub const _MM_PERM_BCDC: _MM_PERM_ENUM = 0x6E;
+pub const _MM_PERM_BCDD: _MM_PERM_ENUM = 0x6F;
+pub const _MM_PERM_BDAA: _MM_PERM_ENUM = 0x70;
+pub const _MM_PERM_BDAB: _MM_PERM_ENUM = 0x71;
+pub const _MM_PERM_BDAC: _MM_PERM_ENUM = 0x72;
+pub const _MM_PERM_BDAD: _MM_PERM_ENUM = 0x73;
+pub const _MM_PERM_BDBA: _MM_PERM_ENUM = 0x74;
+pub const _MM_PERM_BDBB: _MM_PERM_ENUM = 0x75;
+pub const _MM_PERM_BDBC: _MM_PERM_ENUM = 0x76;
+pub const _MM_PERM_BDBD: _MM_PERM_ENUM = 0x77;
+pub const _MM_PERM_BDCA: _MM_PERM_ENUM = 0x78;
+pub const _MM_PERM_BDCB: _MM_PERM_ENUM = 0x79;
+pub const _MM_PERM_BDCC: _MM_PERM_ENUM = 0x7A;
+pub const _MM_PERM_BDCD: _MM_PERM_ENUM = 0x7B;
+pub const _MM_PERM_BDDA: _MM_PERM_ENUM = 0x7C;
+pub const _MM_PERM_BDDB: _MM_PERM_ENUM = 0x7D;
+pub const _MM_PERM_BDDC: _MM_PERM_ENUM = 0x7E;
+pub const _MM_PERM_BDDD: _MM_PERM_ENUM = 0x7F;
+pub const _MM_PERM_CAAA: _MM_PERM_ENUM = 0x80;
+pub const _MM_PERM_CAAB: _MM_PERM_ENUM = 0x81;
+pub const _MM_PERM_CAAC: _MM_PERM_ENUM = 0x82;
+pub const _MM_PERM_CAAD: _MM_PERM_ENUM = 0x83;
+pub const _MM_PERM_CABA: _MM_PERM_ENUM = 0x84;
+pub const _MM_PERM_CABB: _MM_PERM_ENUM = 0x85;
+pub const _MM_PERM_CABC: _MM_PERM_ENUM = 0x86;
+pub const _MM_PERM_CABD: _MM_PERM_ENUM = 0x87;
+pub const _MM_PERM_CACA: _MM_PERM_ENUM = 0x88;
+pub const _MM_PERM_CACB: _MM_PERM_ENUM = 0x89;
+pub const _MM_PERM_CACC: _MM_PERM_ENUM = 0x8A;
+pub const _MM_PERM_CACD: _MM_PERM_ENUM = 0x8B;
+pub const _MM_PERM_CADA: _MM_PERM_ENUM = 0x8C;
+pub const _MM_PERM_CADB: _MM_PERM_ENUM = 0x8D;
+pub const _MM_PERM_CADC: _MM_PERM_ENUM = 0x8E;
+pub const _MM_PERM_CADD: _MM_PERM_ENUM = 0x8F;
+pub const _MM_PERM_CBAA: _MM_PERM_ENUM = 0x90;
+pub const _MM_PERM_CBAB: _MM_PERM_ENUM = 0x91;
+pub const _MM_PERM_CBAC: _MM_PERM_ENUM = 0x92;
+pub const _MM_PERM_CBAD: _MM_PERM_ENUM = 0x93;
+pub const _MM_PERM_CBBA: _MM_PERM_ENUM = 0x94;
+pub const _MM_PERM_CBBB: _MM_PERM_ENUM = 0x95;
+pub const _MM_PERM_CBBC: _MM_PERM_ENUM = 0x96;
+pub const _MM_PERM_CBBD: _MM_PERM_ENUM = 0x97;
+pub const _MM_PERM_CBCA: _MM_PERM_ENUM = 0x98;
+pub const _MM_PERM_CBCB: _MM_PERM_ENUM = 0x99;
+pub const _MM_PERM_CBCC: _MM_PERM_ENUM = 0x9A;
+pub const _MM_PERM_CBCD: _MM_PERM_ENUM = 0x9B;
+pub const _MM_PERM_CBDA: _MM_PERM_ENUM = 0x9C;
+pub const _MM_PERM_CBDB: _MM_PERM_ENUM = 0x9D;
+pub const _MM_PERM_CBDC: _MM_PERM_ENUM = 0x9E;
+pub const _MM_PERM_CBDD: _MM_PERM_ENUM = 0x9F;
+pub const _MM_PERM_CCAA: _MM_PERM_ENUM = 0xA0;
+pub const _MM_PERM_CCAB: _MM_PERM_ENUM = 0xA1;
+pub const _MM_PERM_CCAC: _MM_PERM_ENUM = 0xA2;
+pub const _MM_PERM_CCAD: _MM_PERM_ENUM = 0xA3;
+pub const _MM_PERM_CCBA: _MM_PERM_ENUM = 0xA4;
+pub const _MM_PERM_CCBB: _MM_PERM_ENUM = 0xA5;
+pub const _MM_PERM_CCBC: _MM_PERM_ENUM = 0xA6;
+pub const _MM_PERM_CCBD: _MM_PERM_ENUM = 0xA7;
+pub const _MM_PERM_CCCA: _MM_PERM_ENUM = 0xA8;
+pub const _MM_PERM_CCCB: _MM_PERM_ENUM = 0xA9;
+pub const _MM_PERM_CCCC: _MM_PERM_ENUM = 0xAA;
+pub const _MM_PERM_CCCD: _MM_PERM_ENUM = 0xAB;
+pub const _MM_PERM_CCDA: _MM_PERM_ENUM = 0xAC;
+pub const _MM_PERM_CCDB: _MM_PERM_ENUM = 0xAD;
+pub const _MM_PERM_CCDC: _MM_PERM_ENUM = 0xAE;
+pub const _MM_PERM_CCDD: _MM_PERM_ENUM = 0xAF;
+pub const _MM_PERM_CDAA: _MM_PERM_ENUM = 0xB0;
+pub const _MM_PERM_CDAB: _MM_PERM_ENUM = 0xB1;
+pub const _MM_PERM_CDAC: _MM_PERM_ENUM = 0xB2;
+pub const _MM_PERM_CDAD: _MM_PERM_ENUM = 0xB3;
+pub const _MM_PERM_CDBA: _MM_PERM_ENUM = 0xB4;
+pub const _MM_PERM_CDBB: _MM_PERM_ENUM = 0xB5;
+pub const _MM_PERM_CDBC: _MM_PERM_ENUM = 0xB6;
+pub const _MM_PERM_CDBD: _MM_PERM_ENUM = 0xB7;
+pub const _MM_PERM_CDCA: _MM_PERM_ENUM = 0xB8;
+pub const _MM_PERM_CDCB: _MM_PERM_ENUM = 0xB9;
+pub const _MM_PERM_CDCC: _MM_PERM_ENUM = 0xBA;
+pub const _MM_PERM_CDCD: _MM_PERM_ENUM = 0xBB;
+pub const _MM_PERM_CDDA: _MM_PERM_ENUM = 0xBC;
+pub const _MM_PERM_CDDB: _MM_PERM_ENUM = 0xBD;
+pub const _MM_PERM_CDDC: _MM_PERM_ENUM = 0xBE;
+pub const _MM_PERM_CDDD: _MM_PERM_ENUM = 0xBF;
+pub const _MM_PERM_DAAA: _MM_PERM_ENUM = 0xC0;
+pub const _MM_PERM_DAAB: _MM_PERM_ENUM = 0xC1;
+pub const _MM_PERM_DAAC: _MM_PERM_ENUM = 0xC2;
+pub const _MM_PERM_DAAD: _MM_PERM_ENUM = 0xC3;
+pub const _MM_PERM_DABA: _MM_PERM_ENUM = 0xC4;
+pub const _MM_PERM_DABB: _MM_PERM_ENUM = 0xC5;
+pub const _MM_PERM_DABC: _MM_PERM_ENUM = 0xC6;
+pub const _MM_PERM_DABD: _MM_PERM_ENUM = 0xC7;
+pub const _MM_PERM_DACA: _MM_PERM_ENUM = 0xC8;
+pub const _MM_PERM_DACB: _MM_PERM_ENUM = 0xC9;
+pub const _MM_PERM_DACC: _MM_PERM_ENUM = 0xCA;
+pub const _MM_PERM_DACD: _MM_PERM_ENUM = 0xCB;
+pub const _MM_PERM_DADA: _MM_PERM_ENUM = 0xCC;
+pub const _MM_PERM_DADB: _MM_PERM_ENUM = 0xCD;
+pub const _MM_PERM_DADC: _MM_PERM_ENUM = 0xCE;
+pub const _MM_PERM_DADD: _MM_PERM_ENUM = 0xCF;
+pub const _MM_PERM_DBAA: _MM_PERM_ENUM = 0xD0;
+pub const _MM_PERM_DBAB: _MM_PERM_ENUM = 0xD1;
+pub const _MM_PERM_DBAC: _MM_PERM_ENUM = 0xD2;
+pub const _MM_PERM_DBAD: _MM_PERM_ENUM = 0xD3;
+pub const _MM_PERM_DBBA: _MM_PERM_ENUM = 0xD4;
+pub const _MM_PERM_DBBB: _MM_PERM_ENUM = 0xD5;
+pub const _MM_PERM_DBBC: _MM_PERM_ENUM = 0xD6;
+pub const _MM_PERM_DBBD: _MM_PERM_ENUM = 0xD7;
+pub const _MM_PERM_DBCA: _MM_PERM_ENUM = 0xD8;
+pub const _MM_PERM_DBCB: _MM_PERM_ENUM = 0xD9;
+pub const _MM_PERM_DBCC: _MM_PERM_ENUM = 0xDA;
+pub const _MM_PERM_DBCD: _MM_PERM_ENUM = 0xDB;
+pub const _MM_PERM_DBDA: _MM_PERM_ENUM = 0xDC;
+pub const _MM_PERM_DBDB: _MM_PERM_ENUM = 0xDD;
+pub const _MM_PERM_DBDC: _MM_PERM_ENUM = 0xDE;
+pub const _MM_PERM_DBDD: _MM_PERM_ENUM = 0xDF;
+pub const _MM_PERM_DCAA: _MM_PERM_ENUM = 0xE0;
+pub const _MM_PERM_DCAB: _MM_PERM_ENUM = 0xE1;
+pub const _MM_PERM_DCAC: _MM_PERM_ENUM = 0xE2;
+pub const _MM_PERM_DCAD: _MM_PERM_ENUM = 0xE3;
+pub const _MM_PERM_DCBA: _MM_PERM_ENUM = 0xE4;
+pub const _MM_PERM_DCBB: _MM_PERM_ENUM = 0xE5;
+pub const _MM_PERM_DCBC: _MM_PERM_ENUM = 0xE6;
+pub const _MM_PERM_DCBD: _MM_PERM_ENUM = 0xE7;
+pub const _MM_PERM_DCCA: _MM_PERM_ENUM = 0xE8;
+pub const _MM_PERM_DCCB: _MM_PERM_ENUM = 0xE9;
+pub const _MM_PERM_DCCC: _MM_PERM_ENUM = 0xEA;
+pub const _MM_PERM_DCCD: _MM_PERM_ENUM = 0xEB;
+pub const _MM_PERM_DCDA: _MM_PERM_ENUM = 0xEC;
+pub const _MM_PERM_DCDB: _MM_PERM_ENUM = 0xED;
+pub const _MM_PERM_DCDC: _MM_PERM_ENUM = 0xEE;
+pub const _MM_PERM_DCDD: _MM_PERM_ENUM = 0xEF;
+pub const _MM_PERM_DDAA: _MM_PERM_ENUM = 0xF0;
+pub const _MM_PERM_DDAB: _MM_PERM_ENUM = 0xF1;
+pub const _MM_PERM_DDAC: _MM_PERM_ENUM = 0xF2;
+pub const _MM_PERM_DDAD: _MM_PERM_ENUM = 0xF3;
+pub const _MM_PERM_DDBA: _MM_PERM_ENUM = 0xF4;
+pub const _MM_PERM_DDBB: _MM_PERM_ENUM = 0xF5;
+pub const _MM_PERM_DDBC: _MM_PERM_ENUM = 0xF6;
+pub const _MM_PERM_DDBD: _MM_PERM_ENUM = 0xF7;
+pub const _MM_PERM_DDCA: _MM_PERM_ENUM = 0xF8;
+pub const _MM_PERM_DDCB: _MM_PERM_ENUM = 0xF9;
+pub const _MM_PERM_DDCC: _MM_PERM_ENUM = 0xFA;
+pub const _MM_PERM_DDCD: _MM_PERM_ENUM = 0xFB;
+pub const _MM_PERM_DDDA: _MM_PERM_ENUM = 0xFC;
+pub const _MM_PERM_DDDB: _MM_PERM_ENUM = 0xFD;
+pub const _MM_PERM_DDDC: _MM_PERM_ENUM = 0xFE;
+pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF;
+
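+// Usage sketch for the _MM_PERM_* control bytes (illustrative, not upstream
+// code): the letters name source dwords within each 128-bit block, A being
+// dword 0 and D dword 3, listed from the highest destination dword down to
+// the lowest; _MM_PERM_DCBA (0xE4) is therefore the identity shuffle and
+// _MM_PERM_AAAA broadcasts dword 0 of each block.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx512f")]
+unsafe fn perm_shuffle_sketch() {
+ // Each 128-bit block of `a` holds [1, 2, 3, 4].
+ let a = _mm512_set4_epi32(4, 3, 2, 1);
+ let r = _mm512_shuffle_epi32::<_MM_PERM_AAAA>(a);
+ assert_eq!(_mm512_cmpeq_epi32_mask(r, _mm512_set1_epi32(1)), 0xFFFF);
+}
+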
+#[allow(improper_ctypes)]
+extern "C" {
+ // Truncating scalar conversions used by the _mm_cvtt* intrinsics above;
+ // unlike the rounding-capable variants, these always truncate toward zero.
+ #[link_name = "llvm.x86.avx512.cvttss2si"]
+ fn vcvttss2si(a: f32x4, sae: i32) -> i32;
+ #[link_name = "llvm.x86.avx512.cvttss2usi"]
+ fn vcvttss2usi(a: f32x4, sae: i32) -> u32;
+ #[link_name = "llvm.x86.avx512.cvttsd2si"]
+ fn vcvttsd2si(a: f64x2, sae: i32) -> i32;
+ #[link_name = "llvm.x86.avx512.cvttsd2usi"]
+ fn vcvttsd2usi(a: f64x2, sae: i32) -> u32;
+
+ #[link_name = "llvm.x86.avx512.pmul.dq.512"]
+ fn vpmuldq(a: i32x16, b: i32x16) -> i64x8;
+ #[link_name = "llvm.x86.avx512.pmulu.dq.512"]
+ fn vpmuludq(a: u32x16, b: u32x16) -> u64x8;
+
+ #[link_name = "llvm.x86.avx512.mask.pmaxs.d.512"]
+ fn vpmaxsd(a: i32x16, b: i32x16) -> i32x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmaxs.q.512"]
+ fn vpmaxsq(a: i64x8, b: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.mask.pmaxs.q.256"]
+ fn vpmaxsq256(a: i64x4, b: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx512.mask.pmaxs.q.128"]
+ fn vpmaxsq128(a: i64x2, b: i64x2) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.pmins.d.512"]
+ fn vpminsd(a: i32x16, b: i32x16) -> i32x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmins.q.512"]
+ fn vpminsq(a: i64x8, b: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.mask.pmins.q.256"]
+ fn vpminsq256(a: i64x4, b: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx512.mask.pmins.q.128"]
+ fn vpminsq128(a: i64x2, b: i64x2) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.pmaxu.d.512"]
+ fn vpmaxud(a: u32x16, b: u32x16) -> u32x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmaxu.q.512"]
+ fn vpmaxuq(a: u64x8, b: u64x8) -> u64x8;
+ #[link_name = "llvm.x86.avx512.mask.pmaxu.q.256"]
+ fn vpmaxuq256(a: u64x4, b: u64x4) -> u64x4;
+ #[link_name = "llvm.x86.avx512.mask.pmaxu.q.128"]
+ fn vpmaxuq128(a: u64x2, b: u64x2) -> u64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.pminu.d.512"]
+ fn vpminud(a: u32x16, b: u32x16) -> u32x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pminu.q.512"]
+ fn vpminuq(a: u64x8, b: u64x8) -> u64x8;
+ #[link_name = "llvm.x86.avx512.mask.pminu.q.256"]
+ fn vpminuq256(a: u64x4, b: u64x4) -> u64x4;
+ #[link_name = "llvm.x86.avx512.mask.pminu.q.128"]
+ fn vpminuq128(a: u64x2, b: u64x2) -> u64x2;
+
+ #[link_name = "llvm.x86.avx512.sqrt.ps.512"]
+ fn vsqrtps(a: f32x16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.sqrt.pd.512"]
+ fn vsqrtpd(a: f64x8, rounding: i32) -> f64x8;
+
+ #[link_name = "llvm.fma.v16f32"]
+ fn vfmadd132ps(a: f32x16, b: f32x16, c: f32x16) -> f32x16;
+ #[link_name = "llvm.fma.v8f64"]
+ fn vfmadd132pd(a: f64x8, b: f64x8, c: f64x8) -> f64x8;
+
+ #[link_name = "llvm.x86.avx512.vfmadd.ps.512"]
+ fn vfmadd132psround(a: f32x16, b: f32x16, c: f32x16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.vfmadd.pd.512"]
+ fn vfmadd132pdround(a: f64x8, b: f64x8, c: f64x8, rounding: i32) -> f64x8;
+
+ #[link_name = "llvm.x86.avx512.vfmaddsub.ps.512"]
+ fn vfmaddsub213ps(a: f32x16, b: f32x16, c: f32x16, d: i32) -> f32x16; //from clang
+ #[link_name = "llvm.x86.avx512.vfmaddsub.pd.512"]
+ fn vfmaddsub213pd(a: f64x8, b: f64x8, c: f64x8, d: i32) -> f64x8; //from clang
+
+ #[link_name = "llvm.x86.avx512.add.ps.512"]
+ fn vaddps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.add.pd.512"]
+ fn vaddpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.sub.ps.512"]
+ fn vsubps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.sub.pd.512"]
+ fn vsubpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mul.ps.512"]
+ fn vmulps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mul.pd.512"]
+ fn vmulpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.div.ps.512"]
+ fn vdivps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.div.pd.512"]
+ fn vdivpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
+
+ #[link_name = "llvm.x86.avx512.max.ps.512"]
+ fn vmaxps(a: f32x16, b: f32x16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.max.pd.512"]
+ fn vmaxpd(a: f64x8, b: f64x8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.min.ps.512"]
+ fn vminps(a: f32x16, b: f32x16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.min.pd.512"]
+ fn vminpd(a: f64x8, b: f64x8, sae: i32) -> f64x8;
+
+ #[link_name = "llvm.x86.avx512.mask.getexp.ps.512"]
+ fn vgetexpps(a: f32x16, src: f32x16, m: u16, sae: i32) -> f32x16;
+
+ #[link_name = "llvm.x86.avx512.mask.getexp.ps.256"]
+ fn vgetexpps256(a: f32x8, src: f32x8, m: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.mask.getexp.ps.128"]
+ fn vgetexpps128(a: f32x4, src: f32x4, m: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.getexp.pd.512"]
+ fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.getexp.pd.256"]
+ fn vgetexppd256(a: f64x4, src: f64x4, m: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.mask.getexp.pd.128"]
+ fn vgetexppd128(a: f64x2, src: f64x2, m: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"]
+ fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.rndscale.ps.256"]
+ fn vrndscaleps256(a: f32x8, imm8: i32, src: f32x8, mask: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.mask.rndscale.ps.128"]
+ fn vrndscaleps128(a: f32x4, imm8: i32, src: f32x4, mask: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"]
+ fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.rndscale.pd.256"]
+ fn vrndscalepd256(a: f64x4, imm8: i32, src: f64x4, mask: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.mask.rndscale.pd.128"]
+ fn vrndscalepd128(a: f64x2, imm8: i32, src: f64x2, mask: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"]
+ fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.scalef.ps.256"]
+ fn vscalefps256(a: f32x8, b: f32x8, src: f32x8, mask: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.mask.scalef.ps.128"]
+ fn vscalefps128(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"]
+ fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.scalef.pd.256"]
+ fn vscalefpd256(a: f64x4, b: f64x4, src: f64x4, mask: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.mask.scalef.pd.128"]
+ fn vscalefpd128(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"]
+ fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.256"]
+ fn vfixupimmps256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.128"]
+ fn vfixupimmps128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"]
+ fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.256"]
+ fn vfixupimmpd256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.128"]
+ fn vfixupimmpd128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"]
+ fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.256"]
+ fn vfixupimmpsz256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.128"]
+ fn vfixupimmpsz128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"]
+ fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.256"]
+ fn vfixupimmpdz256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.128"]
+ fn vfixupimmpdz128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.pternlog.d.512"]
+ fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, imm8: i32) -> i32x16;
+ #[link_name = "llvm.x86.avx512.pternlog.d.256"]
+ fn vpternlogd256(a: i32x8, b: i32x8, c: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx512.pternlog.d.128"]
+ fn vpternlogd128(a: i32x4, b: i32x4, c: i32x4, imm8: i32) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.pternlog.q.512"]
+ fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, imm8: i32) -> i64x8;
+ #[link_name = "llvm.x86.avx512.pternlog.q.256"]
+ fn vpternlogq256(a: i64x4, b: i64x4, c: i64x4, imm8: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx512.pternlog.q.128"]
+ fn vpternlogq128(a: i64x2, b: i64x2, c: i64x2, imm8: i32) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"]
+ fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.getmant.ps.256"]
+ fn vgetmantps256(a: f32x8, mantissas: i32, src: f32x8, m: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.mask.getmant.ps.128"]
+ fn vgetmantps128(a: f32x4, mantissas: i32, src: f32x4, m: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"]
+ fn vgetmantpd(a: f64x8, mantissas: i32, src: f64x8, m: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.getmant.pd.256"]
+ fn vgetmantpd256(a: f64x4, mantissas: i32, src: f64x4, m: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.mask.getmant.pd.128"]
+ fn vgetmantpd128(a: f64x2, mantissas: i32, src: f64x2, m: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.rcp14.ps.512"]
+ fn vrcp14ps(a: f32x16, src: f32x16, m: u16) -> f32x16;
+ #[link_name = "llvm.x86.avx512.rcp14.ps.256"]
+ fn vrcp14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.rcp14.ps.128"]
+ fn vrcp14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.rcp14.pd.512"]
+ fn vrcp14pd(a: f64x8, src: f64x8, m: u8) -> f64x8;
+ #[link_name = "llvm.x86.avx512.rcp14.pd.256"]
+ fn vrcp14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.rcp14.pd.128"]
+ fn vrcp14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.rsqrt14.ps.512"]
+ fn vrsqrt14ps(a: f32x16, src: f32x16, m: u16) -> f32x16;
+ #[link_name = "llvm.x86.avx512.rsqrt14.ps.256"]
+ fn vrsqrt14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.rsqrt14.ps.128"]
+ fn vrsqrt14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.rsqrt14.pd.512"]
+ fn vrsqrt14pd(a: f64x8, src: f64x8, m: u8) -> f64x8;
+ #[link_name = "llvm.x86.avx512.rsqrt14.pd.256"]
+ fn vrsqrt14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.rsqrt14.pd.128"]
+ fn vrsqrt14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.cvtps2dq.512"]
+ fn vcvtps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16;
+
+ #[link_name = "llvm.x86.avx512.mask.cvtps2udq.512"]
+ fn vcvtps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16;
+ #[link_name = "llvm.x86.avx512.mask.cvtps2udq.256"]
+ fn vcvtps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8;
+ #[link_name = "llvm.x86.avx512.mask.cvtps2udq.128"]
+ fn vcvtps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.cvtps2pd.512"]
+ fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.cvtpd2ps.512"]
+ fn vcvtpd2ps(a: f64x8, src: f32x8, mask: u8, rounding: i32) -> f32x8;
+
+ #[link_name = "llvm.x86.avx512.mask.cvtpd2dq.512"]
+ fn vcvtpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8;
+
+ #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.512"]
+ fn vcvtpd2udq(a: f64x8, src: u32x8, mask: u8, rounding: i32) -> u32x8;
+ #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.256"]
+ fn vcvtpd2udq256(a: f64x4, src: u32x4, mask: u8) -> u32x4;
+ #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.128"]
+ fn vcvtpd2udq128(a: f64x2, src: u32x4, mask: u8) -> u32x4;
+
+ #[link_name = "llvm.x86.avx512.sitofp.round.v16f32.v16i32"]
+ fn vcvtdq2ps(a: i32x16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.uitofp.round.v16f32.v16i32"]
+ fn vcvtudq2ps(a: u32x16, rounding: i32) -> f32x16;
+
+ #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.512"]
+ fn vcvtps2ph(a: f32x16, sae: i32, src: i16x16, mask: u16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.256"]
+ fn vcvtps2ph256(a: f32x8, sae: i32, src: i16x8, mask: u8) -> i16x8;
+ #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.128"]
+ fn vcvtps2ph128(a: f32x4, sae: i32, src: i16x8, mask: u8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.vcvtph2ps.512"]
+ fn vcvtph2ps(a: i16x16, src: f32x16, mask: u16, sae: i32) -> f32x16;
+
+ #[link_name = "llvm.x86.avx512.mask.cvttps2dq.512"]
+ fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16;
+ #[link_name = "llvm.x86.avx512.mask.cvttps2dq.256"]
+ fn vcvttps2dq256(a: f32x8, src: i32x8, mask: u8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.cvttps2dq.128"]
+ fn vcvttps2dq128(a: f32x4, src: i32x4, mask: u8) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"]
+ fn vcvttps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16;
+ #[link_name = "llvm.x86.avx512.mask.cvttps2udq.256"]
+ fn vcvttps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8;
+ #[link_name = "llvm.x86.avx512.mask.cvttps2udq.128"]
+ fn vcvttps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"]
+ fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.256"]
+ fn vcvttpd2dq256(a: f64x4, src: i32x4, mask: u8) -> i32x4;
+ #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.128"]
+ fn vcvttpd2dq128(a: f64x2, src: i32x4, mask: u8) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.512"]
+ fn vcvttpd2udq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> u32x8;
+ #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.256"]
+ fn vcvttpd2udq256(a: f64x4, src: i32x4, mask: u8) -> u32x4;
+ #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.128"]
+ fn vcvttpd2udq128(a: f64x2, src: i32x4, mask: u8) -> u32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.pmov.dw.128"]
+ fn vpmovdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmov.db.256"]
+ fn vpmovdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmov.db.128"]
+ fn vpmovdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmov.qw.256"]
+ fn vpmovqw256(a: i64x4, src: i16x8, mask: u8) -> i16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmov.qw.128"]
+ fn vpmovqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmov.qb.256"]
+ fn vpmovqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmov.qb.128"]
+ fn vpmovqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmov.qd.128"]
+ fn vpmovqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4;
+
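+ // Truncating stores (pmov*.mem): narrow each element and write the
+ // mask-selected lanes directly to memory.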
+ #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.512"]
+ fn vpmovdwmem(mem_addr: *mut i8, a: i32x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.256"]
+ fn vpmovdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.128"]
+ fn vpmovdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.512"]
+ fn vpmovsdwmem(mem_addr: *mut i8, a: i32x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.256"]
+ fn vpmovsdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.128"]
+ fn vpmovsdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.512"]
+ fn vpmovusdwmem(mem_addr: *mut i8, a: i32x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.256"]
+ fn vpmovusdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.128"]
+ fn vpmovusdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.512"]
+ fn vpmovdbmem(mem_addr: *mut i8, a: i32x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.256"]
+ fn vpmovdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.128"]
+ fn vpmovdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.512"]
+ fn vpmovsdbmem(mem_addr: *mut i8, a: i32x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.256"]
+ fn vpmovsdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.128"]
+ fn vpmovsdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.512"]
+ fn vpmovusdbmem(mem_addr: *mut i8, a: i32x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.256"]
+ fn vpmovusdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.128"]
+ fn vpmovusdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.512"]
+ fn vpmovqwmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.256"]
+ fn vpmovqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.128"]
+ fn vpmovqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.512"]
+ fn vpmovsqwmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.256"]
+ fn vpmovsqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.128"]
+ fn vpmovsqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.512"]
+ fn vpmovusqwmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.256"]
+ fn vpmovusqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.128"]
+ fn vpmovusqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.512"]
+ fn vpmovqbmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.256"]
+ fn vpmovqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.128"]
+ fn vpmovqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.512"]
+ fn vpmovsqbmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.256"]
+ fn vpmovsqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.128"]
+ fn vpmovsqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.512"]
+ fn vpmovusqbmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.256"]
+ fn vpmovusqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.128"]
+ fn vpmovusqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.512"]
+ fn vpmovqdmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.256"]
+ fn vpmovqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.128"]
+ fn vpmovqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.512"]
+ fn vpmovsqdmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.256"]
+ fn vpmovsqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.128"]
+ fn vpmovsqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.512"]
+ fn vpmovusqdmem(mem_addr: *mut i8, a: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.256"]
+ fn vpmovusqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.128"]
+ fn vpmovusqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.pmov.qb.512"]
+ fn vpmovqb(a: i64x8, src: i8x16, mask: u8) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.dw.512"]
+ fn vpmovsdw(a: i32x16, src: i16x16, mask: u16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.dw.256"]
+ fn vpmovsdw256(a: i32x8, src: i16x8, mask: u8) -> i16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.dw.128"]
+ fn vpmovsdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.db.512"]
+ fn vpmovsdb(a: i32x16, src: i8x16, mask: u16) -> i8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.db.256"]
+ fn vpmovsdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.db.128"]
+ fn vpmovsdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qd.512"]
+ fn vpmovsqd(a: i64x8, src: i32x8, mask: u8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qd.256"]
+ fn vpmovsqd256(a: i64x4, src: i32x4, mask: u8) -> i32x4;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qd.128"]
+ fn vpmovsqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qw.512"]
+ fn vpmovsqw(a: i64x8, src: i16x8, mask: u8) -> i16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qw.256"]
+ fn vpmovsqw256(a: i64x4, src: i16x8, mask: u8) -> i16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qw.128"]
+ fn vpmovsqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qb.512"]
+ fn vpmovsqb(a: i64x8, src: i8x16, mask: u8) -> i8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qb.256"]
+ fn vpmovsqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovs.qb.128"]
+ fn vpmovsqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"]
+ fn vpmovusdw(a: u32x16, src: u16x16, mask: u16) -> u16x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.dw.256"]
+ fn vpmovusdw256(a: u32x8, src: u16x8, mask: u8) -> u16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.dw.128"]
+ fn vpmovusdw128(a: u32x4, src: u16x8, mask: u8) -> u16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"]
+ fn vpmovusdb(a: u32x16, src: u8x16, mask: u16) -> u8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.db.256"]
+ fn vpmovusdb256(a: u32x8, src: u8x16, mask: u8) -> u8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.db.128"]
+ fn vpmovusdb128(a: u32x4, src: u8x16, mask: u8) -> u8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"]
+ fn vpmovusqd(a: u64x8, src: u32x8, mask: u8) -> u32x8;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qd.256"]
+ fn vpmovusqd256(a: u64x4, src: u32x4, mask: u8) -> u32x4;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qd.128"]
+ fn vpmovusqd128(a: u64x2, src: u32x4, mask: u8) -> u32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qw.512"]
+ fn vpmovusqw(a: u64x8, src: u16x8, mask: u8) -> u16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qw.256"]
+ fn vpmovusqw256(a: u64x4, src: u16x8, mask: u8) -> u16x8;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qw.128"]
+ fn vpmovusqw128(a: u64x2, src: u16x8, mask: u8) -> u16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qb.512"]
+ fn vpmovusqb(a: u64x8, src: u8x16, mask: u8) -> u8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qb.256"]
+ fn vpmovusqb256(a: u64x4, src: u8x16, mask: u8) -> u8x16;
+ #[link_name = "llvm.x86.avx512.mask.pmovus.qb.128"]
+ fn vpmovusqb128(a: u64x2, src: u8x16, mask: u8) -> u8x16;
+
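+ // Gathers: load elements from `slice` at byte offsets `offsets * scale`,
+ // merging with `src` in the lanes whose mask bit is clear.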
+ #[link_name = "llvm.x86.avx512.gather.dpd.512"]
+ fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.gather.dps.512"]
+ fn vgatherdps(src: f32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.gather.qpd.512"]
+ fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.gather.qps.512"]
+ fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8;
+ #[link_name = "llvm.x86.avx512.gather.dpq.512"]
+ fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8;
+ #[link_name = "llvm.x86.avx512.gather.dpi.512"]
+ fn vpgatherdd(src: i32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> i32x16;
+ #[link_name = "llvm.x86.avx512.gather.qpq.512"]
+ fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8;
+ #[link_name = "llvm.x86.avx512.gather.qpi.512"]
+ fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8;
+
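+ // Scatters: store each mask-selected element of `src` to `slice` at byte
+ // offset `offsets * scale`.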
+ #[link_name = "llvm.x86.avx512.scatter.dpd.512"]
+ fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatter.dps.512"]
+ fn vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatter.qpd.512"]
+ fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatter.qps.512"]
+ fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatter.dpq.512"]
+ fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32);
+
+ #[link_name = "llvm.x86.avx512.scatter.dpi.512"]
+ fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatter.qpq.512"]
+ fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatter.qpi.512"]
+ fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32);
+
+ #[link_name = "llvm.x86.avx512.mask.cmp.ss"]
+ fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.cmp.sd"]
+ fn vcmpsd(a: __m128d, b: __m128d, op: i32, m: i8, sae: i32) -> i8;
+
+ #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"]
+ fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16;
+ #[link_name = "llvm.x86.avx512.mask.cmp.ps.256"]
+ fn vcmpps256(a: f32x8, b: f32x8, op: i32, m: i8) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.cmp.ps.128"]
+ fn vcmpps128(a: f32x4, b: f32x4, op: i32, m: i8) -> i8;
+
+ #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"]
+ fn vcmppd(a: f64x8, b: f64x8, op: i32, m: i8, sae: i32) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.cmp.pd.256"]
+ fn vcmppd256(a: f64x4, b: f64x4, op: i32, m: i8) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"]
+ fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8;
+
+ #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"]
+ fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.ucmp.q.256"]
+ fn vpcmpuq256(a: i64x4, b: i64x4, op: i32, m: i8) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.ucmp.q.128"]
+ fn vpcmpuq128(a: i64x2, b: i64x2, op: i32, m: i8) -> i8;
+
+ #[link_name = "llvm.x86.avx512.mask.cmp.q.512"]
+ fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.cmp.q.256"]
+ fn vpcmpq256(a: i64x4, b: i64x4, op: i32, m: i8) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.cmp.q.128"]
+ fn vpcmpq128(a: i64x2, b: i64x2, op: i32, m: i8) -> i8;
+
+ #[link_name = "llvm.x86.avx512.mask.ucmp.d.512"]
+ fn vpcmpud(a: i32x16, b: i32x16, op: i32, m: i16) -> i16;
+ #[link_name = "llvm.x86.avx512.mask.ucmp.d.256"]
+ fn vpcmpud256(a: i32x8, b: i32x8, op: i32, m: i8) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.ucmp.d.128"]
+ fn vpcmpud128(a: i32x4, b: i32x4, op: i32, m: i8) -> i8;
+
+ #[link_name = "llvm.x86.avx512.mask.cmp.d.512"]
+ fn vpcmpd(a: i32x16, b: i32x16, op: i32, m: i16) -> i16;
+ #[link_name = "llvm.x86.avx512.mask.cmp.d.256"]
+ fn vpcmpd256(a: i32x8, b: i32x8, op: i32, m: i8) -> i8;
+ #[link_name = "llvm.x86.avx512.mask.cmp.d.128"]
+ fn vpcmpd128(a: i32x4, b: i32x4, op: i32, m: i8) -> i8;
+
+ #[link_name = "llvm.x86.avx512.mask.prol.d.512"]
+ fn vprold(a: i32x16, i8: i32) -> i32x16;
+ #[link_name = "llvm.x86.avx512.mask.prol.d.256"]
+ fn vprold256(a: i32x8, i8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.prol.d.128"]
+ fn vprold128(a: i32x4, i8: i32) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.pror.d.512"]
+ fn vprord(a: i32x16, i8: i32) -> i32x16;
+ #[link_name = "llvm.x86.avx512.mask.pror.d.256"]
+ fn vprord256(a: i32x8, i8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.pror.d.128"]
+ fn vprord128(a: i32x4, i8: i32) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.prol.q.512"]
+ fn vprolq(a: i64x8, i8: i32) -> i64x8;
+ #[link_name = "llvm.x86.avx512.mask.prol.q.256"]
+ fn vprolq256(a: i64x4, i8: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx512.mask.prol.q.128"]
+ fn vprolq128(a: i64x2, i8: i32) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.pror.q.512"]
+ fn vprorq(a: i64x8, i8: i32) -> i64x8;
+ #[link_name = "llvm.x86.avx512.mask.pror.q.256"]
+ fn vprorq256(a: i64x4, i8: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx512.mask.pror.q.128"]
+ fn vprorq128(a: i64x2, i8: i32) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.prolv.d.512"]
+ fn vprolvd(a: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.mask.prolv.d.256"]
+ fn vprolvd256(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.prolv.d.128"]
+ fn vprolvd128(a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.prorv.d.512"]
+ fn vprorvd(a: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.mask.prorv.d.256"]
+ fn vprorvd256(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.prorv.d.128"]
+ fn vprorvd128(a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.prolv.q.512"]
+ fn vprolvq(a: i64x8, b: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.mask.prolv.q.256"]
+ fn vprolvq256(a: i64x4, b: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx512.mask.prolv.q.128"]
+ fn vprolvq128(a: i64x2, b: i64x2) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.prorv.q.512"]
+ fn vprorvq(a: i64x8, b: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.mask.prorv.q.256"]
+ fn vprorvq256(a: i64x4, b: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx512.mask.prorv.q.128"]
+ fn vprorvq128(a: i64x2, b: i64x2) -> i64x2;
+
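+ // Shifts: the *v variants take per-lane counts, the *i variants an
+ // immediate, and the remaining forms a single count in an XMM register.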
+ #[link_name = "llvm.x86.avx512.psllv.d.512"]
+ fn vpsllvd(a: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.psrlv.d.512"]
+ fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.psllv.q.512"]
+ fn vpsllvq(a: i64x8, b: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.psrlv.q.512"]
+ fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8;
+
+ #[link_name = "llvm.x86.avx512.pslli.d.512"]
+ fn vpsllid(a: i32x16, imm8: u32) -> i32x16;
+
+ #[link_name = "llvm.x86.avx2.pslli.d"]
+ fn psllid256(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.sse2.pslli.d"]
+ fn psllid128(a: i32x4, imm8: i32) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.psrli.d.512"]
+ fn vpsrlid(a: i32x16, imm8: u32) -> i32x16;
+
+ #[link_name = "llvm.x86.avx2.psrli.d"]
+ fn psrlid256(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.sse2.psrli.d"]
+ fn psrlid128(a: i32x4, imm8: i32) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.pslli.q.512"]
+ fn vpslliq(a: i64x8, imm8: u32) -> i64x8;
+
+ #[link_name = "llvm.x86.avx2.pslli.q"]
+ fn pslliq256(a: i64x4, imm8: i32) -> i64x4;
+ #[link_name = "llvm.x86.sse2.pslli.q"]
+ fn pslliq128(a: i64x2, imm8: i32) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.psrli.q.512"]
+ fn vpsrliq(a: i64x8, imm8: u32) -> i64x8;
+
+ #[link_name = "llvm.x86.avx2.psrli.q"]
+ fn psrliq256(a: i64x4, imm8: i32) -> i64x4;
+ #[link_name = "llvm.x86.sse2.psrli.q"]
+ fn psrliq128(a: i64x2, imm8: i32) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.psll.d.512"]
+ fn vpslld(a: i32x16, count: i32x4) -> i32x16;
+ #[link_name = "llvm.x86.avx512.psrl.d.512"]
+ fn vpsrld(a: i32x16, count: i32x4) -> i32x16;
+ #[link_name = "llvm.x86.avx512.psll.q.512"]
+ fn vpsllq(a: i64x8, count: i64x2) -> i64x8;
+ #[link_name = "llvm.x86.avx512.psrl.q.512"]
+ fn vpsrlq(a: i64x8, count: i64x2) -> i64x8;
+
+ #[link_name = "llvm.x86.avx512.psra.d.512"]
+ fn vpsrad(a: i32x16, count: i32x4) -> i32x16;
+
+ #[link_name = "llvm.x86.avx512.psra.q.512"]
+ fn vpsraq(a: i64x8, count: i64x2) -> i64x8;
+ #[link_name = "llvm.x86.avx512.psra.q.256"]
+ fn vpsraq256(a: i64x4, count: i64x2) -> i64x4;
+ #[link_name = "llvm.x86.avx512.psra.q.128"]
+ fn vpsraq128(a: i64x2, count: i64x2) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.psrai.d.512"]
+ fn vpsraid512(a: i32x16, imm8: u32) -> i32x16;
+ #[link_name = "llvm.x86.avx2.psrai.d"]
+ fn psraid256(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.sse2.psrai.d"]
+ fn psraid128(a: i32x4, imm8: i32) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.psrai.q.512"]
+ fn vpsraiq(a: i64x8, imm8: u32) -> i64x8;
+ #[link_name = "llvm.x86.avx512.psrai.q.256"]
+ fn vpsraiq256(a: i64x4, imm8: u32) -> i64x4;
+ #[link_name = "llvm.x86.avx512.psrai.q.128"]
+ fn vpsraiq128(a: i64x2, imm8: u32) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.psrav.d.512"]
+ fn vpsravd(a: i32x16, count: i32x16) -> i32x16;
+
+ #[link_name = "llvm.x86.avx512.psrav.q.512"]
+ fn vpsravq(a: i64x8, count: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.psrav.q.256"]
+ fn vpsravq256(a: i64x4, count: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx512.psrav.q.128"]
+ fn vpsravq128(a: i64x2, count: i64x2) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"]
+ fn vpermilps(a: f32x16, b: i32x16) -> f32x16;
+ #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"]
+ fn vpermilpd(a: f64x8, b: i64x8) -> f64x8;
+
+ #[link_name = "llvm.x86.avx512.permvar.si.512"]
+ fn vpermd(a: i32x16, idx: i32x16) -> i32x16;
+
+ #[link_name = "llvm.x86.avx512.permvar.di.512"]
+ fn vpermq(a: i64x8, idx: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.permvar.di.256"]
+ fn vpermq256(a: i64x4, idx: i64x4) -> i64x4;
+
+ #[link_name = "llvm.x86.avx512.permvar.sf.512"]
+ fn vpermps(a: f32x16, idx: i32x16) -> f32x16;
+
+ #[link_name = "llvm.x86.avx512.permvar.df.512"]
+ fn vpermpd(a: f64x8, idx: i64x8) -> f64x8;
+ #[link_name = "llvm.x86.avx512.permvar.df.256"]
+ fn vpermpd256(a: f64x4, idx: i64x4) -> f64x4;
+
+ #[link_name = "llvm.x86.avx512.vpermi2var.d.512"]
+ fn vpermi2d(a: i32x16, idx: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.vpermi2var.d.256"]
+ fn vpermi2d256(a: i32x8, idx: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.vpermi2var.d.128"]
+ fn vpermi2d128(a: i32x4, idx: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.vpermi2var.q.512"]
+ fn vpermi2q(a: i64x8, idx: i64x8, b: i64x8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.vpermi2var.q.256"]
+ fn vpermi2q256(a: i64x4, idx: i64x4, b: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx512.vpermi2var.q.128"]
+ fn vpermi2q128(a: i64x2, idx: i64x2, b: i64x2) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.vpermi2var.ps.512"]
+ fn vpermi2ps(a: f32x16, idx: i32x16, b: f32x16) -> f32x16;
+ #[link_name = "llvm.x86.avx512.vpermi2var.ps.256"]
+ fn vpermi2ps256(a: f32x8, idx: i32x8, b: f32x8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.vpermi2var.ps.128"]
+ fn vpermi2ps128(a: f32x4, idx: i32x4, b: f32x4) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.vpermi2var.pd.512"]
+ fn vpermi2pd(a: f64x8, idx: i64x8, b: f64x8) -> f64x8;
+ #[link_name = "llvm.x86.avx512.vpermi2var.pd.256"]
+ fn vpermi2pd256(a: f64x4, idx: i64x4, b: f64x4) -> f64x4;
+ #[link_name = "llvm.x86.avx512.vpermi2var.pd.128"]
+ fn vpermi2pd128(a: f64x2, idx: i64x2, b: f64x2) -> f64x2;
+
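+ // Compress packs the mask-selected elements contiguously into the low
+ // lanes; expand (below) is the inverse, spreading the low lanes out to the
+ // positions selected by the mask.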
+ #[link_name = "llvm.x86.avx512.mask.compress.d.512"]
+ fn vpcompressd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.mask.compress.d.256"]
+ fn vpcompressd256(a: i32x8, src: i32x8, mask: u8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.compress.d.128"]
+ fn vpcompressd128(a: i32x4, src: i32x4, mask: u8) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.compress.q.512"]
+ fn vpcompressq(a: i64x8, src: i64x8, mask: u8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.mask.compress.q.256"]
+ fn vpcompressq256(a: i64x4, src: i64x4, mask: u8) -> i64x4;
+ #[link_name = "llvm.x86.avx512.mask.compress.q.128"]
+ fn vpcompressq128(a: i64x2, src: i64x2, mask: u8) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.compress.ps.512"]
+ fn vcompressps(a: f32x16, src: f32x16, mask: u16) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.compress.ps.256"]
+ fn vcompressps256(a: f32x8, src: f32x8, mask: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.mask.compress.ps.128"]
+ fn vcompressps128(a: f32x4, src: f32x4, mask: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.compress.pd.512"]
+ fn vcompresspd(a: f64x8, src: f64x8, mask: u8) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.compress.pd.256"]
+ fn vcompresspd256(a: f64x4, src: f64x4, mask: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.mask.compress.pd.128"]
+ fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"]
+ fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"]
+ fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"]
+ fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"]
+ fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.q.256"]
+ fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"]
+ fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"]
+ fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"]
+ fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"]
+ fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"]
+ fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"]
+ fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"]
+ fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.expand.d.512"]
+ fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.mask.expand.d.256"]
+ fn vpexpandd256(a: i32x8, src: i32x8, mask: u8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.mask.expand.d.128"]
+ fn vpexpandd128(a: i32x4, src: i32x4, mask: u8) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.expand.q.512"]
+ fn vpexpandq(a: i64x8, src: i64x8, mask: u8) -> i64x8;
+ #[link_name = "llvm.x86.avx512.mask.expand.q.256"]
+ fn vpexpandq256(a: i64x4, src: i64x4, mask: u8) -> i64x4;
+ #[link_name = "llvm.x86.avx512.mask.expand.q.128"]
+ fn vpexpandq128(a: i64x2, src: i64x2, mask: u8) -> i64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.expand.ps.512"]
+ fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.expand.ps.256"]
+ fn vexpandps256(a: f32x8, src: f32x8, mask: u8) -> f32x8;
+ #[link_name = "llvm.x86.avx512.mask.expand.ps.128"]
+ fn vexpandps128(a: f32x4, src: f32x4, mask: u8) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.mask.expand.pd.512"]
+ fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.expand.pd.256"]
+ fn vexpandpd256(a: f64x4, src: f64x4, mask: u8) -> f64x4;
+ #[link_name = "llvm.x86.avx512.mask.expand.pd.128"]
+ fn vexpandpd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.add.ss.round"]
+ fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.add.sd.round"]
+ fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.sub.ss.round"]
+ fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.sub.sd.round"]
+ fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.mul.ss.round"]
+ fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.mul.sd.round"]
+ fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.div.ss.round"]
+ fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.div.sd.round"]
+ fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.max.ss.round"]
+ fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.max.sd.round"]
+ fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.min.ss.round"]
+ fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.min.sd.round"]
+ fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.sqrt.ss"]
+ fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.sqrt.sd"]
+ fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.getexp.ss"]
+ fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.getexp.sd"]
+ fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.getmant.ss"]
+ fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.getmant.sd"]
+ fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.rsqrt14.ss"]
+ fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
+ #[link_name = "llvm.x86.avx512.rsqrt14.sd"]
+ fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
+ #[link_name = "llvm.x86.avx512.rcp14.ss"]
+ fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
+ #[link_name = "llvm.x86.avx512.rcp14.sd"]
+ fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.rndscale.ss"]
+ fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.rndscale.sd"]
+ fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.scalef.ss"]
+ fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.scalef.sd"]
+ fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.vfmadd.f32"]
+ fn vfmadd132ss(a: f32, b: f32, c: f32, rounding: i32) -> f32;
+ #[link_name = "llvm.x86.avx512.vfmadd.f64"]
+ fn vfmadd132sd(a: f64, b: f64, c: f64, rounding: i32) -> f64;
+
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.ss"]
+ fn vfixupimmss(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.sd"]
+ fn vfixupimmsd(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.ss"]
+ fn vfixupimmssz(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.sd"]
+ fn vfixupimmsdz(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.cvtss2sd.round"]
+ fn vcvtss2sd(a: f64x2, b: f32x4, src: f64x2, mask: u8, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.cvtsd2ss.round"]
+ fn vcvtsd2ss(a: f32x4, b: f64x2, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.vcvtss2si32"]
+ fn vcvtss2si(a: f32x4, rounding: i32) -> i32;
+ #[link_name = "llvm.x86.avx512.vcvtss2usi32"]
+ fn vcvtss2usi(a: f32x4, rounding: i32) -> u32;
+
+ #[link_name = "llvm.x86.avx512.vcvtsd2si32"]
+ fn vcvtsd2si(a: f64x2, rounding: i32) -> i32;
+ #[link_name = "llvm.x86.avx512.vcvtsd2usi32"]
+ fn vcvtsd2usi(a: f64x2, rounding: i32) -> u32;
+
+ #[link_name = "llvm.x86.avx512.cvtsi2ss32"]
+ fn vcvtsi2ss(a: f32x4, b: i32, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.cvtsi2sd64"]
+ fn vcvtsi2sd(a: f64x2, b: i64, rounding: i32) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.cvtusi2ss"]
+ fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.cvtusi642sd"]
+ fn vcvtusi2sd(a: f64x2, b: u64, rounding: i32) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.vcomi.ss"]
+ fn vcomiss(a: f32x4, b: f32x4, imm8: i32, sae: i32) -> i32;
+ #[link_name = "llvm.x86.avx512.vcomi.sd"]
+ fn vcomisd(a: f64x2, b: f64x2, imm8: i32, sae: i32) -> i32;
+}
+
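+// The tests below compare each intrinsic against hand-computed expected
+// vectors; they only run meaningfully on hosts supporting the enabled
+// target features.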
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+ use crate::hint::black_box;
+ use crate::mem;
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_abs_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let r = _mm512_abs_epi32(a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ 0, 1, 1, i32::MAX,
+ i32::MAX.wrapping_add(1), 100, 100, 32,
+ 0, 1, 1, i32::MAX,
+ i32::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_abs_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let r = _mm512_mask_abs_epi32(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_abs_epi32(a, 0b00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ 0, 1, 1, i32::MAX,
+ i32::MAX.wrapping_add(1), 100, 100, 32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_abs_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let r = _mm512_maskz_abs_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_abs_epi32(0b00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ 0, 1, 1, i32::MAX,
+ i32::MAX.wrapping_add(1), 100, 100, 32,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_abs_epi32() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let r = _mm256_mask_abs_epi32(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_abs_epi32(a, 0b00001111, a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi32(
+ 0, 1, 1, i32::MAX,
+ i32::MAX.wrapping_add(1), 100, -100, -32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_abs_epi32() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let r = _mm256_maskz_abs_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_abs_epi32(0b00001111, a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi32(
+ 0, 1, 1, i32::MAX,
+ 0, 0, 0, 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_abs_epi32() {
+ let a = _mm_setr_epi32(i32::MIN, 100, -100, -32);
+ let r = _mm_mask_abs_epi32(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_abs_epi32(a, 0b00001111, a);
+ let e = _mm_setr_epi32(i32::MAX.wrapping_add(1), 100, 100, 32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_abs_epi32() {
+ let a = _mm_setr_epi32(i32::MIN, 100, -100, -32);
+ let r = _mm_maskz_abs_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_abs_epi32(0b00001111, a);
+ let e = _mm_setr_epi32(i32::MAX.wrapping_add(1), 100, 100, 32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_abs_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let r = _mm512_abs_ps(a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 1., 1., f32::MAX,
+ f32::MAX, 100., 100., 32.,
+ 0., 1., 1., f32::MAX,
+ f32::MAX, 100., 100., 32.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_abs_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let r = _mm512_mask_abs_ps(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_abs_ps(a, 0b00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 1., 1., f32::MAX,
+ f32::MAX, 100., 100., 32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mov_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(2);
+ let r = _mm512_mask_mov_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_mov_epi32(src, 0b11111111_11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mov_epi32() {
+ let a = _mm512_set1_epi32(2);
+ let r = _mm512_maskz_mov_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mov_epi32(0b11111111_11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mov_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(2);
+ let r = _mm256_mask_mov_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_mov_epi32(src, 0b11111111, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mov_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let r = _mm256_maskz_mov_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mov_epi32(0b11111111, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mov_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(2);
+ let r = _mm_mask_mov_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_mov_epi32(src, 0b00001111, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mov_epi32() {
+ let a = _mm_set1_epi32(2);
+ let r = _mm_maskz_mov_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mov_epi32(0b00001111, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mov_ps() {
+ let src = _mm512_set1_ps(1.);
+ let a = _mm512_set1_ps(2.);
+ let r = _mm512_mask_mov_ps(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_mov_ps(src, 0b11111111_11111111, a);
+ assert_eq_m512(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mov_ps() {
+ let a = _mm512_set1_ps(2.);
+ let r = _mm512_maskz_mov_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_mov_ps(0b11111111_11111111, a);
+ assert_eq_m512(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mov_ps() {
+ let src = _mm256_set1_ps(1.);
+ let a = _mm256_set1_ps(2.);
+ let r = _mm256_mask_mov_ps(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm256_mask_mov_ps(src, 0b11111111, a);
+ assert_eq_m256(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mov_ps() {
+ let a = _mm256_set1_ps(2.);
+ let r = _mm256_maskz_mov_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_mov_ps(0b11111111, a);
+ assert_eq_m256(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mov_ps() {
+ let src = _mm_set1_ps(1.);
+ let a = _mm_set1_ps(2.);
+ let r = _mm_mask_mov_ps(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm_mask_mov_ps(src, 0b00001111, a);
+ assert_eq_m128(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mov_ps() {
+ let a = _mm_set1_ps(2.);
+ let r = _mm_maskz_mov_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_mov_ps(0b00001111, a);
+ assert_eq_m128(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_add_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_add_epi32(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ 1, 2, 0, i32::MIN,
+ i32::MIN + 1, 101, -99, -31,
+ 1, 2, 0, i32::MIN,
+ i32::MIN + 1, 101, -99, -31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_add_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_mask_add_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_add_epi32(a, 0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ 1, 2, 0, i32::MIN,
+ i32::MIN + 1, 101, -99, -31,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_add_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_maskz_add_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_add_epi32(0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ 1, 2, 0, i32::MIN,
+ i32::MIN + 1, 101, -99, -31,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_add_epi32() {
+ let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_mask_add_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_add_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_add_epi32() {
+ let a = _mm256_setr_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_add_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_add_epi32(0b11111111, a, b);
+ let e = _mm256_setr_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_add_epi32() {
+ let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_mask_add_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_add_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(2, 0, i32::MIN, i32::MIN + 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_add_epi32() {
+ let a = _mm_setr_epi32(1, -1, i32::MAX, i32::MIN);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_maskz_add_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_add_epi32(0b00001111, a, b);
+ let e = _mm_setr_epi32(2, 0, i32::MIN, i32::MIN + 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_add_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r = _mm512_add_ps(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 1., 2., 0., f32::MAX,
+ f32::MIN + 1., 101., -99., -31.,
+ 1., 2., 0., f32::MAX,
+ f32::MIN + 1., 101., -99., -31.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_add_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r = _mm512_mask_add_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_add_ps(a, 0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 1., 2., 0., f32::MAX,
+ f32::MIN + 1., 101., -99., -31.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_add_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_add_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_add_ps(0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 1., 2., 0., f32::MAX,
+ f32::MIN + 1., 101., -99., -31.,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_add_ps() {
+ let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
+ let b = _mm256_set1_ps(1.);
+ let r = _mm256_mask_add_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_add_ps(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_add_ps() {
+ let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
+ let b = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_add_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_add_ps(0b11111111, a, b);
+ let e = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_add_ps() {
+ let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
+ let b = _mm_set1_ps(1.);
+ let r = _mm_mask_add_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_add_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_add_ps() {
+ let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
+ let b = _mm_set1_ps(1.);
+ let r = _mm_maskz_add_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_add_ps(0b00001111, a, b);
+ let e = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sub_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_sub_epi32(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ -1, 0, -2, i32::MAX - 1,
+ i32::MAX, 99, -101, -33,
+ -1, 0, -2, i32::MAX - 1,
+ i32::MAX, 99, -101, -33,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sub_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_mask_sub_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sub_epi32(a, 0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ -1, 0, -2, i32::MAX - 1,
+ i32::MAX, 99, -101, -33,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sub_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_maskz_sub_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sub_epi32(0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ -1, 0, -2, i32::MAX - 1,
+ i32::MAX, 99, -101, -33,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sub_epi32() {
+ let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_mask_sub_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sub_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sub_epi32() {
+ let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_sub_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sub_epi32(0b11111111, a, b);
+ let e = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sub_epi32() {
+ let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_mask_sub_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sub_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sub_epi32() {
+ let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_maskz_sub_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sub_epi32(0b00001111, a, b);
+ let e = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sub_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r = _mm512_sub_ps(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -1., 0., -2., f32::MAX - 1.,
+ f32::MIN, 99., -101., -33.,
+ -1., 0., -2., f32::MAX - 1.,
+ f32::MIN, 99., -101., -33.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sub_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r = _mm512_mask_sub_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_sub_ps(a, 0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -1., 0., -2., f32::MAX - 1.,
+ f32::MIN, 99., -101., -33.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sub_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_sub_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_sub_ps(0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -1., 0., -2., f32::MAX - 1.,
+ f32::MIN, 99., -101., -33.,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sub_ps() {
+ let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
+ let b = _mm256_set1_ps(1.);
+ let r = _mm256_mask_sub_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_sub_ps(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sub_ps() {
+ let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
+ let b = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_sub_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_sub_ps(0b11111111, a, b);
+ let e = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sub_ps() {
+ let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
+ let b = _mm_set1_ps(1.);
+ let r = _mm_mask_sub_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_sub_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sub_ps() {
+ let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
+ let b = _mm_set1_ps(1.);
+ let r = _mm_maskz_sub_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_sub_ps(0b00001111, a, b);
+ let e = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mullo_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(2);
+ let r = _mm512_mullo_epi32(a, b);
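+ // mullo keeps the low 32 bits of each product, so i32::MAX * 2 wraps to -2 and i32::MIN * 2 to 0.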
+ let e = _mm512_setr_epi32(
+ 0, 2, -2, -2, 0, 200, -200, -64, 0, 2, -2, -2, 0, 200, -200, -64,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mullo_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(2);
+ let r = _mm512_mask_mullo_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_mullo_epi32(a, 0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_epi32(
+ 0, 2, -2, -2,
+ 0, 200, -200, -64,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mullo_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let b = _mm512_set1_epi32(2);
+ let r = _mm512_maskz_mullo_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mullo_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(0, 2, -2, -2, 0, 200, -200, -64, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mullo_epi32() {
+ let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
+ let b = _mm256_set1_epi32(2);
+ let r = _mm256_mask_mullo_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_mullo_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mullo_epi32() {
+ let a = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
+ let b = _mm256_set1_epi32(2);
+ let r = _mm256_maskz_mullo_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mullo_epi32(0b11111111, a, b);
+ let e = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mullo_epi32() {
+ let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
+ let b = _mm_set1_epi32(2);
+ let r = _mm_mask_mullo_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_mullo_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(2, -2, -2, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mullo_epi32() {
+ let a = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
+ let b = _mm_set1_epi32(2);
+ let r = _mm_maskz_mullo_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mullo_epi32(0b00001111, a, b);
+ let e = _mm_set_epi32(2, -2, -2, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mul_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(2.);
+ let r = _mm512_mul_ps(a, b);
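+ // f32::MAX * 2. overflows to +infinity and f32::MIN * 2. to -infinity.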
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 2., -2., f32::INFINITY,
+ f32::NEG_INFINITY, 200., -200., -64.,
+ 0., 2., -2., f32::INFINITY,
+ f32::NEG_INFINITY, 200., -200., -64.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mul_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(2.);
+ let r = _mm512_mask_mul_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_mul_ps(a, 0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 2., -2., f32::INFINITY,
+ f32::NEG_INFINITY, 200., -200., -64.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mul_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ 0., 1., -1., f32::MAX,
+ f32::MIN, 100., -100., -32.,
+ );
+ let b = _mm512_set1_ps(2.);
+ let r = _mm512_maskz_mul_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_mul_ps(0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 2., -2., f32::INFINITY,
+ f32::NEG_INFINITY, 200., -200., -64.,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mul_ps() {
+ let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
+ let b = _mm256_set1_ps(2.);
+ let r = _mm256_mask_mul_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_mul_ps(a, 0b11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_ps(
+ 0., 2., -2., f32::INFINITY,
+ f32::NEG_INFINITY, 200., -200., -64.,
+ );
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mul_ps() {
+ let a = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
+ let b = _mm256_set1_ps(2.);
+ let r = _mm256_maskz_mul_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_mul_ps(0b11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_ps(
+ 0., 2., -2., f32::INFINITY,
+ f32::NEG_INFINITY, 200., -200., -64.,
+ );
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mul_ps() {
+ let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
+ let b = _mm_set1_ps(2.);
+ let r = _mm_mask_mul_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_mul_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mul_ps() {
+ let a = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
+ let b = _mm_set1_ps(2.);
+ let r = _mm_maskz_mul_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_mul_ps(0b00001111, a, b);
+ let e = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_div_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
+ );
+ let r = _mm512_div_ps(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 0.5, -0.5, -1.,
+ 50., f32::INFINITY, -50., -16.,
+ 0., 0.5, -0.5, 500.,
+ f32::NEG_INFINITY, 50., -50., -16.,
+ );
+ assert_eq_m512(r, e); // division by a zero lane yields +/-infinity; 0./0. (which would give NAN) is deliberately avoided
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_div_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
+ );
+ let r = _mm512_mask_div_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_div_ps(a, 0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 0.5, -0.5, -1.,
+ 50., f32::INFINITY, -50., -16.,
+ 0., 1., -1., 1000.,
+ -131., 100., -100., -32.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_div_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
+ );
+ let r = _mm512_maskz_div_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_div_ps(0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 0.5, -0.5, -1.,
+ 50., f32::INFINITY, -50., -16.,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_div_ps() {
+ let a = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.);
+ let b = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.);
+ let r = _mm256_mask_div_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_div_ps(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_div_ps() {
+ let a = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.);
+ let b = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.);
+ let r = _mm256_maskz_div_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_div_ps(0b11111111, a, b);
+ let e = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_div_ps() {
+ let a = _mm_set_ps(100., 100., -100., -32.);
+ let b = _mm_set_ps(2., 0., 2., 2.);
+ let r = _mm_mask_div_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_div_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(50., f32::INFINITY, -50., -16.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_div_ps() {
+ let a = _mm_set_ps(100., 100., -100., -32.);
+ let b = _mm_set_ps(2., 0., 2., 2.);
+ let r = _mm_maskz_div_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_div_ps(0b00001111, a, b);
+ let e = _mm_set_ps(50., f32::INFINITY, -50., -16.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_max_epi32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_max_epi32(a, b);
+ let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_max_epi32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_max_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_max_epi32(a, 0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_max_epi32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_max_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_max_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_max_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_max_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_max_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_max_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_max_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_max_epi32(0b11111111, a, b);
+ let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_max_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let b = _mm_set_epi32(3, 2, 1, 0);
+ let r = _mm_mask_max_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_max_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(3, 2, 2, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_max_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let b = _mm_set_epi32(3, 2, 1, 0);
+ let r = _mm_maskz_max_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_max_epi32(0b00001111, a, b);
+ let e = _mm_set_epi32(3, 2, 2, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_max_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_max_ps(a, b);
+ let e = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_max_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_mask_max_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_max_ps(a, 0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_max_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_maskz_max_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_max_ps(0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_max_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm256_mask_max_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_max_ps(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_max_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm256_maskz_max_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_max_ps(0b11111111, a, b);
+ let e = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_max_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(3., 2., 1., 0.);
+ let r = _mm_mask_max_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_max_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(3., 2., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_max_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(3., 2., 1., 0.);
+ let r = _mm_maskz_max_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_max_ps(0b00001111, a, b);
+ let e = _mm_set_ps(3., 2., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_max_epu32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_max_epu32(a, b);
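+ // epu32 compares lanes as unsigned; with these non-negative inputs the result matches the signed max.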
+ let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_max_epu32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_max_epu32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_max_epu32(a, 0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_max_epu32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_max_epu32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_max_epu32(0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_max_epu32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_max_epu32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_max_epu32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_max_epu32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_max_epu32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_max_epu32(0b11111111, a, b);
+ let e = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_max_epu32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let b = _mm_set_epi32(3, 2, 1, 0);
+ let r = _mm_mask_max_epu32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_max_epu32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(3, 2, 2, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_max_epu32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let b = _mm_set_epi32(3, 2, 1, 0);
+ let r = _mm_maskz_max_epu32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_max_epu32(0b00001111, a, b);
+ let e = _mm_set_epi32(3, 2, 2, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_min_epi32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_min_epi32(a, b);
+ let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_min_epi32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_min_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_min_epi32(a, 0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_min_epi32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_min_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_min_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_min_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_min_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_min_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_min_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_min_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_min_epi32(0b11111111, a, b);
+ let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_min_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let b = _mm_set_epi32(3, 2, 1, 0);
+ let r = _mm_mask_min_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_min_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(0, 1, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_min_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let b = _mm_set_epi32(3, 2, 1, 0);
+ let r = _mm_maskz_min_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_min_epi32(0b00001111, a, b);
+ let e = _mm_set_epi32(0, 1, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_min_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_min_ps(a, b);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_min_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_mask_min_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_min_ps(a, 0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_min_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_maskz_min_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_min_ps(0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_min_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm256_mask_min_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_min_ps(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_min_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm256_maskz_min_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_min_ps(0b11111111, a, b);
+ let e = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_min_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(3., 2., 1., 0.);
+ let r = _mm_mask_min_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_min_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(0., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_min_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(3., 2., 1., 0.);
+ let r = _mm_maskz_min_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_min_ps(0b00001111, a, b);
+ let e = _mm_set_ps(0., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_min_epu32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_min_epu32(a, b);
+ let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_min_epu32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_min_epu32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_min_epu32(a, 0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_min_epu32() {
+ let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_min_epu32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_min_epu32(0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_min_epu32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_mask_min_epu32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_min_epu32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_min_epu32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm256_maskz_min_epu32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_min_epu32(0b11111111, a, b);
+ let e = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_min_epu32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let b = _mm_set_epi32(3, 2, 1, 0);
+ let r = _mm_mask_min_epu32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_min_epu32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(0, 1, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_min_epu32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let b = _mm_set_epi32(3, 2, 1, 0);
+ let r = _mm_maskz_min_epu32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_min_epu32(0b00001111, a, b);
+ let e = _mm_set_epi32(0, 1, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sqrt_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
+ );
+ let r = _mm512_sqrt_ps(a);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sqrt_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
+ );
+ let r = _mm512_mask_sqrt_ps(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_sqrt_ps(a, 0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 64., 81., 100., 121., 144., 169., 196., 225.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sqrt_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
+ );
+ let r = _mm512_maskz_sqrt_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_sqrt_ps(0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sqrt_ps() {
+ let a = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.);
+ let r = _mm256_mask_sqrt_ps(a, 0, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_sqrt_ps(a, 0b11111111, a);
+ let e = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sqrt_ps() {
+ let a = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.);
+ let r = _mm256_maskz_sqrt_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_sqrt_ps(0b11111111, a);
+ let e = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sqrt_ps() {
+ let a = _mm_set_ps(0., 1., 4., 9.);
+ let r = _mm_mask_sqrt_ps(a, 0, a);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_sqrt_ps(a, 0b00001111, a);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sqrt_ps() {
+ let a = _mm_set_ps(0., 1., 4., 9.);
+ let r = _mm_maskz_sqrt_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_sqrt_ps(0b00001111, a);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_fmadd_ps(a, b, c);
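+ // fmadd computes (a * b) + c lane-wise, so lane k holds k + 1.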
+ let e = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask_fmadd_ps(a, 0, b, c);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fmadd_ps(a, 0b00000000_11111111, b, c);
+ let e = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_fmadd_ps(0, a, b, c);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fmadd_ps(0b00000000_11111111, a, b, c);
+ let e = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(2.);
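+ // mask3 variants blend into c: lanes with a clear mask bit keep c's value.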
+ let r = _mm512_mask3_fmadd_ps(a, b, c, 0);
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fmadd_ps(a, b, c, 0b00000000_11111111);
+ let e = _mm512_setr_ps(
+ 2., 3., 4., 5., 6., 7., 8., 9., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fmadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask_fmadd_ps(a, 0, b, c);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_fmadd_ps(a, 0b11111111, b, c);
+ let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fmadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_fmadd_ps(0, a, b, c);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_fmadd_ps(0b11111111, a, b, c);
+ let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fmadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask3_fmadd_ps(a, b, c, 0);
+ assert_eq_m256(r, c);
+ let r = _mm256_mask3_fmadd_ps(a, b, c, 0b11111111);
+ let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fmadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask_fmadd_ps(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmadd_ps(a, 0b00001111, b, c);
+ let e = _mm_set_ps(1., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fmadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_maskz_fmadd_ps(0, a, b, c);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_fmadd_ps(0b00001111, a, b, c);
+ let e = _mm_set_ps(1., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fmadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask3_fmadd_ps(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmadd_ps(a, b, c, 0b00001111);
+ let e = _mm_set_ps(1., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_fmsub_ps(a, b, c);
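+ // fmsub computes (a * b) - c, so lane k holds k - 1.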
+ let e = _mm512_setr_ps(
+ -1., 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask_fmsub_ps(a, 0, b, c);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fmsub_ps(a, 0b00000000_11111111, b, c);
+ let e = _mm512_setr_ps(
+ -1., 0., 1., 2., 3., 4., 5., 6., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_fmsub_ps(0, a, b, c);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fmsub_ps(0b00000000_11111111, a, b, c);
+ let e = _mm512_setr_ps(
+ -1., 0., 1., 2., 3., 4., 5., 6., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ let r = _mm512_mask3_fmsub_ps(a, b, c, 0);
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fmsub_ps(a, b, c, 0b00000000_11111111);
+ let e = _mm512_setr_ps(
+ -1., 0., 1., 2., 3., 4., 5., 6., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fmsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask_fmsub_ps(a, 0, b, c);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_fmsub_ps(a, 0b11111111, b, c);
+ let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fmsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_fmsub_ps(0, a, b, c);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_fmsub_ps(0b11111111, a, b, c);
+ let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fmsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask3_fmsub_ps(a, b, c, 0);
+ assert_eq_m256(r, c);
+ let r = _mm256_mask3_fmsub_ps(a, b, c, 0b11111111);
+ let e = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fmsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask_fmsub_ps(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmsub_ps(a, 0b00001111, b, c);
+ let e = _mm_set_ps(-1., 0., 1., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fmsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_maskz_fmsub_ps(0, a, b, c);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_fmsub_ps(0b00001111, a, b, c);
+ let e = _mm_set_ps(-1., 0., 1., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fmsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask3_fmsub_ps(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmsub_ps(a, b, c, 0b00001111);
+ let e = _mm_set_ps(-1., 0., 1., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmaddsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_fmaddsub_ps(a, b, c);
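+ // fmaddsub alternates: even lanes compute (a * b) - c, odd lanes (a * b) + c.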
+ let e = _mm512_setr_ps(
+ -1., 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmaddsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask_fmaddsub_ps(a, 0, b, c);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fmaddsub_ps(a, 0b00000000_11111111, b, c);
+ let e = _mm512_setr_ps(
+ -1., 2., 1., 4., 3., 6., 5., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmaddsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_fmaddsub_ps(0, a, b, c);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fmaddsub_ps(0b00000000_11111111, a, b, c);
+ let e = _mm512_setr_ps(
+ -1., 2., 1., 4., 3., 6., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmaddsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0);
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0b00000000_11111111);
+ let e = _mm512_setr_ps(
+ -1., 2., 1., 4., 3., 6., 5., 8., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fmaddsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask_fmaddsub_ps(a, 0, b, c);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_fmaddsub_ps(a, 0b11111111, b, c);
+ let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fmaddsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_fmaddsub_ps(0, a, b, c);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_fmaddsub_ps(0b11111111, a, b, c);
+ let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fmaddsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask3_fmaddsub_ps(a, b, c, 0);
+ assert_eq_m256(r, c);
+ let r = _mm256_mask3_fmaddsub_ps(a, b, c, 0b11111111);
+ let e = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fmaddsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask_fmaddsub_ps(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmaddsub_ps(a, 0b00001111, b, c);
+ let e = _mm_set_ps(1., 0., 3., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fmaddsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_maskz_fmaddsub_ps(0, a, b, c);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_fmaddsub_ps(0b00001111, a, b, c);
+ let e = _mm_set_ps(1., 0., 3., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fmaddsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask3_fmaddsub_ps(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmaddsub_ps(a, b, c, 0b00001111);
+ let e = _mm_set_ps(1., 0., 3., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmsubadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_fmsubadd_ps(a, b, c);
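+ // fmsubadd is the mirror: even lanes compute (a * b) + c, odd lanes (a * b) - c.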
+ let e = _mm512_setr_ps(
+ 1., 0., 3., 2., 5., 4., 7., 6., 9., 8., 11., 10., 13., 12., 15., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmsubadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask_fmsubadd_ps(a, 0, b, c);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fmsubadd_ps(a, 0b00000000_11111111, b, c);
+ let e = _mm512_setr_ps(
+ 1., 0., 3., 2., 5., 4., 7., 6., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmsubadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_fmsubadd_ps(0, a, b, c);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fmsubadd_ps(0b00000000_11111111, a, b, c);
+ let e = _mm512_setr_ps(
+ 1., 0., 3., 2., 5., 4., 7., 6., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmsubadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0);
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0b00000000_11111111);
+ let e = _mm512_setr_ps(
+ 1., 0., 3., 2., 5., 4., 7., 6., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fmsubadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask_fmsubadd_ps(a, 0, b, c);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_fmsubadd_ps(a, 0b11111111, b, c);
+ let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fmsubadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_fmsubadd_ps(0, a, b, c);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_fmsubadd_ps(0b11111111, a, b, c);
+ let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fmsubadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask3_fmsubadd_ps(a, b, c, 0);
+ assert_eq_m256(r, c);
+ let r = _mm256_mask3_fmsubadd_ps(a, b, c, 0b11111111);
+ let e = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fmsubadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask_fmsubadd_ps(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmsubadd_ps(a, 0b00001111, b, c);
+ let e = _mm_set_ps(-1., 2., 1., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fmsubadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_maskz_fmsubadd_ps(0, a, b, c);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_fmsubadd_ps(0b00001111, a, b, c);
+ let e = _mm_set_ps(-1., 2., 1., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fmsubadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask3_fmsubadd_ps(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmsubadd_ps(a, b, c, 0b00001111);
+ let e = _mm_set_ps(-1., 2., 1., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fnmadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_fnmadd_ps(a, b, c);
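+ // fnmadd computes -(a * b) + c, so lane k holds 1 - k.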
+ let e = _mm512_setr_ps(
+ 1., 0., -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fnmadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask_fnmadd_ps(a, 0, b, c);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fnmadd_ps(a, 0b00000000_11111111, b, c);
+ let e = _mm512_setr_ps(
+ 1., 0., -1., -2., -3., -4., -5., -6., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fnmadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_fnmadd_ps(0, a, b, c);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fnmadd_ps(0b00000000_11111111, a, b, c);
+ let e = _mm512_setr_ps(
+ 1., 0., -1., -2., -3., -4., -5., -6., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fnmadd_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ let r = _mm512_mask3_fnmadd_ps(a, b, c, 0);
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fnmadd_ps(a, b, c, 0b00000000_11111111);
+ let e = _mm512_setr_ps(
+ 1., 0., -1., -2., -3., -4., -5., -6., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fnmadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask_fnmadd_ps(a, 0, b, c);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_fnmadd_ps(a, 0b11111111, b, c);
+ let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fnmadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_fnmadd_ps(0, a, b, c);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_fnmadd_ps(0b11111111, a, b, c);
+ let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fnmadd_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask3_fnmadd_ps(a, b, c, 0);
+ assert_eq_m256(r, c);
+ let r = _mm256_mask3_fnmadd_ps(a, b, c, 0b11111111);
+ let e = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fnmadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask_fnmadd_ps(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fnmadd_ps(a, 0b00001111, b, c);
+ let e = _mm_set_ps(1., 0., -1., -2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fnmadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_maskz_fnmadd_ps(0, a, b, c);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_fnmadd_ps(0b00001111, a, b, c);
+ let e = _mm_set_ps(1., 0., -1., -2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fnmadd_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask3_fnmadd_ps(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fnmadd_ps(a, b, c, 0b00001111);
+ let e = _mm_set_ps(1., 0., -1., -2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fnmsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
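+ // fnmsub computes -(a * b) - c, so with a = 1 and c = 1 each lane is -b - 1.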
+ let r = _mm512_fnmsub_ps(a, b, c);
+ let e = _mm512_setr_ps(
+ -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., -15., -16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fnmsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask_fnmsub_ps(a, 0, b, c);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fnmsub_ps(a, 0b00000000_11111111, b, c);
+ let e = _mm512_setr_ps(
+ -1., -2., -3., -4., -5., -6., -7., -8., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fnmsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_fnmsub_ps(0, a, b, c);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fnmsub_ps(0b00000000_11111111, a, b, c);
+ let e = _mm512_setr_ps(
+ -1., -2., -3., -4., -5., -6., -7., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fnmsub_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let c = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ let r = _mm512_mask3_fnmsub_ps(a, b, c, 0);
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fnmsub_ps(a, b, c, 0b00000000_11111111);
+ let e = _mm512_setr_ps(
+ -1., -2., -3., -4., -5., -6., -7., -8., 2., 2., 2., 2., 2., 2., 2., 2.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fnmsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask_fnmsub_ps(a, 0, b, c);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_fnmsub_ps(a, 0b11111111, b, c);
+ let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fnmsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_fnmsub_ps(0, a, b, c);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_fnmsub_ps(0b11111111, a, b, c);
+ let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fnmsub_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm256_set1_ps(1.);
+ let r = _mm256_mask3_fnmsub_ps(a, b, c, 0);
+ assert_eq_m256(r, c);
+ let r = _mm256_mask3_fnmsub_ps(a, b, c, 0b11111111);
+ let e = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fnmsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask_fnmsub_ps(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fnmsub_ps(a, 0b00001111, b, c);
+ let e = _mm_set_ps(-1., -2., -3., -4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fnmsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_maskz_fnmsub_ps(0, a, b, c);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_fnmsub_ps(0b00001111, a, b, c);
+ let e = _mm_set_ps(-1., -2., -3., -4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fnmsub_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set_ps(0., 1., 2., 3.);
+ let c = _mm_set1_ps(1.);
+ let r = _mm_mask3_fnmsub_ps(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fnmsub_ps(a, b, c, 0b00001111);
+ let e = _mm_set_ps(-1., -2., -3., -4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rcp14_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_rcp14_ps(a);
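+ // rcp14 approximates 1/x with relative error at most 2^-14, hence
+ // 0.33333206 rather than the correctly rounded 0.33333334.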
+ let e = _mm512_set1_ps(0.33333206);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rcp14_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_mask_rcp14_ps(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_rcp14_ps(a, 0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 3., 3., 3., 3., 3., 3., 3., 3., 0.33333206, 0.33333206, 0.33333206, 0.33333206,
+ 0.33333206, 0.33333206, 0.33333206, 0.33333206,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rcp14_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_maskz_rcp14_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_rcp14_ps(0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 0.33333206, 0.33333206, 0.33333206, 0.33333206,
+ 0.33333206, 0.33333206, 0.33333206, 0.33333206,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_rcp14_ps() {
+ let a = _mm256_set1_ps(3.);
+ let r = _mm256_rcp14_ps(a);
+ let e = _mm256_set1_ps(0.33333206);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rcp14_ps() {
+ let a = _mm256_set1_ps(3.);
+ let r = _mm256_mask_rcp14_ps(a, 0, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_rcp14_ps(a, 0b11111111, a);
+ let e = _mm256_set1_ps(0.33333206);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rcp14_ps() {
+ let a = _mm256_set1_ps(3.);
+ let r = _mm256_maskz_rcp14_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_rcp14_ps(0b11111111, a);
+ let e = _mm256_set1_ps(0.33333206);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_rcp14_ps() {
+ let a = _mm_set1_ps(3.);
+ let r = _mm_rcp14_ps(a);
+ let e = _mm_set1_ps(0.33333206);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rcp14_ps() {
+ let a = _mm_set1_ps(3.);
+ let r = _mm_mask_rcp14_ps(a, 0, a);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_rcp14_ps(a, 0b00001111, a);
+ let e = _mm_set1_ps(0.33333206);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rcp14_ps() {
+ let a = _mm_set1_ps(3.);
+ let r = _mm_maskz_rcp14_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_rcp14_ps(0b00001111, a);
+ let e = _mm_set1_ps(0.33333206);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rsqrt14_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_rsqrt14_ps(a);
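+ // rsqrt14 approximates 1/sqrt(x) with relative error at most 2^-14, hence
+ // 0.5773392 rather than the correctly rounded 0.57735026.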
+ let e = _mm512_set1_ps(0.5773392);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rsqrt14_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_mask_rsqrt14_ps(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_rsqrt14_ps(a, 0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 3., 3., 3., 3., 3., 3., 3., 3., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392,
+ 0.5773392, 0.5773392, 0.5773392,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rsqrt14_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_maskz_rsqrt14_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_rsqrt14_ps(0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392,
+ 0.5773392, 0.5773392, 0.5773392,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rsqrt14_ps() {
+ let a = _mm256_set1_ps(3.);
+ let r = _mm256_mask_rsqrt14_ps(a, 0, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_rsqrt14_ps(a, 0b11111111, a);
+ let e = _mm256_set1_ps(0.5773392);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rsqrt14_ps() {
+ let a = _mm256_set1_ps(3.);
+ let r = _mm256_maskz_rsqrt14_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_rsqrt14_ps(0b11111111, a);
+ let e = _mm256_set1_ps(0.5773392);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rsqrt14_ps() {
+ let a = _mm_set1_ps(3.);
+ let r = _mm_mask_rsqrt14_ps(a, 0, a);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_rsqrt14_ps(a, 0b00001111, a);
+ let e = _mm_set1_ps(0.5773392);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rsqrt14_ps() {
+ let a = _mm_set1_ps(3.);
+ let r = _mm_maskz_rsqrt14_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_rsqrt14_ps(0b00001111, a);
+ let e = _mm_set1_ps(0.5773392);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_getexp_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_getexp_ps(a);
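+ // getexp returns each element's exponent as a float, i.e. floor(log2(|x|));
+ // for 3.0 that is 1.0.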
+ let e = _mm512_set1_ps(1.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_getexp_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_mask_getexp_ps(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_getexp_ps(a, 0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_getexp_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_maskz_getexp_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_getexp_ps(0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_getexp_ps() {
+ let a = _mm256_set1_ps(3.);
+ let r = _mm256_getexp_ps(a);
+ let e = _mm256_set1_ps(1.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_getexp_ps() {
+ let a = _mm256_set1_ps(3.);
+ let r = _mm256_mask_getexp_ps(a, 0, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_getexp_ps(a, 0b11111111, a);
+ let e = _mm256_set1_ps(1.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_getexp_ps() {
+ let a = _mm256_set1_ps(3.);
+ let r = _mm256_maskz_getexp_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_getexp_ps(0b11111111, a);
+ let e = _mm256_set1_ps(1.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_getexp_ps() {
+ let a = _mm_set1_ps(3.);
+ let r = _mm_getexp_ps(a);
+ let e = _mm_set1_ps(1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_getexp_ps() {
+ let a = _mm_set1_ps(3.);
+ let r = _mm_mask_getexp_ps(a, 0, a);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_getexp_ps(a, 0b00001111, a);
+ let e = _mm_set1_ps(1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_getexp_ps() {
+ let a = _mm_set1_ps(3.);
+ let r = _mm_maskz_getexp_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_getexp_ps(0b00001111, a);
+ let e = _mm_set1_ps(1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_roundscale_ps() {
+ let a = _mm512_set1_ps(1.1);
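+ // imm8 = 0 keeps M = 0 fraction bits and rounds to nearest even,
+ // so 1.1 rounds to 1.0.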
+ let r = _mm512_roundscale_ps::<0b00_00_00_00>(a);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_roundscale_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a);
+ let e = _mm512_set1_ps(1.1);
+ assert_eq_m512(r, e);
+ let r = _mm512_mask_roundscale_ps::<0b00_00_00_00>(a, 0b11111111_11111111, a);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_roundscale_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111_11111111, a);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_roundscale_ps() {
+ let a = _mm256_set1_ps(1.1);
+ let r = _mm256_roundscale_ps::<0b00_00_00_00>(a);
+ let e = _mm256_set1_ps(1.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_roundscale_ps() {
+ let a = _mm256_set1_ps(1.1);
+ let r = _mm256_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a);
+ let e = _mm256_set1_ps(1.1);
+ assert_eq_m256(r, e);
+ let r = _mm256_mask_roundscale_ps::<0b00_00_00_00>(a, 0b11111111, a);
+ let e = _mm256_set1_ps(1.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_roundscale_ps() {
+ let a = _mm256_set1_ps(1.1);
+ let r = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111, a);
+ let e = _mm256_set1_ps(1.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_roundscale_ps() {
+ let a = _mm_set1_ps(1.1);
+ let r = _mm_roundscale_ps::<0b00_00_00_00>(a);
+ let e = _mm_set1_ps(1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_roundscale_ps() {
+ let a = _mm_set1_ps(1.1);
+ let r = _mm_mask_roundscale_ps::<0b00_00_00_00>(a, 0, a);
+ let e = _mm_set1_ps(1.1);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_roundscale_ps::<0b00_00_00_00>(a, 0b00001111, a);
+ let e = _mm_set1_ps(1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_roundscale_ps() {
+ let a = _mm_set1_ps(1.1);
+ let r = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0b00001111, a);
+ let e = _mm_set1_ps(1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_scalef_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
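+ // scalef computes a * 2^floor(b): 1.0 * 2^3 = 8.0.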
+ let r = _mm512_scalef_ps(a, b);
+ let e = _mm512_set1_ps(8.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_scalef_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_mask_scalef_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_scalef_ps(a, 0b11111111_00000000, a, b);
+ let e = _mm512_set_ps(
+ 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_scalef_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_maskz_scalef_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_scalef_ps(0b11111111_00000000, a, b);
+ let e = _mm512_set_ps(
+ 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_scalef_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set1_ps(3.);
+ let r = _mm256_scalef_ps(a, b);
+ let e = _mm256_set1_ps(8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_scalef_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set1_ps(3.);
+ let r = _mm256_mask_scalef_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_scalef_ps(a, 0b11111111, a, b);
+ let e = _mm256_set1_ps(8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_scalef_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set1_ps(3.);
+ let r = _mm256_maskz_scalef_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_scalef_ps(0b11111111, a, b);
+ let e = _mm256_set1_ps(8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_scalef_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_scalef_ps(a, b);
+ let e = _mm_set1_ps(8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_scalef_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_scalef_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_scalef_ps(a, 0b00001111, a, b);
+ let e = _mm_set1_ps(8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_scalef_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_maskz_scalef_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_scalef_ps(0b00001111, a, b);
+ let e = _mm_set1_ps(8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fixupimm_ps() {
+ let a = _mm512_set1_ps(f32::NAN);
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_fixupimm_ps::<5>(a, b, c);
+ let e = _mm512_set1_ps(0.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fixupimm_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN,
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN,
+ 1., 1., 1., 1.,
+ 1., 1., 1., 1.,
+ );
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_mask_fixupimm_ps::<5>(a, 0b11111111_00000000, b, c);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fixupimm_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN,
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN,
+ 1., 1., 1., 1.,
+ 1., 1., 1., 1.,
+ );
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_maskz_fixupimm_ps::<5>(0b11111111_00000000, a, b, c);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_fixupimm_ps() {
+ let a = _mm256_set1_ps(f32::NAN);
+ let b = _mm256_set1_ps(f32::MAX);
+ let c = _mm256_set1_epi32(i32::MAX);
+ let r = _mm256_fixupimm_ps::<5>(a, b, c);
+ let e = _mm256_set1_ps(0.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fixupimm_ps() {
+ let a = _mm256_set1_ps(f32::NAN);
+ let b = _mm256_set1_ps(f32::MAX);
+ let c = _mm256_set1_epi32(i32::MAX);
+ let r = _mm256_mask_fixupimm_ps::<5>(a, 0b11111111, b, c);
+ let e = _mm256_set1_ps(0.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fixupimm_ps() {
+ let a = _mm256_set1_ps(f32::NAN);
+ let b = _mm256_set1_ps(f32::MAX);
+ let c = _mm256_set1_epi32(i32::MAX);
+ let r = _mm256_maskz_fixupimm_ps::<5>(0b11111111, a, b, c);
+ let e = _mm256_set1_ps(0.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_fixupimm_ps() {
+ let a = _mm_set1_ps(f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_fixupimm_ps::<5>(a, b, c);
+ let e = _mm_set1_ps(0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fixupimm_ps() {
+ let a = _mm_set1_ps(f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_mask_fixupimm_ps::<5>(a, 0b00001111, b, c);
+ let e = _mm_set1_ps(0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fixupimm_ps() {
+ let a = _mm_set1_ps(f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_maskz_fixupimm_ps::<5>(0b00001111, a, b, c);
+ let e = _mm_set1_ps(0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_ternarylogic_epi32() {
+ let a = _mm512_set1_epi32(1 << 2);
+ let b = _mm512_set1_epi32(1 << 1);
+ let c = _mm512_set1_epi32(1 << 0);
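+ // imm8 = 8 (0b00001000) sets only truth-table entry 0b011 (!a & b & c);
+ // since b and c never have a bit position set in common, every lane is 0.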
+ let r = _mm512_ternarylogic_epi32::<8>(a, b, c);
+ let e = _mm512_set1_epi32(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_ternarylogic_epi32() {
+ let src = _mm512_set1_epi32(1 << 2);
+ let a = _mm512_set1_epi32(1 << 1);
+ let b = _mm512_set1_epi32(1 << 0);
+ let r = _mm512_mask_ternarylogic_epi32::<8>(src, 0, a, b);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_ternarylogic_epi32::<8>(src, 0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_ternarylogic_epi32() {
+ let a = _mm512_set1_epi32(1 << 2);
+ let b = _mm512_set1_epi32(1 << 1);
+ let c = _mm512_set1_epi32(1 << 0);
+ let r = _mm512_maskz_ternarylogic_epi32::<8>(0, a, b, c);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_ternarylogic_epi32::<8>(0b11111111_11111111, a, b, c);
+ let e = _mm512_set1_epi32(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_ternarylogic_epi32() {
+ let a = _mm256_set1_epi32(1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let c = _mm256_set1_epi32(1 << 0);
+ let r = _mm256_ternarylogic_epi32::<8>(a, b, c);
+ let e = _mm256_set1_epi32(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_ternarylogic_epi32() {
+ let src = _mm256_set1_epi32(1 << 2);
+ let a = _mm256_set1_epi32(1 << 1);
+ let b = _mm256_set1_epi32(1 << 0);
+ let r = _mm256_mask_ternarylogic_epi32::<8>(src, 0, a, b);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_ternarylogic_epi32::<8>(src, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_ternarylogic_epi32() {
+ let a = _mm256_set1_epi32(1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let c = _mm256_set1_epi32(1 << 0);
+ let r = _mm256_maskz_ternarylogic_epi32::<8>(0, a, b, c);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_ternarylogic_epi32::<8>(0b11111111, a, b, c);
+ let e = _mm256_set1_epi32(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_ternarylogic_epi32() {
+ let a = _mm_set1_epi32(1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let c = _mm_set1_epi32(1 << 0);
+ let r = _mm_ternarylogic_epi32::<8>(a, b, c);
+ let e = _mm_set1_epi32(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_ternarylogic_epi32() {
+ let src = _mm_set1_epi32(1 << 2);
+ let a = _mm_set1_epi32(1 << 1);
+ let b = _mm_set1_epi32(1 << 0);
+ let r = _mm_mask_ternarylogic_epi32::<8>(src, 0, a, b);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_ternarylogic_epi32::<8>(src, 0b00001111, a, b);
+ let e = _mm_set1_epi32(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_ternarylogic_epi32() {
+ let a = _mm_set1_epi32(1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let c = _mm_set1_epi32(1 << 0);
+ let r = _mm_maskz_ternarylogic_epi32::<8>(0, a, b, c);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_ternarylogic_epi32::<8>(0b00001111, a, b, c);
+ let e = _mm_set1_epi32(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_getmant_ps() {
+ let a = _mm512_set1_ps(10.);
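+ // _MM_MANT_NORM_P75_1P5 normalizes the mantissa into [0.75, 1.5):
+ // 10.0 = 1.25 * 2^3, so each lane becomes 1.25.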
+ let r = _mm512_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
+ let e = _mm512_set1_ps(1.25);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_getmant_ps() {
+ let a = _mm512_set1_ps(10.);
+ let r = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(
+ a,
+ 0b11111111_00000000,
+ a,
+ );
+ let e = _mm512_setr_ps(
+ 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_getmant_ps() {
+ let a = _mm512_set1_ps(10.);
+ let r = _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r =
+ _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_getmant_ps() {
+ let a = _mm256_set1_ps(10.);
+ let r = _mm256_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
+ let e = _mm256_set1_ps(1.25);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_getmant_ps() {
+ let a = _mm256_set1_ps(10.);
+ let r = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a);
+ let e = _mm256_set1_ps(1.25);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_getmant_ps() {
+ let a = _mm256_set1_ps(10.);
+ let r = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a);
+ let e = _mm256_set1_ps(1.25);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_getmant_ps() {
+ let a = _mm_set1_ps(10.);
+ let r = _mm_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
+ let e = _mm_set1_ps(1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_getmant_ps() {
+ let a = _mm_set1_ps(10.);
+ let r = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b00001111, a);
+ let e = _mm_set1_ps(1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_getmant_ps() {
+ let a = _mm_set1_ps(10.);
+ let r = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00001111, a);
+ let e = _mm_set1_ps(1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_add_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
+ );
+ let b = _mm512_set1_ps(-1.);
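+ // only the 0.00000007 lane is inexact: 0.00000007 - 1 rounds to -0.99999994
+ // under round-to-nearest but truncates to -0.9999999 under round-toward-zero.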
+ let r = _mm512_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -1., 0.5, 1., 2.5,
+ 3., 4.5, 5., 6.5,
+ 7., 8.5, 9., 10.5,
+ 11., 12.5, 13., -0.99999994,
+ );
+ assert_eq_m512(r, e);
+ let r = _mm512_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_setr_ps(
+ -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_add_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
+ );
+ let b = _mm512_set1_ps(-1.);
+ let r = _mm512_mask_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5,
+ 4., 5.5, 6., 7.5,
+ 7., 8.5, 9., 10.5,
+ 11., 12.5, 13., -0.99999994,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_add_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
+ );
+ let b = _mm512_set1_ps(-1.);
+ let r = _mm512_maskz_add_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_add_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ 7., 8.5, 9., 10.5,
+ 11., 12.5, 13., -0.99999994,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sub_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r = _mm512_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -1., 0.5, 1., 2.5,
+ 3., 4.5, 5., 6.5,
+ 7., 8.5, 9., 10.5,
+ 11., 12.5, 13., -0.99999994,
+ );
+ assert_eq_m512(r, e);
+ let r = _mm512_sub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_setr_ps(
+ -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sub_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r = _mm512_mask_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5,
+ 4., 5.5, 6., 7.5,
+ 7., 8.5, 9., 10.5,
+ 11., 12.5, 13., -0.99999994,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sub_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
+ );
+ let b = _mm512_set1_ps(1.);
+ let r =
+ _mm512_maskz_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_sub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ 7., 8.5, 9., 10.5,
+ 11., 12.5, 13., -0.99999994,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mul_round_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5,
+ 4., 5.5, 6., 7.5,
+ 8., 9.5, 10., 11.5,
+ 12., 13.5, 14., 0.00000000000000000000007,
+ );
+ let b = _mm512_set1_ps(0.1);
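+ // b = 0.1 is inexact in binary, so most products differ between
+ // round-to-nearest and round-toward-zero, e.g. 1.5 * 0.1 -> 0.15 vs 0.14999999.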
+ let r = _mm512_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 0.15, 0.2, 0.35,
+ 0.4, 0.55, 0.6, 0.75,
+ 0.8, 0.95, 1.0, 1.15,
+ 1.2, 1.35, 1.4, 0.000000000000000000000007000001,
+ );
+ assert_eq_m512(r, e);
+ let r = _mm512_mul_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 0.14999999, 0.2, 0.35,
+ 0.4, 0.54999995, 0.59999996, 0.75,
+ 0.8, 0.95, 1.0, 1.15,
+ 1.1999999, 1.3499999, 1.4, 0.000000000000000000000007,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mul_round_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5,
+ 4., 5.5, 6., 7.5,
+ 8., 9.5, 10., 11.5,
+ 12., 13.5, 14., 0.00000000000000000000007,
+ );
+ let b = _mm512_set1_ps(0.1);
+ let r = _mm512_mask_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5,
+ 4., 5.5, 6., 7.5,
+ 0.8, 0.95, 1.0, 1.15,
+ 1.2, 1.35, 1.4, 0.000000000000000000000007000001,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mul_round_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_setr_ps(
+ 0., 1.5, 2., 3.5,
+ 4., 5.5, 6., 7.5,
+ 8., 9.5, 10., 11.5,
+ 12., 13.5, 14., 0.00000000000000000000007,
+ );
+ let b = _mm512_set1_ps(0.1);
+ let r =
+ _mm512_maskz_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_mul_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ 0.8, 0.95, 1.0, 1.15,
+ 1.2, 1.35, 1.4, 0.000000000000000000000007000001,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_div_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
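+ // 1/3 rounds to 0.33333334 under round-to-nearest and truncates to
+ // 0.3333333 under round-toward-zero.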
+ let r = _mm512_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_set1_ps(0.33333334);
+ assert_eq_m512(r, e);
+ let r = _mm512_div_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_set1_ps(0.3333333);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_div_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_mask_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ let e = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 0.33333334, 0.33333334, 0.33333334, 0.33333334,
+ 0.33333334, 0.33333334, 0.33333334, 0.33333334,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_div_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r =
+ _mm512_maskz_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_div_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 0.33333334, 0.33333334, 0.33333334, 0.33333334,
+ 0.33333334, 0.33333334, 0.33333334, 0.33333334,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sqrt_round_ps() {
+ let a = _mm512_set1_ps(3.);
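+ // sqrt(3) is 1.7320508 rounded to nearest and 1.7320509 rounded
+ // toward positive infinity.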
+ let r = _mm512_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_set1_ps(1.7320508);
+ assert_eq_m512(r, e);
+ let r = _mm512_sqrt_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_set1_ps(1.7320509);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sqrt_round_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r =
+ _mm512_mask_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b11111111_00000000,
+ a,
+ );
+ let e = _mm512_setr_ps(
+ 3., 3., 3., 3., 3., 3., 3., 3., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508,
+ 1.7320508, 1.7320508, 1.7320508,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sqrt_round_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r =
+ _mm512_maskz_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_sqrt_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111_00000000,
+ a,
+ );
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508,
+ 1.7320508, 1.7320508, 1.7320508,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_ps(-0.99999994);
+ assert_eq_m512(r, e);
+ let r = _mm512_fmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_ps(-0.9999999);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b00000000_11111111,
+ b,
+ c,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_maskz_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ #[rustfmt::skip]
+ let r = _mm512_maskz_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ b,
+ c,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_mask3_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ b,
+ c,
+ 0b00000000_11111111,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ -1., -1., -1., -1.,
+ -1., -1., -1., -1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_ps(-0.99999994);
+ assert_eq_m512(r, e);
+ let r = _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_ps(-0.9999999);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b00000000_11111111,
+ b,
+ c,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ b,
+ c,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask3_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ b,
+ c,
+ 0b00000000_11111111,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ -0.99999994, -0.99999994, -0.99999994, -0.99999994,
+ 1., 1., 1., 1.,
+ 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmaddsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
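+ // fmaddsub subtracts c in even lanes and adds c in odd lanes; with c = -1
+ // that gives 0.00000007 + 1 = 1.0000001 and 0.00000007 - 1 = -0.99999994.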
+ let r =
+ _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ );
+ assert_eq_m512(r, e);
+ let r = _mm512_fmaddsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_setr_ps(
+ 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1.,
+ -0.9999999, 1., -0.9999999, 1., -0.9999999,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmaddsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_mask_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b00000000_11111111,
+ b,
+ c,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmaddsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_maskz_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ b,
+ c,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmaddsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_mask3_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fmaddsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ b,
+ c,
+ 0b00000000_11111111,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ 1.0000001, -0.99999994, 1.0000001, -0.99999994,
+ -1., -1., -1., -1.,
+ -1., -1., -1., -1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmsubadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
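+ // fmsubadd is the mirror image: even lanes get a * b + c and odd lanes
+ // a * b - c, giving -0.99999994 and 1.0000001 respectively.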
+ let r =
+ _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ );
+ assert_eq_m512(r, e);
+ let r = _mm512_fmsubadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_setr_ps(
+ -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1.,
+ -0.9999999, 1., -0.9999999, 1., -0.9999999, 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmsubadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_mask_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b00000000_11111111,
+ b,
+ c,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmsubadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_maskz_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ b,
+ c,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmsubadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_mask3_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fmsubadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ b,
+ c,
+ 0b00000000_11111111,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ -0.99999994, 1.0000001, -0.99999994, 1.0000001,
+ -1., -1., -1., -1.,
+ -1., -1., -1., -1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fnmadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(1.);
+ let r =
+ _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_ps(0.99999994);
+ assert_eq_m512(r, e);
+ let r = _mm512_fnmadd_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_ps(0.9999999);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fnmadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b00000000_11111111,
+ b,
+ c,
+ );
+ let e = _mm512_setr_ps(
+ 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
+ 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ 0.00000007, 0.00000007,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fnmadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ b,
+ c,
+ );
+ let e = _mm512_setr_ps(
+ 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
+ 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fnmadd_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(1.);
+ let r = _mm512_mask3_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fnmadd_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ b,
+ c,
+ 0b00000000_11111111,
+ );
+ let e = _mm512_setr_ps(
+ 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
+ 0.99999994, 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fnmsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r =
+ _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_ps(0.99999994);
+ assert_eq_m512(r, e);
+ let r = _mm512_fnmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_ps(0.9999999);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fnmsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_mask_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b00000000_11111111,
+ b,
+ c,
+ );
+ let e = _mm512_setr_ps(
+ 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
+ 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007,
+ 0.00000007, 0.00000007,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fnmsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_maskz_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ b,
+ c,
+ );
+ let e = _mm512_setr_ps(
+ 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
+ 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fnmsub_round_ps() {
+ let a = _mm512_set1_ps(0.00000007);
+ let b = _mm512_set1_ps(1.);
+ let c = _mm512_set1_ps(-1.);
+ let r = _mm512_mask3_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512(r, c);
+ let r = _mm512_mask3_fnmsub_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ b,
+ c,
+ 0b00000000_11111111,
+ );
+ let e = _mm512_setr_ps(
+ 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
+ 0.99999994, -1., -1., -1., -1., -1., -1., -1., -1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
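+ // max/min never round, so the const on the _round variants only controls
+ // exception suppression; _MM_FROUND_CUR_DIRECTION leaves it unchanged.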
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_max_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_max_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_max_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_min_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_min_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_min_round_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ );
+ let r = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
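+ // getexp returns floor(log2(|x|)) as a float, so getexp(3.0) == 1.0.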
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_getexp_round_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e = _mm512_set1_ps(1.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_getexp_round_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_getexp_round_ps() {
+ let a = _mm512_set1_ps(3.);
+ let r = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
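+ // roundscale rounds to 2^-M fractional bits, where M is the upper nibble of
+ // the first const; with 0 it rounds to whole numbers, so 1.1 becomes 1.0.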
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_roundscale_round_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(a);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_roundscale_round_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a);
+ let e = _mm512_set1_ps(1.1);
+ assert_eq_m512(r, e);
+ let r = _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(
+ a,
+ 0b11111111_11111111,
+ a,
+ );
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_roundscale_round_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r =
+ _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111_11111111, a);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
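+ // scalef computes a * 2^floor(b): 1.0 * 2^3 == 8.0.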
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_scalef_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_set1_ps(8.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_scalef_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_mask_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a,
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ let e = _mm512_set_ps(
+ 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_scalef_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_maskz_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111_00000000,
+ a,
+ b,
+ );
+ let e = _mm512_set_ps(
+ 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
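+ // fixupimm classifies each element and replaces it according to the lookup
+ // table passed in c; the table used here maps the NaN inputs to +0.0.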
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fixupimm_round_ps() {
+ let a = _mm512_set1_ps(f32::NAN);
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c);
+ let e = _mm512_set1_ps(0.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fixupimm_round_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN,
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN,
+ 1., 1., 1., 1.,
+ 1., 1., 1., 1.,
+ );
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_mask_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(
+ a,
+ 0b11111111_00000000,
+ b,
+ c,
+ );
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fixupimm_round_ps() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN,
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN,
+ 1., 1., 1., 1.,
+ 1., 1., 1., 1.,
+ );
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_maskz_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(
+ 0b11111111_00000000,
+ a,
+ b,
+ c,
+ );
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
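+ // getmant extracts the mantissa normalized into [1, 2) (per the NORM_1_2
+ // const): 10.0 == 1.25 * 2^3, so every lane yields 1.25.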
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_getmant_round_ps() {
+ let a = _mm512_set1_ps(10.);
+ let r = _mm512_getmant_round_ps::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a);
+ let e = _mm512_set1_ps(1.25);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_getmant_round_ps() {
+ let a = _mm512_set1_ps(10.);
+ let r = _mm512_mask_getmant_round_ps::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_getmant_round_ps::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a, 0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_getmant_round_ps() {
+ let a = _mm512_set1_ps(10.);
+ let r = _mm512_maskz_getmant_round_ps::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_getmant_round_ps::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(0b11111111_00000000, a);
+ let e = _mm512_setr_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
+ );
+ assert_eq_m512(r, e);
+ }
+
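+ // cvtps_epi32 uses the current rounding mode (round-to-nearest-even by
+ // default): -1.4 -> -1, while ties such as -3.5 and 9.5 go to the even
+ // integer (-4 and 10).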
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_cvtps_epi32(a);
+ let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let src = _mm512_set1_epi32(0);
+ let r = _mm512_mask_cvtps_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtps_epi32(src, 0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_maskz_cvtps_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtps_epi32(0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtps_epi32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm256_mask_cvtps_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtps_epi32(src, 0b11111111, a);
+ let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtps_epi32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let r = _mm256_maskz_cvtps_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtps_epi32(0b11111111, a);
+ let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtps_epi32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvtps_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtps_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(12, 14, 14, 16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtps_epi32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let r = _mm_maskz_cvtps_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtps_epi32(0b00001111, a);
+ let e = _mm_set_epi32(12, 14, 14, 16);
+ assert_eq_m128i(r, e);
+ }
+
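+ // For the unsigned conversion the negative inputs are out of range and
+ // produce the integer-indefinite value 0xFFFF_FFFF, which the signed epi32
+ // helpers print as -1.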
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_cvtps_epu32(a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let src = _mm512_set1_epi32(0);
+ let r = _mm512_mask_cvtps_epu32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtps_epu32(src, 0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_maskz_cvtps_epu32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtps_epu32(0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtps_epu32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let r = _mm256_cvtps_epu32(a);
+ let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtps_epu32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm256_mask_cvtps_epu32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtps_epu32(src, 0b11111111, a);
+ let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtps_epu32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let r = _mm256_maskz_cvtps_epu32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtps_epu32(0b11111111, a);
+ let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtps_epu32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let r = _mm_cvtps_epu32(a);
+ let e = _mm_set_epi32(12, 14, 14, 16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtps_epu32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvtps_epu32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtps_epu32(src, 0b00001111, a);
+ let e = _mm_set_epi32(12, 14, 14, 16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtps_epu32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let r = _mm_maskz_cvtps_epu32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtps_epu32(0b00001111, a);
+ let e = _mm_set_epi32(12, 14, 14, 16);
+ assert_eq_m128i(r, e);
+ }
+
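+ // The widening conversions below differ only in extension: cvtepi8/epi16
+ // sign-extend while cvtepu8/epu16 zero-extend. For the non-negative inputs
+ // used here the results coincide; masked-off lanes keep src (mask) or 0 (maskz).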
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi8_epi32(a);
+ let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi32(-1);
+ let r = _mm512_mask_cvtepi8_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a);
+ let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi8_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi32(-1);
+ let r = _mm256_mask_cvtepi8_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepi8_epi32(src, 0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepi8_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepi8_epi32(0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi32(-1);
+ let r = _mm_mask_cvtepi8_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi8_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_cvtepi8_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi8_epi32(0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepu8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepu8_epi32(a);
+ let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepu8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi32(-1);
+ let r = _mm512_mask_cvtepu8_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a);
+ let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepu8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepu8_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepu8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi32(-1);
+ let r = _mm256_mask_cvtepu8_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepu8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepu8_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepu8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi32(-1);
+ let r = _mm_mask_cvtepu8_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepu8_epi32() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_cvtepu8_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepu8_epi32(0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi16_epi32() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi16_epi32(a);
+ let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi16_epi32() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi32(-1);
+ let r = _mm512_mask_cvtepi16_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a);
+ let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi16_epi32() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi16_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi16_epi32() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let src = _mm256_set1_epi32(-1);
+ let r = _mm256_mask_cvtepi16_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepi16_epi32(src, 0b11111111, a);
+ let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi16_epi32() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_maskz_cvtepi16_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepi16_epi32(0b11111111, a);
+ let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi16_epi32() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let src = _mm_set1_epi32(-1);
+ let r = _mm_mask_cvtepi16_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi16_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi16_epi32() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_maskz_cvtepi16_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi16_epi32(0b00001111, a);
+ let e = _mm_set_epi32(4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepu16_epi32() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepu16_epi32(a);
+ let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepu16_epi32() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi32(-1);
+ let r = _mm512_mask_cvtepu16_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a);
+ let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepu16_epi32() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepu16_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepu16_epi32() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi32(-1);
+ let r = _mm256_mask_cvtepu16_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepu16_epi32() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepu16_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepu16_epi32() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi32(-1);
+ let r = _mm_mask_cvtepu16_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepu16_epi32() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_cvtepu16_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepu16_epi32(0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi32_ps() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi32_ps(a);
+ let e = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi32_ps() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_ps(-1.);
+ let r = _mm512_mask_cvtepi32_ps(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a);
+ let e = _mm512_set_ps(
+ -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi32_ps() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi32_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi32_ps() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let src = _mm256_set1_ps(-1.);
+ let r = _mm256_mask_cvtepi32_ps(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm256_mask_cvtepi32_ps(src, 0b11111111, a);
+ let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi32_ps() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_maskz_cvtepi32_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_cvtepi32_ps(0b11111111, a);
+ let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi32_ps() {
+ let a = _mm_set_epi32(1, 2, 3, 4);
+ let src = _mm_set1_ps(-1.);
+ let r = _mm_mask_cvtepi32_ps(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm_mask_cvtepi32_ps(src, 0b00001111, a);
+ let e = _mm_set_ps(1., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi32_ps() {
+ let a = _mm_set_epi32(1, 2, 3, 4);
+ let r = _mm_maskz_cvtepi32_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_cvtepi32_ps(0b00001111, a);
+ let e = _mm_set_ps(1., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepu32_ps() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepu32_ps(a);
+ let e = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepu32_ps() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_ps(-1.);
+ let r = _mm512_mask_cvtepu32_ps(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a);
+ let e = _mm512_set_ps(
+ -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepu32_ps() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepu32_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
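+ // cvtepi32_epi16/epi8 truncate each lane to its low bits; when the packed
+ // result does not fill the 128-bit destination, the upper lanes are zeroed.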
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi32_epi16() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi32_epi16(a);
+ let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi32_epi16() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi16(-1);
+ let r = _mm512_mask_cvtepi32_epi16(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a);
+ let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi32_epi16() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi32_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtepi32_epi16() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_cvtepi32_epi16(a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi32_epi16() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let src = _mm_set1_epi16(-1);
+ let r = _mm256_mask_cvtepi32_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtepi32_epi16(src, 0b11111111, a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi32_epi16() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_maskz_cvtepi32_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtepi32_epi16(0b11111111, a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtepi32_epi16() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let r = _mm_cvtepi32_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi32_epi16() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let src = _mm_set1_epi16(0);
+ let r = _mm_mask_cvtepi32_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi32_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi32_epi16() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let r = _mm_maskz_cvtepi32_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi32_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi32_epi8() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi32_epi8(a);
+ let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi32_epi8() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi8(-1);
+ let r = _mm512_mask_cvtepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a);
+ let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi32_epi8() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtepi32_epi8() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_cvtepi32_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi32_epi8() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let src = _mm_set1_epi8(0);
+ let r = _mm256_mask_cvtepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtepi32_epi8(src, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi32_epi8() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_maskz_cvtepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtepi32_epi8(0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtepi32_epi8() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let r = _mm_cvtepi32_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi32_epi8() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let src = _mm_set1_epi8(0);
+ let r = _mm_mask_cvtepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi32_epi8(src, 0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi32_epi8() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let r = _mm_maskz_cvtepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi32_epi8(0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
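+ // The cvtsepi32 variants narrow with signed saturation: i32::MIN clamps to
+ // i16::MIN/i8::MIN and i32::MAX to i16::MAX/i8::MAX.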
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtsepi32_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MAX,
+ );
+ let r = _mm512_cvtsepi32_epi16(a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i16::MIN, i16::MAX,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi32_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MAX,
+ );
+ let src = _mm256_set1_epi16(-1);
+ let r = _mm512_mask_cvtsepi32_epi16(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 8, 9, 10, 11,
+ 12, 13, i16::MIN, i16::MAX,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtsepi32_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MAX,
+ );
+ let r = _mm512_maskz_cvtsepi32_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi16(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 8, 9, 10, 11,
+ 12, 13, i16::MIN, i16::MAX,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtsepi32_epi16() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_cvtsepi32_epi16(a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi32_epi16() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let src = _mm_set1_epi16(-1);
+ let r = _mm256_mask_cvtsepi32_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtsepi32_epi16() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_maskz_cvtsepi32_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtsepi32_epi16(0b11111111, a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtsepi32_epi16() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let r = _mm_cvtsepi32_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi32_epi16() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let src = _mm_set1_epi16(0);
+ let r = _mm_mask_cvtsepi32_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtsepi32_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtsepi32_epi16() {
+ let a = _mm_set_epi32(4, 5, 6, 7);
+ let r = _mm_maskz_cvtsepi32_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtsepi32_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtsepi32_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MAX,
+ );
+ let r = _mm512_cvtsepi32_epi8(a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i8::MIN, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi32_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MAX,
+ );
+ let src = _mm_set1_epi8(-1);
+ let r = _mm512_mask_cvtsepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 8, 9, 10, 11,
+ 12, 13, i8::MIN, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtsepi32_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MAX,
+ );
+ let r = _mm512_maskz_cvtsepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 8, 9, 10, 11,
+ 12, 13, i8::MIN, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtsepi32_epi8() {
+ let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm256_cvtsepi32_epi8(a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi32_epi8() {
+ let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16);
+ let src = _mm_set1_epi8(0);
+ let r = _mm256_mask_cvtsepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtsepi32_epi8() {
+ let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm256_maskz_cvtsepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtsepi32_epi8(0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtsepi32_epi8() {
+ let a = _mm_set_epi32(13, 14, 15, 16);
+ let r = _mm_cvtsepi32_epi8(a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 13, 14, 15, 16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi32_epi8() {
+ let a = _mm_set_epi32(13, 14, 15, 16);
+ let src = _mm_set1_epi8(0);
+ let r = _mm_mask_cvtsepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 13, 14, 15, 16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtsepi32_epi8() {
+ let a = _mm_set_epi32(13, 14, 15, 16);
+ let r = _mm_maskz_cvtsepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 13, 14, 15, 16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
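+ // The cvtusepi32 variants saturate as unsigned: i32::MIN reinterpreted as
+ // u32 (0x8000_0000) exceeds u16::MAX, so it clamps to 0xFFFF, printed as -1.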
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtusepi32_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MIN,
+ );
+ let r = _mm512_cvtusepi32_epi16(a);
+ let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi32_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MIN,
+ );
+ let src = _mm256_set1_epi16(-1);
+ let r = _mm512_mask_cvtusepi32_epi16(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtusepi32_epi16(src, 0b00000000_11111111, a);
+ let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtusepi32_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MIN,
+ );
+ let r = _mm512_maskz_cvtusepi32_epi16(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtusepi32_epi16() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_cvtusepi32_epi16(a);
+ let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi32_epi16() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let src = _mm_set1_epi16(0);
+ let r = _mm256_mask_cvtusepi32_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a);
+ let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtusepi32_epi16() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_maskz_cvtusepi32_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a);
+ let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtusepi32_epi16() {
+ let a = _mm_set_epi32(5, 6, 7, 8);
+ let r = _mm_cvtusepi32_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi32_epi16() {
+ let a = _mm_set_epi32(5, 6, 7, 8);
+ let src = _mm_set1_epi16(0);
+ let r = _mm_mask_cvtusepi32_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtusepi32_epi16() {
+ let a = _mm_set_epi32(5, 6, 7, 8);
+ let r = _mm_maskz_cvtusepi32_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtusepi32_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MIN,
+ );
+ let r = _mm512_cvtusepi32_epi8(a);
+ let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi32_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MIN,
+ );
+ let src = _mm_set1_epi8(-1);
+ let r = _mm512_mask_cvtusepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a);
+ let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtusepi32_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, i32::MIN, i32::MIN,
+ );
+ let r = _mm512_maskz_cvtusepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtusepi32_epi8() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX);
+ let r = _mm256_cvtusepi32_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi32_epi8() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX);
+ let src = _mm_set1_epi8(0);
+ let r = _mm256_mask_cvtusepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtusepi32_epi8() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX);
+ let r = _mm256_maskz_cvtusepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtusepi32_epi8() {
+ let a = _mm_set_epi32(5, 6, 7, i32::MAX);
+ let r = _mm_cvtusepi32_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi32_epi8() {
+ let a = _mm_set_epi32(5, 6, 7, i32::MAX);
+ let src = _mm_set1_epi8(0);
+ let r = _mm_mask_cvtusepi32_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtusepi32_epi8() {
+ let a = _mm_set_epi32(5, 6, 7, i32::MAX);
+ let r = _mm_maskz_cvtusepi32_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
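+ // The cvt_round conversions take an explicit rounding mode: TO_NEAREST_INT
+ // turns -1.5 into -2 and 9.5 into 10, while TO_NEG_INF floors them to -2 and 9.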
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m512i(r, e);
+ let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let src = _mm512_set1_epi32(0);
+ let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src, 0, a,
+ );
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src,
+ 0b00000000_11111111,
+ a,
+ );
+ let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a,
+ );
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ );
+ let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16);
+ assert_eq_m512i(r, e);
+ let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let src = _mm512_set1_epi32(0);
+ let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src, 0, a,
+ );
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src,
+ 0b00000000_11111111,
+ a,
+ );
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a,
+ );
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ );
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundepi32_ps() {
+ let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
+ let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_setr_ps(
+ 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundepi32_ps() {
+ let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
+ let src = _mm512_set1_ps(0.);
+ let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src, 0, a,
+ );
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src,
+ 0b00000000_11111111,
+ a,
+ );
+ let e = _mm512_setr_ps(
+ 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundepi32_ps() {
+ let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
+ let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ );
+ let e = _mm512_setr_ps(
+ 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
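+ // -2 reinterpreted as u32 is 4294967294, which rounds to 2^32 = 4294967296.0
+ // in f32; the literal 4294967300. denotes that same f32 value.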
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundepu32_ps() {
+ let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
+ let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 4294967300., 2., 4294967300.,
+ 4., 4294967300., 6., 4294967300.,
+ 8., 10., 10., 12.,
+ 12., 14., 14., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundepu32_ps() {
+ let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
+ let src = _mm512_set1_ps(0.);
+ let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src, 0, a,
+ );
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src,
+ 0b00000000_11111111,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 4294967300., 2., 4294967300.,
+ 4., 4294967300., 6., 4294967300.,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundepu32_ps() {
+ let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
+ let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a,
+ );
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00000000_11111111,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_setr_ps(
+ 0., 4294967300., 2., 4294967300.,
+ 4., 4294967300., 6., 4294967300.,
+ 0., 0., 0., 0.,
+ 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundps_ph() {
+ let a = _mm512_set1_ps(1.);
+ let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a);
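+ // 4323521613979991040 == 0x3C003C003C003C00: four f16 1.0 values (0x3C00) packed per i64 lane.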
+ let e = _mm256_setr_epi64x(
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundps_ph() {
+ let a = _mm512_set1_ps(1.);
+ let src = _mm256_set1_epi16(0);
+ let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a);
+ let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundps_ph() {
+ let a = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a);
+ let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvt_roundps_ph() {
+ let a = _mm256_set1_ps(1.);
+ let src = _mm_set1_epi16(0);
+ let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a);
+ let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvt_roundps_ph() {
+ let a = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a);
+ let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvt_roundps_ph() {
+ let a = _mm_set1_ps(1.);
+ let src = _mm_set1_epi16(0);
+ let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a);
+ let e = _mm_setr_epi64x(4323521613979991040, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvt_roundps_ph() {
+ let a = _mm_set1_ps(1.);
+ let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a);
+ let e = _mm_setr_epi64x(4323521613979991040, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtps_ph() {
+ let a = _mm512_set1_ps(1.);
+ let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a);
+ let e = _mm256_setr_epi64x(
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtps_ph() {
+ let a = _mm512_set1_ps(1.);
+ let src = _mm256_set1_epi16(0);
+ let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a);
+ let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtps_ph() {
+ let a = _mm512_set1_ps(1.);
+ let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a);
+ let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtps_ph() {
+ let a = _mm256_set1_ps(1.);
+ let src = _mm_set1_epi16(0);
+ let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a);
+ let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtps_ph() {
+ let a = _mm256_set1_ps(1.);
+ let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a);
+ let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtps_ph() {
+ let a = _mm_set1_ps(1.);
+ let src = _mm_set1_epi16(0);
+ let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a);
+ let e = _mm_setr_epi64x(4323521613979991040, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtps_ph() {
+ let a = _mm_set1_ps(1.);
+ let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a);
+ let e = _mm_setr_epi64x(4323521613979991040, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundph_ps() {
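+ // Each i64 lane holds four packed f16 1.0 values (0x3C00), i.e. sixteen half-precision 1.0 inputs.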
+ let a = _mm256_setr_epi64x(
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ );
+ let r = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a);
+ let e = _mm512_set1_ps(1.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundph_ps() {
+ let a = _mm256_setr_epi64x(
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ );
+ let src = _mm512_set1_ps(0.);
+ let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundph_ps() {
+ let a = _mm256_setr_epi64x(
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ );
+ let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtph_ps() {
+ let a = _mm256_setr_epi64x(
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ );
+ let r = _mm512_cvtph_ps(a);
+ let e = _mm512_set1_ps(1.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtph_ps() {
+ let a = _mm256_setr_epi64x(
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ );
+ let src = _mm512_set1_ps(0.);
+ let r = _mm512_mask_cvtph_ps(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtph_ps() {
+ let a = _mm256_setr_epi64x(
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ 4323521613979991040,
+ );
+ let r = _mm512_maskz_cvtph_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtph_ps() {
+ let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
+ let src = _mm256_set1_ps(0.);
+ let r = _mm256_mask_cvtph_ps(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm256_mask_cvtph_ps(src, 0b11111111, a);
+ let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtph_ps() {
+ let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
+ let r = _mm256_maskz_cvtph_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_cvtph_ps(0b11111111, a);
+ let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtph_ps() {
+ let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
+ let src = _mm_set1_ps(0.);
+ let r = _mm_mask_cvtph_ps(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm_mask_cvtph_ps(src, 0b00001111, a);
+ let e = _mm_setr_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtph_ps() {
+ let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
+ let r = _mm_maskz_cvtph_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_cvtph_ps(0b00001111, a);
+ let e = _mm_setr_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtt_roundps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a);
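+ // cvtt truncates toward zero: -1.5 -> -1, 9.5 -> 9.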
+ let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtt_roundps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let src = _mm512_set1_epi32(0);
+ let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtt_roundps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtt_roundps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a);
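+ // Truncation toward zero; negative inputs are out of range for u32 and produce u32::MAX (-1 as i32).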
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtt_roundps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let src = _mm512_set1_epi32(0);
+ let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtt_roundps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvttps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_cvttps_epi32(a);
+ let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvttps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let src = _mm512_set1_epi32(0);
+ let r = _mm512_mask_cvttps_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvttps_epi32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_maskz_cvttps_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvttps_epi32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm256_mask_cvttps_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvttps_epi32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let r = _mm256_maskz_cvttps_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvttps_epi32(0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvttps_epi32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvttps_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvttps_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvttps_epi32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let r = _mm_maskz_cvttps_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvttps_epi32(0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvttps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_cvttps_epu32(a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvttps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let src = _mm512_set1_epi32(0);
+ let r = _mm512_mask_cvttps_epu32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvttps_epu32() {
+ let a = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
+ );
+ let r = _mm512_maskz_cvttps_epu32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvttps_epu32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let r = _mm256_cvttps_epu32(a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvttps_epu32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm256_mask_cvttps_epu32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvttps_epu32() {
+ let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
+ let r = _mm256_maskz_cvttps_epu32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvttps_epu32(0b11111111, a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvttps_epu32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let r = _mm_cvttps_epu32(a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvttps_epu32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvttps_epu32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvttps_epu32(src, 0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvttps_epu32() {
+ let a = _mm_set_ps(12., 13.5, 14., 15.5);
+ let r = _mm_maskz_cvttps_epu32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvttps_epu32(0b00001111, a);
+ let e = _mm_set_epi32(12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32gather_ps() {
+ let mut arr = [0f32; 256];
+ for i in 0..256 {
+ arr[i] = i as f32;
+ }
+ // A scale (multiplier) of 4 means each index addresses a 4-byte element
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176);
+ let r = _mm512_i32gather_ps::<4>(index, arr.as_ptr() as *const u8);
+ #[rustfmt::skip]
+ assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112.,
+ 120., 128., 136., 144., 152., 160., 168., 176.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32gather_ps() {
+ let mut arr = [0f32; 256];
+ for i in 0..256 {
+ arr[i] = i as f32;
+ }
+ let src = _mm512_set1_ps(2.);
+ let mask = 0b10101010_10101010;
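+ // Odd mask bits are set: even lanes keep src (2.), odd lanes take the gathered value.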
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176);
+ // A scale (multiplier) of 4 means each index addresses a 4-byte element
+ let r = _mm512_mask_i32gather_ps::<4>(src, mask, index, arr.as_ptr() as *const u8);
+ #[rustfmt::skip]
+ assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112.,
+ 2., 128., 2., 144., 2., 160., 2., 176.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32gather_epi32() {
+ let mut arr = [0i32; 256];
+ for i in 0..256 {
+ arr[i] = i as i32;
+ }
+ // A scale (multiplier) of 4 means each index addresses a 4-byte element
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176);
+ let r = _mm512_i32gather_epi32::<4>(index, arr.as_ptr() as *const u8);
+ #[rustfmt::skip]
+ assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32gather_epi32() {
+ let mut arr = [0i32; 256];
+ for i in 0..256 {
+ arr[i] = i as i32;
+ }
+ let src = _mm512_set1_epi32(2);
+ let mask = 0b10101010_10101010;
+ let index = _mm512_setr_epi32(
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ );
+ // A scale (multiplier) of 4 means each index addresses a 4-byte element
+ let r = _mm512_mask_i32gather_epi32::<4>(src, mask, index, arr.as_ptr() as *const u8);
+ assert_eq_m512i(
+ r,
+ _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240),
+ );
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32scatter_ps() {
+ let mut arr = [0f32; 256];
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 240);
+ let src = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ // A scale (multiplier) of 4 means each index addresses a 4-byte element
+ _mm512_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, index, src);
+ let mut expected = [0f32; 256];
+ for i in 0..16 {
+ expected[i * 16] = (i + 1) as f32;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32scatter_ps() {
+ let mut arr = [0f32; 256];
+ let mask = 0b10101010_10101010;
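+ // Only lanes with a set mask bit are stored; even-lane destinations stay 0.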
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 240);
+ let src = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ // A scale (multiplier) of 4 means each index addresses a 4-byte element
+ _mm512_mask_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src);
+ let mut expected = [0f32; 256];
+ for i in 0..8 {
+ expected[i * 32 + 16] = 2. * (i + 1) as f32;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32scatter_epi32() {
+ let mut arr = [0i32; 256];
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 240);
+ let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ // A scale (multiplier) of 4 means each index addresses a 4-byte element
+ _mm512_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, index, src);
+ let mut expected = [0i32; 256];
+ for i in 0..16 {
+ expected[i * 16] = (i + 1) as i32;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32scatter_epi32() {
+ let mut arr = [0i32; 256];
+ let mask = 0b10101010_10101010;
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 240);
+ let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ // A scale (multiplier) of 4 means each index addresses a 4-byte element
+ _mm512_mask_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src);
+ let mut expected = [0i32; 256];
+ for i in 0..8 {
+ expected[i * 32 + 16] = 2 * (i + 1) as i32;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmplt_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
+ 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
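+ // _mm512_set_ps lists lanes high to low, so mask bit 0 corresponds to the last argument; NaN lanes compare false for LT.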
+ let m = _mm512_cmplt_ps_mask(a, b);
+ assert_eq!(m, 0b00000101_00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmplt_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
+ 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let mask = 0b01100110_01100110;
+ let r = _mm512_mask_cmplt_ps_mask(mask, a, b);
+ assert_eq!(r, 0b00000100_00000100);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpnlt_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
+ 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpnlt_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
+ 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let mask = 0b01111010_01111010;
+ assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpnle_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
+ 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let m = _mm512_cmpnle_ps_mask(b, a);
+ assert_eq!(m, 0b00001101_00001101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpnle_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
+ 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let mask = 0b01100110_01100110;
+ let r = _mm512_mask_cmpnle_ps_mask(mask, b, a);
+ assert_eq!(r, 0b00000100_00000100);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmple_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
+ 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmple_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
+ 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let mask = 0b01111010_01111010;
+ assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpeq_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
+ 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
+ #[rustfmt::skip]
+ let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
+ 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
+ let m = _mm512_cmpeq_ps_mask(b, a);
+ assert_eq!(m, 0b11001101_11001101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpeq_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
+ 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
+ #[rustfmt::skip]
+ let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
+ 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmpeq_ps_mask(mask, b, a);
+ assert_eq!(r, 0b01001000_01001000);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpneq_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
+ 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
+ #[rustfmt::skip]
+ let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
+ 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
+ let m = _mm512_cmpneq_ps_mask(b, a);
+ assert_eq!(m, 0b00110010_00110010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpneq_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
+ 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
+ #[rustfmt::skip]
+ let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
+ 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmpneq_ps_mask(mask, b, a);
+ assert_eq!(r, 0b00110010_00110010)
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmp_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
+ 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let m = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(a, b);
+ assert_eq!(m, 0b00000101_00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmp_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
+ 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let mask = 0b01100110_01100110;
+ let r = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b);
+ assert_eq!(r, 0b00000100_00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmp_ps_mask() {
+ let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
+ let b = _mm256_set1_ps(-1.);
+ let m = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(a, b);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_ps_mask() {
+ let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
+ let b = _mm256_set1_ps(-1.);
+ let mask = 0b01100110;
+ let r = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b);
+ assert_eq!(r, 0b00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmp_ps_mask() {
+ let a = _mm_set_ps(0., 1., -1., 13.);
+ let b = _mm_set1_ps(1.);
+ let m = _mm_cmp_ps_mask::<_CMP_LT_OQ>(a, b);
+ assert_eq!(m, 0b00001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmp_ps_mask() {
+ let a = _mm_set_ps(0., 1., -1., 13.);
+ let b = _mm_set1_ps(1.);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b);
+ assert_eq!(r, 0b00001010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmp_round_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
+ 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let m = _mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b);
+ assert_eq!(m, 0b00000101_00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmp_round_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
+ 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
+ let b = _mm512_set1_ps(-1.);
+ let mask = 0b01100110_01100110;
+ let r = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b);
+ assert_eq!(r, 0b00000100_00000100);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpord_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
+ f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
+ #[rustfmt::skip]
+ let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
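+ // A lane is "ordered" only when neither input is NaN.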
+ let m = _mm512_cmpord_ps_mask(a, b);
+ assert_eq!(m, 0b00000101_00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpord_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
+ f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
+ #[rustfmt::skip]
+ let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
+ let mask = 0b11000011_11000011;
+ let m = _mm512_mask_cmpord_ps_mask(mask, a, b);
+ assert_eq!(m, 0b00000001_00000001);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpunord_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
+ f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
+ #[rustfmt::skip]
+ let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
+ let m = _mm512_cmpunord_ps_mask(a, b);
+ assert_eq!(m, 0b11111010_11111010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpunord_ps_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
+ f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
+ #[rustfmt::skip]
+ let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
+ f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
+ let mask = 0b00001111_00001111;
+ let m = _mm512_mask_cmpunord_ps_mask(mask, a, b);
+ assert_eq!(m, 0b00001010_00001010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cmp_ss_mask() {
+ let a = _mm_setr_ps(2., 1., 1., 1.);
+ let b = _mm_setr_ps(1., 2., 2., 2.);
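+ // Scalar ss comparisons use lane 0 only, returning the result in mask bit 0.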
+ let m = _mm_cmp_ss_mask::<_CMP_GE_OS>(a, b);
+ assert_eq!(m, 1);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_cmp_ss_mask() {
+ let a = _mm_setr_ps(2., 1., 1., 1.);
+ let b = _mm_setr_ps(1., 2., 2., 2.);
+ let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, a, b);
+ assert_eq!(m, 0);
+ let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, a, b);
+ assert_eq!(m, 1);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cmp_round_ss_mask() {
+ let a = _mm_setr_ps(2., 1., 1., 1.);
+ let b = _mm_setr_ps(1., 2., 2., 2.);
+ let m = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b);
+ assert_eq!(m, 1);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_cmp_round_ss_mask() {
+ let a = _mm_setr_ps(2., 1., 1., 1.);
+ let b = _mm_setr_ps(1., 2., 2., 2.);
+ let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b);
+ assert_eq!(m, 0);
+ let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b);
+ assert_eq!(m, 1);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cmp_sd_mask() {
+ let a = _mm_setr_pd(2., 1.);
+ let b = _mm_setr_pd(1., 2.);
+ let m = _mm_cmp_sd_mask::<_CMP_GE_OS>(a, b);
+ assert_eq!(m, 1);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_cmp_sd_mask() {
+ let a = _mm_setr_pd(2., 1.);
+ let b = _mm_setr_pd(1., 2.);
+ let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, a, b);
+ assert_eq!(m, 0);
+ let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, a, b);
+ assert_eq!(m, 1);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cmp_round_sd_mask() {
+ let a = _mm_setr_pd(2., 1.);
+ let b = _mm_setr_pd(1., 2.);
+ let m = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b);
+ assert_eq!(m, 1);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_cmp_round_sd_mask() {
+ let a = _mm_setr_pd(2., 1.);
+ let b = _mm_setr_pd(1., 2.);
+ let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b);
+ assert_eq!(m, 0);
+ let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b);
+ assert_eq!(m, 1);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmplt_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
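+ // -1 reinterprets as u32::MAX, so every lane except the all-ones lanes is unsigned-less-than b.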
+ let m = _mm512_cmplt_epu32_mask(a, b);
+ assert_eq!(m, 0b11001111_11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmplt_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmplt_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b01001010_01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmplt_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_cmplt_epu32_mask(a, b);
+ assert_eq!(r, 0b10000000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmplt_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99);
+ let b = _mm256_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmplt_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b10000000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmplt_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_cmplt_epu32_mask(a, b);
+ assert_eq!(r, 0b00001000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmplt_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
+ let b = _mm_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmplt_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b00001000);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpgt_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let m = _mm512_cmpgt_epu32_mask(b, a);
+ assert_eq!(m, 0b11001111_11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpgt_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a);
+ assert_eq!(r, 0b01001010_01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpgt_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_cmpgt_epu32_mask(a, b);
+ assert_eq!(r, 0b00111111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpgt_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101);
+ let b = _mm256_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpgt_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b00111111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpgt_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_cmpgt_epu32_mask(a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpgt_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
+ let b = _mm_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpgt_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmple_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ assert_eq!(
+ _mm512_cmple_epu32_mask(a, b),
+ !_mm512_cmpgt_epu32_mask(a, b)
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmple_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01111010_01111010;
+ assert_eq!(
+ _mm512_mask_cmple_epu32_mask(mask, a, b),
+ 0b01111010_01111010
+ );
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmple_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_cmple_epu32_mask(a, b);
+ assert_eq!(r, 0b11000000)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmple_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101);
+ let b = _mm256_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmple_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b11000000)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmple_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_cmple_epu32_mask(a, b);
+ assert_eq!(r, 0b00001100)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmple_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
+ let b = _mm_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmple_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b00001100)
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpge_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ assert_eq!(
+ _mm512_cmpge_epu32_mask(a, b),
+ !_mm512_cmplt_epu32_mask(a, b)
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpge_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01111010_01111010;
+ assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b00110000_00110000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpge_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_cmpge_epu32_mask(a, b);
+ assert_eq!(r, 0b01111111)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpge_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200);
+ let b = _mm256_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpge_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b01111111)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpge_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_cmpge_epu32_mask(a, b);
+ assert_eq!(r, 0b00000111)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpge_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
+ let b = _mm_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpge_epu32_mask(mask, a, b);
+ assert_eq!(r, 0b00000111)
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpeq_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let m = _mm512_cmpeq_epu32_mask(b, a);
+ assert_eq!(m, 0b11001111_11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpeq_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a);
+ assert_eq!(r, 0b01001010_01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpeq_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let m = _mm256_cmpeq_epu32_mask(b, a);
+ assert_eq!(m, 0b11001111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpeq_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let mask = 0b01111010;
+ let r = _mm256_mask_cmpeq_epu32_mask(mask, b, a);
+ assert_eq!(r, 0b01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpeq_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
+ let b = _mm_set_epi32(0, 1, 13, 42);
+ let m = _mm_cmpeq_epu32_mask(b, a);
+ assert_eq!(m, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpeq_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
+ let b = _mm_set_epi32(0, 1, 13, 42);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpeq_epu32_mask(mask, b, a);
+ assert_eq!(r, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpneq_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let m = _mm512_cmpneq_epu32_mask(b, a);
+ assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpneq_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a);
+ assert_eq!(r, 0b00110010_00110010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpneq_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100);
+ let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100);
+ let r = _mm256_cmpneq_epu32_mask(b, a);
+ assert_eq!(r, 0b00110000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpneq_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100);
+ let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpneq_epu32_mask(mask, b, a);
+ assert_eq!(r, 0b00110000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpneq_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
+ let b = _mm_set_epi32(0, 1, 13, 42);
+ let r = _mm_cmpneq_epu32_mask(b, a);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpneq_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
+ let b = _mm_set_epi32(0, 1, 13, 42);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpneq_epu32_mask(mask, b, a);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmp_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let m = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11001111_11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmp_epu32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01001010_01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmp_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let m = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11001111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_epu32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b11001111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmp_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, i32::MAX);
+ let b = _mm_set1_epi32(1);
+ let m = _mm_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00001000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmp_epu32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, i32::MAX);
+ let b = _mm_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00001000);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmplt_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let m = _mm512_cmplt_epi32_mask(a, b);
+ assert_eq!(m, 0b00000101_00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmplt_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01100110_01100110;
+ let r = _mm512_mask_cmplt_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b00000100_00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmplt_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let r = _mm256_cmplt_epi32_mask(a, b);
+ assert_eq!(r, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmplt_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmplt_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmplt_epi32_mask() {
+ let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100);
+ let b = _mm_set1_epi32(-1);
+ let r = _mm_cmplt_epi32_mask(a, b);
+ assert_eq!(r, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmplt_epi32_mask() {
+ let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100);
+ let b = _mm_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmplt_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpgt_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let m = _mm512_cmpgt_epi32_mask(b, a);
+ assert_eq!(m, 0b00000101_00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpgt_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01100110_01100110;
+ let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a);
+ assert_eq!(r, 0b00000100_00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpgt_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let r = _mm256_cmpgt_epi32_mask(a, b);
+ assert_eq!(r, 0b11011010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpgt_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpgt_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b11011010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpgt_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 13);
+ let b = _mm_set1_epi32(-1);
+ let r = _mm_cmpgt_epi32_mask(a, b);
+ assert_eq!(r, 0b00001101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpgt_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 13);
+ let b = _mm_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpgt_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b00001101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmple_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ assert_eq!(
+ _mm512_cmple_epi32_mask(a, b),
+ !_mm512_cmpgt_epi32_mask(a, b)
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmple_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01111010_01111010;
+ assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b00110000_00110000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmple_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let r = _mm256_cmple_epi32_mask(a, b);
+ assert_eq!(r, 0b00100101)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmple_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmple_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b00100101)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmple_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 200);
+ let b = _mm_set1_epi32(-1);
+ let r = _mm_cmple_epi32_mask(a, b);
+ assert_eq!(r, 0b00000010)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmple_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 200);
+ let b = _mm_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmple_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b00000010)
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpge_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ assert_eq!(
+ _mm512_cmpge_epi32_mask(a, b),
+ !_mm512_cmplt_epi32_mask(a, b)
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpge_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01111010_01111010;
+ assert_eq!(
+ _mm512_mask_cmpge_epi32_mask(mask, a, b),
+ 0b01111010_01111010
+ );
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpge_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let r = _mm256_cmpge_epi32_mask(a, b);
+ assert_eq!(r, 0b11111010)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpge_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpge_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b11111010)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpge_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
+ let b = _mm_set1_epi32(-1);
+ let r = _mm_cmpge_epi32_mask(a, b);
+ assert_eq!(r, 0b00001111)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpge_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
+ let b = _mm_set1_epi32(-1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpge_epi32_mask(mask, a, b);
+ assert_eq!(r, 0b00001111)
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpeq_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let m = _mm512_cmpeq_epi32_mask(b, a);
+ assert_eq!(m, 0b11001111_11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpeq_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a);
+ assert_eq!(r, 0b01001010_01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpeq_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let m = _mm256_cmpeq_epi32_mask(b, a);
+ assert_eq!(m, 0b11001111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpeq_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let mask = 0b01111010;
+ let r = _mm256_mask_cmpeq_epi32_mask(mask, b, a);
+ assert_eq!(r, 0b01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpeq_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 13);
+ let b = _mm_set_epi32(0, 1, 13, 42);
+ let m = _mm_cmpeq_epi32_mask(b, a);
+ assert_eq!(m, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpeq_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 13);
+ let b = _mm_set_epi32(0, 1, 13, 42);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpeq_epi32_mask(mask, b, a);
+ assert_eq!(r, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpneq_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let m = _mm512_cmpneq_epi32_mask(b, a);
+ assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpneq_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100,
+ 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100);
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let mask = 0b01111010_01111010;
+ let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a);
+ assert_eq!(r, 0b00110010_00110010)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpneq_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let m = _mm256_cmpneq_epi32_mask(b, a);
+ assert_eq!(m, !_mm256_cmpeq_epi32_mask(b, a));
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpneq_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100);
+ let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpneq_epi32_mask(mask, b, a);
+ assert_eq!(r, 0b00110011)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpneq_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 13);
+ let b = _mm_set_epi32(0, 1, 13, 42);
+ let r = _mm_cmpneq_epi32_mask(b, a);
+ assert_eq!(r, 0b00000011)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpneq_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 13);
+ let b = _mm_set_epi32(0, 1, 13, 42);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpneq_epi32_mask(mask, b, a);
+ assert_eq!(r, 0b00000011)
+ }
+
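+ // The generic `cmp_epi32_mask` family takes an `_MM_CMPINT_*` predicate as a
+ // const generic; `_MM_CMPINT_LT` selects signed less-than.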
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmp_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let m = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00000101_00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmp_epi32_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
+ 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm512_set1_epi32(-1);
+ let mask = 0b01100110_01100110;
+ let r = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00000100_00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmp_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let m = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_epi32_mask() {
+ let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
+ let b = _mm256_set1_epi32(-1);
+ let mask = 0b01100110;
+ let r = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmp_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 13);
+ let b = _mm_set1_epi32(1);
+ let m = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmp_epi32_mask() {
+ let a = _mm_set_epi32(0, 1, -1, 13);
+ let b = _mm_set1_epi32(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00001010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set_epi8() {
+ let r = _mm512_set1_epi8(2);
+ assert_eq_m512i(
+ r,
+ _mm512_set_epi8(
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ ),
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set_epi16() {
+ let r = _mm512_set1_epi16(2);
+ assert_eq_m512i(
+ r,
+ _mm512_set_epi16(
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2,
+ ),
+ )
+ }
+
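+ // `set` takes the highest lane first; `setr` ("reversed") takes the lowest lane
+ // first, so the two argument lists below are mirror images of each other.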
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set_epi32() {
+ let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(
+ r,
+ _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setr_epi32() {
+ let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(
+ r,
+ _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set1_epi8() {
+ let r = _mm512_set_epi8(
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2,
+ );
+ assert_eq_m512i(r, _mm512_set1_epi8(2));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set1_epi16() {
+ let r = _mm512_set_epi16(
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2,
+ );
+ assert_eq_m512i(r, _mm512_set1_epi16(2));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set1_epi32() {
+ let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, _mm512_set1_epi32(2));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setzero_si512() {
+ assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512());
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setzero_epi32() {
+ assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32());
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set_ps() {
+ let r = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(
+ r,
+ _mm512_set_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ ),
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setr_ps() {
+ let r = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ assert_eq_m512(
+ r,
+ _mm512_setr_ps(
+ 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+ ),
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set1_ps() {
+ #[rustfmt::skip]
+ let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2.,
+ 2., 2., 2., 2., 2., 2., 2., 2.);
+ assert_eq_m512(expected, _mm512_set1_ps(2.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set4_epi32() {
+ let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1);
+ assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set4_ps() {
+ let r = _mm512_set_ps(
+ 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1.,
+ );
+ assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setr4_epi32() {
+ let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1);
+ assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setr4_ps() {
+ let r = _mm512_set_ps(
+ 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1.,
+ );
+ assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setzero_ps() {
+ assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setzero() {
+ assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.));
+ }
+
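+ // `loadu`/`storeu` accept unaligned pointers, while `load`/`store` require the
+ // pointer to be aligned to the full vector width (64 bytes for 512-bit vectors,
+ // hence the `#[repr(align(64))]` wrappers below). `black_box` keeps the pointer
+ // opaque so the load cannot be const-folded away.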
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_loadu_pd() {
+ let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
+ let p = a.as_ptr();
+ let r = _mm512_loadu_pd(black_box(p));
+ let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_storeu_pd() {
+ let a = _mm512_set1_pd(9.);
+ let mut r = _mm512_undefined_pd();
+ _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a);
+ assert_eq_m512d(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_loadu_ps() {
+ let a = &[
+ 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
+ ];
+ let p = a.as_ptr();
+ let r = _mm512_loadu_ps(black_box(p));
+ let e = _mm512_setr_ps(
+ 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_storeu_ps() {
+ let a = _mm512_set1_ps(9.);
+ let mut r = _mm512_undefined_ps();
+ _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a);
+ assert_eq_m512(r, a);
+ }
+
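+ // Masked loads read an element only where its mask bit is set: `mask_` variants
+ // take unselected elements from `src`, `maskz_` variants zero them. Masked
+ // stores leave unselected memory untouched.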
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_loadu_epi32() {
+ let src = _mm512_set1_epi32(42);
+ let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_mask_loadu_epi32(src, m, black_box(p));
+ let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_loadu_epi32() {
+ let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_maskz_loadu_epi32(m, black_box(p));
+ let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_load_epi32() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i32; 16], // 64 bytes
+ }
+ let src = _mm512_set1_epi32(42);
+ let a = Align {
+ data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_mask_load_epi32(src, m, black_box(p));
+ let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_load_epi32() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i32; 16], // 64 bytes
+ }
+ let a = Align {
+ data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_maskz_load_epi32(m, black_box(p));
+ let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_storeu_epi32() {
+ let mut r = [42_i32; 16];
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let m = 0b11101000_11001010;
+ _mm512_mask_storeu_epi32(r.as_mut_ptr(), m, a);
+ let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16);
+ assert_eq_m512i(_mm512_loadu_epi32(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_store_epi32() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i32; 16], // 64 bytes
+ }
+ let mut r = Align { data: [42; 16] };
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let m = 0b11101000_11001010;
+ _mm512_mask_store_epi32(r.data.as_mut_ptr(), m, a);
+ let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16);
+ assert_eq_m512i(_mm512_load_epi32(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_loadu_epi64() {
+ let src = _mm512_set1_epi64(42);
+ let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm512_mask_loadu_epi64(src, m, black_box(p));
+ let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_loadu_epi64() {
+ let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm512_maskz_loadu_epi64(m, black_box(p));
+ let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_load_epi64() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i64; 8], // 64 bytes
+ }
+ let src = _mm512_set1_epi64(42);
+ let a = Align {
+ data: [1_i64, 2, 3, 4, 5, 6, 7, 8],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11001010;
+ let r = _mm512_mask_load_epi64(src, m, black_box(p));
+ let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_load_epi64() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i64; 8], // 64 bytes
+ }
+ let a = Align {
+ data: [1_i64, 2, 3, 4, 5, 6, 7, 8],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11001010;
+ let r = _mm512_maskz_load_epi64(m, black_box(p));
+ let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_storeu_epi64() {
+ let mut r = [42_i64; 8];
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let m = 0b11001010;
+ _mm512_mask_storeu_epi64(r.as_mut_ptr(), m, a);
+ let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8);
+ assert_eq_m512i(_mm512_loadu_epi64(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_store_epi64() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i64; 8], // 64 bytes
+ }
+ let mut r = Align { data: [42; 8] };
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let m = 0b11001010;
+ let p = r.data.as_mut_ptr();
+ _mm512_mask_store_epi64(p, m, a);
+ let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8);
+ assert_eq_m512i(_mm512_load_epi64(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_loadu_ps() {
+ let src = _mm512_set1_ps(42.0);
+ let a = &[
+ 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+ 16.0,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_mask_loadu_ps(src, m, black_box(p));
+ let e = _mm512_setr_ps(
+ 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0,
+ 16.0,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_loadu_ps() {
+ let a = &[
+ 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+ 16.0,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_maskz_loadu_ps(m, black_box(p));
+ let e = _mm512_setr_ps(
+ 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_load_ps() {
+ #[repr(align(64))]
+ struct Align {
+ data: [f32; 16], // 64 bytes
+ }
+ let src = _mm512_set1_ps(42.0);
+ let a = Align {
+ data: [
+ 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
+ 15.0, 16.0,
+ ],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_mask_load_ps(src, m, black_box(p));
+ let e = _mm512_setr_ps(
+ 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0,
+ 16.0,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_load_ps() {
+ #[repr(align(64))]
+ struct Align {
+ data: [f32; 16], // 64 bytes
+ }
+ let a = Align {
+ data: [
+ 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
+ 15.0, 16.0,
+ ],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_maskz_load_ps(m, black_box(p));
+ let e = _mm512_setr_ps(
+ 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_storeu_ps() {
+ let mut r = [42_f32; 16];
+ let a = _mm512_setr_ps(
+ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+ );
+ let m = 0b11101000_11001010;
+ _mm512_mask_storeu_ps(r.as_mut_ptr(), m, a);
+ let e = _mm512_setr_ps(
+ 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0,
+ 16.0,
+ );
+ assert_eq_m512(_mm512_loadu_ps(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_store_ps() {
+ #[repr(align(64))]
+ struct Align {
+ data: [f32; 16], // 64 bytes
+ }
+ let mut r = Align { data: [42.0; 16] };
+ let a = _mm512_setr_ps(
+ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+ );
+ let m = 0b11101000_11001010;
+ _mm512_mask_store_ps(r.data.as_mut_ptr(), m, a);
+ let e = _mm512_setr_ps(
+ 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0,
+ 16.0,
+ );
+ assert_eq_m512(_mm512_load_ps(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_loadu_pd() {
+ let src = _mm512_set1_pd(42.0);
+ let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm512_mask_loadu_pd(src, m, black_box(p));
+ let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_loadu_pd() {
+ let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm512_maskz_loadu_pd(m, black_box(p));
+ let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_load_pd() {
+ #[repr(align(64))]
+ struct Align {
+ data: [f64; 8], // 64 bytes
+ }
+ let src = _mm512_set1_pd(42.0);
+ let a = Align {
+ data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11001010;
+ let r = _mm512_mask_load_pd(src, m, black_box(p));
+ let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_load_pd() {
+ #[repr(align(64))]
+ struct Align {
+ data: [f64; 8], // 64 bytes
+ }
+ let a = Align {
+ data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11001010;
+ let r = _mm512_maskz_load_pd(m, black_box(p));
+ let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_storeu_pd() {
+ let mut r = [42_f64; 8];
+ let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+ let m = 0b11001010;
+ _mm512_mask_storeu_pd(r.as_mut_ptr(), m, a);
+ let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
+ assert_eq_m512d(_mm512_loadu_pd(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_store_pd() {
+ #[repr(align(64))]
+ struct Align {
+ data: [f64; 8], // 64 bytes
+ }
+ let mut r = Align { data: [42.0; 8] };
+ let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+ let m = 0b11001010;
+ _mm512_mask_store_pd(r.data.as_mut_ptr(), m, a);
+ let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
+ assert_eq_m512d(_mm512_load_pd(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_loadu_epi32() {
+ let src = _mm256_set1_epi32(42);
+ let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm256_mask_loadu_epi32(src, m, black_box(p));
+ let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_loadu_epi32() {
+ let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm256_maskz_loadu_epi32(m, black_box(p));
+ let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_load_epi32() {
+ #[repr(align(32))]
+ struct Align {
+ data: [i32; 8], // 32 bytes
+ }
+ let src = _mm256_set1_epi32(42);
+ let a = Align {
+ data: [1_i32, 2, 3, 4, 5, 6, 7, 8],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11001010;
+ let r = _mm256_mask_load_epi32(src, m, black_box(p));
+ let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_load_epi32() {
+ #[repr(align(32))]
+ struct Align {
+ data: [i32; 8], // 32 bytes
+ }
+ let a = Align {
+ data: [1_i32, 2, 3, 4, 5, 6, 7, 8],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11001010;
+ let r = _mm256_maskz_load_epi32(m, black_box(p));
+ let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_storeu_epi32() {
+ let mut r = [42_i32; 8];
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let m = 0b11001010;
+ _mm256_mask_storeu_epi32(r.as_mut_ptr(), m, a);
+ let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8);
+ assert_eq_m256i(_mm256_loadu_epi32(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_store_epi32() {
+ #[repr(align(32))]
+ struct Align {
+ data: [i32; 8], // 32 bytes
+ }
+ let mut r = Align { data: [42; 8] };
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let m = 0b11001010;
+ _mm256_mask_store_epi32(r.data.as_mut_ptr(), m, a);
+ let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8);
+ assert_eq_m256i(_mm256_load_epi32(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_loadu_epi64() {
+ let src = _mm256_set1_epi64x(42);
+ let a = &[1_i64, 2, 3, 4];
+ let p = a.as_ptr();
+ let m = 0b1010;
+ let r = _mm256_mask_loadu_epi64(src, m, black_box(p));
+ let e = _mm256_setr_epi64x(42, 2, 42, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_loadu_epi64() {
+ let a = &[1_i64, 2, 3, 4];
+ let p = a.as_ptr();
+ let m = 0b1010;
+ let r = _mm256_maskz_loadu_epi64(m, black_box(p));
+ let e = _mm256_setr_epi64x(0, 2, 0, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_load_epi64() {
+ #[repr(align(32))]
+ struct Align {
+ data: [i64; 4], // 32 bytes
+ }
+ let src = _mm256_set1_epi64x(42);
+ let a = Align {
+ data: [1_i64, 2, 3, 4],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b1010;
+ let r = _mm256_mask_load_epi64(src, m, black_box(p));
+ let e = _mm256_setr_epi64x(42, 2, 42, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_load_epi64() {
+ #[repr(align(32))]
+ struct Align {
+ data: [i64; 4], // 32 bytes
+ }
+ let a = Align {
+ data: [1_i64, 2, 3, 4],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b1010;
+ let r = _mm256_maskz_load_epi64(m, black_box(p));
+ let e = _mm256_setr_epi64x(0, 2, 0, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_storeu_epi64() {
+ let mut r = [42_i64; 4];
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let m = 0b1010;
+ _mm256_mask_storeu_epi64(r.as_mut_ptr(), m, a);
+ let e = _mm256_setr_epi64x(42, 2, 42, 4);
+ assert_eq_m256i(_mm256_loadu_epi64(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_store_epi64() {
+ #[repr(align(32))]
+ struct Align {
+ data: [i64; 4], // 32 bytes
+ }
+ let mut r = Align { data: [42; 4] };
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let m = 0b1010;
+ _mm256_mask_store_epi64(r.data.as_mut_ptr(), m, a);
+ let e = _mm256_setr_epi64x(42, 2, 42, 4);
+ assert_eq_m256i(_mm256_load_epi64(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_loadu_ps() {
+ let src = _mm256_set1_ps(42.0);
+ let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm256_mask_loadu_ps(src, m, black_box(p));
+ let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_loadu_ps() {
+ let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+ let p = a.as_ptr();
+ let m = 0b11001010;
+ let r = _mm256_maskz_loadu_ps(m, black_box(p));
+ let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_load_ps() {
+ #[repr(align(32))]
+ struct Align {
+ data: [f32; 8], // 32 bytes
+ }
+ let src = _mm256_set1_ps(42.0);
+ let a = Align {
+ data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11001010;
+ let r = _mm256_mask_load_ps(src, m, black_box(p));
+ let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_load_ps() {
+ #[repr(align(32))]
+ struct Align {
+ data: [f32; 8], // 32 bytes
+ }
+ let a = Align {
+ data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b11001010;
+ let r = _mm256_maskz_load_ps(m, black_box(p));
+ let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_storeu_ps() {
+ let mut r = [42_f32; 8];
+ let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+ let m = 0b11001010;
+ _mm256_mask_storeu_ps(r.as_mut_ptr(), m, a);
+ let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
+ assert_eq_m256(_mm256_loadu_ps(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_store_ps() {
+ #[repr(align(32))]
+ struct Align {
+ data: [f32; 8], // 32 bytes
+ }
+ let mut r = Align { data: [42.0; 8] };
+ let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+ let m = 0b11001010;
+ _mm256_mask_store_ps(r.data.as_mut_ptr(), m, a);
+ let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
+ assert_eq_m256(_mm256_load_ps(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_loadu_pd() {
+ let src = _mm256_set1_pd(42.0);
+ let a = &[1.0_f64, 2.0, 3.0, 4.0];
+ let p = a.as_ptr();
+ let m = 0b1010;
+ let r = _mm256_mask_loadu_pd(src, m, black_box(p));
+ let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_loadu_pd() {
+ let a = &[1.0_f64, 2.0, 3.0, 4.0];
+ let p = a.as_ptr();
+ let m = 0b1010;
+ let r = _mm256_maskz_loadu_pd(m, black_box(p));
+ let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_load_pd() {
+ #[repr(align(32))]
+ struct Align {
+ data: [f64; 4], // 32 bytes
+ }
+ let src = _mm256_set1_pd(42.0);
+ let a = Align {
+ data: [1.0_f64, 2.0, 3.0, 4.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b1010;
+ let r = _mm256_mask_load_pd(src, m, black_box(p));
+ let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_load_pd() {
+ #[repr(align(32))]
+ struct Align {
+ data: [f64; 4], // 32 bytes
+ }
+ let a = Align {
+ data: [1.0_f64, 2.0, 3.0, 4.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b1010;
+ let r = _mm256_maskz_load_pd(m, black_box(p));
+ let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_storeu_pd() {
+ let mut r = [42_f64; 4];
+ let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+ let m = 0b1010;
+ _mm256_mask_storeu_pd(r.as_mut_ptr(), m, a);
+ let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0);
+ assert_eq_m256d(_mm256_loadu_pd(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_store_pd() {
+ #[repr(align(32))]
+ struct Align {
+ data: [f64; 4], // 32 bytes
+ }
+ let mut r = Align { data: [42.0; 4] };
+ let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+ let m = 0b1010;
+ _mm256_mask_store_pd(r.data.as_mut_ptr(), m, a);
+ let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0);
+ assert_eq_m256d(_mm256_load_pd(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_loadu_epi32() {
+ let src = _mm_set1_epi32(42);
+ let a = &[1_i32, 2, 3, 4];
+ let p = a.as_ptr();
+ let m = 0b1010;
+ let r = _mm_mask_loadu_epi32(src, m, black_box(p));
+ let e = _mm_setr_epi32(42, 2, 42, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_loadu_epi32() {
+ let a = &[1_i32, 2, 3, 4];
+ let p = a.as_ptr();
+ let m = 0b1010;
+ let r = _mm_maskz_loadu_epi32(m, black_box(p));
+ let e = _mm_setr_epi32(0, 2, 0, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_load_epi32() {
+ #[repr(align(16))]
+ struct Align {
+ data: [i32; 4], // 16 bytes
+ }
+ let src = _mm_set1_epi32(42);
+ let a = Align {
+ data: [1_i32, 2, 3, 4],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b1010;
+ let r = _mm_mask_load_epi32(src, m, black_box(p));
+ let e = _mm_setr_epi32(42, 2, 42, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_load_epi32() {
+ #[repr(align(16))]
+ struct Align {
+ data: [i32; 4], // 16 bytes
+ }
+ let a = Align {
+ data: [1_i32, 2, 3, 4],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b1010;
+ let r = _mm_maskz_load_epi32(m, black_box(p));
+ let e = _mm_setr_epi32(0, 2, 0, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_storeu_epi32() {
+ let mut r = [42_i32; 4];
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let m = 0b1010;
+ _mm_mask_storeu_epi32(r.as_mut_ptr(), m, a);
+ let e = _mm_setr_epi32(42, 2, 42, 4);
+ assert_eq_m128i(_mm_loadu_epi32(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_store_epi32() {
+ #[repr(align(16))]
+ struct Align {
+ data: [i32; 4], // 16 bytes
+ }
+ let mut r = Align { data: [42; 4] };
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let m = 0b1010;
+ _mm_mask_store_epi32(r.data.as_mut_ptr(), m, a);
+ let e = _mm_setr_epi32(42, 2, 42, 4);
+ assert_eq_m128i(_mm_load_epi32(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_loadu_epi64() {
+ let src = _mm_set1_epi64x(42);
+ let a = &[1_i64, 2];
+ let p = a.as_ptr();
+ let m = 0b10;
+ let r = _mm_mask_loadu_epi64(src, m, black_box(p));
+ let e = _mm_setr_epi64x(42, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_loadu_epi64() {
+ let a = &[1_i64, 2];
+ let p = a.as_ptr();
+ let m = 0b10;
+ let r = _mm_maskz_loadu_epi64(m, black_box(p));
+ let e = _mm_setr_epi64x(0, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_load_epi64() {
+ #[repr(align(16))]
+ struct Align {
+ data: [i64; 2], // 16 bytes
+ }
+ let src = _mm_set1_epi64x(42);
+ let a = Align { data: [1_i64, 2] };
+ let p = a.data.as_ptr();
+ let m = 0b10;
+ let r = _mm_mask_load_epi64(src, m, black_box(p));
+ let e = _mm_setr_epi64x(42, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_load_epi64() {
+ #[repr(align(16))]
+ struct Align {
+ data: [i64; 2], // 16 bytes
+ }
+ let a = Align { data: [1_i64, 2] };
+ let p = a.data.as_ptr();
+ let m = 0b10;
+ let r = _mm_maskz_load_epi64(m, black_box(p));
+ let e = _mm_setr_epi64x(0, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_storeu_epi64() {
+ let mut r = [42_i64; 2];
+ let a = _mm_setr_epi64x(1, 2);
+ let m = 0b10;
+ _mm_mask_storeu_epi64(r.as_mut_ptr(), m, a);
+ let e = _mm_setr_epi64x(42, 2);
+ assert_eq_m128i(_mm_loadu_epi64(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_store_epi64() {
+ #[repr(align(16))]
+ struct Align {
+ data: [i64; 2], // 16 bytes
+ }
+ let mut r = Align { data: [42; 2] };
+ let a = _mm_setr_epi64x(1, 2);
+ let m = 0b10;
+ _mm_mask_store_epi64(r.data.as_mut_ptr(), m, a);
+ let e = _mm_setr_epi64x(42, 2);
+ assert_eq_m128i(_mm_load_epi64(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_loadu_ps() {
+ let src = _mm_set1_ps(42.0);
+ let a = &[1.0_f32, 2.0, 3.0, 4.0];
+ let p = a.as_ptr();
+ let m = 0b1010;
+ let r = _mm_mask_loadu_ps(src, m, black_box(p));
+ let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_loadu_ps() {
+ let a = &[1.0_f32, 2.0, 3.0, 4.0];
+ let p = a.as_ptr();
+ let m = 0b1010;
+ let r = _mm_maskz_loadu_ps(m, black_box(p));
+ let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_load_ps() {
+ #[repr(align(16))]
+ struct Align {
+ data: [f32; 4], // 16 bytes
+ }
+ let src = _mm_set1_ps(42.0);
+ let a = Align {
+ data: [1.0_f32, 2.0, 3.0, 4.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b1010;
+ let r = _mm_mask_load_ps(src, m, black_box(p));
+ let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_load_ps() {
+ #[repr(align(16))]
+ struct Align {
+ data: [f32; 4], // 16 bytes
+ }
+ let a = Align {
+ data: [1.0_f32, 2.0, 3.0, 4.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b1010;
+ let r = _mm_maskz_load_ps(m, black_box(p));
+ let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_storeu_ps() {
+ let mut r = [42_f32; 4];
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let m = 0b1010;
+ _mm_mask_storeu_ps(r.as_mut_ptr(), m, a);
+ let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0);
+ assert_eq_m128(_mm_loadu_ps(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_store_ps() {
+ #[repr(align(16))]
+ struct Align {
+ data: [f32; 4], // 16 bytes
+ }
+ let mut r = Align { data: [42.0; 4] };
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let m = 0b1010;
+ _mm_mask_store_ps(r.data.as_mut_ptr(), m, a);
+ let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0);
+ assert_eq_m128(_mm_load_ps(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_loadu_pd() {
+ let src = _mm_set1_pd(42.0);
+ let a = &[1.0_f64, 2.0];
+ let p = a.as_ptr();
+ let m = 0b10;
+ let r = _mm_mask_loadu_pd(src, m, black_box(p));
+ let e = _mm_setr_pd(42.0, 2.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_loadu_pd() {
+ let a = &[1.0_f64, 2.0];
+ let p = a.as_ptr();
+ let m = 0b10;
+ let r = _mm_maskz_loadu_pd(m, black_box(p));
+ let e = _mm_setr_pd(0.0, 2.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_load_pd() {
+ #[repr(align(16))]
+ struct Align {
+ data: [f64; 2], // 16 bytes
+ }
+ let src = _mm_set1_pd(42.0);
+ let a = Align {
+ data: [1.0_f64, 2.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b10;
+ let r = _mm_mask_load_pd(src, m, black_box(p));
+ let e = _mm_setr_pd(42.0, 2.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_load_pd() {
+ #[repr(align(16))]
+ struct Align {
+ data: [f64; 2], // 16 bytes
+ }
+ let a = Align {
+ data: [1.0_f64, 2.0],
+ };
+ let p = a.data.as_ptr();
+ let m = 0b10;
+ let r = _mm_maskz_load_pd(m, black_box(p));
+ let e = _mm_setr_pd(0.0, 2.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_storeu_pd() {
+ let mut r = [42_f64; 2];
+ let a = _mm_setr_pd(1.0, 2.0);
+ let m = 0b10;
+ _mm_mask_storeu_pd(r.as_mut_ptr(), m, a);
+ let e = _mm_setr_pd(42.0, 2.0);
+ assert_eq_m128d(_mm_loadu_pd(r.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_store_pd() {
+ #[repr(align(16))]
+ struct Align {
+ data: [f64; 2], // 16 bytes
+ }
+ let mut r = Align { data: [42.0; 2] };
+ let a = _mm_setr_pd(1.0, 2.0);
+ let m = 0b10;
+ _mm_mask_store_pd(r.data.as_mut_ptr(), m, a);
+ let e = _mm_setr_pd(42.0, 2.0);
+ assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setr_pd() {
+ let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set_pd() {
+ let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.));
+ }
+
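+ // `rol`/`ror` rotate each 32-bit lane by the const generic count: bits shifted
+ // out of one end re-enter at the other, so `1 << 31` rotated left by 1 becomes
+ // `1 << 0`.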
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rol_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm512_rol_epi32::<1>(a);
+ let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rol_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm512_mask_rol_epi32::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_rol_epi32::<1>(a, 0b11111111_11111111, a);
+ let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rol_epi32() {
+ let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
+ let r = _mm512_maskz_rol_epi32::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_rol_epi32::<1>(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_rol_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm256_rol_epi32::<1>(a);
+ let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rol_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm256_mask_rol_epi32::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_rol_epi32::<1>(a, 0b11111111, a);
+ let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rol_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm256_maskz_rol_epi32::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_rol_epi32::<1>(0b11111111, a);
+ let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_rol_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let r = _mm_rol_epi32::<1>(a);
+ let e = _mm_set_epi32(1 << 0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rol_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let r = _mm_mask_rol_epi32::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_rol_epi32::<1>(a, 0b00001111, a);
+ let e = _mm_set_epi32(1 << 0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rol_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let r = _mm_maskz_rol_epi32::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_rol_epi32::<1>(0b00001111, a);
+ let e = _mm_set_epi32(1 << 0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_ror_epi32() {
+ let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let r = _mm512_ror_epi32::<1>(a);
+ let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_ror_epi32() {
+ let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let r = _mm512_mask_ror_epi32::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_ror_epi32::<1>(a, 0b11111111_11111111, a);
+ let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_ror_epi32() {
+ let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
+ let r = _mm512_maskz_ror_epi32::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_ror_epi32::<1>(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_ror_epi32() {
+ let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ let r = _mm256_ror_epi32::<1>(a);
+ let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_ror_epi32() {
+ let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ let r = _mm256_mask_ror_epi32::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_ror_epi32::<1>(a, 0b11111111, a);
+ let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_ror_epi32() {
+ let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ let r = _mm256_maskz_ror_epi32::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_ror_epi32::<1>(0b11111111, a);
+ let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_ror_epi32() {
+ let a = _mm_set_epi32(1 << 0, 2, 2, 2);
+ let r = _mm_ror_epi32::<1>(a);
+ let e = _mm_set_epi32(1 << 31, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_ror_epi32() {
+ let a = _mm_set_epi32(1 << 0, 2, 2, 2);
+ let r = _mm_mask_ror_epi32::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_ror_epi32::<1>(a, 0b00001111, a);
+ let e = _mm_set_epi32(1 << 31, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_ror_epi32() {
+ let a = _mm_set_epi32(1 << 0, 2, 2, 2);
+ let r = _mm_maskz_ror_epi32::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_ror_epi32::<1>(0b00001111, a);
+ let e = _mm_set_epi32(1 << 31, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
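+ // `slli`/`srli` are logical shifts by a const generic immediate; unlike the
+ // rotates above, bits shifted out are discarded, so `1 << 31` shifted left by 1
+ // becomes 0.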
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_slli_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm512_slli_epi32::<1>(a);
+ let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_slli_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm512_mask_slli_epi32::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_slli_epi32::<1>(a, 0b11111111_11111111, a);
+ let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_slli_epi32() {
+ let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
+ let r = _mm512_maskz_slli_epi32::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_slli_epi32::<1>(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_slli_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm256_mask_slli_epi32::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_slli_epi32::<1>(a, 0b11111111, a);
+ let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_slli_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm256_maskz_slli_epi32::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_slli_epi32::<1>(0b11111111, a);
+ let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_slli_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let r = _mm_mask_slli_epi32::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_slli_epi32::<1>(a, 0b00001111, a);
+ let e = _mm_set_epi32(0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_slli_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let r = _mm_maskz_slli_epi32::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_slli_epi32::<1>(0b00001111, a);
+ let e = _mm_set_epi32(0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srli_epi32() {
+ let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let r = _mm512_srli_epi32::<1>(a);
+ let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srli_epi32() {
+ let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let r = _mm512_mask_srli_epi32::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srli_epi32::<1>(a, 0b11111111_11111111, a);
+ let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srli_epi32() {
+ let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0);
+ let r = _mm512_maskz_srli_epi32::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srli_epi32::<1>(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0 << 31);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srli_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm256_mask_srli_epi32::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srli_epi32::<1>(a, 0b11111111, a);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srli_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm256_maskz_srli_epi32::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srli_epi32::<1>(0b11111111, a);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srli_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let r = _mm_mask_srli_epi32::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srli_epi32::<1>(a, 0b00001111, a);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srli_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let r = _mm_maskz_srli_epi32::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srli_epi32::<1>(0b00001111, a);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
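+ // The `rolv`/`rorv` variants take a per-lane rotate count from a second vector
+ // instead of an immediate.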
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rolv_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_rolv_epi32(a, b);
+ let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rolv_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_mask_rolv_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_rolv_epi32(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rolv_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_maskz_rolv_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_rolv_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_rolv_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_rolv_epi32(a, b);
+ let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rolv_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_mask_rolv_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_rolv_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rolv_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_rolv_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_rolv_epi32(0b11111111, a, b);
+ let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_rolv_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_rolv_epi32(a, b);
+ let e = _mm_set_epi32(1 << 0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rolv_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_mask_rolv_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_rolv_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(1 << 0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rolv_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_maskz_rolv_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_rolv_epi32(0b00001111, a, b);
+ let e = _mm_set_epi32(1 << 0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rorv_epi32() {
+ let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_rorv_epi32(a, b);
+ let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rorv_epi32() {
+ let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_mask_rorv_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_rorv_epi32(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rorv_epi32() {
+ let a = _mm512_set_epi32(3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
+ let b = _mm512_set1_epi32(1);
+ let r = _mm512_maskz_rorv_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_rorv_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_rorv_epi32() {
+ let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_rorv_epi32(a, b);
+ let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rorv_epi32() {
+ let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_mask_rorv_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_rorv_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rorv_epi32() {
+ let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_rorv_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_rorv_epi32(0b11111111, a, b);
+ let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_rorv_epi32() {
+ let a = _mm_set_epi32(1 << 0, 2, 2, 2);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_rorv_epi32(a, b);
+ let e = _mm_set_epi32(1 << 31, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rorv_epi32() {
+ let a = _mm_set_epi32(1 << 0, 2, 2, 2);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_mask_rorv_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_rorv_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(1 << 31, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rorv_epi32() {
+ let a = _mm_set_epi32(1 << 0, 2, 2, 2);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_maskz_rorv_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_rorv_epi32(0b00001111, a, b);
+ let e = _mm_set_epi32(1 << 31, 1, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
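+ // `sllv` shifts each lane left by its own count, discarding bits shifted past
+ // bit 31 (so `1 << 31` shifted by one becomes 0); counts of 32 or more zero the lane.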
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sllv_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let count = _mm512_set1_epi32(1);
+ let r = _mm512_sllv_epi32(a, count);
+ let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sllv_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let count = _mm512_set1_epi32(1);
+ let r = _mm512_mask_sllv_epi32(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sllv_epi32(a, 0b11111111_11111111, a, count);
+ let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sllv_epi32() {
+ let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
+ let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm512_maskz_sllv_epi32(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sllv_epi32(0b00000000_11111111, a, count);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sllv_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_mask_sllv_epi32(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sllv_epi32(a, 0b11111111, a, count);
+ let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sllv_epi32() {
+ let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_sllv_epi32(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sllv_epi32(0b11111111, a, count);
+ let e = _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sllv_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_mask_sllv_epi32(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sllv_epi32(a, 0b00001111, a, count);
+ let e = _mm_set_epi32(0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sllv_epi32() {
+ let a = _mm_set_epi32(1 << 31, 1, 1, 1);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_maskz_sllv_epi32(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sllv_epi32(0b00001111, a, count);
+ let e = _mm_set_epi32(0, 2, 2, 2);
+ assert_eq_m128i(r, e);
+ }
+
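+ // `srlv` is the per-lane logical right shift: vacated high bits fill with zeros.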
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srlv_epi32() {
+ let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let count = _mm512_set1_epi32(1);
+ let r = _mm512_srlv_epi32(a, count);
+ let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srlv_epi32() {
+ let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+ let count = _mm512_set1_epi32(1);
+ let r = _mm512_mask_srlv_epi32(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srlv_epi32(a, 0b11111111_11111111, a, count);
+ let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srlv_epi32() {
+ let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0);
+ let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ let r = _mm512_maskz_srlv_epi32(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srlv_epi32(0b00000000_11111111, a, count);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srlv_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_mask_srlv_epi32(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srlv_epi32(a, 0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srlv_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_srlv_epi32(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srlv_epi32(0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srlv_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_mask_srlv_epi32(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srlv_epi32(a, 0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srlv_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_maskz_srlv_epi32(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srlv_epi32(0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
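+ // Unlike `sllv`, `sll` shifts every lane by the single count held in the low
+ // 64 bits of the `__m128i` argument; the upper elements of `count` are ignored.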
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sll_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 31, 1 << 0, 1 << 1, 1 << 2,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ let count = _mm_set_epi32(0, 0, 0, 2);
+ let r = _mm512_sll_epi32(a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 0, 1 << 2, 1 << 3, 1 << 4,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sll_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 31, 1 << 0, 1 << 1, 1 << 2,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ let count = _mm_set_epi32(0, 0, 0, 2);
+ let r = _mm512_mask_sll_epi32(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sll_epi32(a, 0b11111111_11111111, a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 0, 1 << 2, 1 << 3, 1 << 4,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sll_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 31, 1 << 0, 1 << 1, 1 << 2,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 31,
+ );
+ let count = _mm_set_epi32(2, 0, 0, 2);
+ let r = _mm512_maskz_sll_epi32(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sll_epi32(0b00000000_11111111, a, count);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sll_epi32() {
+ let a = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm256_mask_sll_epi32(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sll_epi32(a, 0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sll_epi32() {
+ let a = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm256_maskz_sll_epi32(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sll_epi32(0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sll_epi32() {
+ let a = _mm_set_epi32(1 << 13, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm_mask_sll_epi32(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sll_epi32(a, 0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 14, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sll_epi32() {
+ let a = _mm_set_epi32(1 << 13, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm_maskz_sll_epi32(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sll_epi32(0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 14, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
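+ // `srl` likewise applies one logical right-shift count, taken from the low
+ // 64 bits of `count`, to all lanes.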
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srl_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 31, 1 << 0, 1 << 1, 1 << 2,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ let count = _mm_set_epi32(0, 0, 0, 2);
+ let r = _mm512_srl_epi32(a, count);
+ let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srl_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 31, 1 << 0, 1 << 1, 1 << 2,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ );
+ let count = _mm_set_epi32(0, 0, 0, 2);
+ let r = _mm512_mask_srl_epi32(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srl_epi32(a, 0b11111111_11111111, a, count);
+ let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srl_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 31, 1 << 0, 1 << 1, 1 << 2,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 31,
+ );
+ let count = _mm_set_epi32(2, 0, 0, 2);
+ let r = _mm512_maskz_srl_epi32(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srl_epi32(0b00000000_11111111, a, count);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 29);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srl_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm256_mask_srl_epi32(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srl_epi32(a, 0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srl_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm256_maskz_srl_epi32(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srl_epi32(0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srl_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm_mask_srl_epi32(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srl_epi32(a, 0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srl_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm_maskz_srl_epi32(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srl_epi32(0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
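+ // `sra` is the arithmetic right shift: the sign bit is replicated into the
+ // vacated bits, so negative lanes round toward negative infinity (-15 >> 2 == -4).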
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sra_epi32() {
+ let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
+ let count = _mm_set_epi32(1, 0, 0, 2);
+ let r = _mm512_sra_epi32(a, count);
+ let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sra_epi32() {
+ let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16);
+ let count = _mm_set_epi32(0, 0, 0, 2);
+ let r = _mm512_mask_sra_epi32(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sra_epi32(a, 0b11111111_11111111, a, count);
+ let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sra_epi32() {
+ let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14);
+ let count = _mm_set_epi32(2, 0, 0, 2);
+ let r = _mm512_maskz_sra_epi32(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sra_epi32(0b00000000_11111111, a, count);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sra_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm256_mask_sra_epi32(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sra_epi32(a, 0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sra_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm256_maskz_sra_epi32(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sra_epi32(0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sra_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm_mask_sra_epi32(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sra_epi32(a, 0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sra_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi32(0, 0, 0, 1);
+ let r = _mm_maskz_sra_epi32(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sra_epi32(0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
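+ // `srav` performs the arithmetic right shift with an independent count per lane.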
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srav_epi32() {
+ let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
+ let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm512_srav_epi32(a, count);
+ let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srav_epi32() {
+ let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16);
+ let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
+ let r = _mm512_mask_srav_epi32(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srav_epi32(a, 0b11111111_11111111, a, count);
+ let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srav_epi32() {
+ let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14);
+ let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2);
+ let r = _mm512_maskz_srav_epi32(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srav_epi32(0b00000000_11111111, a, count);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srav_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_mask_srav_epi32(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srav_epi32(a, 0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srav_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_srav_epi32(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srav_epi32(0b11111111, a, count);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srav_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_mask_srav_epi32(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srav_epi32(a, 0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srav_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_maskz_srav_epi32(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srav_epi32(0b00001111, a, count);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
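+ // `srai` takes its shift count as a const generic immediate (`::<2>`) rather
+ // than a vector, so the count is fixed at compile time.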
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srai_epi32() {
+ let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15);
+ let r = _mm512_srai_epi32::<2>(a);
+ let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srai_epi32() {
+ let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15);
+ let r = _mm512_mask_srai_epi32::<2>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srai_epi32::<2>(a, 0b11111111_11111111, a);
+ let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srai_epi32() {
+ let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15);
+ let r = _mm512_maskz_srai_epi32::<2>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srai_epi32::<2>(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srai_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm256_mask_srai_epi32::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srai_epi32::<1>(a, 0b11111111, a);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srai_epi32() {
+ let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm256_maskz_srai_epi32::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srai_epi32::<1>(0b11111111, a);
+ let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srai_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let r = _mm_mask_srai_epi32::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srai_epi32::<1>(a, 0b00001111, a);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srai_epi32() {
+ let a = _mm_set_epi32(1 << 5, 0, 0, 0);
+ let r = _mm_maskz_srai_epi32::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srai_epi32::<1>(0b00001111, a);
+ let e = _mm_set_epi32(1 << 4, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
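+ // `permute_ps` applies the immediate shuffle pattern separately within each
+ // 128-bit lane; `0b11_11_11_11` broadcasts the highest element of every lane.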
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permute_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_permute_ps::<0b11_11_11_11>(a);
+ let e = _mm512_setr_ps(
+ 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permute_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_mask_permute_ps::<0b11_11_11_11>(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_permute_ps::<0b11_11_11_11>(a, 0b11111111_11111111, a);
+ let e = _mm512_setr_ps(
+ 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permute_ps() {
+ let a = _mm512_setr_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_maskz_permute_ps::<0b11_11_11_11>(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_permute_ps::<0b11_11_11_11>(0b11111111_11111111, a);
+ let e = _mm512_setr_ps(
+ 3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permute_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_mask_permute_ps::<0b11_11_11_11>(a, 0, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_permute_ps::<0b11_11_11_11>(a, 0b11111111, a);
+ let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permute_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_maskz_permute_ps::<0b11_11_11_11>(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_permute_ps::<0b11_11_11_11>(0b11111111, a);
+ let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_permute_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let r = _mm_mask_permute_ps::<0b11_11_11_11>(a, 0, a);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_permute_ps::<0b11_11_11_11>(a, 0b00001111, a);
+ let e = _mm_set_ps(0., 0., 0., 0.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_permute_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let r = _mm_maskz_permute_ps::<0b11_11_11_11>(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_permute_ps::<0b11_11_11_11>(0b00001111, a);
+ let e = _mm_set_ps(0., 0., 0., 0.);
+ assert_eq_m128(r, e);
+ }
+
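+ // `permutevar_epi32` permutes across the whole register (index 1 picks element 1,
+ // the value 14, for every output lane), whereas `permutevar_ps` below shuffles
+ // only within each 128-bit lane using the low two bits of each index.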
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutevar_epi32() {
+ let idx = _mm512_set1_epi32(1);
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_permutevar_epi32(idx, a);
+ let e = _mm512_set1_epi32(14);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutevar_epi32() {
+ let idx = _mm512_set1_epi32(1);
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_mask_permutevar_epi32(a, 0, idx, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutevar_epi32(a, 0b11111111_11111111, idx, a);
+ let e = _mm512_set1_epi32(14);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutevar_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_set1_epi32(0b01);
+ let r = _mm512_permutevar_ps(a, b);
+ let e = _mm512_set_ps(
+ 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutevar_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_set1_epi32(0b01);
+ let r = _mm512_mask_permutevar_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_permutevar_ps(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set_ps(
+ 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutevar_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let b = _mm512_set1_epi32(0b01);
+ let r = _mm512_maskz_permutevar_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_permutevar_ps(0b00000000_11111111, a, b);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutevar_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm256_set1_epi32(0b01);
+ let r = _mm256_mask_permutevar_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_permutevar_ps(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutevar_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm256_set1_epi32(0b01);
+ let r = _mm256_maskz_permutevar_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_permutevar_ps(0b11111111, a, b);
+ let e = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_permutevar_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set1_epi32(0b01);
+ let r = _mm_mask_permutevar_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_permutevar_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(2., 2., 2., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_permutevar_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set1_epi32(0b01);
+ let r = _mm_maskz_permutevar_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_permutevar_ps(0b00001111, a, b);
+ let e = _mm_set_ps(2., 2., 2., 2.);
+ assert_eq_m128(r, e);
+ }
+
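+ // `permutexvar` is the full cross-register permute with the index vector as the
+ // first operand; a constant index vector therefore broadcasts one source element.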
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutexvar_epi32() {
+ let idx = _mm512_set1_epi32(1);
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_permutexvar_epi32(idx, a);
+ let e = _mm512_set1_epi32(14);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutexvar_epi32() {
+ let idx = _mm512_set1_epi32(1);
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_mask_permutexvar_epi32(a, 0, idx, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutexvar_epi32(a, 0b11111111_11111111, idx, a);
+ let e = _mm512_set1_epi32(14);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutexvar_epi32() {
+ let idx = _mm512_set1_epi32(1);
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_permutexvar_epi32(0, idx, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutexvar_epi32(0b00000000_11111111, idx, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutexvar_epi32() {
+ let idx = _mm256_set1_epi32(1);
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_permutexvar_epi32(idx, a);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutexvar_epi32() {
+ let idx = _mm256_set1_epi32(1);
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_mask_permutexvar_epi32(a, 0, idx, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutexvar_epi32(a, 0b11111111, idx, a);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutexvar_epi32() {
+ let idx = _mm256_set1_epi32(1);
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_maskz_permutexvar_epi32(0, idx, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutexvar_epi32(0b11111111, idx, a);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutexvar_ps() {
+ let idx = _mm512_set1_epi32(1);
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_permutexvar_ps(idx, a);
+ let e = _mm512_set1_ps(14.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutexvar_ps() {
+ let idx = _mm512_set1_epi32(1);
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_mask_permutexvar_ps(a, 0, idx, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_permutexvar_ps(a, 0b11111111_11111111, idx, a);
+ let e = _mm512_set1_ps(14.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutexvar_ps() {
+ let idx = _mm512_set1_epi32(1);
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_maskz_permutexvar_ps(0, idx, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_permutexvar_ps(0b00000000_11111111, idx, a);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 14., 14., 14., 14., 14., 14., 14., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutexvar_ps() {
+ let idx = _mm256_set1_epi32(1);
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_permutexvar_ps(idx, a);
+ let e = _mm256_set1_ps(6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutexvar_ps() {
+ let idx = _mm256_set1_epi32(1);
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_mask_permutexvar_ps(a, 0, idx, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_permutexvar_ps(a, 0b11111111, idx, a);
+ let e = _mm256_set1_ps(6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutexvar_ps() {
+ let idx = _mm256_set1_epi32(1);
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_maskz_permutexvar_ps(0, idx, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_permutexvar_ps(0b11111111, idx, a);
+ let e = _mm256_set1_ps(6.);
+ assert_eq_m256(r, e);
+ }
+
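+ // `permutex2var` indexes the 32-element concatenation of `a` and `b`: roughly
+ // `r[i] = if (idx[i] & 16) == 0 { a[idx[i] & 15] } else { b[idx[i] & 15] }` for
+ // the 512-bit form, so `1 << 4` selects from `b`. The `mask2` variants write
+ // `idx`, not `a`, into lanes whose mask bit is clear.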
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutex2var_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi32(
+ 1, 1 << 4, 2, 1 << 4,
+ 3, 1 << 4, 4, 1 << 4,
+ 5, 1 << 4, 6, 1 << 4,
+ 7, 1 << 4, 8, 1 << 4,
+ );
+ let b = _mm512_set1_epi32(100);
+ let r = _mm512_permutex2var_epi32(a, idx, b);
+ let e = _mm512_set_epi32(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutex2var_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi32(
+ 1, 1 << 4, 2, 1 << 4,
+ 3, 1 << 4, 4, 1 << 4,
+ 5, 1 << 4, 6, 1 << 4,
+ 7, 1 << 4, 8, 1 << 4,
+ );
+ let b = _mm512_set1_epi32(100);
+ let r = _mm512_mask_permutex2var_epi32(a, 0, idx, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutex2var_epi32(a, 0b11111111_11111111, idx, b);
+ let e = _mm512_set_epi32(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutex2var_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi32(
+ 1, 1 << 4, 2, 1 << 4,
+ 3, 1 << 4, 4, 1 << 4,
+ 5, 1 << 4, 6, 1 << 4,
+ 7, 1 << 4, 8, 1 << 4,
+ );
+ let b = _mm512_set1_epi32(100);
+ let r = _mm512_maskz_permutex2var_epi32(0, a, idx, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutex2var_epi32(0b00000000_11111111, a, idx, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 10, 100, 9, 100, 8, 100, 7, 100);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask2_permutex2var_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi32(
+ 1000, 1 << 4, 2000, 1 << 4,
+ 3000, 1 << 4, 4000, 1 << 4,
+ 5, 1 << 4, 6, 1 << 4,
+ 7, 1 << 4, 8, 1 << 4,
+ );
+ let b = _mm512_set1_epi32(100);
+ let r = _mm512_mask2_permutex2var_epi32(a, idx, 0, b);
+ assert_eq_m512i(r, idx);
+ let r = _mm512_mask2_permutex2var_epi32(a, idx, 0b00000000_11111111, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 1000, 1 << 4, 2000, 1 << 4,
+ 3000, 1 << 4, 4000, 1 << 4,
+ 10, 100, 9, 100,
+ 8, 100, 7, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutex2var_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm256_set1_epi32(100);
+ let r = _mm256_permutex2var_epi32(a, idx, b);
+ let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutex2var_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm256_set1_epi32(100);
+ let r = _mm256_mask_permutex2var_epi32(a, 0, idx, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutex2var_epi32(a, 0b11111111, idx, b);
+ let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutex2var_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm256_set1_epi32(100);
+ let r = _mm256_maskz_permutex2var_epi32(0, a, idx, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutex2var_epi32(0b11111111, a, idx, b);
+ let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask2_permutex2var_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm256_set1_epi32(100);
+ let r = _mm256_mask2_permutex2var_epi32(a, idx, 0, b);
+ assert_eq_m256i(r, idx);
+ let r = _mm256_mask2_permutex2var_epi32(a, idx, 0b11111111, b);
+ let e = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_permutex2var_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2);
+ let b = _mm_set1_epi32(100);
+ let r = _mm_permutex2var_epi32(a, idx, b);
+ let e = _mm_set_epi32(2, 100, 1, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_permutex2var_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2);
+ let b = _mm_set1_epi32(100);
+ let r = _mm_mask_permutex2var_epi32(a, 0, idx, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_permutex2var_epi32(a, 0b00001111, idx, b);
+ let e = _mm_set_epi32(2, 100, 1, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_permutex2var_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2);
+ let b = _mm_set1_epi32(100);
+ let r = _mm_maskz_permutex2var_epi32(0, a, idx, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_permutex2var_epi32(0b00001111, a, idx, b);
+ let e = _mm_set_epi32(2, 100, 1, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask2_permutex2var_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2);
+ let b = _mm_set1_epi32(100);
+ let r = _mm_mask2_permutex2var_epi32(a, idx, 0, b);
+ assert_eq_m128i(r, idx);
+ let r = _mm_mask2_permutex2var_epi32(a, idx, 0b00001111, b);
+ let e = _mm_set_epi32(2, 100, 1, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutex2var_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi32(
+ 1, 1 << 4, 2, 1 << 4,
+ 3, 1 << 4, 4, 1 << 4,
+ 5, 1 << 4, 6, 1 << 4,
+ 7, 1 << 4, 8, 1 << 4,
+ );
+ let b = _mm512_set1_ps(100.);
+ let r = _mm512_permutex2var_ps(a, idx, b);
+ let e = _mm512_set_ps(
+ 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutex2var_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi32(
+ 1, 1 << 4, 2, 1 << 4,
+ 3, 1 << 4, 4, 1 << 4,
+ 5, 1 << 4, 6, 1 << 4,
+ 7, 1 << 4, 8, 1 << 4,
+ );
+ let b = _mm512_set1_ps(100.);
+ let r = _mm512_mask_permutex2var_ps(a, 0, idx, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_permutex2var_ps(a, 0b11111111_11111111, idx, b);
+ let e = _mm512_set_ps(
+ 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutex2var_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi32(
+ 1, 1 << 4, 2, 1 << 4,
+ 3, 1 << 4, 4, 1 << 4,
+ 5, 1 << 4, 6, 1 << 4,
+ 7, 1 << 4, 8, 1 << 4,
+ );
+ let b = _mm512_set1_ps(100.);
+ let r = _mm512_maskz_permutex2var_ps(0, a, idx, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_permutex2var_ps(0b00000000_11111111, a, idx, b);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 10., 100., 9., 100., 8., 100., 7., 100.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask2_permutex2var_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi32(
+ 1, 1 << 4, 2, 1 << 4,
+ 3, 1 << 4, 4, 1 << 4,
+ 5, 1 << 4, 6, 1 << 4,
+ 7, 1 << 4, 8, 1 << 4,
+ );
+ let b = _mm512_set1_ps(100.);
+ let r = _mm512_mask2_permutex2var_ps(a, idx, 0, b);
+ assert_eq_m512(r, _mm512_castsi512_ps(idx));
+ let r = _mm512_mask2_permutex2var_ps(a, idx, 0b11111111_11111111, b);
+ let e = _mm512_set_ps(
+ 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutex2var_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm256_set1_ps(100.);
+ let r = _mm256_permutex2var_ps(a, idx, b);
+ let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutex2var_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm256_set1_ps(100.);
+ let r = _mm256_mask_permutex2var_ps(a, 0, idx, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_permutex2var_ps(a, 0b11111111, idx, b);
+ let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutex2var_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm256_set1_ps(100.);
+ let r = _mm256_maskz_permutex2var_ps(0, a, idx, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_permutex2var_ps(0b11111111, a, idx, b);
+ let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask2_permutex2var_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let idx = _mm256_set_epi32(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm256_set1_ps(100.);
+ let r = _mm256_mask2_permutex2var_ps(a, idx, 0, b);
+ assert_eq_m256(r, _mm256_castsi256_ps(idx));
+ let r = _mm256_mask2_permutex2var_ps(a, idx, 0b11111111, b);
+ let e = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_permutex2var_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2);
+ let b = _mm_set1_ps(100.);
+ let r = _mm_permutex2var_ps(a, idx, b);
+ let e = _mm_set_ps(2., 100., 1., 100.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_permutex2var_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2);
+ let b = _mm_set1_ps(100.);
+ let r = _mm_mask_permutex2var_ps(a, 0, idx, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_permutex2var_ps(a, 0b00001111, idx, b);
+ let e = _mm_set_ps(2., 100., 1., 100.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_permutex2var_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2);
+ let b = _mm_set1_ps(100.);
+ let r = _mm_maskz_permutex2var_ps(0, a, idx, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_permutex2var_ps(0b00001111, a, idx, b);
+ let e = _mm_set_ps(2., 100., 1., 100.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask2_permutex2var_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let idx = _mm_set_epi32(1, 1 << 2, 2, 1 << 2);
+ let b = _mm_set1_ps(100.);
+ let r = _mm_mask2_permutex2var_ps(a, idx, 0, b);
+ assert_eq_m128(r, _mm_castsi128_ps(idx));
+ let r = _mm_mask2_permutex2var_ps(a, idx, 0b00001111, b);
+ let e = _mm_set_ps(2., 100., 1., 100.);
+ assert_eq_m128(r, e);
+ }
+
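+ // `shuffle_epi32` takes its pattern as an `_MM_PERM_*` constant: `_MM_PERM_AADD`
+ // (0b00_00_11_11) duplicates element 3 into the low half and element 0 into the
+ // high half of every 128-bit lane.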
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_shuffle_epi32() {
+ let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
+ let r = _mm512_shuffle_epi32::<_MM_PERM_AADD>(a);
+ let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_shuffle_epi32() {
+ let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
+ let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111_11111111, a);
+ let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_shuffle_epi32() {
+ let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
+ let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_shuffle_epi32() {
+ let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);
+ let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111, a);
+ let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_shuffle_epi32() {
+ let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);
+ let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b11111111, a);
+ let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_shuffle_epi32() {
+ let a = _mm_set_epi32(1, 4, 5, 8);
+ let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b00001111, a);
+ let e = _mm_set_epi32(8, 8, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_shuffle_epi32() {
+ let a = _mm_set_epi32(1, 4, 5, 8);
+ let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00001111, a);
+ let e = _mm_set_epi32(8, 8, 1, 1);
+ assert_eq_m128i(r, e);
+ }
+
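+ // `shuffle_ps` blends two sources per 128-bit lane: the two low outputs come
+ // from `a` and the two high outputs from `b`, each selected by two immediate bits.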
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_shuffle_ps() {
+ let a = _mm512_setr_ps(
+ 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+ );
+ let r = _mm512_shuffle_ps::<0b00_00_11_11>(a, b);
+ let e = _mm512_setr_ps(
+ 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_shuffle_ps() {
+ let a = _mm512_setr_ps(
+ 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+ );
+ let r = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_shuffle_ps() {
+ let a = _mm512_setr_ps(
+ 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+ );
+ let r = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 8., 8., 2., 2., 16., 16., 10., 10., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_shuffle_ps() {
+ let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_shuffle_ps() {
+ let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_shuffle_ps::<0b00_00_11_11>(0b11111111, a, b);
+ let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_shuffle_ps() {
+ let a = _mm_set_ps(1., 4., 5., 8.);
+ let b = _mm_set_ps(2., 3., 6., 7.);
+ let r = _mm_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_shuffle_ps::<0b00_00_11_11>(a, 0b00001111, a, b);
+ let e = _mm_set_ps(7., 7., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_shuffle_ps() {
+ let a = _mm_set_ps(1., 4., 5., 8.);
+ let b = _mm_set_ps(2., 3., 6., 7.);
+ let r = _mm_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_shuffle_ps::<0b00_00_11_11>(0b00001111, a, b);
+ let e = _mm_set_ps(7., 7., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
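+ // `shuffle_i32x4` (and `shuffle_f32x4` below) move whole 128-bit lanes: each
+ // immediate field selects a source lane, with the low output lanes drawn from
+ // `a` and the high ones from `b`.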
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_shuffle_i32x4() {
+ let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm512_shuffle_i32x4::<0b00_00_00_00>(a, b);
+ let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_shuffle_i32x4() {
+ let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b);
+ let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_shuffle_i32x4() {
+ let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_shuffle_i32x4() {
+ let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm256_shuffle_i32x4::<0b00>(a, b);
+ let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_shuffle_i32x4() {
+ let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_shuffle_i32x4() {
+ let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm256_maskz_shuffle_i32x4::<0b00>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shuffle_i32x4::<0b00>(0b11111111, a, b);
+ let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_shuffle_f32x4() {
+ let a = _mm512_setr_ps(
+ 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+ );
+ let r = _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b);
+ let e = _mm512_setr_ps(
+ 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_shuffle_f32x4() {
+ let a = _mm512_setr_ps(
+ 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+ );
+ let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_shuffle_f32x4() {
+ let a = _mm512_setr_ps(
+ 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+ );
+ let b = _mm512_setr_ps(
+ 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+ );
+ let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 1., 4., 5., 8., 1., 4., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_shuffle_f32x4() {
+ let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_shuffle_f32x4::<0b00>(a, b);
+ let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_shuffle_f32x4() {
+ let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_shuffle_f32x4() {
+ let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm256_maskz_shuffle_f32x4::<0b00>(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_shuffle_f32x4::<0b00>(0b11111111, a, b);
+ let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.);
+ assert_eq_m256(r, e);
+ }
+
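+ // The `extract*x4` intrinsics copy one 128-bit lane, chosen by the immediate,
+ // out of a wider vector; the masked variants then blend per element with `src`
+ // (or zero, for `maskz`).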
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_extractf32x4_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let r = _mm512_extractf32x4_ps::<1>(a);
+ let e = _mm_setr_ps(5., 6., 7., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_extractf32x4_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let src = _mm_set1_ps(100.);
+ let r = _mm512_mask_extractf32x4_ps::<1>(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm512_mask_extractf32x4_ps::<1>(src, 0b11111111, a);
+ let e = _mm_setr_ps(5., 6., 7., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_extractf32x4_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let r = _mm512_maskz_extractf32x4_ps::<1>(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm512_maskz_extractf32x4_ps::<1>(0b00000001, a);
+ let e = _mm_setr_ps(5., 0., 0., 0.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_extractf32x4_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_extractf32x4_ps::<1>(a);
+ let e = _mm_set_ps(1., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_extractf32x4_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let src = _mm_set1_ps(100.);
+ let r = _mm256_mask_extractf32x4_ps::<1>(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm256_mask_extractf32x4_ps::<1>(src, 0b00001111, a);
+ let e = _mm_set_ps(1., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_extractf32x4_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_maskz_extractf32x4_ps::<1>(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm256_maskz_extractf32x4_ps::<1>(0b00001111, a);
+ let e = _mm_set_ps(1., 2., 3., 4.);
+ assert_eq_m128(r, e);
+ }
+
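+ // Integer counterpart of the extracts above: the same lane selection,
+ // operating on epi32 elements.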
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_extracti32x4_epi32() {
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_extracti32x4_epi32::<1>(a);
+ let e = _mm_setr_epi32(5, 6, 7, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_extracti32x4_epi32() {
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let src = _mm_set1_epi32(100);
+ let r = _mm512_mask_extracti32x4_epi32::<1>(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_extracti32x4_epi32::<1>(src, 0b11111111, a);
+ let e = _mm_setr_epi32(5, 6, 7, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm512_maskz_extracti32x4_epi32() {
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_maskz_extracti32x4_epi32::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_extracti32x4_epi32::<1>(0b00000001, a);
+ let e = _mm_setr_epi32(5, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_extracti32x4_epi32() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_extracti32x4_epi32::<1>(a);
+ let e = _mm_set_epi32(1, 2, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_extracti32x4_epi32() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let src = _mm_set1_epi32(100);
+ let r = _mm256_mask_extracti32x4_epi32::<1>(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_extracti32x4_epi32::<1>(src, 0b00001111, a);
+ let e = _mm_set_epi32(1, 2, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_extracti32x4_epi32() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_maskz_extracti32x4_epi32::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_extracti32x4_epi32::<1>(0b00001111, a);
+ let e = _mm_set_epi32(1, 2, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
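+ // _mm512_moveldup_ps duplicates each even-indexed element into the odd
+ // slot above it (0,0,2,2,...); the mask/maskz variants gate the result
+ // per element as usual.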
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_moveldup_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let r = _mm512_moveldup_ps(a);
+ let e = _mm512_setr_ps(
+ 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_moveldup_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let r = _mm512_mask_moveldup_ps(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_moveldup_ps(a, 0b11111111_11111111, a);
+ let e = _mm512_setr_ps(
+ 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_moveldup_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let r = _mm512_maskz_moveldup_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_moveldup_ps(0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 1., 1., 3., 3., 5., 5., 7., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_moveldup_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_mask_moveldup_ps(a, 0, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_moveldup_ps(a, 0b11111111, a);
+ let e = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_moveldup_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_maskz_moveldup_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_moveldup_ps(0b11111111, a);
+ let e = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_moveldup_ps() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let r = _mm_mask_moveldup_ps(a, 0, a);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_moveldup_ps(a, 0b00001111, a);
+ let e = _mm_set_ps(2., 2., 4., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_moveldup_ps() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let r = _mm_maskz_moveldup_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_moveldup_ps(0b00001111, a);
+ let e = _mm_set_ps(2., 2., 4., 4.);
+ assert_eq_m128(r, e);
+ }
+
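+ // _mm512_movehdup_ps is the mirror image: each odd-indexed element is
+ // duplicated into the even slot below it (1,1,3,3,...).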
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_movehdup_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let r = _mm512_movehdup_ps(a);
+ let e = _mm512_setr_ps(
+ 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_movehdup_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let r = _mm512_mask_movehdup_ps(a, 0, a);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_movehdup_ps(a, 0b11111111_11111111, a);
+ let e = _mm512_setr_ps(
+ 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_movehdup_ps() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let r = _mm512_maskz_movehdup_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_movehdup_ps(0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 2., 2., 4., 4., 6., 6., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_movehdup_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_mask_movehdup_ps(a, 0, a);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_movehdup_ps(a, 0b11111111, a);
+ let e = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_movehdup_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm256_maskz_movehdup_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_movehdup_ps(0b11111111, a);
+ let e = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_movehdup_ps() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let r = _mm_mask_movehdup_ps(a, 0, a);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_movehdup_ps(a, 0b00001111, a);
+ let e = _mm_set_ps(1., 1., 3., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_movehdup_ps() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let r = _mm_maskz_movehdup_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_movehdup_ps(0b00001111, a);
+ let e = _mm_set_ps(1., 1., 3., 3.);
+ assert_eq_m128(r, e);
+ }
+
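+ // _mm512_inserti32x4 overwrites the 128-bit lane selected by IMM8 with
+ // `b`, leaving the other lanes of `a` untouched.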
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_inserti32x4() {
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm_setr_epi32(17, 18, 19, 20);
+ let r = _mm512_inserti32x4::<0>(a, b);
+ let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_inserti32x4() {
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm_setr_epi32(17, 18, 19, 20);
+ let r = _mm512_mask_inserti32x4::<0>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_inserti32x4::<0>(a, 0b11111111_11111111, a, b);
+ let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_inserti32x4() {
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm_setr_epi32(17, 18, 19, 20);
+ let r = _mm512_maskz_inserti32x4::<0>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_inserti32x4::<0>(0b00000000_11111111, a, b);
+ let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_inserti32x4() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm256_inserti32x4::<1>(a, b);
+ let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_inserti32x4() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm256_mask_inserti32x4::<1>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_inserti32x4::<1>(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_inserti32x4() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm256_maskz_inserti32x4::<1>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_inserti32x4::<1>(0b11111111, a, b);
+ let e = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
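+ // _mm512_insertf32x4 is the f32 counterpart: IMM8 selects which 128-bit
+ // lane of `a` is replaced by `b`.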
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_insertf32x4() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm_setr_ps(17., 18., 19., 20.);
+ let r = _mm512_insertf32x4::<0>(a, b);
+ let e = _mm512_setr_ps(
+ 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_insertf32x4() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm_setr_ps(17., 18., 19., 20.);
+ let r = _mm512_mask_insertf32x4::<0>(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_insertf32x4::<0>(a, 0b11111111_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_insertf32x4() {
+ let a = _mm512_setr_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm_setr_ps(17., 18., 19., 20.);
+ let r = _mm512_maskz_insertf32x4::<0>(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_insertf32x4::<0>(0b00000000_11111111, a, b);
+ let e = _mm512_setr_ps(
+ 17., 18., 19., 20., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_insertf32x4() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm256_insertf32x4::<1>(a, b);
+ let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_insertf32x4() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm256_mask_insertf32x4::<1>(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_insertf32x4::<1>(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_insertf32x4() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm256_maskz_insertf32x4::<1>(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_insertf32x4::<1>(0b11111111, a, b);
+ let e = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.);
+ assert_eq_m256(r, e);
+ }
+
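+ // The castps128/256_ps512 widening casts do not zero the upper lanes;
+ // this implementation fills them with -1.0 (which is what the expected
+ // values below encode), whereas the zext variants guarantee zeros.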
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castps128_ps512() {
+ let a = _mm_setr_ps(17., 18., 19., 20.);
+ let r = _mm512_castps128_ps512(a);
+ let e = _mm512_setr_ps(
+ 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castps256_ps512() {
+ let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm512_castps256_ps512(a);
+ let e = _mm512_setr_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_zextps128_ps512() {
+ let a = _mm_setr_ps(17., 18., 19., 20.);
+ let r = _mm512_zextps128_ps512(a);
+ let e = _mm512_setr_ps(
+ 17., 18., 19., 20., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_zextps256_ps512() {
+ let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm512_zextps256_ps512(a);
+ let e = _mm512_setr_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castps512_ps128() {
+ let a = _mm512_setr_ps(
+ 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
+ );
+ let r = _mm512_castps512_ps128(a);
+ let e = _mm_setr_ps(17., 18., 19., 20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castps512_ps256() {
+ let a = _mm512_setr_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1.,
+ );
+ let r = _mm512_castps512_ps256(a);
+ let e = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
+ assert_eq_m256(r, e);
+ }
+
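+ // castps_pd and castps_si512 are pure bitcasts: 1.0f32 is 0x3F80_0000,
+ // so a pair of them read as an f64 gives ~0.007812501848093234 and a
+ // single one read as an i32 gives 1065353216.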
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castps_pd() {
+ let a = _mm512_set1_ps(1.);
+ let r = _mm512_castps_pd(a);
+ let e = _mm512_set1_pd(0.007812501848093234);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castps_si512() {
+ let a = _mm512_set1_ps(1.);
+ let r = _mm512_castps_si512(a);
+ let e = _mm512_set1_epi32(1065353216);
+ assert_eq_m512i(r, e);
+ }
+
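+ // broadcastd/broadcastss replicate the lowest element of `a` to every
+ // position; with _mm_set_epi32(17, 18, 19, 20) (arguments listed high
+ // to low) that element is 20.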
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_broadcastd_epi32() {
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm512_broadcastd_epi32(a);
+ let e = _mm512_set1_epi32(20);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_broadcastd_epi32() {
+ let src = _mm512_set1_epi32(20);
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm512_mask_broadcastd_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a);
+ let e = _mm512_set1_epi32(20);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_broadcastd_epi32() {
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm512_maskz_broadcastd_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a);
+ let e = _mm512_setr_epi32(20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_broadcastd_epi32() {
+ let src = _mm256_set1_epi32(20);
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm256_mask_broadcastd_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_broadcastd_epi32(src, 0b11111111, a);
+ let e = _mm256_set1_epi32(20);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_broadcastd_epi32() {
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm256_maskz_broadcastd_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_broadcastd_epi32(0b11111111, a);
+ let e = _mm256_set1_epi32(20);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_broadcastd_epi32() {
+ let src = _mm_set1_epi32(20);
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm_mask_broadcastd_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_broadcastd_epi32(src, 0b00001111, a);
+ let e = _mm_set1_epi32(20);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_broadcastd_epi32() {
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm_maskz_broadcastd_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_broadcastd_epi32(0b00001111, a);
+ let e = _mm_set1_epi32(20);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_broadcastss_ps() {
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm512_broadcastss_ps(a);
+ let e = _mm512_set1_ps(20.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_broadcastss_ps() {
+ let src = _mm512_set1_ps(20.);
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm512_mask_broadcastss_ps(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a);
+ let e = _mm512_set1_ps(20.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_broadcastss_ps() {
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm512_maskz_broadcastss_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a);
+ let e = _mm512_setr_ps(
+ 20., 20., 20., 20., 20., 20., 20., 20., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_broadcastss_ps() {
+ let src = _mm256_set1_ps(20.);
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm256_mask_broadcastss_ps(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm256_mask_broadcastss_ps(src, 0b11111111, a);
+ let e = _mm256_set1_ps(20.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_broadcastss_ps() {
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm256_maskz_broadcastss_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_broadcastss_ps(0b11111111, a);
+ let e = _mm256_set1_ps(20.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_broadcastss_ps() {
+ let src = _mm_set1_ps(20.);
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm_mask_broadcastss_ps(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm_mask_broadcastss_ps(src, 0b00001111, a);
+ let e = _mm_set1_ps(20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_broadcastss_ps() {
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm_maskz_broadcastss_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_broadcastss_ps(0b00001111, a);
+ let e = _mm_set1_ps(20.);
+ assert_eq_m128(r, e);
+ }
+
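+ // broadcast_i32x4/f32x4 repeat the whole 128-bit source vector into
+ // every lane of the destination.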
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_broadcast_i32x4() {
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm512_broadcast_i32x4(a);
+ let e = _mm512_set_epi32(
+ 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_broadcast_i32x4() {
+ let src = _mm512_set1_epi32(20);
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm512_mask_broadcast_i32x4(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a);
+ let e = _mm512_set_epi32(
+ 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_broadcast_i32x4() {
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm512_maskz_broadcast_i32x4(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 17, 18, 19, 20);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_broadcast_i32x4() {
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm256_broadcast_i32x4(a);
+ let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_broadcast_i32x4() {
+ let src = _mm256_set1_epi32(20);
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm256_mask_broadcast_i32x4(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_broadcast_i32x4(src, 0b11111111, a);
+ let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_broadcast_i32x4() {
+ let a = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm256_maskz_broadcast_i32x4(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_broadcast_i32x4(0b11111111, a);
+ let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_broadcast_f32x4() {
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm512_broadcast_f32x4(a);
+ let e = _mm512_set_ps(
+ 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_broadcast_f32x4() {
+ let src = _mm512_set1_ps(20.);
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm512_mask_broadcast_f32x4(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a);
+ let e = _mm512_set_ps(
+ 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_broadcast_f32x4() {
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm512_maskz_broadcast_f32x4(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_broadcast_f32x4() {
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm256_broadcast_f32x4(a);
+ let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_broadcast_f32x4() {
+ let src = _mm256_set1_ps(20.);
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm256_mask_broadcast_f32x4(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm256_mask_broadcast_f32x4(src, 0b11111111, a);
+ let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_broadcast_f32x4() {
+ let a = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm256_maskz_broadcast_f32x4(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_broadcast_f32x4(0b11111111, a);
+ let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.);
+ assert_eq_m256(r, e);
+ }
+
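+ // mask_blend picks each element from `b` where the mask bit is set and
+ // from `a` where it is clear.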
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_blend_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(2);
+ let r = _mm512_mask_blend_epi32(0b11111111_00000000, a, b);
+ let e = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_blend_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set1_epi32(2);
+ let r = _mm256_mask_blend_epi32(0b11111111, a, b);
+ let e = _mm256_set1_epi32(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_blend_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set1_epi32(2);
+ let r = _mm_mask_blend_epi32(0b00001111, a, b);
+ let e = _mm_set1_epi32(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_blend_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(2.);
+ let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b);
+ let e = _mm512_set_ps(
+ 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_blend_ps() {
+ let a = _mm256_set1_ps(1.);
+ let b = _mm256_set1_ps(2.);
+ let r = _mm256_mask_blend_ps(0b11111111, a, b);
+ let e = _mm256_set1_ps(2.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_blend_ps() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let r = _mm_mask_blend_ps(0b00001111, a, b);
+ let e = _mm_set1_ps(2.);
+ assert_eq_m128(r, e);
+ }
+
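+ // unpackhi_epi32 interleaves the upper halves of each 128-bit lane: in
+ // memory order each lane becomes (a2, b2, a3, b3).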
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_unpackhi_epi32() {
+ let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm512_set_epi32(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_unpackhi_epi32(a, b);
+ let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_unpackhi_epi32() {
+ let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm512_set_epi32(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_mask_unpackhi_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_unpackhi_epi32(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_unpackhi_epi32() {
+ let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm512_set_epi32(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_maskz_unpackhi_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_unpackhi_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 9, 26, 10, 29, 13, 30, 14);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_unpackhi_epi32() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm256_mask_unpackhi_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_unpackhi_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_unpackhi_epi32() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm256_maskz_unpackhi_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_unpackhi_epi32(0b11111111, a, b);
+ let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_unpackhi_epi32() {
+ let a = _mm_set_epi32(1, 2, 3, 4);
+ let b = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm_mask_unpackhi_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_unpackhi_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(17, 1, 18, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_unpackhi_epi32() {
+ let a = _mm_set_epi32(1, 2, 3, 4);
+ let b = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm_maskz_unpackhi_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_unpackhi_epi32(0b00001111, a, b);
+ let e = _mm_set_epi32(17, 1, 18, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_unpackhi_ps() {
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+ );
+ let r = _mm512_unpackhi_ps(a, b);
+ let e = _mm512_set_ps(
+ 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_unpackhi_ps() {
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+ );
+ let r = _mm512_mask_unpackhi_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_unpackhi_ps(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set_ps(
+ 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_unpackhi_ps() {
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+ );
+ let r = _mm512_maskz_unpackhi_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_unpackhi_ps(0b00000000_11111111, a, b);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 25., 9., 26., 10., 29., 13., 30., 14.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_unpackhi_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm256_mask_unpackhi_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_unpackhi_ps(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_unpackhi_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm256_maskz_unpackhi_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_unpackhi_ps(0b11111111, a, b);
+ let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_unpackhi_ps() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm_mask_unpackhi_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_unpackhi_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(17., 1., 18., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_unpackhi_ps() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm_maskz_unpackhi_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_unpackhi_ps(0b00001111, a, b);
+ let e = _mm_set_ps(17., 1., 18., 2.);
+ assert_eq_m128(r, e);
+ }
+
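+ // unpacklo_epi32 interleaves the lower halves of each 128-bit lane: in
+ // memory order each lane becomes (a0, b0, a1, b1).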
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_unpacklo_epi32() {
+ let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm512_set_epi32(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_unpacklo_epi32(a, b);
+ let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_unpacklo_epi32() {
+ let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm512_set_epi32(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_mask_unpacklo_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_unpacklo_epi32(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_unpacklo_epi32() {
+ let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let b = _mm512_set_epi32(
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm512_maskz_unpacklo_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_unpacklo_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 27, 11, 28, 12, 31, 15, 32, 16);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_unpacklo_epi32() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm256_mask_unpacklo_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_unpacklo_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_unpacklo_epi32() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm256_maskz_unpacklo_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_unpacklo_epi32(0b11111111, a, b);
+ let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_unpacklo_epi32() {
+ let a = _mm_set_epi32(1, 2, 3, 4);
+ let b = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm_mask_unpacklo_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_unpacklo_epi32(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(19, 3, 20, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_unpacklo_epi32() {
+ let a = _mm_set_epi32(1, 2, 3, 4);
+ let b = _mm_set_epi32(17, 18, 19, 20);
+ let r = _mm_maskz_unpacklo_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_unpacklo_epi32(0b00001111, a, b);
+ let e = _mm_set_epi32(19, 3, 20, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_unpacklo_ps() {
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+ );
+ let r = _mm512_unpacklo_ps(a, b);
+ let e = _mm512_set_ps(
+ 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_unpacklo_ps() {
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+ );
+ let r = _mm512_mask_unpacklo_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_unpacklo_ps(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set_ps(
+ 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_unpacklo_ps() {
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+ );
+ let r = _mm512_maskz_unpacklo_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_unpacklo_ps(0b00000000_11111111, a, b);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 27., 11., 28., 12., 31., 15., 32., 16.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_unpacklo_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm256_mask_unpacklo_ps(a, 0, a, b);
+ assert_eq_m256(r, a);
+ let r = _mm256_mask_unpacklo_ps(a, 0b11111111, a, b);
+ let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_unpacklo_ps() {
+ let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm256_maskz_unpacklo_ps(0, a, b);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_unpacklo_ps(0b11111111, a, b);
+ let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_unpacklo_ps() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm_mask_unpacklo_ps(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_unpacklo_ps(a, 0b00001111, a, b);
+ let e = _mm_set_ps(19., 3., 20., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_unpacklo_ps() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ps(17., 18., 19., 20.);
+ let r = _mm_maskz_unpacklo_ps(0, a, b);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_unpacklo_ps(0b00001111, a, b);
+ let e = _mm_set_ps(19., 3., 20., 4.);
+ assert_eq_m128(r, e);
+ }
+
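+ // valignd concatenates `a` (high) and `b` (low) and shifts the pair
+ // right by IMM8 32-bit elements, keeping the low half. Only IMM8[3:0]
+ // is used for the 512-bit form, so a shift of 16 wraps to 0 and simply
+ // returns `b`.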
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_alignr_epi32() {
+ let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm512_set_epi32(
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ );
+ let r = _mm512_alignr_epi32::<0>(a, b);
+ assert_eq_m512i(r, b);
+ let r = _mm512_alignr_epi32::<16>(a, b);
+ assert_eq_m512i(r, b);
+ let r = _mm512_alignr_epi32::<1>(a, b);
+ let e = _mm512_set_epi32(
+ 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_alignr_epi32() {
+ let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm512_set_epi32(
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ );
+ let r = _mm512_mask_alignr_epi32::<1>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_alignr_epi32::<1>(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set_epi32(
+ 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_alignr_epi32() {
+ let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm512_set_epi32(
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ );
+ let r = _mm512_maskz_alignr_epi32::<1>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_alignr_epi32::<1>(0b00000000_11111111, a, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 23, 22, 21, 20, 19, 18);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_alignr_epi32() {
+ let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9);
+ let r = _mm256_alignr_epi32::<0>(a, b);
+ assert_eq_m256i(r, b);
+ let r = _mm256_alignr_epi32::<1>(a, b);
+ let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_alignr_epi32() {
+ let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9);
+ let r = _mm256_mask_alignr_epi32::<1>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_alignr_epi32::<1>(a, 0b11111111, a, b);
+ let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_alignr_epi32() {
+ let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9);
+ let r = _mm256_maskz_alignr_epi32::<1>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_alignr_epi32::<1>(0b11111111, a, b);
+ let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_alignr_epi32() {
+ let a = _mm_set_epi32(4, 3, 2, 1);
+ let b = _mm_set_epi32(8, 7, 6, 5);
+ let r = _mm_alignr_epi32::<0>(a, b);
+ assert_eq_m128i(r, b);
+ let r = _mm_alignr_epi32::<1>(a, b);
+ let e = _mm_set_epi32(1, 8, 7, 6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_alignr_epi32() {
+ let a = _mm_set_epi32(4, 3, 2, 1);
+ let b = _mm_set_epi32(8, 7, 6, 5);
+ let r = _mm_mask_alignr_epi32::<1>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_alignr_epi32::<1>(a, 0b00001111, a, b);
+ let e = _mm_set_epi32(1, 8, 7, 6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_alignr_epi32() {
+ let a = _mm_set_epi32(4, 3, 2, 1);
+ let b = _mm_set_epi32(8, 7, 6, 5);
+ let r = _mm_maskz_alignr_epi32::<1>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_alignr_epi32::<1>(0b00001111, a, b);
+ let e = _mm_set_epi32(1, 8, 7, 6);
+ assert_eq_m128i(r, e);
+ }
+
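+ // and/or/xor_epi32 are plain bitwise operations on 32-bit elements; as
+ // with the other masked intrinsics, the mask variants take unselected
+ // elements from `src` (or zero them).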
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_and_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_and_epi32(a, b);
+ let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_and_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_mask_and_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_and_epi32(a, 0b01111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_and_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_maskz_and_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_and_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_and_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_mask_and_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_and_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(1 << 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_and_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_maskz_and_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_and_epi32(0b11111111, a, b);
+ let e = _mm256_set1_epi32(1 << 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_and_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_mask_and_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_and_epi32(a, 0b00001111, a, b);
+ let e = _mm_set1_epi32(1 << 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_and_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_maskz_and_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_and_epi32(0b00001111, a, b);
+ let e = _mm_set1_epi32(1 << 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_and_si512() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_and_si512(a, b);
+ let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_or_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_or_epi32(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_or_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_mask_or_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_or_epi32(a, 0b11111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_or_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_maskz_or_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_or_epi32(0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_or_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_or_epi32(a, b);
+ let e = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_or_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_mask_or_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_or_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_or_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_maskz_or_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_or_epi32(0b11111111, a, b);
+ let e = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_or_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_or_epi32(a, b);
+ let e = _mm_set1_epi32(1 << 1 | 1 << 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_or_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_mask_or_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_or_epi32(a, 0b00001111, a, b);
+ let e = _mm_set1_epi32(1 << 1 | 1 << 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_or_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_maskz_or_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_or_epi32(0b00001111, a, b);
+ let e = _mm_set1_epi32(1 << 1 | 1 << 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_or_si512() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_or_si512(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_xor_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_xor_epi32(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_xor_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_mask_xor_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_xor_epi32(a, 0b01111111_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_xor_epi32() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_maskz_xor_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_xor_epi32(0b00000000_11111111, a, b);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_xor_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_xor_epi32(a, b);
+ let e = _mm256_set1_epi32(1 << 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_xor_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_mask_xor_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_xor_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(1 << 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_xor_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_maskz_xor_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_xor_epi32(0b11111111, a, b);
+ let e = _mm256_set1_epi32(1 << 2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_xor_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_xor_epi32(a, b);
+ let e = _mm_set1_epi32(1 << 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_xor_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_mask_xor_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_xor_epi32(a, 0b00001111, a, b);
+ let e = _mm_set1_epi32(1 << 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_xor_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_maskz_xor_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_xor_epi32(0b00001111, a, b);
+ let e = _mm_set1_epi32(1 << 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_xor_si512() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi32(
+ 1 << 1 | 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 3,
+ );
+ #[rustfmt::skip]
+ let b = _mm512_set_epi32(
+ 1 << 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 3 | 1 << 4,
+ );
+ let r = _mm512_xor_si512(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 1 << 2, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
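+ // andnot computes (NOT a) AND b per element, so with `a` all zeros the
+ // result is just `b`.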
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_andnot_epi32() {
+ let a = _mm512_set1_epi32(0);
+ let b = _mm512_set1_epi32(1 << 3 | 1 << 4);
+ let r = _mm512_andnot_epi32(a, b);
+ let e = _mm512_set1_epi32(1 << 3 | 1 << 4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_andnot_epi32() {
+ let a = _mm512_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm512_set1_epi32(1 << 3 | 1 << 4);
+ let r = _mm512_mask_andnot_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_andnot_epi32(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(1 << 3 | 1 << 4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_andnot_epi32() {
+ let a = _mm512_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm512_set1_epi32(1 << 3 | 1 << 4);
+ let r = _mm512_maskz_andnot_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi32(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4,
+ 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_andnot_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 3 | 1 << 4);
+ let r = _mm256_mask_andnot_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_andnot_epi32(a, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(1 << 3 | 1 << 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_andnot_epi32() {
+ let a = _mm256_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi32(1 << 3 | 1 << 4);
+ let r = _mm256_maskz_andnot_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_andnot_epi32(0b11111111, a, b);
+ let e = _mm256_set1_epi32(1 << 3 | 1 << 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_andnot_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 3 | 1 << 4);
+ let r = _mm_mask_andnot_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_andnot_epi32(a, 0b00001111, a, b);
+ let e = _mm_set1_epi32(1 << 3 | 1 << 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_andnot_epi32() {
+ let a = _mm_set1_epi32(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi32(1 << 3 | 1 << 4);
+ let r = _mm_maskz_andnot_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_andnot_epi32(0b00001111, a, b);
+ let e = _mm_set1_epi32(1 << 3 | 1 << 4);
+ assert_eq_m128i(r, e);
+ }
+
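+ // The mask-register intrinsics below (kand/kor/kxor/knot/kandn/kxnor) are
+ // plain bitwise operations on 16-bit __mmask16 values; e.g. kandn computes
+ // `!a & b`, so (in 4 bits) kandn(0b1100, 0b1010) == 0b0010.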
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_kand() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b11001100_00110011;
+ let r = _mm512_kand(a, b);
+ let e: u16 = 0b11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_kand_mask16() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b11001100_00110011;
+ let r = _kand_mask16(a, b);
+ let e: u16 = 0b11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_kor() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _mm512_kor(a, b);
+ let e: u16 = 0b11101110_00111011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_kor_mask16() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _kor_mask16(a, b);
+ let e: u16 = 0b11101110_00111011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_kxor() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _mm512_kxor(a, b);
+ let e: u16 = 0b11100010_00111000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_kxor_mask16() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _kxor_mask16(a, b);
+ let e: u16 = 0b11100010_00111000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_knot() {
+ let a: u16 = 0b11001100_00110011;
+ let r = _mm512_knot(a);
+ let e: u16 = 0b00110011_11001100;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_knot_mask16() {
+ let a: u16 = 0b11001100_00110011;
+ let r = _knot_mask16(a);
+ let e: u16 = 0b00110011_11001100;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_kandn() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _mm512_kandn(a, b);
+ let e: u16 = 0b00100010_00001000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_kandn_mask16() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _kandn_mask16(a, b);
+ let e: u16 = 0b00100010_00001000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_kxnor() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _mm512_kxnor(a, b);
+ let e: u16 = 0b00011101_11000111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_kxnor_mask16() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _kxnor_mask16(a, b);
+ let e: u16 = 0b00011101_11000111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_kmov() {
+ let a: u16 = 0b11001100_00110011;
+ let r = _mm512_kmov(a);
+ let e: u16 = 0b11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_int2mask() {
+ let a: i32 = 0b11001100_00110011;
+ let r = _mm512_int2mask(a);
+ let e: u16 = 0b11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask2int() {
+ let k1: __mmask16 = 0b11001100_00110011;
+ let r = _mm512_mask2int(k1);
+ let e: i32 = 0b11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_kunpackb() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _mm512_kunpackb(a, b);
+ let e: u16 = 0b00101110_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_kortestc() {
+ let a: u16 = 0b11001100_00110011;
+ let b: u16 = 0b00101110_00001011;
+ let r = _mm512_kortestc(a, b);
+ assert_eq!(r, 0);
+ let b: u16 = 0b11111111_11111111;
+ let r = _mm512_kortestc(a, b);
+ assert_eq!(r, 1);
+ }
+
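+ // `test` sets mask bit i when lane i of `a & b` is nonzero; `testn` sets it
+ // when that AND is zero. The `mask_` variants additionally clear any result
+ // bit whose corresponding input-mask bit is zero.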
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_test_epi32_mask() {
+ let a = _mm512_set1_epi32(1 << 0);
+ let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm512_test_epi32_mask(a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_test_epi32_mask() {
+ let a = _mm512_set1_epi32(1 << 0);
+ let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm512_mask_test_epi32_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_test_epi32_mask(0b11111111_11111111, a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_test_epi32_mask() {
+ let a = _mm256_set1_epi32(1 << 0);
+ let b = _mm256_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm256_test_epi32_mask(a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_test_epi32_mask() {
+ let a = _mm256_set1_epi32(1 << 0);
+ let b = _mm256_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm256_mask_test_epi32_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm256_mask_test_epi32_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_test_epi32_mask() {
+ let a = _mm_set1_epi32(1 << 0);
+ let b = _mm_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm_test_epi32_mask(a, b);
+ let e: __mmask8 = 0b00001111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_test_epi32_mask() {
+ let a = _mm_set1_epi32(1 << 0);
+ let b = _mm_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm_mask_test_epi32_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm_mask_test_epi32_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b00001111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_testn_epi32_mask() {
+ let a = _mm512_set1_epi32(1 << 0);
+ let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm512_testn_epi32_mask(a, b);
+ let e: __mmask16 = 0b00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_testn_epi32_mask() {
+ let a = _mm512_set1_epi32(1 << 0);
+ let b = _mm512_set1_epi32(1 << 1);
+ let r = _mm512_mask_testn_epi32_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_testn_epi32_mask() {
+ let a = _mm256_set1_epi32(1 << 0);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_testn_epi32_mask(a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_testn_epi32_mask() {
+ let a = _mm256_set1_epi32(1 << 0);
+ let b = _mm256_set1_epi32(1 << 1);
+ let r = _mm256_mask_testn_epi32_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm256_mask_testn_epi32_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_testn_epi32_mask() {
+ let a = _mm_set1_epi32(1 << 0);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_testn_epi32_mask(a, b);
+ let e: __mmask8 = 0b00001111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_testn_epi32_mask() {
+ let a = _mm_set1_epi32(1 << 0);
+ let b = _mm_set1_epi32(1 << 1);
+ let r = _mm_mask_testn_epi32_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm_mask_testn_epi32_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b00001111;
+ assert_eq!(r, e);
+ }
+
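+ // `_mm512_stream_ps` is a non-temporal store and requires a 64-byte-aligned
+ // destination, hence the aligned wrapper struct below.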
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_stream_ps() {
+ #[repr(align(64))]
+ struct Memory {
+ pub data: [f32; 16],
+ }
+ let a = _mm512_set1_ps(7.0);
+ let mut mem = Memory { data: [-1.0; 16] };
+
+ _mm512_stream_ps(&mut mem.data[0] as *mut f32, a);
+ for i in 0..16 {
+ assert_eq!(mem.data[i], get_m512(a, i));
+ }
+ }
+
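+ // The `reduce` family folds all 16 lanes into a scalar. In the masked
+ // variants, unselected lanes contribute the operation's identity element
+ // (0 for add/or, 1 for mul, all-ones for and), so only 8 lanes count below.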
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_add_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let e: i32 = _mm512_reduce_add_epi32(a);
+ assert_eq!(16, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_add_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let e: i32 = _mm512_mask_reduce_add_epi32(0b11111111_00000000, a);
+ assert_eq!(8, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_add_ps() {
+ let a = _mm512_set1_ps(1.);
+ let e: f32 = _mm512_reduce_add_ps(a);
+ assert_eq!(16., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_add_ps() {
+ let a = _mm512_set1_ps(1.);
+ let e: f32 = _mm512_mask_reduce_add_ps(0b11111111_00000000, a);
+ assert_eq!(8., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_mul_epi32() {
+ let a = _mm512_set1_epi32(2);
+ let e: i32 = _mm512_reduce_mul_epi32(a);
+ assert_eq!(65536, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_mul_epi32() {
+ let a = _mm512_set1_epi32(2);
+ let e: i32 = _mm512_mask_reduce_mul_epi32(0b11111111_00000000, a);
+ assert_eq!(256, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_mul_ps() {
+ let a = _mm512_set1_ps(2.);
+ let e: f32 = _mm512_reduce_mul_ps(a);
+ assert_eq!(65536., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_mul_ps() {
+ let a = _mm512_set1_ps(2.);
+ let e: f32 = _mm512_mask_reduce_mul_ps(0b11111111_00000000, a);
+ assert_eq!(256., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_max_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i32 = _mm512_reduce_max_epi32(a);
+ assert_eq!(15, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_max_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i32 = _mm512_mask_reduce_max_epi32(0b11111111_00000000, a);
+ assert_eq!(7, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_max_epu32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u32 = _mm512_reduce_max_epu32(a);
+ assert_eq!(15, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_max_epu32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u32 = _mm512_mask_reduce_max_epu32(0b11111111_00000000, a);
+ assert_eq!(7, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_max_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let e: f32 = _mm512_reduce_max_ps(a);
+ assert_eq!(15., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_max_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let e: f32 = _mm512_mask_reduce_max_ps(0b11111111_00000000, a);
+ assert_eq!(7., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_min_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i32 = _mm512_reduce_min_epi32(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_min_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: i32 = _mm512_mask_reduce_min_epi32(0b11111111_00000000, a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_min_epu32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u32 = _mm512_reduce_min_epu32(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_min_epu32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let e: u32 = _mm512_mask_reduce_min_epu32(0b11111111_00000000, a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_min_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let e: f32 = _mm512_reduce_min_ps(a);
+ assert_eq!(0., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_min_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let e: f32 = _mm512_mask_reduce_min_ps(0b11111111_00000000, a);
+ assert_eq!(0., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_and_epi32() {
+ let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i32 = _mm512_reduce_and_epi32(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_and_epi32() {
+ let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i32 = _mm512_mask_reduce_and_epi32(0b11111111_00000000, a);
+ assert_eq!(1, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_or_epi32() {
+ let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i32 = _mm512_reduce_or_epi32(a);
+ assert_eq!(3, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_or_epi32() {
+ let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+ let e: i32 = _mm512_mask_reduce_or_epi32(0b11111111_00000000, a);
+ assert_eq!(1, e);
+ }
+
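+ // `compress` packs the mask-selected lanes of `a` toward the low end of the
+ // vector in index order; the remaining lanes come from `src` (mask variant)
+ // or are zeroed (maskz variant).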
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_compress_epi32() {
+ let src = _mm512_set1_epi32(200);
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_mask_compress_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_compress_epi32(src, 0b01010101_01010101, a);
+ let e = _mm512_set_epi32(
+ 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_compress_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_compress_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_compress_epi32(0b01010101_01010101, a);
+ let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_compress_epi32() {
+ let src = _mm256_set1_epi32(200);
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_mask_compress_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_compress_epi32(src, 0b01010101, a);
+ let e = _mm256_set_epi32(200, 200, 200, 200, 1, 3, 5, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_compress_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_maskz_compress_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_compress_epi32(0b01010101, a);
+ let e = _mm256_set_epi32(0, 0, 0, 0, 1, 3, 5, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_compress_epi32() {
+ let src = _mm_set1_epi32(200);
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let r = _mm_mask_compress_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_compress_epi32(src, 0b00000101, a);
+ let e = _mm_set_epi32(200, 200, 1, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_compress_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let r = _mm_maskz_compress_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_compress_epi32(0b00000101, a);
+ let e = _mm_set_epi32(0, 0, 1, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_compress_ps() {
+ let src = _mm512_set1_ps(200.);
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_mask_compress_ps(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_compress_ps(src, 0b01010101_01010101, a);
+ let e = _mm512_set_ps(
+ 200., 200., 200., 200., 200., 200., 200., 200., 1., 3., 5., 7., 9., 11., 13., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_compress_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_maskz_compress_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_compress_ps(0b01010101_01010101, a);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1., 3., 5., 7., 9., 11., 13., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_compress_ps() {
+ let src = _mm256_set1_ps(200.);
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_mask_compress_ps(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm256_mask_compress_ps(src, 0b01010101, a);
+ let e = _mm256_set_ps(200., 200., 200., 200., 1., 3., 5., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_compress_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_maskz_compress_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_compress_ps(0b01010101, a);
+ let e = _mm256_set_ps(0., 0., 0., 0., 1., 3., 5., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_compress_ps() {
+ let src = _mm_set1_ps(200.);
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let r = _mm_mask_compress_ps(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm_mask_compress_ps(src, 0b00000101, a);
+ let e = _mm_set_ps(200., 200., 1., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_compress_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let r = _mm_maskz_compress_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_compress_ps(0b00000101, a);
+ let e = _mm_set_ps(0., 0., 1., 3.);
+ assert_eq_m128(r, e);
+ }
+
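+ // `compressstoreu` writes only the selected lanes, contiguously and
+ // unaligned, to memory; bytes past the last selected element are left
+ // untouched, which is why the tail of each buffer stays zero.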
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_compressstoreu_epi32() {
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let mut r = [0_i32; 16];
+ _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i32; 16]);
+ _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
+ assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_compressstoreu_epi32() {
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let mut r = [0_i32; 8];
+ _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i32; 8]);
+ _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b11001010, a);
+ assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_compressstoreu_epi32() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let mut r = [0_i32; 4];
+ _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i32; 4]);
+ _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1011, a);
+ assert_eq!(&r, &[1, 2, 4, 0]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_compressstoreu_epi64() {
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let mut r = [0_i64; 8];
+ _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i64; 8]);
+ _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b11001010, a);
+ assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_compressstoreu_epi64() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let mut r = [0_i64; 4];
+ _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i64; 4]);
+ _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b1011, a);
+ assert_eq!(&r, &[1, 2, 4, 0]);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_compressstoreu_epi64() {
+ let a = _mm_setr_epi64x(1, 2);
+ let mut r = [0_i64; 2];
+ _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i64; 2]);
+ _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b10, a);
+ assert_eq!(&r, &[2, 0]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_compressstoreu_ps() {
+ let a = _mm512_setr_ps(
+ 1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32,
+ 13_f32, 14_f32, 15_f32, 16_f32,
+ );
+ let mut r = [0_f32; 16];
+ _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_f32; 16]);
+ _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
+ assert_eq!(
+ &r,
+ &[
+ 2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32,
+ 0_f32, 0_f32, 0_f32, 0_f32, 0_f32
+ ]
+ );
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_compressstoreu_ps() {
+ let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32);
+ let mut r = [0_f32; 8];
+ _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_f32; 8]);
+ _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b11001010, a);
+ assert_eq!(
+ &r,
+ &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32]
+ );
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_compressstoreu_ps() {
+ let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32);
+ let mut r = [0.; 4];
+ _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0.; 4]);
+ _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1011, a);
+ assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_compressstoreu_pd() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let mut r = [0.; 8];
+ _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0.; 8]);
+ _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b11001010, a);
+ assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_compressstoreu_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let mut r = [0.; 4];
+ _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0.; 4]);
+ _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b1011, a);
+ assert_eq!(&r, &[1., 2., 4., 0.]);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_compressstoreu_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let mut r = [0.; 2];
+ _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0.; 2]);
+ _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b10, a);
+ assert_eq!(&r, &[2., 0.]);
+ }
+
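+ // `expand` is the inverse of `compress`: the low lanes of `a` are scattered,
+ // in order, into the mask-selected destination lanes; unselected lanes take
+ // `src` or zero.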
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_expand_epi32() {
+ let src = _mm512_set1_epi32(200);
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_mask_expand_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_expand_epi32(src, 0b01010101_01010101, a);
+ let e = _mm512_set_epi32(
+ 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_expand_epi32() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_expand_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_expand_epi32(0b01010101_01010101, a);
+ let e = _mm512_set_epi32(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_expand_epi32() {
+ let src = _mm256_set1_epi32(200);
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_mask_expand_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_expand_epi32(src, 0b01010101, a);
+ let e = _mm256_set_epi32(200, 4, 200, 5, 200, 6, 200, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_expand_epi32() {
+ let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_maskz_expand_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_expand_epi32(0b01010101, a);
+ let e = _mm256_set_epi32(0, 4, 0, 5, 0, 6, 0, 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_expand_epi32() {
+ let src = _mm_set1_epi32(200);
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let r = _mm_mask_expand_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_expand_epi32(src, 0b00000101, a);
+ let e = _mm_set_epi32(200, 2, 200, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_expand_epi32() {
+ let a = _mm_set_epi32(0, 1, 2, 3);
+ let r = _mm_maskz_expand_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_expand_epi32(0b00000101, a);
+ let e = _mm_set_epi32(0, 2, 0, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_expand_ps() {
+ let src = _mm512_set1_ps(200.);
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_mask_expand_ps(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_expand_ps(src, 0b01010101_01010101, a);
+ let e = _mm512_set_ps(
+ 200., 8., 200., 9., 200., 10., 200., 11., 200., 12., 200., 13., 200., 14., 200., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_expand_ps() {
+ let a = _mm512_set_ps(
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+ );
+ let r = _mm512_maskz_expand_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_expand_ps(0b01010101_01010101, a);
+ let e = _mm512_set_ps(
+ 0., 8., 0., 9., 0., 10., 0., 11., 0., 12., 0., 13., 0., 14., 0., 15.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_expand_ps() {
+ let src = _mm256_set1_ps(200.);
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_mask_expand_ps(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm256_mask_expand_ps(src, 0b01010101, a);
+ let e = _mm256_set_ps(200., 4., 200., 5., 200., 6., 200., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_expand_ps() {
+ let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm256_maskz_expand_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm256_maskz_expand_ps(0b01010101, a);
+ let e = _mm256_set_ps(0., 4., 0., 5., 0., 6., 0., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_expand_ps() {
+ let src = _mm_set1_ps(200.);
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let r = _mm_mask_expand_ps(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm_mask_expand_ps(src, 0b00000101, a);
+ let e = _mm_set_ps(200., 2., 200., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_expand_ps() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let r = _mm_maskz_expand_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_expand_ps(0b00000101, a);
+ let e = _mm_set_ps(0., 2., 0., 3.);
+ assert_eq_m128(r, e);
+ }
+
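+ // `loadu` accepts an arbitrarily aligned pointer; `black_box` keeps the
+ // optimizer from seeing through the pointer, so the load itself is exercised.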
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_loadu_epi32() {
+ let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50];
+ let p = a.as_ptr();
+ let r = _mm512_loadu_epi32(black_box(p));
+ let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_loadu_epi32() {
+ let a = &[4, 3, 2, 5, 8, 9, 64, 50];
+ let p = a.as_ptr();
+ let r = _mm256_loadu_epi32(black_box(p));
+ let e = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_loadu_epi32() {
+ let a = &[4, 3, 2, 5];
+ let p = a.as_ptr();
+ let r = _mm_loadu_epi32(black_box(p));
+ let e = _mm_setr_epi32(4, 3, 2, 5);
+ assert_eq_m128i(r, e);
+ }
+
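+ // The `cvt*_storeu` intrinsics narrow each 32-bit lane (plain truncation for
+ // `cvtepi32`, signed saturation for `cvtsepi32`, unsigned saturation for
+ // `cvtusepi32`) and store only the mask-selected results.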
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi32_storeu_epi16() {
+ let a = _mm512_set1_epi32(9);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(9);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi32_storeu_epi16() {
+ let a = _mm256_set1_epi32(9);
+ let mut r = _mm_undefined_si128();
+ _mm256_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set1_epi16(9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi32_storeu_epi16() {
+ let a = _mm_set1_epi32(9);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 9, 9, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi32_storeu_epi16() {
+ let a = _mm512_set1_epi32(i32::MAX);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(i16::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi32_storeu_epi16() {
+ let a = _mm256_set1_epi32(i32::MAX);
+ let mut r = _mm_undefined_si128();
+ _mm256_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set1_epi16(i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi32_storeu_epi16() {
+ let a = _mm_set1_epi32(i32::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi32_storeu_epi16() {
+ let a = _mm512_set1_epi32(i32::MAX);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm256_set1_epi16(u16::MAX as i16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi32_storeu_epi16() {
+ let a = _mm256_set1_epi32(i32::MAX);
+ let mut r = _mm_undefined_si128();
+ _mm256_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set1_epi16(u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi32_storeu_epi16() {
+ let a = _mm_set1_epi32(i32::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi16(
+ 0,
+ 0,
+ 0,
+ 0,
+ u16::MAX as i16,
+ u16::MAX as i16,
+ u16::MAX as i16,
+ u16::MAX as i16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi32_storeu_epi8() {
+ let a = _mm512_set1_epi32(9);
+ let mut r = _mm_undefined_si128();
+ _mm512_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi32_storeu_epi8() {
+ let a = _mm256_set1_epi32(9);
+ let mut r = _mm_set1_epi8(0);
+ _mm256_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi32_storeu_epi8() {
+ let a = _mm_set1_epi32(9);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi32_storeu_epi8() {
+ let a = _mm512_set1_epi32(i32::MAX);
+ let mut r = _mm_undefined_si128();
+ _mm512_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi32_storeu_epi8() {
+ let a = _mm256_set1_epi32(i32::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm256_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi32_storeu_epi8() {
+ let a = _mm_set1_epi32(i32::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi32_storeu_epi8() {
+ let a = _mm512_set1_epi32(i32::MAX);
+ let mut r = _mm_undefined_si128();
+ _mm512_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
+ let e = _mm_set1_epi8(u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi32_storeu_epi8() {
+ let a = _mm256_set1_epi32(i32::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm256_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
+ u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi32_storeu_epi8() {
+ let a = _mm_set1_epi32(i32::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_storeu_epi32() {
+ let a = _mm512_set1_epi32(9);
+ let mut r = _mm512_undefined_epi32();
+ _mm512_storeu_epi32(&mut r as *mut _ as *mut i32, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_storeu_epi32() {
+ let a = _mm256_set1_epi32(9);
+ let mut r = _mm256_undefined_si256();
+ _mm256_storeu_epi32(&mut r as *mut _ as *mut i32, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_storeu_epi32() {
+ let a = _mm_set1_epi32(9);
+ let mut r = _mm_undefined_si128();
+ _mm_storeu_epi32(&mut r as *mut _ as *mut i32, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_loadu_si512() {
+ let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50];
+ let p = a.as_ptr();
+ let r = _mm512_loadu_si512(black_box(p));
+ let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_storeu_si512() {
+ let a = _mm512_set1_epi32(9);
+ let mut r = _mm512_undefined_epi32();
+ _mm512_storeu_si512(&mut r as *mut _ as *mut i32, a);
+ assert_eq_m512i(r, a);
+ }
+
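+ // Unlike `loadu`/`storeu`, the `load`/`store` variants require the pointer
+ // to be aligned to the full vector width (64 bytes for __m512i), hence the
+ // `#[repr(align(64))]` wrappers below.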
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_load_si512() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i32; 16], // 64 bytes
+ }
+ let a = Align {
+ data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50],
+ };
+ let p = (a.data).as_ptr();
+ let r = _mm512_load_si512(black_box(p));
+ let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_store_si512() {
+ let a = _mm512_set1_epi32(9);
+ let mut r = _mm512_undefined_epi32();
+ _mm512_store_si512(&mut r as *mut _ as *mut i32, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_load_epi32() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i32; 16], // 64 bytes
+ }
+ let a = Align {
+ data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50],
+ };
+ let p = (a.data).as_ptr();
+ let r = _mm512_load_epi32(black_box(p));
+ let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_load_epi32() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i32; 8],
+ }
+ let a = Align {
+ data: [4, 3, 2, 5, 8, 9, 64, 50],
+ };
+ let p = (a.data).as_ptr();
+ let r = _mm256_load_epi32(black_box(p));
+ let e = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_load_epi32() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i32; 4],
+ }
+ let a = Align { data: [4, 3, 2, 5] };
+ let p = (a.data).as_ptr();
+ let r = _mm_load_epi32(black_box(p));
+ let e = _mm_setr_epi32(4, 3, 2, 5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_store_epi32() {
+ let a = _mm512_set1_epi32(9);
+ let mut r = _mm512_undefined_epi32();
+ _mm512_store_epi32(&mut r as *mut _ as *mut i32, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_store_epi32() {
+ let a = _mm256_set1_epi32(9);
+ let mut r = _mm256_undefined_si256();
+ _mm256_store_epi32(&mut r as *mut _ as *mut i32, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_store_epi32() {
+ let a = _mm_set1_epi32(9);
+ let mut r = _mm_undefined_si128();
+ _mm_store_epi32(&mut r as *mut _ as *mut i32, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_load_ps() {
+ #[repr(align(64))]
+ struct Align {
+ data: [f32; 16], // 64 bytes
+ }
+ let a = Align {
+ data: [
+ 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
+ ],
+ };
+ let p = (a.data).as_ptr();
+ let r = _mm512_load_ps(black_box(p));
+ let e = _mm512_setr_ps(
+ 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_store_ps() {
+ let a = _mm512_set1_ps(9.);
+ let mut r = _mm512_undefined_ps();
+ _mm512_store_ps(&mut r as *mut _ as *mut f32, a);
+ assert_eq_m512(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_set1_epi32() {
+ let src = _mm512_set1_epi32(2);
+ let a: i32 = 11;
+ let r = _mm512_mask_set1_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_set1_epi32(src, 0b11111111_11111111, a);
+ let e = _mm512_set1_epi32(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_set1_epi32() {
+ let a: i32 = 11;
+ let r = _mm512_maskz_set1_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_set1_epi32(0b11111111_11111111, a);
+ let e = _mm512_set1_epi32(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_set1_epi32() {
+ let src = _mm256_set1_epi32(2);
+ let a: i32 = 11;
+ let r = _mm256_mask_set1_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_set1_epi32(src, 0b11111111, a);
+ let e = _mm256_set1_epi32(11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm256_maskz_set1_epi32() {
+ let a: i32 = 11;
+ let r = _mm256_maskz_set1_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_set1_epi32(0b11111111, a);
+ let e = _mm256_set1_epi32(11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_set1_epi32() {
+ let src = _mm_set1_epi32(2);
+ let a: i32 = 11;
+ let r = _mm_mask_set1_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_set1_epi32(src, 0b00001111, a);
+ let e = _mm_set1_epi32(11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_set1_epi32() {
+ let a: i32 = 11;
+ let r = _mm_maskz_set1_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_set1_epi32(0b00001111, a);
+ let e = _mm_set1_epi32(11);
+ assert_eq_m128i(r, e);
+ }
+
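+ // The masked scalar (`_ss`/`_sd`) operations touch only the lowest lane: the
+ // upper lanes are copied from the first vector operand, while the low lane is
+ // the operation result, `src`'s low lane, or zero, depending on mask bit 0.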
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_move_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_move_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_move_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 40.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_move_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_move_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_move_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 40.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_move_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_move_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_move_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 4.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_move_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_move_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_move_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 4.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_add_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_add_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_add_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_add_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_add_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_add_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_add_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_add_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_add_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_add_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_add_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_add_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sub_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_sub_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_sub_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sub_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_sub_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_sub_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sub_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_sub_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_sub_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sub_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_sub_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_sub_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_mul_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_mul_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_mul_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_mul_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_mul_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_mul_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_mul_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_mul_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_mul_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_mul_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_mul_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_mul_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_div_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_div_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_div_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_div_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_div_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_div_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_div_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_div_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_div_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_div_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_div_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_div_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_max_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_mask_max_ss(a, 0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_max_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_max_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_maskz_max_ss(0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_max_ss(0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_max_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_max_sd(a, 0, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_max_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_max_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_max_sd(0, a, b);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_max_sd(0b11111111, a, b);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_min_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_mask_min_ss(a, 0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_min_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_min_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_maskz_min_ss(0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_min_ss(0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_min_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_min_sd(a, 0, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_min_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_min_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_min_sd(0, a, b);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_min_sd(0b11111111, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sqrt_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_mask_sqrt_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_sqrt_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sqrt_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_maskz_sqrt_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_sqrt_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sqrt_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_sqrt_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_sqrt_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sqrt_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_sqrt_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_sqrt_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
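+ // `rsqrt14`/`rcp14` return approximations with a maximum relative error of
+ // 2^-14; the inputs below are powers of two, for which these tests expect
+ // exact results.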
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_rsqrt14_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_rsqrt14_ss(a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_rsqrt14_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_mask_rsqrt14_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_rsqrt14_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_rsqrt14_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_maskz_rsqrt14_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_rsqrt14_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_rsqrt14_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_rsqrt14_sd(a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_rsqrt14_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_rsqrt14_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_rsqrt14_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_rsqrt14_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_rsqrt14_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_rsqrt14_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_rcp14_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_rcp14_ss(a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_rcp14_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_mask_rcp14_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_rcp14_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_rcp14_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_maskz_rcp14_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_rcp14_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_rcp14_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_rcp14_sd(a, b);
+ let e = _mm_set_pd(1., 0.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_rcp14_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_rcp14_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_rcp14_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_rcp14_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_rcp14_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_rcp14_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.25);
+ assert_eq_m128d(r, e);
+ }
+
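+    // `getexp` returns floor(log2(|x|)) of `b`'s low element as a float, so
+    // getexp(3.0) == 1.0.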
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getexp_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_getexp_ss(a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getexp_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_getexp_ss(a, 0, a, b);
+ let e = _mm_set_ps(2., 2., 2., 2.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_getexp_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getexp_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_maskz_getexp_ss(0, a, b);
+ let e = _mm_set_ps(2., 2., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_getexp_ss(0b11111111, a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getexp_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_getexp_sd(a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getexp_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_getexp_sd(a, 0, a, b);
+ let e = _mm_set_pd(2., 2.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_getexp_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getexp_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_maskz_getexp_sd(0, a, b);
+ let e = _mm_set_pd(2., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_getexp_sd(0b11111111, a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
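+    // `getmant` extracts the mantissa of `b`'s low element, normalized to the
+    // interval selected by the first const generic: with `_MM_MANT_NORM_1_2`,
+    // 10.0 = 1.25 * 2^3 yields 1.25. `_MM_MANT_SIGN_SRC` preserves the sign
+    // of the source.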
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getmant_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b);
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getmant_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b);
+ let e = _mm_set_ps(20., 20., 20., 20.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b);
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getmant_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a, b);
+ let e = _mm_set_ps(20., 20., 20., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b);
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getmant_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b);
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getmant_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b);
+ let e = _mm_set_pd(20., 20.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b);
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getmant_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a, b);
+ let e = _mm_set_pd(20., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b);
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
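+    // `roundscale::<IMM8>` rounds to 2^-(IMM8 >> 4) precision using the
+    // rounding mode encoded in IMM8's low bits; IMM8 == 0 means zero fraction
+    // bits with round-to-nearest, so 1.1 rounds to 1.0.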
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_roundscale_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_roundscale_ss::<0>(a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_roundscale_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_mask_roundscale_ss::<0>(a, 0, a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_roundscale_ss::<0>(a, 0b11111111, a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_roundscale_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_maskz_roundscale_ss::<0>(0, a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_roundscale_ss::<0>(0b11111111, a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_roundscale_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_roundscale_sd::<0>(a, b);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_roundscale_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_mask_roundscale_sd::<0>(a, 0, a, b);
+ let e = _mm_set_pd(2.2, 2.2);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_roundscale_sd::<0>(a, 0b11111111, a, b);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_roundscale_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_maskz_roundscale_sd::<0>(0, a, b);
+ let e = _mm_set_pd(2.2, 0.0);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_roundscale_sd::<0>(0b11111111, a, b);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
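+    // `scalef` computes a * 2^floor(b) on the low elements: 1.0 * 2^3 == 8.0.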
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_scalef_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_scalef_ss(a, b);
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_scalef_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_scalef_ss(a, 0, a, b);
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_scalef_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_scalef_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_maskz_scalef_ss(0, a, b);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_scalef_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_scalef_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_scalef_sd(a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_scalef_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_scalef_sd(a, 0, a, b);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_scalef_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_scalef_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_maskz_scalef_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_scalef_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
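+    // The masked FMA forms differ only in what a masked-off low lane holds:
+    // `_mm_mask_*` falls back to `a`, `_mm_maskz_*` to zero, and
+    // `_mm_mask3_*` to `c`. With a = 1, b = 2, c = 3: fmadd = a*b + c = 5,
+    // fmsub = a*b - c = -1, fnmadd = -(a*b) + c = 1, fnmsub = -(a*b) - c = -5.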
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fmadd_ss(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmadd_ss(a, 0b11111111, b, c);
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fmadd_ss(0, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fmadd_ss(0b11111111, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fmadd_ss(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmadd_ss(a, b, c, 0b11111111);
+ let e = _mm_set_ps(3., 3., 3., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fmadd_sd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmadd_sd(a, 0b11111111, b, c);
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fmadd_sd(0, a, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fmadd_sd(0b11111111, a, b, c);
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fmadd_sd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmadd_sd(a, b, c, 0b11111111);
+ let e = _mm_set_pd(3., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmsub_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fmsub_ss(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmsub_ss(a, 0b11111111, b, c);
+ let e = _mm_set_ps(1., 1., 1., -1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmsub_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fmsub_ss(0, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fmsub_ss(0b11111111, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., -1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmsub_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fmsub_ss(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmsub_ss(a, b, c, 0b11111111);
+ let e = _mm_set_ps(3., 3., 3., -1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmsub_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fmsub_sd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmsub_sd(a, 0b11111111, b, c);
+ let e = _mm_set_pd(1., -1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmsub_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fmsub_sd(0, a, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fmsub_sd(0b11111111, a, b, c);
+ let e = _mm_set_pd(1., -1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmsub_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fmsub_sd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmsub_sd(a, b, c, 0b11111111);
+ let e = _mm_set_pd(3., -1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fnmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fnmadd_ss(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fnmadd_ss(a, 0b11111111, b, c);
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fnmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fnmadd_ss(0, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fnmadd_ss(0b11111111, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fnmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fnmadd_ss(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fnmadd_ss(a, b, c, 0b11111111);
+ let e = _mm_set_ps(3., 3., 3., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fnmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fnmadd_sd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fnmadd_sd(a, 0b11111111, b, c);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fnmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fnmadd_sd(0, a, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fnmadd_sd(0b11111111, a, b, c);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fnmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fnmadd_sd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fnmadd_sd(a, b, c, 0b11111111);
+ let e = _mm_set_pd(3., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fnmsub_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fnmsub_ss(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fnmsub_ss(a, 0b11111111, b, c);
+ let e = _mm_set_ps(1., 1., 1., -5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fnmsub_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fnmsub_ss(0, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fnmsub_ss(0b11111111, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., -5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fnmsub_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fnmsub_ss(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fnmsub_ss(a, b, c, 0b11111111);
+ let e = _mm_set_ps(3., 3., 3., -5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fnmsub_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fnmsub_sd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fnmsub_sd(a, 0b11111111, b, c);
+ let e = _mm_set_pd(1., -5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fnmsub_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fnmsub_sd(0, a, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fnmsub_sd(0b11111111, a, b, c);
+ let e = _mm_set_pd(1., -5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fnmsub_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fnmsub_sd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fnmsub_sd(a, b, c, 0b11111111);
+ let e = _mm_set_pd(3., -5.);
+ assert_eq_m128d(r, e);
+ }
+
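+    // The `_round` variants repeat the operations above with an explicit
+    // rounding control passed as a const generic; `_MM_FROUND_TO_ZERO |
+    // _MM_FROUND_NO_EXC` requests truncation with exceptions suppressed
+    // (SAE). Every input here yields an exact result, so the rounding mode
+    // never changes a value.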
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_add_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_add_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_add_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r =
+ _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_add_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_add_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_add_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r =
+ _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_sub_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sub_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sub_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r =
+ _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_sub_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sub_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sub_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r =
+ _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mul_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_mul_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_mul_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r =
+ _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mul_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_mul_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_mul_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r =
+ _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_div_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_div_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_div_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r =
+ _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_div_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_div_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_div_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r =
+ _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
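+    // `max`/`min` do not round, so the const here is effectively SAE-only;
+    // `_MM_FROUND_CUR_DIRECTION` keeps the current MXCSR behaviour.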
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_max_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_max_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_max_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_max_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_max_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_max_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_min_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_min_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_min_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_min_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_min_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_min_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_sqrt_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sqrt_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sqrt_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r =
+ _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_sqrt_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sqrt_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ src, 0b11111111, a, b,
+ );
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sqrt_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r =
+ _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getexp_round_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getexp_round_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ let e = _mm_set_ps(2., 2., 2., 2.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getexp_round_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_ps(2., 2., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getexp_round_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getexp_round_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ let e = _mm_set_pd(2., 2.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getexp_round_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_pd(2., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getmant_round_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r =
+ _mm_getmant_round_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_CUR_DIRECTION>(
+ a, b,
+ );
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getmant_round_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_mask_getmant_round_ss::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a, 0, a, b);
+ let e = _mm_set_ps(20., 20., 20., 20.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_getmant_round_ss::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a, 0b11111111, a, b);
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getmant_round_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_maskz_getmant_round_ss::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(0, a, b);
+ let e = _mm_set_ps(20., 20., 20., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_getmant_round_ss::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(0b11111111, a, b);
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getmant_round_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r =
+ _mm_getmant_round_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_CUR_DIRECTION>(
+ a, b,
+ );
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getmant_round_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_mask_getmant_round_sd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a, 0, a, b);
+ let e = _mm_set_pd(20., 20.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_getmant_round_sd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a, 0b11111111, a, b);
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getmant_round_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_maskz_getmant_round_sd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(0, a, b);
+ let e = _mm_set_pd(20., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_getmant_round_sd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(0b11111111, a, b);
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_roundscale_round_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_roundscale_round_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_roundscale_round_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_roundscale_round_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_roundscale_round_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ let e = _mm_set_pd(2.2, 2.2);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_roundscale_round_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_pd(2.2, 0.0);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_scalef_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_scalef_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_scalef_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r =
+ _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_scalef_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_scalef_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, a, b,
+ );
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_scalef_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r =
+ _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b,
+ );
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
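+    // The same masked FMA matrix as above, now with round-to-nearest and SAE
+    // requested explicitly via `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`.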
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b11111111,
+ );
+ let e = _mm_set_ps(3., 3., 3., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, b, c,
+ );
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b, c,
+ );
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b11111111,
+ );
+ let e = _mm_set_pd(3., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fmsub_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm_set_ps(1., 1., 1., -1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmsub_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., -1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmsub_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., -1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmsub_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b11111111,
+ );
+ let e = _mm_set_ps(3., 3., 3., -1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fmsub_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm_set_pd(1., -1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmsub_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, b, c,
+ );
+ let e = _mm_set_pd(1., -1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmsub_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b, c,
+ );
+ let e = _mm_set_pd(1., -1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmsub_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b11111111,
+ );
+ let e = _mm_set_pd(3., -1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fnmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fnmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fnmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fnmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b11111111,
+ );
+ let e = _mm_set_ps(3., 3., 3., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fnmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fnmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, b, c,
+ );
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fnmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b, c,
+ );
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fnmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b11111111,
+ );
+ let e = _mm_set_pd(3., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fnmsub_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm_set_ps(1., 1., 1., -5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fnmsub_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., -5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fnmsub_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b, c,
+ );
+ let e = _mm_set_ps(1., 1., 1., -5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fnmsub_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b11111111,
+ );
+ let e = _mm_set_ps(3., 3., 3., -5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fnmsub_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm_set_pd(1., -5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fnmsub_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, b, c,
+ );
+ let e = _mm_set_pd(1., -5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fnmsub_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b, c,
+ );
+ let e = _mm_set_pd(1., -5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fnmsub_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b11111111,
+ );
+ let e = _mm_set_pd(3., -5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fixupimm_ss() {
+ let a = _mm_set_ps(0., 0., 0., f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_fixupimm_ss::<5>(a, b, c);
+ let e = _mm_set_ps(0., 0., 0., -0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fixupimm_ss() {
+ let a = _mm_set_ps(0., 0., 0., f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_mask_fixupimm_ss::<5>(a, 0b11111111, b, c);
+ let e = _mm_set_ps(0., 0., 0., -0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fixupimm_ss() {
+ let a = _mm_set_ps(0., 0., 0., f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_maskz_fixupimm_ss::<5>(0b00000000, a, b, c);
+ let e = _mm_set_ps(0., 0., 0., 0.0);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fixupimm_ss::<5>(0b11111111, a, b, c);
+ let e = _mm_set_ps(0., 0., 0., -0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fixupimm_sd() {
+ let a = _mm_set_pd(0., f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_fixupimm_sd::<5>(a, b, c);
+ let e = _mm_set_pd(0., -0.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fixupimm_sd() {
+ let a = _mm_set_pd(0., f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_mask_fixupimm_sd::<5>(a, 0b11111111, b, c);
+ let e = _mm_set_pd(0., -0.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fixupimm_sd() {
+ let a = _mm_set_pd(0., f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_maskz_fixupimm_sd::<5>(0b00000000, a, b, c);
+ let e = _mm_set_pd(0., 0.0);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fixupimm_sd::<5>(0b11111111, a, b, c);
+ let e = _mm_set_pd(0., -0.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fixupimm_round_ss() {
+ let a = _mm_set_ps(1., 0., 0., f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c);
+ let e = _mm_set_ps(1., 0., 0., -0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fixupimm_round_ss() {
+ let a = _mm_set_ps(0., 0., 0., f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_mask_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, b, c);
+ let e = _mm_set_ps(0., 0., 0., -0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fixupimm_round_ss() {
+ let a = _mm_set_ps(0., 0., 0., f32::NAN);
+ let b = _mm_set1_ps(f32::MAX);
+ let c = _mm_set1_epi32(i32::MAX);
+ let r = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(0b00000000, a, b, c);
+ let e = _mm_set_ps(0., 0., 0., 0.0);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b, c);
+ let e = _mm_set_ps(0., 0., 0., -0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fixupimm_round_sd() {
+ let a = _mm_set_pd(0., f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c);
+ let e = _mm_set_pd(0., -0.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fixupimm_round_sd() {
+ let a = _mm_set_pd(0., f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_mask_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, b, c);
+ let e = _mm_set_pd(0., -0.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fixupimm_round_sd() {
+ let a = _mm_set_pd(0., f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(0b00000000, a, b, c);
+ let e = _mm_set_pd(0., 0.0);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b, c);
+ let e = _mm_set_pd(0., -0.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_cvtss_sd() {
+ let a = _mm_set_pd(6., -7.5);
+ let b = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_mask_cvtss_sd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_cvtss_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(6., -1.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_cvtss_sd() {
+ let a = _mm_set_pd(6., -7.5);
+ let b = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_maskz_cvtss_sd(0, a, b);
+ let e = _mm_set_pd(6., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_cvtss_sd(0b11111111, a, b);
+ let e = _mm_set_pd(6., -1.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_cvtsd_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b = _mm_set_pd(6., -7.5);
+ let r = _mm_mask_cvtsd_ss(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_cvtsd_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(0., -0.5, 1., -7.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_cvtsd_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b = _mm_set_pd(6., -7.5);
+ let r = _mm_maskz_cvtsd_ss(0, a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_cvtsd_ss(0b11111111, a, b);
+ let e = _mm_set_ps(0., -0.5, 1., -7.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundss_sd() {
+ let a = _mm_set_pd(6., -7.5);
+ let b = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm_set_pd(6., -1.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_cvt_roundss_sd() {
+ let a = _mm_set_pd(6., -7.5);
+ let b = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
+ let e = _mm_set_pd(6., -1.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_cvt_roundss_sd() {
+ let a = _mm_set_pd(6., -7.5);
+ let b = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ let e = _mm_set_pd(6., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
+ let e = _mm_set_pd(6., -1.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsd_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b = _mm_set_pd(6., -7.5);
+ let r = _mm_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., -7.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_cvt_roundsd_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b = _mm_set_pd(6., -7.5);
+ let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ a, 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(0., -0.5, 1., -7.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_cvt_roundsd_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b = _mm_set_pd(6., -7.5);
+ let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(
+ 0b11111111, a, b,
+ );
+ let e = _mm_set_ps(0., -0.5, 1., -7.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundss_si32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: i32 = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundss_i32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvt_roundss_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: i32 = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundss_u32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvt_roundss_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
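+        // A negative source is out of range for an unsigned destination, so
+        // the conversion returns the integer indefinite value (all bits set).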
+ let e: u32 = u32::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtss_i32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtss_i32(a);
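+        // _mm_cvtss_i32 uses the current rounding mode, round-to-nearest-even
+        // by default, so -1.5 converts to -2.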
+ let e: i32 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtss_u32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtss_u32(a);
+ let e: u32 = u32::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsd_si32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvt_roundsd_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: i32 = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsd_i32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvt_roundsd_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: i32 = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsd_u32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvt_roundsd_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: u32 = u32::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtsd_i32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtsd_i32(a);
+ let e: i32 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtsd_u32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtsd_u32(a);
+ let e: u32 = u32::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundi32_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: i32 = 9;
+ let r = _mm_cvt_roundi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsi32_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: i32 = 9;
+ let r = _mm_cvt_roundsi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundu32_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: u32 = 9;
+ let r = _mm_cvt_roundu32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvti32_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: i32 = 9;
+ let r = _mm_cvti32_ss(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvti32_sd() {
+ let a = _mm_set_pd(1., -1.5);
+ let b: i32 = 9;
+ let r = _mm_cvti32_sd(a, b);
+ let e = _mm_set_pd(1., 9.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundss_si32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtt_roundss_si32::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: i32 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundss_i32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtt_roundss_i32::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: i32 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundss_u32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtt_roundss_u32::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: u32 = u32::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvttss_i32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvttss_i32(a);
+ let e: i32 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvttss_u32() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvttss_u32(a);
+ let e: u32 = u32::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundsd_si32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtt_roundsd_si32::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: i32 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundsd_i32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtt_roundsd_i32::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: i32 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundsd_u32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtt_roundsd_u32::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: u32 = u32::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvttsd_i32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvttsd_i32(a);
+ let e: i32 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvttsd_u32() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvttsd_u32(a);
+ let e: u32 = u32::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtu32_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: u32 = 9;
+ let r = _mm_cvtu32_ss(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtu32_sd() {
+ let a = _mm_set_pd(1., -1.5);
+ let b: u32 = 9;
+ let r = _mm_cvtu32_sd(a, b);
+ let e = _mm_set_pd(1., 9.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_comi_round_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_comi_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b);
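+        // Predicate 0 is _CMP_EQ_OQ; 2.2 == 1.1 is false, so the result is 0.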
+ let e: i32 = 0;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_comi_round_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_comi_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b);
+ let e: i32 = 0;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtsi512_si32() {
+ let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_cvtsi512_si32(a);
+ let e: i32 = 1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_shuffle_pd() {
+ let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
+ let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);
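+        // Within each 128-bit pair, control bit 2*i picks the low (0) or high
+        // (1) element of a, and bit 2*i+1 picks from b.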
+ let r = _mm512_shuffle_pd::<0b11_11_11_11>(a, b);
+ let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_shuffle_pd() {
+ let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
+ let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);
+ let r = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0b11111111, a, b);
+ let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_shuffle_pd() {
+ let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
+ let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);
+ let r = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0b00001111, a, b);
+ let e = _mm512_setr_pd(4., 3., 8., 7., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_expandloadu_epi32() {
+ let src = _mm512_set1_epi32(42);
+ let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
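+        // Expand-load semantics: elements are read contiguously from `p` and
+        // placed at the positions whose mask bit is set (the lowest set bit
+        // receives a[0], the next a[1], ...); clear positions keep `src`.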
+ let r = _mm512_mask_expandloadu_epi32(src, m, black_box(p));
+ let e = _mm512_set_epi32(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_expandloadu_epi32() {
+ let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_maskz_expandloadu_epi32(m, black_box(p));
+ let e = _mm512_set_epi32(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_expandloadu_epi32() {
+ let src = _mm256_set1_epi32(42);
+ let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm256_mask_expandloadu_epi32(src, m, black_box(p));
+ let e = _mm256_set_epi32(4, 3, 2, 42, 1, 42, 42, 42);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_expandloadu_epi32() {
+ let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm256_maskz_expandloadu_epi32(m, black_box(p));
+ let e = _mm256_set_epi32(4, 3, 2, 0, 1, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_expandloadu_epi32() {
+ let src = _mm_set1_epi32(42);
+ let a = &[1_i32, 2, 3, 4];
+ let p = a.as_ptr();
+ let m = 0b11111000;
+ let r = _mm_mask_expandloadu_epi32(src, m, black_box(p));
+ let e = _mm_set_epi32(1, 42, 42, 42);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_expandloadu_epi32() {
+ let a = &[1_i32, 2, 3, 4];
+ let p = a.as_ptr();
+ let m = 0b11111000;
+ let r = _mm_maskz_expandloadu_epi32(m, black_box(p));
+ let e = _mm_set_epi32(1, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_expandloadu_epi64() {
+ let src = _mm512_set1_epi64(42);
+ let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm512_mask_expandloadu_epi64(src, m, black_box(p));
+ let e = _mm512_set_epi64(4, 3, 2, 42, 1, 42, 42, 42);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_expandloadu_epi64() {
+ let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm512_maskz_expandloadu_epi64(m, black_box(p));
+ let e = _mm512_set_epi64(4, 3, 2, 0, 1, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_expandloadu_epi64() {
+ let src = _mm256_set1_epi64x(42);
+ let a = &[1_i64, 2, 3, 4];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm256_mask_expandloadu_epi64(src, m, black_box(p));
+ let e = _mm256_set_epi64x(1, 42, 42, 42);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_expandloadu_epi64() {
+ let a = &[1_i64, 2, 3, 4];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm256_maskz_expandloadu_epi64(m, black_box(p));
+ let e = _mm256_set_epi64x(1, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_expandloadu_epi64() {
+ let src = _mm_set1_epi64x(42);
+ let a = &[1_i64, 2];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm_mask_expandloadu_epi64(src, m, black_box(p));
+ let e = _mm_set_epi64x(42, 42);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_expandloadu_epi64() {
+ let a = &[1_i64, 2];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm_maskz_expandloadu_epi64(m, black_box(p));
+ let e = _mm_set_epi64x(0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_expandloadu_ps() {
+ let src = _mm512_set1_ps(42.);
+ let a = &[
+ 1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_mask_expandloadu_ps(src, m, black_box(p));
+ let e = _mm512_set_ps(
+ 8., 7., 6., 42., 5., 42., 42., 42., 4., 3., 42., 42., 2., 42., 1., 42.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_expandloadu_ps() {
+ let a = &[
+ 1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm512_maskz_expandloadu_ps(m, black_box(p));
+ let e = _mm512_set_ps(
+ 8., 7., 6., 0., 5., 0., 0., 0., 4., 3., 0., 0., 2., 0., 1., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_expandloadu_ps() {
+ let src = _mm256_set1_ps(42.);
+ let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm256_mask_expandloadu_ps(src, m, black_box(p));
+ let e = _mm256_set_ps(4., 3., 2., 42., 1., 42., 42., 42.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_expandloadu_ps() {
+ let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm256_maskz_expandloadu_ps(m, black_box(p));
+ let e = _mm256_set_ps(4., 3., 2., 0., 1., 0., 0., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_expandloadu_ps() {
+ let src = _mm_set1_ps(42.);
+ let a = &[1.0f32, 2., 3., 4.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm_mask_expandloadu_ps(src, m, black_box(p));
+ let e = _mm_set_ps(1., 42., 42., 42.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_expandloadu_ps() {
+ let a = &[1.0f32, 2., 3., 4.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm_maskz_expandloadu_ps(m, black_box(p));
+ let e = _mm_set_ps(1., 0., 0., 0.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_expandloadu_pd() {
+ let src = _mm512_set1_pd(42.);
+ let a = &[1.0f64, 2., 3., 4., 5., 6., 7., 8.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm512_mask_expandloadu_pd(src, m, black_box(p));
+ let e = _mm512_set_pd(4., 3., 2., 42., 1., 42., 42., 42.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_expandloadu_pd() {
+ let a = &[1.0f64, 2., 3., 4., 5., 6., 7., 8.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm512_maskz_expandloadu_pd(m, black_box(p));
+ let e = _mm512_set_pd(4., 3., 2., 0., 1., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_expandloadu_pd() {
+ let src = _mm256_set1_pd(42.);
+ let a = &[1.0f64, 2., 3., 4.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm256_mask_expandloadu_pd(src, m, black_box(p));
+ let e = _mm256_set_pd(1., 42., 42., 42.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_expandloadu_pd() {
+ let a = &[1.0f64, 2., 3., 4.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm256_maskz_expandloadu_pd(m, black_box(p));
+ let e = _mm256_set_pd(1., 0., 0., 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_expandloadu_pd() {
+ let src = _mm_set1_pd(42.);
+ let a = &[1.0f64, 2.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm_mask_expandloadu_pd(src, m, black_box(p));
+ let e = _mm_set_pd(42., 42.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_expandloadu_pd() {
+ let a = &[1.0f64, 2.];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm_maskz_expandloadu_pd(m, black_box(p));
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512gfni.rs b/library/stdarch/crates/core_arch/src/x86/avx512gfni.rs
new file mode 100644
index 000000000..d8ac5c29c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512gfni.rs
@@ -0,0 +1,1492 @@
+//! Galois Field New Instructions (GFNI)
+//!
+//! The intrinsics here correspond to those in the `immintrin.h` C header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use crate::core_arch::simd::i8x16;
+use crate::core_arch::simd::i8x32;
+use crate::core_arch::simd::i8x64;
+use crate::core_arch::simd_llvm::simd_select_bitmask;
+use crate::core_arch::x86::__m128i;
+use crate::core_arch::x86::__m256i;
+use crate::core_arch::x86::__m512i;
+use crate::core_arch::x86::__mmask16;
+use crate::core_arch::x86::__mmask32;
+use crate::core_arch::x86::__mmask64;
+use crate::core_arch::x86::_mm256_setzero_si256;
+use crate::core_arch::x86::_mm512_setzero_si512;
+use crate::core_arch::x86::_mm_setzero_si128;
+use crate::core_arch::x86::m128iExt;
+use crate::core_arch::x86::m256iExt;
+use crate::core_arch::x86::m512iExt;
+use crate::mem::transmute;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
+ fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
+ #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
+ fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
+ #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
+ fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
+ #[link_name = "llvm.x86.vgf2p8affineqb.512"]
+ fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
+ #[link_name = "llvm.x86.vgf2p8affineqb.256"]
+ fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
+ #[link_name = "llvm.x86.vgf2p8affineqb.128"]
+ fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
+ #[link_name = "llvm.x86.vgf2p8mulb.512"]
+ fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
+ #[link_name = "llvm.x86.vgf2p8mulb.256"]
+ fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.vgf2p8mulb.128"]
+ fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
+}
+
+// LLVM requires AVX512BW for a lot of these instructions; see
+// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
+// However, our tests also require the target feature list to match Intel's,
+// which *doesn't* require AVX512BW but only AVX512F, so we added the
+// redundant AVX512F requirement (for now).
+// Also see
+// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
+// for how the GFNI, BW, and optionally VL extensions are forced on.
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm512_mask_gf2p8mul_epi8(
+ src: __m512i,
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ transmute(simd_select_bitmask(
+ k,
+ vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
+ src.as_i8x64(),
+ ))
+}
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(
+ k,
+ vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
+ zero,
+ ))
+}
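+// Scalar sketch of the writemask semantics shared by the mask/maskz variants
+// in this module (illustration only, not part of the API): for byte lane i,
+//
+//     mask:  out[i] = if k & (1 << i) != 0 { op(a[i], b[i]) } else { src[i] }
+//     maskz: out[i] = if k & (1 << i) != 0 { op(a[i], b[i]) } else { 0 }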
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm256_mask_gf2p8mul_epi8(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ transmute(simd_select_bitmask(
+ k,
+ vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
+ src.as_i8x32(),
+ ))
+}
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(
+ k,
+ vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
+ zero,
+ ))
+}
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm_mask_gf2p8mul_epi8(
+ src: __m128i,
+ k: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ transmute(simd_select_bitmask(
+ k,
+ vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
+ src.as_i8x16(),
+ ))
+}
+
+/// Performs a multiplication in GF(2^8) on the packed bytes.
+/// The field is in polynomial representation with the reduction polynomial
+/// x^8 + x^4 + x^3 + x + 1.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8mulb))]
+pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(
+ k,
+ vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
+ zero,
+ ))
+}
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x64();
+ let a = a.as_i8x64();
+ let r = vgf2p8affineqb_512(x, a, b);
+ transmute(r)
+}
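+// Worked example (added for illustration; the constants are assumptions
+// derived from the documented bit layout, not upstream text): the identity
+// matrix for this byte layout is the u64 0x0102040810204080, so
+// `_mm512_gf2p8affine_epi64_epi8::<0>(x, _mm512_set1_epi64(0x0102040810204080))`
+// returns `x` unchanged, while 0x8040201008040201 reverses the bit order of
+// every byte.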
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
+ k: __mmask64,
+ x: __m512i,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let zero = _mm512_setzero_si512().as_i8x64();
+ let x = x.as_i8x64();
+ let a = a.as_i8x64();
+ let r = vgf2p8affineqb_512(x, a, b);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
+ src: __m512i,
+ k: __mmask64,
+ x: __m512i,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x64();
+ let a = a.as_i8x64();
+ let r = vgf2p8affineqb_512(x, a, b);
+ transmute(simd_select_bitmask(k, r, src.as_i8x64()))
+}
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x32();
+ let a = a.as_i8x32();
+ let r = vgf2p8affineqb_256(x, a, b);
+ transmute(r)
+}
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
+ k: __mmask32,
+ x: __m256i,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let zero = _mm256_setzero_si256().as_i8x32();
+ let x = x.as_i8x32();
+ let a = a.as_i8x32();
+ let r = vgf2p8affineqb_256(x, a, b);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
+ src: __m256i,
+ k: __mmask32,
+ x: __m256i,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x32();
+ let a = a.as_i8x32();
+ let r = vgf2p8affineqb_256(x, a, b);
+ transmute(simd_select_bitmask(k, r, src.as_i8x32()))
+}
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x16();
+ let a = a.as_i8x16();
+ let r = vgf2p8affineqb_128(x, a, b);
+ transmute(r)
+}
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
+ k: __mmask16,
+ x: __m128i,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let zero = _mm_setzero_si128().as_i8x16();
+ let x = x.as_i8x16();
+ let a = a.as_i8x16();
+ let r = vgf2p8affineqb_128(x, a, b);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Performs an affine transformation on the packed bytes in x.
+/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affine_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
+ src: __m128i,
+ k: __mmask16,
+ x: __m128i,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x16();
+ let a = a.as_i8x16();
+ let r = vgf2p8affineqb_128(x, a, b);
+ transmute(simd_select_bitmask(k, r, src.as_i8x16()))
+}
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x64();
+ let a = a.as_i8x64();
+ let r = vgf2p8affineinvqb_512(x, a, b);
+ transmute(r)
+}
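+// Worked example (added for illustration; constants are assumptions, not
+// upstream text): with the identity matrix 0x0102040810204080 and B == 0
+// this family computes the plain field inverse of each byte, e.g. the
+// inverse of 0x53 is 0xca since 0x53 * 0xca == 0x01 modulo
+// x^8 + x^4 + x^3 + x + 1; with the AES affine matrix and b == 0x63 it
+// reproduces the AES S-box (see AES_S_BOX in the tests below).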
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
+ k: __mmask64,
+ x: __m512i,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let zero = _mm512_setzero_si512().as_i8x64();
+ let x = x.as_i8x64();
+ let a = a.as_i8x64();
+ let r = vgf2p8affineinvqb_512(x, a, b);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
+ src: __m512i,
+ k: __mmask64,
+ x: __m512i,
+ a: __m512i,
+) -> __m512i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x64();
+ let a = a.as_i8x64();
+ let r = vgf2p8affineinvqb_512(x, a, b);
+ transmute(simd_select_bitmask(k, r, src.as_i8x64()))
+}
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x32();
+ let a = a.as_i8x32();
+ let r = vgf2p8affineinvqb_256(x, a, b);
+ transmute(r)
+}
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
+ k: __mmask32,
+ x: __m256i,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let zero = _mm256_setzero_si256().as_i8x32();
+ let x = x.as_i8x32();
+ let a = a.as_i8x32();
+ let r = vgf2p8affineinvqb_256(x, a, b);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
+ src: __m256i,
+ k: __mmask32,
+ x: __m256i,
+ a: __m256i,
+) -> __m256i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x32();
+ let a = a.as_i8x32();
+ let r = vgf2p8affineinvqb_256(x, a, b);
+ transmute(simd_select_bitmask(k, r, src.as_i8x32()))
+}
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x16();
+ let a = a.as_i8x16();
+ let r = vgf2p8affineinvqb_128(x, a, b);
+ transmute(r)
+}
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are zeroed if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
+ k: __mmask16,
+ x: __m128i,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let zero = _mm_setzero_si128().as_i8x16();
+ let x = x.as_i8x16();
+ let a = a.as_i8x16();
+ let r = vgf2p8affineinvqb_128(x, a, b);
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Performs an affine transformation on the inverted packed bytes in x.
+/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
+/// and b being a constant 8-bit immediate value.
+/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
+/// The inverse of 0 is 0.
+/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
+///
+/// Uses the writemask in k: elements are copied from src if the corresponding mask bit is not set;
+/// otherwise the computation result is written into the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affineinv_epi64_epi8)
+#[inline]
+#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
+#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
+ src: __m128i,
+ k: __mmask16,
+ x: __m128i,
+ a: __m128i,
+) -> __m128i {
+ static_assert_imm8!(B);
+ let b = B as u8;
+ let x = x.as_i8x16();
+ let a = a.as_i8x16();
+ let r = vgf2p8affineinvqb_128(x, a, b);
+ transmute(simd_select_bitmask(k, r, src.as_i8x16()))
+}
+
+#[cfg(test)]
+mod tests {
+ // The constants in the tests below are just bit patterns. They should not
+ // be interpreted as integers; signedness does not make sense for them, but
+ // __mXXXi happens to be defined in terms of signed integers.
+ #![allow(overflowing_literals)]
+
+ use core::hint::black_box;
+ use core::intrinsics::size_of;
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ fn mulbyte(left: u8, right: u8) -> u8 {
+ // this implementation follows the description in
+ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8
+ const REDUCTION_POLYNOMIAL: u16 = 0x11b;
+ let left: u16 = left.into();
+ let right: u16 = right.into();
+ let mut carryless_product: u16 = 0;
+
+ // Carryless multiplication
+ for i in 0..8 {
+ if ((left >> i) & 0x01) != 0 {
+ carryless_product ^= right << i;
+ }
+ }
+
+        // Reduction: XOR in shifted copies of the reduction polynomial to
+        // clear the high bits. This adds "0" because REDUCTION_POLYNOMIAL is
+        // congruent to zero in GF(2^8).
+ for i in (8..=14).rev() {
+ if ((carryless_product >> i) & 0x01) != 0 {
+ carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
+ }
+ }
+
+ carryless_product as u8
+ }
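+    // Sanity checks for the scalar reference above, using the worked example
+    // from FIPS-197 ({57} * {83} = {c1}). This plain #[test] is an
+    // illustrative addition, not part of the upstream suite.
+    #[test]
+    fn mulbyte_known_vectors() {
+        assert_eq!(mulbyte(0x57, 0x83), 0xc1);
+        // multiplying by 2 is the classic AES `xtime` step
+        assert_eq!(mulbyte(0x02, 0x80), 0x1b);
+        // 0xca is the multiplicative inverse of 0x53 in this field
+        assert_eq!(mulbyte(0x53, 0xca), 0x01);
+        assert_eq!(mulbyte(0x00, 0xff), 0x00);
+    }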
+
+ const NUM_TEST_WORDS_512: usize = 4;
+ const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
+ const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
+ const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
+ const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
+ const NUM_BYTES: usize = 256;
+ const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
+ const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
+ const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;
+
+ fn parity(input: u8) -> u8 {
+ let mut accumulator = 0;
+ for i in 0..8 {
+ accumulator ^= (input >> i) & 0x01;
+ }
+ accumulator
+ }
+
+ fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
+ // this implementation follows the description in
+ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi64_epi8
+ let mut accumulator = 0;
+
+ for bit in 0..8 {
+ accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
+ }
+
+ accumulator ^ b
+ }
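+
+ // The u64 constant 0x01_02_04_08_10_20_40_80 used throughout the tests
+ // below places one set bit per matrix row so that row i (the i-th
+ // little-endian byte) selects bit 7 - i of the input; it is therefore the
+ // identity matrix for this transform. A quick sanity check (a plain unit
+ // test added for illustration, not an intrinsics test):
+ #[test]
+ fn affine_identity_matrix_is_identity() {
+ const IDENTITY: u64 = 0x01_02_04_08_10_20_40_80;
+ for x in 0..=255u8 {
+ assert_eq!(mat_vec_multiply_affine(IDENTITY, x, 0), x);
+ }
+ }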
+
+ fn generate_affine_mul_test_data(
+ immediate: u8,
+ ) -> (
+ [u64; NUM_TEST_WORDS_64],
+ [u8; NUM_TEST_ENTRIES],
+ [u8; NUM_TEST_ENTRIES],
+ ) {
+ let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
+ let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
+ let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
+
+ for i in 0..NUM_TEST_WORDS_64 {
+ left[i] = (i as u64) * 103 * 101;
+ for j in 0..8 {
+ let j64 = j as u64;
+ right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
+ result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
+ }
+ }
+
+ (left, right, result)
+ }
+
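+ // Note: the expected values below describe the product x * inv(x): 1 for
+ // every nonzero byte and, by the instruction's convention, 0 for zero. The
+ // inversion tests therefore remultiply rather than compare inverses
+ // directly.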
+ fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
+ let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
+ let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];
+
+ for i in 0..NUM_BYTES {
+ input[i] = (i % 256) as u8;
+ result[i] = if i == 0 { 0 } else { 1 };
+ }
+
+ (input, result)
+ }
+
+ const AES_S_BOX: [u8; NUM_BYTES] = [
+ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
+ 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
+ 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
+ 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
+ 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
+ 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
+ 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
+ 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+ 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
+ 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
+ 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
+ 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
+ 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
+ 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
+ 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
+ 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
+ 0x16,
+ ];
+
+ fn generate_byte_mul_test_data() -> (
+ [u8; NUM_TEST_ENTRIES],
+ [u8; NUM_TEST_ENTRIES],
+ [u8; NUM_TEST_ENTRIES],
+ ) {
+ let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
+ let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
+ let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
+
+ for i in 0..NUM_TEST_ENTRIES {
+ left[i] = (i % 256) as u8;
+ right[i] = left[i].wrapping_mul(101);
+ result[i] = mulbyte(left[i], right[i]);
+ }
+
+ (left, right, result)
+ }
+
+ #[target_feature(enable = "sse2")]
+ unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
+ let byte_offset = word_index * 16 / size_of::<T>();
+ let pointer = data.as_ptr().offset(byte_offset as isize) as *const __m128i;
+ _mm_loadu_si128(black_box(pointer))
+ }
+
+ #[target_feature(enable = "avx")]
+ unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
+ let byte_offset = word_index * 32 / size_of::<T>();
+ let pointer = data.as_ptr().offset(byte_offset as isize) as *const __m256i;
+ _mm256_loadu_si256(black_box(pointer))
+ }
+
+ #[target_feature(enable = "avx512f")]
+ unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
+ let byte_offset = word_index * 64 / size_of::<T>();
+ let pointer = data.as_ptr().offset(byte_offset as isize) as *const i32;
+ _mm512_loadu_si512(black_box(pointer))
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_gf2p8mul_epi8() {
+ let (left, right, expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let left = load_m512i_word(&left, i);
+ let right = load_m512i_word(&right, i);
+ let expected = load_m512i_word(&expected, i);
+ let result = _mm512_gf2p8mul_epi8(left, right);
+ assert_eq_m512i(result, expected);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
+ let (left, right, _expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let left = load_m512i_word(&left, i);
+ let right = load_m512i_word(&right, i);
+ let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
+ assert_eq_m512i(result_zero, _mm512_setzero_si512());
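+ // The byte mask below sets or clears only whole 32-bit lanes, so the same
+ // selection is reproducible with a dword blend: each bit of mask_words
+ // stands for four consecutive bits of mask_bytes.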
+ let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
+ let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
+ let expected_result = _mm512_gf2p8mul_epi8(left, right);
+ let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
+ let expected_masked =
+ _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
+ assert_eq_m512i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_mask_gf2p8mul_epi8() {
+ let (left, right, _expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let left = load_m512i_word(&left, i);
+ let right = load_m512i_word(&right, i);
+ let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
+ assert_eq_m512i(result_left, left);
+ let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
+ let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
+ let expected_result = _mm512_gf2p8mul_epi8(left, right);
+ let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
+ let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
+ assert_eq_m512i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_gf2p8mul_epi8() {
+ let (left, right, expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let left = load_m256i_word(&left, i);
+ let right = load_m256i_word(&right, i);
+ let expected = load_m256i_word(&expected, i);
+ let result = _mm256_gf2p8mul_epi8(left, right);
+ assert_eq_m256i(result, expected);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
+ let (left, right, _expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let left = load_m256i_word(&left, i);
+ let right = load_m256i_word(&right, i);
+ let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
+ assert_eq_m256i(result_zero, _mm256_setzero_si256());
+ let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
+ const MASK_WORDS: i32 = 0b01_10_11_00;
+ let expected_result = _mm256_gf2p8mul_epi8(left, right);
+ let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
+ let expected_masked =
+ _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
+ assert_eq_m256i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_gf2p8mul_epi8() {
+ let (left, right, _expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let left = load_m256i_word(&left, i);
+ let right = load_m256i_word(&right, i);
+ let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
+ assert_eq_m256i(result_left, left);
+ let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
+ const MASK_WORDS: i32 = 0b01_10_11_00;
+ let expected_result = _mm256_gf2p8mul_epi8(left, right);
+ let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
+ let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
+ assert_eq_m256i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_gf2p8mul_epi8() {
+ let (left, right, expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let left = load_m128i_word(&left, i);
+ let right = load_m128i_word(&right, i);
+ let expected = load_m128i_word(&expected, i);
+ let result = _mm_gf2p8mul_epi8(left, right);
+ assert_eq_m128i(result, expected);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_gf2p8mul_epi8() {
+ let (left, right, _expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let left = load_m128i_word(&left, i);
+ let right = load_m128i_word(&right, i);
+ let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
+ assert_eq_m128i(result_zero, _mm_setzero_si128());
+ let mask_bytes: __mmask16 = 0x0F_F0;
+ const MASK_WORDS: i32 = 0b01_10;
+ let expected_result = _mm_gf2p8mul_epi8(left, right);
+ let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
+ let expected_masked =
+ _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
+ assert_eq_m128i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_gf2p8mul_epi8() {
+ let (left, right, _expected) = generate_byte_mul_test_data();
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let left = load_m128i_word(&left, i);
+ let right = load_m128i_word(&right, i);
+ let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
+ assert_eq_m128i(result_left, left);
+ let mask_bytes: __mmask16 = 0x0F_F0;
+ const MASK_WORDS: i32 = 0b01_10;
+ let expected_result = _mm_gf2p8mul_epi8(left, right);
+ let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
+ let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
+ assert_eq_m128i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
+ let identity: i64 = 0x01_02_04_08_10_20_40_80;
+ const IDENTITY_BYTE: i32 = 0;
+ let constant: i64 = 0;
+ const CONSTANT_BYTE: i32 = 0x63;
+ let identity = _mm512_set1_epi64(identity);
+ let constant = _mm512_set1_epi64(constant);
+ let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);
+
+ let (bytes, more_bytes, _) = generate_byte_mul_test_data();
+ let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let data = load_m512i_word(&bytes, i);
+ let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
+ assert_eq_m512i(result, data);
+ let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
+ assert_eq_m512i(result, constant_reference);
+ let data = load_m512i_word(&more_bytes, i);
+ let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
+ assert_eq_m512i(result, data);
+ let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
+ assert_eq_m512i(result, constant_reference);
+
+ let matrix = load_m512i_word(&matrices, i);
+ let vector = load_m512i_word(&vectors, i);
+ let reference = load_m512i_word(&references, i);
+
+ let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
+ assert_eq_m512i(result, reference);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let matrix = load_m512i_word(&matrices, i);
+ let vector = load_m512i_word(&vectors, i);
+ let result_zero =
+ _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
+ assert_eq_m512i(result_zero, _mm512_setzero_si512());
+ let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
+ let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
+ let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ let result_masked =
+ _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
+ let expected_masked =
+ _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
+ assert_eq_m512i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let left = load_m512i_word(&vectors, i);
+ let right = load_m512i_word(&matrices, i);
+ let result_left =
+ _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
+ assert_eq_m512i(result_left, left);
+ let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
+ let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
+ let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
+ let result_masked =
+ _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
+ let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
+ assert_eq_m512i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
+ let identity: i64 = 0x01_02_04_08_10_20_40_80;
+ const IDENTITY_BYTE: i32 = 0;
+ let constant: i64 = 0;
+ const CONSTANT_BYTE: i32 = 0x63;
+ let identity = _mm256_set1_epi64x(identity);
+ let constant = _mm256_set1_epi64x(constant);
+ let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);
+
+ let (bytes, more_bytes, _) = generate_byte_mul_test_data();
+ let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let data = load_m256i_word(&bytes, i);
+ let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
+ assert_eq_m256i(result, data);
+ let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
+ assert_eq_m256i(result, constant_reference);
+ let data = load_m256i_word(&more_bytes, i);
+ let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
+ assert_eq_m256i(result, data);
+ let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
+ assert_eq_m256i(result, constant_reference);
+
+ let matrix = load_m256i_word(&matrices, i);
+ let vector = load_m256i_word(&vectors, i);
+ let reference = load_m256i_word(&references, i);
+
+ let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
+ assert_eq_m256i(result, reference);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let matrix = load_m256i_word(&matrices, i);
+ let vector = load_m256i_word(&vectors, i);
+ let result_zero =
+ _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
+ assert_eq_m256i(result_zero, _mm256_setzero_si256());
+ let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
+ const MASK_WORDS: i32 = 0b11_01_10_00;
+ let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ let result_masked =
+ _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
+ let expected_masked =
+ _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
+ assert_eq_m256i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let left = load_m256i_word(&vectors, i);
+ let right = load_m256i_word(&matrices, i);
+ let result_left =
+ _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
+ assert_eq_m256i(result_left, left);
+ let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
+ const MASK_WORDS: i32 = 0b11_01_10_00;
+ let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
+ let result_masked =
+ _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
+ let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
+ assert_eq_m256i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_gf2p8affine_epi64_epi8() {
+ let identity: i64 = 0x01_02_04_08_10_20_40_80;
+ const IDENTITY_BYTE: i32 = 0;
+ let constant: i64 = 0;
+ const CONSTANT_BYTE: i32 = 0x63;
+ let identity = _mm_set1_epi64x(identity);
+ let constant = _mm_set1_epi64x(constant);
+ let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);
+
+ let (bytes, more_bytes, _) = generate_byte_mul_test_data();
+ let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let data = load_m128i_word(&bytes, i);
+ let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
+ assert_eq_m128i(result, data);
+ let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
+ assert_eq_m128i(result, constant_reference);
+ let data = load_m128i_word(&more_bytes, i);
+ let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
+ assert_eq_m128i(result, data);
+ let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
+ assert_eq_m128i(result, constant_reference);
+
+ let matrix = load_m128i_word(&matrices, i);
+ let vector = load_m128i_word(&vectors, i);
+ let reference = load_m128i_word(&references, i);
+
+ let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
+ assert_eq_m128i(result, reference);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let matrix = load_m128i_word(&matrices, i);
+ let vector = load_m128i_word(&vectors, i);
+ let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
+ assert_eq_m128i(result_zero, _mm_setzero_si128());
+ let mask_bytes: __mmask16 = 0x0F_F0;
+ const MASK_WORDS: i32 = 0b01_10;
+ let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ let result_masked =
+ _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
+ let expected_masked =
+ _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
+ assert_eq_m128i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let left = load_m128i_word(&vectors, i);
+ let right = load_m128i_word(&matrices, i);
+ let result_left =
+ _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
+ assert_eq_m128i(result_left, left);
+ let mask_bytes: __mmask16 = 0x0F_F0;
+ const MASK_WORDS: i32 = 0b01_10;
+ let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
+ let result_masked =
+ _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
+ let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
+ assert_eq_m128i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
+ let identity: i64 = 0x01_02_04_08_10_20_40_80;
+ const IDENTITY_BYTE: i32 = 0;
+ const CONSTANT_BYTE: i32 = 0x63;
+ let identity = _mm512_set1_epi64(identity);
+
+ // validate inversion
+ let (inputs, results) = generate_inv_tests_data();
+
+ for i in 0..NUM_BYTES_WORDS_512 {
+ let input = load_m512i_word(&inputs, i);
+ let reference = load_m512i_word(&results, i);
+ let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
+ let remultiplied = _mm512_gf2p8mul_epi8(result, input);
+ assert_eq_m512i(remultiplied, reference);
+ }
+
+ // validate subsequent affine operation
+ let (matrices, vectors, _affine_expected) =
+ generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let vector = load_m512i_word(&vectors, i);
+ let matrix = load_m512i_word(&matrices, i);
+
+ let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
+ let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
+ let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ assert_eq_m512i(result, reference);
+ }
+
+ // validate everything by virtue of checking against the AES SBox
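+ // Each byte of this constant is a rotation of 0x1F's bit pattern; together
+ // with the 0x63 constant it reproduces the AES affine step,
+ // S(x) = M * inv(x) + 0x63.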
+ const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
+ let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);
+
+ for i in 0..NUM_BYTES_WORDS_512 {
+ let reference = load_m512i_word(&AES_S_BOX, i);
+ let input = load_m512i_word(&inputs, i);
+ let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
+ assert_eq_m512i(result, reference);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let matrix = load_m512i_word(&matrices, i);
+ let vector = load_m512i_word(&vectors, i);
+ let result_zero =
+ _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
+ assert_eq_m512i(result_zero, _mm512_setzero_si512());
+ let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
+ let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
+ let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ let result_masked =
+ _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
+ let expected_masked =
+ _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
+ assert_eq_m512i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw")]
+ unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_512 {
+ let left = load_m512i_word(&vectors, i);
+ let right = load_m512i_word(&matrices, i);
+ let result_left =
+ _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
+ assert_eq_m512i(result_left, left);
+ let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
+ let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
+ let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
+ let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
+ left, mask_bytes, left, right,
+ );
+ let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
+ assert_eq_m512i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
+ let identity: i64 = 0x01_02_04_08_10_20_40_80;
+ const IDENTITY_BYTE: i32 = 0;
+ const CONSTANT_BYTE: i32 = 0x63;
+ let identity = _mm256_set1_epi64x(identity);
+
+ // validate inversion
+ let (inputs, results) = generate_inv_tests_data();
+
+ for i in 0..NUM_BYTES_WORDS_256 {
+ let input = load_m256i_word(&inputs, i);
+ let reference = load_m256i_word(&results, i);
+ let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
+ let remultiplied = _mm256_gf2p8mul_epi8(result, input);
+ assert_eq_m256i(remultiplied, reference);
+ }
+
+ // validate subsequent affine operation
+ let (matrices, vectors, _affine_expected) =
+ generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let vector = load_m256i_word(&vectors, i);
+ let matrix = load_m256i_word(&matrices, i);
+
+ let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
+ let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
+ let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ assert_eq_m256i(result, reference);
+ }
+
+ // validate everything by virtue of checking against the AES SBox
+ const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
+ let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);
+
+ for i in 0..NUM_BYTES_WORDS_256 {
+ let reference = load_m256i_word(&AES_S_BOX, i);
+ let input = load_m256i_word(&inputs, i);
+ let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
+ assert_eq_m256i(result, reference);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let matrix = load_m256i_word(&matrices, i);
+ let vector = load_m256i_word(&vectors, i);
+ let result_zero =
+ _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
+ assert_eq_m256i(result_zero, _mm256_setzero_si256());
+ let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
+ const MASK_WORDS: i32 = 0b11_01_10_00;
+ let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ let result_masked =
+ _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
+ let expected_masked =
+ _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
+ assert_eq_m256i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_256 {
+ let left = load_m256i_word(&vectors, i);
+ let right = load_m256i_word(&matrices, i);
+ let result_left =
+ _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
+ assert_eq_m256i(result_left, left);
+ let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
+ const MASK_WORDS: i32 = 0b11_01_10_00;
+ let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
+ let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
+ left, mask_bytes, left, right,
+ );
+ let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
+ assert_eq_m256i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
+ let identity: i64 = 0x01_02_04_08_10_20_40_80;
+ const IDENTITY_BYTE: i32 = 0;
+ const CONSTANT_BYTE: i32 = 0x63;
+ let identity = _mm_set1_epi64x(identity);
+
+ // validate inversion
+ let (inputs, results) = generate_inv_tests_data();
+
+ for i in 0..NUM_BYTES_WORDS_128 {
+ let input = load_m128i_word(&inputs, i);
+ let reference = load_m128i_word(&results, i);
+ let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
+ let remultiplied = _mm_gf2p8mul_epi8(result, input);
+ assert_eq_m128i(remultiplied, reference);
+ }
+
+ // validate subsequent affine operation
+ let (matrices, vectors, _affine_expected) =
+ generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let vector = load_m128i_word(&vectors, i);
+ let matrix = load_m128i_word(&matrices, i);
+
+ let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
+ let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
+ let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ assert_eq_m128i(result, reference);
+ }
+
+ // validate everything by virtue of checking against the AES SBox
+ const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
+ let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);
+
+ for i in 0..NUM_BYTES_WORDS_128 {
+ let reference = load_m128i_word(&AES_S_BOX, i);
+ let input = load_m128i_word(&inputs, i);
+ let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
+ assert_eq_m128i(result, reference);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let matrix = load_m128i_word(&matrices, i);
+ let vector = load_m128i_word(&vectors, i);
+ let result_zero =
+ _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
+ assert_eq_m128i(result_zero, _mm_setzero_si128());
+ let mask_bytes: __mmask16 = 0x0F_F0;
+ const MASK_WORDS: i32 = 0b01_10;
+ let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
+ let result_masked =
+ _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
+ let expected_masked =
+ _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
+ assert_eq_m128i(result_masked, expected_masked);
+ }
+ }
+
+ #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
+ unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
+ const CONSTANT_BYTE: i32 = 0x63;
+ let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
+
+ for i in 0..NUM_TEST_WORDS_128 {
+ let left = load_m128i_word(&vectors, i);
+ let right = load_m128i_word(&matrices, i);
+ let result_left =
+ _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
+ assert_eq_m128i(result_left, left);
+ let mask_bytes: __mmask16 = 0x0F_F0;
+ const MASK_WORDS: i32 = 0b01_10;
+ let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
+ let result_masked =
+ _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
+ let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
+ assert_eq_m128i(result_masked, expected_masked);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
new file mode 100644
index 000000000..26aa0320f
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
@@ -0,0 +1,196 @@
+use crate::core_arch::x86::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result to the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512IFMA52&expand=3488)
+#[inline]
+#[target_feature(enable = "avx512ifma")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ vpmadd52huq_512(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result to the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52)
+#[inline]
+#[target_feature(enable = "avx512ifma")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ vpmadd52luq_512(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result to the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ vpmadd52huq_256(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result to the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ vpmadd52luq_256(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result to the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ vpmadd52huq_128(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result to the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ vpmadd52luq_128(a, b, c)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
+ fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
+ #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"]
+ fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
+ #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"]
+ fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
+ #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"]
+ fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
+ #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"]
+ fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
+ #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"]
+ fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
+}
+
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
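+ // A scalar model of the 52-bit multiply-add, included so the expected
+ // values in the tests below can be reproduced by hand; it mirrors the
+ // documented semantics and is an illustrative helper, not part of the
+ // intrinsics API.
+ #[allow(dead_code)]
+ fn madd52(a: u64, b: u64, c: u64, high: bool) -> u64 {
+ const MASK52: u64 = (1 << 52) - 1;
+ let product = u128::from(b & MASK52) * u128::from(c & MASK52);
+ let addend = if high {
+ product >> 52
+ } else {
+ product & u128::from(MASK52)
+ };
+ a.wrapping_add(addend as u64)
+ }
+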
+ #[simd_test(enable = "avx512ifma")]
+ unsafe fn test_mm512_madd52hi_epu64() {
+ let mut a = _mm512_set1_epi64(10 << 40);
+ let b = _mm512_set1_epi64((11 << 40) + 4);
+ let c = _mm512_set1_epi64((12 << 40) + 3);
+
+ a = _mm512_madd52hi_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+ let expected = _mm512_set1_epi64(11030549757952);
+
+ assert_eq_m512i(a, expected);
+ }
+
+ #[simd_test(enable = "avx512ifma")]
+ unsafe fn test_mm512_madd52lo_epu64() {
+ let mut a = _mm512_set1_epi64(10 << 40);
+ let b = _mm512_set1_epi64((11 << 40) + 4);
+ let c = _mm512_set1_epi64((12 << 40) + 3);
+
+ a = _mm512_madd52lo_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+ let expected = _mm512_set1_epi64(100055558127628);
+
+ assert_eq_m512i(a, expected);
+ }
+
+ #[simd_test(enable = "avx512ifma,avx512vl")]
+ unsafe fn test_mm256_madd52hi_epu64() {
+ let mut a = _mm256_set1_epi64x(10 << 40);
+ let b = _mm256_set1_epi64x((11 << 40) + 4);
+ let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+ a = _mm256_madd52hi_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+ let expected = _mm256_set1_epi64x(11030549757952);
+
+ assert_eq_m256i(a, expected);
+ }
+
+ #[simd_test(enable = "avx512ifma,avx512vl")]
+ unsafe fn test_mm256_madd52lo_epu64() {
+ let mut a = _mm256_set1_epi64x(10 << 40);
+ let b = _mm256_set1_epi64x((11 << 40) + 4);
+ let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+ a = _mm256_madd52lo_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+ let expected = _mm256_set1_epi64x(100055558127628);
+
+ assert_eq_m256i(a, expected);
+ }
+
+ #[simd_test(enable = "avx512ifma,avx512vl")]
+ unsafe fn test_mm_madd52hi_epu64() {
+ let mut a = _mm_set1_epi64x(10 << 40);
+ let b = _mm_set1_epi64x((11 << 40) + 4);
+ let c = _mm_set1_epi64x((12 << 40) + 3);
+
+ a = _mm_madd52hi_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+ let expected = _mm_set1_epi64x(11030549757952);
+
+ assert_eq_m128i(a, expected);
+ }
+
+ #[simd_test(enable = "avx512ifma,avx512vl")]
+ unsafe fn test_mm_madd52lo_epu64() {
+ let mut a = _mm_set1_epi64x(10 << 40);
+ let b = _mm_set1_epi64x((11 << 40) + 4);
+ let c = _mm_set1_epi64x((12 << 40) + 3);
+
+ a = _mm_madd52lo_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+ let expected = _mm_set1_epi64x(100055558127628);
+
+ assert_eq_m128i(a, expected);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vaes.rs b/library/stdarch/crates/core_arch/src/x86/avx512vaes.rs
new file mode 100644
index 000000000..676de312b
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vaes.rs
@@ -0,0 +1,332 @@
+//! Vectorized AES Instructions (VAES)
+//!
+//! The intrinsics here correspond to those in the `immintrin.h` C header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use crate::core_arch::x86::__m256i;
+use crate::core_arch::x86::__m512i;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.aesni.aesenc.256"]
+ fn aesenc_256(a: __m256i, round_key: __m256i) -> __m256i;
+ #[link_name = "llvm.x86.aesni.aesenclast.256"]
+ fn aesenclast_256(a: __m256i, round_key: __m256i) -> __m256i;
+ #[link_name = "llvm.x86.aesni.aesdec.256"]
+ fn aesdec_256(a: __m256i, round_key: __m256i) -> __m256i;
+ #[link_name = "llvm.x86.aesni.aesdeclast.256"]
+ fn aesdeclast_256(a: __m256i, round_key: __m256i) -> __m256i;
+ #[link_name = "llvm.x86.aesni.aesenc.512"]
+ fn aesenc_512(a: __m512i, round_key: __m512i) -> __m512i;
+ #[link_name = "llvm.x86.aesni.aesenclast.512"]
+ fn aesenclast_512(a: __m512i, round_key: __m512i) -> __m512i;
+ #[link_name = "llvm.x86.aesni.aesdec.512"]
+ fn aesdec_512(a: __m512i, round_key: __m512i) -> __m512i;
+ #[link_name = "llvm.x86.aesni.aesdeclast.512"]
+ fn aesdeclast_512(a: __m512i, round_key: __m512i) -> __m512i;
+}
+
+/// Performs one round of an AES encryption flow on each 128-bit word (state) in `a` using
+/// the corresponding 128-bit word (key) in `round_key`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_aesenc_epi128)
+#[inline]
+#[target_feature(enable = "avx512vaes,avx512vl")]
+#[cfg_attr(test, assert_instr(vaesenc))]
+pub unsafe fn _mm256_aesenc_epi128(a: __m256i, round_key: __m256i) -> __m256i {
+ aesenc_256(a, round_key)
+}
+
+/// Performs the last round of an AES encryption flow on each 128-bit word (state) in `a` using
+/// the corresponding 128-bit word (key) in `round_key`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_aesenclast_epi128)
+#[inline]
+#[target_feature(enable = "avx512vaes,avx512vl")]
+#[cfg_attr(test, assert_instr(vaesenclast))]
+pub unsafe fn _mm256_aesenclast_epi128(a: __m256i, round_key: __m256i) -> __m256i {
+ aesenclast_256(a, round_key)
+}
+
+/// Performs one round of an AES decryption flow on each 128-bit word (state) in `a` using
+/// the corresponding 128-bit word (key) in `round_key`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_aesdec_epi128)
+#[inline]
+#[target_feature(enable = "avx512vaes,avx512vl")]
+#[cfg_attr(test, assert_instr(vaesdec))]
+pub unsafe fn _mm256_aesdec_epi128(a: __m256i, round_key: __m256i) -> __m256i {
+ aesdec_256(a, round_key)
+}
+
+/// Performs the last round of an AES decryption flow on each 128-bit word (state) in `a` using
+/// the corresponding 128-bit word (key) in `round_key`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_aesdeclast_epi128)
+#[inline]
+#[target_feature(enable = "avx512vaes,avx512vl")]
+#[cfg_attr(test, assert_instr(vaesdeclast))]
+pub unsafe fn _mm256_aesdeclast_epi128(a: __m256i, round_key: __m256i) -> __m256i {
+ aesdeclast_256(a, round_key)
+}
+
+/// Performs one round of an AES encryption flow on each 128-bit word (state) in `a` using
+/// the corresponding 128-bit word (key) in `round_key`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_aesenc_epi128)
+#[inline]
+#[target_feature(enable = "avx512vaes,avx512f")]
+#[cfg_attr(test, assert_instr(vaesenc))]
+pub unsafe fn _mm512_aesenc_epi128(a: __m512i, round_key: __m512i) -> __m512i {
+ aesenc_512(a, round_key)
+}
+
+/// Performs the last round of an AES encryption flow on each 128-bit word (state) in `a` using
+/// the corresponding 128-bit word (key) in `round_key`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_aesenclast_epi128)
+#[inline]
+#[target_feature(enable = "avx512vaes,avx512f")]
+#[cfg_attr(test, assert_instr(vaesenclast))]
+pub unsafe fn _mm512_aesenclast_epi128(a: __m512i, round_key: __m512i) -> __m512i {
+ aesenclast_512(a, round_key)
+}
+
+/// Performs one round of an AES decryption flow on each 128-bit word (state) in `a` using
+/// the corresponding 128-bit word (key) in `round_key`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_aesdec_epi128)
+#[inline]
+#[target_feature(enable = "avx512vaes,avx512f")]
+#[cfg_attr(test, assert_instr(vaesdec))]
+pub unsafe fn _mm512_aesdec_epi128(a: __m512i, round_key: __m512i) -> __m512i {
+ aesdec_512(a, round_key)
+}
+
+/// Performs the last round of an AES decryption flow on each 128-bit word (state) in `a` using
+/// the corresponding 128-bit word (key) in `round_key`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_aesdeclast_epi128)
+#[inline]
+#[target_feature(enable = "avx512vaes,avx512f")]
+#[cfg_attr(test, assert_instr(vaesdeclast))]
+pub unsafe fn _mm512_aesdeclast_epi128(a: __m512i, round_key: __m512i) -> __m512i {
+ aesdeclast_512(a, round_key)
+}
+
+#[cfg(test)]
+mod tests {
+ // The constants in the tests below are just bit patterns. They should not
+ // be interpreted as integers; signedness does not make sense for them, but
+ // __mXXXi happens to be defined in terms of signed integers.
+ #![allow(overflowing_literals)]
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ // The first part of each test below is a straight port from the AES-NI tests;
+ // the second part compares the wide intrinsic directly against its 128-bit
+ // counterpart, on inputs that differ across lanes and are "more random" than
+ // the standard test vectors. Ideally we'd be using quickcheck here instead.
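+
+ // A minimal usage sketch (illustrative only): a full AES-128 encryption of
+ // four independent blocks at once, built from the 512-bit round intrinsics.
+ // The `round_keys` schedule (11 round keys, each broadcast to 512 bits) is
+ // assumed to be expanded elsewhere; key expansion is out of scope here.
+ #[allow(dead_code)]
+ #[target_feature(enable = "avx512vaes,avx512f")]
+ unsafe fn aes128_encrypt_x4(blocks: __m512i, round_keys: &[__m512i; 11]) -> __m512i {
+ // initial whitening, nine full rounds, then the final round
+ let mut state = _mm512_xor_si512(blocks, round_keys[0]);
+ for rk in &round_keys[1..10] {
+ state = _mm512_aesenc_epi128(state, *rk);
+ }
+ _mm512_aesenclast_epi128(state, round_keys[10])
+ }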
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn helper_for_256_avx512vaes(
+ linear: unsafe fn(__m128i, __m128i) -> __m128i,
+ vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
+ ) {
+ let a = _mm256_set_epi64x(
+ 0xDCB4DB3657BF0B7D,
+ 0x18DB0601068EDD9F,
+ 0xB76B908233200DC5,
+ 0xE478235FA8E22D5E,
+ );
+ let k = _mm256_set_epi64x(
+ 0x672F6F105A94CEA7,
+ 0x8298B8FFCA5F829C,
+ 0xA3927047B3FB61D8,
+ 0x978093862CDE7187,
+ );
+ let mut a_decomp = [_mm_setzero_si128(); 2];
+ a_decomp[0] = _mm256_extracti128_si256::<0>(a);
+ a_decomp[1] = _mm256_extracti128_si256::<1>(a);
+ let mut k_decomp = [_mm_setzero_si128(); 2];
+ k_decomp[0] = _mm256_extracti128_si256::<0>(k);
+ k_decomp[1] = _mm256_extracti128_si256::<1>(k);
+ let r = vectorized(a, k);
+ let mut e_decomp = [_mm_setzero_si128(); 2];
+ for i in 0..2 {
+ e_decomp[i] = linear(a_decomp[i], k_decomp[i]);
+ }
+ assert_eq_m128i(_mm256_extracti128_si256::<0>(r), e_decomp[0]);
+ assert_eq_m128i(_mm256_extracti128_si256::<1>(r), e_decomp[1]);
+ }
+
+ #[target_feature(enable = "sse2")]
+ unsafe fn setup_state_key<T>(broadcast: unsafe fn(__m128i) -> T) -> (T, T) {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx.
+ let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff);
+ let k = _mm_set_epi64x(0x1133557799bbddff, 0x0022446688aaccee);
+ (broadcast(a), broadcast(k))
+ }
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn setup_state_key_256() -> (__m256i, __m256i) {
+ setup_state_key(_mm256_broadcastsi128_si256)
+ }
+
+ #[target_feature(enable = "avx512f")]
+ unsafe fn setup_state_key_512() -> (__m512i, __m512i) {
+ setup_state_key(_mm512_broadcast_i32x4)
+ }
+
+ #[simd_test(enable = "avx512vaes,avx512vl")]
+ unsafe fn test_mm256_aesdec_epi128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx.
+ let (a, k) = setup_state_key_256();
+ let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee);
+ let e = _mm256_broadcastsi128_si256(e);
+ let r = _mm256_aesdec_epi128(a, k);
+ assert_eq_m256i(r, e);
+
+ helper_for_256_avx512vaes(_mm_aesdec_si128, _mm256_aesdec_epi128);
+ }
+
+ #[simd_test(enable = "avx512vaes,avx512vl")]
+ unsafe fn test_mm256_aesdeclast_epi128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx.
+ let (a, k) = setup_state_key_256();
+ let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493);
+ let e = _mm256_broadcastsi128_si256(e);
+ let r = _mm256_aesdeclast_epi128(a, k);
+ assert_eq_m256i(r, e);
+
+ helper_for_256_avx512vaes(_mm_aesdeclast_si128, _mm256_aesdeclast_epi128);
+ }
+
+ #[simd_test(enable = "avx512vaes,avx512vl")]
+ unsafe fn test_mm256_aesenc_epi128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx.
+ // The 128-bit constants are broadcast across the wider register.
+ let (a, k) = setup_state_key_256();
+ let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333);
+ let e = _mm256_broadcastsi128_si256(e);
+ let r = _mm256_aesenc_epi128(a, k);
+ assert_eq_m256i(r, e);
+
+ helper_for_256_avx512vaes(_mm_aesenc_si128, _mm256_aesenc_epi128);
+ }
+
+ #[simd_test(enable = "avx512vaes,avx512vl")]
+ unsafe fn test_mm256_aesenclast_epi128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx.
+ let (a, k) = setup_state_key_256();
+ let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8);
+ let e = _mm256_broadcastsi128_si256(e);
+ let r = _mm256_aesenclast_epi128(a, k);
+ assert_eq_m256i(r, e);
+
+ helper_for_256_avx512vaes(_mm_aesenclast_si128, _mm256_aesenclast_epi128);
+ }
+
+ #[target_feature(enable = "avx512f")]
+ unsafe fn helper_for_512_avx512vaes(
+ linear: unsafe fn(__m128i, __m128i) -> __m128i,
+ vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
+ ) {
+ let a = _mm512_set_epi64(
+ 0xDCB4DB3657BF0B7D,
+ 0x18DB0601068EDD9F,
+ 0xB76B908233200DC5,
+ 0xE478235FA8E22D5E,
+ 0xAB05CFFA2621154C,
+ 0x1171B47A186174C9,
+ 0x8C6B6C0E7595CEC9,
+ 0xBE3E7D4934E961BD,
+ );
+ let k = _mm512_set_epi64(
+ 0x672F6F105A94CEA7,
+ 0x8298B8FFCA5F829C,
+ 0xA3927047B3FB61D8,
+ 0x978093862CDE7187,
+ 0xB1927AB22F31D0EC,
+ 0xA9A5DA619BE4D7AF,
+ 0xCA2590F56884FDC6,
+ 0x19BE9F660038BDB5,
+ );
+ let mut a_decomp = [_mm_setzero_si128(); 4];
+ a_decomp[0] = _mm512_extracti32x4_epi32::<0>(a);
+ a_decomp[1] = _mm512_extracti32x4_epi32::<1>(a);
+ a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);
+ a_decomp[3] = _mm512_extracti32x4_epi32::<3>(a);
+ let mut k_decomp = [_mm_setzero_si128(); 4];
+ k_decomp[0] = _mm512_extracti32x4_epi32::<0>(k);
+ k_decomp[1] = _mm512_extracti32x4_epi32::<1>(k);
+ k_decomp[2] = _mm512_extracti32x4_epi32::<2>(k);
+ k_decomp[3] = _mm512_extracti32x4_epi32::<3>(k);
+ let r = vectorized(a, k);
+ let mut e_decomp = [_mm_setzero_si128(); 4];
+ for i in 0..4 {
+ e_decomp[i] = linear(a_decomp[i], k_decomp[i]);
+ }
+ assert_eq_m128i(_mm512_extracti32x4_epi32::<0>(r), e_decomp[0]);
+ assert_eq_m128i(_mm512_extracti32x4_epi32::<1>(r), e_decomp[1]);
+ assert_eq_m128i(_mm512_extracti32x4_epi32::<2>(r), e_decomp[2]);
+ assert_eq_m128i(_mm512_extracti32x4_epi32::<3>(r), e_decomp[3]);
+ }
+
+ #[simd_test(enable = "avx512vaes,avx512f")]
+ unsafe fn test_mm512_aesdec_epi128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc664949.aspx.
+ let (a, k) = setup_state_key_512();
+ let e = _mm_set_epi64x(0x044e4f5176fec48f, 0xb57ecfa381da39ee);
+ let e = _mm512_broadcast_i32x4(e);
+ let r = _mm512_aesdec_epi128(a, k);
+ assert_eq_m512i(r, e);
+
+ helper_for_512_avx512vaes(_mm_aesdec_si128, _mm512_aesdec_epi128);
+ }
+
+ #[simd_test(enable = "avx512vaes,avx512f")]
+ unsafe fn test_mm512_aesdeclast_epi128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc714178.aspx.
+ let (a, k) = setup_state_key_512();
+ let e = _mm_set_epi64x(0x36cad57d9072bf9e, 0xf210dd981fa4a493);
+ let e = _mm512_broadcast_i32x4(e);
+ let r = _mm512_aesdeclast_epi128(a, k);
+ assert_eq_m512i(r, e);
+
+ helper_for_512_avx512vaes(_mm_aesdeclast_si128, _mm512_aesdeclast_epi128);
+ }
+
+ #[simd_test(enable = "avx512vaes,avx512f")]
+ unsafe fn test_mm512_aesenc_epi128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc664810.aspx.
+ let (a, k) = setup_state_key_512();
+ let e = _mm_set_epi64x(0x16ab0e57dfc442ed, 0x28e4ee1884504333);
+ let e = _mm512_broadcast_i32x4(e);
+ let r = _mm512_aesenc_epi128(a, k);
+ assert_eq_m512i(r, e);
+
+ helper_for_512_avx512vaes(_mm_aesenc_si128, _mm512_aesenc_epi128);
+ }
+
+ #[simd_test(enable = "avx512vaes,avx512f")]
+ unsafe fn test_mm512_aesenclast_epi128() {
+ // Constants taken from https://msdn.microsoft.com/en-us/library/cc714136.aspx.
+ let (a, k) = setup_state_key_512();
+ let e = _mm_set_epi64x(0xb6dd7df25d7ab320, 0x4b04f98cf4c860f8);
+ let e = _mm512_broadcast_i32x4(e);
+ let r = _mm512_aesenclast_epi128(a, k);
+ assert_eq_m512i(r, e);
+
+ helper_for_512_avx512vaes(_mm_aesenclast_si128, _mm512_aesenclast_epi128);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs b/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs
new file mode 100644
index 000000000..f0ff75162
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs
@@ -0,0 +1,916 @@
+use crate::core_arch::{simd::*, simd_llvm::*, x86::*};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi8&expand=4262)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
+pub unsafe fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
+ transmute(vpermi2b(a.as_i8x64(), idx.as_i8x64(), b.as_i8x64()))
+}
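+
+// A scalar model of the two-source byte permute above, as an illustrative
+// sketch (the names here are ours): index bits [5:0] pick one of the 64
+// bytes from the concatenation of a and b, with bit 6 selecting the table.
+// The narrower variants below work the same way with fewer index bits
+// (bit 5 for 256-bit, bit 4 for 128-bit).
+#[cfg(test)]
+#[allow(dead_code)]
+fn permutex2var_epi8_model(a: [u8; 64], idx: [u8; 64], b: [u8; 64]) -> [u8; 64] {
+ let mut dst = [0u8; 64];
+ for i in 0..64 {
+ let off = usize::from(idx[i] & 0x3F);
+ dst[i] = if idx[i] & 0x40 != 0 { b[off] } else { a[off] };
+ }
+ dst
+}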
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi8&expand=4259)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vpermt2b))]
+pub unsafe fn _mm512_mask_permutex2var_epi8(
+ a: __m512i,
+ k: __mmask64,
+ idx: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
+ transmute(simd_select_bitmask(k, permute, a.as_i8x64()))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi8&expand=4261)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
+pub unsafe fn _mm512_maskz_permutex2var_epi8(
+ k: __mmask64,
+ a: __m512i,
+ idx: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi8&expand=4260)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vpermi2b))]
+pub unsafe fn _mm512_mask2_permutex2var_epi8(
+ a: __m512i,
+ idx: __m512i,
+ k: __mmask64,
+ b: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
+ transmute(simd_select_bitmask(k, permute, idx.as_i8x64()))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_epi8&expand=4258)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
+pub unsafe fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> __m256i {
+ transmute(vpermi2b256(a.as_i8x32(), idx.as_i8x32(), b.as_i8x32()))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_epi8&expand=4255)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2b))]
+pub unsafe fn _mm256_mask_permutex2var_epi8(
+ a: __m256i,
+ k: __mmask32,
+ idx: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
+ transmute(simd_select_bitmask(k, permute, a.as_i8x32()))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_epi8&expand=4257)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermi2b
+pub unsafe fn _mm256_maskz_permutex2var_epi8(
+ k: __mmask32,
+ a: __m256i,
+ idx: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_epi8&expand=4256)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermi2b))]
+pub unsafe fn _mm256_mask2_permutex2var_epi8(
+ a: __m256i,
+ idx: __m256i,
+ k: __mmask32,
+ b: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
+ transmute(simd_select_bitmask(k, permute, idx.as_i8x32()))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_epi8&expand=4254)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermi2b
+pub unsafe fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
+ transmute(vpermi2b128(a.as_i8x16(), idx.as_i8x16(), b.as_i8x16()))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_epi8&expand=4251)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermt2b))]
+pub unsafe fn _mm_mask_permutex2var_epi8(
+ a: __m128i,
+ k: __mmask16,
+ idx: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
+ transmute(simd_select_bitmask(k, permute, a.as_i8x16()))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_epi8&expand=4253)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermi2b
+pub unsafe fn _mm_maskz_permutex2var_epi8(
+ k: __mmask16,
+ a: __m128i,
+ idx: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_epi8&expand=4252)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermi2b))]
+pub unsafe fn _mm_mask2_permutex2var_epi8(
+ a: __m128i,
+ idx: __m128i,
+ k: __mmask16,
+ b: __m128i,
+) -> __m128i {
+ let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
+ transmute(simd_select_bitmask(k, permute, idx.as_i8x16()))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi8&expand=4316)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i {
+ transmute(vpermb(a.as_i8x64(), idx.as_i8x64()))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi8&expand=4314)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm512_mask_permutexvar_epi8(
+ src: __m512i,
+ k: __mmask64,
+ idx: __m512i,
+ a: __m512i,
+) -> __m512i {
+ let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64();
+ transmute(simd_select_bitmask(k, permute, src.as_i8x64()))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi8&expand=4315)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i {
+ let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_epi8&expand=4313)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i {
+ transmute(vpermb256(a.as_i8x32(), idx.as_i8x32()))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_epi8&expand=4311)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm256_mask_permutexvar_epi8(
+ src: __m256i,
+ k: __mmask32,
+ idx: __m256i,
+ a: __m256i,
+) -> __m256i {
+ let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32();
+ transmute(simd_select_bitmask(k, permute, src.as_i8x32()))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_epi8&expand=4312)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i {
+ let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_epi8&expand=4310)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i {
+ transmute(vpermb128(a.as_i8x16(), idx.as_i8x16()))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutexvar_epi8&expand=4308)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm_mask_permutexvar_epi8(
+ src: __m128i,
+ k: __mmask16,
+ idx: __m128i,
+ a: __m128i,
+) -> __m128i {
+ let permute = _mm_permutexvar_epi8(idx, a).as_i8x16();
+ transmute(simd_select_bitmask(k, permute, src.as_i8x16()))
+}
+
+/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutexvar_epi8&expand=4309)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermb))]
+pub unsafe fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i {
+ let permute = _mm_permutexvar_epi8(idx, a).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, permute, zero))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_multishift_epi64_epi8&expand=4026)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpmultishiftqb(a.as_i8x64(), b.as_i8x64()))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_multishift_epi64_epi8&expand=4024)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm512_mask_multishift_epi64_epi8(
+ src: __m512i,
+ k: __mmask64,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64();
+ transmute(simd_select_bitmask(k, multishift, src.as_i8x64()))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_multishift_epi64_epi8&expand=4025)
+#[inline]
+#[target_feature(enable = "avx512vbmi")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+ let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64();
+ let zero = _mm512_setzero_si512().as_i8x64();
+ transmute(simd_select_bitmask(k, multishift, zero))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_multishift_epi64_epi8&expand=4023)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpmultishiftqb256(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_multishift_epi64_epi8&expand=4021)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm256_mask_multishift_epi64_epi8(
+ src: __m256i,
+ k: __mmask32,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32();
+ transmute(simd_select_bitmask(k, multishift, src.as_i8x32()))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_multishift_epi64_epi8&expand=4022)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+ let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ transmute(simd_select_bitmask(k, multishift, zero))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_multishift_epi64_epi8&expand=4020)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpmultishiftqb128(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_multishift_epi64_epi8&expand=4018)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm_mask_multishift_epi64_epi8(
+ src: __m128i,
+ k: __mmask16,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16();
+ transmute(simd_select_bitmask(k, multishift, src.as_i8x16()))
+}
+
+/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_multishift_epi64_epi8&expand=4019)
+#[inline]
+#[target_feature(enable = "avx512vbmi,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmultishiftqb))]
+pub unsafe fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+ let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16();
+ let zero = _mm_setzero_si128().as_i8x16();
+ transmute(simd_select_bitmask(k, multishift, zero))
+}
+
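+// Bindings to LLVM's internal intrinsics for the AVX-512 VBMI instructions
+// (vpermi2b, vpermb, vpmultishiftqb) at each vector width; the public wrappers
+// above transmute through these declarations.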
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx512.vpermi2var.qi.512"]
+ fn vpermi2b(a: i8x64, idx: i8x64, b: i8x64) -> i8x64;
+ #[link_name = "llvm.x86.avx512.vpermi2var.qi.256"]
+ fn vpermi2b256(a: i8x32, idx: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx512.vpermi2var.qi.128"]
+ fn vpermi2b128(a: i8x16, idx: i8x16, b: i8x16) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.permvar.qi.512"]
+ fn vpermb(a: i8x64, idx: i8x64) -> i8x64;
+ #[link_name = "llvm.x86.avx512.permvar.qi.256"]
+ fn vpermb256(a: i8x32, idx: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx512.permvar.qi.128"]
+ fn vpermb128(a: i8x16, idx: i8x16) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.pmultishift.qb.512"]
+ fn vpmultishiftqb(a: i8x64, b: i8x64) -> i8x64;
+ #[link_name = "llvm.x86.avx512.pmultishift.qb.256"]
+ fn vpmultishiftqb256(a: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx512.pmultishift.qb.128"]
+ fn vpmultishiftqb128(a: i8x16, b: i8x16) -> i8x16;
+}
+
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_permutex2var_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6,
+ 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6,
+ 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6,
+ 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6);
+ let b = _mm512_set1_epi8(100);
+ let r = _mm512_permutex2var_epi8(a, idx, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100,
+ 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100,
+ 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100,
+ 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
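+    // Added illustrative sketch (not part of the original patch): for the 512-bit
+    // two-source byte permute, bits 5:0 of each idx byte select a lane and bit 6
+    // selects the source vector (0 = a, 1 = b). With bit 6 set in every lane, the
+    // whole result is taken from b.
+    #[simd_test(enable = "avx512vbmi")]
+    unsafe fn test_mm512_permutex2var_epi8_source_select() {
+        let a = _mm512_set1_epi8(7);
+        let b = _mm512_set1_epi8(9);
+        let idx = _mm512_set1_epi8(1 << 6);
+        let r = _mm512_permutex2var_epi8(a, idx, b);
+        assert_eq_m512i(r, _mm512_set1_epi8(9));
+    }
+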
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_mask_permutex2var_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6,
+ 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6,
+ 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6,
+ 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6);
+ let b = _mm512_set1_epi8(100);
+ let r = _mm512_mask_permutex2var_epi8(a, 0, idx, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutex2var_epi8(
+ a,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ idx,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100,
+ 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100,
+ 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100,
+ 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_maskz_permutex2var_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6,
+ 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6,
+ 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6,
+ 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6);
+ let b = _mm512_set1_epi8(100);
+ let r = _mm512_maskz_permutex2var_epi8(0, a, idx, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutex2var_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ idx,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100,
+ 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100,
+ 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100,
+ 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_mask2_permutex2var_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ #[rustfmt::skip]
+ let idx = _mm512_set_epi8(1, 1<<6, 2, 1<<6, 3, 1<<6, 4, 1<<6, 5, 1<<6, 6, 1<<6, 7, 1<<6, 8, 1<<6,
+ 9, 1<<6, 10, 1<<6, 11, 1<<6, 12, 1<<6, 13, 1<<6, 14, 1<<6, 15, 1<<6, 16, 1<<6,
+ 17, 1<<6, 18, 1<<6, 19, 1<<6, 20, 1<<6, 21, 1<<6, 22, 1<<6, 23, 1<<6, 24, 1<<6,
+ 25, 1<<6, 26, 1<<6, 27, 1<<6, 28, 1<<6, 29, 1<<6, 30, 1<<6, 31, 1<<6, 32, 1<<6);
+ let b = _mm512_set1_epi8(100);
+ let r = _mm512_mask2_permutex2var_epi8(a, idx, 0, b);
+ assert_eq_m512i(r, idx);
+ let r = _mm512_mask2_permutex2var_epi8(
+ a,
+ idx,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ b,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 62, 100, 61, 100, 60, 100, 59, 100, 58, 100, 57, 100, 56, 100, 55, 100,
+ 54, 100, 53, 100, 52, 100, 51, 100, 50, 100, 49, 100, 48, 100, 47, 100,
+ 46, 100, 45, 100, 44, 100, 43, 100, 42, 100, 41, 100, 40, 100, 39, 100,
+ 38, 100, 37, 100, 36, 100, 35, 100, 34, 100, 33, 100, 32, 100, 31, 100,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_permutex2var_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ #[rustfmt::skip]
+ let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5,
+ 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5);
+ let b = _mm256_set1_epi8(100);
+ let r = _mm256_permutex2var_epi8(a, idx, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100,
+ 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_mask_permutex2var_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ #[rustfmt::skip]
+ let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5,
+ 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5);
+ let b = _mm256_set1_epi8(100);
+ let r = _mm256_mask_permutex2var_epi8(a, 0, idx, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutex2var_epi8(a, 0b11111111_11111111_11111111_11111111, idx, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100,
+ 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_maskz_permutex2var_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ #[rustfmt::skip]
+ let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5,
+ 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5);
+ let b = _mm256_set1_epi8(100);
+ let r = _mm256_maskz_permutex2var_epi8(0, a, idx, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutex2var_epi8(0b11111111_11111111_11111111_11111111, a, idx, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100,
+ 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_mask2_permutex2var_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ #[rustfmt::skip]
+ let idx = _mm256_set_epi8(1, 1<<5, 2, 1<<5, 3, 1<<5, 4, 1<<5, 5, 1<<5, 6, 1<<5, 7, 1<<5, 8, 1<<5,
+ 9, 1<<5, 10, 1<<5, 11, 1<<5, 12, 1<<5, 13, 1<<5, 14, 1<<5, 15, 1<<5, 16, 1<<5);
+ let b = _mm256_set1_epi8(100);
+ let r = _mm256_mask2_permutex2var_epi8(a, idx, 0, b);
+ assert_eq_m256i(r, idx);
+ let r = _mm256_mask2_permutex2var_epi8(a, idx, 0b11111111_11111111_11111111_11111111, b);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 30, 100, 29, 100, 28, 100, 27, 100, 26, 100, 25, 100, 24, 100, 23, 100,
+ 22, 100, 21, 100, 20, 100, 19, 100, 18, 100, 17, 100, 16, 100, 15, 100,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_permutex2var_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4);
+ let b = _mm_set1_epi8(100);
+ let r = _mm_permutex2var_epi8(a, idx, b);
+ let e = _mm_set_epi8(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_mask_permutex2var_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4);
+ let b = _mm_set1_epi8(100);
+ let r = _mm_mask_permutex2var_epi8(a, 0, idx, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_permutex2var_epi8(a, 0b11111111_11111111, idx, b);
+ let e = _mm_set_epi8(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_maskz_permutex2var_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4);
+ let b = _mm_set1_epi8(100);
+ let r = _mm_maskz_permutex2var_epi8(0, a, idx, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_permutex2var_epi8(0b11111111_11111111, a, idx, b);
+ let e = _mm_set_epi8(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_mask2_permutex2var_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let idx = _mm_set_epi8(1, 1 << 4, 2, 1 << 4, 3, 1 << 4, 4, 1 << 4, 5, 1 << 4, 6, 1 << 4, 7, 1 << 4, 8, 1 << 4);
+ let b = _mm_set1_epi8(100);
+ let r = _mm_mask2_permutex2var_epi8(a, idx, 0, b);
+ assert_eq_m128i(r, idx);
+ let r = _mm_mask2_permutex2var_epi8(a, idx, 0b11111111_11111111, b);
+ let e = _mm_set_epi8(
+ 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_permutexvar_epi8() {
+ let idx = _mm512_set1_epi8(1);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let r = _mm512_permutexvar_epi8(idx, a);
+ let e = _mm512_set1_epi8(62);
+ assert_eq_m512i(r, e);
+ }
+
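+    // Added illustrative sketch (not part of the original patch): vpermb only uses
+    // bits 5:0 of each index byte for 64-lane vectors, so an index of 65 selects
+    // the same lane as an index of 1.
+    #[simd_test(enable = "avx512vbmi")]
+    unsafe fn test_mm512_permutexvar_epi8_index_wraps() {
+        let idx = _mm512_set1_epi8(65); // 65 & 0b11_1111 == 1
+        #[rustfmt::skip]
+        let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+                                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+        let r = _mm512_permutexvar_epi8(idx, a);
+        assert_eq_m512i(r, _mm512_set1_epi8(62)); // same result as idx == 1
+    }
+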
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_mask_permutexvar_epi8() {
+ let idx = _mm512_set1_epi8(1);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let r = _mm512_mask_permutexvar_epi8(a, 0, idx, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutexvar_epi8(
+ a,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ idx,
+ a,
+ );
+ let e = _mm512_set1_epi8(62);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_maskz_permutexvar_epi8() {
+ let idx = _mm512_set1_epi8(1);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let r = _mm512_maskz_permutexvar_epi8(0, idx, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutexvar_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ idx,
+ a,
+ );
+ let e = _mm512_set1_epi8(62);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_permutexvar_epi8() {
+ let idx = _mm256_set1_epi8(1);
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm256_permutexvar_epi8(idx, a);
+ let e = _mm256_set1_epi8(30);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_mask_permutexvar_epi8() {
+ let idx = _mm256_set1_epi8(1);
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm256_mask_permutexvar_epi8(a, 0, idx, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutexvar_epi8(a, 0b11111111_11111111_11111111_11111111, idx, a);
+ let e = _mm256_set1_epi8(30);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_maskz_permutexvar_epi8() {
+ let idx = _mm256_set1_epi8(1);
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm256_maskz_permutexvar_epi8(0, idx, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutexvar_epi8(0b11111111_11111111_11111111_11111111, idx, a);
+ let e = _mm256_set1_epi8(30);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_permutexvar_epi8() {
+ let idx = _mm_set1_epi8(1);
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_permutexvar_epi8(idx, a);
+ let e = _mm_set1_epi8(14);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_mask_permutexvar_epi8() {
+ let idx = _mm_set1_epi8(1);
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_mask_permutexvar_epi8(a, 0, idx, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_permutexvar_epi8(a, 0b11111111_11111111, idx, a);
+ let e = _mm_set1_epi8(14);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_maskz_permutexvar_epi8() {
+ let idx = _mm_set1_epi8(1);
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_permutexvar_epi8(0, idx, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_permutexvar_epi8(0b11111111_11111111, idx, a);
+ let e = _mm_set1_epi8(14);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_multishift_epi64_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_multishift_epi64_epi8(a, b);
+ let e = _mm512_set1_epi8(1 << 7);
+ assert_eq_m512i(r, e);
+ }
+
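+    // Added illustrative sketch (not part of the original patch): each control byte
+    // in `a` is a bit offset (taken modulo 64) into the matching 64-bit element of
+    // `b`, and the result byte is the 8 bits starting at that offset. Controls
+    // 0, 8, 16, ..., 56 read the qword back byte by byte, so the operation is the
+    // identity on `b`.
+    #[simd_test(enable = "avx512vbmi")]
+    unsafe fn test_mm512_multishift_epi64_epi8_identity() {
+        let a = _mm512_set1_epi64(0x3830282018100800); // controls 0, 8, ..., 56
+        let b = _mm512_set1_epi64(0x0123456789abcdef);
+        let r = _mm512_multishift_epi64_epi8(a, b);
+        assert_eq_m512i(r, b);
+    }
+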
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_mask_multishift_epi64_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_mask_multishift_epi64_epi8(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_multishift_epi64_epi8(
+ a,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ let e = _mm512_set1_epi8(1 << 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi")]
+ unsafe fn test_mm512_maskz_multishift_epi64_epi8() {
+ let a = _mm512_set1_epi8(1);
+ let b = _mm512_set1_epi8(1);
+ let r = _mm512_maskz_multishift_epi64_epi8(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_multishift_epi64_epi8(
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
+ a,
+ b,
+ );
+ let e = _mm512_set1_epi8(1 << 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_multishift_epi64_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_multishift_epi64_epi8(a, b);
+ let e = _mm256_set1_epi8(1 << 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_mask_multishift_epi64_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_mask_multishift_epi64_epi8(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_multishift_epi64_epi8(a, 0b11111111_11111111_11111111_11111111, a, b);
+ let e = _mm256_set1_epi8(1 << 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm256_maskz_multishift_epi64_epi8() {
+ let a = _mm256_set1_epi8(1);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_maskz_multishift_epi64_epi8(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_multishift_epi64_epi8(0b11111111_11111111_11111111_11111111, a, b);
+ let e = _mm256_set1_epi8(1 << 7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_multishift_epi64_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_multishift_epi64_epi8(a, b);
+ let e = _mm_set1_epi8(1 << 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_mask_multishift_epi64_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_mask_multishift_epi64_epi8(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_multishift_epi64_epi8(a, 0b11111111_11111111, a, b);
+ let e = _mm_set1_epi8(1 << 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi,avx512vl")]
+ unsafe fn test_mm_maskz_multishift_epi64_epi8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_maskz_multishift_epi64_epi8(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_multishift_epi64_epi8(0b11111111_11111111, a, b);
+ let e = _mm_set1_epi8(1 << 7);
+ assert_eq_m128i(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs b/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs
new file mode 100644
index 000000000..1c81840ba
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -0,0 +1,4121 @@
+use crate::{
+ arch::asm,
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
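+// Implementation note: the masked expand loads below access a mask-dependent
+// number of elements, which is expressed with inline `asm!` (using the crate's
+// `vpl!` helper to append the memory operand) rather than an LLVM `link_name`
+// intrinsic. In the templates, `{{{k}}}` applies the mask register and the
+// trailing `{{z}}` selects zero-masking instead of merge-masking.
+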
+/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")]
+pub unsafe fn _mm512_mask_expandloadu_epi16(
+ src: __m512i,
+ k: __mmask32,
+ mem_addr: *const i16,
+) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vpexpandw {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")]
+pub unsafe fn _mm512_maskz_expandloadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vpexpandw {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
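+// Added illustrative sketch (not part of the original patch): an expand load reads
+// only popcount(k) contiguous elements from memory and scatters them into the
+// lanes whose mask bit is set; the remaining lanes keep `src` (writemask) or
+// become zero (zeromask). Roughly:
+//
+//     let mem: [i16; 2] = [10, 20];
+//     let src = _mm512_set1_epi16(-1);
+//     // k = 0b101: mem[0] -> lane 0, mem[1] -> lane 2, all other lanes keep -1.
+//     let r = _mm512_mask_expandloadu_epi16(src, 0b101, mem.as_ptr());
+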
+/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx")]
+pub unsafe fn _mm256_mask_expandloadu_epi16(
+ src: __m256i,
+ k: __mmask16,
+ mem_addr: *const i16,
+) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vpexpandw {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_expandloadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vpexpandw {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_expandloadu_epi16(
+ src: __m128i,
+ k: __mmask8,
+ mem_addr: *const i16,
+) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vpexpandw {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_epi16)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_expandloadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vpexpandw {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")]
+pub unsafe fn _mm512_mask_expandloadu_epi8(
+ src: __m512i,
+ k: __mmask64,
+ mem_addr: *const i8,
+) -> __m512i {
+ let mut dst: __m512i = src;
+ asm!(
+ vpl!("vpexpandb {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")]
+pub unsafe fn _mm512_maskz_expandloadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i {
+ let mut dst: __m512i;
+ asm!(
+ vpl!("vpexpandb {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(zmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2,avx512vl,avx")]
+pub unsafe fn _mm256_mask_expandloadu_epi8(
+ src: __m256i,
+ k: __mmask32,
+ mem_addr: *const i8,
+) -> __m256i {
+ let mut dst: __m256i = src;
+ asm!(
+ vpl!("vpexpandb {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2,avx512vl,avx")]
+pub unsafe fn _mm256_maskz_expandloadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i {
+ let mut dst: __m256i;
+ asm!(
+ vpl!("vpexpandb {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(ymm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")]
+pub unsafe fn _mm_mask_expandloadu_epi8(
+ src: __m128i,
+ k: __mmask16,
+ mem_addr: *const i8,
+) -> __m128i {
+ let mut dst: __m128i = src;
+ asm!(
+ vpl!("vpexpandb {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_epi8)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")]
+pub unsafe fn _mm_maskz_expandloadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i {
+ let mut dst: __m128i;
+ asm!(
+ vpl!("vpexpandb {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack)
+ );
+ dst
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi16)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm512_mask_compressstoreu_epi16(base_addr: *mut u8, k: __mmask32, a: __m512i) {
+ vcompressstorew(base_addr as *mut _, a.as_i16x32(), k)
+}
+
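+// Added illustrative sketch (not part of the original patch): a compress store is
+// the memory-side inverse of an expand load; the lanes whose mask bit is set are
+// written contiguously, so exactly popcount(k) elements reach memory. Roughly:
+//
+//     let a = _mm512_set1_epi16(7);
+//     let mut buf = [0i16; 32];
+//     // k = 0b1010 has two set bits, so exactly two elements are written:
+//     _mm512_mask_compressstoreu_epi16(buf.as_mut_ptr() as *mut u8, 0b1010, a);
+//     // buf[0] == 7, buf[1] == 7, buf[2..] is untouched.
+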
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi16)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm256_mask_compressstoreu_epi16(base_addr: *mut u8, k: __mmask16, a: __m256i) {
+ vcompressstorew256(base_addr as *mut _, a.as_i16x16(), k)
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi16)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm_mask_compressstoreu_epi16(base_addr: *mut u8, k: __mmask8, a: __m128i) {
+ vcompressstorew128(base_addr as *mut _, a.as_i16x8(), k)
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi8)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm512_mask_compressstoreu_epi8(base_addr: *mut u8, k: __mmask64, a: __m512i) {
+ vcompressstoreb(base_addr as *mut _, a.as_i8x64(), k)
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi8)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm256_mask_compressstoreu_epi8(base_addr: *mut u8, k: __mmask32, a: __m256i) {
+ vcompressstoreb256(base_addr as *mut _, a.as_i8x32(), k)
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi8)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm_mask_compressstoreu_epi8(base_addr: *mut u8, k: __mmask16, a: __m128i) {
+ vcompressstoreb128(base_addr as *mut _, a.as_i8x16(), k)
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi16&expand=1192)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+ transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi16&expand=1193)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i {
+ transmute(vpcompressw(
+ a.as_i16x32(),
+ _mm512_setzero_si512().as_i16x32(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi16&expand=1190)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+ transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi16&expand=1191)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i {
+ transmute(vpcompressw256(
+ a.as_i16x16(),
+ _mm256_setzero_si256().as_i16x16(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi16&expand=1188)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi16&expand=1189)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpcompressw128(
+ a.as_i16x8(),
+ _mm_setzero_si128().as_i16x8(),
+ k,
+ ))
+}
+
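+// Added illustrative sketch (not part of the original patch): the register form of
+// compress packs the active lanes toward lane 0, e.g. for the 128-bit variant:
+//
+//     let a = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); // lane i holds i
+//     let r = _mm_maskz_compress_epi16(0b0000_0110, a);
+//     // lane 0 == 1, lane 1 == 2, lanes 2..7 == 0
+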
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi8&expand=1210)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
+ transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi8&expand=1211)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i {
+ transmute(vpcompressb(
+ a.as_i8x64(),
+ _mm512_setzero_si512().as_i8x64(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi8&expand=1208)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
+ transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi8&expand=1209)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i {
+ transmute(vpcompressb256(
+ a.as_i8x32(),
+ _mm256_setzero_si256().as_i8x32(),
+ k,
+ ))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi8&expand=1206)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+ transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi8&expand=1207)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i {
+ transmute(vpcompressb128(
+ a.as_i8x16(),
+ _mm_setzero_si128().as_i8x16(),
+ k,
+ ))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi16&expand=2310)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+ transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi16&expand=2311)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i {
+ transmute(vpexpandw(
+ a.as_i16x32(),
+ _mm512_setzero_si512().as_i16x32(),
+ k,
+ ))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi16&expand=2308)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+ transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi16&expand=2309)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i {
+ transmute(vpexpandw256(
+ a.as_i16x16(),
+ _mm256_setzero_si256().as_i16x16(),
+ k,
+ ))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi16&expand=2306)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi16&expand=2307)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i {
+ transmute(vpexpandw128(
+ a.as_i16x8(),
+ _mm_setzero_si128().as_i16x8(),
+ k,
+ ))
+}
+
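+// A short sketch of the expand direction (illustrative values, same
+// assumptions as above): expand is the inverse of compress, scattering
+// the low lanes of `a` to the positions selected by `k`.
+//
+//     use core::arch::x86_64::*;
+//     let a = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
+//     let k: __mmask8 = 0b0000_1010; // fill lanes 1 and 3
+//     let r = _mm_maskz_expand_epi16(k, a);
+//     // r = [0, 10, 0, 11, 0, 0, 0, 0]
+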
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi8&expand=2328)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
+ transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi8&expand=2329)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i {
+ transmute(vpexpandb(
+ a.as_i8x64(),
+ _mm512_setzero_si512().as_i8x64(),
+ k,
+ ))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi8&expand=2326)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
+ transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi8&expand=2327)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i {
+ transmute(vpexpandb256(
+ a.as_i8x32(),
+ _mm256_setzero_si256().as_i8x32(),
+ k,
+ ))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi8&expand=2324)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+ transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi8&expand=2325)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i {
+ transmute(vpexpandb128(
+ a.as_i8x16(),
+ _mm_setzero_si128().as_i8x16(),
+ k,
+ ))
+}
+
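+// The `mask` (as opposed to `maskz`) variants pass `src` through instead
+// of zeroing; a hedged sketch with made-up values:
+//
+//     use core::arch::x86_64::*;
+//     let src = _mm_set1_epi8(99);
+//     let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+//     let k: __mmask16 = 0b0000_0000_0000_0101; // fill lanes 0 and 2
+//     let r = _mm_mask_expand_epi8(src, k, a);
+//     // r = [0, 99, 1, 99, 99, ..., 99]
+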
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi64&expand=5087)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi64&expand=5085)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
+ let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
+ transmute(simd_select_bitmask(k, shf, a.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi64&expand=5086)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi64&expand=5084)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi64&expand=5082)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
+ transmute(simd_select_bitmask(k, shf, a.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi64&expand=5083)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi64&expand=5081)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi64&expand=5079)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
+ transmute(simd_select_bitmask(k, shf, a.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi64&expand=5080)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
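+// Sketch of the 64-bit funnel shift left (illustrative values): each lane
+// computes the upper half of the 128-bit value `a:b` shifted left by the
+// per-lane count in `c`, i.e. `(a << c) | (b >> (64 - c))` for 0 < c < 64.
+//
+//     use core::arch::x86_64::*;
+//     let a = _mm_set1_epi64x(0x0123456789ABCDEFu64 as i64);
+//     let b = _mm_set1_epi64x(0xFEDCBA9876543210u64 as i64);
+//     let c = _mm_set1_epi64x(8);
+//     let r = _mm_shldv_epi64(a, b, c);
+//     // every lane of r = 0x23456789ABCDEFFE
+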
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi32&expand=5078)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi32&expand=5076)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i {
+ let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
+ transmute(simd_select_bitmask(k, shf, a.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi32&expand=5077)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm512_maskz_shldv_epi32(
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+) -> __m512i {
+ let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi32&expand=5075)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi32&expand=5073)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
+ transmute(simd_select_bitmask(k, shf, a.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi32&expand=5074)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi32&expand=5072)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi32&expand=5070)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
+ transmute(simd_select_bitmask(k, shf, a.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi32&expand=5071)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
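+// The 32-bit variant behaves the same way on 64-bit intermediates; a
+// minimal sketch:
+//
+//     use core::arch::x86_64::*;
+//     let a = _mm_set1_epi32(0x12345678);
+//     let b = _mm_set1_epi32(0x9ABCDEF0u32 as i32);
+//     let c = _mm_set1_epi32(4);
+//     let r = _mm_shldv_epi32(a, b, c);
+//     // every lane of r = 0x23456789
+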
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi16&expand=5069)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi16&expand=5067)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i {
+ let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
+ transmute(simd_select_bitmask(k, shf, a.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi16&expand=5068)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm512_maskz_shldv_epi16(
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+) -> __m512i {
+ let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi16&expand=5066)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi16&expand=5064)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
+ transmute(simd_select_bitmask(k, shf, a.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi16&expand=5065)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm256_maskz_shldv_epi16(
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+ c: __m256i,
+) -> __m256i {
+ let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi16&expand=5063)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi16&expand=5061)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
+ transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi16&expand=5062)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
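+// And the 16-bit variant on 32-bit intermediates:
+//
+//     use core::arch::x86_64::*;
+//     let a = _mm_set1_epi16(0x1234);
+//     let b = _mm_set1_epi16(0x5678);
+//     let c = _mm_set1_epi16(4);
+//     let r = _mm_shldv_epi16(a, b, c);
+//     // every lane of r = 0x2345 (upper 16 bits of 0x12345678 << 4)
+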
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi64&expand=5141)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ transmute(vpshrdvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi64&expand=5139)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
+ let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
+ transmute(simd_select_bitmask(k, shf, a.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi64&expand=5140)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi64&expand=5138)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ transmute(vpshrdvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi64&expand=5136)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
+ transmute(simd_select_bitmask(k, shf, a.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi64&expand=5137)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi64&expand=5135)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ transmute(vpshrdvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi64&expand=5133)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
+ transmute(simd_select_bitmask(k, shf, a.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi64&expand=5134)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
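+// Funnel shift right mirrors the left form: the intermediate is `b:a`
+// (note the swapped order) and the *lower* 64 bits are kept, i.e.
+// `(a >> c) | (b << (64 - c))` for 0 < c < 64. Illustrative sketch:
+//
+//     use core::arch::x86_64::*;
+//     let a = _mm_set1_epi64x(0x0123456789ABCDEFu64 as i64);
+//     let b = _mm_set1_epi64x(0xFEDCBA9876543210u64 as i64);
+//     let c = _mm_set1_epi64x(8);
+//     let r = _mm_shrdv_epi64(a, b, c);
+//     // every lane of r = 0x100123456789ABCD
+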
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi32&expand=5132)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ transmute(vpshrdvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi32&expand=5130)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i {
+ let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
+ transmute(simd_select_bitmask(k, shf, a.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi32&expand=5131)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm512_maskz_shrdv_epi32(
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+) -> __m512i {
+ let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi32&expand=5129)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ transmute(vpshrdvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi32&expand=5127)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
+ transmute(simd_select_bitmask(k, shf, a.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi32&expand=5128)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi32&expand=5126)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ transmute(vpshrdvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi32&expand=5124)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
+ transmute(simd_select_bitmask(k, shf, a.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi32&expand=5125)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
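+// 32-bit right funnel, using the same illustrative values as the left
+// example above:
+//
+//     use core::arch::x86_64::*;
+//     let a = _mm_set1_epi32(0x12345678);
+//     let b = _mm_set1_epi32(0x9ABCDEF0u32 as i32);
+//     let c = _mm_set1_epi32(8);
+//     let r = _mm_shrdv_epi32(a, b, c);
+//     // every lane of r = 0xF0123456
+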
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi16&expand=5123)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+ transmute(vpshrdvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi16&expand=5121)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i {
+ let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
+ transmute(simd_select_bitmask(k, shf, a.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi16&expand=5122)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm512_maskz_shrdv_epi16(
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+) -> __m512i {
+ let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi16&expand=5120)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ transmute(vpshrdvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi16&expand=5118)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i {
+ let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
+ transmute(simd_select_bitmask(k, shf, a.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi16&expand=5119)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm256_maskz_shrdv_epi16(
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+ c: __m256i,
+) -> __m256i {
+ let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi16&expand=5117)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ transmute(vpshrdvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi16&expand=5115)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
+ transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi16&expand=5116)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
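+// And the 16-bit right funnel:
+//
+//     use core::arch::x86_64::*;
+//     let a = _mm_set1_epi16(0x1234);
+//     let b = _mm_set1_epi16(0x5678);
+//     let c = _mm_set1_epi16(4);
+//     let r = _mm_shrdv_epi16(a, b, c);
+//     // every lane of r = 0x8123 (lower 16 bits of 0x56781234 >> 4)
+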
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi64&expand=5060)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shldi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ transmute(vpshldvq(
+ a.as_i64x8(),
+ b.as_i64x8(),
+ _mm512_set1_epi64(imm8).as_i64x8(),
+ ))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi64&expand=5058)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shldi_epi64<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x8 = vpshldvq(
+ a.as_i64x8(),
+ b.as_i64x8(),
+ _mm512_set1_epi64(imm8).as_i64x8(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi64&expand=5059)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shldi_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x8 = vpshldvq(
+ a.as_i64x8(),
+ b.as_i64x8(),
+ _mm512_set1_epi64(imm8).as_i64x8(),
+ );
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi64&expand=5057)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shldi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ transmute(vpshldvq256(
+ a.as_i64x4(),
+ b.as_i64x4(),
+ _mm256_set1_epi64x(imm8).as_i64x4(),
+ ))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi64&expand=5055)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shldi_epi64<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x4 = vpshldvq256(
+ a.as_i64x4(),
+ b.as_i64x4(),
+ _mm256_set1_epi64x(imm8).as_i64x4(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi64&expand=5056)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shldi_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x4 = vpshldvq256(
+ a.as_i64x4(),
+ b.as_i64x4(),
+ _mm256_set1_epi64x(imm8).as_i64x4(),
+ );
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi64&expand=5054)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_shldi_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ transmute(vpshldvq128(
+ a.as_i64x2(),
+ b.as_i64x2(),
+ _mm_set1_epi64x(imm8).as_i64x2(),
+ ))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi64&expand=5052)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_shldi_epi64<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x2 = vpshldvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2());
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi64&expand=5053)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_shldi_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x2 = vpshldvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2());
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
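+// The `shldi` forms take the shift count as a const generic instead of a
+// vector; a hedged sketch equivalent to the `shldv` example above:
+//
+//     use core::arch::x86_64::*;
+//     let a = _mm_set1_epi64x(0x0123456789ABCDEFu64 as i64);
+//     let b = _mm_set1_epi64x(0xFEDCBA9876543210u64 as i64);
+//     let r = _mm_shldi_epi64::<8>(a, b);
+//     // every lane of r = 0x23456789ABCDEFFE
+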
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi32&expand=5051)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shldi_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ transmute(vpshldvd(
+ a.as_i32x16(),
+ b.as_i32x16(),
+ _mm512_set1_epi32(IMM8).as_i32x16(),
+ ))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi32&expand=5049)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shldi_epi32<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x16 = vpshldvd(
+ a.as_i32x16(),
+ b.as_i32x16(),
+ _mm512_set1_epi32(IMM8).as_i32x16(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi32&expand=5050)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shldi_epi32<const IMM8: i32>(
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x16 = vpshldvd(
+ a.as_i32x16(),
+ b.as_i32x16(),
+ _mm512_set1_epi32(IMM8).as_i32x16(),
+ );
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi32&expand=5048)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shldi_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(vpshldvd256(
+ a.as_i32x8(),
+ b.as_i32x8(),
+ _mm256_set1_epi32(IMM8).as_i32x8(),
+ ))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi32&expand=5046)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shldi_epi32<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x8 = vpshldvd256(
+ a.as_i32x8(),
+ b.as_i32x8(),
+ _mm256_set1_epi32(IMM8).as_i32x8(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi32&expand=5047)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shldi_epi32<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x8 = vpshldvd256(
+ a.as_i32x8(),
+ b.as_i32x8(),
+ _mm256_set1_epi32(IMM8).as_i32x8(),
+ );
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi32&expand=5045)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_shldi_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(vpshldvd128(
+ a.as_i32x4(),
+ b.as_i32x4(),
+ _mm_set1_epi32(IMM8).as_i32x4(),
+ ))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi32&expand=5043)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_shldi_epi32<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4());
+ transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi32&expand=5044)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_shldi_epi32<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4());
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi16&expand=5042)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shldi_epi16<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ transmute(vpshldvw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_set1_epi16(imm8).as_i16x32(),
+ ))
+}
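+
+// Illustrative sketch (hypothetical helper): the 16-bit lanes follow the same
+// llvm.fshl model with the count reduced modulo 16; IMM8 is truncated to i16
+// in the intrinsics above only so it can be broadcast with _mm512_set1_epi16.
+#[allow(dead_code)]
+fn shldi16_lane_model(a: u16, b: u16, imm8: u32) -> u16 {
+ let s = imm8 % 16;
+ let concat = ((a as u32) << 16) | (b as u32);
+ ((concat << s) >> 16) as u16
+}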
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi16&expand=5040)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shldi_epi16<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x32 = vpshldvw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_set1_epi16(imm8).as_i16x32(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi16&expand=5041)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shldi_epi16<const IMM8: i32>(
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x32 = vpshldvw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_set1_epi16(imm8).as_i16x32(),
+ );
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi16&expand=5039)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shldi_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ transmute(vpshldvw256(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ _mm256_set1_epi16(imm8).as_i16x16(),
+ ))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi16&expand=5037)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shldi_epi16<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x16 = vpshldvw256(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ _mm256_set1_epi16(imm8).as_i16x16(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi16&expand=5038)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shldi_epi16<const IMM8: i32>(
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x16 = vpshldvw256(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ _mm256_set1_epi16(imm8).as_i16x16(),
+ );
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi16&expand=5036)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_shldi_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ transmute(vpshldvw128(
+ a.as_i16x8(),
+ b.as_i16x8(),
+ _mm_set1_epi16(imm8).as_i16x8(),
+ ))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi16&expand=5034)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_shldi_epi16<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x8 = vpshldvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8());
+ transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi16&expand=5035)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_shldi_epi16<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x8 = vpshldvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8());
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi64&expand=5114)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shrdi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ transmute(vpshrdvq(
+ a.as_i64x8(),
+ b.as_i64x8(),
+ _mm512_set1_epi64(imm8).as_i64x8(),
+ ))
+}
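+
+// Illustrative sketch (hypothetical helper): a scalar model of the llvm.fshr
+// lowering used by the shrdi_epi64 family: the lower 64 bits of the 128-bit
+// value formed with the first operand in the upper half, shifted right with
+// the count reduced modulo 64, computed here without 128-bit arithmetic.
+#[allow(dead_code)]
+fn shrdi64_lane_model(a: u64, b: u64, imm8: u32) -> u64 {
+ let s = imm8 % 64;
+ if s == 0 { b } else { (b >> s) | (a << (64 - s)) }
+}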
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi64&expand=5112)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shrdi_epi64<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x8 = vpshrdvq(
+ a.as_i64x8(),
+ b.as_i64x8(),
+ _mm512_set1_epi64(imm8).as_i64x8(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi64&expand=5113)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 255))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shrdi_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x8 = vpshrdvq(
+ a.as_i64x8(),
+ b.as_i64x8(),
+ _mm512_set1_epi64(imm8).as_i64x8(),
+ );
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi64&expand=5111)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shrdi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ transmute(vpshrdvq256(
+ a.as_i64x4(),
+ b.as_i64x4(),
+ _mm256_set1_epi64x(imm8).as_i64x4(),
+ ))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi64&expand=5109)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shrdi_epi64<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x4 = vpshrdvq256(
+ a.as_i64x4(),
+ b.as_i64x4(),
+ _mm256_set1_epi64x(imm8).as_i64x4(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi64&expand=5110)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shrdi_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x4 = vpshrdvq256(
+ a.as_i64x4(),
+ b.as_i64x4(),
+ _mm256_set1_epi64x(imm8).as_i64x4(),
+ );
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi64&expand=5108)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_shrdi_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ transmute(vpshrdvq128(
+ a.as_i64x2(),
+ b.as_i64x2(),
+ _mm_set1_epi64x(imm8).as_i64x2(),
+ ))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi64&expand=5106)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_shrdi_epi64<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x2 = vpshrdvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2());
+ transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi64&expand=5107)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_shrdi_epi64<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i64;
+ let shf: i64x2 = vpshrdvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2());
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi32&expand=5105)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shrdi_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ transmute(vpshrdvd(
+ a.as_i32x16(),
+ b.as_i32x16(),
+ _mm512_set1_epi32(IMM8).as_i32x16(),
+ ))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi32&expand=5103)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shrdi_epi32<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x16 = vpshrdvd(
+ a.as_i32x16(),
+ b.as_i32x16(),
+ _mm512_set1_epi32(IMM8).as_i32x16(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi32&expand=5104)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shrdi_epi32<const IMM8: i32>(
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x16 = vpshrdvd(
+ a.as_i32x16(),
+ b.as_i32x16(),
+ _mm512_set1_epi32(IMM8).as_i32x16(),
+ );
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi32&expand=5102)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shrdi_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(vpshrdvd256(
+ a.as_i32x8(),
+ b.as_i32x8(),
+ _mm256_set1_epi32(IMM8).as_i32x8(),
+ ))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi32&expand=5100)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shrdi_epi32<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x8 = vpshrdvd256(
+ a.as_i32x8(),
+ b.as_i32x8(),
+ _mm256_set1_epi32(IMM8).as_i32x8(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi32&expand=5101)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shrdi_epi32<const IMM8: i32>(
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x8 = vpshrdvd256(
+ a.as_i32x8(),
+ b.as_i32x8(),
+ _mm256_set1_epi32(IMM8).as_i32x8(),
+ );
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi32&expand=5099)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_shrdi_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(vpshrdvd128(
+ a.as_i32x4(),
+ b.as_i32x4(),
+ _mm_set1_epi32(IMM8).as_i32x4(),
+ ))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi32&expand=5097)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_shrdi_epi32<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4());
+ transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi32&expand=5098)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshrdd
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_shrdi_epi32<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4());
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
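+
+// Illustrative sketch (hypothetical usage, names and constants invented here):
+// under the llvm.fshr lowering above, _mm_shrdi_epi32::<S>(hi, lo) yields per
+// lane the 32-bit window starting at bit S of the 64-bit value (hi:lo), which
+// can serve for unaligned bit extraction.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+unsafe fn shrdi_window_example() -> __m128i {
+ let hi = _mm_set1_epi32(0x0000_0001);
+ let lo = _mm_set1_epi32(0x8000_0000u32 as i32);
+ // per lane: (lo >> 4) | (hi << 28) = 0x1800_0000
+ _mm_shrdi_epi32::<4>(hi, lo)
+}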
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi16&expand=5096)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_shrdi_epi16<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ transmute(vpshrdvw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_set1_epi16(imm8).as_i16x32(),
+ ))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi16&expand=5094)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm512_mask_shrdi_epi16<const IMM8: i32>(
+ src: __m512i,
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x32 = vpshrdvw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_set1_epi16(imm8).as_i16x32(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi16&expand=5095)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm512_maskz_shrdi_epi16<const IMM8: i32>(
+ k: __mmask32,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x32 = vpshrdvw(
+ a.as_i16x32(),
+ b.as_i16x32(),
+ _mm512_set1_epi16(imm8).as_i16x32(),
+ );
+ let zero = _mm512_setzero_si512().as_i16x32();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi16&expand=5093)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_shrdi_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ transmute(vpshrdvw256(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ _mm256_set1_epi16(imm8).as_i16x16(),
+ ))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi16&expand=5091)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm256_mask_shrdi_epi16<const IMM8: i32>(
+ src: __m256i,
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x16 = vpshrdvw256(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ _mm256_set1_epi16(imm8).as_i16x16(),
+ );
+ transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi16&expand=5092)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_maskz_shrdi_epi16<const IMM8: i32>(
+ k: __mmask16,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x16 = vpshrdvw256(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ _mm256_set1_epi16(imm8).as_i16x16(),
+ );
+ let zero = _mm256_setzero_si256().as_i16x16();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi16&expand=5090)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_shrdi_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ transmute(vpshrdvw128(
+ a.as_i16x8(),
+ b.as_i16x8(),
+ _mm_set1_epi16(imm8).as_i16x8(),
+ ))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi16&expand=5088)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_shrdi_epi16<const IMM8: i32>(
+ src: __m128i,
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x8 = vpshrdvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8());
+ transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi16&expand=5089)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_shrdi_epi16<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let imm8 = IMM8 as i16;
+ let shf: i16x8 = vpshrdvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8());
+ let zero = _mm_setzero_si128().as_i16x8();
+ transmute(simd_select_bitmask(k, shf, zero))
+}
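+
+// Illustrative sketch (hypothetical usage): a funnel shift with both inputs
+// equal degenerates to a rotate, so shldi with a == b rotates every lane left
+// by IMM8 modulo the lane width.
+#[allow(dead_code)]
+#[target_feature(enable = "avx512vbmi2")]
+unsafe fn rotl32_lanes_example(a: __m512i) -> __m512i {
+ // each 32-bit lane: a.rotate_left(7)
+ _mm512_shldi_epi32::<7>(a, a)
+}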
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx512.mask.compress.store.w.512"]
+ fn vcompressstorew(mem: *mut i8, data: i16x32, mask: u32);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.w.256"]
+ fn vcompressstorew256(mem: *mut i8, data: i16x16, mask: u16);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.w.128"]
+ fn vcompressstorew128(mem: *mut i8, data: i16x8, mask: u8);
+
+ #[link_name = "llvm.x86.avx512.mask.compress.store.b.512"]
+ fn vcompressstoreb(mem: *mut i8, data: i8x64, mask: u64);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.b.256"]
+ fn vcompressstoreb256(mem: *mut i8, data: i8x32, mask: u32);
+ #[link_name = "llvm.x86.avx512.mask.compress.store.b.128"]
+ fn vcompressstoreb128(mem: *mut i8, data: i8x16, mask: u16);
+
+ #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
+ fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.mask.compress.w.256"]
+ fn vpcompressw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.mask.compress.w.128"]
+ fn vpcompressw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.compress.b.512"]
+ fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+ #[link_name = "llvm.x86.avx512.mask.compress.b.256"]
+ fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
+ #[link_name = "llvm.x86.avx512.mask.compress.b.128"]
+ fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+ #[link_name = "llvm.x86.avx512.mask.expand.w.512"]
+ fn vpexpandw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+ #[link_name = "llvm.x86.avx512.mask.expand.w.256"]
+ fn vpexpandw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
+ #[link_name = "llvm.x86.avx512.mask.expand.w.128"]
+ fn vpexpandw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+ #[link_name = "llvm.x86.avx512.mask.expand.b.512"]
+ fn vpexpandb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+ #[link_name = "llvm.x86.avx512.mask.expand.b.256"]
+ fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
+ #[link_name = "llvm.x86.avx512.mask.expand.b.128"]
+ fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+ #[link_name = "llvm.fshl.v8i64"]
+ fn vpshldvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8;
+ #[link_name = "llvm.fshl.v4i64"]
+ fn vpshldvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4;
+ #[link_name = "llvm.fshl.v2i64"]
+ fn vpshldvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2;
+ #[link_name = "llvm.fshl.v16i32"]
+ fn vpshldvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16;
+ #[link_name = "llvm.fshl.v8i32"]
+ fn vpshldvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8;
+ #[link_name = "llvm.fshl.v4i32"]
+ fn vpshldvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+ #[link_name = "llvm.fshl.v32i16"]
+ fn vpshldvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32;
+ #[link_name = "llvm.fshl.v16i16"]
+ fn vpshldvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
+ #[link_name = "llvm.fshl.v8i16"]
+ fn vpshldvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8;
+
+ #[link_name = "llvm.fshr.v8i64"]
+ fn vpshrdvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8;
+ #[link_name = "llvm.fshr.v4i64"]
+ fn vpshrdvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4;
+ #[link_name = "llvm.fshr.v2i64"]
+ fn vpshrdvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2;
+ #[link_name = "llvm.fshr.v16i32"]
+ fn vpshrdvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16;
+ #[link_name = "llvm.fshr.v8i32"]
+ fn vpshrdvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8;
+ #[link_name = "llvm.fshr.v4i32"]
+ fn vpshrdvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+ #[link_name = "llvm.fshr.v32i16"]
+ fn vpshrdvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32;
+ #[link_name = "llvm.fshr.v16i16"]
+ fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
+ #[link_name = "llvm.fshr.v8i16"]
+ fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8;
+}
+
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+ use crate::hint::black_box;
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_compress_epi16() {
+ let src = _mm512_set1_epi16(200);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm512_mask_compress_epi16(src, 0b01010101_01010101_01010101_01010101, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_compress_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm512_maskz_compress_epi16(0b01010101_01010101_01010101_01010101, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_compress_epi16() {
+ let src = _mm256_set1_epi16(200);
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_mask_compress_epi16(src, 0b01010101_01010101, a);
+ let e = _mm256_set_epi16(
+ 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_compress_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_compress_epi16(0b01010101_01010101, a);
+ let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_compress_epi16() {
+ let src = _mm_set1_epi16(200);
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_mask_compress_epi16(src, 0b01010101, a);
+ let e = _mm_set_epi16(200, 200, 200, 200, 1, 3, 5, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_compress_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_maskz_compress_epi16(0b01010101, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 1, 3, 5, 7);
+ assert_eq_m128i(r, e);
+ }
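+
+ // Illustrative sketch (hypothetical helper): a scalar model of the compress
+ // behavior the surrounding tests exercise. Elements of a whose mask bit is
+ // set are packed contiguously toward lane 0 (the low end); the remaining
+ // positions keep src, and an all-zero src models the maskz form.
+ #[allow(dead_code)]
+ fn compress16_model(a: [i16; 8], k: u8, src: [i16; 8]) -> [i16; 8] {
+ let mut out = src;
+ let mut pos = 0;
+ for i in 0..8 {
+ if (k >> i) & 1 == 1 {
+ out[pos] = a[i];
+ pos += 1;
+ }
+ }
+ out
+ }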
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_compress_epi8() {
+ let src = _mm512_set1_epi8(100);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let r = _mm512_mask_compress_epi8(
+ src,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_compress_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let r = _mm512_maskz_compress_epi8(
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_compress_epi8() {
+ let src = _mm256_set1_epi8(100);
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm256_mask_compress_epi8(src, 0b01010101_01010101_01010101_01010101, a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_compress_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm256_maskz_compress_epi8(0b01010101_01010101_01010101_01010101, a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_compress_epi8() {
+ let src = _mm_set1_epi8(100);
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_mask_compress_epi8(src, 0b01010101_01010101, a);
+ let e = _mm_set_epi8(
+ 100, 100, 100, 100, 100, 100, 100, 100, 1, 3, 5, 7, 9, 11, 13, 15,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_compress_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_compress_epi8(0b01010101_01010101, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_expand_epi16() {
+ let src = _mm512_set1_epi16(200);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm512_mask_expand_epi16(src, 0b01010101_01010101_01010101_01010101, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(
+ 200, 16, 200, 17, 200, 18, 200, 19, 200, 20, 200, 21, 200, 22, 200, 23,
+ 200, 24, 200, 25, 200, 26, 200, 27, 200, 28, 200, 29, 200, 30, 200, 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_expand_epi16() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm512_maskz_expand_epi16(0b01010101_01010101_01010101_01010101, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi16(0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23,
+ 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_expand_epi16() {
+ let src = _mm256_set1_epi16(200);
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_mask_expand_epi16(src, 0b01010101_01010101, a);
+ let e = _mm256_set_epi16(
+ 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_expand_epi16() {
+ let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_expand_epi16(0b01010101_01010101, a);
+ let e = _mm256_set_epi16(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_expand_epi16() {
+ let src = _mm_set1_epi16(200);
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_mask_expand_epi16(src, 0b01010101, a);
+ let e = _mm_set_epi16(200, 4, 200, 5, 200, 6, 200, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_expand_epi16() {
+ let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_maskz_expand_epi16(0b01010101, a);
+ let e = _mm_set_epi16(0, 4, 0, 5, 0, 6, 0, 7);
+ assert_eq_m128i(r, e);
+ }
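+
+ // Illustrative sketch (hypothetical helper): a scalar model of expand, the
+ // inverse scattering of compress: consecutive low elements of a are placed
+ // at the positions where k has a set bit, and every other position keeps
+ // src (or zero in the maskz form).
+ #[allow(dead_code)]
+ fn expand16_model(a: [i16; 8], k: u8, src: [i16; 8]) -> [i16; 8] {
+ let mut out = src;
+ let mut pos = 0; // next element of a to scatter
+ for i in 0..8 {
+ if (k >> i) & 1 == 1 {
+ out[i] = a[pos];
+ pos += 1;
+ }
+ }
+ out
+ }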
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_expand_epi8() {
+ let src = _mm512_set1_epi8(100);
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let r = _mm512_mask_expand_epi8(
+ src,
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 100, 32, 100, 33, 100, 34, 100, 35, 100, 36, 100, 37, 100, 38, 100, 39,
+ 100, 40, 100, 41, 100, 42, 100, 43, 100, 44, 100, 45, 100, 46, 100, 47,
+ 100, 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 53, 100, 54, 100, 55,
+ 100, 56, 100, 57, 100, 58, 100, 59, 100, 60, 100, 61, 100, 62, 100, 63,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_expand_epi8() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+ let r = _mm512_maskz_expand_epi8(
+ 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
+ a,
+ );
+ #[rustfmt::skip]
+ let e = _mm512_set_epi8(
+ 0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39,
+ 0, 40, 0, 41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47,
+ 0, 48, 0, 49, 0, 50, 0, 51, 0, 52, 0, 53, 0, 54, 0, 55,
+ 0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0, 61, 0, 62, 0, 63,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_expand_epi8() {
+ let src = _mm256_set1_epi8(100);
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm256_mask_expand_epi8(src, 0b01010101_01010101_01010101_01010101, a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 100, 16, 100, 17, 100, 18, 100, 19, 100, 20, 100, 21, 100, 22, 100, 23,
+ 100, 24, 100, 25, 100, 26, 100, 27, 100, 28, 100, 29, 100, 30, 100, 31,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_expand_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+ let r = _mm256_maskz_expand_epi8(0b01010101_01010101_01010101_01010101, a);
+ #[rustfmt::skip]
+ let e = _mm256_set_epi8(
+ 0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23,
+ 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_expand_epi8() {
+ let src = _mm_set1_epi8(100);
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_mask_expand_epi8(src, 0b01010101_01010101, a);
+ let e = _mm_set_epi8(
+ 100, 8, 100, 9, 100, 10, 100, 11, 100, 12, 100, 13, 100, 14, 100, 15,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_expand_epi8() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_expand_epi8(0b01010101_01010101, a);
+ let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shldv_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let b = _mm512_set1_epi64(1 << 63);
+ let c = _mm512_set1_epi64(2);
+ let r = _mm512_shldv_epi64(a, b, c);
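+ // fshl per lane: (1 << 2) | ((1 << 63) >> 62) = 4 | 2 = 6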
+ let e = _mm512_set1_epi64(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shldv_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let b = _mm512_set1_epi64(1 << 63);
+ let c = _mm512_set1_epi64(2);
+ let r = _mm512_mask_shldv_epi64(a, 0, b, c);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shldv_epi64(a, 0b11111111, b, c);
+ let e = _mm512_set1_epi64(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shldv_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let b = _mm512_set1_epi64(1 << 63);
+ let c = _mm512_set1_epi64(2);
+ let r = _mm512_maskz_shldv_epi64(0, a, b, c);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shldv_epi64(0b11111111, a, b, c);
+ let e = _mm512_set1_epi64(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shldv_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let b = _mm256_set1_epi64x(1 << 63);
+ let c = _mm256_set1_epi64x(2);
+ let r = _mm256_shldv_epi64(a, b, c);
+ let e = _mm256_set1_epi64x(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shldv_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let b = _mm256_set1_epi64x(1 << 63);
+ let c = _mm256_set1_epi64x(2);
+ let r = _mm256_mask_shldv_epi64(a, 0, b, c);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shldv_epi64(a, 0b00001111, b, c);
+ let e = _mm256_set1_epi64x(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shldv_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let b = _mm256_set1_epi64x(1 << 63);
+ let c = _mm256_set1_epi64x(2);
+ let r = _mm256_maskz_shldv_epi64(0, a, b, c);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shldv_epi64(0b00001111, a, b, c);
+ let e = _mm256_set1_epi64x(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shldv_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let b = _mm_set1_epi64x(1 << 63);
+ let c = _mm_set1_epi64x(2);
+ let r = _mm_shldv_epi64(a, b, c);
+ let e = _mm_set1_epi64x(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shldv_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let b = _mm_set1_epi64x(1 << 63);
+ let c = _mm_set1_epi64x(2);
+ let r = _mm_mask_shldv_epi64(a, 0, b, c);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shldv_epi64(a, 0b00000011, b, c);
+ let e = _mm_set1_epi64x(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shldv_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let b = _mm_set1_epi64x(1 << 63);
+ let c = _mm_set1_epi64x(2);
+ let r = _mm_maskz_shldv_epi64(0, a, b, c);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shldv_epi64(0b00000011, a, b, c);
+ let e = _mm_set1_epi64x(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shldv_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(1 << 31);
+ let c = _mm512_set1_epi32(2);
+ let r = _mm512_shldv_epi32(a, b, c);
+ let e = _mm512_set1_epi32(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shldv_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(1 << 31);
+ let c = _mm512_set1_epi32(2);
+ let r = _mm512_mask_shldv_epi32(a, 0, b, c);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shldv_epi32(a, 0b11111111_11111111, b, c);
+ let e = _mm512_set1_epi32(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shldv_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(1 << 31);
+ let c = _mm512_set1_epi32(2);
+ let r = _mm512_maskz_shldv_epi32(0, a, b, c);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shldv_epi32(0b11111111_11111111, a, b, c);
+ let e = _mm512_set1_epi32(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shldv_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set1_epi32(1 << 31);
+ let c = _mm256_set1_epi32(2);
+ let r = _mm256_shldv_epi32(a, b, c);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shldv_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set1_epi32(1 << 31);
+ let c = _mm256_set1_epi32(2);
+ let r = _mm256_mask_shldv_epi32(a, 0, b, c);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shldv_epi32(a, 0b11111111, b, c);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shldv_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set1_epi32(1 << 31);
+ let c = _mm256_set1_epi32(2);
+ let r = _mm256_maskz_shldv_epi32(0, a, b, c);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shldv_epi32(0b11111111, a, b, c);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shldv_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set1_epi32(1 << 31);
+ let c = _mm_set1_epi32(2);
+ let r = _mm_shldv_epi32(a, b, c);
+ let e = _mm_set1_epi32(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shldv_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set1_epi32(1 << 31);
+ let c = _mm_set1_epi32(2);
+ let r = _mm_mask_shldv_epi32(a, 0, b, c);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shldv_epi32(a, 0b00001111, b, c);
+ let e = _mm_set1_epi32(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shldv_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set1_epi32(1 << 31);
+ let c = _mm_set1_epi32(2);
+ let r = _mm_maskz_shldv_epi32(0, a, b, c);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shldv_epi32(0b00001111, a, b, c);
+ let e = _mm_set1_epi32(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shldv_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1 << 15);
+ let c = _mm512_set1_epi16(2);
+ let r = _mm512_shldv_epi16(a, b, c);
+ let e = _mm512_set1_epi16(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shldv_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1 << 15);
+ let c = _mm512_set1_epi16(2);
+ let r = _mm512_mask_shldv_epi16(a, 0, b, c);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shldv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c);
+ let e = _mm512_set1_epi16(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shldv_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1 << 15);
+ let c = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_shldv_epi16(0, a, b, c);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shldv_epi16(0b11111111_11111111_11111111_11111111, a, b, c);
+ let e = _mm512_set1_epi16(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shldv_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1 << 15);
+ let c = _mm256_set1_epi16(2);
+ let r = _mm256_shldv_epi16(a, b, c);
+ let e = _mm256_set1_epi16(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shldv_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1 << 15);
+ let c = _mm256_set1_epi16(2);
+ let r = _mm256_mask_shldv_epi16(a, 0, b, c);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shldv_epi16(a, 0b11111111_11111111, b, c);
+ let e = _mm256_set1_epi16(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shldv_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1 << 15);
+ let c = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_shldv_epi16(0, a, b, c);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shldv_epi16(0b11111111_11111111, a, b, c);
+ let e = _mm256_set1_epi16(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shldv_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1 << 15);
+ let c = _mm_set1_epi16(2);
+ let r = _mm_shldv_epi16(a, b, c);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shldv_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1 << 15);
+ let c = _mm_set1_epi16(2);
+ let r = _mm_mask_shldv_epi16(a, 0, b, c);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shldv_epi16(a, 0b11111111, b, c);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shldv_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1 << 15);
+ let c = _mm_set1_epi16(2);
+ let r = _mm_maskz_shldv_epi16(0, a, b, c);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shldv_epi16(0b11111111, a, b, c);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
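+    // In the shrdv tests below, the double-width right shift by c = 1 leaves
+    // (2 >> 1) = 1 in every lane (the bits contributed by the other operand,
+    // 8 << (W - 1), land above the W retained bits), hence e = 1 throughout.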
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shrdv_epi64() {
+ let a = _mm512_set1_epi64(8);
+ let b = _mm512_set1_epi64(2);
+ let c = _mm512_set1_epi64(1);
+ let r = _mm512_shrdv_epi64(a, b, c);
+ let e = _mm512_set1_epi64(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shrdv_epi64() {
+ let a = _mm512_set1_epi64(8);
+ let b = _mm512_set1_epi64(2);
+ let c = _mm512_set1_epi64(1);
+ let r = _mm512_mask_shrdv_epi64(a, 0, b, c);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c);
+ let e = _mm512_set1_epi64(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shrdv_epi64() {
+ let a = _mm512_set1_epi64(8);
+ let b = _mm512_set1_epi64(2);
+ let c = _mm512_set1_epi64(1);
+ let r = _mm512_maskz_shrdv_epi64(0, a, b, c);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c);
+ let e = _mm512_set1_epi64(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shrdv_epi64() {
+ let a = _mm256_set1_epi64x(8);
+ let b = _mm256_set1_epi64x(2);
+ let c = _mm256_set1_epi64x(1);
+ let r = _mm256_shrdv_epi64(a, b, c);
+ let e = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shrdv_epi64() {
+ let a = _mm256_set1_epi64x(8);
+ let b = _mm256_set1_epi64x(2);
+ let c = _mm256_set1_epi64x(1);
+ let r = _mm256_mask_shrdv_epi64(a, 0, b, c);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c);
+ let e = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shrdv_epi64() {
+ let a = _mm256_set1_epi64x(8);
+ let b = _mm256_set1_epi64x(2);
+ let c = _mm256_set1_epi64x(1);
+ let r = _mm256_maskz_shrdv_epi64(0, a, b, c);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c);
+ let e = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shrdv_epi64() {
+ let a = _mm_set1_epi64x(8);
+ let b = _mm_set1_epi64x(2);
+ let c = _mm_set1_epi64x(1);
+ let r = _mm_shrdv_epi64(a, b, c);
+ let e = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shrdv_epi64() {
+ let a = _mm_set1_epi64x(8);
+ let b = _mm_set1_epi64x(2);
+ let c = _mm_set1_epi64x(1);
+ let r = _mm_mask_shrdv_epi64(a, 0, b, c);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c);
+ let e = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shrdv_epi64() {
+ let a = _mm_set1_epi64x(8);
+ let b = _mm_set1_epi64x(2);
+ let c = _mm_set1_epi64x(1);
+ let r = _mm_maskz_shrdv_epi64(0, a, b, c);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c);
+ let e = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shrdv_epi32() {
+ let a = _mm512_set1_epi32(8);
+ let b = _mm512_set1_epi32(2);
+ let c = _mm512_set1_epi32(1);
+ let r = _mm512_shrdv_epi32(a, b, c);
+ let e = _mm512_set1_epi32(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shrdv_epi32() {
+ let a = _mm512_set1_epi32(8);
+ let b = _mm512_set1_epi32(2);
+ let c = _mm512_set1_epi32(1);
+ let r = _mm512_mask_shrdv_epi32(a, 0, b, c);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c);
+ let e = _mm512_set1_epi32(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shrdv_epi32() {
+ let a = _mm512_set1_epi32(8);
+ let b = _mm512_set1_epi32(2);
+ let c = _mm512_set1_epi32(1);
+ let r = _mm512_maskz_shrdv_epi32(0, a, b, c);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c);
+ let e = _mm512_set1_epi32(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shrdv_epi32() {
+ let a = _mm256_set1_epi32(8);
+ let b = _mm256_set1_epi32(2);
+ let c = _mm256_set1_epi32(1);
+ let r = _mm256_shrdv_epi32(a, b, c);
+ let e = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shrdv_epi32() {
+ let a = _mm256_set1_epi32(8);
+ let b = _mm256_set1_epi32(2);
+ let c = _mm256_set1_epi32(1);
+ let r = _mm256_mask_shrdv_epi32(a, 0, b, c);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, c);
+ let e = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shrdv_epi32() {
+ let a = _mm256_set1_epi32(8);
+ let b = _mm256_set1_epi32(2);
+ let c = _mm256_set1_epi32(1);
+ let r = _mm256_maskz_shrdv_epi32(0, a, b, c);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c);
+ let e = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shrdv_epi32() {
+ let a = _mm_set1_epi32(8);
+ let b = _mm_set1_epi32(2);
+ let c = _mm_set1_epi32(1);
+ let r = _mm_shrdv_epi32(a, b, c);
+ let e = _mm_set1_epi32(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shrdv_epi32() {
+ let a = _mm_set1_epi32(8);
+ let b = _mm_set1_epi32(2);
+ let c = _mm_set1_epi32(1);
+ let r = _mm_mask_shrdv_epi32(a, 0, b, c);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c);
+ let e = _mm_set1_epi32(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shrdv_epi32() {
+ let a = _mm_set1_epi32(8);
+ let b = _mm_set1_epi32(2);
+ let c = _mm_set1_epi32(1);
+ let r = _mm_maskz_shrdv_epi32(0, a, b, c);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c);
+ let e = _mm_set1_epi32(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shrdv_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let b = _mm512_set1_epi16(2);
+ let c = _mm512_set1_epi16(1);
+ let r = _mm512_shrdv_epi16(a, b, c);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shrdv_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let b = _mm512_set1_epi16(2);
+ let c = _mm512_set1_epi16(1);
+ let r = _mm512_mask_shrdv_epi16(a, 0, b, c);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shrdv_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let b = _mm512_set1_epi16(2);
+ let c = _mm512_set1_epi16(1);
+ let r = _mm512_maskz_shrdv_epi16(0, a, b, c);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shrdv_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let b = _mm256_set1_epi16(2);
+ let c = _mm256_set1_epi16(1);
+ let r = _mm256_shrdv_epi16(a, b, c);
+ let e = _mm256_set1_epi16(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shrdv_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let b = _mm256_set1_epi16(2);
+ let c = _mm256_set1_epi16(1);
+ let r = _mm256_mask_shrdv_epi16(a, 0, b, c);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c);
+ let e = _mm256_set1_epi16(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shrdv_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let b = _mm256_set1_epi16(2);
+ let c = _mm256_set1_epi16(1);
+ let r = _mm256_maskz_shrdv_epi16(0, a, b, c);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c);
+ let e = _mm256_set1_epi16(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shrdv_epi16() {
+ let a = _mm_set1_epi16(8);
+ let b = _mm_set1_epi16(2);
+ let c = _mm_set1_epi16(1);
+ let r = _mm_shrdv_epi16(a, b, c);
+ let e = _mm_set1_epi16(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shrdv_epi16() {
+ let a = _mm_set1_epi16(8);
+ let b = _mm_set1_epi16(2);
+ let c = _mm_set1_epi16(1);
+ let r = _mm_mask_shrdv_epi16(a, 0, b, c);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c);
+ let e = _mm_set1_epi16(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shrdv_epi16() {
+ let a = _mm_set1_epi16(8);
+ let b = _mm_set1_epi16(2);
+ let c = _mm_set1_epi16(1);
+ let r = _mm_maskz_shrdv_epi16(0, a, b, c);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c);
+ let e = _mm_set1_epi16(1);
+ assert_eq_m128i(r, e);
+ }
+
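+    // The shldi/shrdi tests mirror the shldv/shrdv tests above; the only change
+    // is that the shift count is a const generic rather than a vector operand,
+    // so the expected lane values (6 and 1) are unchanged.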
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shldi_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let b = _mm512_set1_epi64(1 << 63);
+ let r = _mm512_shldi_epi64::<2>(a, b);
+ let e = _mm512_set1_epi64(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shldi_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let b = _mm512_set1_epi64(1 << 63);
+ let r = _mm512_mask_shldi_epi64::<2>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shldi_epi64::<2>(a, 0b11111111, a, b);
+ let e = _mm512_set1_epi64(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shldi_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let b = _mm512_set1_epi64(1 << 63);
+ let r = _mm512_maskz_shldi_epi64::<2>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shldi_epi64::<2>(0b11111111, a, b);
+ let e = _mm512_set1_epi64(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shldi_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let b = _mm256_set1_epi64x(1 << 63);
+ let r = _mm256_shldi_epi64::<2>(a, b);
+ let e = _mm256_set1_epi64x(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shldi_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let b = _mm256_set1_epi64x(1 << 63);
+ let r = _mm256_mask_shldi_epi64::<2>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shldi_epi64::<2>(a, 0b00001111, a, b);
+ let e = _mm256_set1_epi64x(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shldi_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let b = _mm256_set1_epi64x(1 << 63);
+ let r = _mm256_maskz_shldi_epi64::<2>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shldi_epi64::<2>(0b00001111, a, b);
+ let e = _mm256_set1_epi64x(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shldi_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let b = _mm_set1_epi64x(1 << 63);
+ let r = _mm_shldi_epi64::<2>(a, b);
+ let e = _mm_set1_epi64x(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shldi_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let b = _mm_set1_epi64x(1 << 63);
+ let r = _mm_mask_shldi_epi64::<2>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shldi_epi64::<2>(a, 0b00000011, a, b);
+ let e = _mm_set1_epi64x(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shldi_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let b = _mm_set1_epi64x(1 << 63);
+ let r = _mm_maskz_shldi_epi64::<2>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shldi_epi64::<2>(0b00000011, a, b);
+ let e = _mm_set1_epi64x(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shldi_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(1 << 31);
+ let r = _mm512_shldi_epi32::<2>(a, b);
+ let e = _mm512_set1_epi32(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shldi_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(1 << 31);
+ let r = _mm512_mask_shldi_epi32::<2>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shldi_epi32::<2>(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shldi_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_set1_epi32(1 << 31);
+ let r = _mm512_maskz_shldi_epi32::<2>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shldi_epi32::<2>(0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shldi_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set1_epi32(1 << 31);
+ let r = _mm256_shldi_epi32::<2>(a, b);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shldi_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set1_epi32(1 << 31);
+ let r = _mm256_mask_shldi_epi32::<2>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shldi_epi32::<2>(a, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shldi_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set1_epi32(1 << 31);
+ let r = _mm256_maskz_shldi_epi32::<2>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shldi_epi32::<2>(0b11111111, a, b);
+ let e = _mm256_set1_epi32(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shldi_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set1_epi32(1 << 31);
+ let r = _mm_shldi_epi32::<2>(a, b);
+ let e = _mm_set1_epi32(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shldi_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set1_epi32(1 << 31);
+ let r = _mm_mask_shldi_epi32::<2>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shldi_epi32::<2>(a, 0b00001111, a, b);
+ let e = _mm_set1_epi32(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shldi_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set1_epi32(1 << 31);
+ let r = _mm_maskz_shldi_epi32::<2>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shldi_epi32::<2>(0b00001111, a, b);
+ let e = _mm_set1_epi32(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shldi_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1 << 15);
+ let r = _mm512_shldi_epi16::<2>(a, b);
+ let e = _mm512_set1_epi16(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shldi_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1 << 15);
+ let r = _mm512_mask_shldi_epi16::<2>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shldi_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a, b);
+ let e = _mm512_set1_epi16(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shldi_epi16() {
+ let a = _mm512_set1_epi16(1);
+ let b = _mm512_set1_epi16(1 << 15);
+ let r = _mm512_maskz_shldi_epi16::<2>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shldi_epi16::<2>(0b11111111_11111111_11111111_11111111, a, b);
+ let e = _mm512_set1_epi16(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shldi_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1 << 15);
+ let r = _mm256_shldi_epi16::<2>(a, b);
+ let e = _mm256_set1_epi16(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shldi_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1 << 15);
+ let r = _mm256_mask_shldi_epi16::<2>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shldi_epi16::<2>(a, 0b11111111_11111111, a, b);
+ let e = _mm256_set1_epi16(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shldi_epi16() {
+ let a = _mm256_set1_epi16(1);
+ let b = _mm256_set1_epi16(1 << 15);
+ let r = _mm256_maskz_shldi_epi16::<2>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shldi_epi16::<2>(0b11111111_11111111, a, b);
+ let e = _mm256_set1_epi16(6);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shldi_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1 << 15);
+ let r = _mm_shldi_epi16::<2>(a, b);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shldi_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1 << 15);
+ let r = _mm_mask_shldi_epi16::<2>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shldi_epi16::<2>(a, 0b11111111, a, b);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shldi_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(1 << 15);
+ let r = _mm_maskz_shldi_epi16::<2>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shldi_epi16::<2>(0b11111111, a, b);
+ let e = _mm_set1_epi16(6);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shrdi_epi64() {
+ let a = _mm512_set1_epi64(8);
+ let b = _mm512_set1_epi64(2);
+ let r = _mm512_shrdi_epi64::<1>(a, b);
+ let e = _mm512_set1_epi64(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shrdi_epi64() {
+ let a = _mm512_set1_epi64(8);
+ let b = _mm512_set1_epi64(2);
+ let r = _mm512_mask_shrdi_epi64::<1>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shrdi_epi64::<1>(a, 0b11111111, a, b);
+ let e = _mm512_set1_epi64(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shrdi_epi64() {
+ let a = _mm512_set1_epi64(8);
+ let b = _mm512_set1_epi64(2);
+ let r = _mm512_maskz_shrdi_epi64::<1>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shrdi_epi64::<1>(0b11111111, a, b);
+ let e = _mm512_set1_epi64(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shrdi_epi64() {
+ let a = _mm256_set1_epi64x(8);
+ let b = _mm256_set1_epi64x(2);
+ let r = _mm256_shrdi_epi64::<1>(a, b);
+ let e = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shrdi_epi64() {
+ let a = _mm256_set1_epi64x(8);
+ let b = _mm256_set1_epi64x(2);
+ let r = _mm256_mask_shrdi_epi64::<1>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shrdi_epi64::<1>(a, 0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shrdi_epi64() {
+ let a = _mm256_set1_epi64x(8);
+ let b = _mm256_set1_epi64x(2);
+ let r = _mm256_maskz_shrdi_epi64::<1>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shrdi_epi64::<1>(0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shrdi_epi64() {
+ let a = _mm_set1_epi64x(8);
+ let b = _mm_set1_epi64x(2);
+ let r = _mm_shrdi_epi64::<1>(a, b);
+ let e = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shrdi_epi64() {
+ let a = _mm_set1_epi64x(8);
+ let b = _mm_set1_epi64x(2);
+ let r = _mm_mask_shrdi_epi64::<1>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shrdi_epi64::<1>(a, 0b00000011, a, b);
+ let e = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shrdi_epi64() {
+ let a = _mm_set1_epi64x(8);
+ let b = _mm_set1_epi64x(2);
+ let r = _mm_maskz_shrdi_epi64::<1>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shrdi_epi64::<1>(0b00000011, a, b);
+ let e = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shrdi_epi32() {
+ let a = _mm512_set1_epi32(8);
+ let b = _mm512_set1_epi32(2);
+ let r = _mm512_shrdi_epi32::<1>(a, b);
+ let e = _mm512_set1_epi32(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shrdi_epi32() {
+ let a = _mm512_set1_epi32(8);
+ let b = _mm512_set1_epi32(2);
+ let r = _mm512_mask_shrdi_epi32::<1>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shrdi_epi32::<1>(a, 0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shrdi_epi32() {
+ let a = _mm512_set1_epi32(8);
+ let b = _mm512_set1_epi32(2);
+ let r = _mm512_maskz_shrdi_epi32::<1>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shrdi_epi32::<1>(0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shrdi_epi32() {
+ let a = _mm256_set1_epi32(8);
+ let b = _mm256_set1_epi32(2);
+ let r = _mm256_shrdi_epi32::<1>(a, b);
+ let e = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shrdi_epi32() {
+ let a = _mm256_set1_epi32(8);
+ let b = _mm256_set1_epi32(2);
+ let r = _mm256_mask_shrdi_epi32::<1>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shrdi_epi32::<1>(a, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shrdi_epi32() {
+ let a = _mm256_set1_epi32(8);
+ let b = _mm256_set1_epi32(2);
+ let r = _mm256_maskz_shrdi_epi32::<1>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shrdi_epi32::<1>(0b11111111, a, b);
+ let e = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shrdi_epi32() {
+ let a = _mm_set1_epi32(8);
+ let b = _mm_set1_epi32(2);
+ let r = _mm_shrdi_epi32::<1>(a, b);
+ let e = _mm_set1_epi32(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shrdi_epi32() {
+ let a = _mm_set1_epi32(8);
+ let b = _mm_set1_epi32(2);
+ let r = _mm_mask_shrdi_epi32::<1>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shrdi_epi32::<1>(a, 0b00001111, a, b);
+ let e = _mm_set1_epi32(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shrdi_epi32() {
+ let a = _mm_set1_epi32(8);
+ let b = _mm_set1_epi32(2);
+ let r = _mm_maskz_shrdi_epi32::<1>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shrdi_epi32::<1>(0b00001111, a, b);
+ let e = _mm_set1_epi32(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_shrdi_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_shrdi_epi16::<1>(a, b);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_shrdi_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_mask_shrdi_epi16::<1>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shrdi_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a, b);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_shrdi_epi16() {
+ let a = _mm512_set1_epi16(8);
+ let b = _mm512_set1_epi16(2);
+ let r = _mm512_maskz_shrdi_epi16::<1>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shrdi_epi16::<1>(0b11111111_11111111_11111111_11111111, a, b);
+ let e = _mm512_set1_epi16(1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_shrdi_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_shrdi_epi16::<1>(a, b);
+ let e = _mm256_set1_epi16(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_shrdi_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_mask_shrdi_epi16::<1>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shrdi_epi16::<1>(a, 0b11111111_11111111, a, b);
+ let e = _mm256_set1_epi16(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_shrdi_epi16() {
+ let a = _mm256_set1_epi16(8);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_maskz_shrdi_epi16::<1>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shrdi_epi16::<1>(0b11111111_11111111, a, b);
+ let e = _mm256_set1_epi16(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_shrdi_epi16() {
+ let a = _mm_set1_epi16(8);
+ let b = _mm_set1_epi16(2);
+ let r = _mm_shrdi_epi16::<1>(a, b);
+ let e = _mm_set1_epi16(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_shrdi_epi16() {
+ let a = _mm_set1_epi16(8);
+ let b = _mm_set1_epi16(2);
+ let r = _mm_mask_shrdi_epi16::<1>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_shrdi_epi16::<1>(a, 0b11111111, a, b);
+ let e = _mm_set1_epi16(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_shrdi_epi16() {
+ let a = _mm_set1_epi16(8);
+ let b = _mm_set1_epi16(2);
+ let r = _mm_maskz_shrdi_epi16::<1>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_shrdi_epi16::<1>(0b11111111, a, b);
+ let e = _mm_set1_epi16(1);
+ assert_eq_m128i(r, e);
+ }
+
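+    // Expand loads read consecutive elements from memory into the destination
+    // lanes whose mask bit is set (lowest mask bit = lowest lane); lanes whose
+    // bit is clear take `src` in the mask variant or zero in the maskz variant.
+    // (`black_box` hides the pointer from the optimizer so the masked load is
+    // actually emitted rather than const-folded.)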
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_expandloadu_epi16() {
+ let src = _mm512_set1_epi16(42);
+ let a = &[
+ 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010_11110000_00001111;
+ let r = _mm512_mask_expandloadu_epi16(src, m, black_box(p));
+ let e = _mm512_set_epi16(
+ 16, 15, 14, 42, 13, 42, 42, 42, 12, 11, 42, 42, 10, 42, 9, 42, 8, 7, 6, 5, 42, 42, 42,
+ 42, 42, 42, 42, 42, 4, 3, 2, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_expandloadu_epi16() {
+ let a = &[
+ 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010_11110000_00001111;
+ let r = _mm512_maskz_expandloadu_epi16(m, black_box(p));
+ let e = _mm512_set_epi16(
+ 16, 15, 14, 0, 13, 0, 0, 0, 12, 11, 0, 0, 10, 0, 9, 0, 8, 7, 6, 5, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 3, 2, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_expandloadu_epi16() {
+ let src = _mm256_set1_epi16(42);
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm256_mask_expandloadu_epi16(src, m, black_box(p));
+ let e = _mm256_set_epi16(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_expandloadu_epi16() {
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm256_maskz_expandloadu_epi16(m, black_box(p));
+ let e = _mm256_set_epi16(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_expandloadu_epi16() {
+ let src = _mm_set1_epi16(42);
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm_mask_expandloadu_epi16(src, m, black_box(p));
+ let e = _mm_set_epi16(4, 3, 2, 42, 1, 42, 42, 42);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_expandloadu_epi16() {
+ let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8];
+ let p = a.as_ptr();
+ let m = 0b11101000;
+ let r = _mm_maskz_expandloadu_epi16(m, black_box(p));
+ let e = _mm_set_epi16(4, 3, 2, 0, 1, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_expandloadu_epi8() {
+ let src = _mm512_set1_epi8(42);
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010_11110000_00001111_11111111_00000000_10101010_01010101;
+ let r = _mm512_mask_expandloadu_epi8(src, m, black_box(p));
+ let e = _mm512_set_epi8(
+ 32, 31, 30, 42, 29, 42, 42, 42, 28, 27, 42, 42, 26, 42, 25, 42, 24, 23, 22, 21, 42, 42,
+ 42, 42, 42, 42, 42, 42, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 42, 42, 42, 42,
+ 42, 42, 42, 42, 8, 42, 7, 42, 6, 42, 5, 42, 42, 4, 42, 3, 42, 2, 42, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_maskz_expandloadu_epi8() {
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010_11110000_00001111_11111111_00000000_10101010_01010101;
+ let r = _mm512_maskz_expandloadu_epi8(m, black_box(p));
+ let e = _mm512_set_epi8(
+ 32, 31, 30, 0, 29, 0, 0, 0, 28, 27, 0, 0, 26, 0, 25, 0, 24, 23, 22, 21, 0, 0, 0, 0, 0,
+ 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0,
+ 7, 0, 6, 0, 5, 0, 0, 4, 0, 3, 0, 2, 0, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_expandloadu_epi8() {
+ let src = _mm256_set1_epi8(42);
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010_11110000_00001111;
+ let r = _mm256_mask_expandloadu_epi8(src, m, black_box(p));
+ let e = _mm256_set_epi8(
+ 16, 15, 14, 42, 13, 42, 42, 42, 12, 11, 42, 42, 10, 42, 9, 42, 8, 7, 6, 5, 42, 42, 42,
+ 42, 42, 42, 42, 42, 4, 3, 2, 1,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_maskz_expandloadu_epi8() {
+ let a = &[
+ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010_11110000_00001111;
+ let r = _mm256_maskz_expandloadu_epi8(m, black_box(p));
+ let e = _mm256_set_epi8(
+ 16, 15, 14, 0, 13, 0, 0, 0, 12, 11, 0, 0, 10, 0, 9, 0, 8, 7, 6, 5, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 3, 2, 1,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_expandloadu_epi8() {
+ let src = _mm_set1_epi8(42);
+ let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm_mask_expandloadu_epi8(src, m, black_box(p));
+ let e = _mm_set_epi8(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_maskz_expandloadu_epi8() {
+ let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+ let p = a.as_ptr();
+ let m = 0b11101000_11001010;
+ let r = _mm_maskz_expandloadu_epi8(m, black_box(p));
+ let e = _mm_set_epi8(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0);
+ assert_eq_m128i(r, e);
+ }
+
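+    // Compress stores are the inverse of expand loads: lanes whose mask bit is
+    // set are written contiguously to memory starting at the pointer, and the
+    // remainder of the buffer is left untouched.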
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_compressstoreu_epi16() {
+ let a = _mm512_set_epi16(
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11,
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ );
+ let mut r = [0_i16; 32];
+ _mm512_mask_compressstoreu_epi16(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i16; 32]);
+ _mm512_mask_compressstoreu_epi16(
+ r.as_mut_ptr() as *mut _,
+ 0b11110000_11001010_11111111_00000000,
+ a,
+ );
+ assert_eq!(
+ &r,
+ &[
+ 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 24, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0
+ ]
+ );
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_compressstoreu_epi16() {
+ let a = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let mut r = [0_i16; 16];
+ _mm256_mask_compressstoreu_epi16(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i16; 16]);
+ _mm256_mask_compressstoreu_epi16(r.as_mut_ptr() as *mut _, 0b11110000_11001010, a);
+ assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_compressstoreu_epi16() {
+ let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ let mut r = [0_i16; 8];
+ _mm_mask_compressstoreu_epi16(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i16; 8]);
+ _mm_mask_compressstoreu_epi16(r.as_mut_ptr() as *mut _, 0b11110000, a);
+ assert_eq!(&r, &[5, 6, 7, 8, 0, 0, 0, 0]);
+ }
+
+ #[simd_test(enable = "avx512vbmi2")]
+ unsafe fn test_mm512_mask_compressstoreu_epi8() {
+ let a = _mm512_set_epi8(
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43,
+ 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21,
+ 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ );
+ let mut r = [0_i8; 64];
+ _mm512_mask_compressstoreu_epi8(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i8; 64]);
+ _mm512_mask_compressstoreu_epi8(
+ r.as_mut_ptr() as *mut _,
+ 0b11110000_11001010_11111111_00000000_10101010_01010101_11110000_00001111,
+ a,
+ );
+ assert_eq!(
+ &r,
+ &[
+ 1, 2, 3, 4, 13, 14, 15, 16, 17, 19, 21, 23, 26, 28, 30, 32, 41, 42, 43, 44, 45, 46,
+ 47, 48, 50, 52, 55, 56, 61, 62, 63, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ ]
+ );
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm256_mask_compressstoreu_epi8() {
+ let a = _mm256_set_epi8(
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11,
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ );
+ let mut r = [0_i8; 32];
+ _mm256_mask_compressstoreu_epi8(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i8; 32]);
+ _mm256_mask_compressstoreu_epi8(
+ r.as_mut_ptr() as *mut _,
+ 0b11110000_11001010_11111111_00000000,
+ a,
+ );
+ assert_eq!(
+ &r,
+ &[
+ 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 24, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0
+ ]
+ );
+ }
+
+ #[simd_test(enable = "avx512vbmi2,avx512vl")]
+ unsafe fn test_mm_mask_compressstoreu_epi8() {
+ let a = _mm_set_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ let mut r = [0_i8; 16];
+ _mm_mask_compressstoreu_epi8(r.as_mut_ptr() as *mut _, 0, a);
+ assert_eq!(&r, &[0_i8; 16]);
+ _mm_mask_compressstoreu_epi8(r.as_mut_ptr() as *mut _, 0b11110000_11001010, a);
+ assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
new file mode 100644
index 000000000..ff2c773ec
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
@@ -0,0 +1,939 @@
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssd_epi32&expand=2219)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+}
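+
+// An editorial sketch, not upstream code: a scalar model of one 32-bit lane of
+// `vpdpwssd`, under the assumption that the two signed 16-bit halves of the
+// lane are multiplied pairwise and accumulated with wrapping i32 addition. The
+// helper name `dpwssd_lane` is hypothetical, for illustration only; e.g.
+// `dpwssd_lane(1, 0x0002_0002, 0x0002_0002)` is 1 + 2*2 + 2*2 = 9.
+#[allow(dead_code)]
+fn dpwssd_lane(src: i32, a: i32, b: i32) -> i32 {
+    // Split each 32-bit lane into its low and high signed 16-bit halves.
+    let (a_lo, a_hi) = (a as i16 as i32, (a >> 16) as i16 as i32);
+    let (b_lo, b_hi) = (b as i16 as i32, (b >> 16) as i16 as i32);
+    // Each i16 * i16 product fits in i32; the accumulation wraps like hardware.
+    src.wrapping_add(a_lo * b_lo).wrapping_add(a_hi * b_hi)
+}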
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssd_epi32&expand=2220)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm512_mask_dpwssd_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssd_epi32&expand=2221)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm512_maskz_dpwssd_epi32(
+ k: __mmask16,
+ src: __m512i,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
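+
+// Note: the mask/maskz wrappers above share one pattern: compute the full-width
+// result, then `simd_select_bitmask` picks result lanes where the mask bit is
+// set and `src` (or zero) lanes where it is clear. The remaining widths and the
+// saturating variants below repeat this shape verbatim.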
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_epi32&expand=2216)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssd_epi32&expand=2217)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm256_mask_dpwssd_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssd_epi32&expand=2218)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm256_maskz_dpwssd_epi32(
+ k: __mmask8,
+ src: __m256i,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_epi32&expand=2213)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssd_epi32&expand=2214)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssd_epi32&expand=2215)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssds_epi32&expand=2228)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+}
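+
+// An editorial sketch, not upstream code: the saturating variant modelled on a
+// single lane. The full sum is taken in a wider type and clamped once to i32,
+// which is what distinguishes signed saturation from the wrapping `dpwssd`
+// form; e.g. `dpwssds_lane(i32::MAX, 1, 1)` stays at `i32::MAX`. The helper
+// name is hypothetical.
+#[allow(dead_code)]
+fn dpwssds_lane(src: i32, a: i32, b: i32) -> i32 {
+    let (a_lo, a_hi) = (a as i16 as i64, (a >> 16) as i16 as i64);
+    let (b_lo, b_hi) = (b as i16 as i64, (b >> 16) as i16 as i64);
+    // Accumulate exactly in i64, then saturate a single time on the way out.
+    let sum = src as i64 + a_lo * b_lo + a_hi * b_hi;
+    sum.clamp(i32::MIN as i64, i32::MAX as i64) as i32
+}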
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssds_epi32&expand=2229)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm512_mask_dpwssds_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssds_epi32&expand=2230)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm512_maskz_dpwssds_epi32(
+ k: __mmask16,
+ src: __m512i,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_epi32&expand=2225)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssds_epi32&expand=2226)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm256_mask_dpwssds_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssds_epi32&expand=2227)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm256_maskz_dpwssds_epi32(
+ k: __mmask8,
+ src: __m256i,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_epi32&expand=2222)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssds_epi32&expand=2223)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssds_epi32&expand=2224)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm_maskz_dpwssds_epi32(
+ k: __mmask8,
+ src: __m128i,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusd_epi32&expand=2201)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+}
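+
+// An editorial sketch, not upstream code: one 32-bit lane of `vpdpbusd`,
+// assuming the four bytes of `a` are taken as unsigned, the four bytes of `b`
+// as signed, and the four products accumulate into i32 with wrapping addition.
+// The helper name is hypothetical; e.g. with a = 0x0101_0101 (four 1s) and
+// b = 0xFFFF_FFFF (four -1s), `dpbusd_lane(10, a, b)` is 10 - 4 = 6.
+#[allow(dead_code)]
+fn dpbusd_lane(src: i32, a: u32, b: u32) -> i32 {
+    let mut acc = src;
+    for i in 0..4 {
+        let au = ((a >> (8 * i)) & 0xff) as i32; // unsigned byte of `a`
+        let bs = ((b >> (8 * i)) & 0xff) as i8 as i32; // signed byte of `b`
+        acc = acc.wrapping_add(au * bs);
+    }
+    acc
+}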
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusd_epi32&expand=2202)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm512_mask_dpbusd_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusd_epi32&expand=2203)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm512_maskz_dpbusd_epi32(
+ k: __mmask16,
+ src: __m512i,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_epi32&expand=2198)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusd_epi32&expand=2199)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm256_mask_dpbusd_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusd_epi32&expand=2200)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm256_maskz_dpbusd_epi32(
+ k: __mmask8,
+ src: __m256i,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_epi32&expand=2195)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusd_epi32&expand=2196)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusd_epi32&expand=2197)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusds_epi32&expand=2210)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+ transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+}
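+
+// The saturating byte variant differs from `vpdpbusd` only in the final
+// accumulate (a sketch, using the same byte decomposition as the model
+// above): the exact sum is clamped to the i32 range instead of wrapping.
+#[cfg(test)]
+#[allow(dead_code)]
+fn dpbusds_lane_model(src: i32, a: u32, b: u32) -> i32 {
+ let mut sum = src as i64;
+ for i in 0..4 {
+ let ua = ((a >> (8 * i)) & 0xff) as i64;
+ let sb = ((b >> (8 * i)) & 0xff) as u8 as i8 as i64;
+ sum += ua * sb;
+ }
+ sum.clamp(i32::MIN as i64, i32::MAX as i64) as i32
+}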
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusds_epi32&expand=2211)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm512_mask_dpbusds_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusds_epi32&expand=2212)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm512_maskz_dpbusds_epi32(
+ k: __mmask16,
+ src: __m512i,
+ a: __m512i,
+ b: __m512i,
+) -> __m512i {
+ let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_epi32&expand=2207)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusds_epi32&expand=2208)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm256_mask_dpbusds_epi32(
+ src: __m256i,
+ k: __mmask8,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
+ transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusds_epi32&expand=2209)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm256_maskz_dpbusds_epi32(
+ k: __mmask8,
+ src: __m256i,
+ a: __m256i,
+ b: __m256i,
+) -> __m256i {
+ let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_epi32&expand=2204)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusds_epi32&expand=2205)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+ let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
+ transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusds_epi32&expand=2206)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm_maskz_dpbusds_epi32(
+ k: __mmask8,
+ src: __m128i,
+ a: __m128i,
+ b: __m128i,
+) -> __m128i {
+ let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx512.vpdpwssd.512"]
+ fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.vpdpwssd.256"]
+ fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.vpdpwssd.128"]
+ fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.vpdpwssds.512"]
+ fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.vpdpwssds.256"]
+ fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.vpdpwssds.128"]
+ fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.vpdpbusd.512"]
+ fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.vpdpbusd.256"]
+ fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.vpdpbusd.128"]
+ fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx512.vpdpbusds.512"]
+ fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+ #[link_name = "llvm.x86.avx512.vpdpbusds.256"]
+ fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx512.vpdpbusds.128"]
+ fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+}
+
+#[cfg(test)]
+mod tests {
+
+ use crate::core_arch::x86::*;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_dpwssd_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm512_dpwssd_epi32(src, a, b);
+ let e = _mm512_set1_epi32(3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_mask_dpwssd_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_maskz_dpwssd_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b);
+ let e = _mm512_set1_epi32(3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_dpwssd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwssd_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_mask_dpwssd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_maskz_dpwssd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_dpwssd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwssd_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_mask_dpwssd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_maskz_dpwssd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_dpwssds_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm512_dpwssds_epi32(src, a, b);
+ let e = _mm512_set1_epi32(3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_mask_dpwssds_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_maskz_dpwssds_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b);
+ let e = _mm512_set1_epi32(3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_dpwssds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwssds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_mask_dpwssds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_maskz_dpwssds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_dpwssds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwssds_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_mask_dpwssds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_maskz_dpwssds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_dpbusd_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm512_dpbusd_epi32(src, a, b);
+ let e = _mm512_set1_epi32(5);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_mask_dpbusd_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(5);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_maskz_dpbusd_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b);
+ let e = _mm512_set1_epi32(5);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_dpbusd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbusd_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_mask_dpbusd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_maskz_dpbusd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_dpbusd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbusd_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_mask_dpbusd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_maskz_dpbusd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_dpbusds_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm512_dpbusds_epi32(src, a, b);
+ let e = _mm512_set1_epi32(5);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_mask_dpbusds_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b);
+ let e = _mm512_set1_epi32(5);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni")]
+ unsafe fn test_mm512_maskz_dpbusds_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b);
+ let e = _mm512_set1_epi32(5);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_dpbusds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbusds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_mask_dpbusds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm256_maskz_dpbusds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_dpbusds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbusds_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_mask_dpbusds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512vnni,avx512vl")]
+ unsafe fn test_mm_maskz_dpbusds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vpclmulqdq.rs b/library/stdarch/crates/core_arch/src/x86/avx512vpclmulqdq.rs
new file mode 100644
index 000000000..9bfeb903a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vpclmulqdq.rs
@@ -0,0 +1,258 @@
+//! Vectorized Carry-less Multiplication (VCLMUL)
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
+//!
+//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use crate::core_arch::x86::__m256i;
+use crate::core_arch::x86::__m512i;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.pclmulqdq.256"]
+ fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
+ #[link_name = "llvm.x86.pclmulqdq.512"]
+ fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i;
+}
+
+// For some odd reason, on x86_64 we generate the correct long-name
+// instructions, but on i686 we generate the short name + imm8, so we need to
+// special-case on that.
+
+/// Performs a carry-less multiplication of two 64-bit polynomials over the
+/// finite field GF(2^k), in each of the 4 128-bit lanes.
+///
+/// The immediate byte determines which half of each 128-bit lane of `a` and
+/// `b` is used. Immediate bits other than 0 and 4 are ignored.
+/// All lanes share the same immediate byte.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
+#[inline]
+#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
+// Technically, according to Intel's documentation we don't need avx512f here; however, LLVM gets confused otherwise.
+#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm512_clmulepi64_epi128<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+ static_assert_imm8!(IMM8);
+ pclmulqdq_512(a, b, IMM8 as u8)
+}
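+
+// A scalar sketch of the per-lane operation (a hypothetical helper, not part
+// of the intrinsics API): carry-less multiplication is polynomial
+// multiplication over GF(2), i.e. shift-and-XOR instead of shift-and-add.
+// Immediate bit 0 picks the qword of `a` and bit 4 picks the qword of `b`
+// that feed this multiplication in each lane.
+#[cfg(test)]
+#[allow(dead_code)]
+fn clmul_u64_model(a: u64, b: u64) -> u128 {
+ let mut r: u128 = 0;
+ for i in 0..64 {
+ if (b >> i) & 1 == 1 {
+ r ^= (a as u128) << i; // XOR replaces the carrying add
+ }
+ }
+ r
+}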
+
+/// Performs a carry-less multiplication of two 64-bit polynomials over the
+/// finite field GF(2^k), in each of the 2 128-bit lanes.
+///
+/// The immediate byte determines which half of each 128-bit lane of `a` and
+/// `b` is used. Immediate bits other than 0 and 4 are ignored.
+/// All lanes share the same immediate byte.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_clmulepi64_epi128)
+#[inline]
+#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm256_clmulepi64_epi128<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ pclmulqdq_256(a, b, IMM8 as u8)
+}
+
+#[cfg(test)]
+mod tests {
+ // The constants in the tests below are just bit patterns. They should not
+ // be interpreted as integers; signedness does not make sense for them, but
+ // __mXXXi happens to be defined in terms of signed integers.
+ #![allow(overflowing_literals)]
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ macro_rules! verify_kat_pclmul {
+ ($broadcast:ident, $clmul:ident, $assert:ident) => {
+ // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
+ let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
+ let a = $broadcast(a);
+ let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
+ let b = $broadcast(b);
+ let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
+ let r00 = $broadcast(r00);
+ let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
+ let r01 = $broadcast(r01);
+ let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
+ let r10 = $broadcast(r10);
+ let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
+ let r11 = $broadcast(r11);
+
+ $assert($clmul::<0x00>(a, b), r00);
+ $assert($clmul::<0x10>(a, b), r01);
+ $assert($clmul::<0x01>(a, b), r10);
+ $assert($clmul::<0x11>(a, b), r11);
+
+ let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
+ let a0 = $broadcast(a0);
+ let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
+ let r = $broadcast(r);
+ $assert($clmul::<0x00>(a0, a0), r);
+ }
+ }
+
+ macro_rules! unroll {
+ ($target:ident[4] = $op:ident::<4>($source:ident);) => {
+ $target[3] = $op::<3>($source);
+ $target[2] = $op::<2>($source);
+ unroll! {$target[2] = $op::<2>($source);}
+ };
+ ($target:ident[2] = $op:ident::<2>($source:ident);) => {
+ $target[1] = $op::<1>($source);
+ $target[0] = $op::<0>($source);
+ };
+ (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => {
+ assert_eq_m128i($op::<3>($vec_res), $lin_res[3]);
+ assert_eq_m128i($op::<2>($vec_res), $lin_res[2]);
+ unroll! {assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);}
+ };
+ (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => {
+ assert_eq_m128i($op::<1>($vec_res), $lin_res[1]);
+ assert_eq_m128i($op::<0>($vec_res), $lin_res[0]);
+ };
+ }
+
+ // This function tests one of the 4 possible immediate instances
+ // with different inputs across lanes.
+ #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
+ unsafe fn verify_512_helper(
+ linear: unsafe fn(__m128i, __m128i) -> __m128i,
+ vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
+ ) {
+ let a = _mm512_set_epi64(
+ 0xDCB4DB3657BF0B7D,
+ 0x18DB0601068EDD9F,
+ 0xB76B908233200DC5,
+ 0xE478235FA8E22D5E,
+ 0xAB05CFFA2621154C,
+ 0x1171B47A186174C9,
+ 0x8C6B6C0E7595CEC9,
+ 0xBE3E7D4934E961BD,
+ );
+ let b = _mm512_set_epi64(
+ 0x672F6F105A94CEA7,
+ 0x8298B8FFCA5F829C,
+ 0xA3927047B3FB61D8,
+ 0x978093862CDE7187,
+ 0xB1927AB22F31D0EC,
+ 0xA9A5DA619BE4D7AF,
+ 0xCA2590F56884FDC6,
+ 0x19BE9F660038BDB5,
+ );
+
+ let mut a_decomp = [_mm_setzero_si128(); 4];
+ unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);}
+ let mut b_decomp = [_mm_setzero_si128(); 4];
+ unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);}
+
+ let r = vectorized(a, b);
+ let mut e_decomp = [_mm_setzero_si128(); 4];
+ for i in 0..4 {
+ e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
+ }
+ unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);}
+ }
+
+ // This function tests one of the 4 possible immediate instances
+ // with different inputs across lanes for the VL version.
+ #[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
+ unsafe fn verify_256_helper(
+ linear: unsafe fn(__m128i, __m128i) -> __m128i,
+ vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
+ ) {
+ let a = _mm512_set_epi64(
+ 0xDCB4DB3657BF0B7D,
+ 0x18DB0601068EDD9F,
+ 0xB76B908233200DC5,
+ 0xE478235FA8E22D5E,
+ 0xAB05CFFA2621154C,
+ 0x1171B47A186174C9,
+ 0x8C6B6C0E7595CEC9,
+ 0xBE3E7D4934E961BD,
+ );
+ let b = _mm512_set_epi64(
+ 0x672F6F105A94CEA7,
+ 0x8298B8FFCA5F829C,
+ 0xA3927047B3FB61D8,
+ 0x978093862CDE7187,
+ 0xB1927AB22F31D0EC,
+ 0xA9A5DA619BE4D7AF,
+ 0xCA2590F56884FDC6,
+ 0x19BE9F660038BDB5,
+ );
+
+ let mut a_decomp = [_mm_setzero_si128(); 2];
+ unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);}
+ let mut b_decomp = [_mm_setzero_si128(); 2];
+ unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);}
+
+ let r = vectorized(
+ _mm512_extracti64x4_epi64::<0>(a),
+ _mm512_extracti64x4_epi64::<0>(b),
+ );
+ let mut e_decomp = [_mm_setzero_si128(); 2];
+ for i in 0..2 {
+ e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
+ }
+ unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);}
+ }
+
+ #[simd_test(enable = "avx512vpclmulqdq,avx512f")]
+ unsafe fn test_mm512_clmulepi64_epi128() {
+ verify_kat_pclmul!(
+ _mm512_broadcast_i32x4,
+ _mm512_clmulepi64_epi128,
+ assert_eq_m512i
+ );
+
+ verify_512_helper(
+ |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
+ |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b),
+ );
+ verify_512_helper(
+ |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
+ |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b),
+ );
+ verify_512_helper(
+ |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
+ |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b),
+ );
+ verify_512_helper(
+ |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
+ |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b),
+ );
+ }
+
+ #[simd_test(enable = "avx512vpclmulqdq,avx512vl")]
+ unsafe fn test_mm256_clmulepi64_epi128() {
+ verify_kat_pclmul!(
+ _mm256_broadcastsi128_si256,
+ _mm256_clmulepi64_epi128,
+ assert_eq_m256i
+ );
+
+ verify_256_helper(
+ |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
+ |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b),
+ );
+ verify_256_helper(
+ |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
+ |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b),
+ );
+ verify_256_helper(
+ |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
+ |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b),
+ );
+ verify_256_helper(
+ |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
+ |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b),
+ );
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs b/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs
new file mode 100644
index 000000000..3b97c4c19
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs
@@ -0,0 +1,541 @@
+//! Vectorized Population Count Instructions for Double- and Quadwords (VPOPCNTDQ)
+//!
+//! The intrinsics here correspond to those in the `immintrin.h` C header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use crate::core_arch::simd::i32x16;
+use crate::core_arch::simd::i32x4;
+use crate::core_arch::simd::i32x8;
+use crate::core_arch::simd::i64x2;
+use crate::core_arch::simd::i64x4;
+use crate::core_arch::simd::i64x8;
+use crate::core_arch::simd_llvm::simd_select_bitmask;
+use crate::core_arch::x86::__m128i;
+use crate::core_arch::x86::__m256i;
+use crate::core_arch::x86::__m512i;
+use crate::core_arch::x86::__mmask16;
+use crate::core_arch::x86::__mmask8;
+use crate::core_arch::x86::_mm256_setzero_si256;
+use crate::core_arch::x86::_mm512_setzero_si512;
+use crate::core_arch::x86::_mm_setzero_si128;
+use crate::core_arch::x86::m128iExt;
+use crate::core_arch::x86::m256iExt;
+use crate::core_arch::x86::m512iExt;
+use crate::mem::transmute;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.ctpop.v16i32"]
+ fn popcnt_v16i32(x: i32x16) -> i32x16;
+ #[link_name = "llvm.ctpop.v8i32"]
+ fn popcnt_v8i32(x: i32x8) -> i32x8;
+ #[link_name = "llvm.ctpop.v4i32"]
+ fn popcnt_v4i32(x: i32x4) -> i32x4;
+
+ #[link_name = "llvm.ctpop.v8i64"]
+ fn popcnt_v8i64(x: i64x8) -> i64x8;
+ #[link_name = "llvm.ctpop.v4i64"]
+ fn popcnt_v4i64(x: i64x4) -> i64x4;
+ #[link_name = "llvm.ctpop.v2i64"]
+ fn popcnt_v2i64(x: i64x2) -> i64x2;
+}
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm512_popcnt_epi32(a: __m512i) -> __m512i {
+ transmute(popcnt_v16i32(a.as_i32x16()))
+}
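+
+// Element-wise reference (an illustrative sketch): every lane behaves like
+// `i32::count_ones`, so e.g. a lane holding -1 (all bits set) becomes 32.
+#[cfg(test)]
+#[allow(dead_code)]
+fn popcnt_lane_model(x: i32) -> i32 {
+ x.count_ones() as i32
+}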
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, popcnt_v16i32(a.as_i32x16()), zero))
+}
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v16i32(a.as_i32x16()),
+ src.as_i32x16(),
+ ))
+}
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm256_popcnt_epi32(a: __m256i) -> __m256i {
+ transmute(popcnt_v8i32(a.as_i32x8()))
+}
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
+ let zero = _mm256_setzero_si256().as_i32x8();
+ transmute(simd_select_bitmask(k, popcnt_v8i32(a.as_i32x8()), zero))
+}
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v8i32(a.as_i32x8()),
+ src.as_i32x8(),
+ ))
+}
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm_popcnt_epi32(a: __m128i) -> __m128i {
+ transmute(popcnt_v4i32(a.as_i32x4()))
+}
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128().as_i32x4();
+ transmute(simd_select_bitmask(k, popcnt_v4i32(a.as_i32x4()), zero))
+}
+
+/// For each packed 32-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_popcnt_epi32)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntd))]
+pub unsafe fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v4i32(a.as_i32x4()),
+ src.as_i32x4(),
+ ))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm512_popcnt_epi64(a: __m512i) -> __m512i {
+ transmute(popcnt_v8i64(a.as_i64x8()))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, popcnt_v8i64(a.as_i64x8()), zero))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v8i64(a.as_i64x8()),
+ src.as_i64x8(),
+ ))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm256_popcnt_epi64(a: __m256i) -> __m256i {
+ transmute(popcnt_v4i64(a.as_i64x4()))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
+ let zero = _mm256_setzero_si256().as_i64x4();
+ transmute(simd_select_bitmask(k, popcnt_v4i64(a.as_i64x4()), zero))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v4i64(a.as_i64x4()),
+ src.as_i64x4(),
+ ))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm_popcnt_epi64(a: __m128i) -> __m128i {
+ transmute(popcnt_v2i64(a.as_i64x2()))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128().as_i64x2();
+ transmute(simd_select_bitmask(k, popcnt_v2i64(a.as_i64x2()), zero))
+}
+
+/// For each packed 64-bit integer, maps the value to the number of logical 1 bits.
+///
+/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
+/// Otherwise the computation result is written into the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_popcnt_epi64)
+#[inline]
+#[target_feature(enable = "avx512vpopcntdq,avx512vl")]
+#[cfg_attr(test, assert_instr(vpopcntq))]
+pub unsafe fn _mm_mask_popcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+ transmute(simd_select_bitmask(
+ k,
+ popcnt_v2i64(a.as_i64x2()),
+ src.as_i64x2(),
+ ))
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f")]
+ unsafe fn test_mm512_popcnt_epi32() {
+ let test_data = _mm512_set_epi32(
+ 0,
+ 1,
+ -1,
+ 2,
+ 7,
+ 0xFF_FE,
+ 0x7F_FF_FF_FF,
+ -100,
+ 0x40_00_00_00,
+ 103,
+ 371,
+ 552,
+ 432_948,
+ 818_826_998,
+ 255,
+ 256,
+ );
+ let actual_result = _mm512_popcnt_epi32(test_data);
+ let reference_result =
+ _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 1, 5, 6, 3, 10, 17, 8, 1);
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f")]
+ unsafe fn test_mm512_mask_popcnt_epi32() {
+ let test_data = _mm512_set_epi32(
+ 0,
+ 1,
+ -1,
+ 2,
+ 7,
+ 0xFF_FE,
+ 0x7F_FF_FF_FF,
+ -100,
+ 0x40_00_00_00,
+ 103,
+ 371,
+ 552,
+ 432_948,
+ 818_826_998,
+ 255,
+ 256,
+ );
+ let mask = 0xFF_00;
+ let actual_result = _mm512_mask_popcnt_epi32(test_data, mask, test_data);
+ let reference_result = _mm512_set_epi32(
+ 0,
+ 1,
+ 32,
+ 1,
+ 3,
+ 15,
+ 31,
+ 28,
+ 0x40_00_00_00,
+ 103,
+ 371,
+ 552,
+ 432_948,
+ 818_826_998,
+ 255,
+ 256,
+ );
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f")]
+ unsafe fn test_mm512_maskz_popcnt_epi32() {
+ let test_data = _mm512_set_epi32(
+ 0,
+ 1,
+ -1,
+ 2,
+ 7,
+ 0xFF_FE,
+ 0x7F_FF_FF_FF,
+ -100,
+ 0x40_00_00_00,
+ 103,
+ 371,
+ 552,
+ 432_948,
+ 818_826_998,
+ 255,
+ 256,
+ );
+ let mask = 0xFF_00;
+ let actual_result = _mm512_maskz_popcnt_epi32(mask, test_data);
+ let reference_result = _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")]
+ unsafe fn test_mm256_popcnt_epi32() {
+ let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100);
+ let actual_result = _mm256_popcnt_epi32(test_data);
+ let reference_result = _mm256_set_epi32(0, 1, 32, 1, 3, 15, 31, 28);
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_popcnt_epi32() {
+ let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100);
+ let mask = 0xF0;
+ let actual_result = _mm256_mask_popcnt_epi32(test_data, mask, test_data);
+ let reference_result = _mm256_set_epi32(0, 1, 32, 1, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100);
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_popcnt_epi32() {
+ let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100);
+ let mask = 0xF0;
+ let actual_result = _mm256_maskz_popcnt_epi32(mask, test_data);
+ let reference_result = _mm256_set_epi32(0, 1, 32, 1, 0, 0, 0, 0);
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")]
+ unsafe fn test_mm_popcnt_epi32() {
+ let test_data = _mm_set_epi32(0, 1, -1, -100);
+ let actual_result = _mm_popcnt_epi32(test_data);
+ let reference_result = _mm_set_epi32(0, 1, 32, 28);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")]
+ unsafe fn test_mm_mask_popcnt_epi32() {
+ let test_data = _mm_set_epi32(0, 1, -1, -100);
+ let mask = 0xE;
+ let actual_result = _mm_mask_popcnt_epi32(test_data, mask, test_data);
+ let reference_result = _mm_set_epi32(0, 1, 32, -100);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_popcnt_epi32() {
+ let test_data = _mm_set_epi32(0, 1, -1, -100);
+ let mask = 0xE;
+ let actual_result = _mm_maskz_popcnt_epi32(mask, test_data);
+ let reference_result = _mm_set_epi32(0, 1, 32, 0);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f")]
+ unsafe fn test_mm512_popcnt_epi64() {
+ let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100);
+ let actual_result = _mm512_popcnt_epi64(test_data);
+ let reference_result = _mm512_set_epi64(0, 1, 64, 1, 3, 15, 63, 60);
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f")]
+ unsafe fn test_mm512_mask_popcnt_epi64() {
+ let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100);
+ let mask = 0xF0;
+ let actual_result = _mm512_mask_popcnt_epi64(test_data, mask, test_data);
+ let reference_result =
+ _mm512_set_epi64(0, 1, 64, 1, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100);
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512f")]
+ unsafe fn test_mm512_maskz_popcnt_epi64() {
+ let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100);
+ let mask = 0xF0;
+ let actual_result = _mm512_maskz_popcnt_epi64(mask, test_data);
+ let reference_result = _mm512_set_epi64(0, 1, 64, 1, 0, 0, 0, 0);
+ assert_eq_m512i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512vl")]
+ unsafe fn test_mm256_popcnt_epi64() {
+ let test_data = _mm256_set_epi64x(0, 1, -1, -100);
+ let actual_result = _mm256_popcnt_epi64(test_data);
+ let reference_result = _mm256_set_epi64x(0, 1, 64, 60);
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512vl")]
+ unsafe fn test_mm256_mask_popcnt_epi64() {
+ let test_data = _mm256_set_epi64x(0, 1, -1, -100);
+ let mask = 0xE;
+ let actual_result = _mm256_mask_popcnt_epi64(test_data, mask, test_data);
+ let reference_result = _mm256_set_epi64x(0, 1, 64, -100);
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512vl")]
+ unsafe fn test_mm256_maskz_popcnt_epi64() {
+ let test_data = _mm256_set_epi64x(0, 1, -1, -100);
+ let mask = 0xE;
+ let actual_result = _mm256_maskz_popcnt_epi64(mask, test_data);
+ let reference_result = _mm256_set_epi64x(0, 1, 64, 0);
+ assert_eq_m256i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512vl")]
+ unsafe fn test_mm_popcnt_epi64() {
+ let test_data = _mm_set_epi64x(0, 1);
+ let actual_result = _mm_popcnt_epi64(test_data);
+ let reference_result = _mm_set_epi64x(0, 1);
+ assert_eq_m128i(actual_result, reference_result);
+ let test_data = _mm_set_epi64x(-1, -100);
+ let actual_result = _mm_popcnt_epi64(test_data);
+ let reference_result = _mm_set_epi64x(64, 60);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512vl")]
+ unsafe fn test_mm_mask_popcnt_epi64() {
+ let test_data = _mm_set_epi64x(0, -100);
+ let mask = 0x2;
+ let actual_result = _mm_mask_popcnt_epi64(test_data, mask, test_data);
+ let reference_result = _mm_set_epi64x(0, -100);
+ assert_eq_m128i(actual_result, reference_result);
+ let test_data = _mm_set_epi64x(-1, 1);
+ let mask = 0x2;
+ let actual_result = _mm_mask_popcnt_epi64(test_data, mask, test_data);
+ let reference_result = _mm_set_epi64x(64, 1);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+
+ #[simd_test(enable = "avx512vpopcntdq,avx512vl")]
+ unsafe fn test_mm_maskz_popcnt_epi64() {
+ let test_data = _mm_set_epi64x(0, 1);
+ let mask = 0x2;
+ let actual_result = _mm_maskz_popcnt_epi64(mask, test_data);
+ let reference_result = _mm_set_epi64x(0, 0);
+ assert_eq_m128i(actual_result, reference_result);
+ let test_data = _mm_set_epi64x(-1, -100);
+ let mask = 0x2;
+ let actual_result = _mm_maskz_popcnt_epi64(mask, test_data);
+ let reference_result = _mm_set_epi64x(64, 0);
+ assert_eq_m128i(actual_result, reference_result);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/bmi1.rs b/library/stdarch/crates/core_arch/src/x86/bmi1.rs
new file mode 100644
index 000000000..0f769f33b
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/bmi1.rs
@@ -0,0 +1,178 @@
+//! Bit Manipulation Instruction (BMI) Set 1.0.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Extracts bits in the range [`start`, `start` + `len`) from `a` into
+/// the least significant bits of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(bextr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
+ _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32))
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7:0]` of `control` specify the index of the first bit in the range
+/// to be extracted, and bits `[15:8]` specify the length of the range.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(bextr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
+ x86_bmi_bextr_32(a, control)
+}
+
+/// Bitwise logical `AND` of inverted `a` with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_andn_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(andn))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 {
+ !a & b
+}
+
+/// Extracts the lowest set bit of `x`, zeroing all other bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsi_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsi))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsi_u32(x: u32) -> u32 {
+ x & x.wrapping_neg()
+}
+
+/// Gets a mask of all bits up to and including the lowest set bit of `x`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsmsk_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
+ x ^ (x.wrapping_sub(1_u32))
+}
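+
+// Editorial note, not part of the upstream diff: `x ^ (x - 1)` yields a mask
+// consisting of the lowest set bit of `x` and every bit below it, with all
+// higher bits cleared, e.g. 0b0011_0000 -> 0b0001_1111.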
+
+/// Resets (clears) the lowest set bit of `x`.
+///
+/// If `x` is `0`, the carry flag (`CF`) is set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsr_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsr_u32(x: u32) -> u32 {
+ x & (x.wrapping_sub(1))
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is `0`, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
+ x.trailing_zeros()
+}
+
+/// Counts the number of trailing least significant zero bits.
+///
+/// When the source operand is `0`, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_tzcnt_32)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_tzcnt_32(x: u32) -> i32 {
+ x.trailing_zeros() as i32
+}
+
+extern "C" {
+ #[link_name = "llvm.x86.bmi.bextr.32"]
+ fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_bextr_u32() {
+ let r = _bextr_u32(0b0101_0000u32, 4, 4);
+ assert_eq!(r, 0b0000_0101u32);
+ }
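+
+    // Editorial addition, not in the upstream diff: `_bextr_u32` packs its
+    // arguments into the control word consumed by `_bextr2_u32` (start in
+    // bits [7:0], length in bits [15:8]), so the two calls below must agree.
+    #[simd_test(enable = "bmi1")]
+    unsafe fn test_bextr2_u32_control_word() {
+        let a = 0b0101_0000u32;
+        let control = 4u32 | (4u32 << 8); // start = 4, len = 4
+        assert_eq!(_bextr2_u32(a, control), _bextr_u32(a, 4, 4));
+    }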
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_andn_u32() {
+ assert_eq!(_andn_u32(0, 0), 0);
+ assert_eq!(_andn_u32(0, 1), 1);
+ assert_eq!(_andn_u32(1, 0), 0);
+ assert_eq!(_andn_u32(1, 1), 0);
+
+ let r = _andn_u32(0b0000_0000u32, 0b0000_0000u32);
+ assert_eq!(r, 0b0000_0000u32);
+
+ let r = _andn_u32(0b0000_0000u32, 0b1111_1111u32);
+ assert_eq!(r, 0b1111_1111u32);
+
+ let r = _andn_u32(0b1111_1111u32, 0b0000_0000u32);
+ assert_eq!(r, 0b0000_0000u32);
+
+ let r = _andn_u32(0b1111_1111u32, 0b1111_1111u32);
+ assert_eq!(r, 0b0000_0000u32);
+
+ let r = _andn_u32(0b0100_0000u32, 0b0101_1101u32);
+ assert_eq!(r, 0b0001_1101u32);
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_blsi_u32() {
+ assert_eq!(_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_blsmsk_u32() {
+ let r = _blsmsk_u32(0b0011_0000u32);
+ assert_eq!(r, 0b0001_1111u32);
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_blsr_u32() {
+ // TODO: test the behavior when the input is `0`.
+ let r = _blsr_u32(0b0011_0000u32);
+ assert_eq!(r, 0b0010_0000u32);
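+        // Editorial addition covering the TODO above: `0u32.wrapping_sub(1)`
+        // is `u32::MAX`, and `0 & u32::MAX == 0`, so zero maps to itself.
+        assert_eq!(_blsr_u32(0u32), 0u32);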
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_tzcnt_u32() {
+ assert_eq!(_tzcnt_u32(0b0000_0001u32), 0u32);
+ assert_eq!(_tzcnt_u32(0b0000_0000u32), 32u32);
+ assert_eq!(_tzcnt_u32(0b1001_0000u32), 4u32);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/bmi2.rs b/library/stdarch/crates/core_arch/src/x86/bmi2.rs
new file mode 100644
index 000000000..b08b8733c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/bmi2.rs
@@ -0,0 +1,133 @@
+//! Bit Manipulation Instruction (BMI) Set 2.0.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Unsigned multiply without affecting flags.
+///
+/// Unsigned multiplication of `a` with `b`, returning the low half of the
+/// result and storing the high half through `hi`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mulx_u32)
+#[inline]
+// LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))]
+#[cfg_attr(all(test, target_arch = "x86"), assert_instr(mul))]
+#[target_feature(enable = "bmi2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 {
+ let result: u64 = (a as u64) * (b as u64);
+ *hi = (result >> 32) as u32;
+ result as u32
+}
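+
+// Editorial note, not part of the upstream diff: because `mulx` leaves the
+// flags register untouched, it can be freely interleaved with carry-chain
+// instructions such as `adc`, which is the main reason this wide multiply
+// exists alongside the plain `*` operator.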
+
+/// Zeroes the bits of `a` at positions greater than or equal to `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bzhi_u32)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(bzhi))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
+ x86_bmi2_bzhi_32(a, index)
+}
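+
+// Editorial note, not part of the upstream diff: for `index < 32`,
+// `_bzhi_u32(a, index)` computes the same value as `a & ((1 << index) - 1)`.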
+
+/// Scatters the contiguous low-order bits of `a` into the result at the bit
+/// positions specified by `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pdep_u32)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(pdep))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 {
+ x86_bmi2_pdep_32(a, mask)
+}
+
+/// Gathers the bits of `a` specified by `mask` into the contiguous low-order
+/// bit positions of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u32)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(pext))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 {
+ x86_bmi2_pext_32(a, mask)
+}
+
+extern "C" {
+ #[link_name = "llvm.x86.bmi.bzhi.32"]
+ fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32;
+ #[link_name = "llvm.x86.bmi.pdep.32"]
+ fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32;
+ #[link_name = "llvm.x86.bmi.pext.32"]
+ fn x86_bmi2_pext_32(x: u32, y: u32) -> u32;
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "bmi2")]
+ unsafe fn test_pext_u32() {
+ let n = 0b1011_1110_1001_0011u32;
+
+ let m0 = 0b0110_0011_1000_0101u32;
+ let s0 = 0b0000_0000_0011_0101u32;
+
+ let m1 = 0b1110_1011_1110_1111u32;
+ let s1 = 0b0001_0111_0100_0011u32;
+
+ assert_eq!(_pext_u32(n, m0), s0);
+ assert_eq!(_pext_u32(n, m1), s1);
+ }
+
+ #[simd_test(enable = "bmi2")]
+ unsafe fn test_pdep_u32() {
+ let n = 0b1011_1110_1001_0011u32;
+
+ let m0 = 0b0110_0011_1000_0101u32;
+ let s0 = 0b0000_0010_0000_0101u32;
+
+ let m1 = 0b1110_1011_1110_1111u32;
+ let s1 = 0b1110_1001_0010_0011u32;
+
+ assert_eq!(_pdep_u32(n, m0), s0);
+ assert_eq!(_pdep_u32(n, m1), s1);
+ }
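+
+    // Editorial addition, not in the upstream diff: scattering with
+    // `_pdep_u32` and then gathering with `_pext_u32` under the same mask
+    // round-trips the low-order bits.
+    #[simd_test(enable = "bmi2")]
+    unsafe fn test_pdep_pext_roundtrip() {
+        let mask = 0b0110_0011_1000_0101u32; // 7 set bits
+        let low_bits = 0b011_0101u32; // fits in the 7 selected positions
+        assert_eq!(_pext_u32(_pdep_u32(low_bits, mask), mask), low_bits);
+    }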
+
+ #[simd_test(enable = "bmi2")]
+ unsafe fn test_bzhi_u32() {
+ let n = 0b1111_0010u32;
+ let s = 0b0001_0010u32;
+ assert_eq!(_bzhi_u32(n, 5), s);
+ }
+
+ #[simd_test(enable = "bmi2")]
+ unsafe fn test_mulx_u32() {
+ let a: u32 = 4_294_967_200;
+ let b: u32 = 2;
+ let mut hi = 0;
+ let lo = _mulx_u32(a, b, &mut hi);
+ /*
+ result = 8589934400
+ = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
+ ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+ assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32);
+ assert_eq!(hi, 0b0001u32);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/bswap.rs b/library/stdarch/crates/core_arch/src/x86/bswap.rs
new file mode 100644
index 000000000..fcaad26fb
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/bswap.rs
@@ -0,0 +1,28 @@
+//! Byte swap intrinsics.
+#![allow(clippy::module_name_repetitions)]
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Returns an integer with the reversed byte order of `x`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bswap)
+#[inline]
+#[cfg_attr(test, assert_instr(bswap))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bswap(x: i32) -> i32 {
+ x.swap_bytes()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_bswap() {
+ unsafe {
+ assert_eq!(_bswap(0x0EADBE0F), 0x0FBEAD0E);
+ assert_eq!(_bswap(0x00000000), 0x00000000);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/bt.rs b/library/stdarch/crates/core_arch/src/x86/bt.rs
new file mode 100644
index 000000000..338cb97a3
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/bt.rs
@@ -0,0 +1,135 @@
+use crate::arch::asm;
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+// x32 wants to use a 32-bit address size, but asm! defaults to using the full
+// register name (e.g. rax). We have to explicitly override the placeholder to
+// use the 32-bit register name in that case.
+#[cfg(target_pointer_width = "32")]
+macro_rules! bt {
+ ($inst:expr) => {
+ concat!($inst, " {b:e}, ({p:e})")
+ };
+}
+#[cfg(target_pointer_width = "64")]
+macro_rules! bt {
+ ($inst:expr) => {
+ concat!($inst, " {b:e}, ({p})")
+ };
+}
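+
+// Editorial note, not part of the upstream diff: with the 64-bit definition
+// above, `bt!("btl")` expands to the AT&T-syntax template
+// "btl {b:e}, ({p})", i.e. a 32-bit bit offset in `b` tested against the
+// memory operand addressed by `p`.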
+
+/// Returns the bit in position `b` of the memory addressed by `p`.
+#[inline]
+#[cfg_attr(test, assert_instr(bt))]
+#[stable(feature = "simd_x86_bittest", since = "1.55.0")]
+pub unsafe fn _bittest(p: *const i32, b: i32) -> u8 {
+ let r: u8;
+ asm!(
+ bt!("btl"),
+ "setc {r}",
+ p = in(reg) p,
+ b = in(reg) b,
+ r = out(reg_byte) r,
+ options(readonly, nostack, pure, att_syntax)
+ );
+ r
+}
+
+/// Returns the bit in position `b` of the memory addressed by `p`, then sets the bit to `1`.
+#[inline]
+#[cfg_attr(test, assert_instr(bts))]
+#[stable(feature = "simd_x86_bittest", since = "1.55.0")]
+pub unsafe fn _bittestandset(p: *mut i32, b: i32) -> u8 {
+ let r: u8;
+ asm!(
+ bt!("btsl"),
+ "setc {r}",
+ p = in(reg) p,
+ b = in(reg) b,
+ r = out(reg_byte) r,
+ options(nostack, att_syntax)
+ );
+ r
+}
+
+/// Returns the bit in position `b` of the memory addressed by `p`, then resets that bit to `0`.
+#[inline]
+#[cfg_attr(test, assert_instr(btr))]
+#[stable(feature = "simd_x86_bittest", since = "1.55.0")]
+pub unsafe fn _bittestandreset(p: *mut i32, b: i32) -> u8 {
+ let r: u8;
+ asm!(
+ bt!("btrl"),
+ "setc {r}",
+ p = in(reg) p,
+ b = in(reg) b,
+ r = out(reg_byte) r,
+ options(nostack, att_syntax)
+ );
+ r
+}
+
+/// Returns the bit in position `b` of the memory addressed by `p`, then inverts that bit.
+#[inline]
+#[cfg_attr(test, assert_instr(btc))]
+#[stable(feature = "simd_x86_bittest", since = "1.55.0")]
+pub unsafe fn _bittestandcomplement(p: *mut i32, b: i32) -> u8 {
+ let r: u8;
+ asm!(
+ bt!("btcl"),
+ "setc {r}",
+ p = in(reg) p,
+ b = in(reg) b,
+ r = out(reg_byte) r,
+ options(nostack, att_syntax)
+ );
+ r
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86::*;
+
+ #[test]
+ fn test_bittest() {
+ unsafe {
+ let a = 0b0101_0000i32;
+ assert_eq!(_bittest(&a as _, 4), 1);
+ assert_eq!(_bittest(&a as _, 5), 0);
+ }
+ }
+
+ #[test]
+ fn test_bittestandset() {
+ unsafe {
+ let mut a = 0b0101_0000i32;
+ assert_eq!(_bittestandset(&mut a as _, 4), 1);
+ assert_eq!(_bittestandset(&mut a as _, 4), 1);
+ assert_eq!(_bittestandset(&mut a as _, 5), 0);
+ assert_eq!(_bittestandset(&mut a as _, 5), 1);
+ }
+ }
+
+ #[test]
+ fn test_bittestandreset() {
+ unsafe {
+ let mut a = 0b0101_0000i32;
+ assert_eq!(_bittestandreset(&mut a as _, 4), 1);
+ assert_eq!(_bittestandreset(&mut a as _, 4), 0);
+ assert_eq!(_bittestandreset(&mut a as _, 5), 0);
+ assert_eq!(_bittestandreset(&mut a as _, 5), 0);
+ }
+ }
+
+ #[test]
+ fn test_bittestandcomplement() {
+ unsafe {
+ let mut a = 0b0101_0000i32;
+ assert_eq!(_bittestandcomplement(&mut a as _, 4), 1);
+ assert_eq!(_bittestandcomplement(&mut a as _, 4), 0);
+ assert_eq!(_bittestandcomplement(&mut a as _, 4), 1);
+ assert_eq!(_bittestandcomplement(&mut a as _, 5), 0);
+ assert_eq!(_bittestandcomplement(&mut a as _, 5), 1);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/cpuid.rs b/library/stdarch/crates/core_arch/src/x86/cpuid.rs
new file mode 100644
index 000000000..6b90295ef
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/cpuid.rs
@@ -0,0 +1,197 @@
+//! `cpuid` intrinsics
+#![allow(clippy::module_name_repetitions)]
+
+use crate::arch::asm;
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Result of the `cpuid` instruction.
+#[allow(clippy::missing_inline_in_public_items)]
+// ^^ the derived impl of Debug for CpuidResult is not #[inline] and that's OK.
+#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub struct CpuidResult {
+ /// EAX register.
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub eax: u32,
+ /// EBX register.
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub ebx: u32,
+ /// ECX register.
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub ecx: u32,
+ /// EDX register.
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub edx: u32,
+}
+
+/// Returns the result of the `cpuid` instruction for a given `leaf` (`EAX`)
+/// and `sub_leaf` (`ECX`).
+///
+/// The highest-supported leaf value is returned by the first tuple argument of
+/// [`__get_cpuid_max(0)`](fn.__get_cpuid_max.html). For leaves containing
+/// sub-leaves, the second tuple argument returns the highest-supported
+/// sub-leaf value.
+///
+/// The [CPUID Wikipedia page][wiki_cpuid] describes how to query particular
+/// pieces of information using the `EAX` and `ECX` registers, and how to
+/// interpret the results returned in `EAX`, `EBX`, `ECX`, and `EDX`.
+///
+/// The references are:
+/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+/// Instruction Set Reference, A-Z][intel64_ref].
+/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+/// System Instructions][amd64_ref].
+///
+/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
+/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+#[inline]
+#[cfg_attr(test, assert_instr(cpuid))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult {
+ let eax;
+ let ebx;
+ let ecx;
+ let edx;
+
+    // LLVM sometimes reserves `ebx` for its internal use, so we need to use
+    // a scratch register for it instead.
+ #[cfg(target_arch = "x86")]
+ {
+ asm!(
+ "movl %ebx, {0}",
+ "cpuid",
+ "xchgl %ebx, {0}",
+ lateout(reg) ebx,
+ inlateout("eax") leaf => eax,
+ inlateout("ecx") sub_leaf => ecx,
+ lateout("edx") edx,
+ options(nostack, preserves_flags, att_syntax),
+ );
+ }
+ #[cfg(target_arch = "x86_64")]
+ {
+ asm!(
+ "movq %rbx, {0:r}",
+ "cpuid",
+ "xchgq %rbx, {0:r}",
+ lateout(reg) ebx,
+ inlateout("eax") leaf => eax,
+ inlateout("ecx") sub_leaf => ecx,
+ lateout("edx") edx,
+ options(nostack, preserves_flags, att_syntax),
+ );
+ }
+ CpuidResult { eax, ebx, ecx, edx }
+}
+
+/// See [`__cpuid_count`](fn.__cpuid_count.html).
+#[inline]
+#[cfg_attr(test, assert_instr(cpuid))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn __cpuid(leaf: u32) -> CpuidResult {
+ __cpuid_count(leaf, 0)
+}
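+
+// Editorial sketch, not part of the upstream diff: the classic use of leaf 0
+// is reading the 12-byte vendor string, which the CPU returns spread across
+// `EBX`, `EDX` and `ECX` (in that order).
+#[cfg(test)]
+#[allow(dead_code)]
+fn vendor_id_sketch() -> [u8; 12] {
+    let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0) };
+    let mut bytes = [0u8; 12];
+    bytes[0..4].copy_from_slice(&ebx.to_le_bytes());
+    bytes[4..8].copy_from_slice(&edx.to_le_bytes());
+    bytes[8..12].copy_from_slice(&ecx.to_le_bytes());
+    bytes // e.g. b"GenuineIntel" or b"AuthenticAMD"
+}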
+
+/// Does the host support the `cpuid` instruction?
+#[inline]
+pub fn has_cpuid() -> bool {
+ #[cfg(target_env = "sgx")]
+ {
+ false
+ }
+ #[cfg(all(not(target_env = "sgx"), target_arch = "x86_64"))]
+ {
+ true
+ }
+ #[cfg(all(not(target_env = "sgx"), target_arch = "x86"))]
+ {
+        // Optimization for i586 and i686 Rust targets which have SSE enabled
+        // and therefore support cpuid:
+ #[cfg(target_feature = "sse")]
+ {
+ true
+ }
+
+ // If SSE is not enabled, detect whether cpuid is available:
+ #[cfg(not(target_feature = "sse"))]
+ unsafe {
+ // On `x86` the `cpuid` instruction is not always available.
+ // This follows the approach indicated in:
+ // http://wiki.osdev.org/CPUID#Checking_CPUID_availability
+ // https://software.intel.com/en-us/articles/using-cpuid-to-detect-the-presence-of-sse-41-and-sse-42-instruction-sets/
+ // which detects whether `cpuid` is available by checking whether
+ // the 21st bit of the EFLAGS register is modifiable or not.
+ // If it is, then `cpuid` is available.
+ let result: u32;
+ asm!(
+ // Read eflags and save a copy of it
+ "pushfd",
+ "pop {result}",
+ "mov {result}, {saved_flags}",
+ // Flip 21st bit of the flags
+ "xor $0x200000, {result}",
+ // Load the modified flags and read them back.
+ // Bit 21 can only be modified if cpuid is available.
+ "push {result}",
+ "popfd",
+ "pushfd",
+ "pop {result}",
+ // Use xor to find out whether bit 21 has changed
+ "xor {saved_flags}, {result}",
+ result = out(reg) result,
+ saved_flags = out(reg) _,
+ options(nomem, att_syntax),
+ );
+ // There is a race between popfd (A) and pushfd (B)
+ // where other bits beyond 21st may have been modified due to
+ // interrupts, a debugger stepping through the asm, etc.
+ //
+ // Therefore, explicitly check whether the 21st bit
+ // was modified or not.
+ //
+ // If the result is zero, the cpuid bit was not modified.
+ // If the result is `0x200000` (non-zero), then the cpuid
+ // was correctly modified and the CPU supports the cpuid
+ // instruction:
+ (result & 0x200000) != 0
+ }
+ }
+}
+
+/// Returns the highest-supported `leaf` (`EAX`) and sub-leaf (`ECX`) `cpuid`
+/// values.
+///
+/// If `cpuid` is supported, and `leaf` is zero, then the first tuple argument
+/// contains the highest `leaf` value that `cpuid` supports. For leaves
+/// containing sub-leaves, the second tuple argument contains the
+/// highest-supported sub-leaf value.
+///
+/// See also [`__cpuid`](fn.__cpuid.html) and
+/// [`__cpuid_count`](fn.__cpuid_count.html).
+#[inline]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) {
+ let CpuidResult { eax, ebx, .. } = __cpuid(leaf);
+ (eax, ebx)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86::*;
+
+ #[test]
+ fn test_always_has_cpuid() {
+ // all currently-tested targets have the instruction
+ // FIXME: add targets without `cpuid` to CI
+ assert!(cpuid::has_cpuid());
+ }
+
+ #[test]
+ fn test_has_cpuid_idempotent() {
+ assert_eq!(cpuid::has_cpuid(), cpuid::has_cpuid());
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/eflags.rs b/library/stdarch/crates/core_arch/src/x86/eflags.rs
new file mode 100644
index 000000000..e9ebdf22b
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/eflags.rs
@@ -0,0 +1,85 @@
+//! `i386` intrinsics
+
+use crate::arch::asm;
+
+/// Reads EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__readeflags)
+#[cfg(target_arch = "x86")]
+#[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.29.0",
+ note = "See issue #51810 - use inline assembly instead"
+)]
+#[doc(hidden)]
+pub unsafe fn __readeflags() -> u32 {
+ let eflags: u32;
+ asm!("pushfd", "pop {}", out(reg) eflags, options(nomem, att_syntax));
+ eflags
+}
+
+/// Reads EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__readeflags)
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.29.0",
+ note = "See issue #51810 - use inline assembly instead"
+)]
+#[doc(hidden)]
+pub unsafe fn __readeflags() -> u64 {
+ let eflags: u64;
+ asm!("pushfq", "pop {}", out(reg) eflags, options(nomem, att_syntax));
+ eflags
+}
+
+/// Writes EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__writeeflags)
+#[cfg(target_arch = "x86")]
+#[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.29.0",
+ note = "See issue #51810 - use inline assembly instead"
+)]
+#[doc(hidden)]
+pub unsafe fn __writeeflags(eflags: u32) {
+ asm!("push {}", "popfd", in(reg) eflags, options(nomem, att_syntax));
+}
+
+/// Writes EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__writeeflags)
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.29.0",
+ note = "See issue #51810 - use inline assembly instead"
+)]
+#[doc(hidden)]
+pub unsafe fn __writeeflags(eflags: u64) {
+ asm!("push {}", "popfq", in(reg) eflags, options(nomem, att_syntax));
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86::*;
+
+ #[test]
+ #[allow(deprecated)]
+ fn test_eflags() {
+ unsafe {
+ // reads eflags, writes them back, reads them again,
+            // and compares them for equality:
+ let v = __readeflags();
+ __writeeflags(v);
+ let u = __readeflags();
+ assert_eq!(v, u);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/f16c.rs b/library/stdarch/crates/core_arch/src/x86/f16c.rs
new file mode 100644
index 000000000..8b25fd65e
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/f16c.rs
@@ -0,0 +1,112 @@
+//! [F16C intrinsics].
+//!
+//! [F16C intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769
+
+use crate::{
+ core_arch::{simd::*, x86::*},
+ // hint::unreachable_unchecked,
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ #[link_name = "llvm.x86.vcvtph2ps.128"]
+ fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4;
+ #[link_name = "llvm.x86.vcvtph2ps.256"]
+ fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8;
+ #[link_name = "llvm.x86.vcvtps2ph.128"]
+ fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8;
+ #[link_name = "llvm.x86.vcvtps2ph.256"]
+ fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8;
+}
+
+/// Converts the 4 x 16-bit half-precision float values in the lowest 64 bits
+/// of the 128-bit vector `a` into 4 x 32-bit float values stored in a
+/// 128-bit wide vector.
+#[inline]
+#[target_feature(enable = "f16c")]
+#[cfg_attr(test, assert_instr("vcvtph2ps"))]
+pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 {
+ transmute(llvm_vcvtph2ps_128(transmute(a)))
+}
+
+/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector
+/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector.
+#[inline]
+#[target_feature(enable = "f16c")]
+#[cfg_attr(test, assert_instr("vcvtph2ps"))]
+pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
+ transmute(llvm_vcvtph2ps_256(transmute(a)))
+}
+
+/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
+/// 16-bit half-precision float values stored in the lowest 64 bits of a
+/// 128-bit vector.
+///
+/// Rounding is done according to the `IMM_ROUNDING` parameter, which can be one of:
+///
+/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions,
+/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions,
+/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions,
+/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions,
+/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
+#[inline]
+#[target_feature(enable = "f16c")]
+#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtps_ph<const IMM_ROUNDING: i32>(a: __m128) -> __m128i {
+ static_assert_imm3!(IMM_ROUNDING);
+ let a = a.as_f32x4();
+ let r = llvm_vcvtps2ph_128(a, IMM_ROUNDING);
+ transmute(r)
+}
+
+/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x
+/// 16-bit half-precision float values stored in a 128-bit wide vector.
+///
+/// Rounding is done according to the `IMM_ROUNDING` parameter, which can be one of:
+///
+/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions,
+/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions,
+/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions,
+/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions,
+/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
+#[inline]
+#[target_feature(enable = "f16c")]
+#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_cvtps_ph<const IMM_ROUNDING: i32>(a: __m256) -> __m128i {
+ static_assert_imm3!(IMM_ROUNDING);
+ let a = a.as_f32x8();
+ let r = llvm_vcvtps2ph_256(a, IMM_ROUNDING);
+ transmute(r)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::{core_arch::x86::*, mem::transmute};
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "f16c")]
+ unsafe fn test_mm_cvtph_ps() {
+ let array = [1_f32, 2_f32, 3_f32, 4_f32];
+ let float_vec: __m128 = transmute(array);
+ let halfs: __m128i = _mm_cvtps_ph::<0>(float_vec);
+ let floats: __m128 = _mm_cvtph_ps(halfs);
+ let result: [f32; 4] = transmute(floats);
+ assert_eq!(result, array);
+ }
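+
+    // Editorial addition, not in the upstream diff: the same round trip using
+    // a named rounding-mode constant instead of the raw immediate `0`. The
+    // values are exactly representable in f16, so the mode cannot change them.
+    #[simd_test(enable = "f16c")]
+    unsafe fn test_mm_cvtps_ph_cur_direction() {
+        let array = [0.5_f32, -1.5_f32, 2.25_f32, -4.0_f32];
+        let float_vec: __m128 = transmute(array);
+        let halfs: __m128i = _mm_cvtps_ph::<{ _MM_FROUND_CUR_DIRECTION }>(float_vec);
+        let result: [f32; 4] = transmute(_mm_cvtph_ps(halfs));
+        assert_eq!(result, array);
+    }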
+
+ #[simd_test(enable = "f16c")]
+ unsafe fn test_mm256_cvtph_ps() {
+ let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32];
+ let float_vec: __m256 = transmute(array);
+ let halfs: __m128i = _mm256_cvtps_ph::<0>(float_vec);
+ let floats: __m256 = _mm256_cvtph_ps(halfs);
+ let result: [f32; 8] = transmute(floats);
+ assert_eq!(result, array);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/fma.rs b/library/stdarch/crates/core_arch/src/x86/fma.rs
new file mode 100644
index 000000000..476f4538c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/fma.rs
@@ -0,0 +1,795 @@
+//! Fused Multiply-Add instruction set (FMA)
+//!
+//! The FMA instruction set is an extension to the 128 and 256-bit SSE
+//! instructions in the x86 microprocessor instruction set to perform fused
+//! multiply–add (FMA) operations.
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the
+//! instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
+
+use crate::core_arch::simd_llvm::simd_fma;
+use crate::core_arch::x86::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and adds the intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ simd_fma(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and adds the intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ simd_fma(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and adds the intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+ simd_fma(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and adds the intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+ simd_fma(a, b, c)
+}
+
+/// Multiplies the lower double-precision (64-bit) floating-point elements in
+/// `a` and `b`, and adds the intermediate result to the lower element in `c`.
+/// Stores the result in the lower element of the returned value, and copies the
+/// upper element from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfmaddsd(a, b, c)
+}
+
+/// Multiplies the lower single-precision (32-bit) floating-point elements in
+/// `a` and `b`, and adds the intermediate result to the lower element in `c`.
+/// Stores the result in the lower element of the returned value, and copies the
+/// 3 upper elements from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ss)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfmaddss(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfmaddsubpd(a, b, c)
+}
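+
+// Editorial note, not part of the upstream diff: in the *addsub* family the
+// even-indexed lanes subtract and the odd-indexed lanes add, e.g. for
+// `_mm_fmaddsub_pd`: r[0] = a[0]*b[0] - c[0], r[1] = a[1]*b[1] + c[1].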
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ vfmaddsubpd256(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfmaddsubps(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmaddsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+ vfmaddsubps256(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and subtracts packed elements in `c` from the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfmsubpd(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and subtracts packed elements in `c` from the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ vfmsubpd256(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and subtracts packed elements in `c` from the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub213ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfmsubps(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and subtracts packed elements in `c` from the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub213ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+ vfmsubps256(a, b, c)
+}
+
+/// Multiplies the lower double-precision (64-bit) floating-point elements in
+/// `a` and `b`, and subtracts the lower element in `c` from the intermediate
+/// result. Stores the result in the lower element of the returned value, and
+/// copies the upper element from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfmsubsd(a, b, c)
+}
+
+/// Multiplies the lower single-precision (32-bit) floating-point elements in
+/// `a` and `b`, and subtracts the lower element in `c` from the intermediate
+/// result. Stores the result in the lower element of the returned value, and
+/// copies the 3 upper elements from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ss)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfmsubss(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfmsubaddpd(a, b, c)
+}
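+
+// Editorial note, not part of the upstream diff: *subadd* mirrors *addsub*:
+// even-indexed lanes add and odd-indexed lanes subtract, e.g. for
+// `_mm_fmsubadd_pd`: r[0] = a[0]*b[0] + c[0], r[1] = a[1]*b[1] - c[1].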
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ vfmsubaddpd256(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfmsubaddps(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
+/// the intermediate result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfmsubadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+ vfmsubaddps256(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and adds the negated intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfnmaddpd(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and adds the negated intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ vfnmaddpd256(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and adds the negated intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfnmaddps(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and adds the negated intermediate result to packed elements in `c`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+ vfnmaddps256(a, b, c)
+}
+
+/// Multiplies the lower double-precision (64-bit) floating-point elements in
+/// `a` and `b`, and adds the negated intermediate result to the lower element
+/// in `c`. Stores the result in the lower element of the returned value, and
+/// copies the upper element from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfnmaddsd(a, b, c)
+}
+
+/// Multiplies the lower single-precision (32-bit) floating-point elements in
+/// `a` and `b`, and adds the negated intermediate result to the lower element
+/// in `c`. Stores the result in the lower element of the returned value, and
+/// copies the 3 upper elements from `a` to the upper elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ss)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmadd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfnmaddss(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and subtracts packed elements in `c` from the negated intermediate
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfnmsubpd(a, b, c)
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`, and subtracts packed elements in `c` from the negated intermediate
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_pd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+ vfnmsubpd256(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and subtracts packed elements in `c` from the negated intermediate
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfnmsubps(a, b, c)
+}
+
+/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and subtracts packed elements in `c` from the negated intermediate
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ps)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+ vfnmsubps256(a, b, c)
+}
+
+/// Multiplies the lower double-precision (64-bit) floating-point elements in
+/// `a` and `b`, and subtracts the lower element in `c` from the negated
+/// intermediate result. Stores the result in the lower element of the returned
+/// value, and copies the upper element from `a` to the upper elements of the
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sd)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ vfnmsubsd(a, b, c)
+}
+
+/// Multiplies the lower single-precision (32-bit) floating-point elements in
+/// `a` and `b`, and subtracts the lower element in `c` from the negated
+/// intermediate result. Stores the result in the lower element of the
+/// returned value, and copies the 3 upper elements from `a` to the upper
+/// elements of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ss)
+#[inline]
+#[target_feature(enable = "fma")]
+#[cfg_attr(test, assert_instr(vfnmsub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
+ vfnmsubss(a, b, c)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.fma.vfmadd.sd"]
+ fn vfmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfmadd.ss"]
+ fn vfmaddss(a: __m128, b: __m128, c: __m128) -> __m128;
+ #[link_name = "llvm.x86.fma.vfmaddsub.pd"]
+ fn vfmaddsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfmaddsub.pd.256"]
+ fn vfmaddsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.fma.vfmaddsub.ps"]
+ fn vfmaddsubps(a: __m128, b: __m128, c: __m128) -> __m128;
+ #[link_name = "llvm.x86.fma.vfmaddsub.ps.256"]
+ fn vfmaddsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
+ #[link_name = "llvm.x86.fma.vfmsub.pd"]
+ fn vfmsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfmsub.pd.256"]
+ fn vfmsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.fma.vfmsub.ps"]
+ fn vfmsubps(a: __m128, b: __m128, c: __m128) -> __m128;
+ #[link_name = "llvm.x86.fma.vfmsub.ps.256"]
+ fn vfmsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
+ #[link_name = "llvm.x86.fma.vfmsub.sd"]
+ fn vfmsubsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfmsub.ss"]
+ fn vfmsubss(a: __m128, b: __m128, c: __m128) -> __m128;
+ #[link_name = "llvm.x86.fma.vfmsubadd.pd"]
+ fn vfmsubaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfmsubadd.pd.256"]
+ fn vfmsubaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.fma.vfmsubadd.ps"]
+ fn vfmsubaddps(a: __m128, b: __m128, c: __m128) -> __m128;
+ #[link_name = "llvm.x86.fma.vfmsubadd.ps.256"]
+ fn vfmsubaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
+ #[link_name = "llvm.x86.fma.vfnmadd.pd"]
+ fn vfnmaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfnmadd.pd.256"]
+ fn vfnmaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.fma.vfnmadd.ps"]
+ fn vfnmaddps(a: __m128, b: __m128, c: __m128) -> __m128;
+ #[link_name = "llvm.x86.fma.vfnmadd.ps.256"]
+ fn vfnmaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
+ #[link_name = "llvm.x86.fma.vfnmadd.sd"]
+ fn vfnmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfnmadd.ss"]
+ fn vfnmaddss(a: __m128, b: __m128, c: __m128) -> __m128;
+ #[link_name = "llvm.x86.fma.vfnmsub.pd"]
+ fn vfnmsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfnmsub.pd.256"]
+ fn vfnmsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
+ #[link_name = "llvm.x86.fma.vfnmsub.ps"]
+ fn vfnmsubps(a: __m128, b: __m128, c: __m128) -> __m128;
+ #[link_name = "llvm.x86.fma.vfnmsub.ps.256"]
+ fn vfnmsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
+ #[link_name = "llvm.x86.fma.vfnmsub.sd"]
+ fn vfnmsubsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.fma.vfnmsub.ss"]
+ fn vfnmsubss(a: __m128, b: __m128, c: __m128) -> __m128;
+}
+
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmadd_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(9., 15.);
+ assert_eq_m128d(_mm_fmadd_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fmadd_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 3., 7., 2.);
+ let c = _mm256_setr_pd(4., 9., 1., 7.);
+ let r = _mm256_setr_pd(9., 15., 22., 15.);
+ assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmadd_ps() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(9., 15., 22., 15.);
+ assert_eq_m128(_mm_fmadd_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fmadd_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+ let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+ let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+ let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.);
+ assert_eq_m256(_mm256_fmadd_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmadd_sd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(9., 2.);
+ assert_eq_m128d(_mm_fmadd_sd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmadd_ss() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(9., 2., 3., 4.);
+ assert_eq_m128(_mm_fmadd_ss(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmaddsub_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(1., 15.);
+ assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fmaddsub_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 3., 7., 2.);
+ let c = _mm256_setr_pd(4., 9., 1., 7.);
+ let r = _mm256_setr_pd(1., 15., 20., 15.);
+ assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmaddsub_ps() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(1., 15., 20., 15.);
+ assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fmaddsub_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+ let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+ let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+ let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.);
+ assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmsub_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(1., -3.);
+ assert_eq_m128d(_mm_fmsub_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fmsub_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 3., 7., 2.);
+ let c = _mm256_setr_pd(4., 9., 1., 7.);
+ let r = _mm256_setr_pd(1., -3., 20., 1.);
+ assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmsub_ps() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(1., -3., 20., 1.);
+ assert_eq_m128(_mm_fmsub_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fmsub_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+ let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+ let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+ let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.);
+ assert_eq_m256(_mm256_fmsub_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmsub_sd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(1., 2.);
+ assert_eq_m128d(_mm_fmsub_sd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmsub_ss() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(1., 2., 3., 4.);
+ assert_eq_m128(_mm_fmsub_ss(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmsubadd_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(9., -3.);
+ assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fmsubadd_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 3., 7., 2.);
+ let c = _mm256_setr_pd(4., 9., 1., 7.);
+ let r = _mm256_setr_pd(9., -3., 22., 1.);
+ assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fmsubadd_ps() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(9., -3., 22., 1.);
+ assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fmsubadd_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+ let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+ let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+ let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.);
+ assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fnmadd_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(-1., 3.);
+ assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fnmadd_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 3., 7., 2.);
+ let c = _mm256_setr_pd(4., 9., 1., 7.);
+ let r = _mm256_setr_pd(-1., 3., -20., -1.);
+ assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fnmadd_ps() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(-1., 3., -20., -1.);
+ assert_eq_m128(_mm_fnmadd_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fnmadd_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+ let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+ let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+ let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.);
+ assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fnmadd_sd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(-1., 2.);
+ assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fnmadd_ss() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(-1., 2., 3., 4.);
+ assert_eq_m128(_mm_fnmadd_ss(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fnmsub_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(-9., -15.);
+ assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fnmsub_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let b = _mm256_setr_pd(5., 3., 7., 2.);
+ let c = _mm256_setr_pd(4., 9., 1., 7.);
+ let r = _mm256_setr_pd(-9., -15., -22., -15.);
+ assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fnmsub_ps() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(-9., -15., -22., -15.);
+ assert_eq_m128(_mm_fnmsub_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm256_fnmsub_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
+ let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
+ let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
+ let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.);
+ assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fnmsub_sd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(5., 3.);
+ let c = _mm_setr_pd(4., 9.);
+ let r = _mm_setr_pd(-9., 2.);
+ assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r);
+ }
+
+ #[simd_test(enable = "fma")]
+ unsafe fn test_mm_fnmsub_ss() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let b = _mm_setr_ps(5., 3., 7., 2.);
+ let c = _mm_setr_ps(4., 9., 1., 7.);
+ let r = _mm_setr_ps(-9., 2., 3., 4.);
+ assert_eq_m128(_mm_fnmsub_ss(a, b, c), r);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/fxsr.rs b/library/stdarch/crates/core_arch/src/x86/fxsr.rs
new file mode 100644
index 000000000..8ea1bfab7
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/fxsr.rs
@@ -0,0 +1,112 @@
+//! FXSR floating-point context fast save and restore.
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.fxsave"]
+ fn fxsave(p: *mut u8);
+ #[link_name = "llvm.x86.fxrstor"]
+ fn fxrstor(p: *const u8);
+}
+
+/// Saves the `x87` FPU, `MMX` technology, `XMM`, and `MXCSR` registers to the
+/// 512-byte-long 16-byte-aligned memory region `mem_addr`.
+///
+/// A misaligned destination operand raises a general-protection exception
+/// (#GP) or an alignment-check exception (#AC).
+///
+/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor].
+///
+/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
+/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave)
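+///
+/// # Examples
+///
+/// A minimal sketch; the `#[repr(align(16))]` wrapper is just one way to meet
+/// the alignment requirement, and the type name is illustrative:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # #[target_feature(enable = "fxsr")]
+/// # unsafe fn foo() {
+/// #[repr(align(16))]
+/// struct Area([u8; 512]); // 512-byte, 16-byte-aligned save area
+/// let mut area = Area([0; 512]);
+/// _fxsave(area.0.as_mut_ptr());
+/// # }
+/// # if is_x86_feature_detected!("fxsr") { unsafe { foo() } }
+/// # }
+/// ```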
+#[inline]
+#[target_feature(enable = "fxsr")]
+#[cfg_attr(test, assert_instr(fxsave))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _fxsave(mem_addr: *mut u8) {
+ fxsave(mem_addr)
+}
+
+/// Restores the `XMM`, `MMX`, `MXCSR`, and `x87` FPU registers from the
+/// 512-byte-long 16-byte-aligned memory region `mem_addr`.
+///
+/// The contents of this memory region should have been written by a previous
+/// `_fxsave` or `_fxsave64` intrinsic.
+///
+/// A misaligned memory operand raises a general-protection exception (#GP)
+/// or an alignment-check exception (#AC).
+///
+/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor].
+///
+/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
+/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxrstor)
+#[inline]
+#[target_feature(enable = "fxsr")]
+#[cfg_attr(test, assert_instr(fxrstor))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _fxrstor(mem_addr: *const u8) {
+ fxrstor(mem_addr)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86::*;
+ use std::{cmp::PartialEq, fmt};
+ use stdarch_test::simd_test;
+
+ #[repr(align(16))]
+ struct FxsaveArea {
+ data: [u8; 512], // 512 bytes
+ }
+
+ impl FxsaveArea {
+ fn new() -> FxsaveArea {
+ FxsaveArea { data: [0; 512] }
+ }
+ fn ptr(&mut self) -> *mut u8 {
+ &mut self.data[0] as *mut _ as *mut u8
+ }
+ }
+
+ impl PartialEq<FxsaveArea> for FxsaveArea {
+ fn eq(&self, other: &FxsaveArea) -> bool {
+ for i in 0..self.data.len() {
+ if self.data[i] != other.data[i] {
+ return false;
+ }
+ }
+ true
+ }
+ }
+
+ impl fmt::Debug for FxsaveArea {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "[")?;
+ for i in 0..self.data.len() {
+ write!(f, "{}", self.data[i])?;
+ if i != self.data.len() - 1 {
+ write!(f, ", ")?;
+ }
+ }
+ write!(f, "]")
+ }
+ }
+
+ #[simd_test(enable = "fxsr")]
+ unsafe fn fxsave() {
+ let mut a = FxsaveArea::new();
+ let mut b = FxsaveArea::new();
+
+ fxsr::_fxsave(a.ptr());
+ fxsr::_fxrstor(a.ptr());
+ fxsr::_fxsave(b.ptr());
+ assert_eq!(a, b);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/macros.rs b/library/stdarch/crates/core_arch/src/x86/macros.rs
new file mode 100644
index 000000000..e686e65b3
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/macros.rs
@@ -0,0 +1,104 @@
+//! Utility macros.
+//!
+// Helper struct used to trigger const eval errors when the const generic
+// immediate value `IMM` is not a valid rounding-mode value.
+pub(crate) struct ValidateConstRound<const IMM: i32>;
+impl<const IMM: i32> ValidateConstRound<IMM> {
+ pub(crate) const VALID: () = {
+ assert!(
+ IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11,
+ "Invalid IMM value"
+ );
+ };
+}
+
+#[allow(unused)]
+macro_rules! static_assert_rounding {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::x86::macros::ValidateConstRound::<$imm>::VALID;
+ };
+}
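+
+// For reference (an assumption based on the `_MM_FROUND_*` constants used by
+// callers, not enforced here): 4 is `_MM_FROUND_CUR_DIRECTION`, and 8 through
+// 11 are `_MM_FROUND_TO_NEAREST_INT`, `_MM_FROUND_TO_NEG_INF`,
+// `_MM_FROUND_TO_POS_INF` and `_MM_FROUND_TO_ZERO`, each OR'ed with
+// `_MM_FROUND_NO_EXC`.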
+
+// Helper struct used to trigger const eval errors when the const generic
+// immediate value `IMM` is not a valid SAE (suppress-all-exceptions) value.
+pub(crate) struct ValidateConstSae<const IMM: i32>;
+impl<const IMM: i32> ValidateConstSae<IMM> {
+ pub(crate) const VALID: () = {
+ assert!(IMM == 4 || IMM == 8, "Invalid IMM value");
+ };
+}
+
+#[allow(unused)]
+macro_rules! static_assert_sae {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::x86::macros::ValidateConstSae::<$imm>::VALID;
+ };
+}
+
+// Helper struct used to trigger const eval errors when the const generic
+// immediate value `IMM` is not a valid SAE value for mantissa-extraction
+// intrinsics.
+pub(crate) struct ValidateConstMantissasSae<const IMM: i32>;
+impl<const IMM: i32> ValidateConstMantissasSae<IMM> {
+ pub(crate) const VALID: () = {
+ assert!(IMM == 4 || IMM == 8 || IMM == 12, "Invalid IMM value");
+ };
+}
+
+#[allow(unused)]
+macro_rules! static_assert_mantissas_sae {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::x86::macros::ValidateConstMantissasSae::<$imm>::VALID;
+ };
+}
+
+// Helper struct used to trigger const eval errors when the unsigned const
+// generic immediate value `IMM` is outside the `[MIN, MAX]` range.
+pub(crate) struct ValidateConstImmU32<const IMM: u32, const MIN: u32, const MAX: u32>;
+impl<const IMM: u32, const MIN: u32, const MAX: u32> ValidateConstImmU32<IMM, MIN, MAX> {
+ pub(crate) const VALID: () = {
+ assert!(IMM >= MIN && IMM <= MAX, "IMM value not in expected range");
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! static_assert_imm_u8 {
+ ($imm:ident) => {
+ let _ =
+ $crate::core_arch::x86::macros::ValidateConstImmU32::<$imm, 0, { (1 << 8) - 1 }>::VALID;
+ };
+}
+
+// Helper struct used to trigger const eval errors when the const generic immediate value `SCALE` is
+// not valid for gather instructions: the only valid scale values are 1, 2, 4 and 8.
+pub(crate) struct ValidateConstGatherScale<const SCALE: i32>;
+impl<const SCALE: i32> ValidateConstGatherScale<SCALE> {
+ pub(crate) const VALID: () = {
+ assert!(
+ SCALE == 1 || SCALE == 2 || SCALE == 4 || SCALE == 8,
+ "Invalid SCALE value"
+ );
+ };
+}
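+
+// For example (an illustration, not enforced here): a gather with `SCALE = 4`
+// reads each element from `base_addr + index * 4`, the natural stride for
+// 32-bit elements.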
+
+#[allow(unused)]
+macro_rules! static_assert_imm8_scale {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::x86::macros::ValidateConstGatherScale::<$imm>::VALID;
+ };
+}
+
+#[cfg(test)]
+macro_rules! assert_approx_eq {
+ ($a:expr, $b:expr, $eps:expr) => {{
+ let (a, b) = (&$a, &$b);
+ assert!(
+ (*a - *b).abs() < $eps,
+ "assertion failed: `(left !== right)` \
+ (left: `{:?}`, right: `{:?}`, expect diff: `{:?}`, real diff: `{:?}`)",
+ *a,
+ *b,
+ $eps,
+ (*a - *b).abs()
+ );
+ }};
+}
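+
+// Usage sketch (illustrative values): `assert_approx_eq!(0.1f32 + 0.2, 0.3, 1e-6)`
+// passes, while an `$eps` of `0.0` can never pass because the comparison is a
+// strict `<`.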
diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs
new file mode 100644
index 000000000..547bfe67d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/mod.rs
@@ -0,0 +1,860 @@
+//! `x86` and `x86_64` intrinsics.
+
+use crate::{intrinsics, marker::Sized, mem::transmute};
+
+#[macro_use]
+mod macros;
+
+types! {
+ /// 128-bit wide integer vector type, x86-specific
+ ///
+ /// This type is the same as the `__m128i` type defined by Intel,
+ /// representing a 128-bit SIMD register. Usage of this type typically
+ /// corresponds to the `sse` and up target features for x86/x86_64.
+ ///
+ /// Internally this type may be viewed as:
+ ///
+ /// * `i8x16` - sixteen `i8` variables packed together
+ /// * `i16x8` - eight `i16` variables packed together
+ /// * `i32x4` - four `i32` variables packed together
+ /// * `i64x2` - two `i64` variables packed together
+ ///
+ /// (as well as unsigned versions). Each intrinsic may interpret the
+ /// internal bits differently, check the documentation of the intrinsic
+ /// to see how it's being used.
+ ///
+ /// Note that this means that an instance of `__m128i` typically just means
+ /// a "bag of bits" which is left up to interpretation at the point of use.
+ ///
+ /// Most intrinsics using `__m128i` are prefixed with `_mm_` and the
+ /// integer types tend to correspond to suffixes like "epi8" or "epi32".
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// #[cfg(target_arch = "x86")]
+ /// use std::arch::x86::*;
+ /// #[cfg(target_arch = "x86_64")]
+ /// use std::arch::x86_64::*;
+ ///
+ /// # fn main() {
+ /// # #[target_feature(enable = "sse2")]
+ /// # unsafe fn foo() {
+ /// let all_bytes_zero = _mm_setzero_si128();
+ /// let all_bytes_one = _mm_set1_epi8(1);
+ /// let four_i32 = _mm_set_epi32(1, 2, 3, 4);
+ /// # }
+ /// # if is_x86_feature_detected!("sse2") { unsafe { foo() } }
+ /// # }
+ /// ```
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub struct __m128i(i64, i64);
+
+ /// 128-bit wide set of four `f32` types, x86-specific
+ ///
+ /// This type is the same as the `__m128` type defined by Intel,
+ /// representing a 128-bit SIMD register which internally consists of
+ /// four packed `f32` instances. Usage of this type typically corresponds
+ /// to the `sse` and up target features for x86/x86_64.
+ ///
+ /// Note that unlike `__m128i`, the integer version of the 128-bit
+ /// registers, this `__m128` type has *one* interpretation. Each instance
+ /// of `__m128` always corresponds to `f32x4`, or four `f32` types packed
+ /// together.
+ ///
+ /// Most intrinsics using `__m128` are prefixed with `_mm_` and are
+ /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with
+ /// "pd" which is used for `__m128d`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// #[cfg(target_arch = "x86")]
+ /// use std::arch::x86::*;
+ /// #[cfg(target_arch = "x86_64")]
+ /// use std::arch::x86_64::*;
+ ///
+ /// # fn main() {
+ /// # #[target_feature(enable = "sse")]
+ /// # unsafe fn foo() {
+ /// let four_zeros = _mm_setzero_ps();
+ /// let four_ones = _mm_set1_ps(1.0);
+ /// let four_floats = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
+ /// # }
+ /// # if is_x86_feature_detected!("sse") { unsafe { foo() } }
+ /// # }
+ /// ```
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub struct __m128(f32, f32, f32, f32);
+
+ /// 128-bit wide set of two `f64` types, x86-specific
+ ///
+ /// This type is the same as the `__m128d` type defined by Intel,
+ /// representing a 128-bit SIMD register which internally consists of
+ /// two packed `f64` instances. Usage of this type typically corresponds
+ /// to the `sse` and up target features for x86/x86_64.
+ ///
+ /// Note that unlike `__m128i`, the integer version of the 128-bit
+ /// registers, this `__m128d` type has *one* interpretation. Each instance
+ /// of `__m128d` always corresponds to `f64x2`, or two `f64` types packed
+ /// together.
+ ///
+ /// Most intrinsics using `__m128d` are prefixed with `_mm_` and are
+ /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with
+ /// "ps" which is used for `__m128`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// #[cfg(target_arch = "x86")]
+ /// use std::arch::x86::*;
+ /// #[cfg(target_arch = "x86_64")]
+ /// use std::arch::x86_64::*;
+ ///
+ /// # fn main() {
+ /// # #[target_feature(enable = "sse")]
+ /// # unsafe fn foo() {
+ /// let two_zeros = _mm_setzero_pd();
+ /// let two_ones = _mm_set1_pd(1.0);
+ /// let two_floats = _mm_set_pd(1.0, 2.0);
+ /// # }
+ /// # if is_x86_feature_detected!("sse") { unsafe { foo() } }
+ /// # }
+ /// ```
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub struct __m128d(f64, f64);
+
+ /// 256-bit wide integer vector type, x86-specific
+ ///
+ /// This type is the same as the `__m256i` type defined by Intel,
+ /// representing a 256-bit SIMD register. Usage of this type typically
+ /// corresponds to the `avx` and up target features for x86/x86_64.
+ ///
+ /// Internally this type may be viewed as:
+ ///
+ /// * `i8x32` - thirty-two `i8` variables packed together
+ /// * `i16x16` - sixteen `i16` variables packed together
+ /// * `i32x8` - eight `i32` variables packed together
+ /// * `i64x4` - four `i64` variables packed together
+ ///
+ /// (as well as unsigned versions). Each intrinsic may interpret the
+ /// internal bits differently, check the documentation of the intrinsic
+ /// to see how it's being used.
+ ///
+ /// Note that this means that an instance of `__m256i` typically just means
+ /// a "bag of bits" which is left up to interpretation at the point of use.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// #[cfg(target_arch = "x86")]
+ /// use std::arch::x86::*;
+ /// #[cfg(target_arch = "x86_64")]
+ /// use std::arch::x86_64::*;
+ ///
+ /// # fn main() {
+ /// # #[target_feature(enable = "avx")]
+ /// # unsafe fn foo() {
+ /// let all_bytes_zero = _mm256_setzero_si256();
+ /// let all_bytes_one = _mm256_set1_epi8(1);
+ /// let eight_i32 = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ /// # }
+ /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
+ /// # }
+ /// ```
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub struct __m256i(i64, i64, i64, i64);
+
+ /// 256-bit wide set of eight `f32` types, x86-specific
+ ///
+ /// This type is the same as the `__m256` type defined by Intel,
+ /// representing a 256-bit SIMD register which internally consists of
+ /// eight packed `f32` instances. Usage of this type typically corresponds
+ /// to the `avx` and up target features for x86/x86_64.
+ ///
+ /// Note that unlike `__m256i`, the integer version of the 256-bit
+ /// registers, this `__m256` type has *one* interpretation. Each instance
+ /// of `__m256` always corresponds to `f32x8`, or eight `f32` types packed
+ /// together.
+ ///
+ /// Most intrinsics using `__m256` are prefixed with `_mm256_` and are
+ /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with
+ /// "pd" which is used for `__m256d`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// #[cfg(target_arch = "x86")]
+ /// use std::arch::x86::*;
+ /// #[cfg(target_arch = "x86_64")]
+ /// use std::arch::x86_64::*;
+ ///
+ /// # fn main() {
+ /// # #[target_feature(enable = "avx")]
+ /// # unsafe fn foo() {
+ /// let eight_zeros = _mm256_setzero_ps();
+ /// let eight_ones = _mm256_set1_ps(1.0);
+ /// let eight_floats = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+ /// # }
+ /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
+ /// # }
+ /// ```
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub struct __m256(f32, f32, f32, f32, f32, f32, f32, f32);
+
+ /// 256-bit wide set of four `f64` types, x86-specific
+ ///
+ /// This type is the same as the `__m256d` type defined by Intel,
+ /// representing a 256-bit SIMD register which internally consists of
+ /// four packed `f64` instances. Usage of this type typically corresponds
+ /// to the `avx` and up target features for x86/x86_64.
+ ///
+ /// Note that unlike `__m256i`, the integer version of the 256-bit
+ /// registers, this `__m256d` type has *one* interpretation. Each instance
+ /// of `__m256d` always corresponds to `f64x4`, or four `f64` types packed
+ /// together.
+ ///
+ /// Most intrinsics using `__m256d` are prefixed with `_mm256_` and are
+ /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with
+ /// "ps" which is used for `__m256`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// #[cfg(target_arch = "x86")]
+ /// use std::arch::x86::*;
+ /// #[cfg(target_arch = "x86_64")]
+ /// use std::arch::x86_64::*;
+ ///
+ /// # fn main() {
+ /// # #[target_feature(enable = "avx")]
+ /// # unsafe fn foo() {
+ /// let four_zeros = _mm256_setzero_pd();
+ /// let four_ones = _mm256_set1_pd(1.0);
+ /// let four_floats = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+ /// # }
+ /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
+ /// # }
+ /// ```
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ pub struct __m256d(f64, f64, f64, f64);
+
+ /// 512-bit wide integer vector type, x86-specific
+ ///
+ /// This type is the same as the `__m512i` type defined by Intel,
+ /// representing a 512-bit SIMD register. Usage of this type typically
+ /// corresponds to the `avx512*` and up target features for x86/x86_64.
+ ///
+ /// Internally this type may be viewed as:
+ ///
+ /// * `i8x64` - sixty-four `i8` variables packed together
+ /// * `i16x32` - thirty-two `i16` variables packed together
+ /// * `i32x16` - sixteen `i32` variables packed together
+ /// * `i64x8` - eight `i64` variables packed together
+ ///
+ /// (as well as unsigned versions). Each intrinsic may interpret the
+ /// internal bits differently, check the documentation of the intrinsic
+ /// to see how it's being used.
+ ///
+ /// Note that this means that an instance of `__m512i` typically just means
+ /// a "bag of bits" which is left up to interpretation at the point of use.
+ pub struct __m512i(i64, i64, i64, i64, i64, i64, i64, i64);
+
+ /// 512-bit wide set of sixteen `f32` types, x86-specific
+ ///
+ /// This type is the same as the `__m512` type defined by Intel,
+ /// representing a 512-bit SIMD register which internally consists of
+ /// sixteen packed `f32` instances. Usage of this type typically corresponds
+ /// to the `avx512*` and up target features for x86/x86_64.
+ ///
+ /// Note that unlike `__m512i`, the integer version of the 512-bit
+ /// registers, this `__m512` type has *one* interpretation. Each instance
+ /// of `__m512` always corresponds to `f32x16`, or sixteen `f32` types
+ /// packed together.
+ ///
+ /// Most intrinsics using `__m512` are prefixed with `_mm512_` and are
+ /// suffixed with "ps" (or otherwise contain "ps"). Not to be confused with
+ /// "pd" which is used for `__m512d`.
+ pub struct __m512(
+ f32, f32, f32, f32, f32, f32, f32, f32,
+ f32, f32, f32, f32, f32, f32, f32, f32,
+ );
+
+ /// 512-bit wide set of eight `f64` types, x86-specific
+ ///
+ /// This type is the same as the `__m512d` type defined by Intel,
+ /// representing a 512-bit SIMD register which internally consists of
+ /// eight packed `f64` instances. Usage of this type typically corresponds
+ /// to the `avx512*` and up target features for x86/x86_64.
+ ///
+ /// Note that unlike `__m512i`, the integer version of the 512-bit
+ /// registers, this `__m512d` type has *one* interpretation. Each instance
+ /// of `__m512d` always corresponds to `f64x8`, or eight `f64` types packed
+ /// together.
+ ///
+ /// Most intrinsics using `__m512d` are prefixed with `_mm512_` and are
+ /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with
+ /// "ps" which is used for `__m512`.
+ pub struct __m512d(f64, f64, f64, f64, f64, f64, f64, f64);
+
+ /// 128-bit wide set of eight `u16` types, x86-specific
+ ///
+ /// This type represents a 128-bit SIMD register which internally consists
+ /// of eight packed `u16` instances. It exists for `bf16`-related intrinsic
+ /// implementations.
+ pub struct __m128bh(u16, u16, u16, u16, u16, u16, u16, u16);
+
+ /// 256-bit wide set of 16 `u16` types, x86-specific
+ ///
+ /// This type is the same as the `__m256bh` type defined by Intel,
+ /// representing a 256-bit SIMD register which internally consists of
+ /// 16 packed `u16` instances. It exists for `bf16`-related intrinsic
+ /// implementations.
+ pub struct __m256bh(
+ u16, u16, u16, u16, u16, u16, u16, u16,
+ u16, u16, u16, u16, u16, u16, u16, u16
+ );
+
+ /// 512-bit wide set of 32 `u16` types, x86-specific
+ ///
+ /// This type is the same as the `__m512bh` type defined by Intel,
+ /// representing a 512-bit SIMD register which internally consists of
+ /// 32 packed `u16` instances. It exists for `bf16`-related intrinsic
+ /// implementations.
+ pub struct __m512bh(
+ u16, u16, u16, u16, u16, u16, u16, u16,
+ u16, u16, u16, u16, u16, u16, u16, u16,
+ u16, u16, u16, u16, u16, u16, u16, u16,
+ u16, u16, u16, u16, u16, u16, u16, u16
+ );
+}
+
+/// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer
+#[allow(non_camel_case_types)]
+pub type __mmask64 = u64;
+
+/// The `__mmask32` type used in AVX-512 intrinsics, a 32-bit integer
+#[allow(non_camel_case_types)]
+pub type __mmask32 = u32;
+
+/// The `__mmask16` type used in AVX-512 intrinsics, a 16-bit integer in which
+/// each bit masks one lane of a 16-lane vector (for example, `0b101` selects
+/// lanes 0 and 2)
+#[allow(non_camel_case_types)]
+pub type __mmask16 = u16;
+
+/// The `__mmask8` type used in AVX-512 intrinsics, an 8-bit integer
+#[allow(non_camel_case_types)]
+pub type __mmask8 = u8;
+
+/// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics.
+#[allow(non_camel_case_types)]
+pub type _MM_CMPINT_ENUM = i32;
+
+/// The `_MM_MANTISSA_NORM_ENUM` type used to specify the mantissa normalization interval in AVX-512 intrinsics.
+#[allow(non_camel_case_types)]
+pub type _MM_MANTISSA_NORM_ENUM = i32;
+
+/// The `_MM_MANTISSA_SIGN_ENUM` type used to specify the sign treatment of the mantissa in AVX-512 intrinsics.
+#[allow(non_camel_case_types)]
+pub type _MM_MANTISSA_SIGN_ENUM = i32;
+
+/// The `_MM_PERM_ENUM` type used to specify shuffle operations in AVX-512 intrinsics.
+#[allow(non_camel_case_types)]
+pub type _MM_PERM_ENUM = i32;
+
+#[cfg(test)]
+mod test;
+#[cfg(test)]
+pub use self::test::*;
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m128iExt: Sized {
+ fn as_m128i(self) -> __m128i;
+
+ #[inline]
+ fn as_u8x16(self) -> crate::core_arch::simd::u8x16 {
+ unsafe { transmute(self.as_m128i()) }
+ }
+
+ #[inline]
+ fn as_u16x8(self) -> crate::core_arch::simd::u16x8 {
+ unsafe { transmute(self.as_m128i()) }
+ }
+
+ #[inline]
+ fn as_u32x4(self) -> crate::core_arch::simd::u32x4 {
+ unsafe { transmute(self.as_m128i()) }
+ }
+
+ #[inline]
+ fn as_u64x2(self) -> crate::core_arch::simd::u64x2 {
+ unsafe { transmute(self.as_m128i()) }
+ }
+
+ #[inline]
+ fn as_i8x16(self) -> crate::core_arch::simd::i8x16 {
+ unsafe { transmute(self.as_m128i()) }
+ }
+
+ #[inline]
+ fn as_i16x8(self) -> crate::core_arch::simd::i16x8 {
+ unsafe { transmute(self.as_m128i()) }
+ }
+
+ #[inline]
+ fn as_i32x4(self) -> crate::core_arch::simd::i32x4 {
+ unsafe { transmute(self.as_m128i()) }
+ }
+
+ #[inline]
+ fn as_i64x2(self) -> crate::core_arch::simd::i64x2 {
+ unsafe { transmute(self.as_m128i()) }
+ }
+}
+
+impl m128iExt for __m128i {
+ #[inline]
+ fn as_m128i(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m256iExt: Sized {
+ fn as_m256i(self) -> __m256i;
+
+ #[inline]
+ fn as_u8x32(self) -> crate::core_arch::simd::u8x32 {
+ unsafe { transmute(self.as_m256i()) }
+ }
+
+ #[inline]
+ fn as_u16x16(self) -> crate::core_arch::simd::u16x16 {
+ unsafe { transmute(self.as_m256i()) }
+ }
+
+ #[inline]
+ fn as_u32x8(self) -> crate::core_arch::simd::u32x8 {
+ unsafe { transmute(self.as_m256i()) }
+ }
+
+ #[inline]
+ fn as_u64x4(self) -> crate::core_arch::simd::u64x4 {
+ unsafe { transmute(self.as_m256i()) }
+ }
+
+ #[inline]
+ fn as_i8x32(self) -> crate::core_arch::simd::i8x32 {
+ unsafe { transmute(self.as_m256i()) }
+ }
+
+ #[inline]
+ fn as_i16x16(self) -> crate::core_arch::simd::i16x16 {
+ unsafe { transmute(self.as_m256i()) }
+ }
+
+ #[inline]
+ fn as_i32x8(self) -> crate::core_arch::simd::i32x8 {
+ unsafe { transmute(self.as_m256i()) }
+ }
+
+ #[inline]
+ fn as_i64x4(self) -> crate::core_arch::simd::i64x4 {
+ unsafe { transmute(self.as_m256i()) }
+ }
+}
+
+impl m256iExt for __m256i {
+ #[inline]
+ fn as_m256i(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m128Ext: Sized {
+ fn as_m128(self) -> __m128;
+
+ #[inline]
+ fn as_f32x4(self) -> crate::core_arch::simd::f32x4 {
+ unsafe { transmute(self.as_m128()) }
+ }
+}
+
+impl m128Ext for __m128 {
+ #[inline]
+ fn as_m128(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m128dExt: Sized {
+ fn as_m128d(self) -> __m128d;
+
+ #[inline]
+ fn as_f64x2(self) -> crate::core_arch::simd::f64x2 {
+ unsafe { transmute(self.as_m128d()) }
+ }
+}
+
+impl m128dExt for __m128d {
+ #[inline]
+ fn as_m128d(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m256Ext: Sized {
+ fn as_m256(self) -> __m256;
+
+ #[inline]
+ fn as_f32x8(self) -> crate::core_arch::simd::f32x8 {
+ unsafe { transmute(self.as_m256()) }
+ }
+}
+
+impl m256Ext for __m256 {
+ #[inline]
+ fn as_m256(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m256dExt: Sized {
+ fn as_m256d(self) -> __m256d;
+
+ #[inline]
+ fn as_f64x4(self) -> crate::core_arch::simd::f64x4 {
+ unsafe { transmute(self.as_m256d()) }
+ }
+}
+
+impl m256dExt for __m256d {
+ #[inline]
+ fn as_m256d(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m512iExt: Sized {
+ fn as_m512i(self) -> __m512i;
+
+ #[inline]
+ fn as_u8x64(self) -> crate::core_arch::simd::u8x64 {
+ unsafe { transmute(self.as_m512i()) }
+ }
+
+ #[inline]
+ fn as_i8x64(self) -> crate::core_arch::simd::i8x64 {
+ unsafe { transmute(self.as_m512i()) }
+ }
+
+ #[inline]
+ fn as_u16x32(self) -> crate::core_arch::simd::u16x32 {
+ unsafe { transmute(self.as_m512i()) }
+ }
+
+ #[inline]
+ fn as_i16x32(self) -> crate::core_arch::simd::i16x32 {
+ unsafe { transmute(self.as_m512i()) }
+ }
+
+ #[inline]
+ fn as_u32x16(self) -> crate::core_arch::simd::u32x16 {
+ unsafe { transmute(self.as_m512i()) }
+ }
+
+ #[inline]
+ fn as_i32x16(self) -> crate::core_arch::simd::i32x16 {
+ unsafe { transmute(self.as_m512i()) }
+ }
+
+ #[inline]
+ fn as_u64x8(self) -> crate::core_arch::simd::u64x8 {
+ unsafe { transmute(self.as_m512i()) }
+ }
+
+ #[inline]
+ fn as_i64x8(self) -> crate::core_arch::simd::i64x8 {
+ unsafe { transmute(self.as_m512i()) }
+ }
+}
+
+impl m512iExt for __m512i {
+ #[inline]
+ fn as_m512i(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m512Ext: Sized {
+ fn as_m512(self) -> __m512;
+
+ #[inline]
+ fn as_f32x16(self) -> crate::core_arch::simd::f32x16 {
+ unsafe { transmute(self.as_m512()) }
+ }
+}
+
+impl m512Ext for __m512 {
+ #[inline]
+ fn as_m512(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m512dExt: Sized {
+ fn as_m512d(self) -> __m512d;
+
+ #[inline]
+ fn as_f64x8(self) -> crate::core_arch::simd::f64x8 {
+ unsafe { transmute(self.as_m512d()) }
+ }
+}
+
+impl m512dExt for __m512d {
+ #[inline]
+ fn as_m512d(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m128bhExt: Sized {
+ fn as_m128bh(self) -> __m128bh;
+
+ #[inline]
+ fn as_u16x8(self) -> crate::core_arch::simd::u16x8 {
+ unsafe { transmute(self.as_m128bh()) }
+ }
+
+ #[inline]
+ fn as_i16x8(self) -> crate::core_arch::simd::i16x8 {
+ unsafe { transmute(self.as_m128bh()) }
+ }
+
+ #[inline]
+ fn as_u32x4(self) -> crate::core_arch::simd::u32x4 {
+ unsafe { transmute(self.as_m128bh()) }
+ }
+
+ #[inline]
+ fn as_i32x4(self) -> crate::core_arch::simd::i32x4 {
+ unsafe { transmute(self.as_m128bh()) }
+ }
+}
+
+impl m128bhExt for __m128bh {
+ #[inline]
+ fn as_m128bh(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m256bhExt: Sized {
+ fn as_m256bh(self) -> __m256bh;
+
+ #[inline]
+ fn as_u16x16(self) -> crate::core_arch::simd::u16x16 {
+ unsafe { transmute(self.as_m256bh()) }
+ }
+
+ #[inline]
+ fn as_i16x16(self) -> crate::core_arch::simd::i16x16 {
+ unsafe { transmute(self.as_m256bh()) }
+ }
+
+ #[inline]
+ fn as_u32x8(self) -> crate::core_arch::simd::u32x8 {
+ unsafe { transmute(self.as_m256bh()) }
+ }
+
+ #[inline]
+ fn as_i32x8(self) -> crate::core_arch::simd::i32x8 {
+ unsafe { transmute(self.as_m256bh()) }
+ }
+}
+
+impl m256bhExt for __m256bh {
+ #[inline]
+ fn as_m256bh(self) -> Self {
+ self
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m512bhExt: Sized {
+ fn as_m512bh(self) -> __m512bh;
+
+ #[inline]
+ fn as_u16x32(self) -> crate::core_arch::simd::u16x32 {
+ unsafe { transmute(self.as_m512bh()) }
+ }
+
+ #[inline]
+ fn as_i16x32(self) -> crate::core_arch::simd::i16x32 {
+ unsafe { transmute(self.as_m512bh()) }
+ }
+
+ #[inline]
+ fn as_u32x16(self) -> crate::core_arch::simd::u32x16 {
+ unsafe { transmute(self.as_m512bh()) }
+ }
+
+ #[inline]
+ fn as_i32x16(self) -> crate::core_arch::simd::i32x16 {
+ unsafe { transmute(self.as_m512bh()) }
+ }
+}
+
+impl m512bhExt for __m512bh {
+ #[inline]
+ fn as_m512bh(self) -> Self {
+ self
+ }
+}
+
+mod eflags;
+pub use self::eflags::*;
+
+mod fxsr;
+pub use self::fxsr::*;
+
+mod bswap;
+pub use self::bswap::*;
+
+mod rdtsc;
+pub use self::rdtsc::*;
+
+mod cpuid;
+pub use self::cpuid::*;
+mod xsave;
+pub use self::xsave::*;
+
+mod sse;
+pub use self::sse::*;
+mod sse2;
+pub use self::sse2::*;
+mod sse3;
+pub use self::sse3::*;
+mod ssse3;
+pub use self::ssse3::*;
+mod sse41;
+pub use self::sse41::*;
+mod sse42;
+pub use self::sse42::*;
+mod avx;
+pub use self::avx::*;
+mod avx2;
+pub use self::avx2::*;
+mod fma;
+pub use self::fma::*;
+
+mod abm;
+pub use self::abm::*;
+mod bmi1;
+pub use self::bmi1::*;
+
+mod bmi2;
+pub use self::bmi2::*;
+
+#[cfg(not(stdarch_intel_sde))]
+mod sse4a;
+#[cfg(not(stdarch_intel_sde))]
+pub use self::sse4a::*;
+
+#[cfg(not(stdarch_intel_sde))]
+mod tbm;
+#[cfg(not(stdarch_intel_sde))]
+pub use self::tbm::*;
+
+mod pclmulqdq;
+pub use self::pclmulqdq::*;
+
+mod aes;
+pub use self::aes::*;
+
+mod rdrand;
+pub use self::rdrand::*;
+
+mod sha;
+pub use self::sha::*;
+
+mod adx;
+pub use self::adx::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Generates the trap instruction `UD2`
+#[cfg_attr(test, assert_instr(ud2))]
+#[inline]
+pub unsafe fn ud2() -> ! {
+ intrinsics::abort()
+}
+
+mod avx512f;
+pub use self::avx512f::*;
+
+mod avx512bw;
+pub use self::avx512bw::*;
+
+mod avx512cd;
+pub use self::avx512cd::*;
+
+mod avx512ifma;
+pub use self::avx512ifma::*;
+
+mod avx512vbmi;
+pub use self::avx512vbmi::*;
+
+mod avx512vbmi2;
+pub use self::avx512vbmi2::*;
+
+mod avx512vnni;
+pub use self::avx512vnni::*;
+
+mod avx512bitalg;
+pub use self::avx512bitalg::*;
+
+mod avx512gfni;
+pub use self::avx512gfni::*;
+
+mod avx512vpopcntdq;
+pub use self::avx512vpopcntdq::*;
+
+mod avx512vaes;
+pub use self::avx512vaes::*;
+
+mod avx512vpclmulqdq;
+pub use self::avx512vpclmulqdq::*;
+
+mod bt;
+pub use self::bt::*;
+
+mod rtm;
+pub use self::rtm::*;
+
+mod f16c;
+pub use self::f16c::*;
+
+mod avx512bf16;
+pub use self::avx512bf16::*;
diff --git a/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs b/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs
new file mode 100644
index 000000000..a2ebdf9c8
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/pclmulqdq.rs
@@ -0,0 +1,70 @@
+//! Carry-less Multiplication (CLMUL)
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
+//!
+//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use crate::core_arch::x86::__m128i;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.pclmulqdq"]
+ fn pclmulqdq(a: __m128i, round_key: __m128i, imm8: u8) -> __m128i;
+}
+
+/// Performs a carry-less multiplication of two 64-bit polynomials over the
+/// finite field GF(2).
+///
+/// The immediate byte determines which halves of `a` and `b` are used: bit 0
+/// selects the half of `a` (0 = low, 1 = high) and bit 4 selects the half of
+/// `b`. All other immediate bits are ignored.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128)
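+///
+/// # Examples
+///
+/// A minimal sketch of how the immediate selects operand halves (bit 0 picks
+/// the half of `a`, bit 4 the half of `b`); the inputs are illustrative only:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # #[target_feature(enable = "pclmulqdq")]
+/// # unsafe fn foo() {
+/// let a = _mm_set_epi64x(1, 2);
+/// let b = _mm_set_epi64x(3, 4);
+/// let lo_lo = _mm_clmulepi64_si128::<0x00>(a, b); // low(a) x low(b)
+/// let hi_hi = _mm_clmulepi64_si128::<0x11>(a, b); // high(a) x high(b)
+/// # let _ = (lo_lo, hi_hi);
+/// # }
+/// # if is_x86_feature_detected!("pclmulqdq") { unsafe { foo() } }
+/// # }
+/// ```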
+#[inline]
+#[target_feature(enable = "pclmulqdq")]
+#[cfg_attr(all(test, not(target_os = "linux")), assert_instr(pclmulqdq, IMM8 = 0))]
+#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmullqlqdq, IMM8 = 0))]
+#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqlqdq, IMM8 = 1))]
+#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmullqhqdq, IMM8 = 16))]
+#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, IMM8 = 17))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_clmulepi64_si128<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ pclmulqdq(a, b, IMM8 as u8)
+}
+
+#[cfg(test)]
+mod tests {
+ // The constants in the tests below are just bit patterns. They should not
+ // be interpreted as integers; signedness does not make sense for them, but
+ // __m128i happens to be defined in terms of signed integers.
+ #![allow(overflowing_literals)]
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "pclmulqdq")]
+ unsafe fn test_mm_clmulepi64_si128() {
+ // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
+ let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
+ let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
+ let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
+ let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
+ let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
+ let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
+
+ assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a, b), r00);
+ assert_eq_m128i(_mm_clmulepi64_si128::<0x10>(a, b), r01);
+ assert_eq_m128i(_mm_clmulepi64_si128::<0x01>(a, b), r10);
+ assert_eq_m128i(_mm_clmulepi64_si128::<0x11>(a, b), r11);
+
+ let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
+ let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
+ assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a0, a0), r);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/rdrand.rs b/library/stdarch/crates/core_arch/src/x86/rdrand.rs
new file mode 100644
index 000000000..c6bab9148
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/rdrand.rs
@@ -0,0 +1,75 @@
+//! RDRAND and RDSEED instructions for returning random numbers from an Intel
+//! on-chip hardware random number generator which has been seeded by an
+//! on-chip entropy source.
+#![allow(clippy::module_name_repetitions)]
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ #[link_name = "llvm.x86.rdrand.16"]
+ fn x86_rdrand16_step() -> (u16, i32);
+ #[link_name = "llvm.x86.rdrand.32"]
+ fn x86_rdrand32_step() -> (u32, i32);
+ #[link_name = "llvm.x86.rdseed.16"]
+ fn x86_rdseed16_step() -> (u16, i32);
+ #[link_name = "llvm.x86.rdseed.32"]
+ fn x86_rdseed32_step() -> (u32, i32);
+}
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Reads a hardware-generated 16-bit random value and stores the result in
+/// `val`. Returns 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand16_step)
+#[inline]
+#[target_feature(enable = "rdrand")]
+#[cfg_attr(test, assert_instr(rdrand))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 {
+ let (v, flag) = x86_rdrand16_step();
+ *val = v;
+ flag
+}
+
+/// Reads a hardware-generated 32-bit random value and stores the result in
+/// `val`. Returns 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand32_step)
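+///
+/// # Examples
+///
+/// A minimal retry loop (a sketch; the bound of 10 retries follows common
+/// guidance for RDRAND and is an assumption, not part of this API):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # #[target_feature(enable = "rdrand")]
+/// # unsafe fn random_u32() -> Option<u32> {
+/// let mut v = 0u32;
+/// for _ in 0..10 {
+///     if _rdrand32_step(&mut v) == 1 {
+///         return Some(v); // the hardware reported success
+///     }
+/// }
+/// None // persistent failure is possible but extremely rare
+/// # }
+/// # if is_x86_feature_detected!("rdrand") { unsafe { let _ = random_u32(); } }
+/// # }
+/// ```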
+#[inline]
+#[target_feature(enable = "rdrand")]
+#[cfg_attr(test, assert_instr(rdrand))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 {
+ let (v, flag) = x86_rdrand32_step();
+ *val = v;
+ flag
+}
+
+/// Reads a 16-bit NIST SP800-90B and SP800-90C compliant random value and
+/// stores it in `val`. Returns 1 if a random value was generated, and 0
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed16_step)
+#[inline]
+#[target_feature(enable = "rdseed")]
+#[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 {
+ let (v, flag) = x86_rdseed16_step();
+ *val = v;
+ flag
+}
+
+/// Reads a 32-bit NIST SP800-90B and SP800-90C compliant random value and
+/// stores it in `val`. Returns 1 if a random value was generated, and 0
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed32_step)
+#[inline]
+#[target_feature(enable = "rdseed")]
+#[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 {
+ let (v, flag) = x86_rdseed32_step();
+ *val = v;
+ flag
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/rdtsc.rs b/library/stdarch/crates/core_arch/src/x86/rdtsc.rs
new file mode 100644
index 000000000..67f6e48fa
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/rdtsc.rs
@@ -0,0 +1,77 @@
+//! RDTSC instructions.
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Reads the current value of the processor’s time-stamp counter.
+///
+/// The processor monotonically increments the time-stamp counter MSR
+/// every clock cycle and resets it to 0 whenever the processor is
+/// reset.
+///
+/// The RDTSC instruction is not a serializing instruction. It does
+/// not necessarily wait until all previous instructions have been
+/// executed before reading the counter. Similarly, subsequent
+/// instructions may begin execution before the read operation is
+/// performed.
+///
+/// On processors that support the Intel 64 architecture, the
+/// high-order 32 bits of each of RAX and RDX are cleared.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdtsc)
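+///
+/// # Examples
+///
+/// A rough cycle-count sketch (no serializing fences are used, so the
+/// measurement is only approximate):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # unsafe {
+/// let start = _rdtsc();
+/// // ... the work being measured goes here ...
+/// let cycles = _rdtsc() - start;
+/// # let _ = cycles;
+/// # }
+/// # }
+/// ```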
+#[inline]
+#[cfg_attr(test, assert_instr(rdtsc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdtsc() -> u64 {
+ rdtsc()
+}
+
+/// Reads the current value of the processor’s time-stamp counter and
+/// the `IA32_TSC_AUX MSR`.
+///
+/// The processor monotonically increments the time-stamp counter MSR
+/// every clock cycle and resets it to 0 whenever the processor is
+/// reset.
+///
+/// The RDTSCP instruction waits until all previous instructions have
+/// been executed before reading the counter. However, subsequent
+/// instructions may begin execution before the read operation is
+/// performed.
+///
+/// On processors that support the Intel 64 architecture, the
+/// high-order 32 bits of each of RAX, RDX, and RCX are cleared.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__rdtscp)
+#[inline]
+#[cfg_attr(test, assert_instr(rdtscp))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn __rdtscp(aux: *mut u32) -> u64 {
+ rdtscp(aux as *mut _)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.rdtsc"]
+ fn rdtsc() -> u64;
+ #[link_name = "llvm.x86.rdtscp"]
+ fn rdtscp(aux: *mut u8) -> u64;
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86::*;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn _rdtsc() {
+ let r = rdtsc::_rdtsc();
+ assert_ne!(r, 0); // The chances of this being 0 are infinitesimal
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn _rdtscp() {
+ let mut aux = 0;
+ let r = rdtsc::__rdtscp(&mut aux);
+ assert_ne!(r, 0); // The chances of this being 0 are infinitesimal
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/rtm.rs b/library/stdarch/crates/core_arch/src/x86/rtm.rs
new file mode 100644
index 000000000..dab73cde9
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/rtm.rs
@@ -0,0 +1,162 @@
+//! Intel's Restricted Transactional Memory (RTM).
+//!
+//! This CPU feature is available on Intel Broadwell or later CPUs (and some Haswell).
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_rtm] provides a quick overview of the assembly instructions, and
+//! Intel's [programming considerations][intel_consid] details what sorts of instructions within a
+//! transaction are likely to cause an abort.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_rtm]: https://en.wikipedia.org/wiki/Transactional_Synchronization_Extensions#Restricted_Transactional_Memory
+//! [intel_consid]: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-intel-transactional-synchronization-extensions-intel-tsx-programming-considerations
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+extern "C" {
+ #[link_name = "llvm.x86.xbegin"]
+ fn x86_xbegin() -> i32;
+ #[link_name = "llvm.x86.xend"]
+ fn x86_xend();
+ #[link_name = "llvm.x86.xabort"]
+ fn x86_xabort(imm8: i8);
+ #[link_name = "llvm.x86.xtest"]
+ fn x86_xtest() -> i32;
+}
+
+/// Transaction successfully started.
+pub const _XBEGIN_STARTED: u32 = !0;
+
+/// Transaction explicitly aborted with [`_xabort`]. The parameter passed to
+/// [`_xabort`] is available with `_xabort_code(status)`.
+#[allow(clippy::identity_op)]
+pub const _XABORT_EXPLICIT: u32 = 1 << 0;
+
+/// Transaction retry is possible.
+pub const _XABORT_RETRY: u32 = 1 << 1;
+
+/// Transaction abort due to a memory conflict with another thread.
+pub const _XABORT_CONFLICT: u32 = 1 << 2;
+
+/// Transaction abort due to the transaction using too much memory.
+pub const _XABORT_CAPACITY: u32 = 1 << 3;
+
+/// Transaction abort due to a debug trap.
+pub const _XABORT_DEBUG: u32 = 1 << 4;
+
+/// Transaction abort in an inner, nested transaction.
+pub const _XABORT_NESTED: u32 = 1 << 5;
+
+/// Specifies the start of a restricted transactional memory (RTM) code region and returns a value
+/// indicating status.
+///
+/// [Intel's documentation](https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-xbegin).
+#[inline]
+#[target_feature(enable = "rtm")]
+#[cfg_attr(test, assert_instr(xbegin))]
+pub unsafe fn _xbegin() -> u32 {
+ x86_xbegin() as _
+}
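+
+// Usage sketch (the retry bound and fallback are illustrative, not prescribed
+// by the ISA): callers typically retry `_xbegin()` a few times and fall back
+// to a conventional lock if the status never equals `_XBEGIN_STARTED`:
+//
+//     for _ in 0..3 {
+//         if _xbegin() == _XBEGIN_STARTED {
+//             /* ... transactional work ... */
+//             _xend();
+//             return;
+//         }
+//     }
+//     /* fall back to a lock-based path */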
+
+/// Specifies the end of a restricted transactional memory (RTM) code region.
+///
+/// [Intel's documentation](https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-xend).
+#[inline]
+#[target_feature(enable = "rtm")]
+#[cfg_attr(test, assert_instr(xend))]
+pub unsafe fn _xend() {
+ x86_xend()
+}
+
+/// Forces a restricted transactional memory (RTM) region to abort.
+///
+/// [Intel's documentation](https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-xabort).
+#[inline]
+#[target_feature(enable = "rtm")]
+#[cfg_attr(test, assert_instr(xabort, IMM8 = 0x0))]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn _xabort<const IMM8: u32>() {
+ static_assert_imm_u8!(IMM8);
+ x86_xabort(IMM8 as i8)
+}
+
+/// Queries whether the processor is executing in a transactional region identified by restricted
+/// transactional memory (RTM) or hardware lock elision (HLE).
+///
+/// [Intel's documentation](https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-xtest).
+#[inline]
+#[target_feature(enable = "rtm")]
+#[cfg_attr(test, assert_instr(xtest))]
+pub unsafe fn _xtest() -> u8 {
+ x86_xtest() as _
+}
+
+/// Retrieves the parameter passed to [`_xabort`] when [`_xbegin`]'s status has
+/// the `_XABORT_EXPLICIT` flag set; the 8-bit abort code is stored in bits
+/// 31:24 of the status word.
+#[inline]
+pub const fn _xabort_code(status: u32) -> u32 {
+ (status >> 24) & 0xFF
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "rtm")]
+ unsafe fn test_xbegin_xend() {
+ let mut x = 0;
+ for _ in 0..10 {
+ let code = rtm::_xbegin();
+ if code == _XBEGIN_STARTED {
+ x += 1;
+ rtm::_xend();
+ assert_eq!(x, 1);
+ break;
+ }
+ assert_eq!(x, 0);
+ }
+ }
+
+ #[simd_test(enable = "rtm")]
+ unsafe fn test_xabort() {
+ const ABORT_CODE: u32 = 42;
+ // aborting outside a transactional region does nothing
+ _xabort::<ABORT_CODE>();
+
+ for _ in 0..10 {
+ let mut x = 0;
+ let code = rtm::_xbegin();
+ if code == _XBEGIN_STARTED {
+ x += 1;
+ rtm::_xabort::<ABORT_CODE>();
+ } else if code & _XABORT_EXPLICIT != 0 {
+ let test_abort_code = rtm::_xabort_code(code);
+ assert_eq!(test_abort_code, ABORT_CODE);
+ }
+ assert_eq!(x, 0);
+ }
+ }
+
+ #[simd_test(enable = "rtm")]
+ unsafe fn test_xtest() {
+ assert_eq!(_xtest(), 0);
+
+ for _ in 0..10 {
+ let code = rtm::_xbegin();
+ if code == _XBEGIN_STARTED {
+ let in_tx = _xtest();
+ rtm::_xend();
+
+ // putting the assert inside the transaction would abort the transaction on fail
+ // without any output/panic/etc
+ assert_eq!(in_tx, 1);
+ break;
+ }
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sha.rs b/library/stdarch/crates/core_arch/src/x86/sha.rs
new file mode 100644
index 000000000..cfb330cfb
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sha.rs
@@ -0,0 +1,221 @@
+use crate::{
+ core_arch::{simd::*, x86::*},
+ mem::transmute,
+};
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sha1msg1"]
+ fn sha1msg1(a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sha1msg2"]
+ fn sha1msg2(a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sha1nexte"]
+ fn sha1nexte(a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sha1rnds4"]
+ fn sha1rnds4(a: i32x4, b: i32x4, c: i8) -> i32x4;
+ #[link_name = "llvm.x86.sha256msg1"]
+ fn sha256msg1(a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sha256msg2"]
+ fn sha256msg2(a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sha256rnds2"]
+ fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4;
+}
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Performs an intermediate calculation for the next four SHA1 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1msg1_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(sha1msg1(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Performs the final calculation for the next four SHA1 message values
+/// (unsigned 32-bit integers) using the intermediate result in `a` and the
+/// previous message values in `b`, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1msg2_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1msg2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(sha1msg2(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Calculates SHA1 state variable E after four rounds of operation from the
+/// current SHA1 state variable `a`, adds that value to the scheduled values
+/// (unsigned 32-bit integers) in `b`, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1nexte_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1nexte))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(sha1nexte(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Performs four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D)
+/// from `a` and some pre-computed sum of the next 4 round message values
+/// (unsigned 32-bit integers), and state variable E from `b`, and returns the
+/// updated SHA1 state (A,B,C,D). `FUNC` contains the logic functions and round
+/// constants.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1rnds4_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha1rnds4, FUNC = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha1rnds4_epu32<const FUNC: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm2!(FUNC);
+ transmute(sha1rnds4(a.as_i32x4(), b.as_i32x4(), FUNC as i8))
+}
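+
+// Note (an interpretation of the SHA extension's round grouping, not enforced
+// here): `FUNC` values 0 through 3 select the SHA1 round-function group, i.e.
+// Ch (rounds 0-19), Parity (20-39), Maj (40-59), and Parity again (60-79).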
+
+/// Performs an intermediate calculation for the next four SHA256 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256msg1_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha256msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(sha256msg1(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Performs the final calculation for the next four SHA256 message values
+/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
+/// and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256msg2_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha256msg2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(sha256msg2(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Performs 2 rounds of SHA256 operation using an initial SHA256 state
+/// (C,D,G,H) from `a`, an initial SHA256 state (A,B,E,F) from `b`, and a
+/// pre-computed sum of the next 2 round message values (unsigned 32-bit
+/// integers) and the corresponding round constants from `k`, and returns the
+/// updated SHA256 state (A,B,E,F).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256rnds2_epu32)
+#[inline]
+#[target_feature(enable = "sha")]
+#[cfg_attr(test, assert_instr(sha256rnds2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m128i {
+ transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4()))
+}
+
+#[cfg(test)]
+mod tests {
+ use std::{
+ f32,
+ f64::{self, NAN},
+ i32,
+ mem::{self, transmute},
+ };
+
+ use crate::{
+ core_arch::{simd::*, x86::*},
+ hint::black_box,
+ };
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "sha")]
+ #[allow(overflowing_literals)]
+ unsafe fn test_mm_sha1msg1_epu32() {
+ let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+ let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+ let expected = _mm_set_epi64x(0x98829f34f74ad457, 0xda2b1a44d0b5ad3c);
+ let r = _mm_sha1msg1_epu32(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "sha")]
+ #[allow(overflowing_literals)]
+ unsafe fn test_mm_sha1msg2_epu32() {
+ let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+ let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+ let expected = _mm_set_epi64x(0xf714b202d863d47d, 0x90c30d946b3d3b35);
+ let r = _mm_sha1msg2_epu32(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "sha")]
+ #[allow(overflowing_literals)]
+ unsafe fn test_mm_sha1nexte_epu32() {
+ let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+ let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+ let expected = _mm_set_epi64x(0x2589d5be923f82a4, 0x59f111f13956c25b);
+ let r = _mm_sha1nexte_epu32(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "sha")]
+ #[allow(overflowing_literals)]
+ unsafe fn test_mm_sha1rnds4_epu32() {
+ let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+ let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+ let expected = _mm_set_epi64x(0x32b13cd8322f5268, 0xc54420862bd9246f);
+ let r = _mm_sha1rnds4_epu32::<0>(a, b);
+ assert_eq_m128i(r, expected);
+
+ let expected = _mm_set_epi64x(0x6d4c43e56a3c25d9, 0xa7e00fb775cbd3fe);
+ let r = _mm_sha1rnds4_epu32::<1>(a, b);
+ assert_eq_m128i(r, expected);
+
+ let expected = _mm_set_epi64x(0xb304e383c01222f4, 0x66f6b3b1f89d8001);
+ let r = _mm_sha1rnds4_epu32::<2>(a, b);
+ assert_eq_m128i(r, expected);
+
+ let expected = _mm_set_epi64x(0x8189b758bfabfa79, 0xdb08f6e78cae098b);
+ let r = _mm_sha1rnds4_epu32::<3>(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "sha")]
+ #[allow(overflowing_literals)]
+ unsafe fn test_mm_sha256msg1_epu32() {
+ let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+ let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+ let expected = _mm_set_epi64x(0xeb84973fd5cda67d, 0x2857b88f406b09ee);
+ let r = _mm_sha256msg1_epu32(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "sha")]
+ #[allow(overflowing_literals)]
+ unsafe fn test_mm_sha256msg2_epu32() {
+ let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+ let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+ let expected = _mm_set_epi64x(0xb58777ce887fd851, 0x15d1ec8b73ac8450);
+ let r = _mm_sha256msg2_epu32(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "sha")]
+ #[allow(overflowing_literals)]
+ unsafe fn test_mm_sha256rnds2_epu32() {
+ let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98);
+ let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b);
+ let k = _mm_set_epi64x(0, 0x12835b01d807aa98);
+ let expected = _mm_set_epi64x(0xd3063037effb15ea, 0x187ee3db0d6d1d19);
+ let r = _mm_sha256rnds2_epu32(a, b, k);
+ assert_eq_m128i(r, expected);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs
new file mode 100644
index 000000000..2c4295ef6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse.rs
@@ -0,0 +1,3276 @@
+//! Streaming SIMD Extensions (SSE)
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ intrinsics, mem, ptr,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Adds the first component of `a` and `b`; the other components are copied
+/// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(addss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
+ addss(a, b)
+}
+
+/// Adds __m128 vectors.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(addps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
+ simd_add(a, b)
+}
+
+/// Subtracts the first component of `b` from `a`; the other components are
+/// copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(subss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
+ subss(a, b)
+}
+
+/// Subtracts __m128 vectors.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(subps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
+ simd_sub(a, b)
+}
+
+/// Multiplies the first component of `a` and `b`; the other components are
+/// copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(mulss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
+ mulss(a, b)
+}
+
+/// Multiplies __m128 vectors.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(mulps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
+ simd_mul(a, b)
+}
+
+/// Divides the first component of `a` by the first component of `b`; the
+/// other components are copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(divss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
+ divss(a, b)
+}
+
+/// Divides __m128 vectors.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(divps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
+ simd_div(a, b)
+}
+
+/// Returns the square root of the first single-precision (32-bit)
+/// floating-point element in `a`, the other elements are unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sqrtss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
+ sqrtss(a)
+}
+
+/// Returns the square root of packed single-precision (32-bit) floating-point
+/// elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
+ sqrtps(a)
+}
+
+/// Returns the approximate reciprocal of the first single-precision
+/// (32-bit) floating-point element in `a`, the other elements are unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(rcpss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
+ rcpss(a)
+}
+
+/// Returns the approximate reciprocal of packed single-precision (32-bit)
+/// floating-point elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(rcpps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
+ rcpps(a)
+}
+
+/// Returns the approximate reciprocal square root of the first single-precision
+/// (32-bit) floating-point element in `a`, the other elements are unchanged.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(rsqrtss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
+ rsqrtss(a)
+}
+
+/// Returns the approximate reciprocal square root of packed single-precision
+/// (32-bit) floating-point elements in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(rsqrtps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
+ rsqrtps(a)
+}
+
+/// Compares the first single-precision (32-bit) floating-point element of `a`
+/// and `b`, and returns the minimum value in the first element of the return
+/// value; the other elements are copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(minss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
+ minss(a, b)
+}
+
+/// Compares packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and returns the corresponding minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(minps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
+ // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
+ minps(a, b)
+}
+
+/// Compares the first single-precision (32-bit) floating-point element of `a`
+/// and `b`, and returns the maximum value in the first element of the return
+/// value; the other elements are copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(maxss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
+ maxss(a, b)
+}
+
+/// Compares packed single-precision (32-bit) floating-point elements in `a`
+/// and `b`, and returns the corresponding maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(maxps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
+ // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
+ maxps(a, b)
+}
+
+/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+// i586 only seems to generate plain `and` instructions, so ignore it.
+#[cfg_attr(
+ all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+ assert_instr(andps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
+ let a: __m128i = mem::transmute(a);
+ let b: __m128i = mem::transmute(b);
+ mem::transmute(simd_and(a, b))
+}
+
+/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
+/// elements.
+///
+/// Computes `!a & b` for each bit in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_ps)
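+///
+/// A small sketch of the bit-level behavior (illustrative only, not from
+/// Intel's documentation):
+///
+/// ```rust,ignore
+/// let a = _mm_set1_ps(f32::from_bits(0b1100));
+/// let b = _mm_set1_ps(f32::from_bits(0b1010));
+/// let r = _mm_andnot_ps(a, b); // !0b1100 & 0b1010 == 0b0010 in every lane
+/// assert_eq!(_mm_cvtss_f32(r).to_bits(), 0b0010);
+/// ```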
+#[inline]
+#[target_feature(enable = "sse")]
+// i586 only seems to generate plain `not` and `and` instructions, so ignore
+// it.
+#[cfg_attr(
+ all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+ assert_instr(andnps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
+ let a: __m128i = mem::transmute(a);
+ let b: __m128i = mem::transmute(b);
+ let mask: __m128i = mem::transmute(i32x4::splat(-1));
+ mem::transmute(simd_and(simd_xor(mask, a), b))
+}
+
+/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+// i586 only seems to generate plain `or` instructions, so we ignore it.
+#[cfg_attr(
+ all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+ assert_instr(orps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
+ let a: __m128i = mem::transmute(a);
+ let b: __m128i = mem::transmute(b);
+ mem::transmute(simd_or(a, b))
+}
+
+/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
+/// elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+// i586 only seems to generate plain `xor` instructions, so we ignore it.
+#[cfg_attr(
+ all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+ assert_instr(xorps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
+ let a: __m128i = mem::transmute(a);
+ let b: __m128i = mem::transmute(b);
+ mem::transmute(simd_xor(a, b))
+}
+
+/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
+/// the result will be `0xffffffff` if the two inputs are equal, or `0`
+/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ss)
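+///
+/// A short sketch of how the all-ones/all-zeros result is typically consumed
+/// (illustrative only):
+///
+/// ```rust,ignore
+/// let a = _mm_setr_ps(1.0, 9.0, 9.0, 9.0);
+/// let b = _mm_setr_ps(1.0, 0.0, 0.0, 0.0);
+/// let r = _mm_cmpeq_ss(a, b);
+/// // The lowest lane is all-ones, so its sign bit is set in the movemask.
+/// assert_eq!(_mm_movemask_ps(r) & 1, 1);
+/// ```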
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpeqss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
+ cmpss(a, b, 0)
+}
+
+/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
+/// of the result will be `0xffffffff` if `a.extract(0)` is less than
+/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
+/// upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpltss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
+ cmpss(a, b, 1)
+}
+
+/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
+/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
+/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
+/// are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpless))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
+ cmpss(a, b, 2)
+}
+
+/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
+/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
+/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
+/// are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpltss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
+ simd_shuffle4!(a, cmpss(b, a, 1), [4, 1, 2, 3])
+}
+
+/// Compares the lowest `f32` of both inputs for greater than or equal. The
+/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
+/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
+/// of the result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpless))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
+ simd_shuffle4!(a, cmpss(b, a, 2), [4, 1, 2, 3])
+}
+
+/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
+/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
+/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
+/// upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpneqss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
+ cmpss(a, b, 4)
+}
+
+/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
+/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
+/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
+/// upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnltss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
+ cmpss(a, b, 5)
+}
+
+/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
+/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
+/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
+/// of the result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnless))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
+ cmpss(a, b, 6)
+}
+
+/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
+/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
+/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
+/// the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnltss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
+ simd_shuffle4!(a, cmpss(b, a, 5), [4, 1, 2, 3])
+}
+
+/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
+/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
+/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
+/// bits of the result are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnless))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
+ simd_shuffle4!(a, cmpss(b, a, 6), [4, 1, 2, 3])
+}
+
+/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
+/// the result will be `0xffffffff` if neither of `a.extract(0)` or
+/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
+/// are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpordss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
+ cmpss(a, b, 7)
+}
+
+/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
+/// of the result will be `0xffffffff` if any of `a.extract(0)` or
+/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
+/// are the upper 96 bits of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpunordss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
+ cmpss(a, b, 3)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input elements
+/// were equal, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpeqps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(a, b, 0)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpltps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(a, b, 1)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is less than or equal to the corresponding element in `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(a, b, 2)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpltps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(b, a, 1)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(b, a, 2)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input elements
+/// are **not** equal, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpneqps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(a, b, 4)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is **not** less than the corresponding element in `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnltps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(a, b, 5)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is **not** less than or equal to the corresponding element in `b`, or
+/// `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(a, b, 6)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is **not** greater than the corresponding element in `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnltps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(b, a, 5)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// The result in the output vector will be `0xffffffff` if the input element
+/// in `a` is **not** greater than or equal to the corresponding element in `b`,
+/// or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpnleps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(b, a, 6)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// Returns four floats that have one of two possible bit patterns. The element
+/// in the output vector will be `0xffffffff` if the input elements in `a` and
+/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpordps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(b, a, 7)
+}
+
+/// Compares each of the four floats in `a` to the corresponding element in `b`.
+/// Returns four floats that have one of two possible bit patterns. The element
+/// in the output vector will be `0xffffffff` if the input elements in `a` and
+/// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cmpunordps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
+ cmpps(b, a, 3)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are equal, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
+ comieq_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
+ comilt_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
+ comile_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than the one from `b`, or `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
+ comigt_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than or equal to the one from `b`, or
+/// `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
+ comige_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are **not** equal, or `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(comiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
+ comineq_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are equal, or `0` otherwise. This instruction will not signal
+/// an exception if either argument is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
+ ucomieq_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
+/// This instruction will not signal an exception if either argument is a quiet
+/// NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
+ ucomilt_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
+/// otherwise. This instruction will not signal an exception if either argument
+/// is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
+ ucomile_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than the one from `b`, or `0`
+/// otherwise. This instruction will not signal an exception if either argument
+/// is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
+ ucomigt_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if the value from `a` is greater than or equal to the one from `b`, or
+/// `0` otherwise. This instruction will not signal an exception if either
+/// argument is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
+ ucomige_ss(a, b)
+}
+
+/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
+/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
+/// signal an exception if either argument is a quiet NaN.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ucomiss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
+ ucomineq_ss(a, b)
+}
+
+/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
+///
+/// The result is rounded according to the current rounding mode. If the result
+/// cannot be represented as a 32 bit integer, the result will be `0x8000_0000`
+/// (`i32::MIN`), or an invalid operation floating point exception is raised if
+/// it is unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32)
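+///
+/// With the default round-to-nearest-even mode, for example (a minimal
+/// sketch, not from Intel's documentation):
+///
+/// ```rust,ignore
+/// assert_eq!(_mm_cvtss_si32(_mm_set_ss(1.5)), 2); // ties round to even
+/// assert_eq!(_mm_cvtss_si32(_mm_set_ss(2.5)), 2); // 2 is even, so 2.5 -> 2
+/// ```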
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
+ cvtss2si(a)
+}
+
+/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
+ _mm_cvtss_si32(a)
+}
+
+/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
+/// with truncation.
+///
+/// The result is always rounded using truncation (round towards zero). If the
+/// result cannot be represented as a 32 bit integer, the result will be
+/// `0x8000_0000` (`i32::MIN`), or an invalid operation floating point
+/// exception is raised if it is unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
+ cvttss2si(a)
+}
+
+/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
+ _mm_cvttss_si32(a)
+}
+
+/// Extracts the lowest 32 bit float from the input vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32)
+#[inline]
+#[target_feature(enable = "sse")]
+// No point in using assert_instr here. In the Unix x86_64 calling convention
+// this is a no-op, and on Windows it's just a `mov`.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
+ simd_extract(a, 0)
+}
+
+/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
+/// vector `a` with the lowest 32 bit float replaced by the converted integer.
+///
+/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
+/// input).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
+ cvtsi2ss(a, b)
+}
+
+/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
+ _mm_cvtsi32_ss(a, b)
+}
+
+/// Construct a `__m128` with the lowest element set to `a` and the rest set to
+/// zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
+ __m128(a, 0.0, 0.0, 0.0)
+}
+
+/// Construct a `__m128` with all elements set to `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
+ __m128(a, a, a, a)
+}
+
+/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps1)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
+ _mm_set1_ps(a)
+}
+
+/// Construct a `__m128` from four floating point values highest to lowest.
+///
+/// Note that `a` will be the highest 32 bits of the result, and `d` the
+/// lowest. This matches the standard way of writing bit patterns on x86:
+///
+/// ```text
+/// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0
+/// +---------+---------+---------+---------+
+/// | a | b | c | d | result
+/// +---------+---------+---------+---------+
+/// ```
+///
+/// Alternatively:
+///
+/// ```text
+/// let v = _mm_set_ps(d, c, b, a);
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps)
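+///
+/// For example (a minimal sketch):
+///
+/// ```rust,ignore
+/// let v = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+/// // `d` (here 1.0) ends up in the lowest 32 bits.
+/// assert_eq!(_mm_cvtss_f32(v), 1.0);
+/// ```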
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
+ __m128(d, c, b, a)
+}
+
+/// Construct a `__m128` from four floating point values lowest to highest.
+///
+/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
+/// bits of the result, and `d` the highest.
+///
+/// ```text
+/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(
+ all(test, any(target_os = "windows", target_arch = "x86_64")),
+ assert_instr(unpcklps)
+)]
+// On a 32-bit architecture on non-Windows it just copies the operands from the stack.
+#[cfg_attr(
+ all(test, all(not(target_os = "windows"), target_arch = "x86")),
+ assert_instr(movaps)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
+ __m128(a, b, c, d)
+}
+
+/// Construct a `__m128` with all elements initialized to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setzero_ps() -> __m128 {
+ __m128(0.0, 0.0, 0.0, 0.0)
+}
+
+/// A utility function for creating masks to use with Intel shuffle and
+/// permute intrinsics.
+#[inline]
+#[allow(non_snake_case)]
+#[unstable(feature = "stdarch", issue = "27731")]
+pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
+ ((z << 6) | (y << 4) | (x << 2) | w) as i32
+}
+
+/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
+/// `b` using `MASK`.
+///
+/// The lower half of the result takes values from `a` and the higher half
+/// from `b`. The mask is split into four 2-bit fields, each selecting one
+/// element from the corresponding input.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps)
+///
+/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
+/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
+/// as is the case for [other shuffle intrinsics](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_).
+/// Performing an implicit type conversion between an unsigned integer and a signed integer
+/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
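+///
+/// For example (a minimal sketch; the binary mask below spells out the four
+/// 2-bit selector fields, lowest field first):
+///
+/// ```rust,ignore
+/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+/// // Fields (low to high) select: a[0], a[1], b[2], b[3].
+/// let r = _mm_shuffle_ps::<0b11_10_01_00>(a, b);
+/// // r == _mm_setr_ps(1.0, 2.0, 7.0, 8.0)
+/// ```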
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_imm8!(MASK);
+ simd_shuffle4!(
+ a,
+ b,
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ ((MASK as u32 >> 4) & 0b11) + 4,
+ ((MASK as u32 >> 6) & 0b11) + 4,
+ ],
+ )
+}
+
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the higher half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
+ simd_shuffle4!(a, b, [2, 6, 3, 7])
+}
+
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the lower half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
+ simd_shuffle4!(a, b, [0, 4, 1, 5])
+}
+
+/// Combines the higher halves of `a` and `b`. The higher half of `b` occupies
+/// the lower half of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehl_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
+ // TODO: figure out why this is a different instruction on Windows.
+ simd_shuffle4!(a, b, [6, 7, 2, 3])
+}
+
+/// Combines the lower halves of `a` and `b`. The lower half of `b` occupies
+/// the higher half of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movelh_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
+ simd_shuffle4!(a, b, [0, 1, 4, 5])
+}
+
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// The mask is stored in the 4 least significant bits of the return value.
+/// All other bits are set to `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps)
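+///
+/// For example (a minimal sketch):
+///
+/// ```rust,ignore
+/// let a = _mm_setr_ps(1.0, -1.0, 1.0, -1.0);
+/// // Sign bits, lowest element first: 0, 1, 0, 1.
+/// assert_eq!(_mm_movemask_ps(a), 0b1010);
+/// ```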
+#[inline]
+#[target_feature(enable = "sse")]
+// FIXME: LLVM9 trunk has the following bug:
+// https://github.com/rust-lang/stdarch/issues/794
+// so we only temporarily test this on i686 and x86_64 but not on i586:
+#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(movmskps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
+ movmskps(a)
+}
+
+/// Construct a `__m128` with the lowest element read from `p` and the other
+/// elements set to zero.
+///
+/// This corresponds to instructions `VMOVSS` / `MOVSS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
+ __m128(*p, 0.0, 0.0, 0.0)
+}
+
+/// Construct a `__m128` by duplicating the value read from `p` into all
+/// elements.
+///
+/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
+/// shuffling.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
+ let a = *p;
+ __m128(a, a, a, a)
+}
+
+/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
+ _mm_load1_ps(p)
+}
+
+/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
+/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
+/// memory.
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps)
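+///
+/// One way to guarantee the required alignment from safe Rust is an
+/// over-aligned wrapper type (a hedged sketch; `Aligned` is a hypothetical
+/// local type, not part of this crate):
+///
+/// ```rust,ignore
+/// #[repr(align(16))]
+/// struct Aligned([f32; 4]);
+///
+/// let data = Aligned([1.0, 2.0, 3.0, 4.0]);
+/// let v = unsafe { _mm_load_ps(data.0.as_ptr()) };
+/// ```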
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
+ *(p as *const __m128)
+}
+
+/// Loads four `f32` values from memory into a `__m128`. There are no
+/// restrictions on memory alignment. For aligned memory
+/// [`_mm_load_ps`](fn._mm_load_ps.html) may be faster.
+///
+/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
+ // Note: Using `*p` would require `f32` alignment, but `movups` has no
+ // alignment restrictions.
+ let mut dst = _mm_undefined_ps();
+ ptr::copy_nonoverlapping(
+ p as *const u8,
+ &mut dst as *mut __m128 as *mut u8,
+ mem::size_of::<__m128>(),
+ );
+ dst
+}
+
+/// Loads four `f32` values from aligned memory into a `__m128` in reverse
+/// order.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// let a0 = *p;
+/// let a1 = *p.offset(1);
+/// let a2 = *p.offset(2);
+/// let a3 = *p.offset(3);
+/// __m128::new(a3, a2, a1, a0)
+/// ```
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
+/// shuffling.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
+ let a = _mm_load_ps(p);
+ simd_shuffle4!(a, a, [3, 2, 1, 0])
+}
+
+/// Loads unaligned 64 bits of integer data from memory into a new vector.
+///
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64)
+#[inline]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
+pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
+ transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0))
+}
+
+/// Stores the lowest 32 bit float of `a` into memory.
+///
+/// This intrinsic corresponds to the `MOVSS` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
+ *p = simd_extract(a, 0);
+}
+
+/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
+/// memory.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// let x = a.extract(0);
+/// *p = x;
+/// *p.offset(1) = x;
+/// *p.offset(2) = x;
+/// *p.offset(3) = x;
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
+ let b: __m128 = simd_shuffle4!(a, a, [0, 0, 0, 0]);
+ *(p as *mut __m128) = b;
+}
+
+/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
+ _mm_store1_ps(p, a);
+}
+
+/// Stores four 32-bit floats into *aligned* memory.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
+/// memory.
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
+ *(p as *mut __m128) = a;
+}
+
+/// Stores four 32-bit floats into memory. There are no restrictions on memory
+/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
+/// faster.
+///
+/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
+ ptr::copy_nonoverlapping(
+ &a as *const __m128 as *const u8,
+ p as *mut u8,
+ mem::size_of::<__m128>(),
+ );
+}
+
+/// Stores four 32-bit floats into *aligned* memory in reverse order.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// *p = a.extract(3);
+/// *p.offset(1) = a.extract(2);
+/// *p.offset(2) = a.extract(1);
+/// *p.offset(3) = a.extract(0);
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
+ let b: __m128 = simd_shuffle4!(a, a, [3, 2, 1, 0]);
+ *(p as *mut __m128) = b;
+}
+
+/// Returns a `__m128` with the first component from `b` and the remaining
+/// components from `a`.
+///
+/// In other words for any `a` and `b`:
+/// ```text
+/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_ss)
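+///
+/// For example (a minimal sketch):
+///
+/// ```rust,ignore
+/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+/// let r = _mm_move_ss(a, b);
+/// // r == _mm_setr_ps(5.0, 2.0, 3.0, 4.0)
+/// ```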
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
+ simd_shuffle4!(a, b, [4, 1, 2, 3])
+}
+
+/// Performs a serializing operation on all store-to-memory instructions that
+/// were issued prior to this instruction.
+///
+/// Guarantees that every store instruction that precedes the fence in program
+/// order is globally visible before any store instruction that follows the
+/// fence in program order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sfence)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(sfence))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sfence() {
+ sfence()
+}
+
+/// Gets the unsigned 32-bit value of the MXCSR control and status register.
+///
+/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getcsr)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(stmxcsr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_getcsr() -> u32 {
+ let mut result = 0_i32;
+ stmxcsr((&mut result) as *mut _ as *mut i8);
+ result as u32
+}
+
+/// Sets the MXCSR register with the 32-bit unsigned integer value.
+///
+/// This register controls how SIMD instructions handle floating point
+/// operations. Modifying this register only affects the current thread.
+///
+/// It contains several groups of flags:
+///
+/// * *Exception flags* report which exceptions occurred since last they were
+/// reset.
+///
+/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
+/// default these flags are all set to 1, so all exceptions are masked. When
+/// an exception is masked, the processor simply sets the exception flag and
+/// continues the operation. If the exception is unmasked, the flag is also
+/// set, but additionally an exception handler is invoked.
+///
+/// * *Rounding mode flags* control the rounding mode of floating point
+/// instructions.
+///
+/// * The *denormals-are-zero mode flag* turns all numbers which would be
+/// denormalized (exponent bits are all zeros) into zeros.
+///
+/// ## Exception Flags
+///
+/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
+/// Infinity by Infinity).
+///
+/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
+/// number. Mainly this can cause loss of precision.
+///
+/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
+///
+/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
+/// result was too large to be represented (e.g., an `f32` with absolute
+/// value greater than `2^128`).
+///
+/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
+/// result was too small to be represented in a normalized way (e.g., an
+/// `f32` with absolute value smaller than `2^-126`).
+///
+/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
+/// precision exception). This means some precision was lost due to rounding.
+/// For example, the fraction `1/3` cannot be represented accurately in a
+/// 32- or 64-bit float and computing it would cause this exception to be
+/// raised. Precision exceptions are very common, so they are usually masked.
+///
+/// Exception flags can be read and set using the convenience functions
+/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
+/// check if an operation caused some overflow:
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
+/// // perform calculations
+/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
+/// // handle overflow
+/// }
+/// ```
+///
+/// ## Masking Flags
+///
+/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
+/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
+/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
+///
+/// A single masking bit can be set via
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
+/// ```
+///
+/// However, since mask bits are by default all set to 1, it is more common to
+/// want to *disable* certain bits. For example, to unmask the underflow
+/// exception, use:
+///
+/// ```rust,ignore
+/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
+/// ```
+///
+/// Warning: an unmasked exception will cause an exception handler to be
+/// called. The standard handler will simply terminate the process. So, in
+/// this case any underflow exception would terminate the current process
+/// with something like `signal: 8, SIGFPE: erroneous arithmetic operation`.
+///
+/// ## Rounding Mode
+///
+/// The rounding mode is described using two bits. It can be read and set using
+/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
+/// `_MM_SET_ROUNDING_MODE(mode)`.
+///
+/// The rounding modes are:
+///
+/// * `_MM_ROUND_NEAREST`: (default) Round to the value closest to the
+/// infinite-precision result. If two values are equally close, round to even
+/// (i.e., the least significant bit will be zero).
+///
+/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
+///
+/// * `_MM_ROUND_UP`: Round toward positive Infinity.
+///
+/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
+///
+/// Example:
+///
+/// ```rust,ignore
+/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
+/// ```
+///
+/// ## Denormals-are-zero/Flush-to-zero Mode
+///
+/// If this bit is set, values that would be denormalized will be set to zero
+/// instead. This is turned off by default.
+///
+/// You can read and enable/disable this mode via the helper functions
+/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
+///
+/// ```rust,ignore
+/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
+/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
+/// ```
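+///
+/// When temporarily changing any of these flags, it is common to save the old
+/// CSR value first and restore it afterwards, as the tests in this module do:
+///
+/// ```rust,ignore
+/// let saved_csr = _mm_getcsr();
+/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+/// // ... perform computations ...
+/// _mm_setcsr(saved_csr); // restore the previous state
+/// ```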
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setcsr)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(ldmxcsr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setcsr(val: u32) {
+ ldmxcsr(&val as *const _ as *const i8);
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
+/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_EXCEPT_MASK: u32 = 0x003f;
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_INVALID: u32 = 0x0080;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_DENORM: u32 = 0x0100;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_INEXACT: u32 = 0x1000;
+/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_MASK_MASK: u32 = 0x1f80;
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_NEAREST: u32 = 0x0000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_DOWN: u32 = 0x2000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_UP: u32 = 0x4000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;
+
+/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_ROUND_MASK: u32 = 0x6000;
+
+/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_MASK)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
+ _mm_getcsr() & _MM_MASK_MASK
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_STATE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
+ _mm_getcsr() & _MM_EXCEPT_MASK
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
+ _mm_getcsr() & _MM_FLUSH_ZERO_MASK
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
+ _mm_getcsr() & _MM_ROUND_MASK
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_MASK)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
+ _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_STATE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
+ _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
+ let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
+ _mm_setcsr(val)
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
+ _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
+}
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_T0: i32 = 3;
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_T1: i32 = 2;
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_T2: i32 = 1;
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_NTA: i32 = 0;
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_ET0: i32 = 7;
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_HINT_ET1: i32 = 6;
+
+/// Fetches the cache line containing address `p` using the given `STRATEGY`.
+///
+/// The `STRATEGY` must be one of:
+///
+/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
+/// cache hierarchy.
+///
+/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
+///
+/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
+/// an implementation-specific choice (e.g., L2 if there is no L3).
+///
+/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
+/// non-temporal access (NTA) hint. The data may be placed somewhere closer
+/// than main memory but outside of the cache hierarchy. This is used to
+/// reduce access latency without polluting the cache.
+///
+/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
+/// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
+/// and `_MM_HINT_T1` but indicate an anticipation to write to the address.
+///
+/// The actual implementation depends on the particular CPU. This instruction
+/// is considered a hint, so the CPU is also free to simply ignore the request.
+///
+/// The amount of prefetched data depends on the cache line size of the
+/// specific CPU, but it will be at least 32 bytes.
+///
+/// Common caveats:
+///
+/// * Most modern CPUs already automatically prefetch data based on predicted
+/// access patterns.
+///
+/// * Data is usually not fetched if this would cause a TLB miss or a page
+/// fault.
+///
+/// * Too much prefetching can cause unnecessary cache evictions.
+///
+/// * Prefetching may also fail if there are not enough memory-subsystem
+/// resources (e.g., request buffers).
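+///
+/// A hedged sketch: prefetch a block further ahead while processing the
+/// current one (`data` and `i` are hypothetical):
+///
+/// ```rust,ignore
+/// _mm_prefetch::<_MM_HINT_T0>(data.as_ptr().add(i + 64) as *const i8);
+/// // ... process the bytes around data[i] ...
+/// ```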
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_prefetch)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
+#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
+#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
+#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
+ // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
+ // `locality` and `rw` are based on our `STRATEGY`.
+ prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
+}
+
+/// Returns a vector of type `__m128` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_undefined_ps() -> __m128 {
+ _mm_set1_ps(0.0)
+}
+
+/// Transposes the 4x4 matrix formed by the 4 rows of `__m128` in place.
+///
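+/// A sketch mirroring the test in this module:
+///
+/// ```rust,ignore
+/// let mut r0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// let mut r1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+/// let mut r2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
+/// let mut r3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
+/// _MM_TRANSPOSE4_PS(&mut r0, &mut r1, &mut r2, &mut r3);
+/// // r0 == [1.0, 5.0, 9.0, 13.0], r1 == [2.0, 6.0, 10.0, 14.0], ...
+/// ```
+///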
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_TRANSPOSE4_PS)
+#[inline]
+#[allow(non_snake_case)]
+#[target_feature(enable = "sse")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _MM_TRANSPOSE4_PS(
+ row0: &mut __m128,
+ row1: &mut __m128,
+ row2: &mut __m128,
+ row3: &mut __m128,
+) {
+ let tmp0 = _mm_unpacklo_ps(*row0, *row1);
+ let tmp2 = _mm_unpacklo_ps(*row2, *row3);
+ let tmp1 = _mm_unpackhi_ps(*row0, *row1);
+ let tmp3 = _mm_unpackhi_ps(*row2, *row3);
+
+ *row0 = _mm_movelh_ps(tmp0, tmp2);
+ *row1 = _mm_movehl_ps(tmp2, tmp0);
+ *row2 = _mm_movelh_ps(tmp1, tmp3);
+ *row3 = _mm_movehl_ps(tmp3, tmp1);
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sse.add.ss"]
+ fn addss(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.sub.ss"]
+ fn subss(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.mul.ss"]
+ fn mulss(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.div.ss"]
+ fn divss(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.sqrt.ss"]
+ fn sqrtss(a: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.sqrt.ps"]
+ fn sqrtps(a: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.rcp.ss"]
+ fn rcpss(a: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.rcp.ps"]
+ fn rcpps(a: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.rsqrt.ss"]
+ fn rsqrtss(a: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.rsqrt.ps"]
+ fn rsqrtps(a: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.min.ss"]
+ fn minss(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.min.ps"]
+ fn minps(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.max.ss"]
+ fn maxss(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.max.ps"]
+ fn maxps(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse.movmsk.ps"]
+ fn movmskps(a: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.cmp.ps"]
+ fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
+ #[link_name = "llvm.x86.sse.comieq.ss"]
+ fn comieq_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.comilt.ss"]
+ fn comilt_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.comile.ss"]
+ fn comile_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.comigt.ss"]
+ fn comigt_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.comige.ss"]
+ fn comige_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.comineq.ss"]
+ fn comineq_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.ucomieq.ss"]
+ fn ucomieq_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.ucomilt.ss"]
+ fn ucomilt_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.ucomile.ss"]
+ fn ucomile_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.ucomigt.ss"]
+ fn ucomigt_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.ucomige.ss"]
+ fn ucomige_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.ucomineq.ss"]
+ fn ucomineq_ss(a: __m128, b: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.cvtss2si"]
+ fn cvtss2si(a: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.cvttss2si"]
+ fn cvttss2si(a: __m128) -> i32;
+ #[link_name = "llvm.x86.sse.cvtsi2ss"]
+ fn cvtsi2ss(a: __m128, b: i32) -> __m128;
+ #[link_name = "llvm.x86.sse.sfence"]
+ fn sfence();
+ #[link_name = "llvm.x86.sse.stmxcsr"]
+ fn stmxcsr(p: *mut i8);
+ #[link_name = "llvm.x86.sse.ldmxcsr"]
+ fn ldmxcsr(p: *const i8);
+ #[link_name = "llvm.prefetch"]
+ fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
+ #[link_name = "llvm.x86.sse.cmp.ss"]
+ fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
+}
+
+/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
+///
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception _may_ be generated.
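+///
+/// A hedged sketch using a 16-byte-aligned buffer (as in the test below):
+///
+/// ```rust,ignore
+/// #[repr(align(16))]
+/// struct Aligned([f32; 4]);
+///
+/// let mut out = Aligned([0.0; 4]);
+/// _mm_stream_ps(out.0.as_mut_ptr(), _mm_set1_ps(7.0));
+/// _mm_sfence(); // order the weakly-ordered store before later accesses
+/// ```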
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(movntps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::{hint::black_box, mem::transmute};
+ use std::{boxed, f32::NAN};
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::{simd::*, x86::*};
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_add_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_add_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_add_ss() {
+ let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_add_ss(a, b);
+ assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_sub_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_sub_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_sub_ss() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_sub_ss(a, b);
+ assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_mul_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_mul_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_mul_ss() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_mul_ss(a, b);
+ assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_div_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
+ let r = _mm_div_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_div_ss() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_div_ss(a, b);
+ assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_sqrt_ss() {
+ let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+ let r = _mm_sqrt_ss(a);
+ let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_sqrt_ps() {
+ let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+ let r = _mm_sqrt_ps(a);
+ let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_rcp_ss() {
+ let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+ let r = _mm_rcp_ss(a);
+ let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_rcp_ps() {
+ let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+ let r = _mm_rcp_ps(a);
+ let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
+ let rel_err = 0.00048828125;
+ for i in 0..4 {
+ assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_rsqrt_ss() {
+ let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+ let r = _mm_rsqrt_ss(a);
+ let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
+ let rel_err = 0.00048828125;
+ for i in 0..4 {
+ assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_rsqrt_ps() {
+ let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
+ let r = _mm_rsqrt_ps(a);
+ let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
+ let rel_err = 0.00048828125;
+ for i in 0..4 {
+ assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_min_ss() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_min_ss(a, b);
+ assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_min_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_min_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
+
+ // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
+ // is lowered by the llvm codegen backend to the `llvm.minnum.v*` llvm intrinsic. That
+ // intrinsic doesn't specify how -0.0 is handled, and unfortunately it happens to behave
+ // differently from the x86 `minps` instruction: `llvm.minnum.v*` would make `r1` equal
+ // `a` and `r2` equal `b`, whereas `minps` returns its second operand, as asserted below.
+ let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
+ let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
+ let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_max_ss() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_max_ss(a, b);
+ assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_max_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_max_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_and_ps() {
+ let a = transmute(u32x4::splat(0b0011));
+ let b = transmute(u32x4::splat(0b0101));
+ let r = _mm_and_ps(*black_box(&a), *black_box(&b));
+ let e = transmute(u32x4::splat(0b0001));
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_andnot_ps() {
+ let a = transmute(u32x4::splat(0b0011));
+ let b = transmute(u32x4::splat(0b0101));
+ let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
+ let e = transmute(u32x4::splat(0b0100));
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_or_ps() {
+ let a = transmute(u32x4::splat(0b0011));
+ let b = transmute(u32x4::splat(0b0101));
+ let r = _mm_or_ps(*black_box(&a), *black_box(&b));
+ let e = transmute(u32x4::splat(0b0111));
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_xor_ps() {
+ let a = transmute(u32x4::splat(0b0011));
+ let b = transmute(u32x4::splat(0b0101));
+ let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
+ let e = transmute(u32x4::splat(0b0110));
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpeq_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
+ let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
+ let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
+ assert_eq!(r, e);
+
+ let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
+ let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
+ assert_eq!(r2, e2);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmplt_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = 0u32; // a.extract(0) < b.extract(0)
+ let c1 = 0u32; // a.extract(0) < c.extract(0)
+ let d1 = !0u32; // a.extract(0) < d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmple_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = 0u32; // a.extract(0) <= b.extract(0)
+ let c1 = !0u32; // a.extract(0) <= c.extract(0)
+ let d1 = !0u32; // a.extract(0) <= d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpgt_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = !0u32; // a.extract(0) > b.extract(0)
+ let c1 = 0u32; // a.extract(0) > c.extract(0)
+ let d1 = 0u32; // a.extract(0) > d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpge_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = !0u32; // a.extract(0) >= b.extract(0)
+ let c1 = !0u32; // a.extract(0) >= c.extract(0)
+ let d1 = 0u32; // a.extract(0) >= d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpneq_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = !0u32; // a.extract(0) != b.extract(0)
+ let c1 = 0u32; // a.extract(0) != c.extract(0)
+ let d1 = !0u32; // a.extract(0) != d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpnlt_ss() {
+ // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
+ // must be a difference. It may have to do with behavior in the
+ // presence of NaNs (signaling or quiet). If so, we should add tests
+ // for those.
+
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = !0u32; // a.extract(0) >= b.extract(0)
+ let c1 = !0u32; // a.extract(0) >= c.extract(0)
+ let d1 = 0u32; // a.extract(0) >= d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpnle_ss() {
+ // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
+ // must be a difference. It may have to do with behavior in the
+ // presence of NaNs (signaling or quiet). If so, we should add tests
+ // for those.
+
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = !0u32; // a.extract(0) > b.extract(0)
+ let c1 = 0u32; // a.extract(0) > c.extract(0)
+ let d1 = 0u32; // a.extract(0) > d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpngt_ss() {
+ // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
+ // must be a difference. It may have to do with behavior in the
+ // presence of NaNs (signaling or quiet). If so, we should add tests
+ // for those.
+
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = 0u32; // a.extract(0) <= b.extract(0)
+ let c1 = !0u32; // a.extract(0) <= c.extract(0)
+ let d1 = !0u32; // a.extract(0) <= d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpnge_ss() {
+ // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
+ // must be a difference. It may have to do with behavior in the
+ // presence of NaNs (signaling or quiet). If so, we should add tests
+ // for those.
+
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = 0u32; // a.extract(0) < b.extract(0)
+ let c1 = 0u32; // a.extract(0) < c.extract(0)
+ let d1 = !0u32; // a.extract(0) < d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpord_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = !0u32; // a.extract(0) ord b.extract(0)
+ let c1 = 0u32; // a.extract(0) ord c.extract(0)
+ let d1 = !0u32; // a.extract(0) ord d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpunord_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
+ let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
+ let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
+
+ let b1 = 0u32; // a.extract(0) unord b.extract(0)
+ let c1 = !0u32; // a.extract(0) unord c.extract(0)
+ let d1 = 0u32; // a.extract(0) unord d.extract(0)
+
+ let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
+ let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ assert_eq!(rb, eb);
+
+ let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
+ let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ assert_eq!(rc, ec);
+
+ let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
+ let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ assert_eq!(rd, ed);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpeq_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(fls, fls, tru, fls);
+ let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmplt_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(tru, fls, fls, fls);
+ let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmple_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(tru, fls, tru, fls);
+ let r: u32x4 = transmute(_mm_cmple_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpgt_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(fls, tru, fls, fls);
+ let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpge_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(fls, tru, tru, fls);
+ let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpneq_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(tru, tru, fls, tru);
+ let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpnlt_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(fls, tru, tru, tru);
+ let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpnle_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(fls, tru, fls, tru);
+ let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpngt_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(tru, fls, tru, tru);
+ let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpnge_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
+ let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(tru, fls, fls, tru);
+ let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpord_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
+ let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(tru, fls, fls, fls);
+ let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cmpunord_ps() {
+ let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
+ let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
+ let tru = !0u32;
+ let fls = 0u32;
+
+ let e = u32x4::new(fls, tru, tru, tru);
+ let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_comieq_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[1i32, 0, 0, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_comieq_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_comilt_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[0i32, 1, 0, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_comilt_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_comile_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[1i32, 1, 0, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_comile_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_comigt_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[0i32, 0, 1, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_comigt_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_comineq_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[0i32, 1, 1, 1];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_comineq_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_ucomieq_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[1i32, 0, 0, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_ucomieq_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_ucomilt_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[0i32, 1, 0, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_ucomilt_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_ucomile_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[1i32, 1, 0, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_ucomile_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_ucomigt_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[0i32, 0, 1, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_ucomigt_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_ucomige_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[1i32, 0, 1, 0];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_ucomige_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_ucomineq_ss() {
+ let aa = &[3.0f32, 12.0, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, 1.5, NAN];
+
+ let ee = &[0i32, 1, 1, 1];
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ let r = _mm_ucomineq_ss(a, b);
+
+ assert_eq!(
+ ee[i], r,
+ "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r, ee[i], i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
+ // If one of the arguments is a quiet NaN, `comieq_ss` should signal an
+ // Invalid Operation Exception while `ucomieq_ss` should not.
+ let aa = &[3.0f32, NAN, 23.0, NAN];
+ let bb = &[3.0f32, 47.5, NAN, NAN];
+
+ let ee = &[1i32, 0, 0, 0];
+ let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?
+
+ for i in 0..4 {
+ let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+ let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+
+ _MM_SET_EXCEPTION_STATE(0);
+ let r1 = _mm_comieq_ss(*black_box(&a), b);
+ let s1 = _MM_GET_EXCEPTION_STATE();
+
+ _MM_SET_EXCEPTION_STATE(0);
+ let r2 = _mm_ucomieq_ss(*black_box(&a), b);
+ let s2 = _MM_GET_EXCEPTION_STATE();
+
+ assert_eq!(
+ ee[i], r1,
+ "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r1, ee[i], i
+ );
+ assert_eq!(
+ ee[i], r2,
+ "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
+ a, b, r2, ee[i], i
+ );
+ assert_eq!(
+ s1,
+ exc[i] * _MM_EXCEPT_INVALID,
+ "_mm_comieq_ss() set exception flags: {} (i={})",
+ s1,
+ i
+ );
+ assert_eq!(
+ s2,
+ 0, // ucomieq_ss should not signal an exception
+ "_mm_ucomieq_ss() set exception flags: {} (i={})",
+ s2,
+ i
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cvtss_si32() {
+ let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
+ let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
+ for i in 0..inputs.len() {
+ let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
+ let e = result[i];
+ let r = _mm_cvtss_si32(x);
+ assert_eq!(
+ e, r,
+ "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
+ i, x, r, e
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cvttss_si32() {
+ let inputs = &[
+ (42.0f32, 42i32),
+ (-31.4, -31),
+ (-33.5, -33),
+ (-34.5, -34),
+ (10.999, 10),
+ (-5.99, -5),
+ (4.0e10, i32::MIN),
+ (4.0e-10, 0),
+ (NAN, i32::MIN),
+ (2147483500.1, 2147483520),
+ ];
+ for i in 0..inputs.len() {
+ let (xi, e) = inputs[i];
+ let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
+ let r = _mm_cvttss_si32(x);
+ assert_eq!(
+ e, r,
+ "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
+ i, x, r, e
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cvtsi32_ss() {
+ let inputs = &[
+ (4555i32, 4555.0f32),
+ (322223333, 322223330.0),
+ (-432, -432.0),
+ (-322223333, -322223330.0),
+ ];
+
+ for i in 0..inputs.len() {
+ let (x, f) = inputs[i];
+ let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+ let r = _mm_cvtsi32_ss(a, x);
+ let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
+ assert_eq_m128(e, r);
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cvtss_f32() {
+ let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
+ assert_eq!(_mm_cvtss_f32(a), 312.0134);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_set_ss() {
+ let r = _mm_set_ss(black_box(4.25));
+ assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_set1_ps() {
+ let r1 = _mm_set1_ps(black_box(4.25));
+ let r2 = _mm_set_ps1(black_box(4.25));
+ assert_eq!(get_m128(r1, 0), 4.25);
+ assert_eq!(get_m128(r1, 1), 4.25);
+ assert_eq!(get_m128(r1, 2), 4.25);
+ assert_eq!(get_m128(r1, 3), 4.25);
+ assert_eq!(get_m128(r2, 0), 4.25);
+ assert_eq!(get_m128(r2, 1), 4.25);
+ assert_eq!(get_m128(r2, 2), 4.25);
+ assert_eq!(get_m128(r2, 3), 4.25);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_set_ps() {
+ let r = _mm_set_ps(
+ black_box(1.0),
+ black_box(2.0),
+ black_box(3.0),
+ black_box(4.0),
+ );
+ assert_eq!(get_m128(r, 0), 4.0);
+ assert_eq!(get_m128(r, 1), 3.0);
+ assert_eq!(get_m128(r, 2), 2.0);
+ assert_eq!(get_m128(r, 3), 1.0);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_setr_ps() {
+ let r = _mm_setr_ps(
+ black_box(1.0),
+ black_box(2.0),
+ black_box(3.0),
+ black_box(4.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_setzero_ps() {
+ let r = *black_box(&_mm_setzero_ps());
+ assert_eq_m128(r, _mm_set1_ps(0.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_shuffle() {
+ assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
+ assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
+ assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_shuffle_ps() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+ let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
+ assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_unpackhi_ps() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+ let r = _mm_unpackhi_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_unpacklo_ps() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+ let r = _mm_unpacklo_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_movehl_ps() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+ let r = _mm_movehl_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_movelh_ps() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+ let r = _mm_movelh_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_load_ss() {
+ let a = 42.0f32;
+ let r = _mm_load_ss(&a as *const f32);
+ assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_load1_ps() {
+ let a = 42.0f32;
+ let r = _mm_load1_ps(&a as *const f32);
+ assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_load_ps() {
+ let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+
+ let mut p = vals.as_ptr();
+ let mut fixup = 0.0f32;
+
+ // Make sure p is aligned, otherwise we might get a
+ // (signal: 11, SIGSEGV: invalid memory reference)
+
+ let unalignment = (p as usize) & 0xf;
+ if unalignment != 0 {
+ let delta = ((16 - unalignment) >> 2) as isize;
+ fixup = delta as f32;
+ p = p.offset(delta);
+ }
+
+ let r = _mm_load_ps(p);
+ let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_loadu_ps() {
+ let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+ let p = vals.as_ptr().offset(3);
+ let r = _mm_loadu_ps(black_box(p));
+ assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_loadr_ps() {
+ let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+
+ let mut p = vals.as_ptr();
+ let mut fixup = 0.0f32;
+
+ // Make sure p is aligned, otherwise we might get a
+ // (signal: 11, SIGSEGV: invalid memory reference)
+
+ let unalignment = (p as usize) & 0xf;
+ if unalignment != 0 {
+ let delta = ((16 - unalignment) >> 2) as isize;
+ fixup = delta as f32;
+ p = p.offset(delta);
+ }
+
+ let r = _mm_loadr_ps(p);
+ let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_loadu_si64() {
+ let a = _mm_setr_epi64x(5, 6);
+ let r = _mm_loadu_si64(&a as *const _ as *const _);
+ assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_store_ss() {
+ let mut vals = [0.0f32; 8];
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ _mm_store_ss(vals.as_mut_ptr().offset(1), a);
+
+ assert_eq!(vals[0], 0.0);
+ assert_eq!(vals[1], 1.0);
+ assert_eq!(vals[2], 0.0);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_store1_ps() {
+ let mut vals = [0.0f32; 8];
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+ let mut ofs = 0;
+ let mut p = vals.as_mut_ptr();
+
+ if (p as usize) & 0xf != 0 {
+ ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ p = p.add(ofs);
+ }
+
+ _mm_store1_ps(p, *black_box(&a));
+
+ if ofs > 0 {
+ assert_eq!(vals[ofs - 1], 0.0);
+ }
+ assert_eq!(vals[ofs + 0], 1.0);
+ assert_eq!(vals[ofs + 1], 1.0);
+ assert_eq!(vals[ofs + 2], 1.0);
+ assert_eq!(vals[ofs + 3], 1.0);
+ assert_eq!(vals[ofs + 4], 0.0);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_store_ps() {
+ let mut vals = [0.0f32; 8];
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+ let mut ofs = 0;
+ let mut p = vals.as_mut_ptr();
+
+ // Align p to 16-byte boundary
+ if (p as usize) & 0xf != 0 {
+ ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ p = p.add(ofs);
+ }
+
+ _mm_store_ps(p, *black_box(&a));
+
+ if ofs > 0 {
+ assert_eq!(vals[ofs - 1], 0.0);
+ }
+ assert_eq!(vals[ofs + 0], 1.0);
+ assert_eq!(vals[ofs + 1], 2.0);
+ assert_eq!(vals[ofs + 2], 3.0);
+ assert_eq!(vals[ofs + 3], 4.0);
+ assert_eq!(vals[ofs + 4], 0.0);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_storer_ps() {
+ let mut vals = [0.0f32; 8];
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+ let mut ofs = 0;
+ let mut p = vals.as_mut_ptr();
+
+ // Align p to 16-byte boundary
+ if (p as usize) & 0xf != 0 {
+ ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ p = p.add(ofs);
+ }
+
+ _mm_storer_ps(p, *black_box(&a));
+
+ if ofs > 0 {
+ assert_eq!(vals[ofs - 1], 0.0);
+ }
+ assert_eq!(vals[ofs + 0], 4.0);
+ assert_eq!(vals[ofs + 1], 3.0);
+ assert_eq!(vals[ofs + 2], 2.0);
+ assert_eq!(vals[ofs + 3], 1.0);
+ assert_eq!(vals[ofs + 4], 0.0);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_storeu_ps() {
+ let mut vals = [0.0f32; 8];
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+ let mut ofs = 0;
+ let mut p = vals.as_mut_ptr();
+
+ // Make sure p is **not** aligned to 16-byte boundary
+ if (p as usize) & 0xf == 0 {
+ ofs = 1;
+ p = p.offset(1);
+ }
+
+ _mm_storeu_ps(p, *black_box(&a));
+
+ if ofs > 0 {
+ assert_eq!(vals[ofs - 1], 0.0);
+ }
+ assert_eq!(vals[ofs + 0], 1.0);
+ assert_eq!(vals[ofs + 1], 2.0);
+ assert_eq!(vals[ofs + 2], 3.0);
+ assert_eq!(vals[ofs + 3], 4.0);
+ assert_eq!(vals[ofs + 4], 0.0);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_move_ss() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+ let r = _mm_move_ss(a, b);
+ let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
+ assert_eq_m128(e, r);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_movemask_ps() {
+ let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
+ assert_eq!(r, 0b0101);
+
+ let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
+ assert_eq!(r, 0b0111);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_sfence() {
+ _mm_sfence();
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_getcsr_setcsr_1() {
+ let saved_csr = _mm_getcsr();
+
+ let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
+ let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
+
+ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+ let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
+
+ _mm_setcsr(saved_csr);
+
+ let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
+ assert_eq_m128(r, exp); // the denormal result was flushed to zero
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_getcsr_setcsr_2() {
+ // Same as the test_mm_getcsr_setcsr_1 test, but with the opposite flag value.
+
+ let saved_csr = _mm_getcsr();
+
+ let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
+ let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
+
+ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
+ let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
+
+ _mm_setcsr(saved_csr);
+
+ let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
+ assert_eq_m128(r, exp); // first component is a denormalized f32
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_getcsr_setcsr_underflow() {
+ _MM_SET_EXCEPTION_STATE(0);
+
+ let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
+ let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);
+
+ assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure
+
+ let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
+
+ let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
+ assert_eq_m128(r, exp);
+
+ let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0;
+ assert!(underflow);
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_MM_TRANSPOSE4_PS() {
+ let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+ let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
+ let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
+
+ _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
+
+ assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
+ assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
+ assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
+ assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
+ }
+
+ #[repr(align(16))]
+ struct Memory {
+ pub data: [f32; 4],
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_stream_ps() {
+ let a = _mm_set1_ps(7.0);
+ let mut mem = Memory { data: [-1.0; 4] };
+
+ _mm_stream_ps(&mut mem.data[0] as *mut f32, a);
+ for i in 0..4 {
+ assert_eq!(mem.data[i], get_m128(a, i));
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
new file mode 100644
index 000000000..5a9120042
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -0,0 +1,4886 @@
+//! Streaming SIMD Extensions 2 (SSE2)
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ intrinsics,
+ mem::{self, transmute},
+ ptr,
+};
+
+/// Provides a hint to the processor that the code sequence is a spin-wait loop.
+///
+/// This can help improve the performance and power consumption of spin-wait
+/// loops.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_pause)
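+///
+/// An illustrative spin-wait sketch (not from the original docs; the bounded
+/// counter stands in for "resource not yet ready"):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::_mm_pause;
+///     let mut spins = 0;
+///     while spins < 8 {
+///         // Hint that this is a busy-wait loop so the CPU can save power
+///         // and yield pipeline resources to a sibling hardware thread.
+///         unsafe { _mm_pause() };
+///         spins += 1;
+///     }
+/// }
+/// ```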
+#[inline]
+#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_pause() {
+ // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
+ // SSE2 support; therefore this intrinsic does not require any target features
+ pause()
+}
+
+/// Invalidates and flushes the cache line that contains `p` from all levels of
+/// the cache hierarchy.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clflush)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(clflush))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_clflush(p: *const u8) {
+ clflush(p)
+}
+
+/// Performs a serializing operation on all load-from-memory instructions
+/// that were issued prior to this instruction.
+///
+/// Guarantees that every load instruction that precedes, in program order,
+/// the load fence instruction is globally visible before any load instruction
+/// which follows the fence in program order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lfence)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(lfence))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_lfence() {
+ lfence()
+}
+
+/// Performs a serializing operation on all load-from-memory and store-to-memory
+/// instructions that were issued prior to this instruction.
+///
+/// Guarantees that every memory access that precedes, in program order, the
+/// memory fence instruction is globally visible before any memory instruction
+/// which follows the fence in program order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mfence)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(mfence))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mfence() {
+ mfence()
+}
+
+/// Adds packed 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_add(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Adds packed 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_add(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Adds packed 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_add(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Adds packed 64-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_add(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Adds packed 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8)
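+///
+/// An illustrative sketch of the saturating behavior, assuming an `x86_64`
+/// target where SSE2 is always available:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_set1_epi8(i8::MAX); // every lane is 127
+///         let b = _mm_set1_epi8(1);
+///         // 127 + 1 saturates to 127 instead of wrapping to -128.
+///         let r = _mm_adds_epi8(a, b);
+///         let eq = _mm_cmpeq_epi8(r, _mm_set1_epi8(i8::MAX));
+///         assert_eq!(_mm_movemask_epi8(eq), 0xFFFF);
+///     }
+/// }
+/// ```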
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Adds packed 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(paddusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Averages packed unsigned 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pavgb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pavgb(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Averages packed unsigned 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_epu16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pavgw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pavgw(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`.
+///
+/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
+/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
+/// intermediate 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd_epi16)
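+///
+/// A worked sketch of the multiply-then-pairwise-add (illustrative; assumes
+/// an `x86_64` target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+///         let b = _mm_setr_epi16(1, 1, 2, 2, 3, 3, 4, 4);
+///         // 32-bit lane 0 is 1*1 + 2*1 = 3; lane 1 is 3*2 + 4*2 = 14; ...
+///         let r = _mm_madd_epi16(a, b);
+///         assert_eq!(_mm_cvtsi128_si32(r), 3);
+///     }
+/// }
+/// ```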
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmaddwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmaddwd(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmaxsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmaxsw(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
+/// packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmaxub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmaxub(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pminsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pminsw(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
+/// packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pminub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pminub(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`.
+///
+/// The multiplication produces intermediate 32-bit integers, and returns the
+/// high 16 bits of the intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epi16)
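+///
+/// An illustrative sketch of taking the high half of the product (assumes an
+/// `x86_64` target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         // 30000 * 30000 = 900_000_000 = 0x35A4_E900, so the high 16 bits
+///         // of each product are 0x35A4.
+///         let a = _mm_set1_epi16(30000);
+///         let r = _mm_mulhi_epi16(a, a);
+///         assert_eq!(_mm_extract_epi16::<0>(r), 0x35A4);
+///     }
+/// }
+/// ```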
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmulhw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmulhw(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
+///
+/// The multiplication produces intermediate 32-bit integers, and returns the
+/// high 16 bits of the intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmulhuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmulhuw(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`.
+///
+/// The multiplication produces intermediate 32-bit integers, and returns the
+/// low 16 bits of the intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmullw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_mul(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
+/// in `a` and `b`.
+///
+/// Returns the unsigned 64-bit results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epu32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmuludq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmuludq(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Sums the absolute differences of packed unsigned 8-bit integers.
+///
+/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
+/// and `b`, then horizontally sums each group of 8 consecutive differences to
+/// produce two unsigned 16-bit integers, and packs these unsigned 16-bit
+/// integers into the low 16 bits of the two 64-bit elements returned.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8)
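+///
+/// A small worked sketch (illustrative; assumes an `x86_64` target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_set1_epi8(5);
+///         let b = _mm_set1_epi8(2);
+///         // |5 - 2| = 3 in each of the 16 lanes; each group of 8 sums to 24.
+///         let r = _mm_sad_epu8(a, b);
+///         assert_eq!(_mm_cvtsi128_si32(r), 24); // low 64-bit element
+///         assert_eq!(_mm_extract_epi16::<4>(r), 24); // high 64-bit element
+///     }
+/// }
+/// ```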
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psadbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(psadbw(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_sub(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_sub(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_sub(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_sub(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`
+/// using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`
+/// using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
+/// integers in `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epu8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
+/// integers in `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epu16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psubusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128)
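+///
+/// An illustrative sketch of the byte-wise shift (assumes an `x86_64` target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+///         // Shift left by 4 bytes: bytes 0..=3 become zero and byte 4 now
+///         // holds the old byte 0.
+///         let r = _mm_slli_si128::<4>(a);
+///         assert_eq!(_mm_cvtsi128_si32(r), 0);
+///         assert_eq!(_mm_extract_epi16::<2>(r), 0x0100); // old bytes (0, 1)
+///     }
+/// }
+/// ```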
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ _mm_slli_si128_impl::<IMM8>(a)
+}
+
+/// Implementation detail: converts the immediate argument of the
+/// `_mm_slli_si128` intrinsic into a compile-time constant.
+#[inline]
+#[target_feature(enable = "sse2")]
+unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
+ const fn mask(shift: i32, i: u32) -> u32 {
+ let shift = shift as u32 & 0xff;
+ if shift > 15 {
+ i
+ } else {
+ 16 - shift + i
+ }
+ }
+ let zero = _mm_set1_epi8(0).as_i8x16();
+ transmute::<i8x16, _>(simd_shuffle16!(
+ zero,
+ a.as_i8x16(),
+ <const IMM8: i32> [
+ mask(IMM8, 0),
+ mask(IMM8, 1),
+ mask(IMM8, 2),
+ mask(IMM8, 3),
+ mask(IMM8, 4),
+ mask(IMM8, 5),
+ mask(IMM8, 6),
+ mask(IMM8, 7),
+ mask(IMM8, 8),
+ mask(IMM8, 9),
+ mask(IMM8, 10),
+ mask(IMM8, 11),
+ mask(IMM8, 12),
+ mask(IMM8, 13),
+ mask(IMM8, 14),
+ mask(IMM8, 15),
+ ],
+ ))
+}
+
+/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ _mm_slli_si128_impl::<IMM8>(a)
+}
+
+/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ _mm_srli_si128_impl::<IMM8>(a)
+}
+
+/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(pslliw(a.as_i16x8(), IMM8))
+}
+
+/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psllw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psllw(a.as_i16x8(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(psllid(a.as_i32x4(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pslld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(pslld(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(pslliq(a.as_i64x2(), IMM8))
+}
+
+/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psllq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psllq(a.as_i64x2(), count.as_i64x2()))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
+/// bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16)
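+///
+/// An illustrative sketch of the sign-preserving shift (assumes an `x86_64`
+/// target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_set1_epi16(-16);
+///         // Arithmetic shift keeps the sign bit: -16 >> 2 == -4 in each lane.
+///         let r = _mm_srai_epi16::<2>(a);
+///         let eq = _mm_cmpeq_epi16(r, _mm_set1_epi16(-4));
+///         assert_eq!(_mm_movemask_epi8(eq), 0xFFFF);
+///     }
+/// }
+/// ```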
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(psraiw(a.as_i16x8(), IMM8))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
+/// bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psraw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psraw(a.as_i16x8(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
+/// bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(psraid(a.as_i32x4(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
+/// bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrad))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psrad(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ _mm_srli_si128_impl::<IMM8>(a)
+}
+
+/// Implementation detail: converts the immediate argument of the
+/// `_mm_srli_si128` intrinsic into a compile-time constant.
+#[inline]
+#[target_feature(enable = "sse2")]
+unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
+ const fn mask(shift: i32, i: u32) -> u32 {
+ if (shift as u32) > 15 {
+ i + 16
+ } else {
+ i + (shift as u32)
+ }
+ }
+ let zero = _mm_set1_epi8(0).as_i8x16();
+ let x: i8x16 = simd_shuffle16!(
+ a.as_i8x16(),
+ zero,
+ <const IMM8: i32> [
+ mask(IMM8, 0),
+ mask(IMM8, 1),
+ mask(IMM8, 2),
+ mask(IMM8, 3),
+ mask(IMM8, 4),
+ mask(IMM8, 5),
+ mask(IMM8, 6),
+ mask(IMM8, 7),
+ mask(IMM8, 8),
+ mask(IMM8, 9),
+ mask(IMM8, 10),
+ mask(IMM8, 11),
+ mask(IMM8, 12),
+ mask(IMM8, 13),
+ mask(IMM8, 14),
+ mask(IMM8, 15),
+ ],
+ );
+ transmute(x)
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(psrliw(a.as_i16x8(), IMM8))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psrlw(a.as_i16x8(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(psrlid(a.as_i32x4(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psrld(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(psrliq(a.as_i64x2(), IMM8))
+}
+
+/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(psrlq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psrlq(a.as_i64x2(), count.as_i64x2()))
+}
+
+/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
+ simd_and(a, b)
+}
+
+/// Computes the bitwise NOT of 128 bits (representing integer data) in `a`
+/// and then ANDs the result with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_si128)
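+///
+/// An illustrative sketch; note the operand order, since the result is
+/// `(!a) & b` rather than `a & !b` (assumes an `x86_64` target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_set1_epi32(0b1100);
+///         let b = _mm_set1_epi32(0b1010);
+///         // (!0b1100) & 0b1010 == 0b0010 in every 32-bit lane.
+///         let r = _mm_andnot_si128(a, b);
+///         let eq = _mm_cmpeq_epi32(r, _mm_set1_epi32(0b0010));
+///         assert_eq!(_mm_movemask_epi8(eq), 0xFFFF);
+///     }
+/// }
+/// ```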
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
+ simd_and(simd_xor(_mm_set1_epi8(-1), a), b)
+}
+
+/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(orps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
+ simd_or(a, b)
+}
+
+/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
+ simd_xor(a, b)
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpeqd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pcmpgtd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Converts the lower two packed 32-bit integers in `a` to packed
+/// double-precision (64-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtdq2pd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
+ let a = a.as_i32x4();
+ simd_cast::<i32x2, __m128d>(simd_shuffle2!(a, a, [0, 1]))
+}
+
+/// Returns `a` with its lower element replaced by `b` after converting it to
+/// an `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
+ simd_insert(a, 0, b as f64)
+}
+
+/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtdq2ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
+ cvtdq2ps(a.as_i32x4())
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a`
+/// to packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtps2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i {
+ transmute(cvtps2dq(a))
+}
+
+/// Returns a vector whose lowest element is `a` and all higher elements are
+/// `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i {
+ transmute(i32x4::new(a, 0, 0, 0))
+}
+
+/// Returns the lowest element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
+ simd_extract(a.as_i32x4(), 0)
+}
+
+/// Sets packed 64-bit integers with the supplied values, from highest to
+/// lowest.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
+ transmute(i64x2::new(e0, e1))
+}
+
+/// Sets packed 32-bit integers with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi32)
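+///
+/// A lane-order sketch (illustrative; assumes an `x86_64` target): the first
+/// argument lands in the highest lane, the opposite of `_mm_setr_epi32`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_set_epi32(3, 2, 1, 0);
+///         // Lane 0 (the lowest element) holds the last argument.
+///         assert_eq!(_mm_cvtsi128_si32(a), 0);
+///     }
+/// }
+/// ```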
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
+ transmute(i32x4::new(e0, e1, e2, e3))
+}
+
+/// Sets packed 16-bit integers with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_epi16(
+ e7: i16,
+ e6: i16,
+ e5: i16,
+ e4: i16,
+ e3: i16,
+ e2: i16,
+ e1: i16,
+ e0: i16,
+) -> __m128i {
+ transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
+}
+
+/// Sets packed 8-bit integers with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_epi8(
+ e15: i8,
+ e14: i8,
+ e13: i8,
+ e12: i8,
+ e11: i8,
+ e10: i8,
+ e9: i8,
+ e8: i8,
+ e7: i8,
+ e6: i8,
+ e5: i8,
+ e4: i8,
+ e3: i8,
+ e2: i8,
+ e1: i8,
+ e0: i8,
+) -> __m128i {
+ #[rustfmt::skip]
+ transmute(i8x16::new(
+ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
+ ))
+}
+
+/// Broadcasts 64-bit integer `a` to all elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i {
+ _mm_set_epi64x(a, a)
+}
+
+/// Broadcasts 32-bit integer `a` to all elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i {
+ _mm_set_epi32(a, a, a, a)
+}
+
+/// Broadcasts 16-bit integer `a` to all elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i {
+ _mm_set_epi16(a, a, a, a, a, a, a, a)
+}
+
+/// Broadcasts 8-bit integer `a` to all elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i {
+ _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
+}
+
+/// Sets packed 32-bit integers with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
+ _mm_set_epi32(e0, e1, e2, e3)
+}
+
+/// Sets packed 16-bit integers with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_epi16(
+ e7: i16,
+ e6: i16,
+ e5: i16,
+ e4: i16,
+ e3: i16,
+ e2: i16,
+ e1: i16,
+ e0: i16,
+) -> __m128i {
+ _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
+}
+
+/// Sets packed 8-bit integers with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+// no particular instruction to test
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_epi8(
+ e15: i8,
+ e14: i8,
+ e13: i8,
+ e12: i8,
+ e11: i8,
+ e10: i8,
+ e9: i8,
+ e8: i8,
+ e7: i8,
+ e6: i8,
+ e5: i8,
+ e4: i8,
+ e3: i8,
+ e2: i8,
+ e1: i8,
+ e0: i8,
+) -> __m128i {
+ #[rustfmt::skip]
+ _mm_set_epi8(
+ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
+ )
+}
+
+/// Returns a vector with all elements set to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setzero_si128() -> __m128i {
+ _mm_set1_epi64x(0)
+}
+
+/// Loads a 64-bit integer from memory into the first element of the returned
+/// vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+// FIXME movsd on windows
+#[cfg_attr(
+ all(
+ test,
+ not(windows),
+ not(all(target_os = "linux", target_arch = "x86_64")),
+ target_arch = "x86_64"
+ ),
+ assert_instr(movq)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
+ _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
+}
+
+/// Loads 128-bits of integer data from memory into a new vector.
+///
+/// `mem_addr` must be aligned on a 16-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
+ *mem_addr
+}
+
+/// Loads 128-bits of integer data from memory into a new vector.
+///
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si128)
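+///
+/// An illustrative sketch of loading from a deliberately misaligned buffer
+/// (assumes an `x86_64` target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let bytes = [1u8; 17];
+///         // Starting at offset 1 is fine: `loadu` accepts any address.
+///         let p = bytes[1..].as_ptr() as *const __m128i;
+///         let v = _mm_loadu_si128(p);
+///         let eq = _mm_cmpeq_epi8(v, _mm_set1_epi8(1));
+///         assert_eq!(_mm_movemask_epi8(eq), 0xFFFF);
+///     }
+/// }
+/// ```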
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
+ let mut dst: __m128i = _mm_undefined_si128();
+ ptr::copy_nonoverlapping(
+ mem_addr as *const u8,
+ &mut dst as *mut __m128i as *mut u8,
+ mem::size_of::<__m128i>(),
+ );
+ dst
+}
+
+/// Conditionally stores 8-bit integer elements from `a` into memory using
+/// `mask`.
+///
+/// Elements are not stored when the highest bit of the corresponding `mask`
+/// element is not set.
+///
+/// `mem_addr` should correspond to a 128-bit memory location and does not need
+/// to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128)
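+///
+/// An illustrative sketch of the masked store (assumes an `x86_64` target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let src = _mm_set1_epi8(7);
+///         // Only lanes whose mask byte has the high bit set are written.
+///         let mask = _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
+///         let mut out = [0i8; 16];
+///         _mm_maskmoveu_si128(src, mask, out.as_mut_ptr());
+///         assert_eq!(out[0], 7); // written
+///         assert_eq!(out[1], 0); // left untouched
+///     }
+/// }
+/// ```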
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(maskmovdqu))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
+ maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
+}
+
+/// Stores 128-bits of integer data from `a` into memory.
+///
+/// `mem_addr` must be aligned on a 16-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
+ *mem_addr = a;
+}
+
+/// Stores 128-bits of integer data from `a` into memory.
+///
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
+ storeudq(mem_addr as *mut i8, a);
+}
+
+/// Stores the lower 64-bit integer of `a` to a memory location.
+///
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+// FIXME mov on windows, movlps on i686
+#[cfg_attr(
+ all(
+ test,
+ not(windows),
+ not(all(target_os = "linux", target_arch = "x86_64")),
+ target_arch = "x86_64"
+ ),
+ assert_instr(movq)
+)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
+ ptr::copy_nonoverlapping(&a as *const _ as *const u8, mem_addr as *mut u8, 8);
+}
+
+/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
+ intrinsics::nontemporal_store(mem_addr, a);
+}
+
+/// Stores a 32-bit integer value in the specified memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movnti))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
+ intrinsics::nontemporal_store(mem_addr, a);
+}
+
+/// Returns a vector where the low element is extracted from `a` and its upper
+/// element is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+// FIXME movd on windows, movd on i686
+#[cfg_attr(all(test, not(windows), target_arch = "x86_64"), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128();
+ let r: i64x2 = simd_shuffle2!(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
+ transmute(r)
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packs_epi16)
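+///
+/// An illustrative sketch of the signed saturation (assumes an `x86_64`
+/// target):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_setr_epi16(300, -300, 0, 1, 2, 3, 4, 5);
+///         let b = _mm_set1_epi16(0);
+///         let r = _mm_packs_epi16(a, b);
+///         let low = _mm_cvtsi128_si32(r);
+///         assert_eq!(low as i8, 127); // 300 clamps to i8::MAX
+///         assert_eq!((low >> 8) as i8, -128); // -300 clamps to i8::MIN
+///     }
+/// }
+/// ```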
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(packsswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(packsswb(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packs_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(packssdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(packssdw(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using unsigned saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(packuswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(packuswb(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Returns the `IMM8` element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi16)
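+///
+/// An illustrative sketch (assumes an `x86_64` target); the lane index is a
+/// const generic and must be in `0..=7`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
+///         // The extracted lane is zero-extended into the returned `i32`.
+///         assert_eq!(_mm_extract_epi16::<3>(a), 13);
+///     }
+/// }
+/// ```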
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
+ static_assert_imm3!(IMM8);
+ simd_extract::<_, u16>(a.as_u16x8(), IMM8 as u32) as i32
+}
+
+/// Returns a new vector where the `IMM8` element of `a` is replaced with `i`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+ static_assert_imm3!(IMM8);
+ transmute(simd_insert(a.as_i16x8(), IMM8 as u32, i as i16))
+}
+
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_epi8)
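+///
+/// An illustrative sketch (assumes an `x86_64` target): bit `i` of the result
+/// is the high bit of byte lane `i`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_setr_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128);
+///         // Lanes 0 and 15 have their high bits set.
+///         assert_eq!(_mm_movemask_epi8(a), 0b1000_0000_0000_0001);
+///     }
+/// }
+/// ```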
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pmovmskb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
+ pmovmskb(a.as_i8x16())
+}
+
+/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi32)
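+///
+/// An illustrative sketch (assumes an `x86_64` target): `IMM8` packs four
+/// 2-bit source-lane indices, with the lowest two bits selecting lane 0:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// {
+///     use std::arch::x86_64::*;
+///     unsafe {
+///         let a = _mm_setr_epi32(0, 1, 2, 3);
+///         // 0b00_01_10_11 picks source lanes 3, 2, 1, 0: a full reversal.
+///         let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);
+///         assert_eq!(_mm_cvtsi128_si32(r), 3);
+///     }
+/// }
+/// ```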
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x4();
+ let x: i32x4 = simd_shuffle4!(
+ a,
+ a,
+ <const IMM8: i32> [
+ IMM8 as u32 & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ ],
+ );
+ transmute(x)
+}
+
+/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Puts the results in the high 64 bits of the returned vector, with the low
+/// 64 bits being copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflehi_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x8();
+ let x: i16x8 = simd_shuffle8!(
+ a,
+ a,
+ <const IMM8: i32> [
+ 0,
+ 1,
+ 2,
+ 3,
+ (IMM8 as u32 & 0b11) + 4,
+ ((IMM8 as u32 >> 2) & 0b11) + 4,
+ ((IMM8 as u32 >> 4) & 0b11) + 4,
+ ((IMM8 as u32 >> 6) & 0b11) + 4,
+ ],
+ );
+ transmute(x)
+}
+
+/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Puts the results in the low 64 bits of the returned vector, with the high
+/// 64 bits being copied from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflelo_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x8();
+ let x: i16x8 = simd_shuffle8!(
+ a,
+ a,
+ <const IMM8: i32> [
+ IMM8 as u32 & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ 4,
+ 5,
+ 6,
+ 7,
+ ],
+ );
+ transmute(x)
+}
+
+/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(punpckhbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i8x16, _>(simd_shuffle16!(
+ a.as_i8x16(),
+ b.as_i8x16(),
+ [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
+ ))
+}
+
+/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(punpckhwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
+ let x = simd_shuffle8!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
+ transmute::<i16x8, _>(x)
+}
+
+/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i32x4, _>(simd_shuffle4!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
+}
+
+/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i64x2, _>(simd_shuffle2!(a.as_i64x2(), b.as_i64x2(), [1, 3]))
+}
+
+/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi8)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(punpcklbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i8x16, _>(simd_shuffle16!(
+ a.as_i8x16(),
+ b.as_i8x16(),
+ [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
+ ))
+}
+
+/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi16)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(punpcklwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
+ let x = simd_shuffle8!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
+ transmute::<i16x8, _>(x)
+}
+
+/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i32x4, _>(simd_shuffle4!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
+}
+
+/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute::<i64x2, _>(simd_shuffle2!(a.as_i64x2(), b.as_i64x2(), [0, 2]))
+}
+
+/// Returns a new vector with the low element of `a` replaced by the sum of the
+/// low elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd)
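+///
+/// # Examples
+///
+/// A short sketch of the scalar semantics (added for illustration; assumes
+/// an `x86_64` target, where SSE2 is baseline):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let a = _mm_setr_pd(1.0, 10.0);
+///     let b = _mm_setr_pd(2.0, 20.0);
+///     let r = _mm_add_sd(a, b);
+///     // low lane: 1.0 + 2.0; high lane is copied unchanged from `a`
+///     assert_eq!(_mm_cvtsd_f64(r), 3.0);
+///     let mut hi = 0.0;
+///     _mm_storeh_pd(&mut hi, r);
+///     assert_eq!(hi, 10.0);
+/// }
+/// ```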
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(addsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))
+}
+
+/// Adds packed double-precision (64-bit) floating-point elements in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(addpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
+ simd_add(a, b)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the result of
+/// dividing the lower element of `a` by the lower element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(divsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))
+}
+
+/// Divide packed double-precision (64-bit) floating-point elements in `a` by
+/// packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(divpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
+ simd_div(a, b)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the maximum
+/// of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(maxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
+ maxsd(a, b)
+}
+
+/// Returns a new vector with the maximum values from corresponding elements in
+/// `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(maxpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
+ maxpd(a, b)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the minimum
+/// of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(minsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
+ minsd(a, b)
+}
+
+/// Returns a new vector with the minimum values from corresponding elements in
+/// `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(minpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
+ minpd(a, b)
+}
+
+/// Returns a new vector with the low element of `a` replaced by multiplying the
+/// low elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(mulsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))
+}
+
+/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
+/// and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(mulpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
+ simd_mul(a, b)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the square
+/// root of the lower element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd)
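+///
+/// # Examples
+///
+/// An illustrative sketch (not upstream documentation; assumes `x86_64`,
+/// where SSE2 is baseline); note that the square root is taken from `b`
+/// while the upper lane comes from `a`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let a = _mm_setr_pd(1.0, 2.0);
+///     let b = _mm_setr_pd(9.0, 100.0);
+///     let r = _mm_sqrt_sd(a, b);
+///     assert_eq!(_mm_cvtsd_f64(r), 3.0); // sqrt of `b`'s low lane
+///     let mut hi = 0.0;
+///     _mm_storeh_pd(&mut hi, r);
+///     assert_eq!(hi, 2.0); // `a`'s high lane, unchanged
+/// }
+/// ```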
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(sqrtsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(a, 0, _mm_cvtsd_f64(sqrtsd(b)))
+}
+
+/// Returns a new vector with the square root of each of the values in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(sqrtpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d {
+ simd_fsqrt(a)
+}
+
+/// Returns a new vector with the low element of `a` replaced by subtracting the
+/// low element of `b` from the low element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(subsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))
+}
+
+/// Subtract packed double-precision (64-bit) floating-point elements in `b`
+/// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(subpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
+ simd_sub(a, b)
+}
+
+/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
+/// elements in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
+ let a: __m128i = transmute(a);
+ let b: __m128i = transmute(b);
+ transmute(_mm_and_si128(a, b))
+}
+
+/// Computes the bitwise NOT of `a` and then ANDs it with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(andnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
+ let a: __m128i = transmute(a);
+ let b: __m128i = transmute(b);
+ transmute(_mm_andnot_si128(a, b))
+}
+
+/// Computes the bitwise OR of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(orps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
+ let a: __m128i = transmute(a);
+ let b: __m128i = transmute(b);
+ transmute(_mm_or_si128(a, b))
+}
+
+/// Computes the bitwise XOR of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
+ let a: __m128i = transmute(a);
+ let b: __m128i = transmute(b);
+ transmute(_mm_xor_si128(a, b))
+}
+
+/// Returns a new vector with the low element of `a` replaced by the equality
+/// comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpeqsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
+ cmpsd(a, b, 0)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the less-than
+/// comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpltsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
+ cmpsd(a, b, 1)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the
+/// less-than-or-equal comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmplesd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
+ cmpsd(a, b, 2)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the
+/// greater-than comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpltsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
+}
+
+/// Returns a new vector with the low element of `a` replaced by the
+/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmplesd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(_mm_cmple_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
+}
+
+/// Returns a new vector with the low element of `a` replaced by the result
+/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
+/// neither is equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpordsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
+ cmpsd(a, b, 7)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the result of
+/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
+/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpunordsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
+ cmpsd(a, b, 3)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the not-equal
+/// comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpneqsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
+ cmpsd(a, b, 4)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the
+/// not-less-than comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnltsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
+ cmpsd(a, b, 5)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the
+/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnlesd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
+ cmpsd(a, b, 6)
+}
+
+/// Returns a new vector with the low element of `a` replaced by the
+/// not-greater-than comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnltsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
+}
+
+/// Returns a new vector with the low element of `a` replaced by the
+/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnlesd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
+ simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
+}
+
+/// Compares corresponding elements in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd)
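+///
+/// # Examples
+///
+/// A sketch of the mask semantics (illustrative; assumes `x86_64`, where
+/// SSE2 is baseline): equal lanes become an all-ones bit pattern, other
+/// lanes become all zeros, and `_mm_movemask_pd` collects the lane sign
+/// bits:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let a = _mm_setr_pd(1.0, 2.0);
+///     let b = _mm_setr_pd(1.0, 3.0);
+///     let r = _mm_cmpeq_pd(a, b);
+///     assert_eq!(_mm_movemask_pd(r), 0b01); // only lane 0 compared equal
+/// }
+/// ```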
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpeqpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
+ cmppd(a, b, 0)
+}
+
+/// Compares corresponding elements in `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpltpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
+ cmppd(a, b, 1)
+}
+
+/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmplepd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
+ cmppd(a, b, 2)
+}
+
+/// Compares corresponding elements in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpltpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
+ _mm_cmplt_pd(b, a)
+}
+
+/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmplepd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
+ _mm_cmple_pd(b, a)
+}
+
+/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpordpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
+ cmppd(a, b, 7)
+}
+
+/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpunordpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
+ cmppd(a, b, 3)
+}
+
+/// Compares corresponding elements in `a` and `b` for not-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpneqpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
+ cmppd(a, b, 4)
+}
+
+/// Compares corresponding elements in `a` and `b` for not-less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnltpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
+ cmppd(a, b, 5)
+}
+
+/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnlepd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
+ cmppd(a, b, 6)
+}
+
+/// Compares corresponding elements in `a` and `b` for not-greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnltpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
+ _mm_cmpnlt_pd(b, a)
+}
+
+/// Compares corresponding elements in `a` and `b` for
+/// not-greater-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cmpnlepd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
+ _mm_cmpnle_pd(b, a)
+}
+
+/// Compares the lower element of `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd)
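+///
+/// # Examples
+///
+/// Unlike the `_mm_cmp*_sd` family, the `comi` comparisons return a plain
+/// `0`/`1` integer. A small sketch (illustrative; assumes `x86_64`):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let a = _mm_set_sd(1.0);
+///     assert_eq!(_mm_comieq_sd(a, _mm_set_sd(1.0)), 1);
+///     assert_eq!(_mm_comieq_sd(a, _mm_set_sd(2.0)), 0);
+/// }
+/// ```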
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
+ comieqsd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
+ comiltsd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for less-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
+ comilesd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
+ comigtsd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for greater-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
+ comigesd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for not-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(comisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
+ comineqsd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
+ ucomieqsd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for less-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
+ ucomiltsd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for less-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
+ ucomilesd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
+ ucomigtsd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for greater-than-or-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
+ ucomigesd(a, b)
+}
+
+/// Compares the lower element of `a` and `b` for not-equal.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(ucomisd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
+ ucomineqsd(a, b)
+}
+
+/// Converts packed double-precision (64-bit) floating-point elements in `a` to
+/// packed single-precision (32-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps)
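+///
+/// # Examples
+///
+/// A sketch (illustrative; assumes `x86_64`): the two converted values land
+/// in the low lanes of the result and the two high lanes are zeroed:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let a = _mm_setr_pd(1.5, -2.5);
+///     let r = _mm_cvtpd_ps(a);
+///     let e = _mm_setr_ps(1.5, -2.5, 0.0, 0.0);
+///     assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(r, e)), 0xf);
+/// }
+/// ```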
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtpd2ps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
+ cvtpd2ps(a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to
+/// packed double-precision (64-bit) floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtps2pd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
+ cvtps2pd(a)
+}
+
+/// Converts packed double-precision (64-bit) floating-point elements in `a` to
+/// packed 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
+ transmute(cvtpd2dq(a))
+}
+
+/// Converts the lower double-precision (64-bit) floating-point element in `a`
+/// to a 32-bit integer.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 {
+ cvtsd2si(a)
+}
+
+/// Converts the lower double-precision (64-bit) floating-point element in `b`
+/// to a single-precision (32-bit) floating-point element, stores the result in
+/// the lower element of the return value, and copies the upper elements from
+/// `a` to the upper elements of the return value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
+ cvtsd2ss(a, b)
+}
+
+/// Returns the lower double-precision (64-bit) floating-point element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 {
+ simd_extract(a, 0)
+}
+
+/// Converts the lower single-precision (32-bit) floating-point element in `b`
+/// to a double-precision (64-bit) floating-point element, stores the result in
+/// the lower element of the return value, and copies the upper element from
+/// `a` to the upper element of the return value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtss2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
+ cvtss2sd(a, b)
+}
+
+/// Converts packed double-precision (64-bit) floating-point elements in `a` to
+/// packed 32-bit integers with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttpd2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
+ transmute(cvttpd2dq(a))
+}
+
+/// Converts the lower double-precision (64-bit) floating-point element in `a`
+/// to a 32-bit integer with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32)
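+///
+/// # Examples
+///
+/// "Truncation" here means rounding toward zero, as this sketch
+/// (illustrative; assumes `x86_64`) shows:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     assert_eq!(_mm_cvttsd_si32(_mm_set_sd(-1.9)), -1);
+///     assert_eq!(_mm_cvttsd_si32(_mm_set_sd(2.9)), 2);
+/// }
+/// ```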
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 {
+ cvttsd2si(a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to
+/// packed 32-bit integers with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_epi32)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttps2dq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i {
+ transmute(cvttps2dq(a))
+}
+
+/// Copies double-precision (64-bit) floating-point element `a` to the lower
+/// element of the return value, and zeroes the upper element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_sd(a: f64) -> __m128d {
+ _mm_set_pd(0.0, a)
+}
+
+/// Broadcasts double-precision (64-bit) floating-point value `a` to all
+/// elements of the return value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set1_pd(a: f64) -> __m128d {
+ _mm_set_pd(a, a)
+}
+
+/// Broadcasts double-precision (64-bit) floating-point value `a` to all
+/// elements of the return value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_pd1(a: f64) -> __m128d {
+ _mm_set_pd(a, a)
+}
+
+/// Sets packed double-precision (64-bit) floating-point elements in the return
+/// value with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd)
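+///
+/// # Examples
+///
+/// Note the argument order: the *first* argument becomes the high lane. A
+/// small sketch (illustrative; assumes `x86_64`):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let v = _mm_set_pd(2.0, 1.0);
+///     assert_eq!(_mm_cvtsd_f64(v), 1.0); // the low lane holds the second argument
+/// }
+/// ```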
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d {
+ __m128d(b, a)
+}
+
+/// Sets packed double-precision (64-bit) floating-point elements in the return
+/// value with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
+ _mm_set_pd(b, a)
+}
+
+/// Returns packed double-precision (64-bit) floating-point elements with all
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(xorps))] // FIXME xorpd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_setzero_pd() -> __m128d {
+ _mm_set_pd(0.0, 0.0)
+}
+
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// The mask is stored in the 2 least significant bits of the return value.
+/// All other bits are set to `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd)
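+///
+/// # Examples
+///
+/// A sketch (illustrative; assumes `x86_64`): bit `i` of the result is the
+/// sign bit of lane `i`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let a = _mm_setr_pd(-1.0, 1.0);
+///     assert_eq!(_mm_movemask_pd(a), 0b01); // only lane 0 is negative
+/// }
+/// ```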
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movmskpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
+ movmskpd(a)
+}
+
+/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from memory into the returned vector.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
+ *(mem_addr as *const __m128d)
+}
+
+/// Loads a 64-bit double-precision value into the low element of the
+/// returned vector of `[2 x double]` and clears the upper element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
+ _mm_setr_pd(*mem_addr, 0.)
+}
+
+/// Loads a double-precision value into the high-order bits of a 128-bit
+/// vector of `[2 x double]`. The low-order bits are copied from the low-order
+/// bits of the first operand.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
+ _mm_setr_pd(simd_extract(a, 0), *mem_addr)
+}
+
+/// Loads a double-precision value into the low-order bits of a 128-bit
+/// vector of `[2 x double]`. The high-order bits are copied from the
+/// high-order bits of the first operand.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
+ _mm_setr_pd(*mem_addr, simd_extract(a, 1))
+}
+
+/// Stores a 128-bit floating-point vector of `[2 x double]` to a 128-bit
+/// aligned memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m128d, a);
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
+/// memory location.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
+ *mem_addr = simd_extract(a, 0)
+}
+
+/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
+/// on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
+ *(mem_addr as *mut __m128d) = a;
+}
+
+/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
+ storeupd(mem_addr as *mut i8, a);
+}
+
+/// Stores the lower double-precision (64-bit) floating-point element from `a`
+/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
+/// 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_pd)
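+///
+/// # Examples
+///
+/// A sketch (illustrative; assumes `x86_64`) using a `#[repr(align(16))]`
+/// wrapper to satisfy the alignment requirement:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     #[repr(align(16))]
+///     struct Aligned([f64; 2]);
+///     let mut mem = Aligned([0.0; 2]);
+///     // the low lane of the vector is broadcast to both memory slots
+///     _mm_store1_pd(mem.0.as_mut_ptr(), _mm_set_sd(3.0));
+///     assert_eq!(mem.0, [3.0, 3.0]);
+/// }
+/// ```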
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
+ let b: __m128d = simd_shuffle2!(a, a, [0, 0]);
+ *(mem_addr as *mut __m128d) = b;
+}
+
+/// Stores the lower double-precision (64-bit) floating-point element from `a`
+/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
+/// 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
+ let b: __m128d = simd_shuffle2!(a, a, [0, 0]);
+ *(mem_addr as *mut __m128d) = b;
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
+/// memory in reverse order.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
+ let b: __m128d = simd_shuffle2!(a, a, [1, 0]);
+ *(mem_addr as *mut __m128d) = b;
+}
+
+/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
+/// memory location.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
+ *mem_addr = simd_extract(a, 1);
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
+/// memory location.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
+ *mem_addr = simd_extract(a, 0);
+}
+
+/// Loads a double-precision (64-bit) floating-point element from memory
+/// into both elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
+ let d = *mem_addr;
+ _mm_setr_pd(d, d)
+}
+
+/// Loads a double-precision (64-bit) floating-point element from memory
+/// into both elements of the returned vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1)
+#[inline]
+#[target_feature(enable = "sse2")]
+// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
+ _mm_load1_pd(mem_addr)
+}
+
+/// Loads 2 double-precision (64-bit) floating-point elements from memory into
+/// the returned vector in reverse order. `mem_addr` must be aligned on a
+/// 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movaps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
+ let a = _mm_load_pd(mem_addr);
+ simd_shuffle2!(a, a, [1, 0])
+}
+
+/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from memory into the returned vector.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
+ let mut dst = _mm_undefined_pd();
+ ptr::copy_nonoverlapping(
+ mem_addr as *const u8,
+ &mut dst as *mut __m128d as *mut u8,
+ mem::size_of::<__m128d>(),
+ );
+ dst
+}
+
+/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
+/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
+/// parameter as a specifier.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd)
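+///
+/// # Examples
+///
+/// A sketch of the mask encoding (illustrative; assumes `x86_64`): bit 0 of
+/// `MASK` picks the lane taken from `a`, bit 1 the lane taken from `b`:
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let a = _mm_setr_pd(1.0, 2.0);
+///     let b = _mm_setr_pd(3.0, 4.0);
+///     let r = _mm_shuffle_pd::<0b01>(a, b);
+///     assert_eq!(_mm_cvtsd_f64(r), 2.0); // low lane = a[1]
+///     let mut hi = 0.0;
+///     _mm_storeh_pd(&mut hi, r);
+///     assert_eq!(hi, 3.0); // high lane = b[0]
+/// }
+/// ```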
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_imm8!(MASK);
+ simd_shuffle2!(a, b, <const MASK: i32> [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
+}
+
+/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd)
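+///
+/// # Examples
+///
+/// A small sketch (illustrative; assumes `x86_64`):
+///
+/// ```
+/// #[cfg(target_arch = "x86_64")]
+/// unsafe {
+///     use std::arch::x86_64::*;
+///     let a = _mm_setr_pd(1.0, 2.0);
+///     let b = _mm_setr_pd(3.0, 4.0);
+///     let r = _mm_move_sd(a, b); // low lane from `b`, high lane from `a`
+///     assert_eq!(_mm_cvtsd_f64(r), 3.0);
+///     let mut hi = 0.0;
+///     _mm_storeh_pd(&mut hi, r);
+///     assert_eq!(hi, 2.0);
+/// }
+/// ```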
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
+ _mm_setr_pd(simd_extract(b, 0), simd_extract(a, 1))
+}
+
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// floating-point vector of `[4 x float]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 {
+ transmute(a)
+}
+
+/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i {
+ transmute(a)
+}
+
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// floating-point vector of `[2 x double]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d {
+ transmute(a)
+}
+
+/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
+/// integer vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i {
+ transmute(a)
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[2 x double]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d {
+ transmute(a)
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of `[4 x float]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ps)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 {
+ transmute(a)
+}
+
+/// Returns a vector of type `__m128d` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_undefined_pd() -> __m128d {
+ __m128d(0.0, 0.0)
+}
+
+/// Returns a vector of type `__m128i` with undefined elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_undefined_si128() -> __m128i {
+ __m128i(0, 0)
+}
+
+/// The resulting `__m128d` element is composed of the high-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second
+///   input
+/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first
+///   input
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(unpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
+ simd_shuffle2!(a, b, [1, 3])
+}
+
+/// The resulting `__m128d` element is composed of the low-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
+/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
+ simd_shuffle2!(a, b, [0, 2])
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sse2.pause"]
+ fn pause();
+ #[link_name = "llvm.x86.sse2.clflush"]
+ fn clflush(p: *const u8);
+ #[link_name = "llvm.x86.sse2.lfence"]
+ fn lfence();
+ #[link_name = "llvm.x86.sse2.mfence"]
+ fn mfence();
+ #[link_name = "llvm.x86.sse2.pavg.b"]
+ fn pavgb(a: u8x16, b: u8x16) -> u8x16;
+ #[link_name = "llvm.x86.sse2.pavg.w"]
+ fn pavgw(a: u16x8, b: u16x8) -> u16x8;
+ #[link_name = "llvm.x86.sse2.pmadd.wd"]
+ fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
+ #[link_name = "llvm.x86.sse2.pmaxs.w"]
+ fn pmaxsw(a: i16x8, b: i16x8) -> i16x8;
+ #[link_name = "llvm.x86.sse2.pmaxu.b"]
+ fn pmaxub(a: u8x16, b: u8x16) -> u8x16;
+ #[link_name = "llvm.x86.sse2.pmins.w"]
+ fn pminsw(a: i16x8, b: i16x8) -> i16x8;
+ #[link_name = "llvm.x86.sse2.pminu.b"]
+ fn pminub(a: u8x16, b: u8x16) -> u8x16;
+ #[link_name = "llvm.x86.sse2.pmulh.w"]
+ fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
+ #[link_name = "llvm.x86.sse2.pmulhu.w"]
+ fn pmulhuw(a: u16x8, b: u16x8) -> u16x8;
+ #[link_name = "llvm.x86.sse2.pmulu.dq"]
+ fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
+ #[link_name = "llvm.x86.sse2.psad.bw"]
+ fn psadbw(a: u8x16, b: u8x16) -> u64x2;
+ #[link_name = "llvm.x86.sse2.pslli.w"]
+ fn pslliw(a: i16x8, imm8: i32) -> i16x8;
+ #[link_name = "llvm.x86.sse2.psll.w"]
+ fn psllw(a: i16x8, count: i16x8) -> i16x8;
+ #[link_name = "llvm.x86.sse2.pslli.d"]
+ fn psllid(a: i32x4, imm8: i32) -> i32x4;
+ #[link_name = "llvm.x86.sse2.psll.d"]
+ fn pslld(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sse2.pslli.q"]
+ fn pslliq(a: i64x2, imm8: i32) -> i64x2;
+ #[link_name = "llvm.x86.sse2.psll.q"]
+ fn psllq(a: i64x2, count: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.sse2.psrai.w"]
+ fn psraiw(a: i16x8, imm8: i32) -> i16x8;
+ #[link_name = "llvm.x86.sse2.psra.w"]
+ fn psraw(a: i16x8, count: i16x8) -> i16x8;
+ #[link_name = "llvm.x86.sse2.psrai.d"]
+ fn psraid(a: i32x4, imm8: i32) -> i32x4;
+ #[link_name = "llvm.x86.sse2.psra.d"]
+ fn psrad(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sse2.psrli.w"]
+ fn psrliw(a: i16x8, imm8: i32) -> i16x8;
+ #[link_name = "llvm.x86.sse2.psrl.w"]
+ fn psrlw(a: i16x8, count: i16x8) -> i16x8;
+ #[link_name = "llvm.x86.sse2.psrli.d"]
+ fn psrlid(a: i32x4, imm8: i32) -> i32x4;
+ #[link_name = "llvm.x86.sse2.psrl.d"]
+ fn psrld(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sse2.psrli.q"]
+ fn psrliq(a: i64x2, imm8: i32) -> i64x2;
+ #[link_name = "llvm.x86.sse2.psrl.q"]
+ fn psrlq(a: i64x2, count: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.sse2.cvtdq2ps"]
+ fn cvtdq2ps(a: i32x4) -> __m128;
+ #[link_name = "llvm.x86.sse2.cvtps2dq"]
+ fn cvtps2dq(a: __m128) -> i32x4;
+ #[link_name = "llvm.x86.sse2.maskmov.dqu"]
+ fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
+ #[link_name = "llvm.x86.sse2.packsswb.128"]
+ fn packsswb(a: i16x8, b: i16x8) -> i8x16;
+ #[link_name = "llvm.x86.sse2.packssdw.128"]
+ fn packssdw(a: i32x4, b: i32x4) -> i16x8;
+ #[link_name = "llvm.x86.sse2.packuswb.128"]
+ fn packuswb(a: i16x8, b: i16x8) -> u8x16;
+ #[link_name = "llvm.x86.sse2.pmovmskb.128"]
+ fn pmovmskb(a: i8x16) -> i32;
+ #[link_name = "llvm.x86.sse2.max.sd"]
+ fn maxsd(a: __m128d, b: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse2.max.pd"]
+ fn maxpd(a: __m128d, b: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse2.min.sd"]
+ fn minsd(a: __m128d, b: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse2.min.pd"]
+ fn minpd(a: __m128d, b: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse2.sqrt.sd"]
+ fn sqrtsd(a: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse2.sqrt.pd"]
+ fn sqrtpd(a: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse2.cmp.sd"]
+ fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
+ #[link_name = "llvm.x86.sse2.cmp.pd"]
+ fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
+ #[link_name = "llvm.x86.sse2.comieq.sd"]
+ fn comieqsd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.comilt.sd"]
+ fn comiltsd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.comile.sd"]
+ fn comilesd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.comigt.sd"]
+ fn comigtsd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.comige.sd"]
+ fn comigesd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.comineq.sd"]
+ fn comineqsd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.ucomieq.sd"]
+ fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.ucomilt.sd"]
+ fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.ucomile.sd"]
+ fn ucomilesd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.ucomigt.sd"]
+ fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.ucomige.sd"]
+ fn ucomigesd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.ucomineq.sd"]
+ fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.movmsk.pd"]
+ fn movmskpd(a: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.cvtpd2ps"]
+ fn cvtpd2ps(a: __m128d) -> __m128;
+ #[link_name = "llvm.x86.sse2.cvtps2pd"]
+ fn cvtps2pd(a: __m128) -> __m128d;
+ #[link_name = "llvm.x86.sse2.cvtpd2dq"]
+ fn cvtpd2dq(a: __m128d) -> i32x4;
+ #[link_name = "llvm.x86.sse2.cvtsd2si"]
+ fn cvtsd2si(a: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.cvtsd2ss"]
+ fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
+ #[link_name = "llvm.x86.sse2.cvtss2sd"]
+ fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
+ #[link_name = "llvm.x86.sse2.cvttpd2dq"]
+ fn cvttpd2dq(a: __m128d) -> i32x4;
+ #[link_name = "llvm.x86.sse2.cvttsd2si"]
+ fn cvttsd2si(a: __m128d) -> i32;
+ #[link_name = "llvm.x86.sse2.cvttps2dq"]
+ fn cvttps2dq(a: __m128) -> i32x4;
+ #[link_name = "llvm.x86.sse2.storeu.dq"]
+ fn storeudq(mem_addr: *mut i8, a: __m128i);
+ #[link_name = "llvm.x86.sse2.storeu.pd"]
+ fn storeupd(mem_addr: *mut i8, a: __m128d);
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::{
+ core_arch::{simd::*, x86::*},
+ hint::black_box,
+ };
+ use std::{
+ boxed, f32,
+ f64::{self, NAN},
+ i32,
+ mem::{self, transmute},
+ };
+ use stdarch_test::simd_test;
+
+ #[test]
+ fn test_mm_pause() {
+ unsafe { _mm_pause() }
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_clflush() {
+ let x = 0_u8;
+ _mm_clflush(&x as *const _);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_lfence() {
+ _mm_lfence();
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_mfence() {
+ _mm_mfence();
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_add_epi8() {
+ let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm_add_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_add_epi8_overflow() {
+ let a = _mm_set1_epi8(0x7F);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_add_epi8(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(-128));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_add_epi16() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_add_epi16(a, b);
+ let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_add_epi32() {
+ let a = _mm_setr_epi32(0, 1, 2, 3);
+ let b = _mm_setr_epi32(4, 5, 6, 7);
+ let r = _mm_add_epi32(a, b);
+ let e = _mm_setr_epi32(4, 6, 8, 10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_add_epi64() {
+ let a = _mm_setr_epi64x(0, 1);
+ let b = _mm_setr_epi64x(2, 3);
+ let r = _mm_add_epi64(a, b);
+ let e = _mm_setr_epi64x(2, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epi8() {
+ let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm_adds_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epi8_saturate_positive() {
+ let a = _mm_set1_epi8(0x7F);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_adds_epi8(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epi8_saturate_negative() {
+ let a = _mm_set1_epi8(-0x80);
+ let b = _mm_set1_epi8(-1);
+ let r = _mm_adds_epi8(a, b);
+ assert_eq_m128i(r, a);
+ }
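+
+ // Note: `_mm_adds_epi8` saturates instead of wrapping, so the two tests
+ // above clamp at the i8 limits: 127 + 1 stays 127 and -128 + (-1) stays
+ // -128.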
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epi16() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_adds_epi16(a, b);
+ let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epi16_saturate_positive() {
+ let a = _mm_set1_epi16(0x7FFF);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_adds_epi16(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epi16_saturate_negative() {
+ let a = _mm_set1_epi16(-0x8000);
+ let b = _mm_set1_epi16(-1);
+ let r = _mm_adds_epi16(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epu8() {
+ let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm_adds_epu8(a, b);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epu8_saturate() {
+ let a = _mm_set1_epi8(!0);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_adds_epu8(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epu16() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_adds_epu16(a, b);
+ let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_adds_epu16_saturate() {
+ let a = _mm_set1_epi16(!0);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_adds_epu16(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_avg_epu8() {
+ let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
+ let r = _mm_avg_epu8(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(6));
+ }
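+
+ // `_mm_avg_epu8` is the rounding average `(a + b + 1) >> 1` per lane,
+ // hence (3 + 9 + 1) >> 1 == 6.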
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_avg_epu16() {
+ let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
+ let r = _mm_avg_epu16(a, b);
+ assert_eq_m128i(r, _mm_set1_epi16(6));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_madd_epi16() {
+ let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm_madd_epi16(a, b);
+ let e = _mm_setr_epi32(29, 81, 149, 233);
+ assert_eq_m128i(r, e);
+ }
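+
+ // `_mm_madd_epi16` multiplies corresponding i16 lanes into i32 products
+ // and sums adjacent pairs: lane 0 above is 1*9 + 2*10 = 29 and lane 3 is
+ // 7*15 + 8*16 = 233.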
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_max_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(-1);
+ let r = _mm_max_epi16(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_max_epu8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(!0);
+ let r = _mm_max_epu8(a, b);
+ assert_eq_m128i(r, b);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_min_epi16() {
+ let a = _mm_set1_epi16(1);
+ let b = _mm_set1_epi16(-1);
+ let r = _mm_min_epi16(a, b);
+ assert_eq_m128i(r, b);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_min_epu8() {
+ let a = _mm_set1_epi8(1);
+ let b = _mm_set1_epi8(!0);
+ let r = _mm_min_epu8(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_mulhi_epi16() {
+ let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
+ let r = _mm_mulhi_epi16(a, b);
+ assert_eq_m128i(r, _mm_set1_epi16(-16));
+ }
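+
+ // 1000 * -1001 = -1_001_000 = 0xFFF0_B9D8 as i32; `_mm_mulhi_epi16` keeps
+ // the high 16 bits, 0xFFF0, i.e. -16.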
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_mulhi_epu16() {
+ let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
+ let r = _mm_mulhi_epu16(a, b);
+ assert_eq_m128i(r, _mm_set1_epi16(15));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_mullo_epi16() {
+ let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
+ let r = _mm_mullo_epi16(a, b);
+ assert_eq_m128i(r, _mm_set1_epi16(-17960));
+ }
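+
+ // `_mm_mullo_epi16` keeps the low 16 bits of the 32-bit product
+ // 1000 * -1001 (0xFFF0_B9D8): 0xB9D8 reinterpreted as i16 is -17960.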
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_mul_epu32() {
+ let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
+ let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
+ let r = _mm_mul_epu32(a, b);
+ let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sad_epu8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
+ 1, 2, 3, 4,
+ 155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
+ 1, 2, 3, 4,
+ );
+ let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
+ let r = _mm_sad_epu8(a, b);
+ let e = _mm_setr_epi64x(1020, 614);
+ assert_eq_m128i(r, e);
+ }
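+
+ // `_mm_sad_epu8` sums absolute byte differences within each 8-byte half:
+ // 255+254+253+252 + 1+1+1+3 = 1020 in the low half and
+ // 154+153+152+151 + 0+0+2+2 = 614 in the high half.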
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sub_epi8() {
+ let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
+ let r = _mm_sub_epi8(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(-1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sub_epi16() {
+ let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
+ let r = _mm_sub_epi16(a, b);
+ assert_eq_m128i(r, _mm_set1_epi16(-1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sub_epi32() {
+ let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
+ let r = _mm_sub_epi32(a, b);
+ assert_eq_m128i(r, _mm_set1_epi32(-1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sub_epi64() {
+ let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
+ let r = _mm_sub_epi64(a, b);
+ assert_eq_m128i(r, _mm_set1_epi64x(-1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epi8() {
+ let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
+ let r = _mm_subs_epi8(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(3));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epi8_saturate_positive() {
+ let a = _mm_set1_epi8(0x7F);
+ let b = _mm_set1_epi8(-1);
+ let r = _mm_subs_epi8(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epi8_saturate_negative() {
+ let a = _mm_set1_epi8(-0x80);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_subs_epi8(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epi16() {
+ let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
+ let r = _mm_subs_epi16(a, b);
+ assert_eq_m128i(r, _mm_set1_epi16(3));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epi16_saturate_positive() {
+ let a = _mm_set1_epi16(0x7FFF);
+ let b = _mm_set1_epi16(-1);
+ let r = _mm_subs_epi16(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epi16_saturate_negative() {
+ let a = _mm_set1_epi16(-0x8000);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_subs_epi16(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epu8() {
+ let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
+ let r = _mm_subs_epu8(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(3));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epu8_saturate() {
+ let a = _mm_set1_epi8(0);
+ let b = _mm_set1_epi8(1);
+ let r = _mm_subs_epu8(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epu16() {
+ let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
+ let r = _mm_subs_epu16(a, b);
+ assert_eq_m128i(r, _mm_set1_epi16(3));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_subs_epu16_saturate() {
+ let a = _mm_set1_epi16(0);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_subs_epu16(a, b);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_slli_si128() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ let r = _mm_slli_si128::<1>(a);
+ let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ let r = _mm_slli_si128::<15>(a);
+ let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
+ assert_eq_m128i(r, e);
+
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ let r = _mm_slli_si128::<16>(a);
+ assert_eq_m128i(r, _mm_set1_epi8(0));
+ }
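+
+ // `_mm_slli_si128` shifts the entire 128-bit value left by a constant
+ // number of *bytes*, shifting in zeros; with `_mm_setr_epi8` the first
+ // element is the lowest byte, so values move toward higher indices, and a
+ // shift of 16 or more clears the register.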
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_slli_epi16() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi16(
+ 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
+ );
+ let r = _mm_slli_epi16::<4>(a);
+
+ #[rustfmt::skip]
+ let e = _mm_setr_epi16(
+ 0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0,
+ 0, 0, 0, 0,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sll_epi16() {
+ let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm_sll_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
+ assert_eq_m128i(r, _mm_setr_epi16(0xFF0, 0, 0, 0, 0, 0, 0, 0));
+ let r = _mm_sll_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
+ assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+ }
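+
+ // The register-count shifts read their shift amount from the low 64 bits
+ // of the count vector, so the second call above (with the 4 in lane 4,
+ // i.e. the upper 64 bits) shifts by zero.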
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_slli_epi32() {
+ let r = _mm_slli_epi32::<4>(_mm_set1_epi32(0xFFFF));
+ assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sll_epi32() {
+ let a = _mm_set1_epi32(0xFFFF);
+ let b = _mm_setr_epi32(4, 0, 0, 0);
+ let r = _mm_sll_epi32(a, b);
+ assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_slli_epi64() {
+ let r = _mm_slli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF));
+ assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sll_epi64() {
+ let a = _mm_set1_epi64x(0xFFFFFFFF);
+ let b = _mm_setr_epi64x(4, 0);
+ let r = _mm_sll_epi64(a, b);
+ assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srai_epi16() {
+ let r = _mm_srai_epi16::<1>(_mm_set1_epi16(-1));
+ assert_eq_m128i(r, _mm_set1_epi16(-1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sra_epi16() {
+ let a = _mm_set1_epi16(-1);
+ let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm_sra_epi16(a, b);
+ assert_eq_m128i(r, _mm_set1_epi16(-1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srai_epi32() {
+ let r = _mm_srai_epi32::<1>(_mm_set1_epi32(-1));
+ assert_eq_m128i(r, _mm_set1_epi32(-1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sra_epi32() {
+ let a = _mm_set1_epi32(-1);
+ let b = _mm_setr_epi32(1, 0, 0, 0);
+ let r = _mm_sra_epi32(a, b);
+ assert_eq_m128i(r, _mm_set1_epi32(-1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srli_si128() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ let r = _mm_srli_si128::<1>(a);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
+ );
+ assert_eq_m128i(r, e);
+
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ let r = _mm_srli_si128::<15>(a);
+ let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ let r = _mm_srli_si128::<16>(a);
+ assert_eq_m128i(r, _mm_set1_epi8(0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srli_epi16() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi16(
+ 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
+ );
+ let r = _mm_srli_epi16::<4>(a);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi16(
+ 0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srl_epi16() {
+ let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm_srl_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
+ assert_eq_m128i(r, _mm_setr_epi16(0xF, 0, 0, 0, 0, 0, 0, 0));
+ let r = _mm_srl_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
+ assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srli_epi32() {
+ let r = _mm_srli_epi32::<4>(_mm_set1_epi32(0xFFFF));
+ assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srl_epi32() {
+ let a = _mm_set1_epi32(0xFFFF);
+ let b = _mm_setr_epi32(4, 0, 0, 0);
+ let r = _mm_srl_epi32(a, b);
+ assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srli_epi64() {
+ let r = _mm_srli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF));
+ assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_srl_epi64() {
+ let a = _mm_set1_epi64x(0xFFFFFFFF);
+ let b = _mm_setr_epi64x(4, 0);
+ let r = _mm_srl_epi64(a, b);
+ assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_and_si128() {
+ let a = _mm_set1_epi8(5);
+ let b = _mm_set1_epi8(3);
+ let r = _mm_and_si128(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_andnot_si128() {
+ let a = _mm_set1_epi8(5);
+ let b = _mm_set1_epi8(3);
+ let r = _mm_andnot_si128(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(2));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_or_si128() {
+ let a = _mm_set1_epi8(5);
+ let b = _mm_set1_epi8(3);
+ let r = _mm_or_si128(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(7));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_xor_si128() {
+ let a = _mm_set1_epi8(5);
+ let b = _mm_set1_epi8(3);
+ let r = _mm_xor_si128(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(6));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpeq_epi8() {
+ let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm_cmpeq_epi8(a, b);
+ #[rustfmt::skip]
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi8(
+ 0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ )
+ );
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpeq_epi16() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
+ let r = _mm_cmpeq_epi16(a, b);
+ assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpeq_epi32() {
+ let a = _mm_setr_epi32(0, 1, 2, 3);
+ let b = _mm_setr_epi32(3, 2, 2, 0);
+ let r = _mm_cmpeq_epi32(a, b);
+ assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpgt_epi8() {
+ let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let b = _mm_set1_epi8(0);
+ let r = _mm_cmpgt_epi8(a, b);
+ let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpgt_epi16() {
+ let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
+ let b = _mm_set1_epi16(0);
+ let r = _mm_cmpgt_epi16(a, b);
+ let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpgt_epi32() {
+ let a = _mm_set_epi32(5, 0, 0, 0);
+ let b = _mm_set1_epi32(0);
+ let r = _mm_cmpgt_epi32(a, b);
+ assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmplt_epi8() {
+ let a = _mm_set1_epi8(0);
+ let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm_cmplt_epi8(a, b);
+ let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmplt_epi16() {
+ let a = _mm_set1_epi16(0);
+ let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm_cmplt_epi16(a, b);
+ let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmplt_epi32() {
+ let a = _mm_set1_epi32(0);
+ let b = _mm_set_epi32(5, 0, 0, 0);
+ let r = _mm_cmplt_epi32(a, b);
+ assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtepi32_pd() {
+ let a = _mm_set_epi32(35, 25, 15, 5);
+ let r = _mm_cvtepi32_pd(a);
+ assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsi32_sd() {
+ let a = _mm_set1_pd(3.5);
+ let r = _mm_cvtsi32_sd(a, 5);
+ assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtepi32_ps() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let r = _mm_cvtepi32_ps(a);
+ assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtps_epi32() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let r = _mm_cvtps_epi32(a);
+ assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsi32_si128() {
+ let r = _mm_cvtsi32_si128(5);
+ assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsi128_si32() {
+ let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
+ assert_eq!(r, 5);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set_epi64x() {
+ let r = _mm_set_epi64x(0, 1);
+ assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set_epi32() {
+ let r = _mm_set_epi32(0, 1, 2, 3);
+ assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set_epi16() {
+ let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set_epi8() {
+ #[rustfmt::skip]
+ let r = _mm_set_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set1_epi64x() {
+ let r = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, _mm_set1_epi64x(1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set1_epi32() {
+ let r = _mm_set1_epi32(1);
+ assert_eq_m128i(r, _mm_set1_epi32(1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set1_epi16() {
+ let r = _mm_set1_epi16(1);
+ assert_eq_m128i(r, _mm_set1_epi16(1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set1_epi8() {
+ let r = _mm_set1_epi8(1);
+ assert_eq_m128i(r, _mm_set1_epi8(1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_setr_epi32() {
+ let r = _mm_setr_epi32(0, 1, 2, 3);
+ assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_setr_epi16() {
+ let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_setr_epi8() {
+ #[rustfmt::skip]
+ let r = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_setzero_si128() {
+ let r = _mm_setzero_si128();
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_loadl_epi64() {
+ let a = _mm_setr_epi64x(6, 5);
+ let r = _mm_loadl_epi64(&a as *const _);
+ assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_load_si128() {
+ let a = _mm_set_epi64x(5, 6);
+ let r = _mm_load_si128(&a as *const _ as *const _);
+ assert_eq_m128i(a, r);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_loadu_si128() {
+ let a = _mm_set_epi64x(5, 6);
+ let r = _mm_loadu_si128(&a as *const _ as *const _);
+ assert_eq_m128i(a, r);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_maskmoveu_si128() {
+ let a = _mm_set1_epi8(9);
+ #[rustfmt::skip]
+ let mask = _mm_set_epi8(
+ 0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ );
+ let mut r = _mm_set1_epi8(0);
+ _mm_maskmoveu_si128(a, mask, &mut r as *mut _ as *mut i8);
+ let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_store_si128() {
+ let a = _mm_set1_epi8(9);
+ let mut r = _mm_set1_epi8(0);
+ _mm_store_si128(&mut r as *mut _ as *mut __m128i, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_storeu_si128() {
+ let a = _mm_set1_epi8(9);
+ let mut r = _mm_set1_epi8(0);
+ _mm_storeu_si128(&mut r as *mut _ as *mut __m128i, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_storel_epi64() {
+ let a = _mm_setr_epi64x(2, 9);
+ let mut r = _mm_set1_epi8(0);
+ _mm_storel_epi64(&mut r as *mut _ as *mut __m128i, a);
+ assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_stream_si128() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let mut r = _mm_undefined_si128();
+ _mm_stream_si128(&mut r as *mut _, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_stream_si32() {
+ let a: i32 = 7;
+ let mut mem = boxed::Box::<i32>::new(-1);
+ _mm_stream_si32(&mut *mem as *mut i32, a);
+ assert_eq!(a, *mem);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_move_epi64() {
+ let a = _mm_setr_epi64x(5, 6);
+ let r = _mm_move_epi64(a);
+ assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_packs_epi16() {
+ let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
+ let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
+ let r = _mm_packs_epi16(a, b);
+ #[rustfmt::skip]
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi8(
+ 0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
+ )
+ );
+ }
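+
+ // `_mm_packs_epi16` narrows i16 lanes to i8 with signed saturation:
+ // 0x80 (128) clamps to 0x7F (127) and -0x81 (-129) clamps to -0x80 (-128).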
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_packs_epi32() {
+ let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
+ let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
+ let r = _mm_packs_epi32(a, b);
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
+ );
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_packus_epi16() {
+ let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
+ let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
+ let r = _mm_packus_epi16(a, b);
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
+ );
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_extract_epi16() {
+ let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
+ let r1 = _mm_extract_epi16::<0>(a);
+ let r2 = _mm_extract_epi16::<3>(a);
+ assert_eq!(r1, 0xFFFF);
+ assert_eq!(r2, 3);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_insert_epi16() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm_insert_epi16::<0>(a, 9);
+ let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_movemask_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
+ 0b0101, 0b1111_0000u8 as i8, 0, 0,
+ 0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
+ 0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
+ );
+ let r = _mm_movemask_epi8(a);
+ assert_eq!(r, 0b10100110_00100101);
+ }
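+
+ // `_mm_movemask_epi8` gathers the most significant bit of each byte, byte
+ // 0 into bit 0 of the result; lanes 0, 2, 5, 9, 10, 13 and 15 above have
+ // their high bit set, matching the expected mask.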
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_shuffle_epi32() {
+ let a = _mm_setr_epi32(5, 10, 15, 20);
+ let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
+ let e = _mm_setr_epi32(20, 10, 10, 5);
+ assert_eq_m128i(r, e);
+ }
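+
+ // The shuffle immediate packs four 2-bit source indices, result lane 0 in
+ // the low bits: 0b00_01_01_11 selects lanes 3, 1, 1, 0 of `a`.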
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_shufflehi_epi16() {
+ let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
+ let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
+ let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_shufflelo_epi16() {
+ let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
+ let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
+ let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpackhi_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm_unpackhi_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpackhi_epi16() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_unpackhi_epi16(a, b);
+ let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpackhi_epi32() {
+ let a = _mm_setr_epi32(0, 1, 2, 3);
+ let b = _mm_setr_epi32(4, 5, 6, 7);
+ let r = _mm_unpackhi_epi32(a, b);
+ let e = _mm_setr_epi32(2, 6, 3, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpackhi_epi64() {
+ let a = _mm_setr_epi64x(0, 1);
+ let b = _mm_setr_epi64x(2, 3);
+ let r = _mm_unpackhi_epi64(a, b);
+ let e = _mm_setr_epi64x(1, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpacklo_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm_unpacklo_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ 4, 20, 5, 21, 6, 22, 7, 23,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpacklo_epi16() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_unpacklo_epi16(a, b);
+ let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpacklo_epi32() {
+ let a = _mm_setr_epi32(0, 1, 2, 3);
+ let b = _mm_setr_epi32(4, 5, 6, 7);
+ let r = _mm_unpacklo_epi32(a, b);
+ let e = _mm_setr_epi32(0, 4, 1, 5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpacklo_epi64() {
+ let a = _mm_setr_epi64x(0, 1);
+ let b = _mm_setr_epi64x(2, 3);
+ let r = _mm_unpacklo_epi64(a, b);
+ let e = _mm_setr_epi64x(0, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_add_sd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_add_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_add_pd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_add_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_div_sd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_div_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_div_pd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_div_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_max_sd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_max_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_max_pd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_max_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_min_sd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_min_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_min_pd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_min_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_mul_sd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_mul_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_mul_pd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_mul_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sqrt_sd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_sqrt_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sqrt_pd() {
+ let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
+ assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sub_sd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_sub_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_sub_pd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(5.0, 10.0);
+ let r = _mm_sub_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_and_pd() {
+ let a = transmute(u64x2::splat(5));
+ let b = transmute(u64x2::splat(3));
+ let r = _mm_and_pd(a, b);
+ let e = transmute(u64x2::splat(1));
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_andnot_pd() {
+ let a = transmute(u64x2::splat(5));
+ let b = transmute(u64x2::splat(3));
+ let r = _mm_andnot_pd(a, b);
+ let e = transmute(u64x2::splat(2));
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_or_pd() {
+ let a = transmute(u64x2::splat(5));
+ let b = transmute(u64x2::splat(3));
+ let r = _mm_or_pd(a, b);
+ let e = transmute(u64x2::splat(7));
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_xor_pd() {
+ let a = transmute(u64x2::splat(5));
+ let b = transmute(u64x2::splat(3));
+ let r = _mm_xor_pd(a, b);
+ let e = transmute(u64x2::splat(6));
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpeq_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
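+
+ // The `_sd` comparisons operate on the low lane only, writing an all-ones
+ // or all-zeros mask there and passing the upper lane of `a` through
+ // unchanged, hence the `transmute(2.0f64)` in the expected upper half.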
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmplt_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmple_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpgt_sd() {
+ let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpge_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpord_sd() {
+ let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpunord_sd() {
+ let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpneq_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpnlt_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpnle_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpngt_sd() {
+ let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpnge_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpeq_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(!0, 0);
+ let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmplt_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(0, !0);
+ let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmple_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(!0, !0);
+ let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpgt_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(0, 0);
+ let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpge_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(!0, 0);
+ let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpord_pd() {
+ let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(0, !0);
+ let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpunord_pd() {
+ let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(!0, 0);
+ let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpneq_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(!0, !0);
+ let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpnlt_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
+ let e = _mm_setr_epi64x(0, 0);
+ let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpnle_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(0, 0);
+ let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpngt_pd() {
+ let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(0, !0);
+ let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cmpnge_pd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ let e = _mm_setr_epi64x(0, !0);
+ let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_comieq_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_comieq_sd(a, b) != 0);
+
+ let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_comieq_sd(a, b) == 0);
+ }
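+
+ // The `comi`/`ucomi` predicates return a scalar 0 or 1 rather than a lane
+ // mask; with a NaN operand the comparison is unordered, so `comieq`
+ // returns 0 in the second check above.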
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_comilt_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_comilt_sd(a, b) == 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_comile_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_comile_sd(a, b) != 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_comigt_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_comigt_sd(a, b) == 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_comige_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_comige_sd(a, b) != 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_comineq_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_comineq_sd(a, b) == 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_ucomieq_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_ucomieq_sd(a, b) != 0);
+
+ let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
+ assert!(_mm_ucomieq_sd(a, b) == 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_ucomilt_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_ucomilt_sd(a, b) == 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_ucomile_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_ucomile_sd(a, b) != 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_ucomigt_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_ucomigt_sd(a, b) == 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_ucomige_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_ucomige_sd(a, b) != 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_ucomineq_sd() {
+ let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
+ assert!(_mm_ucomineq_sd(a, b) == 0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_movemask_pd() {
+ let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
+ assert_eq!(r, 0b01);
+
+ let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
+ assert_eq!(r, 0b11);
+ }
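+
+ // `_mm_movemask_pd` packs the sign bits of the two f64 lanes into the low
+ // two bits of the result, lane 0 in bit 0.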
+
+ #[repr(align(16))]
+ struct Memory {
+ data: [f64; 4],
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_load_pd() {
+ let mem = Memory {
+ data: [1.0f64, 2.0, 3.0, 4.0],
+ };
+ let vals = &mem.data;
+ let d = vals.as_ptr();
+
+ let r = _mm_load_pd(d);
+ assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_load_sd() {
+ let a = 1.;
+ let expected = _mm_setr_pd(a, 0.);
+ let r = _mm_load_sd(&a);
+ assert_eq_m128d(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_loadh_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = 3.;
+ let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
+ let r = _mm_loadh_pd(a, &b);
+ assert_eq_m128d(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_loadl_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = 3.;
+ let expected = _mm_setr_pd(3., get_m128d(a, 1));
+ let r = _mm_loadl_pd(a, &b);
+ assert_eq_m128d(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_stream_pd() {
+ #[repr(align(128))]
+ struct Memory {
+ pub data: [f64; 2],
+ }
+ let a = _mm_set1_pd(7.0);
+ let mut mem = Memory { data: [-1.0; 2] };
+
+ _mm_stream_pd(&mut mem.data[0] as *mut f64, a);
+ for i in 0..2 {
+ assert_eq!(mem.data[i], get_m128d(a, i));
+ }
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_store_sd() {
+ let mut dest = 0.;
+ let a = _mm_setr_pd(1., 2.);
+ _mm_store_sd(&mut dest, a);
+ assert_eq!(dest, _mm_cvtsd_f64(a));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_store_pd() {
+ let mut mem = Memory { data: [0.0f64; 4] };
+ let vals = &mut mem.data;
+ let a = _mm_setr_pd(1.0, 2.0);
+ let d = vals.as_mut_ptr();
+
+ _mm_store_pd(d, *black_box(&a));
+ assert_eq!(vals[0], 1.0);
+ assert_eq!(vals[1], 2.0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_storeu_pd() {
+ let mut mem = Memory { data: [0.0f64; 4] };
+ let vals = &mut mem.data;
+ let a = _mm_setr_pd(1.0, 2.0);
+
+ let mut ofs = 0;
+ let mut p = vals.as_mut_ptr();
+
+ // Make sure p is **not** aligned to a 16-byte boundary
+ if (p as usize) & 0xf == 0 {
+ ofs = 1;
+ p = p.offset(1);
+ }
+
+ _mm_storeu_pd(p, *black_box(&a));
+
+ if ofs > 0 {
+ assert_eq!(vals[ofs - 1], 0.0);
+ }
+ assert_eq!(vals[ofs + 0], 1.0);
+ assert_eq!(vals[ofs + 1], 2.0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_store1_pd() {
+ let mut mem = Memory { data: [0.0f64; 4] };
+ let vals = &mut mem.data;
+ let a = _mm_setr_pd(1.0, 2.0);
+ let d = vals.as_mut_ptr();
+
+ _mm_store1_pd(d, *black_box(&a));
+ assert_eq!(vals[0], 1.0);
+ assert_eq!(vals[1], 1.0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_store_pd1() {
+ let mut mem = Memory { data: [0.0f64; 4] };
+ let vals = &mut mem.data;
+ let a = _mm_setr_pd(1.0, 2.0);
+ let d = vals.as_mut_ptr();
+
+ _mm_store_pd1(d, *black_box(&a));
+ assert_eq!(vals[0], 1.0);
+ assert_eq!(vals[1], 1.0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_storer_pd() {
+ let mut mem = Memory { data: [0.0f64; 4] };
+ let vals = &mut mem.data;
+ let a = _mm_setr_pd(1.0, 2.0);
+ let d = vals.as_mut_ptr();
+
+ _mm_storer_pd(d, *black_box(&a));
+ assert_eq!(vals[0], 2.0);
+ assert_eq!(vals[1], 1.0);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_storeh_pd() {
+ let mut dest = 0.;
+ let a = _mm_setr_pd(1., 2.);
+ _mm_storeh_pd(&mut dest, a);
+ assert_eq!(dest, get_m128d(a, 1));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_storel_pd() {
+ let mut dest = 0.;
+ let a = _mm_setr_pd(1., 2.);
+ _mm_storel_pd(&mut dest, a);
+ assert_eq!(dest, _mm_cvtsd_f64(a));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_loadr_pd() {
+ let mut mem = Memory {
+ data: [1.0f64, 2.0, 3.0, 4.0],
+ };
+ let vals = &mut mem.data;
+ let d = vals.as_ptr();
+
+ let r = _mm_loadr_pd(d);
+ assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_loadu_pd() {
+ let mut mem = Memory {
+ data: [1.0f64, 2.0, 3.0, 4.0],
+ };
+ let vals = &mut mem.data;
+ let mut d = vals.as_ptr();
+
+ // Make sure d is not aligned to a 16-byte boundary
+ let mut offset = 0;
+ if (d as usize) & 0xf == 0 {
+ offset = 1;
+ d = d.offset(offset as isize);
+ }
+
+ let r = _mm_loadu_pd(d);
+ let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtpd_ps() {
+ let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
+ assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
+
+ let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
+ assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
+
+ let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
+ assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
+
+ let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
+ assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtps_pd() {
+ let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
+ assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));
+
+ let r = _mm_cvtps_pd(_mm_setr_ps(
+ f32::MAX,
+ f32::INFINITY,
+ f32::NEG_INFINITY,
+ f32::MIN,
+ ));
+ assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtpd_epi32() {
+ let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
+ assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));
+
+ let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
+ assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));
+
+ let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
+ assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
+
+ let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
+ assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
+
+ let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
+ assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
+ }
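+
+ // Out-of-range and NaN inputs convert to the x86 "integer indefinite"
+ // value 0x8000_0000, i.e. `i32::MIN`.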
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsd_si32() {
+ let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
+ assert_eq!(r, -2);
+
+ let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
+ assert_eq!(r, i32::MIN);
+
+ let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
+ assert_eq!(r, i32::MIN);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsd_ss() {
+ let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
+ let b = _mm_setr_pd(2.0, -5.0);
+
+ let r = _mm_cvtsd_ss(a, b);
+
+ assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
+
+ let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
+ let b = _mm_setr_pd(f64::INFINITY, -5.0);
+
+ let r = _mm_cvtsd_ss(a, b);
+
+ assert_eq_m128(
+ r,
+ _mm_setr_ps(
+ f32::INFINITY,
+ f32::NEG_INFINITY,
+ f32::MAX,
+ f32::NEG_INFINITY,
+ ),
+ );
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsd_f64() {
+ let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
+ assert_eq!(r, -1.1);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtss_sd() {
+ let a = _mm_setr_pd(-1.1, 2.2);
+ let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+
+ let r = _mm_cvtss_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));
+
+ let a = _mm_setr_pd(-1.1, f64::INFINITY);
+ let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
+
+ let r = _mm_cvtss_sd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvttpd_epi32() {
+ let a = _mm_setr_pd(-1.1, 2.2);
+ let r = _mm_cvttpd_epi32(a);
+ assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));
+
+ let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
+ let r = _mm_cvttpd_epi32(a);
+ assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvttsd_si32() {
+ let a = _mm_setr_pd(-1.1, 2.2);
+ let r = _mm_cvttsd_si32(a);
+ assert_eq!(r, -1);
+
+ let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
+ let r = _mm_cvttsd_si32(a);
+ assert_eq!(r, i32::MIN);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvttps_epi32() {
+ let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
+ let r = _mm_cvttps_epi32(a);
+ assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
+
+ let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
+ let r = _mm_cvttps_epi32(a);
+ assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set_sd() {
+ let r = _mm_set_sd(-1.0_f64);
+ assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set1_pd() {
+ let r = _mm_set1_pd(-1.0_f64);
+ assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set_pd1() {
+ let r = _mm_set_pd1(-2.0_f64);
+ assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_set_pd() {
+ let r = _mm_set_pd(1.0_f64, 5.0_f64);
+ assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_setr_pd() {
+ let r = _mm_setr_pd(1.0_f64, -5.0_f64);
+ assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_setzero_pd() {
+ let r = _mm_setzero_pd();
+ assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_load1_pd() {
+ let d = -5.0;
+ let r = _mm_load1_pd(&d);
+ assert_eq_m128d(r, _mm_setr_pd(d, d));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_load_pd1() {
+ let d = -5.0;
+ let r = _mm_load_pd1(&d);
+ assert_eq_m128d(r, _mm_setr_pd(d, d));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpackhi_pd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(3.0, 4.0);
+ let r = _mm_unpackhi_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_unpacklo_pd() {
+ let a = _mm_setr_pd(1.0, 2.0);
+ let b = _mm_setr_pd(3.0, 4.0);
+ let r = _mm_unpacklo_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_shuffle_pd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(3., 4.);
+ let expected = _mm_setr_pd(1., 3.);
+ let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
+ assert_eq_m128d(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_move_sd() {
+ let a = _mm_setr_pd(1., 2.);
+ let b = _mm_setr_pd(3., 4.);
+ let expected = _mm_setr_pd(3., 2.);
+ let r = _mm_move_sd(a, b);
+ assert_eq_m128d(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_castpd_ps() {
+ let a = _mm_set1_pd(0.);
+ let expected = _mm_set1_ps(0.);
+ let r = _mm_castpd_ps(a);
+ assert_eq_m128(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_castpd_si128() {
+ let a = _mm_set1_pd(0.);
+ let expected = _mm_set1_epi64x(0);
+ let r = _mm_castpd_si128(a);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_castps_pd() {
+ let a = _mm_set1_ps(0.);
+ let expected = _mm_set1_pd(0.);
+ let r = _mm_castps_pd(a);
+ assert_eq_m128d(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_castps_si128() {
+ let a = _mm_set1_ps(0.);
+ let expected = _mm_set1_epi32(0);
+ let r = _mm_castps_si128(a);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_castsi128_pd() {
+ let a = _mm_set1_epi64x(0);
+ let expected = _mm_set1_pd(0.);
+ let r = _mm_castsi128_pd(a);
+ assert_eq_m128d(r, expected);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_castsi128_ps() {
+ let a = _mm_set1_epi32(0);
+ let expected = _mm_set1_ps(0.);
+ let r = _mm_castsi128_ps(a);
+ assert_eq_m128(r, expected);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs
new file mode 100644
index 000000000..ab0dd38fe
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs
@@ -0,0 +1,260 @@
+//! Streaming SIMD Extensions 3 (SSE3)
+
+use crate::{
+ core_arch::{
+ simd::*,
+ simd_llvm::{simd_shuffle2, simd_shuffle4},
+ x86::*,
+ },
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Alternately adds and subtracts packed single-precision (32-bit)
+/// floating-point elements in `a` to/from packed elements in `b`.
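+/// Even-indexed result lanes are `a[i] - b[i]`; odd-indexed lanes are
+/// `a[i] + b[i]`.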
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(addsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
+ addsubps(a, b)
+}
+
+/// Alternately adds and subtracts packed double-precision (64-bit)
+/// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(addsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
+ addsubpd(a, b)
+}
+
+/// Horizontally adds adjacent pairs of double-precision (64-bit)
+/// floating-point elements in `a` and `b`, and packs the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(haddpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
+ haddpd(a, b)
+}
+
+/// Horizontally adds adjacent pairs of single-precision (32-bit)
+/// floating-point elements in `a` and `b`, and packs the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(haddps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
+ haddps(a, b)
+}
+
+/// Horizontally subtracts adjacent pairs of double-precision (64-bit)
+/// floating-point elements in `a` and `b`, and packs the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(hsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
+ hsubpd(a, b)
+}
+
+/// Horizontally subtracts adjacent pairs of single-precision (32-bit)
+/// floating-point elements in `a` and `b`, and packs the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(hsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
+ hsubps(a, b)
+}
+
+/// Loads 128 bits of integer data from unaligned memory.
+/// This intrinsic may perform better than `_mm_loadu_si128`
+/// when the data crosses a cache line boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(lddqu))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
+ transmute(lddqu(mem_addr as *const _))
+}
+
+/// Duplicates the low double-precision (64-bit) floating-point element
+/// from `a` into both lanes of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(movddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d {
+ simd_shuffle2!(a, a, [0, 0])
+}
+
+/// Loads a double-precision (64-bit) floating-point element from memory
+/// into both elements of the return vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(movddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
+ _mm_load1_pd(mem_addr)
+}
+
+/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
+/// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(movshdup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
+ simd_shuffle4!(a, a, [1, 1, 3, 3])
+}
+
+/// Duplicates even-indexed single-precision (32-bit) floating-point elements
+/// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps)
+#[inline]
+#[target_feature(enable = "sse3")]
+#[cfg_attr(test, assert_instr(movsldup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
+ simd_shuffle4!(a, a, [0, 0, 2, 2])
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sse3.addsub.ps"]
+ fn addsubps(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse3.addsub.pd"]
+ fn addsubpd(a: __m128d, b: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse3.hadd.pd"]
+ fn haddpd(a: __m128d, b: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse3.hadd.ps"]
+ fn haddps(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse3.hsub.pd"]
+ fn hsubpd(a: __m128d, b: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse3.hsub.ps"]
+ fn hsubps(a: __m128, b: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse3.ldu.dq"]
+ fn lddqu(mem_addr: *const i8) -> i8x16;
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_addsub_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_addsub_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_addsub_pd() {
+ let a = _mm_setr_pd(-1.0, 5.0);
+ let b = _mm_setr_pd(-100.0, 20.0);
+ let r = _mm_addsub_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_hadd_pd() {
+ let a = _mm_setr_pd(-1.0, 5.0);
+ let b = _mm_setr_pd(-100.0, 20.0);
+ let r = _mm_hadd_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_hadd_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_hadd_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_hsub_pd() {
+ let a = _mm_setr_pd(-1.0, 5.0);
+ let b = _mm_setr_pd(-100.0, 20.0);
+ let r = _mm_hsub_pd(a, b);
+ assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_hsub_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+ let r = _mm_hsub_ps(a, b);
+ assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_lddqu_si128() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16,
+ );
+ let r = _mm_lddqu_si128(&a);
+ assert_eq_m128i(a, r);
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_movedup_pd() {
+ let a = _mm_setr_pd(-1.0, 5.0);
+ let r = _mm_movedup_pd(a);
+ assert_eq_m128d(r, _mm_setr_pd(-1.0, -1.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_movehdup_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let r = _mm_movehdup_ps(a);
+ assert_eq_m128(r, _mm_setr_ps(5.0, 5.0, -10.0, -10.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_moveldup_ps() {
+ let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+ let r = _mm_moveldup_ps(a);
+ assert_eq_m128(r, _mm_setr_ps(-1.0, -1.0, 0.0, 0.0));
+ }
+
+ #[simd_test(enable = "sse3")]
+ unsafe fn test_mm_loaddup_pd() {
+ let d = -5.0;
+ let r = _mm_loaddup_pd(&d);
+ assert_eq_m128d(r, _mm_setr_pd(d, d));
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs
new file mode 100644
index 000000000..7c59f2702
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@@ -0,0 +1,1887 @@
+//! Streaming SIMD Extensions 4.1 (SSE4.1)
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+// SSE4 rounding constants
+/// round to nearest
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
+/// round down
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
+/// round up
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
+/// truncate
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
+/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
+/// do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
+/// suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_NO_EXC: i32 = 0x08;
+/// round to nearest and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_NINT: i32 = 0x00;
+/// round down and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
+/// round up and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
+/// truncate and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
+/// use MXCSR.RC and do not suppress exceptions; see
+/// `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
+/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
+
+/// Blend packed 8-bit integers from `a` and `b` using `mask`
+///
+/// The high bit of each corresponding mask byte determines the selection.
+/// If the high bit is set, the element of `b` is selected; otherwise, the
+/// element of `a` is selected.
+///
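+/// # Example
+///
+/// A minimal sketch of the mask semantics (illustrative values): only
+/// lane 0 of the mask has its high bit set, so only lane 0 is taken
+/// from `b`.
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_set1_epi8(1);
+/// let b = _mm_set1_epi8(2);
+/// let mask = _mm_setr_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// let r = _mm_blendv_epi8(a, b, mask);
+/// let lanes: [i8; 16] = std::mem::transmute(r);
+/// assert_eq!(lanes[0], 2);
+/// assert_eq!(lanes[1], 1);
+/// # }
+/// # }
+/// # }
+/// ```
+///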
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pblendvb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
+ transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
+}
+
+/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
+///
+/// The mask bits determine the selection. A clear bit selects the
+/// corresponding element of `a`, and a set bit the corresponding
+/// element of `b`.
+///
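+/// # Example
+///
+/// A minimal sketch (illustrative values): mask bits 0 and 1 are set, so
+/// lanes 0 and 1 come from `b` and the rest from `a`.
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_set1_epi16(0);
+/// let b = _mm_set1_epi16(1);
+/// let r = _mm_blend_epi16::<0b0000_0011>(a, b);
+/// let lanes: [i16; 8] = std::mem::transmute(r);
+/// assert_eq!(lanes, [1, 1, 0, 0, 0, 0, 0, 0]);
+/// # }
+/// # }
+/// # }
+/// ```
+///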
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+// Note: LLVM7 prefers the single-precision floating-point domain when possible
+// see https://bugs.llvm.org/show_bug.cgi?id=38195
+// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
+#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from `a`
+/// and `b` using `mask`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendvpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
+ blendvpd(a, b, mask)
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a`
+/// and `b` using `mask`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendvps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
+ blendvps(a, b, mask)
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from `a`
+/// and `b` using control mask `IMM2`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+// Note: LLVM7 prefers the single-precision floating-point domain when possible
+// see https://bugs.llvm.org/show_bug.cgi?id=38195
+// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
+#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_imm2!(IMM2);
+ blendpd(a, b, IMM2 as u8)
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a`
+/// and `b` using mask `IMM4`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_imm4!(IMM4);
+ blendps(a, b, IMM4 as u8)
+}
+
+/// Extracts a single-precision (32-bit) floating-point element from `a`,
+/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
+/// and may be converted back to a floating-point number via `f32::from_bits`.
+///
+/// # Example
+/// ```rust
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # #[target_feature(enable = "sse4.1")]
+/// # unsafe fn worker() {
+/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
+/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
+/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
+/// float_store.push(f32::from_bits(x as u32));
+/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
+/// # }
+/// # unsafe { worker() }
+/// # }
+/// # }
+/// ```
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(extractps, IMM8 = 0)
+)]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
+ static_assert_imm2!(IMM8);
+ transmute(simd_extract::<_, f32>(a, IMM8 as u32))
+}
+
+/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
+ static_assert_imm4!(IMM8);
+ simd_extract::<_, u8>(a.as_u8x16(), IMM8 as u32) as i32
+}
+
+/// Extracts a 32-bit integer from `a` selected with `IMM8`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(extractps, IMM8 = 1)
+)]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
+ static_assert_imm2!(IMM8);
+ simd_extract::<_, i32>(a.as_i32x4(), IMM8 as u32)
+}
+
+/// Select a single value in `b` to store at some position in `a`,
+/// then zero elements according to `IMM8`.
+///
+/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
+/// the result they will be copied to, and which bits in the result will be
+/// cleared. The following assignments are made:
+///
+/// * Bits `[7:6]` specify the bits to copy from operand `b`:
+/// - `00`: Selects bits `[31:0]` from operand `b`.
+/// - `01`: Selects bits `[63:32]` from operand `b`.
+/// - `10`: Selects bits `[95:64]` from operand `b`.
+/// - `11`: Selects bits `[127:96]` from operand `b`.
+///
+/// * Bits `[5:4]` specify the bits in the result to which the selected bits
+/// from operand `b` are copied:
+/// - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
+/// - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
+/// - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
+/// - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
+///
+/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
+/// element is cleared.
+///
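+/// # Example
+///
+/// A worked sketch decoding one mask value (operand values are
+/// illustrative):
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_set1_ps(1.0);
+/// let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
+/// // IMM8 = 0b11_00_1100:
+/// //   bits [7:6] = 0b11 -> select element 3 of `b` (40.0),
+/// //   bits [5:4] = 0b00 -> copy it to element 0 of the result,
+/// //   bits [3:0] = 0b1100 -> clear result elements 2 and 3.
+/// let r = _mm_insert_ps::<0b11_00_1100>(a, b);
+/// let lanes: [f32; 4] = std::mem::transmute(r);
+/// assert_eq!(lanes, [40.0, 1.0, 0.0, 0.0]);
+/// # }
+/// # }
+/// # }
+/// ```
+///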
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_imm8!(IMM8);
+ insertps(a, b, IMM8 as u8)
+}
+
+/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
+/// location specified by `IMM8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+ static_assert_imm4!(IMM8);
+ transmute(simd_insert(a.as_i8x16(), IMM8 as u32, i as i8))
+}
+
+/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
+/// location specified by `IMM8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+ static_assert_imm2!(IMM8);
+ transmute(simd_insert(a.as_i32x4(), IMM8 as u32, i))
+}
+
+/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
+/// values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
+/// maximum.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
+/// values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
+/// values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
+/// minimum.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
+/// values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pminud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pminud(a.as_u32x4(), b.as_u32x4()))
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using unsigned saturation
+///
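+/// # Example
+///
+/// A minimal sketch of the saturation behavior (values illustrative):
+/// negative inputs clamp to `0` and inputs above `65535` clamp to `65535`.
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_setr_epi32(-1, 70_000, 5, 40_000);
+/// let b = _mm_set1_epi32(0);
+/// let r = _mm_packus_epi32(a, b);
+/// let lanes: [u16; 8] = std::mem::transmute(r);
+/// assert_eq!(lanes, [0, 65_535, 5, 40_000, 0, 0, 0, 0]);
+/// # }
+/// # }
+/// # }
+/// ```
+///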
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(packusdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Compares packed 64-bit integers in `a` and `b` for equality
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pcmpeqq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
+}
+
+/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
+ let a = a.as_i8x16();
+ let a: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute(simd_cast::<_, i16x8>(a))
+}
+
+/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
+ let a = a.as_i8x16();
+ let a: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute(simd_cast::<_, i32x4>(a))
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
+/// 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
+ let a = a.as_i8x16();
+ let a: i8x2 = simd_shuffle2!(a, a, [0, 1]);
+ transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
+ let a = a.as_i16x8();
+ let a: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute(simd_cast::<_, i32x4>(a))
+}
+
+/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
+ let a = a.as_i16x8();
+ let a: i16x2 = simd_shuffle2!(a, a, [0, 1]);
+ transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovsxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
+ let a = a.as_i32x4();
+ let a: i32x2 = simd_shuffle2!(a, a, [0, 1]);
+ transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Zero-extends packed unsigned 8-bit integers in `a` to packed 16-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
+ let a = a.as_u8x16();
+ let a: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute(simd_cast::<_, i16x8>(a))
+}
+
+/// Zero-extends packed unsigned 8-bit integers in `a` to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
+ let a = a.as_u8x16();
+ let a: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute(simd_cast::<_, i32x4>(a))
+}
+
+/// Zero-extends packed unsigned 8-bit integers in `a` to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
+ let a = a.as_u8x16();
+ let a: u8x2 = simd_shuffle2!(a, a, [0, 1]);
+ transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Zero-extends packed unsigned 16-bit integers in `a`
+/// to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
+ let a = a.as_u16x8();
+ let a: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute(simd_cast::<_, i32x4>(a))
+}
+
+/// Zero-extends packed unsigned 16-bit integers in `a`
+/// to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
+ let a = a.as_u16x8();
+ let a: u16x2 = simd_shuffle2!(a, a, [0, 1]);
+ transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Zero-extends packed unsigned 32-bit integers in `a`
+/// to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmovzxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
+ let a = a.as_u32x4();
+ let a: u32x2 = simd_shuffle2!(a, a, [0, 1]);
+ transmute(simd_cast::<_, i64x2>(a))
+}
+
+/// Returns the dot product of two `__m128d` vectors.
+///
+/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
+/// If a condition mask bit is zero, the corresponding multiplication is
+/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
+/// the dot product will be stored in the return value component. Otherwise if
+/// the broadcast mask bit is zero then the return component will be zero.
+///
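+/// # Example
+///
+/// A worked sketch of the mask decoding (operand values are illustrative):
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_setr_pd(2.0, 3.0);
+/// let b = _mm_setr_pd(1.0, 4.0);
+/// // Condition bits [5:4] = 0b11 keep both products (2*1 + 3*4 = 14);
+/// // broadcast bits [1:0] = 0b01 store the sum only in the low lane.
+/// let r = _mm_dp_pd::<0b0011_0001>(a, b);
+/// let lanes: [f64; 2] = std::mem::transmute(r);
+/// assert_eq!(lanes, [14.0, 0.0]);
+/// # }
+/// # }
+/// # }
+/// ```
+///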
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_imm8!(IMM8);
+ dppd(a, b, IMM8 as u8)
+}
+
+/// Returns the dot product of two `__m128` vectors.
+///
+/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
+/// If a condition mask bit is zero, the corresponding multiplication is
+/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
+/// the dot product will be stored in the return value component. Otherwise if
+/// the broadcast mask bit is zero then the return component will be zero.
+///
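+/// # Example
+///
+/// A minimal sketch, analogous to the `_mm_dp_pd` example above but with
+/// 4-bit masks (operand values are illustrative):
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
+/// let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
+/// // Condition bits [7:4] = 0b0111 multiply lanes 0..=2 (sum = 14.5);
+/// // broadcast bits [3:0] = 0b0101 store the sum in lanes 0 and 2.
+/// let r = _mm_dp_ps::<0b0111_0101>(a, b);
+/// let lanes: [f32; 4] = std::mem::transmute(r);
+/// assert_eq!(lanes, [14.5, 0.0, 14.5, 0.0]);
+/// # }
+/// # }
+/// # }
+/// ```
+///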
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_imm8!(IMM8);
+ dpps(a, b, IMM8 as u8)
+}
+
+/// Rounds the packed double-precision (64-bit) floating-point elements in `a`
+/// down to an integer value, and stores the results as packed double-precision
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
+ simd_floor(a)
+}
+
+/// Rounds the packed single-precision (32-bit) floating-point elements in `a`
+/// down to an integer value, and stores the results as packed single-precision
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
+ simd_floor(a)
+}
+
+/// Rounds the lower double-precision (64-bit) floating-point element in `b`
+/// down to an integer value, stores the result as a double-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper element from `a` to the upper element of the intrinsic
+/// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
+ roundsd(a, b, _MM_FROUND_FLOOR)
+}
+
+/// Rounds the lower single-precision (32-bit) floating-point element in `b`
+/// down to an integer value, stores the result as a single-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper 3 packed elements from `a` to the upper elements
+/// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
+ roundss(a, b, _MM_FROUND_FLOOR)
+}
+
+/// Rounds the packed double-precision (64-bit) floating-point elements in `a`
+/// up to an integer value, and stores the results as packed double-precision
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
+ simd_ceil(a)
+}
+
+/// Rounds the packed single-precision (32-bit) floating-point elements in `a`
+/// up to an integer value, and stores the results as packed single-precision
+/// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
+ simd_ceil(a)
+}
+
+/// Rounds the lower double-precision (64-bit) floating-point element in `b`
+/// up to an integer value, stores the result as a double-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper element from `a` to the upper element
+/// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
+ roundsd(a, b, _MM_FROUND_CEIL)
+}
+
+/// Rounds the lower single-precision (32-bit) floating-point element in `b`
+/// up to an integer value, stores the result as a single-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper 3 packed elements from `a` to the upper elements
+/// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
+ roundss(a, b, _MM_FROUND_CEIL)
+}
+
+/// Rounds the packed double-precision (64-bit) floating-point elements in `a`
+/// using the `ROUNDING` parameter, and stores the results as packed
+/// double-precision floating-point elements.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// // round to nearest, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+/// // round down, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+/// // round up, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+/// // truncate, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
+/// # let _x =
+/// _MM_FROUND_CUR_DIRECTION;
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
+ static_assert_imm4!(ROUNDING);
+ roundpd(a, ROUNDING)
+}
+
+/// Rounds the packed single-precision (32-bit) floating-point elements in `a`
+/// using the `ROUNDING` parameter, and stores the results as packed
+/// single-precision floating-point elements.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// // round to nearest, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+/// // round down, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+/// // round up, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+/// // truncate, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
+/// # let _x =
+/// _MM_FROUND_CUR_DIRECTION;
+/// # }
+/// ```
+///
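+/// # Example
+///
+/// A minimal sketch combining two of the flags; note that
+/// `_MM_FROUND_TO_NEAREST_INT` rounds halfway cases to even.
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_setr_ps(2.5, 3.5, -1.5, 4.2);
+/// let r = _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+/// let lanes: [f32; 4] = std::mem::transmute(r);
+/// assert_eq!(lanes, [2.0, 4.0, -2.0, 4.0]);
+/// # }
+/// # }
+/// # }
+/// ```
+///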
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
+ static_assert_imm4!(ROUNDING);
+ roundps(a, ROUNDING)
+}
+
+/// Rounds the lower double-precision (64-bit) floating-point element in `b`
+/// using the `ROUNDING` parameter, stores the result as a double-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper element from `a` to the upper element of the intrinsic
+/// result.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// // round to nearest, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+/// // round down, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+/// // round up, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+/// // truncate, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
+/// # let _x =
+/// _MM_FROUND_CUR_DIRECTION;
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
+ static_assert_imm4!(ROUNDING);
+ roundsd(a, b, ROUNDING)
+}
+
+/// Rounds the lower single-precision (32-bit) floating-point element in `b`
+/// using the `ROUNDING` parameter, stores the result as a single-precision
+/// floating-point element in the lower element of the intrinsic result,
+/// and copies the upper 3 packed elements from `a` to the upper elements
+/// of the intrinsic result.
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// // round to nearest, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+/// // round down, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+/// // round up, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+/// // truncate, and suppress exceptions:
+/// # let _x =
+/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
+/// # let _x =
+/// _MM_FROUND_CUR_DIRECTION;
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
+ static_assert_imm4!(ROUNDING);
+ roundss(a, b, ROUNDING)
+}
+
+/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i`
+/// vector, returning a vector containing its value in its first position
+/// and its index in its second position; all other elements are set to
+/// zero.
+///
+/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
+/// instruction.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit vector of type `__m128i`.
+///
+/// Returns:
+///
+/// A 128-bit value where:
+///
+/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
+/// * bits `[18:16]` - contain the index of the minimum value
+/// * remaining bits are set to `0`.
+///
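+/// # Example
+///
+/// A minimal sketch (illustrative input):
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+/// let r = _mm_minpos_epu16(a);
+/// // Lane 0 holds the minimum value, lane 1 its index.
+/// assert_eq!(_mm_extract_epi16::<0>(r), 0);
+/// assert_eq!(_mm_extract_epi16::<1>(r), 7);
+/// # }
+/// # }
+/// # }
+/// ```
+///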
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(phminposuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
+ transmute(phminposuw(a.as_u16x8()))
+}
+
+/// Multiplies the low 32-bit integers from each packed 64-bit
+/// element in `a` and `b`, and returns the signed 64-bit result.
+///
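+/// # Example
+///
+/// A minimal sketch; only the even-indexed 32-bit lanes participate
+/// (values illustrative):
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_setr_epi32(100_000, 0, -2, 0);
+/// let b = _mm_setr_epi32(100_000, 0, 3, 0);
+/// let r = _mm_mul_epi32(a, b);
+/// let lanes: [i64; 2] = std::mem::transmute(r);
+/// assert_eq!(lanes, [10_000_000_000, -6]);
+/// # }
+/// # }
+/// # }
+/// ```
+///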
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmuldq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiplies the packed 32-bit integers in `a` and `b`, producing
+/// intermediate 64-bit integers, and returns the low 32 bits of each
+/// intermediate result, reinterpreted as a signed integer. The
+/// multiplication wraps: `_mm_mullo_epi32(_mm_set1_epi32(2),
+/// _mm_set1_epi32(2))` returns the obvious `_mm_set1_epi32(4)`, but
+/// `_mm_mullo_epi32(_mm_set1_epi32(i32::MAX), _mm_set1_epi32(2))` returns
+/// a negative number.
+///
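+/// # Example
+///
+/// A minimal sketch of the wrapping behavior:
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_set1_epi32(i32::MAX);
+/// let b = _mm_set1_epi32(2);
+/// // i32::MAX * 2 wraps to -2 in the low 32 bits.
+/// let r = _mm_mullo_epi32(a, b);
+/// assert_eq!(_mm_extract_epi32::<0>(r), -2);
+/// # }
+/// # }
+/// # }
+/// ```
+///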
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pmulld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Subtracts 8-bit unsigned integer values and computes the absolute
+/// values of the differences. Sums of four consecutive absolute
+/// differences are then returned according to the bit fields in the
+/// immediate operand.
+///
+/// The following algorithm is performed:
+///
+/// ```ignore
+/// i = IMM8[2] * 4
+/// j = IMM8[1:0] * 4
+/// for k := 0 to 7
+/// d0 = abs(a[i + k + 0] - b[j + 0])
+/// d1 = abs(a[i + k + 1] - b[j + 1])
+/// d2 = abs(a[i + k + 2] - b[j + 2])
+/// d3 = abs(a[i + k + 3] - b[j + 3])
+/// r[k] = d0 + d1 + d2 + d3
+/// ```
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit vector of type `__m128i`.
+/// * `b` - A 128-bit vector of type `__m128i`.
+/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
+/// differences are to be calculated:
+/// * Bit `[2]` specifies the offset for operand `a`
+/// * Bits `[1:0]` specify the offset for operand `b`
+///
+/// Returns:
+///
+/// * A `__m128i` vector containing the sums of the sets of absolute
+/// differences between both operands.
+///
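+/// # Example
+///
+/// A worked sketch with `IMM8 = 0`, i.e. both offsets zero, so
+/// `r[k] = |a[k] - b[0]| + |a[k+1] - b[1]| + |a[k+2] - b[2]| + |a[k+3] - b[3]|`
+/// (input values are illustrative):
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// let b = a;
+/// // With identical ramps, every byte of window `k` differs from
+/// // `b[0..4]` by exactly `k`, so `r[k] = 4 * k`.
+/// let r = _mm_mpsadbw_epu8::<0>(a, b);
+/// let lanes: [u16; 8] = std::mem::transmute(r);
+/// assert_eq!(lanes, [0, 4, 8, 12, 16, 20, 24, 28]);
+/// # }
+/// # }
+/// # }
+/// ```
+///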
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm3!(IMM8);
+ transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are all
+/// zeros.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+/// operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are all zeros,
+/// * `0` - otherwise.
+///
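+/// # Example
+///
+/// A minimal sketch: bits of `a` outside the mask are ignored, so the
+/// test below reports "all zeros" even though `a` is non-zero.
+///
+/// ```
+/// # #[cfg(target_arch = "x86")]
+/// # use std::arch::x86::*;
+/// # #[cfg(target_arch = "x86_64")]
+/// # use std::arch::x86_64::*;
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.1") {
+/// # unsafe {
+/// let a = _mm_setr_epi32(1, 0, 0, 0);
+/// let mask = _mm_setr_epi32(0, 0, 0, -1);
+/// // (a & mask) == 0, so the result is 1.
+/// assert_eq!(_mm_testz_si128(a, mask), 1);
+/// # }
+/// # }
+/// # }
+/// ```
+///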
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
+ ptestz(a.as_i64x2(), mask.as_i64x2())
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are all
+/// ones.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+/// operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are all ones,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
+ ptestc(a.as_i64x2(), mask.as_i64x2())
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are
+/// neither all zeros nor all ones.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+/// operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are neither all zeros nor all ones,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
+ ptestnzc(a.as_i64x2(), mask.as_i64x2())
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are all
+/// zeros.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+/// operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are all zeros,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
+ _mm_testz_si128(a, mask)
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are all
+/// ones.
+///
+/// Argument:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+///
+/// Returns:
+///
+/// * `1` - if the bits specified in the operand are all set to 1,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pcmpeqd))]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
+ _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are
+/// neither all zeros nor all ones.
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit integer vector containing the bits to be tested.
+/// * `mask` - A 128-bit integer vector selecting which bits to test in
+/// operand `a`.
+///
+/// Returns:
+///
+/// * `1` - if the specified bits are neither all zeros nor all ones,
+/// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
+ _mm_testnzc_si128(a, mask)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sse41.pblendvb"]
+ fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
+ #[link_name = "llvm.x86.sse41.blendvpd"]
+ fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
+ #[link_name = "llvm.x86.sse41.blendvps"]
+ fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
+ #[link_name = "llvm.x86.sse41.blendpd"]
+ fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
+ #[link_name = "llvm.x86.sse41.blendps"]
+ fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
+ #[link_name = "llvm.x86.sse41.pblendw"]
+ fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
+ #[link_name = "llvm.x86.sse41.insertps"]
+ fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
+ #[link_name = "llvm.x86.sse41.pmaxsb"]
+ fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
+ #[link_name = "llvm.x86.sse41.pmaxuw"]
+ fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
+ #[link_name = "llvm.x86.sse41.pmaxsd"]
+ fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sse41.pmaxud"]
+ fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
+ #[link_name = "llvm.x86.sse41.pminsb"]
+ fn pminsb(a: i8x16, b: i8x16) -> i8x16;
+ #[link_name = "llvm.x86.sse41.pminuw"]
+ fn pminuw(a: u16x8, b: u16x8) -> u16x8;
+ #[link_name = "llvm.x86.sse41.pminsd"]
+ fn pminsd(a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.sse41.pminud"]
+ fn pminud(a: u32x4, b: u32x4) -> u32x4;
+ #[link_name = "llvm.x86.sse41.packusdw"]
+ fn packusdw(a: i32x4, b: i32x4) -> u16x8;
+ #[link_name = "llvm.x86.sse41.dppd"]
+ fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
+ #[link_name = "llvm.x86.sse41.dpps"]
+ fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
+ #[link_name = "llvm.x86.sse41.round.pd"]
+ fn roundpd(a: __m128d, rounding: i32) -> __m128d;
+ #[link_name = "llvm.x86.sse41.round.ps"]
+ fn roundps(a: __m128, rounding: i32) -> __m128;
+ #[link_name = "llvm.x86.sse41.round.sd"]
+ fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
+ #[link_name = "llvm.x86.sse41.round.ss"]
+ fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
+ #[link_name = "llvm.x86.sse41.phminposuw"]
+ fn phminposuw(a: u16x8) -> u16x8;
+ #[link_name = "llvm.x86.sse41.pmuldq"]
+ fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
+ #[link_name = "llvm.x86.sse41.mpsadbw"]
+ fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
+ #[link_name = "llvm.x86.sse41.ptestz"]
+ fn ptestz(a: i64x2, mask: i64x2) -> i32;
+ #[link_name = "llvm.x86.sse41.ptestc"]
+ fn ptestc(a: i64x2, mask: i64x2) -> i32;
+ #[link_name = "llvm.x86.sse41.ptestnzc"]
+ fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86::*;
+ use std::mem;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_blendv_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let mask = _mm_setr_epi8(
+ 0, -1, 0, -1, 0, -1, 0, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1,
+ );
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
+ );
+ assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_blendv_pd() {
+ let a = _mm_set1_pd(0.0);
+ let b = _mm_set1_pd(1.0);
+ let mask = transmute(_mm_setr_epi64x(0, -1));
+ let r = _mm_blendv_pd(a, b, mask);
+ let e = _mm_setr_pd(0.0, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_blendv_ps() {
+ let a = _mm_set1_ps(0.0);
+ let b = _mm_set1_ps(1.0);
+ let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
+ let r = _mm_blendv_ps(a, b, mask);
+ let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_blend_pd() {
+ let a = _mm_set1_pd(0.0);
+ let b = _mm_set1_pd(1.0);
+ let r = _mm_blend_pd::<0b10>(a, b);
+ let e = _mm_setr_pd(0.0, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_blend_ps() {
+ let a = _mm_set1_ps(0.0);
+ let b = _mm_set1_ps(1.0);
+ let r = _mm_blend_ps::<0b1010>(a, b);
+ let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_blend_epi16() {
+ let a = _mm_set1_epi16(0);
+ let b = _mm_set1_epi16(1);
+ let r = _mm_blend_epi16::<0b1010_1100>(a, b);
+ let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_extract_ps() {
+ let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
+ let r: f32 = transmute(_mm_extract_ps::<1>(a));
+ assert_eq!(r, 1.0);
+ let r: f32 = transmute(_mm_extract_ps::<3>(a));
+ assert_eq!(r, 3.0);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_extract_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ -1, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ let r1 = _mm_extract_epi8::<0>(a);
+ let r2 = _mm_extract_epi8::<3>(a);
+ assert_eq!(r1, 0xFF);
+ assert_eq!(r2, 3);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_extract_epi32() {
+ let a = _mm_setr_epi32(0, 1, 2, 3);
+ let r = _mm_extract_epi32::<1>(a);
+ assert_eq!(r, 1);
+ let r = _mm_extract_epi32::<3>(a);
+ assert_eq!(r, 3);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_insert_ps() {
+ let a = _mm_set1_ps(1.0);
+ let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let r = _mm_insert_ps::<0b11_00_1100>(a, b);
+ let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_insert_epi8() {
+ let a = _mm_set1_epi8(0);
+ let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm_insert_epi8::<1>(a, 32);
+ assert_eq_m128i(r, e);
+ let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
+ let r = _mm_insert_epi8::<14>(a, 32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_insert_epi32() {
+ let a = _mm_set1_epi32(0);
+ let e = _mm_setr_epi32(0, 32, 0, 0);
+ let r = _mm_insert_epi32::<1>(a, 32);
+ assert_eq_m128i(r, e);
+ let e = _mm_setr_epi32(0, 0, 0, 32);
+ let r = _mm_insert_epi32::<3>(a, 32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_max_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 4, 5, 8, 9, 12, 13, 16,
+ 17, 20, 21, 24, 25, 28, 29, 32,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31,
+ );
+ let r = _mm_max_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 2, 4, 6, 8, 10, 12, 14, 16,
+ 18, 20, 22, 24, 26, 28, 30, 32,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_max_epu16() {
+ let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm_max_epu16(a, b);
+ let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_max_epi32() {
+ let a = _mm_setr_epi32(1, 4, 5, 8);
+ let b = _mm_setr_epi32(2, 3, 6, 7);
+ let r = _mm_max_epi32(a, b);
+ let e = _mm_setr_epi32(2, 4, 6, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_max_epu32() {
+ let a = _mm_setr_epi32(1, 4, 5, 8);
+ let b = _mm_setr_epi32(2, 3, 6, 7);
+ let r = _mm_max_epu32(a, b);
+ let e = _mm_setr_epi32(2, 4, 6, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_min_epi8_1() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 4, 5, 8, 9, 12, 13, 16,
+ 17, 20, 21, 24, 25, 28, 29, 32,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31,
+ );
+ let r = _mm_min_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 1, 3, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_min_epi8_2() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, -4, -5, 8, -9, -12, 13, -16,
+ 17, 20, 21, 24, 25, 28, 29, 32,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 2, -3, -6, 7, -10, -11, 14, -15,
+ 18, 19, 22, 23, 26, 27, 30, 31,
+ );
+ let r = _mm_min_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm_setr_epi8(
+ 1, -4, -6, 7, -10, -12, 13, -16,
+ 17, 19, 21, 23, 25, 27, 29, 31,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_min_epu16() {
+ let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm_min_epu16(a, b);
+ let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_min_epi32_1() {
+ let a = _mm_setr_epi32(1, 4, 5, 8);
+ let b = _mm_setr_epi32(2, 3, 6, 7);
+ let r = _mm_min_epi32(a, b);
+ let e = _mm_setr_epi32(1, 3, 5, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_min_epi32_2() {
+ let a = _mm_setr_epi32(-1, 4, 5, -7);
+ let b = _mm_setr_epi32(-2, 3, -6, 8);
+ let r = _mm_min_epi32(a, b);
+ let e = _mm_setr_epi32(-2, 3, -6, -7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_min_epu32() {
+ let a = _mm_setr_epi32(1, 4, 5, 8);
+ let b = _mm_setr_epi32(2, 3, 6, 7);
+ let r = _mm_min_epu32(a, b);
+ let e = _mm_setr_epi32(1, 3, 5, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_packus_epi32() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let b = _mm_setr_epi32(-1, -2, -3, -4);
+ let r = _mm_packus_epi32(a, b);
+ let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cmpeq_epi64() {
+ let a = _mm_setr_epi64x(0, 1);
+ let b = _mm_setr_epi64x(0, 0);
+ let r = _mm_cmpeq_epi64(a, b);
+ let e = _mm_setr_epi64x(-1, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepi8_epi16() {
+ let a = _mm_set1_epi8(10);
+ let r = _mm_cvtepi8_epi16(a);
+ let e = _mm_set1_epi16(10);
+ assert_eq_m128i(r, e);
+ let a = _mm_set1_epi8(-10);
+ let r = _mm_cvtepi8_epi16(a);
+ let e = _mm_set1_epi16(-10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepi8_epi32() {
+ let a = _mm_set1_epi8(10);
+ let r = _mm_cvtepi8_epi32(a);
+ let e = _mm_set1_epi32(10);
+ assert_eq_m128i(r, e);
+ let a = _mm_set1_epi8(-10);
+ let r = _mm_cvtepi8_epi32(a);
+ let e = _mm_set1_epi32(-10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepi8_epi64() {
+ let a = _mm_set1_epi8(10);
+ let r = _mm_cvtepi8_epi64(a);
+ let e = _mm_set1_epi64x(10);
+ assert_eq_m128i(r, e);
+ let a = _mm_set1_epi8(-10);
+ let r = _mm_cvtepi8_epi64(a);
+ let e = _mm_set1_epi64x(-10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepi16_epi32() {
+ let a = _mm_set1_epi16(10);
+ let r = _mm_cvtepi16_epi32(a);
+ let e = _mm_set1_epi32(10);
+ assert_eq_m128i(r, e);
+ let a = _mm_set1_epi16(-10);
+ let r = _mm_cvtepi16_epi32(a);
+ let e = _mm_set1_epi32(-10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepi16_epi64() {
+ let a = _mm_set1_epi16(10);
+ let r = _mm_cvtepi16_epi64(a);
+ let e = _mm_set1_epi64x(10);
+ assert_eq_m128i(r, e);
+ let a = _mm_set1_epi16(-10);
+ let r = _mm_cvtepi16_epi64(a);
+ let e = _mm_set1_epi64x(-10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepi32_epi64() {
+ let a = _mm_set1_epi32(10);
+ let r = _mm_cvtepi32_epi64(a);
+ let e = _mm_set1_epi64x(10);
+ assert_eq_m128i(r, e);
+ let a = _mm_set1_epi32(-10);
+ let r = _mm_cvtepi32_epi64(a);
+ let e = _mm_set1_epi64x(-10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepu8_epi16() {
+ let a = _mm_set1_epi8(10);
+ let r = _mm_cvtepu8_epi16(a);
+ let e = _mm_set1_epi16(10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepu8_epi32() {
+ let a = _mm_set1_epi8(10);
+ let r = _mm_cvtepu8_epi32(a);
+ let e = _mm_set1_epi32(10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepu8_epi64() {
+ let a = _mm_set1_epi8(10);
+ let r = _mm_cvtepu8_epi64(a);
+ let e = _mm_set1_epi64x(10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepu16_epi32() {
+ let a = _mm_set1_epi16(10);
+ let r = _mm_cvtepu16_epi32(a);
+ let e = _mm_set1_epi32(10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepu16_epi64() {
+ let a = _mm_set1_epi16(10);
+ let r = _mm_cvtepu16_epi64(a);
+ let e = _mm_set1_epi64x(10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_cvtepu32_epi64() {
+ let a = _mm_set1_epi32(10);
+ let r = _mm_cvtepu32_epi64(a);
+ let e = _mm_set1_epi64x(10);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_dp_pd() {
+ let a = _mm_setr_pd(2.0, 3.0);
+ let b = _mm_setr_pd(1.0, 4.0);
+ let e = _mm_setr_pd(14.0, 0.0);
+ assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_dp_ps() {
+ let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
+ let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
+ let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
+ assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_floor_pd() {
+ let a = _mm_setr_pd(2.5, 4.5);
+ let r = _mm_floor_pd(a);
+ let e = _mm_setr_pd(2.0, 4.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_floor_ps() {
+ let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
+ let r = _mm_floor_ps(a);
+ let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_floor_sd() {
+ let a = _mm_setr_pd(2.5, 4.5);
+ let b = _mm_setr_pd(-1.5, -3.5);
+ let r = _mm_floor_sd(a, b);
+ let e = _mm_setr_pd(-2.0, 4.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_floor_ss() {
+ let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
+ let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
+ let r = _mm_floor_ss(a, b);
+ let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_ceil_pd() {
+ let a = _mm_setr_pd(1.5, 3.5);
+ let r = _mm_ceil_pd(a);
+ let e = _mm_setr_pd(2.0, 4.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_ceil_ps() {
+ let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+ let r = _mm_ceil_ps(a);
+ let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_ceil_sd() {
+ let a = _mm_setr_pd(1.5, 3.5);
+ let b = _mm_setr_pd(-2.5, -4.5);
+ let r = _mm_ceil_sd(a, b);
+ let e = _mm_setr_pd(-2.0, 3.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_ceil_ss() {
+ let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+ let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
+ let r = _mm_ceil_ss(a, b);
+ let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_round_pd() {
+ let a = _mm_setr_pd(1.25, 3.75);
+ let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
+ let e = _mm_setr_pd(1.0, 4.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_round_ps() {
+ let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
+ let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
+ let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_round_sd() {
+ let a = _mm_setr_pd(1.5, 3.5);
+ let b = _mm_setr_pd(-2.5, -4.5);
+ let old_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
+ let r = _mm_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ _MM_SET_ROUNDING_MODE(old_mode);
+ let e = _mm_setr_pd(-2.0, 3.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_round_ss() {
+ let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+ let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
+ let old_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ let r = _mm_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ _MM_SET_ROUNDING_MODE(old_mode);
+ let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_minpos_epu16_1() {
+ let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
+ let r = _mm_minpos_epu16(a);
+ let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_minpos_epu16_2() {
+ let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
+ let r = _mm_minpos_epu16(a);
+ let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_mul_epi32() {
+ {
+ let a = _mm_setr_epi32(1, 1, 1, 1);
+ let b = _mm_setr_epi32(1, 2, 3, 4);
+ let r = _mm_mul_epi32(a, b);
+ let e = _mm_setr_epi64x(1, 3);
+ assert_eq_m128i(r, e);
+ }
+ {
+ let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
+ let b = _mm_setr_epi32(
+ -20, -256 /* ignored */,
+ 666666, 666666 /* ignored */,
+ );
+ let r = _mm_mul_epi32(a, b);
+ let e = _mm_setr_epi64x(-300, 823043843622);
+ assert_eq_m128i(r, e);
+ }
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_mullo_epi32() {
+ {
+ let a = _mm_setr_epi32(1, 1, 1, 1);
+ let b = _mm_setr_epi32(1, 2, 3, 4);
+ let r = _mm_mullo_epi32(a, b);
+ let e = _mm_setr_epi32(1, 2, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+ {
+ let a = _mm_setr_epi32(15, -2, 1234567, 99999);
+ let b = _mm_setr_epi32(-20, -256, 666666, -99999);
+ let r = _mm_mullo_epi32(a, b);
+ // Note: the full product overflows 32 bits, so the low 32 bits
+ // of 1234567 * 666666 are reinterpreted as a signed value in r[2]:
+ // 1234567 * 666666 = -1589877210 (low 32 bits, as i32)
+ let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
+ assert_eq_m128i(r, e);
+ }
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_minpos_epu16() {
+ let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
+ let r = _mm_minpos_epu16(a);
+ let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_mpsadbw_epu8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+
+ let r = _mm_mpsadbw_epu8::<0b000>(a, a);
+ let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
+ assert_eq_m128i(r, e);
+
+ let r = _mm_mpsadbw_epu8::<0b001>(a, a);
+ let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
+ assert_eq_m128i(r, e);
+
+ let r = _mm_mpsadbw_epu8::<0b100>(a, a);
+ let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
+ assert_eq_m128i(r, e);
+
+ let r = _mm_mpsadbw_epu8::<0b101>(a, a);
+ let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
+ assert_eq_m128i(r, e);
+
+ let r = _mm_mpsadbw_epu8::<0b111>(a, a);
+ let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_testz_si128() {
+ let a = _mm_set1_epi8(1);
+ let mask = _mm_set1_epi8(0);
+ let r = _mm_testz_si128(a, mask);
+ assert_eq!(r, 1);
+ let a = _mm_set1_epi8(0b101);
+ let mask = _mm_set1_epi8(0b110);
+ let r = _mm_testz_si128(a, mask);
+ assert_eq!(r, 0);
+ let a = _mm_set1_epi8(0b011);
+ let mask = _mm_set1_epi8(0b100);
+ let r = _mm_testz_si128(a, mask);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_testc_si128() {
+ let a = _mm_set1_epi8(-1);
+ let mask = _mm_set1_epi8(0);
+ let r = _mm_testc_si128(a, mask);
+ assert_eq!(r, 1);
+ let a = _mm_set1_epi8(0b101);
+ let mask = _mm_set1_epi8(0b110);
+ let r = _mm_testc_si128(a, mask);
+ assert_eq!(r, 0);
+ let a = _mm_set1_epi8(0b101);
+ let mask = _mm_set1_epi8(0b100);
+ let r = _mm_testc_si128(a, mask);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_testnzc_si128() {
+ let a = _mm_set1_epi8(0);
+ let mask = _mm_set1_epi8(1);
+ let r = _mm_testnzc_si128(a, mask);
+ assert_eq!(r, 0);
+ let a = _mm_set1_epi8(-1);
+ let mask = _mm_set1_epi8(0);
+ let r = _mm_testnzc_si128(a, mask);
+ assert_eq!(r, 0);
+ let a = _mm_set1_epi8(0b101);
+ let mask = _mm_set1_epi8(0b110);
+ let r = _mm_testnzc_si128(a, mask);
+ assert_eq!(r, 1);
+ let a = _mm_set1_epi8(0b101);
+ let mask = _mm_set1_epi8(0b101);
+ let r = _mm_testnzc_si128(a, mask);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_test_all_zeros() {
+ let a = _mm_set1_epi8(1);
+ let mask = _mm_set1_epi8(0);
+ let r = _mm_test_all_zeros(a, mask);
+ assert_eq!(r, 1);
+ let a = _mm_set1_epi8(0b101);
+ let mask = _mm_set1_epi8(0b110);
+ let r = _mm_test_all_zeros(a, mask);
+ assert_eq!(r, 0);
+ let a = _mm_set1_epi8(0b011);
+ let mask = _mm_set1_epi8(0b100);
+ let r = _mm_test_all_zeros(a, mask);
+ assert_eq!(r, 1);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_test_all_ones() {
+ let a = _mm_set1_epi8(-1);
+ let r = _mm_test_all_ones(a);
+ assert_eq!(r, 1);
+ let a = _mm_set1_epi8(0b101);
+ let r = _mm_test_all_ones(a);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_test_mix_ones_zeros() {
+ let a = _mm_set1_epi8(0);
+ let mask = _mm_set1_epi8(1);
+ let r = _mm_test_mix_ones_zeros(a, mask);
+ assert_eq!(r, 0);
+ let a = _mm_set1_epi8(-1);
+ let mask = _mm_set1_epi8(0);
+ let r = _mm_test_mix_ones_zeros(a, mask);
+ assert_eq!(r, 0);
+ let a = _mm_set1_epi8(0b101);
+ let mask = _mm_set1_epi8(0b110);
+ let r = _mm_test_mix_ones_zeros(a, mask);
+ assert_eq!(r, 1);
+ let a = _mm_set1_epi8(0b101);
+ let mask = _mm_set1_epi8(0b101);
+ let r = _mm_test_mix_ones_zeros(a, mask);
+ assert_eq!(r, 0);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse42.rs b/library/stdarch/crates/core_arch/src/x86/sse42.rs
new file mode 100644
index 000000000..f474b0671
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse42.rs
@@ -0,0 +1,802 @@
+//! Streaming SIMD Extensions 4.2 (SSE4.2)
+//!
+//! Extends SSE4.1 with STTNI (String and Text New Instructions).
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+/// String contains unsigned 8-bit characters *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_UBYTE_OPS: i32 = 0b0000_0000;
+/// String contains unsigned 16-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_UWORD_OPS: i32 = 0b0000_0001;
+/// String contains signed 8-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_SBYTE_OPS: i32 = 0b0000_0010;
+/// String contains signed 16-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_SWORD_OPS: i32 = 0b0000_0011;
+
+/// For each character in `b`, find if it is in the set `a` *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_CMP_EQUAL_ANY: i32 = 0b0000_0000;
+/// For each character `c` in `b`, determine if
+/// `a[0] <= c <= a[1] or a[2] <= c <= a[3]...`
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_CMP_RANGES: i32 = 0b0000_0100;
+/// The strings defined by `a` and `b` are equal
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000;
+/// Search for the defined substring in the target
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_CMP_EQUAL_ORDERED: i32 = 0b0000_1100;
+
+/// Do not negate results *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_POSITIVE_POLARITY: i32 = 0b0000_0000;
+/// Negates results
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000;
+/// Do not negate results before the end of the string
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_MASKED_POSITIVE_POLARITY: i32 = 0b0010_0000;
+/// Negates results only before the end of the string
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_MASKED_NEGATIVE_POLARITY: i32 = 0b0011_0000;
+
+/// **Index only**: return the index of the least significant match *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_LEAST_SIGNIFICANT: i32 = 0b0000_0000;
+/// **Index only**: return the index of the most significant match
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_MOST_SIGNIFICANT: i32 = 0b0100_0000;
+
+/// **Mask only**: return the bit mask
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_BIT_MASK: i32 = 0b0000_0000;
+/// **Mask only**: return the byte mask
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000;
+
+/// Compares packed strings with implicit lengths in `a` and `b` using the
+/// control in `IMM8`, and returns the generated mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrm)
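+///
+/// # Examples
+///
+/// A brief illustrative sketch (the byte strings are made up for this
+/// example): with `_SIDD_UNIT_MASK`, the result is a byte mask that is all
+/// ones wherever the corresponding character of `b` occurs in the set `a`.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// // Illustrative data: mark the vowels in a 16-byte chunk of text.
+/// let set = b"aeiou\0\0\0\0\0\0\0\0\0\0\0";
+/// let text = b"vector intrinsic";
+/// let a = _mm_loadu_si128(set.as_ptr() as *const _);
+/// let b = _mm_loadu_si128(text.as_ptr() as *const _);
+/// let mask = _mm_cmpistrm(a, b, _SIDD_UNIT_MASK);
+/// # let _ = mask;
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```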
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistrm, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistrm<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(pcmpistrm128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8))
+}
+
+/// Compares packed strings with implicit lengths in `a` and `b` using the
+/// control in `IMM8`, and returns the generated index. Similar to
+/// [`_mm_cmpestri`] with the exception that [`_mm_cmpestri`] requires the
+/// lengths of `a` and `b` to be explicitly specified.
+///
+/// # Control modes
+///
+/// The control specified by `IMM8` may be one or more of the following.
+///
+/// ## Data size and signedness
+///
+/// - [`_SIDD_UBYTE_OPS`] - Default
+/// - [`_SIDD_UWORD_OPS`]
+/// - [`_SIDD_SBYTE_OPS`]
+/// - [`_SIDD_SWORD_OPS`]
+///
+/// ## Comparison options
+/// - [`_SIDD_CMP_EQUAL_ANY`] - Default
+/// - [`_SIDD_CMP_RANGES`]
+/// - [`_SIDD_CMP_EQUAL_EACH`]
+/// - [`_SIDD_CMP_EQUAL_ORDERED`]
+///
+/// ## Result polarity
+/// - [`_SIDD_POSITIVE_POLARITY`] - Default
+/// - [`_SIDD_NEGATIVE_POLARITY`]
+///
+/// ## Bit returned
+/// - [`_SIDD_LEAST_SIGNIFICANT`] - Default
+/// - [`_SIDD_MOST_SIGNIFICANT`]
+///
+/// # Examples
+///
+/// Finds a substring using [`_SIDD_CMP_EQUAL_ORDERED`]
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// let haystack = b"This is a long string of text data\r\n\tthat extends
+/// multiple lines";
+/// let needle = b"\r\n\t\0\0\0\0\0\0\0\0\0\0\0\0\0";
+///
+/// let a = _mm_loadu_si128(needle.as_ptr() as *const _);
+/// let hop = 16;
+/// let mut indexes = Vec::new();
+///
+/// // Chunk the haystack into 16 byte chunks and find
+/// // the first "\r\n\t" in the chunk.
+/// for (i, chunk) in haystack.chunks(hop).enumerate() {
+/// let b = _mm_loadu_si128(chunk.as_ptr() as *const _);
+/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ORDERED);
+/// if idx != 16 {
+/// indexes.push((idx as usize) + (i * hop));
+/// }
+/// }
+/// assert_eq!(indexes, vec![34]);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// The `_mm_cmpistri` intrinsic may also be used to find whether any of a
+/// given set of characters occurs in the haystack.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// // Ensure your input is padded out to 16 bytes
+/// let password = b"hunter2\0\0\0\0\0\0\0\0\0";
+/// let special_chars = b"!@#$%^&*()[]:;<>";
+///
+/// // Load the input
+/// let a = _mm_loadu_si128(special_chars.as_ptr() as *const _);
+/// let b = _mm_loadu_si128(password.as_ptr() as *const _);
+///
+/// // Use _SIDD_CMP_EQUAL_ANY to find the index of the first byte of
+/// // the password that is in the special character set.
+/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ANY);
+///
+/// if idx < 16 {
+/// println!("Congrats! Your password contains a special character");
+/// # panic!("{:?} does not contain a special character", password);
+/// } else {
+/// println!("Your password should contain a special character");
+/// }
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// Finds the index of the first character in the haystack that is within a
+/// range of characters.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// # let b = b":;<=>?@[\\]^_`abc";
+/// # let b = _mm_loadu_si128(b.as_ptr() as *const _);
+///
+/// // Specify the ranges of values to be searched for [A-Za-z0-9].
+/// let a = b"AZaz09\0\0\0\0\0\0\0\0\0\0";
+/// let a = _mm_loadu_si128(a.as_ptr() as *const _);
+///
+/// // Use _SIDD_CMP_RANGES to find the index of the first byte that
+/// // falls in one of the ranges, which in this case is the first
+/// // alphanumeric byte in the string.
+/// let idx = _mm_cmpistri(a, b, _SIDD_CMP_RANGES);
+///
+/// if idx < 16 {
+/// println!("Found an alpha numeric character");
+/// # assert_eq!(idx, 13);
+/// } else {
+/// println!("Did not find an alpha numeric character");
+/// }
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// Working with 16-bit characters.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// # let mut some_utf16_words = [0u16; 8];
+/// # let mut more_utf16_words = [0u16; 8];
+/// # '❤'.encode_utf16(&mut some_utf16_words);
+/// # '𝕊'.encode_utf16(&mut more_utf16_words);
+/// // Load the input
+/// let a = _mm_loadu_si128(some_utf16_words.as_ptr() as *const _);
+/// let b = _mm_loadu_si128(more_utf16_words.as_ptr() as *const _);
+///
+/// // Specify _SIDD_UWORD_OPS to compare words instead of bytes, and
+/// // use _SIDD_CMP_EQUAL_EACH to compare the two strings.
+/// let idx = _mm_cmpistri(a, b, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH);
+///
+/// if idx == 0 {
+/// println!("16-bit unicode strings were equal!");
+/// # panic!("Strings should not be equal!")
+/// } else {
+/// println!("16-bit unicode strings were not equal!");
+/// }
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistri)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistri<const IMM8: i32>(a: __m128i, b: __m128i) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpistri128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)
+}
+
+/// Compares packed strings with implicit lengths in `a` and `b` using the
+/// control in `IMM8`, and returns `1` if any character in `b` was null,
+/// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrz)
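+///
+/// # Examples
+///
+/// A small illustrative sketch (the input bytes are made up): the return
+/// value tells whether the 16-byte chunk in `b` contains a null terminator.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// let needle = b"end\0\0\0\0\0\0\0\0\0\0\0\0\0";
+/// // This chunk contains a null byte, so `_mm_cmpistrz` returns 1.
+/// let chunk = b"short\0 padding..";
+/// let a = _mm_loadu_si128(needle.as_ptr() as *const _);
+/// let b = _mm_loadu_si128(chunk.as_ptr() as *const _);
+/// assert_eq!(_mm_cmpistrz(a, b, _SIDD_CMP_EQUAL_ORDERED), 1);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```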
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistrz<const IMM8: i32>(a: __m128i, b: __m128i) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpistriz128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)
+}
+
+/// Compares packed strings with implicit lengths in `a` and `b` using the
+/// control in `IMM8`, and returns `1` if the resulting mask was non-zero,
+/// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrc)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistrc<const IMM8: i32>(a: __m128i, b: __m128i) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpistric128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)
+}
+
+/// Compares packed strings with implicit lengths in `a` and `b` using the
+/// control in `IMM8`, and returns `1` if any character in `a` was null,
+/// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrs)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistrs<const IMM8: i32>(a: __m128i, b: __m128i) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpistris128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)
+}
+
+/// Compares packed strings with implicit lengths in `a` and `b` using the
+/// control in `IMM8`, and returns bit `0` of the resulting bit mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistro)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistro<const IMM8: i32>(a: __m128i, b: __m128i) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpistrio128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)
+}
+
+/// Compares packed strings with implicit lengths in `a` and `b` using the
+/// control in `IMM8`, and returns `1` if `b` did not contain a null
+/// character and the resulting mask was zero, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistra)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpistri, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpistra<const IMM8: i32>(a: __m128i, b: __m128i) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpistria128(a.as_i8x16(), b.as_i8x16(), IMM8 as i8)
+}
+
+/// Compares packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `IMM8`, and returns the generated mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestrm, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestrm<const IMM8: i32>(a: __m128i, la: i32, b: __m128i, lb: i32) -> __m128i {
+ static_assert_imm8!(IMM8);
+ transmute(pcmpestrm128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8))
+}
+
+/// Compares packed strings `a` and `b` with lengths `la` and `lb` using the
+/// control in `IMM8`, and returns the generated index. Similar to
+/// [`_mm_cmpistri`] with the exception that [`_mm_cmpistri`] implicitly
+/// determines the length of `a` and `b`.
+///
+/// # Control modes
+///
+/// The control specified by `IMM8` may be one or more of the following.
+///
+/// ## Data size and signedness
+///
+/// - [`_SIDD_UBYTE_OPS`] - Default
+/// - [`_SIDD_UWORD_OPS`]
+/// - [`_SIDD_SBYTE_OPS`]
+/// - [`_SIDD_SWORD_OPS`]
+///
+/// ## Comparison options
+/// - [`_SIDD_CMP_EQUAL_ANY`] - Default
+/// - [`_SIDD_CMP_RANGES`]
+/// - [`_SIDD_CMP_EQUAL_EACH`]
+/// - [`_SIDD_CMP_EQUAL_ORDERED`]
+///
+/// ## Result polarity
+/// - [`_SIDD_POSITIVE_POLARITY`] - Default
+/// - [`_SIDD_NEGATIVE_POLARITY`]
+///
+/// ## Bit returned
+/// - [`_SIDD_LEAST_SIGNIFICANT`] - Default
+/// - [`_SIDD_MOST_SIGNIFICANT`]
+///
+/// # Examples
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+///
+/// // The string we want to find a substring in
+/// let haystack = b"Split \r\n\t line ";
+///
+/// // The string we want to search for with some
+/// // extra bytes we do not want to search for.
+/// let needle = b"\r\n\t ignore this ";
+///
+/// let a = _mm_loadu_si128(needle.as_ptr() as *const _);
+/// let b = _mm_loadu_si128(haystack.as_ptr() as *const _);
+///
+/// // Note: We explicitly specify we only want to search `b` for the
+/// // first 3 characters of a.
+/// let idx = _mm_cmpestri(a, 3, b, 15, _SIDD_CMP_EQUAL_ORDERED);
+///
+/// assert_eq!(idx, 6);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [`_SIDD_UBYTE_OPS`]: constant._SIDD_UBYTE_OPS.html
+/// [`_SIDD_UWORD_OPS`]: constant._SIDD_UWORD_OPS.html
+/// [`_SIDD_SBYTE_OPS`]: constant._SIDD_SBYTE_OPS.html
+/// [`_SIDD_SWORD_OPS`]: constant._SIDD_SWORD_OPS.html
+/// [`_SIDD_CMP_EQUAL_ANY`]: constant._SIDD_CMP_EQUAL_ANY.html
+/// [`_SIDD_CMP_RANGES`]: constant._SIDD_CMP_RANGES.html
+/// [`_SIDD_CMP_EQUAL_EACH`]: constant._SIDD_CMP_EQUAL_EACH.html
+/// [`_SIDD_CMP_EQUAL_ORDERED`]: constant._SIDD_CMP_EQUAL_ORDERED.html
+/// [`_SIDD_POSITIVE_POLARITY`]: constant._SIDD_POSITIVE_POLARITY.html
+/// [`_SIDD_NEGATIVE_POLARITY`]: constant._SIDD_NEGATIVE_POLARITY.html
+/// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html
+/// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html
+/// [`_mm_cmpistri`]: fn._mm_cmpistri.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestri)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestri<const IMM8: i32>(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpestri128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)
+}
+
+/// Compares packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `IMM8`, and returns `1` if any character in
+/// `b` was null, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrz)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestrz<const IMM8: i32>(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpestriz128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)
+}
+
+/// Compares packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `IMM8`, and returns `1` if the resulting mask
+/// was non-zero, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrc)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestrc<const IMM8: i32>(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpestric128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)
+}
+
+/// Compares packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `IMM8`, and returns `1` if any character in
+/// `a` was null, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrs)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestrs<const IMM8: i32>(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpestris128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)
+}
+
+/// Compares packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `IMM8`, and returns bit `0` of the resulting
+/// bit mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestro)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestro<const IMM8: i32>(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpestrio128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)
+}
+
+/// Compares packed strings in `a` and `b` with lengths `la` and `lb`
+/// using the control in `IMM8`, and returns `1` if `b` did not
+/// contain a null character and the resulting mask was zero, and `0`
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestra)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpestri, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpestra<const IMM8: i32>(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
+ static_assert_imm8!(IMM8);
+ pcmpestria128(a.as_i8x16(), la, b.as_i8x16(), lb, IMM8 as i8)
+}
+
+/// Starting with the initial value in `crc`, returns the accumulated
+/// CRC32-C value for unsigned 8-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u8)
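+///
+/// # Examples
+///
+/// A minimal sketch of accumulating a CRC32-C checksum over a byte slice,
+/// one byte at a time (the all-ones seed and final inversion are a common
+/// CRC32-C convention, but conventions vary by protocol):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4.2") {
+/// # #[target_feature(enable = "sse4.2")]
+/// # unsafe fn worker() {
+/// let data = b"123456789";
+/// // Start from all ones and invert at the end (illustrative convention).
+/// let mut crc: u32 = !0;
+/// for &byte in data.iter() {
+/// crc = _mm_crc32_u8(crc, byte);
+/// }
+/// let crc = !crc;
+/// # let _ = crc;
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```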
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 {
+ crc32_32_8(crc, v)
+}
+
+/// Starting with the initial value in `crc`, returns the accumulated
+/// CRC32-C value for unsigned 16-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u16)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 {
+ crc32_32_16(crc, v)
+}
+
+/// Starting with the initial value in `crc`, returns the accumulated
+/// CRC32-C value for unsigned 32-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u32)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_crc32_u32(crc: u32, v: u32) -> u32 {
+ crc32_32_32(crc, v)
+}
+
+/// Compares packed 64-bit integers in `a` and `b` for greater-than,
+/// and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi64)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(pcmpgtq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i {
+ transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ // SSE 4.2 string and text comparison ops
+ #[link_name = "llvm.x86.sse42.pcmpestrm128"]
+ fn pcmpestrm128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> u8x16;
+ #[link_name = "llvm.x86.sse42.pcmpestri128"]
+ fn pcmpestri128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpestriz128"]
+ fn pcmpestriz128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpestric128"]
+ fn pcmpestric128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpestris128"]
+ fn pcmpestris128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpestrio128"]
+ fn pcmpestrio128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpestria128"]
+ fn pcmpestria128(a: i8x16, la: i32, b: i8x16, lb: i32, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpistrm128"]
+ fn pcmpistrm128(a: i8x16, b: i8x16, imm8: i8) -> i8x16;
+ #[link_name = "llvm.x86.sse42.pcmpistri128"]
+ fn pcmpistri128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpistriz128"]
+ fn pcmpistriz128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpistric128"]
+ fn pcmpistric128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpistris128"]
+ fn pcmpistris128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpistrio128"]
+ fn pcmpistrio128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+ #[link_name = "llvm.x86.sse42.pcmpistria128"]
+ fn pcmpistria128(a: i8x16, b: i8x16, imm8: i8) -> i32;
+ // SSE 4.2 CRC instructions
+ #[link_name = "llvm.x86.sse42.crc32.32.8"]
+ fn crc32_32_8(crc: u32, v: u8) -> u32;
+ #[link_name = "llvm.x86.sse42.crc32.32.16"]
+ fn crc32_32_16(crc: u32, v: u16) -> u32;
+ #[link_name = "llvm.x86.sse42.crc32.32.32"]
+ fn crc32_32_32(crc: u32, v: u32) -> u32;
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+ use std::ptr;
+
+ // Currently one cannot `load` a `&[u8]` that is less than 16 bytes
+ // long. This makes loading strings shorter than 16 bytes a bit
+ // difficult. Rather than `load`ing and mutating the `__m128i`, it
+ // is easier to memcpy the given string into a local 16-byte buffer
+ // and `load` that buffer.
+ #[target_feature(enable = "sse4.2")]
+ unsafe fn str_to_m128i(s: &[u8]) -> __m128i {
+ assert!(s.len() <= 16);
+ let slice = &mut [0u8; 16];
+ // `as_ptr` is valid even for an empty slice, unlike `get_unchecked(0)`.
+ ptr::copy_nonoverlapping(s.as_ptr(), slice.as_mut_ptr(), s.len());
+ _mm_loadu_si128(slice.as_ptr() as *const _)
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpistrm() {
+ let a = str_to_m128i(b"Hello! Good-Bye!");
+ let b = str_to_m128i(b"hello! good-bye!");
+ let i = _mm_cmpistrm::<_SIDD_UNIT_MASK>(a, b);
+ #[rustfmt::skip]
+ let res = _mm_setr_epi8(
+ 0x00, !0, !0, !0, !0, !0, !0, 0x00,
+ !0, !0, !0, !0, 0x00, !0, !0, !0,
+ );
+ assert_eq_m128i(i, res);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpistri() {
+ let a = str_to_m128i(b"Hello");
+ let b = str_to_m128i(b" Hello ");
+ let i = _mm_cmpistri::<_SIDD_CMP_EQUAL_ORDERED>(a, b);
+ assert_eq!(3, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpistrz() {
+ let a = str_to_m128i(b"");
+ let b = str_to_m128i(b"Hello");
+ let i = _mm_cmpistrz::<_SIDD_CMP_EQUAL_ORDERED>(a, b);
+ assert_eq!(1, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpistrc() {
+ let a = str_to_m128i(b" ");
+ let b = str_to_m128i(b" ! ");
+ let i = _mm_cmpistrc::<_SIDD_UNIT_MASK>(a, b);
+ assert_eq!(1, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpistrs() {
+ let a = str_to_m128i(b"Hello");
+ let b = str_to_m128i(b"");
+ let i = _mm_cmpistrs::<_SIDD_CMP_EQUAL_ORDERED>(a, b);
+ assert_eq!(1, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpistro() {
+ #[rustfmt::skip]
+ let a_bytes = _mm_setr_epi8(
+ 0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
+ 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ );
+ #[rustfmt::skip]
+ let b_bytes = _mm_setr_epi8(
+ 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
+ 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ );
+ let a = a_bytes;
+ let b = b_bytes;
+ let i = _mm_cmpistro::<{ _SIDD_UWORD_OPS | _SIDD_UNIT_MASK }>(a, b);
+ assert_eq!(0, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpistra() {
+ let a = str_to_m128i(b"");
+ let b = str_to_m128i(b"Hello!!!!!!!!!!!");
+ let i = _mm_cmpistra::<_SIDD_UNIT_MASK>(a, b);
+ assert_eq!(1, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpestrm() {
+ let a = str_to_m128i(b"Hello!");
+ let b = str_to_m128i(b"Hello.");
+ let i = _mm_cmpestrm::<_SIDD_UNIT_MASK>(a, 5, b, 5);
+ #[rustfmt::skip]
+ let r = _mm_setr_epi8(
+ !0, !0, !0, !0, !0, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ );
+ assert_eq_m128i(i, r);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpestri() {
+ let a = str_to_m128i(b"bar - garbage");
+ let b = str_to_m128i(b"foobar");
+ let i = _mm_cmpestri::<_SIDD_CMP_EQUAL_ORDERED>(a, 3, b, 6);
+ assert_eq!(3, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpestrz() {
+ let a = str_to_m128i(b"");
+ let b = str_to_m128i(b"Hello");
+ let i = _mm_cmpestrz::<_SIDD_CMP_EQUAL_ORDERED>(a, 16, b, 6);
+ assert_eq!(1, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpestrc() {
+ let va = str_to_m128i(b"!!!!!!!!");
+ let vb = str_to_m128i(b" ");
+ let i = _mm_cmpestrc::<_SIDD_UNIT_MASK>(va, 7, vb, 7);
+ assert_eq!(0, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpestrs() {
+ #[rustfmt::skip]
+ let a_bytes = _mm_setr_epi8(
+ 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
+ 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ );
+ let a = a_bytes;
+ let b = _mm_set1_epi8(0x00);
+ let i = _mm_cmpestrs::<_SIDD_UWORD_OPS>(a, 8, b, 0);
+ assert_eq!(0, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpestro() {
+ let a = str_to_m128i(b"Hello");
+ let b = str_to_m128i(b"World");
+ let i = _mm_cmpestro::<_SIDD_UBYTE_OPS>(a, 5, b, 5);
+ assert_eq!(0, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpestra() {
+ let a = str_to_m128i(b"Cannot match a");
+ let b = str_to_m128i(b"Null after 14");
+ let i = _mm_cmpestra::<{ _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK }>(a, 14, b, 16);
+ assert_eq!(1, i);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_crc32_u8() {
+ let crc = 0x2aa1e72b;
+ let v = 0x2a;
+ let i = _mm_crc32_u8(crc, v);
+ assert_eq!(i, 0xf24122e4);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_crc32_u16() {
+ let crc = 0x8ecec3b5;
+ let v = 0x22b;
+ let i = _mm_crc32_u16(crc, v);
+ assert_eq!(i, 0x13bb2fb);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_crc32_u32() {
+ let crc = 0xae2912c8;
+ let v = 0x845fed;
+ let i = _mm_crc32_u32(crc, v);
+ assert_eq!(i, 0xffae2ed1);
+ }
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_cmpgt_epi64() {
+ let a = _mm_setr_epi64x(0, 0x2a);
+ let b = _mm_set1_epi64x(0x00);
+ let i = _mm_cmpgt_epi64(a, b);
+ assert_eq_m128i(i, _mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64));
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/sse4a.rs b/library/stdarch/crates/core_arch/src/x86/sse4a.rs
new file mode 100644
index 000000000..976c907cb
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/sse4a.rs
@@ -0,0 +1,164 @@
+//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`)
+
+use crate::{
+ core_arch::{simd::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sse4a.extrq"]
+ fn extrq(x: i64x2, y: i8x16) -> i64x2;
+ #[link_name = "llvm.x86.sse4a.insertq"]
+ fn insertq(x: i64x2, y: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.sse4a.movnt.sd"]
+ fn movntsd(x: *mut f64, y: __m128d);
+ #[link_name = "llvm.x86.sse4a.movnt.ss"]
+ fn movntss(x: *mut f32, y: __m128);
+}
+
+// FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ
+// FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ
+
+/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
+///
+/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
+/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All
+/// other bits are ignored.
+///
+/// If the length is zero, it is interpreted as `64`. If the length and index
+/// are zero, the lower 64 bits of `x` are extracted.
+///
+/// If `length == 0 && index > 0` or `length + index > 64` the result is
+/// undefined.
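+///
+/// A small sketch of encoding `index` and `length` into `y` (the values
+/// here are illustrative):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4a") {
+/// # #[target_feature(enable = "sse4a")]
+/// # unsafe fn worker() {
+/// let x = _mm_set_epi64x(0, 0xABCD);
+/// let (len, idx) = (8_i64, 4_i64);
+/// // Pack the length into bits [5:0] and the index into bits [13:8].
+/// let y = _mm_set_epi64x(0, (idx << 8) | len);
+/// // Extracts bits [11:4] of 0xABCD, zero-extended: 0xBC.
+/// let r = _mm_extract_si64(x, y);
+/// # let _ = r;
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```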
+#[inline]
+#[target_feature(enable = "sse4a")]
+#[cfg_attr(test, assert_instr(extrq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i {
+ transmute(extrq(x.as_i64x2(), y.as_i8x16()))
+}
+
+/// Inserts the low `length` bits of `y` into `x` at `index`.
+///
+/// The bits of `y`:
+///
+/// - `[69:64]` specify the `length`,
+/// - `[77:72]` specify the `index`.
+///
+/// If the `length` is zero it is interpreted as `64`. If `index + length > 64`
+/// or `index > 0 && length == 0` the result is undefined.
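+///
+/// A small sketch mirroring the extract example (the values here are
+/// illustrative):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4a") {
+/// # #[target_feature(enable = "sse4a")]
+/// # unsafe fn worker() {
+/// let x = _mm_set_epi64x(0, 0b1010_1010_1010);
+/// let field = 0b0110_i64; // the low `length` bits to insert
+/// let (len, idx) = (4_i64, 8_i64);
+/// // `length` goes in bits [69:64] and `index` in bits [77:72], i.e.
+/// // bits [5:0] and [13:8] of the upper 64-bit lane of `y`.
+/// let y = _mm_set_epi64x((idx << 8) | len, field);
+/// // Bits [11:8] of `x` are replaced: the result is 0b0110_1010_1010.
+/// let r = _mm_insert_si64(x, y);
+/// # let _ = r;
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```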
+#[inline]
+#[target_feature(enable = "sse4a")]
+#[cfg_attr(test, assert_instr(insertq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
+ transmute(insertq(x.as_i64x2(), y.as_i64x2()))
+}
+
+/// Non-temporal store of `a.0` into `p`.
+///
+/// Writes 64-bit data to a memory location without polluting the caches.
+#[inline]
+#[target_feature(enable = "sse4a")]
+#[cfg_attr(test, assert_instr(movntsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
+ movntsd(p, a);
+}
+
+/// Non-temporal store of `a.0` into `p`.
+///
+/// Writes 32-bit data to a memory location without polluting the caches.
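+///
+/// A minimal single-threaded usage sketch (illustrative; non-temporal
+/// stores are weakly ordered, so an `_mm_sfence` is needed before the data
+/// is handed to another thread):
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("sse4a") {
+/// # #[target_feature(enable = "sse4a")]
+/// # unsafe fn worker() {
+/// let mut out = [0.0_f32; 4];
+/// let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+/// // Stream the lowest lane to out[0] without polluting the cache.
+/// _mm_stream_ss(out.as_mut_ptr(), v);
+/// // Order the store before any cross-thread hand-off (illustrative here).
+/// _mm_sfence();
+/// assert_eq!(out[0], 1.0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```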
+#[inline]
+#[target_feature(enable = "sse4a")]
+#[cfg_attr(test, assert_instr(movntss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
+ movntss(p, a);
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86::*;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "sse4a")]
+ unsafe fn test_mm_extract_si64() {
+ let b = 0b0110_0000_0000_i64;
+ // ^^^^ bit range extracted
+ let x = _mm_setr_epi64x(b, 0);
+ let v = 0b001000___00___000100_i64;
+ // ^idx: 2^3 = 8 ^length = 2^2 = 4
+ let y = _mm_setr_epi64x(v, 0);
+ let e = _mm_setr_epi64x(0b0110_i64, 0);
+ let r = _mm_extract_si64(x, y);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "sse4a")]
+ unsafe fn test_mm_insert_si64() {
+ let i = 0b0110_i64;
+ // ^^^^ bit range inserted
+ let z = 0b1010_1010_1010i64;
+ // ^^^^ bit range replaced
+ let e = 0b0110_1010_1010i64;
+ // ^^^^ replaced 1010 with 0110
+ let x = _mm_setr_epi64x(z, 0);
+ let expected = _mm_setr_epi64x(e, 0);
+ let v = 0b001000___00___000100_i64;
+ // ^idx: 2^3 = 8 ^length = 2^2 = 4
+ let y = _mm_setr_epi64x(i, v);
+ let r = _mm_insert_si64(x, y);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[repr(align(16))]
+ struct MemoryF64 {
+ data: [f64; 2],
+ }
+
+ #[simd_test(enable = "sse4a")]
+ unsafe fn test_mm_stream_sd() {
+ let mut mem = MemoryF64 {
+ data: [1.0_f64, 2.0],
+ };
+ {
+ let vals = &mut mem.data;
+ let d = vals.as_mut_ptr();
+
+ let x = _mm_setr_pd(3.0, 4.0);
+
+ _mm_stream_sd(d, x);
+ }
+ assert_eq!(mem.data[0], 3.0);
+ assert_eq!(mem.data[1], 2.0);
+ }
+
+ #[repr(align(16))]
+ struct MemoryF32 {
+ data: [f32; 4],
+ }
+
+ #[simd_test(enable = "sse4a")]
+ unsafe fn test_mm_stream_ss() {
+ let mut mem = MemoryF32 {
+ data: [1.0_f32, 2.0, 3.0, 4.0],
+ };
+ {
+ let vals = &mut mem.data;
+ let d = vals.as_mut_ptr();
+
+ let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+
+ _mm_stream_ss(d, x);
+ }
+ assert_eq!(mem.data[0], 5.0);
+ assert_eq!(mem.data[1], 2.0);
+ assert_eq!(mem.data[2], 3.0);
+ assert_eq!(mem.data[3], 4.0);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/ssse3.rs b/library/stdarch/crates/core_arch/src/x86/ssse3.rs
new file mode 100644
index 000000000..4beb496b6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/ssse3.rs
@@ -0,0 +1,537 @@
+//! Supplemental Streaming SIMD Extensions 3 (SSSE3)
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Computes the absolute value of packed 8-bit signed integers in `a` and
+/// returns the unsigned results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
+ transmute(pabsb128(a.as_i8x16()))
+}
+
+/// Computes the absolute value of each of the packed 16-bit signed integers
+/// in `a` and returns the 16-bit unsigned integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
+ transmute(pabsw128(a.as_i16x8()))
+}
+
+/// Computes the absolute value of each of the packed 32-bit signed integers
+/// in `a` and returns the 32-bit unsigned integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i {
+ transmute(pabsd128(a.as_i32x4()))
+}
+
+/// Shuffles bytes from `a` according to the content of `b`.
+///
+/// The low 4 bits of each byte of `b` are used as indices
+/// into the 16 bytes of `a`.
+///
+/// In addition, if the most significant bit of a byte of `b`
+/// is set, the respective destination byte is set to 0.
+///
+/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is
+/// logically equivalent to:
+///
+/// ```
+/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
+/// let mut r = [0u8; 16];
+/// for i in 0..16 {
+/// // if the most significant bit of b is set,
+/// // then the destination byte is set to 0.
+/// if b[i] & 0x80 == 0u8 {
+/// r[i] = a[(b[i] % 16) as usize];
+/// }
+/// }
+/// r
+/// }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pshufb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pshufb128(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Concatenates the 16-byte blocks in `a` and `b` into a 32-byte temporary
+/// result, shifts the result right by `IMM8` bytes, and returns the low 16
+/// bytes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8)
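+///
+/// # Examples
+///
+/// A brief sketch (the lane values are illustrative): shifting the 32-byte
+/// concatenation `a:b` right by 4 bytes keeps the last 12 bytes of `b`
+/// followed by the first 4 bytes of `a`.
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("ssse3") {
+/// # #[target_feature(enable = "ssse3")]
+/// # unsafe fn worker() {
+/// let a = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+/// let b = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/// // The result holds bytes 4..=15 of `b` followed by bytes 0..=3 of `a`,
+/// // i.e. the values 4, 5, ..., 19.
+/// let r = _mm_alignr_epi8(a, b, 4);
+/// # let _ = r;
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```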
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(palignr, IMM8 = 15))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm8!(IMM8);
+ // If palignr is shifting the pair of vectors by more than their
+ // combined size (32 bytes), emit zero.
+ if IMM8 > 32 {
+ return _mm_set1_epi8(0);
+ }
+ // If palignr is shifting the pair of input vectors by more than one
+ // vector's width (16 bytes), but not more than two, shift zeroes in
+ // from the top instead.
+ let (a, b) = if IMM8 > 16 {
+ (_mm_set1_epi8(0), a)
+ } else {
+ (a, b)
+ };
+ const fn mask(shift: u32, i: u32) -> u32 {
+ if shift > 32 {
+ // Unused, but needs to be a valid index.
+ i
+ } else if shift > 16 {
+ shift - 16 + i
+ } else {
+ shift + i
+ }
+ }
+ let r: i8x16 = simd_shuffle16!(
+ b.as_i8x16(),
+ a.as_i8x16(),
+ <const IMM8: i32> [
+ mask(IMM8 as u32, 0),
+ mask(IMM8 as u32, 1),
+ mask(IMM8 as u32, 2),
+ mask(IMM8 as u32, 3),
+ mask(IMM8 as u32, 4),
+ mask(IMM8 as u32, 5),
+ mask(IMM8 as u32, 6),
+ mask(IMM8 as u32, 7),
+ mask(IMM8 as u32, 8),
+ mask(IMM8 as u32, 9),
+ mask(IMM8 as u32, 10),
+ mask(IMM8 as u32, 11),
+ mask(IMM8 as u32, 12),
+ mask(IMM8 as u32, 13),
+ mask(IMM8 as u32, 14),
+ mask(IMM8 as u32, 15),
+ ],
+ );
+ transmute(r)
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[8 x i16]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(phaddw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are
+/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_epi16)
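+///
+/// Picturing `a` and `b` as `[i16; 8]`, the operation is logically
+/// equivalent to the following sketch (a model of the semantics, not the
+/// actual implementation):
+///
+/// ```
+/// fn mm_hadds_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
+/// let mut r = [0i16; 8];
+/// for i in 0..4 {
+/// // The low four lanes come from `a`, the high four from `b`;
+/// // each adjacent pair is combined with saturating addition.
+/// r[i] = a[2 * i].saturating_add(a[2 * i + 1]);
+/// r[i + 4] = b[2 * i].saturating_add(b[2 * i + 1]);
+/// }
+/// r
+/// }
+/// ```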
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(phaddsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[4 x i32]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(phaddd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[8 x i16]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(phsubw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than
+/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+/// saturated to 8000h.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(phsubsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[4 x i32]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(phsubd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// values contained in the first source operand and packed 8-bit signed
+/// integer values contained in the second source operand, adds pairs of
+/// contiguous products with signed saturation, and writes the 16-bit sums to
+/// the corresponding bits in the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_epi16)
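+///
+/// Picturing `a` as `[u8; 16]` and `b` as `[i8; 16]`, the operation is
+/// logically equivalent to the following sketch (a model of the semantics,
+/// not the actual implementation):
+///
+/// ```
+/// fn mm_maddubs_epi16(a: [u8; 16], b: [i8; 16]) -> [i16; 8] {
+/// let mut r = [0i16; 8];
+/// for i in 0..8 {
+/// // Multiply unsigned bytes of `a` by signed bytes of `b`, then add
+/// // each adjacent pair of products with signed saturation.
+/// let sum = i32::from(a[2 * i]) * i32::from(b[2 * i])
+/// + i32::from(a[2 * i + 1]) * i32::from(b[2 * i + 1]);
+/// r[i] = sum.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
+/// }
+/// r
+/// }
+/// ```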
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pmaddubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16()))
+}
+
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// product to the 18 most significant bits by right-shifting, rounds the
+/// truncated value by adding 1, and writes bits `[16:1]` to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_epi16)
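+///
+/// Per 16-bit lane, the computation is logically equivalent to this scalar
+/// sketch (a model of the semantics, not the actual implementation):
+///
+/// ```
+/// fn mulhrs_lane(a: i16, b: i16) -> i16 {
+/// // Widen and multiply, keep the top 18 bits, round, then take bits [16:1].
+/// let t = (i32::from(a) * i32::from(b)) >> 14;
+/// ((t + 1) >> 1) as i16
+/// }
+/// ```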
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pmulhrsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
+ transmute(psignb128(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
+ transmute(psignw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i {
+ transmute(psignd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.ssse3.pabs.b.128"]
+ fn pabsb128(a: i8x16) -> u8x16;
+
+ #[link_name = "llvm.x86.ssse3.pabs.w.128"]
+ fn pabsw128(a: i16x8) -> u16x8;
+
+ #[link_name = "llvm.x86.ssse3.pabs.d.128"]
+ fn pabsd128(a: i32x4) -> u32x4;
+
+ #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
+ fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
+
+ #[link_name = "llvm.x86.ssse3.phadd.w.128"]
+ fn phaddw128(a: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.ssse3.phadd.sw.128"]
+ fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.ssse3.phadd.d.128"]
+ fn phaddd128(a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.ssse3.phsub.w.128"]
+ fn phsubw128(a: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.ssse3.phsub.sw.128"]
+ fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.ssse3.phsub.d.128"]
+ fn phsubd128(a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
+ fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;
+
+ #[link_name = "llvm.x86.ssse3.pmul.hr.sw.128"]
+ fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.ssse3.psign.b.128"]
+ fn psignb128(a: i8x16, b: i8x16) -> i8x16;
+
+ #[link_name = "llvm.x86.ssse3.psign.w.128"]
+ fn psignw128(a: i16x8, b: i16x8) -> i16x8;
+
+ #[link_name = "llvm.x86.ssse3.psign.d.128"]
+ fn psignd128(a: i32x4, b: i32x4) -> i32x4;
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_abs_epi8() {
+ let r = _mm_abs_epi8(_mm_set1_epi8(-5));
+ assert_eq_m128i(r, _mm_set1_epi8(5));
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_abs_epi16() {
+ let r = _mm_abs_epi16(_mm_set1_epi16(-5));
+ assert_eq_m128i(r, _mm_set1_epi16(5));
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_abs_epi32() {
+ let r = _mm_abs_epi32(_mm_set1_epi32(-5));
+ assert_eq_m128i(r, _mm_set1_epi32(5));
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 4, 128_u8 as i8, 4, 3,
+ 24, 12, 6, 19,
+ 12, 5, 5, 10,
+ 4, 1, 8, 0,
+ );
+ let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
+ let r = _mm_shuffle_epi8(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_alignr_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 4, 63, 4, 3,
+ 24, 12, 6, 19,
+ 12, 5, 5, 10,
+ 4, 1, 8, 0,
+ );
+ let r = _mm_alignr_epi8::<33>(a, b);
+ assert_eq_m128i(r, _mm_set1_epi8(0));
+
+ let r = _mm_alignr_epi8::<17>(a, b);
+ #[rustfmt::skip]
+ let expected = _mm_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 0,
+ );
+ assert_eq_m128i(r, expected);
+
+ let r = _mm_alignr_epi8::<16>(a, b);
+ assert_eq_m128i(r, a);
+
+ let r = _mm_alignr_epi8::<15>(a, b);
+ #[rustfmt::skip]
+ let expected = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ assert_eq_m128i(r, expected);
+
+ let r = _mm_alignr_epi8::<0>(a, b);
+ assert_eq_m128i(r, b);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_hadd_epi16() {
+ let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
+ let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 36, 25);
+ let r = _mm_hadd_epi16(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_hadds_epi16() {
+ let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_setr_epi16(4, 128, 4, 3, 32767, 1, -32768, -1);
+ let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 32767, -32768);
+ let r = _mm_hadds_epi16(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_hadd_epi32() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let b = _mm_setr_epi32(4, 128, 4, 3);
+ let expected = _mm_setr_epi32(3, 7, 132, 7);
+ let r = _mm_hadd_epi32(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_hsub_epi16() {
+ let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
+ let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 12, -13);
+ let r = _mm_hsub_epi16(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_hsubs_epi16() {
+ let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
+ let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 32767, -32768);
+ let r = _mm_hsubs_epi16(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_hsub_epi32() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let b = _mm_setr_epi32(4, 128, 4, 3);
+ let expected = _mm_setr_epi32(-1, -1, -124, 1);
+ let r = _mm_hsub_epi32(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_maddubs_epi16() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 4, 63, 4, 3,
+ 24, 12, 6, 19,
+ 12, 5, 5, 10,
+ 4, 1, 8, 0,
+ );
+ let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120);
+ let r = _mm_maddubs_epi16(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_mulhrs_epi16() {
+ let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
+ let expected = _mm_setr_epi16(0, 0, 0, 0, 5, 0, -7, 0);
+ let r = _mm_mulhrs_epi16(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_sign_epi8() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, -14, -15, 16,
+ );
+ #[rustfmt::skip]
+ let b = _mm_setr_epi8(
+ 4, 63, -4, 3, 24, 12, -6, -19,
+ 12, 5, -5, 10, 4, 1, -8, 0,
+ );
+ #[rustfmt::skip]
+ let expected = _mm_setr_epi8(
+ 1, 2, -3, 4, 5, 6, -7, -8,
+ 9, 10, -11, 12, 13, -14, 15, 0,
+ );
+ let r = _mm_sign_epi8(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_sign_epi16() {
+ let a = _mm_setr_epi16(1, 2, 3, 4, -5, -6, 7, 8);
+ let b = _mm_setr_epi16(4, 128, 0, 3, 1, -1, -2, 1);
+ let expected = _mm_setr_epi16(1, 2, 0, 4, -5, 6, -7, 8);
+ let r = _mm_sign_epi16(a, b);
+ assert_eq_m128i(r, expected);
+ }
+
+ #[simd_test(enable = "ssse3")]
+ unsafe fn test_mm_sign_epi32() {
+ let a = _mm_setr_epi32(-1, 2, 3, 4);
+ let b = _mm_setr_epi32(1, -1, 1, 0);
+ let expected = _mm_setr_epi32(-1, -2, 3, 0);
+ let r = _mm_sign_epi32(a, b);
+ assert_eq_m128i(r, expected);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/tbm.rs b/library/stdarch/crates/core_arch/src/x86/tbm.rs
new file mode 100644
index 000000000..d1102a116
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/tbm.rs
@@ -0,0 +1,460 @@
+//! Trailing Bit Manipulation (TBM) instruction set.
+//!
+//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3:
+//! General-Purpose and System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the available
+//! instructions.
+//!
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+// FIXME(blocked on #248)
+// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select:
+// intrinsic %llvm.x86.tbm.bextri.u32
+/*
+#[allow(dead_code)]
+extern "C" {
+ #[link_name="llvm.x86.tbm.bextri.u32"]
+ fn x86_tbm_bextri_u32(a: u32, y: u32) -> u32;
+ #[link_name="llvm.x86.tbm.bextri.u64"]
+ fn x86_tbm_bextri_u64(x: u64, y: u64) -> u64;
+}
+
+/// Extracts bits in the range [`start`, `start` + `len`) from `a` into
+/// the least significant bits of the result.
+#[inline]
+#[target_feature(enable = "tbm")]
+pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
+ _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
+}
+
+/// Extracts bits in the range [`start`, `start` + `len`) from `a` into
+/// the least significant bits of the result.
+#[inline]
+#[target_feature(enable = "tbm")]
+pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
+ _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7,0]` of `control` specify the index of the first bit in the range to
+/// be extracted, and bits `[15,8]` specify the length of the range.
+#[inline]
+#[target_feature(enable = "tbm")]
+pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
+ unsafe { x86_tbm_bextri_u32(a, control) }
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7,0]` of `control` specify the index of the first bit in the range to
+/// be extracted, and bits `[15,8]` specify the length of the range.
+#[inline]
+#[target_feature(enable = "tbm")]
+pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
+ unsafe { x86_tbm_bextri_u64(a, control) }
+}
+*/
+
+/// Clears all bits below the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcfill))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcfill_u32(x: u32) -> u32 {
+ x & (x.wrapping_add(1))
+}
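+
+// Bit-level sketch (illustrative): for x = 0b0101_0111, x.wrapping_add(1) is
+// 0b0101_1000, so x & (x + 1) = 0b0101_0000: the trailing run of ones below
+// the least significant zero bit is cleared.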
+
+/// Clears all bits below the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcfill))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcfill_u64(x: u64) -> u64 {
+ x & (x.wrapping_add(1))
+}
+
+/// Sets all bits of `x` to 1 except for the least significant zero bit.
+///
+/// If there is no zero bit in `x`, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blci))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blci_u32(x: u32) -> u32 {
+ x | !(x.wrapping_add(1))
+}
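+
+// Bit-level sketch (illustrative): for x = 0b0101_0000, x + 1 = 0b0101_0001
+// and !(x + 1) = 0b1111...1010_1110, so x | !(x + 1) = 0b1111...1111_1110:
+// every bit is set except bit 0, the least significant zero bit of x.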
+
+/// Sets all bits of `x` to 1 except for the least significant zero bit.
+///
+/// If there is no zero bit in `x`, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blci))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blci_u64(x: u64) -> u64 {
+ x | !(x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x` and clears all other bits.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcic))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcic_u32(x: u32) -> u32 {
+ !x & (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x` and clears all other bits.
+///
+/// If there is no zero bit in `x`, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcic))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcic_u64(x: u64) -> u64 {
+ !x & (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x` and clears all bits above
+/// that bit.
+///
+/// If there is no zero bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
+ x ^ (x.wrapping_add(1))
+}
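+
+// Bit-level sketch (illustrative): for x = 0b0101_0001, x + 1 = 0b0101_0010,
+// so x ^ (x + 1) = 0b0000_0011: the least significant zero bit (bit 1) is
+// set, the trailing ones below it are kept, and all higher bits are cleared.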
+
+/// Sets the least significant zero bit of `x` and clears all bits above
+/// that bit.
+///
+/// If there is no zero bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcmsk))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
+ x ^ (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns `x`.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcs))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcs_u32(x: u32) -> u32 {
+ x | (x.wrapping_add(1))
+}
+
+/// Sets the least significant zero bit of `x`.
+///
+/// If there is no zero bit in `x`, it returns `x`.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blcs))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blcs_u64(x: u64) -> u64 {
+ x | x.wrapping_add(1)
+}
+
+/// Sets all bits of `x` below the least significant one.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsfill))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsfill_u32(x: u32) -> u32 {
+ x | (x.wrapping_sub(1))
+}
+
+/// Sets all bits of `x` below the least significant one.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsfill))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsfill_u64(x: u64) -> u64 {
+ x | (x.wrapping_sub(1))
+}
+
+/// Clears the least significant bit and sets all other bits.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsic))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsic_u32(x: u32) -> u32 {
+ !x | (x.wrapping_sub(1))
+}
+
+/// Clears the least significant bit and sets all other bits.
+///
+/// If there is no set bit in `x`, it sets all the bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(blsic))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsic_u64(x: u64) -> u64 {
+ !x | (x.wrapping_sub(1))
+}
+
+/// Clears all bits below the least significant zero of `x` and sets all other
+/// bits.
+///
+/// If the least significant bit of `x` is `0`, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(t1mskc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
+ !x | (x.wrapping_add(1))
+}
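+
+// Bit-level sketch (illustrative): for x = 0b0101_0111, x + 1 = 0b0101_1000
+// and !x = 0b1111...1010_1000, so !x | (x + 1) = 0b1111...1111_1000: the
+// trailing ones of x (bits 2:0) end up cleared and every other bit is set.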
+
+/// Clears all bits below the least significant zero of `x` and sets all other
+/// bits.
+///
+/// If the least significant bit of `x` is `0`, it sets all bits.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(t1mskc))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
+ !x | (x.wrapping_add(1))
+}
+
+/// Sets all bits below the least significant one of `x` and clears all other
+/// bits.
+///
+/// If the least significant bit of `x` is 1, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(tzmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
+ !x & (x.wrapping_sub(1))
+}
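+
+// Bit-level sketch (illustrative): for x = 0b0101_1000, x - 1 = 0b0101_0111
+// and !x = 0b1111...1010_0111, so !x & (x - 1) = 0b0000_0111: a mask of
+// exactly the trailing zeros of x.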
+
+/// Sets all bits below the least significant one of `x` and clears all other
+/// bits.
+///
+/// If the least significant bit of `x` is 1, it returns zero.
+#[inline]
+#[target_feature(enable = "tbm")]
+#[cfg_attr(test, assert_instr(tzmsk))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
+ !x & (x.wrapping_sub(1))
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ /*
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_bextr_u32() {
+ assert_eq!(_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_bextr_u64() {
+ assert_eq!(_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
+ }
+ */
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_blcfill_u32() {
+ assert_eq!(_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
+ assert_eq!(_blcfill_u32(0b1111_1111u32), 0u32);
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ unsafe fn test_blcfill_u64() {
+ assert_eq!(_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
+ assert_eq!(_blcfill_u64(0b1111_1111u64), 0u64);
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_blci_u32() {
+ assert_eq!(
+ _blci_u32(0b0101_0000u32),
+ 0b1111_1111_1111_1111_1111_1111_1111_1110u32
+ );
+ assert_eq!(
+ _blci_u32(0b1111_1111u32),
+ 0b1111_1111_1111_1111_1111_1110_1111_1111u32
+ );
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ #[rustfmt::skip]
+ unsafe fn test_blci_u64() {
+ assert_eq!(
+ _blci_u64(0b0101_0000u64),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64
+ );
+ assert_eq!(
+ _blci_u64(0b1111_1111u64),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64
+ );
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_blcic_u32() {
+ assert_eq!(_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
+ assert_eq!(_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ unsafe fn test_blcic_u64() {
+ assert_eq!(_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
+ assert_eq!(_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_blcmsk_u32() {
+ assert_eq!(_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
+ assert_eq!(_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ unsafe fn test_blcmsk_u64() {
+ assert_eq!(_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
+ assert_eq!(_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_blcs_u32() {
+ assert_eq!(_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
+ assert_eq!(_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ unsafe fn test_blcs_u64() {
+ assert_eq!(_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
+ assert_eq!(_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_blsfill_u32() {
+ assert_eq!(_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
+ assert_eq!(
+ _blsfill_u32(0u32),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111u32
+ );
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ #[rustfmt::skip]
+ unsafe fn test_blsfill_u64() {
+ assert_eq!(_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
+ assert_eq!(
+ _blsfill_u64(0u64),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
+ );
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_blsic_u32() {
+ assert_eq!(
+ _blsic_u32(0b0101_0100u32),
+ 0b1111_1111_1111_1111_1111_1111_1111_1011u32
+ );
+ assert_eq!(
+ _blsic_u32(0u32),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111u32
+ );
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ #[rustfmt::skip]
+ unsafe fn test_blsic_u64() {
+ assert_eq!(
+ _blsic_u64(0b0101_0100u64),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64
+ );
+ assert_eq!(
+ _blsic_u64(0u64),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
+ );
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_t1mskc_u32() {
+ assert_eq!(
+ _t1mskc_u32(0b0101_0111u32),
+ 0b1111_1111_1111_1111_1111_1111_1111_1000u32
+ );
+ assert_eq!(
+ _t1mskc_u32(0u32),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111u32
+ );
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ #[rustfmt::skip]
+ unsafe fn test_t1mskc_u64() {
+ assert_eq!(
+ _t1mskc_u64(0b0101_0111u64),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64
+ );
+ assert_eq!(
+ _t1mskc_u64(0u64),
+ 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
+ );
+ }
+
+ #[simd_test(enable = "tbm")]
+ unsafe fn test_tzmsk_u32() {
+ assert_eq!(_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
+ assert_eq!(_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
+ }
+
+ #[simd_test(enable = "tbm")]
+ #[cfg(not(target_arch = "x86"))]
+ unsafe fn test_tzmsk_u64() {
+ assert_eq!(_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
+ assert_eq!(_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/test.rs b/library/stdarch/crates/core_arch/src/x86/test.rs
new file mode 100644
index 000000000..bab89e61a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/test.rs
@@ -0,0 +1,144 @@
+//! Utilities used in testing the x86 intrinsics
+
+use crate::core_arch::x86::*;
+use std::mem::transmute;
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+ assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
+ if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 {
+ transmute::<_, [f64; 2]>(a)[idx]
+}
+
+#[target_feature(enable = "sse")]
+pub unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+ let r = _mm_cmpeq_ps(a, b);
+ if _mm_movemask_ps(r) != 0b1111 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+#[target_feature(enable = "sse")]
+pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
+ transmute::<_, [f32; 4]>(a)[idx]
+}
+
+// Not actually an intrinsic, but useful in various tests as we ported from
+// `i64x2::new`, which is backwards from `_mm_set_epi64x`.
+#[target_feature(enable = "sse2")]
+pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
+ _mm_set_epi64x(b, a)
+}
+
+#[target_feature(enable = "avx")]
+pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+ assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
+}
+
+#[target_feature(enable = "avx")]
+pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
+ let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b);
+ if _mm256_movemask_pd(cmp) != 0b1111 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+#[target_feature(enable = "avx")]
+pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 {
+ transmute::<_, [f64; 4]>(a)[idx]
+}
+
+#[target_feature(enable = "avx")]
+pub unsafe fn assert_eq_m256(a: __m256, b: __m256) {
+ let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b);
+ if _mm256_movemask_ps(cmp) != 0b11111111 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+#[target_feature(enable = "avx")]
+pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 {
+ transmute::<_, [f32; 8]>(a)[idx]
+}
+
+#[target_feature(enable = "avx512f")]
+pub unsafe fn get_m512(a: __m512, idx: usize) -> f32 {
+ transmute::<_, [f32; 16]>(a)[idx]
+}
+
+#[target_feature(enable = "avx512f")]
+pub unsafe fn get_m512d(a: __m512d, idx: usize) -> f64 {
+ transmute::<_, [f64; 8]>(a)[idx]
+}
+
+#[target_feature(enable = "avx512f")]
+pub unsafe fn get_m512i(a: __m512i, idx: usize) -> i64 {
+ transmute::<_, [i64; 8]>(a)[idx]
+}
+
+// These intrinsics don't exist on x86 because they require a 64-bit register,
+// which doesn't exist on x86!
+#[cfg(target_arch = "x86")]
+mod x86_polyfill {
+ use crate::core_arch::x86::*;
+
+ #[rustc_legacy_const_generics(2)]
+ pub unsafe fn _mm_insert_epi64<const INDEX: i32>(a: __m128i, val: i64) -> __m128i {
+ static_assert_imm1!(INDEX);
+ #[repr(C)]
+ union A {
+ a: __m128i,
+ b: [i64; 2],
+ }
+ let mut a = A { a };
+ a.b[INDEX as usize] = val;
+ a.a
+ }
+
+ #[target_feature(enable = "avx2")]
+ #[rustc_legacy_const_generics(2)]
+ pub unsafe fn _mm256_insert_epi64<const INDEX: i32>(a: __m256i, val: i64) -> __m256i {
+ static_assert_imm2!(INDEX);
+ #[repr(C)]
+ union A {
+ a: __m256i,
+ b: [i64; 4],
+ }
+ let mut a = A { a };
+ a.b[INDEX as usize] = val;
+ a.a
+ }
+}
+#[cfg(target_arch = "x86_64")]
+mod x86_polyfill {
+ pub use crate::core_arch::x86_64::{_mm256_insert_epi64, _mm_insert_epi64};
+}
+pub use self::x86_polyfill::*;
+
+pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
+ assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b))
+}
+
+pub unsafe fn assert_eq_m512(a: __m512, b: __m512) {
+ let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b);
+ if cmp != 0b11111111_11111111 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) {
+ let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b);
+ if cmp != 0b11111111 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/xsave.rs b/library/stdarch/crates/core_arch/src/x86/xsave.rs
new file mode 100644
index 000000000..30f807e44
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/xsave.rs
@@ -0,0 +1,282 @@
+//! `i586`'s `xsave` and `xsaveopt` target feature intrinsics
+#![allow(clippy::module_name_repetitions)]
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.xsave"]
+ fn xsave(p: *mut u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xrstor"]
+ fn xrstor(p: *const u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xsetbv"]
+ fn xsetbv(v: u32, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xgetbv"]
+ fn xgetbv(v: u32) -> i64;
+ #[link_name = "llvm.x86.xsaveopt"]
+ fn xsaveopt(p: *mut u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xsavec"]
+ fn xsavec(p: *mut u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xsaves"]
+ fn xsaves(p: *mut u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xrstors"]
+ fn xrstors(p: *const u8, hi: u32, lo: u32);
+}
+
+/// Performs a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits `[62:0]` in `save_mask` and XCR0.
+/// `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of
+/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsave)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xsave))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) {
+ xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
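+
+// Note on the split above: the XSAVE-family instructions take the state mask
+// in EDX:EAX, so the 64-bit `save_mask` is passed as its high and low 32-bit
+// halves.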
+
+/// Performs a full or partial restore of the enabled processor states using
+/// the state information stored in memory at `mem_addr`.
+///
+/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstor)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xrstor))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) {
+ xrstor(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
+}
+
+/// `XFEATURE_ENABLED_MASK` for `XCR`
+///
+/// This constant selects the `XFEATURE_ENABLED_MASK` register when passed to
+/// the `XSETBV` and `XGETBV` instructions.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub const _XCR_XFEATURE_ENABLED_MASK: u32 = 0;
+
+/// Copies 64-bits from `val` to the extended control register (`XCR`) specified
+/// by `a`.
+///
+/// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsetbv)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xsetbv))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsetbv(a: u32, val: u64) {
+ xsetbv(a, (val >> 32) as u32, val as u32);
+}
+
+/// Reads the contents of the extended control register `XCR`
+/// specified in `xcr_no`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xgetbv)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xgetbv))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
+ xgetbv(xcr_no) as u64
+}
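+
+// Usage sketch (illustrative): the common AVX OS-support check reads XCR0 and
+// tests that both XMM (bit 1) and YMM (bit 2) state are enabled.
+//
+// let xcr0 = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+// let avx_os_support = (xcr0 & 0b110) == 0b110; // XMM and YMM state enabled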
+
+/// Performs a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`.
+/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize
+/// the manner in which data is saved. The performance of this instruction will
+/// be equal to or better than using the `XSAVE` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaveopt)
+#[inline]
+#[target_feature(enable = "xsave,xsaveopt")]
+#[cfg_attr(test, assert_instr(xsaveopt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
+ xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Performs a full or partial save of the enabled processor states to memory
+/// at `mem_addr`.
+///
+/// `xsavec` differs from `xsave` in that it uses compaction and that it may
+/// use init optimization. State is saved based on bits `[62:0]` in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsavec)
+#[inline]
+#[target_feature(enable = "xsave,xsavec")]
+#[cfg_attr(test, assert_instr(xsavec))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
+ xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Performs a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// `xsaves` differs from `xsave` in that it can save state components
+/// corresponding to bits set in the `IA32_XSS` MSR and that it may use the
+/// modified optimization. State is saved based on bits `[62:0]` in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaves)
+#[inline]
+#[target_feature(enable = "xsave,xsaves")]
+#[cfg_attr(test, assert_instr(xsaves))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) {
+ xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Performs a full or partial restore of the enabled processor states using the
+/// state information stored in memory at `mem_addr`.
+///
+/// `xrstors` differs from `xrstor` in that it can restore state components
+/// corresponding to bits set in the `IA32_XSS` MSR; `xrstors` cannot restore
+/// from an `xsave` area in which the extended region is in the standard form.
+/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstors)
+#[inline]
+#[target_feature(enable = "xsave,xsaves")]
+#[cfg_attr(test, assert_instr(xrstors))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) {
+ xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
+}
+
+#[cfg(test)]
+mod tests {
+ use std::{fmt, prelude::v1::*};
+
+ use crate::core_arch::x86::*;
+ use stdarch_test::simd_test;
+
+ #[repr(align(64))]
+ struct XsaveArea {
+ // max size for 256-bit registers is 800 bytes:
+ // see https://software.intel.com/en-us/node/682996
+ // max size for 512-bit registers is 2560 bytes:
+ // FIXME: add source
+ data: [u8; 2560],
+ }
+
+ impl XsaveArea {
+ fn new() -> XsaveArea {
+ XsaveArea { data: [0; 2560] }
+ }
+ fn ptr(&mut self) -> *mut u8 {
+ &mut self.data[0] as *mut _ as *mut u8
+ }
+ }
+
+ impl PartialEq<XsaveArea> for XsaveArea {
+ fn eq(&self, other: &XsaveArea) -> bool {
+ for i in 0..self.data.len() {
+ if self.data[i] != other.data[i] {
+ return false;
+ }
+ }
+ true
+ }
+ }
+
+ impl fmt::Debug for XsaveArea {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "[")?;
+ for i in 0..self.data.len() {
+ write!(f, "{}", self.data[i])?;
+ if i != self.data.len() - 1 {
+ write!(f, ", ")?;
+ }
+ }
+ write!(f, "]")
+ }
+ }
+
+ // FIXME: https://github.com/rust-lang/stdarch/issues/209
+ /*
+ #[simd_test(enable = "xsave")]
+ unsafe fn xsave() {
+ let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+ let mut a = XsaveArea::new();
+ let mut b = XsaveArea::new();
+
+ _xsave(a.ptr(), m);
+ _xrstor(a.ptr(), m);
+ _xsave(b.ptr(), m);
+ assert_eq!(a, b);
+ }
+ */
+
+ #[simd_test(enable = "xsave")]
+ unsafe fn xgetbv_xsetbv() {
+ let xcr_n: u32 = _XCR_XFEATURE_ENABLED_MASK;
+
+ let xcr: u64 = _xgetbv(xcr_n);
+ // FIXME: XSETBV is a privileged instruction we should only test this
+ // when running in privileged mode:
+ //
+ // _xsetbv(xcr_n, xcr);
+ let xcr_cpy: u64 = _xgetbv(xcr_n);
+ assert_eq!(xcr, xcr_cpy);
+ }
+
+ // FIXME: https://github.com/rust-lang/stdarch/issues/209
+ /*
+ #[simd_test(enable = "xsave,xsaveopt")]
+ unsafe fn xsaveopt() {
+ let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+ let mut a = XsaveArea::new();
+ let mut b = XsaveArea::new();
+
+ _xsaveopt(a.ptr(), m);
+ _xrstor(a.ptr(), m);
+ _xsaveopt(b.ptr(), m);
+ assert_eq!(a, b);
+ }
+ */
+
+ // FIXME: this looks like a bug in Intel's SDE:
+ #[cfg(not(stdarch_intel_sde))]
+ #[simd_test(enable = "xsave,xsavec")]
+ unsafe fn xsavec() {
+ let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+ let mut a = XsaveArea::new();
+ let mut b = XsaveArea::new();
+
+ _xsavec(a.ptr(), m);
+ _xrstor(a.ptr(), m);
+ _xsavec(b.ptr(), m);
+ assert_eq!(a, b);
+ }
+
+ // FIXME: https://github.com/rust-lang/stdarch/issues/209
+ /*
+ #[simd_test(enable = "xsave,xsaves")]
+ unsafe fn xsaves() {
+ let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+ let mut a = XsaveArea::new();
+ let mut b = XsaveArea::new();
+
+ _xsaves(a.ptr(), m);
+ _xrstors(a.ptr(), m);
+ _xsaves(b.ptr(), m);
+ assert_eq!(a, b);
+ }
+ */
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/abm.rs b/library/stdarch/crates/core_arch/src/x86_64/abm.rs
new file mode 100644
index 000000000..988074d67
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/abm.rs
@@ -0,0 +1,62 @@
+//! Advanced Bit Manipulation (ABM) instructions
+//!
+//! The POPCNT and LZCNT instructions have their own CPUID bits to indicate support.
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Counts the leading most significant zero bits.
+///
+/// When the operand is zero, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_lzcnt_u64)
+#[inline]
+#[target_feature(enable = "lzcnt")]
+#[cfg_attr(test, assert_instr(lzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
+ x.leading_zeros() as u64
+}
+
+/// Counts the bits that are set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_popcnt64)
+#[inline]
+#[target_feature(enable = "popcnt")]
+#[cfg_attr(test, assert_instr(popcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _popcnt64(x: i64) -> i32 {
+ x.count_ones() as i32
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::arch::x86_64::*;
+
+ #[simd_test(enable = "lzcnt")]
+ unsafe fn test_lzcnt_u64() {
+ assert_eq!(_lzcnt_u64(0b0101_1010), 57);
+ }
+
+ #[simd_test(enable = "popcnt")]
+ unsafe fn test_popcnt64() {
+ assert_eq!(_popcnt64(0b0101_1010), 4);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/adx.rs b/library/stdarch/crates/core_arch/src/x86_64/adx.rs
new file mode 100644
index 000000000..a54d71136
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/adx.rs
@@ -0,0 +1,148 @@
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ #[link_name = "llvm.x86.addcarry.64"]
+ fn llvm_addcarry_u64(a: u8, b: u64, c: u64) -> (u8, u64);
+ #[link_name = "llvm.x86.addcarryx.u64"]
+ fn llvm_addcarryx_u64(a: u8, b: u64, c: u64, d: *mut u8) -> u8;
+ #[link_name = "llvm.x86.subborrow.64"]
+ fn llvm_subborrow_u64(a: u8, b: u64, c: u64) -> (u8, u64);
+}
+
+/// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in
+/// `c_in` (carry flag), stores the unsigned 64-bit result in `out`, and
+/// returns the carry-out (carry or overflow flag).
+#[inline]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
+ let (a, b) = llvm_addcarry_u64(c_in, a, b);
+ *out = b;
+ a
+}
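+
+// Usage sketch (illustrative; `add_u128` is a hypothetical helper): chaining
+// the carry to add two 128-bit values held as [low, high] pairs of u64 limbs.
+//
+// unsafe fn add_u128(a: [u64; 2], b: [u64; 2]) -> ([u64; 2], u8) {
+//     let (mut lo, mut hi) = (0, 0);
+//     let c = _addcarry_u64(0, a[0], b[0], &mut lo); // low limbs, no carry-in
+//     let c = _addcarry_u64(c, a[1], b[1], &mut hi); // propagate the carry
+//     ([lo, hi], c)
+// }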
+
+/// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in
+/// `c_in` (carry or overflow flag), stores the unsigned 64-bit result in
+/// `out`, and returns the carry-out (carry or overflow flag).
+#[inline]
+#[target_feature(enable = "adx")]
+#[cfg_attr(test, assert_instr(adc))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
+ llvm_addcarryx_u64(c_in, a, b, out as *mut _ as *mut u8)
+}
+
+/// Subtracts unsigned 64-bit integer `b` from `a` with unsigned 8-bit
+/// borrow-in `c_in` (carry or overflow flag), stores the unsigned 64-bit
+/// result in `out`, and returns the borrow-out (carry or overflow flag).
+#[inline]
+#[cfg_attr(test, assert_instr(sbb))]
+#[stable(feature = "simd_x86_adx", since = "1.33.0")]
+pub unsafe fn _subborrow_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
+ let (a, b) = llvm_subborrow_u64(c_in, a, b);
+ *out = b;
+ a
+}
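+
+// Usage sketch (illustrative; `sub_u128` is a hypothetical helper): chaining
+// the borrow to compute a 128-bit subtraction limb by limb.
+//
+// unsafe fn sub_u128(a: [u64; 2], b: [u64; 2]) -> ([u64; 2], u8) {
+//     let (mut lo, mut hi) = (0, 0);
+//     let c = _subborrow_u64(0, a[0], b[0], &mut lo); // low limbs first
+//     let c = _subborrow_u64(c, a[1], b[1], &mut hi); // propagate the borrow
+//     ([lo, hi], c)
+// }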
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86_64::*;
+
+ #[test]
+ fn test_addcarry_u64() {
+ unsafe {
+ let a = u64::MAX;
+ let mut out = 0;
+
+ let r = _addcarry_u64(0, a, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 0);
+
+ let r = _addcarry_u64(0, a, 0, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, a);
+
+ let r = _addcarry_u64(1, a, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 1);
+
+ let r = _addcarry_u64(1, a, 0, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 0);
+
+ let r = _addcarry_u64(0, 3, 4, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 7);
+
+ let r = _addcarry_u64(1, 3, 4, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 8);
+ }
+ }
+
+ #[simd_test(enable = "adx")]
+ unsafe fn test_addcarryx_u64() {
+ let a = u64::MAX;
+ let mut out = 0;
+
+ let r = _addcarry_u64(0, a, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 0);
+
+ let r = _addcarry_u64(0, a, 0, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, a);
+
+ let r = _addcarry_u64(1, a, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 1);
+
+ let r = _addcarry_u64(1, a, 0, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, 0);
+
+ let r = _addcarry_u64(0, 3, 4, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 7);
+
+ let r = _addcarry_u64(1, 3, 4, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 8);
+ }
+
+ #[test]
+ fn test_subborrow_u64() {
+ unsafe {
+ let a = u64::MAX;
+ let mut out = 0;
+
+ let r = _subborrow_u64(0, 0, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, a);
+
+ let r = _subborrow_u64(0, 0, 0, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 0);
+
+ let r = _subborrow_u64(1, 0, 1, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, a - 1);
+
+ let r = _subborrow_u64(1, 0, 0, &mut out);
+ assert_eq!(r, 1);
+ assert_eq!(out, a);
+
+ let r = _subborrow_u64(0, 7, 3, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 4);
+
+ let r = _subborrow_u64(1, 7, 3, &mut out);
+ assert_eq!(r, 0);
+ assert_eq!(out, 3);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx.rs b/library/stdarch/crates/core_arch/src/x86_64/avx.rs
new file mode 100644
index 000000000..7ba26371c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx.rs
@@ -0,0 +1,48 @@
+//! Advanced Vector Extensions (AVX)
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wiki] provides a quick overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+
+use crate::{
+ core_arch::{simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+/// Copies `a` to the result, and inserts the 64-bit integer `i` into the
+/// result at the location specified by `INDEX`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi64)
+#[inline]
+#[rustc_legacy_const_generics(2)]
+#[target_feature(enable = "avx")]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_insert_epi64<const INDEX: i32>(a: __m256i, i: i64) -> __m256i {
+ static_assert_imm2!(INDEX);
+ transmute(simd_insert(a.as_i64x4(), INDEX as u32, i))
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_insert_epi64() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let r = _mm256_insert_epi64::<3>(a, 0);
+ let e = _mm256_setr_epi64x(1, 2, 3, 0);
+ assert_eq_m256i(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx2.rs b/library/stdarch/crates/core_arch/src/x86_64/avx2.rs
new file mode 100644
index 000000000..14447a137
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx2.rs
@@ -0,0 +1,47 @@
+//! Advanced Vector Extensions 2 (AVX2)
+//!
+//! AVX2 expands most AVX commands to 256-bit wide vector registers and
+//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
+//! overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
+
+use crate::core_arch::{simd_llvm::*, x86::*};
+
+/// Extracts a 64-bit integer from `a`, selected with `INDEX`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[rustc_legacy_const_generics(1)]
+// This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi64<const INDEX: i32>(a: __m256i) -> i64 {
+ static_assert_imm2!(INDEX);
+ simd_extract(a.as_i64x4(), INDEX as u32)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arch::x86_64::*;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_extract_epi64() {
+ let a = _mm256_setr_epi64x(0, 1, 2, 3);
+ let r = _mm256_extract_epi64::<3>(a);
+ assert_eq!(r, 3);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
new file mode 100644
index 000000000..5eed0502c
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
@@ -0,0 +1,12346 @@
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*, x86_64::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_i64&expand=1792)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si))]
+pub unsafe fn _mm_cvtsd_i64(a: __m128d) -> i64 {
+ _mm_cvtsd_si64(a)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_i64&expand=1894)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si))]
+pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 {
+ _mm_cvtss_si64(a)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_u64&expand=1902)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi))]
+pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 {
+ transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_u64&expand=1800)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi))]
+pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 {
+ transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti64_ss&expand=1643)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss))]
+pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 {
+ let b = b as f32;
+ let r = simd_insert(a, 0, b);
+ transmute(r)
+}
+
+/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti64_sd&expand=1644)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2sd))]
+pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d {
+ let b = b as f64;
+ let r = simd_insert(a, 0, b);
+ transmute(r)
+}
+
+/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_ss&expand=2035)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2ss))]
+pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 {
+ let b = b as f32;
+ let r = simd_insert(a, 0, b);
+ transmute(r)
+}
+
+/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sd&expand=2034)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2sd))]
+pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
+ let b = b as f64;
+ let r = simd_insert(a, 0, b);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_i64&expand=2016)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si))]
+pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 {
+ transmute(vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_u64&expand=2021)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi))]
+pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 {
+ transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_i64&expand=2023)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si))]
+pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 {
+ transmute(vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_u64&expand=2027)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi))]
+pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 {
+ transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sd&expand=1313)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundi64_sd<const ROUNDING: i32>(a: __m128d, b: i64) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtsi2sd64(a, b, ROUNDING);
+ transmute(r)
+}
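+
+// Usage sketch (illustrative): pass an explicit rounding mode instead of
+// relying on the current MXCSR.RC setting, e.g.
+//
+// let r = _mm_cvt_roundi64_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 42);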
+
+/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_sd&expand=1367)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundsi64_sd<const ROUNDING: i32>(a: __m128d, b: i64) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtsi2sd64(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_ss&expand=1314)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundi64_ss<const ROUNDING: i32>(a: __m128, b: i64) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtsi2ss64(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_sd&expand=1379)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2sd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundu64_sd<const ROUNDING: i32>(a: __m128d, b: u64) -> __m128d {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtusi2sd64(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_ss&expand=1368)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundsi64_ss<const ROUNDING: i32>(a: __m128, b: i64) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtsi2ss64(a, b, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_ss&expand=1380)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_cvt_roundu64_ss<const ROUNDING: i32>(a: __m128, b: u64) -> __m128 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtusi2ss64(a, b, ROUNDING);
+ transmute(r)
+}
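+
+// A minimal usage sketch for the int-to-float conversions above (hypothetical
+// helper name, assuming an AVX512F-capable CPU): the const parameter picks the
+// rounding mode per call.
+//
+//     unsafe fn round_u64_to_f32_down(x: u64) -> f32 {
+//         // round toward negative infinity, suppressing exceptions
+//         let v = _mm_cvt_roundu64_ss::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(
+//             _mm_setzero_ps(),
+//             x,
+//         );
+//         _mm_cvtss_f32(v) // lane 0 holds the converted value
+//     }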
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_si64&expand=1360)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundsd_si64<const ROUNDING: i32>(a: __m128d) -> i64 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtsd2si64(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_i64&expand=1358)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundsd_i64<const ROUNDING: i32>(a: __m128d) -> i64 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtsd2si64(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_u64&expand=1365)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundsd_u64<const ROUNDING: i32>(a: __m128d) -> u64 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f64x2();
+ let r = vcvtsd2usi64(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_si64&expand=1375)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundss_si64<const ROUNDING: i32>(a: __m128) -> i64 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtss2si64(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_i64&expand=1370)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundss_i64<const ROUNDING: i32>(a: __m128) -> i64 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtss2si64(a, ROUNDING);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_u64&expand=1377)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvt_roundss_u64<const ROUNDING: i32>(a: __m128) -> u64 {
+ static_assert_rounding!(ROUNDING);
+ let a = a.as_f32x4();
+ let r = vcvtss2usi64(a, ROUNDING);
+ transmute(r)
+}
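+
+// A minimal usage sketch for the float-to-int conversions above (hypothetical
+// values): the ROUNDING constant selects the direction per call, independent of
+// the current MXCSR state.
+//
+//     let x = _mm_set_sd(2.5);
+//     let down = _mm_cvt_roundsd_si64::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(x); // 2
+//     let up = _mm_cvt_roundsd_si64::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(x); // 3
+//     let near = _mm_cvt_roundsd_si64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(x); // 2, ties to even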
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si64&expand=1931)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundsd_si64<const SAE: i32>(a: __m128d) -> i64 {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let r = vcvttsd2si64(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i64&expand=1929)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundsd_i64<const SAE: i32>(a: __m128d) -> i64 {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let r = vcvttsd2si64(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_u64&expand=1933)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundsd_u64<const SAE: i32>(a: __m128d) -> u64 {
+ static_assert_sae!(SAE);
+ let a = a.as_f64x2();
+ let r = vcvttsd2usi64(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_i64&expand=1935)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundss_i64<const SAE: i32>(a: __m128) -> i64 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let r = vcvttss2si64(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_si64&expand=1937)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundss_si64<const SAE: i32>(a: __m128) -> i64 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let r = vcvttss2si64(a, SAE);
+ transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_u64&expand=1939)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtt_roundss_u64<const SAE: i32>(a: __m128) -> u64 {
+ static_assert_sae!(SAE);
+ let a = a.as_f32x4();
+ let r = vcvttss2usi64(a, SAE);
+ transmute(r)
+}
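+
+// A minimal usage sketch for the cvtt_ conversions above (hypothetical value):
+// these always truncate toward zero; the SAE constant only controls whether
+// floating-point exceptions are signalled.
+//
+//     let x = _mm_set_ss(-1.7);
+//     let t = _mm_cvtt_roundss_si64::<_MM_FROUND_NO_EXC>(x); // -1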
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx512.vcvtss2si64"]
+ fn vcvtss2si64(a: f32x4, rounding: i32) -> i64;
+ #[link_name = "llvm.x86.avx512.vcvtss2usi64"]
+ fn vcvtss2usi64(a: f32x4, rounding: i32) -> u64;
+ #[link_name = "llvm.x86.avx512.vcvtsd2si64"]
+ fn vcvtsd2si64(a: f64x2, rounding: i32) -> i64;
+ #[link_name = "llvm.x86.avx512.vcvtsd2usi64"]
+ fn vcvtsd2usi64(a: f64x2, rounding: i32) -> u64;
+
+ #[link_name = "llvm.x86.avx512.cvttss2si64"]
+ fn vcvttss2si64(a: f32x4, rounding: i32) -> i64;
+ #[link_name = "llvm.x86.avx512.cvttss2usi64"]
+ fn vcvttss2usi64(a: f32x4, rounding: i32) -> u64;
+ #[link_name = "llvm.x86.avx512.cvttsd2si64"]
+ fn vcvttsd2si64(a: f64x2, rounding: i32) -> i64;
+ #[link_name = "llvm.x86.avx512.cvttsd2usi64"]
+ fn vcvttsd2usi64(a: f64x2, rounding: i32) -> u64;
+
+ #[link_name = "llvm.x86.avx512.cvtsi2ss64"]
+ fn vcvtsi2ss64(a: f32x4, b: i64, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.cvtsi2sd64"]
+ fn vcvtsi2sd64(a: f64x2, b: i64, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.cvtusi642ss"]
+ fn vcvtusi2ss64(a: f32x4, b: u64, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.cvtusi642sd"]
+ fn vcvtusi2sd64(a: f64x2, b: u64, rounding: i32) -> f64x2;
+}
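+
+// Note: the trailing `rounding`/`sae` argument of these LLVM intrinsics takes the
+// _MM_FROUND_* encoding directly. _MM_FROUND_CUR_DIRECTION (4) defers to MXCSR.RC,
+// while any value including _MM_FROUND_NO_EXC (8..=11) requests SAE, with bits 1:0
+// selecting the embedded rounding mode where the instruction provides one; hence
+// the `ROUNDING = 8` / `SAE = 8` pins in the assert_instr attributes above.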
+
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+ use crate::core_arch::x86_64::*;
+ use crate::hint::black_box;
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_abs_epi64() {
+ let a = _mm512_set_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
+ let r = _mm512_abs_epi64(a);
+ let e = _mm512_set_epi64(0, 1, 1, i64::MAX, i64::MAX.wrapping_add(1), 100, 100, 32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_abs_epi64() {
+ let a = _mm512_set_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
+ let r = _mm512_mask_abs_epi64(a, 0, a);
+ assert_eq_m512i(r, a);
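+ // Mask bit i (least significant first) selects element i in memory order,
+ // so 0b11111111 updates all eight lanes.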
+ let r = _mm512_mask_abs_epi64(a, 0b11111111, a);
+ let e = _mm512_set_epi64(0, 1, 1, i64::MAX, i64::MIN, 100, 100, 32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_abs_epi64() {
+ let a = _mm512_set_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
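+ // maskz_ zeroes every lane whose mask bit is clear, so a mask of 0 yields all zeros.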
+ let r = _mm512_maskz_abs_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_abs_epi64(0b11111111, a);
+ let e = _mm512_set_epi64(0, 1, 1, i64::MAX, i64::MIN, 100, 100, 32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_abs_epi64() {
+ let a = _mm256_set_epi64x(i64::MAX, i64::MIN, 100, -100);
+ let r = _mm256_abs_epi64(a);
+ let e = _mm256_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1), 100, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_abs_epi64() {
+ let a = _mm256_set_epi64x(i64::MAX, i64::MIN, 100, -100);
+ let r = _mm256_mask_abs_epi64(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_abs_epi64(a, 0b00001111, a);
+ let e = _mm256_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1), 100, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_abs_epi64() {
+ let a = _mm256_set_epi64x(i64::MAX, i64::MIN, 100, -100);
+ let r = _mm256_maskz_abs_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_abs_epi64(0b00001111, a);
+ let e = _mm256_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1), 100, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_abs_pd() {
+ let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.);
+ let r = _mm512_abs_pd(a);
+ let e = _mm512_setr_pd(0., 1., 1., f64::MAX, f64::MAX, 100., 100., 32.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_abs_pd() {
+ let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.);
+ let r = _mm512_mask_abs_pd(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_abs_pd(a, 0b00001111, a);
+ let e = _mm512_setr_pd(0., 1., 1., f64::MAX, f64::MIN, 100., -100., -32.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mov_epi64() {
+ let src = _mm512_set1_epi64(1);
+ let a = _mm512_set1_epi64(2);
+ let r = _mm512_mask_mov_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_mov_epi64(src, 0b11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mov_epi64() {
+ let a = _mm512_set1_epi64(2);
+ let r = _mm512_maskz_mov_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mov_epi64(0b11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mov_epi64() {
+ let src = _mm256_set1_epi64x(1);
+ let a = _mm256_set1_epi64x(2);
+ let r = _mm256_mask_mov_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_mov_epi64(src, 0b00001111, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mov_epi64() {
+ let a = _mm256_set1_epi64x(2);
+ let r = _mm256_maskz_mov_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mov_epi64(0b00001111, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mov_epi64() {
+ let src = _mm_set1_epi64x(1);
+ let a = _mm_set1_epi64x(2);
+ let r = _mm_mask_mov_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_mov_epi64(src, 0b00000011, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mov_epi64() {
+ let a = _mm_set1_epi64x(2);
+ let r = _mm_maskz_mov_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mov_epi64(0b00000011, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mov_pd() {
+ let src = _mm512_set1_pd(1.);
+ let a = _mm512_set1_pd(2.);
+ let r = _mm512_mask_mov_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_mov_pd(src, 0b11111111, a);
+ assert_eq_m512d(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mov_pd() {
+ let a = _mm512_set1_pd(2.);
+ let r = _mm512_maskz_mov_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_mov_pd(0b11111111, a);
+ assert_eq_m512d(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mov_pd() {
+ let src = _mm256_set1_pd(1.);
+ let a = _mm256_set1_pd(2.);
+ let r = _mm256_mask_mov_pd(src, 0, a);
+ assert_eq_m256d(r, src);
+ let r = _mm256_mask_mov_pd(src, 0b00001111, a);
+ assert_eq_m256d(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mov_pd() {
+ let a = _mm256_set1_pd(2.);
+ let r = _mm256_maskz_mov_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_mov_pd(0b00001111, a);
+ assert_eq_m256d(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mov_pd() {
+ let src = _mm_set1_pd(1.);
+ let a = _mm_set1_pd(2.);
+ let r = _mm_mask_mov_pd(src, 0, a);
+ assert_eq_m128d(r, src);
+ let r = _mm_mask_mov_pd(src, 0b00000011, a);
+ assert_eq_m128d(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mov_pd() {
+ let a = _mm_set1_pd(2.);
+ let r = _mm_maskz_mov_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_mov_pd(0b00000011, a);
+ assert_eq_m128d(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_add_epi64() {
+ let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
+ let b = _mm512_set1_epi64(1);
+ let r = _mm512_add_epi64(a, b);
+ let e = _mm512_setr_epi64(1, 2, 0, i64::MIN, i64::MIN + 1, 101, -99, -31);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_add_epi64() {
+ let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
+ let b = _mm512_set1_epi64(1);
+ let r = _mm512_mask_add_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_add_epi64(a, 0b00001111, a, b);
+ let e = _mm512_setr_epi64(1, 2, 0, i64::MIN, i64::MIN, 100, -100, -32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_add_epi64() {
+ let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
+ let b = _mm512_set1_epi64(1);
+ let r = _mm512_maskz_add_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_add_epi64(0b00001111, a, b);
+ let e = _mm512_setr_epi64(1, 2, 0, i64::MIN, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_add_epi64() {
+ let a = _mm256_set_epi64x(1, -1, i64::MAX, i64::MIN);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_mask_add_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_add_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(2, 0, i64::MIN, i64::MIN + 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_add_epi64() {
+ let a = _mm256_set_epi64x(1, -1, i64::MAX, i64::MIN);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_maskz_add_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_add_epi64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(2, 0, i64::MIN, i64::MIN + 1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_add_epi64() {
+ let a = _mm_set_epi64x(i64::MAX, i64::MIN);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_mask_add_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_add_epi64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(i64::MIN, i64::MIN + 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_add_epi64() {
+ let a = _mm_set_epi64x(i64::MAX, i64::MIN);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_maskz_add_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_add_epi64(0b00000011, a, b);
+ let e = _mm_set_epi64x(i64::MIN, i64::MIN + 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_add_pd() {
+ let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.);
+ let b = _mm512_set1_pd(1.);
+ let r = _mm512_add_pd(a, b);
+ let e = _mm512_setr_pd(1., 2., 0., f64::MAX, f64::MIN + 1., 101., -99., -31.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_add_pd() {
+ let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.);
+ let b = _mm512_set1_pd(1.);
+ let r = _mm512_mask_add_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_add_pd(a, 0b00001111, a, b);
+ let e = _mm512_setr_pd(1., 2., 0., f64::MAX, f64::MIN, 100., -100., -32.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_add_pd() {
+ let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.);
+ let b = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_add_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_add_pd(0b00001111, a, b);
+ let e = _mm512_setr_pd(1., 2., 0., f64::MAX, 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_add_pd() {
+ let a = _mm256_set_pd(1., -1., f64::MAX, f64::MIN);
+ let b = _mm256_set1_pd(1.);
+ let r = _mm256_mask_add_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_add_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(2., 0., f64::MAX, f64::MIN + 1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_add_pd() {
+ let a = _mm256_set_pd(1., -1., f64::MAX, f64::MIN);
+ let b = _mm256_set1_pd(1.);
+ let r = _mm256_maskz_add_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_add_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(2., 0., f64::MAX, f64::MIN + 1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_add_pd() {
+ let a = _mm_set_pd(f64::MAX, f64::MIN);
+ let b = _mm_set1_pd(1.);
+ let r = _mm_mask_add_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_add_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(f64::MAX, f64::MIN + 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_add_pd() {
+ let a = _mm_set_pd(f64::MAX, f64::MIN);
+ let b = _mm_set1_pd(1.);
+ let r = _mm_maskz_add_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_add_pd(0b00000011, a, b);
+ let e = _mm_set_pd(f64::MAX, f64::MIN + 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sub_epi64() {
+ let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
+ let b = _mm512_set1_epi64(1);
+ let r = _mm512_sub_epi64(a, b);
+ let e = _mm512_setr_epi64(-1, 0, -2, i64::MAX - 1, i64::MAX, 99, -101, -33);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sub_epi64() {
+ let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
+ let b = _mm512_set1_epi64(1);
+ let r = _mm512_mask_sub_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sub_epi64(a, 0b00001111, a, b);
+ let e = _mm512_setr_epi64(-1, 0, -2, i64::MAX - 1, i64::MIN, 100, -100, -32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sub_epi64() {
+ let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
+ let b = _mm512_set1_epi64(1);
+ let r = _mm512_maskz_sub_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sub_epi64(0b00001111, a, b);
+ let e = _mm512_setr_epi64(-1, 0, -2, i64::MAX - 1, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sub_epi64() {
+ let a = _mm256_set_epi64x(1, -1, i64::MAX, i64::MIN);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_mask_sub_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sub_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(0, -2, i64::MAX - 1, i64::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sub_epi64() {
+ let a = _mm256_set_epi64x(1, -1, i64::MAX, i64::MIN);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_maskz_sub_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sub_epi64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(0, -2, i64::MAX - 1, i64::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sub_epi64() {
+ let a = _mm_set_epi64x(i64::MAX, i64::MIN);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_mask_sub_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sub_epi64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(i64::MAX - 1, i64::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sub_epi64() {
+ let a = _mm_set_epi64x(i64::MAX, i64::MIN);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_maskz_sub_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sub_epi64(0b00000011, a, b);
+ let e = _mm_set_epi64x(i64::MAX - 1, i64::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sub_pd() {
+ let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.);
+ let b = _mm512_set1_pd(1.);
+ let r = _mm512_sub_pd(a, b);
+ let e = _mm512_setr_pd(-1., 0., -2., f64::MAX - 1., f64::MIN, 99., -101., -33.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sub_pd() {
+ let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.);
+ let b = _mm512_set1_pd(1.);
+ let r = _mm512_mask_sub_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_sub_pd(a, 0b00001111, a, b);
+ let e = _mm512_setr_pd(-1., 0., -2., f64::MAX - 1., f64::MIN, 100., -100., -32.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sub_pd() {
+ let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.);
+ let b = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_sub_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_sub_pd(0b00001111, a, b);
+ let e = _mm512_setr_pd(-1., 0., -2., f64::MAX - 1., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sub_pd() {
+ let a = _mm256_set_pd(1., -1., f64::MAX, f64::MIN);
+ let b = _mm256_set1_pd(1.);
+ let r = _mm256_mask_sub_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_sub_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(0., -2., f64::MAX - 1., f64::MIN);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sub_pd() {
+ let a = _mm256_set_pd(1., -1., f64::MAX, f64::MIN);
+ let b = _mm256_set1_pd(1.);
+ let r = _mm256_maskz_sub_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_sub_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(0., -2., f64::MAX - 1., f64::MIN);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sub_pd() {
+ let a = _mm_set_pd(f64::MAX, f64::MIN);
+ let b = _mm_set1_pd(1.);
+ let r = _mm_mask_sub_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_sub_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(f64::MAX - 1., f64::MIN);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sub_pd() {
+ let a = _mm_set_pd(f64::MAX, f64::MIN);
+ let b = _mm_set1_pd(1.);
+ let r = _mm_maskz_sub_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_sub_pd(0b00000011, a, b);
+ let e = _mm_set_pd(f64::MAX - 1., f64::MIN);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mul_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_mul_epi32(a, b);
+ let e = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mul_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_mask_mul_epi32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_mul_epi32(a, 0b00001111, a, b);
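+ // Unselected upper lanes keep `a` unchanged; each 64-bit lane of
+ // _mm512_set1_epi32(1) holds two packed 1s, i.e. 1 | 1 << 32.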
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 | 1 << 32, 1 | 1 << 32, 1 | 1 << 32, 1 | 1 << 32,
+ 7, 5, 3, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mul_epi32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_maskz_mul_epi32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mul_epi32(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 7, 5, 3, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mul_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_mask_mul_epi32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_mul_epi32(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(2, 4, 6, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mul_epi32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_maskz_mul_epi32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mul_epi32(0b00001111, a, b);
+ let e = _mm256_set_epi64x(2, 4, 6, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mul_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set_epi32(1, 2, 3, 4);
+ let r = _mm_mask_mul_epi32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_mul_epi32(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(2, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mul_epi32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set_epi32(1, 2, 3, 4);
+ let r = _mm_maskz_mul_epi32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mul_epi32(0b00000011, a, b);
+ let e = _mm_set_epi64x(2, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mul_epu32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_mul_epu32(a, b);
+ let e = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mul_epu32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_mask_mul_epu32(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_mul_epu32(a, 0b00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 | 1 << 32, 1 | 1 << 32, 1 | 1 << 32, 1 | 1 << 32,
+ 7, 5, 3, 1,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mul_epu32() {
+ let a = _mm512_set1_epi32(1);
+ let b = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ let r = _mm512_maskz_mul_epu32(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mul_epu32(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 7, 5, 3, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mul_epu32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_mask_mul_epu32(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_mul_epu32(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(2, 4, 6, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mul_epu32() {
+ let a = _mm256_set1_epi32(1);
+ let b = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_maskz_mul_epu32(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_mul_epu32(0b00001111, a, b);
+ let e = _mm256_set_epi64x(2, 4, 6, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mul_epu32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set_epi32(1, 2, 3, 4);
+ let r = _mm_mask_mul_epu32(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_mul_epu32(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(2, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mul_epu32() {
+ let a = _mm_set1_epi32(1);
+ let b = _mm_set_epi32(1, 2, 3, 4);
+ let r = _mm_maskz_mul_epu32(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_mul_epu32(0b00000011, a, b);
+ let e = _mm_set_epi64x(2, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mullox_epi64() {
+ let a = _mm512_setr_epi64(0, 1, i64::MAX, i64::MIN, i64::MAX, 100, -100, -32);
+ let b = _mm512_set1_epi64(2);
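+ // mullox keeps the low 64 bits of each product, so i64::MAX * 2 wraps to -2
+ // and i64::MIN * 2 wraps to 0.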
+ let r = _mm512_mullox_epi64(a, b);
+ let e = _mm512_setr_epi64(0, 2, -2, 0, -2, 200, -200, -64);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mullox_epi64() {
+ let a = _mm512_setr_epi64(0, 1, i64::MAX, i64::MIN, i64::MAX, 100, -100, -32);
+ let b = _mm512_set1_epi64(2);
+ let r = _mm512_mask_mullox_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_mullox_epi64(a, 0b00001111, a, b);
+ let e = _mm512_setr_epi64(0, 2, -2, 0, i64::MAX, 100, -100, -32);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mul_pd() {
+ let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.);
+ let b = _mm512_set1_pd(2.);
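+ // Doubling f64::MAX / f64::MIN overflows the finite range and rounds to
+ // positive / negative infinity under the default rounding mode.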
+ let r = _mm512_mul_pd(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_pd(
+ 0., 2., f64::INFINITY, f64::NEG_INFINITY,
+ f64::INFINITY, f64::NEG_INFINITY, -200., -64.,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mul_pd() {
+ let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.);
+ let b = _mm512_set1_pd(2.);
+ let r = _mm512_mask_mul_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_mul_pd(a, 0b00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_pd(
+ 0., 2., f64::INFINITY, f64::NEG_INFINITY,
+ f64::MAX, f64::MIN, -100., -32.,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mul_pd() {
+ let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.);
+ let b = _mm512_set1_pd(2.);
+ let r = _mm512_maskz_mul_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_mul_pd(0b00001111, a, b);
+ let e = _mm512_setr_pd(0., 2., f64::INFINITY, f64::NEG_INFINITY, 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_mul_pd() {
+ let a = _mm256_set_pd(0., 1., f64::MAX, f64::MIN);
+ let b = _mm256_set1_pd(2.);
+ let r = _mm256_mask_mul_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_mul_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(0., 2., f64::INFINITY, f64::NEG_INFINITY);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_mul_pd() {
+ let a = _mm256_set_pd(0., 1., f64::MAX, f64::MIN);
+ let b = _mm256_set1_pd(2.);
+ let r = _mm256_maskz_mul_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_mul_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(0., 2., f64::INFINITY, f64::NEG_INFINITY);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_mul_pd() {
+ let a = _mm_set_pd(f64::MAX, f64::MIN);
+ let b = _mm_set1_pd(2.);
+ let r = _mm_mask_mul_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_mul_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(f64::INFINITY, f64::NEG_INFINITY);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_mul_pd() {
+ let a = _mm_set_pd(f64::MAX, f64::MIN);
+ let b = _mm_set1_pd(2.);
+ let r = _mm_maskz_mul_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_mul_pd(0b00000011, a, b);
+ let e = _mm_set_pd(f64::INFINITY, f64::NEG_INFINITY);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_div_pd() {
+ let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.);
+ let b = _mm512_setr_pd(2., 2., 0., 0., 0., 0., 2., 2.);
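+ // IEEE 754: a finite non-zero value divided by ±0.0 yields a correspondingly
+ // signed infinity.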
+ let r = _mm512_div_pd(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_pd(
+ 0., 0.5, f64::INFINITY, f64::NEG_INFINITY,
+ f64::INFINITY, f64::NEG_INFINITY, -50., -16.,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_div_pd() {
+ let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.);
+ let b = _mm512_setr_pd(2., 2., 0., 0., 0., 0., 2., 2.);
+ let r = _mm512_mask_div_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_div_pd(a, 0b00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_setr_pd(
+ 0., 0.5, f64::INFINITY, f64::NEG_INFINITY,
+ f64::MAX, f64::MIN, -100., -32.,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_div_pd() {
+ let a = _mm512_setr_pd(0., 1., f64::MAX, f64::MIN, f64::MAX, f64::MIN, -100., -32.);
+ let b = _mm512_setr_pd(2., 2., 0., 0., 0., 0., 2., 2.);
+ let r = _mm512_maskz_div_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_div_pd(0b00001111, a, b);
+ let e = _mm512_setr_pd(0., 0.5, f64::INFINITY, f64::NEG_INFINITY, 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_div_pd() {
+ let a = _mm256_set_pd(0., 1., f64::MAX, f64::MIN);
+ let b = _mm256_set_pd(2., 2., 0., 0.);
+ let r = _mm256_mask_div_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_div_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(0., 0.5, f64::INFINITY, f64::NEG_INFINITY);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_div_pd() {
+ let a = _mm256_set_pd(0., 1., f64::MAX, f64::MIN);
+ let b = _mm256_set_pd(2., 2., 0., 0.);
+ let r = _mm256_maskz_div_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_div_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(0., 0.5, f64::INFINITY, f64::NEG_INFINITY);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_div_pd() {
+ let a = _mm_set_pd(f64::MAX, f64::MIN);
+ let b = _mm_set_pd(0., 0.);
+ let r = _mm_mask_div_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_div_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(f64::INFINITY, f64::NEG_INFINITY);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_div_pd() {
+ let a = _mm_set_pd(f64::MAX, f64::MIN);
+ let b = _mm_set_pd(0., 0.);
+ let r = _mm_maskz_div_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_div_pd(0b00000011, a, b);
+ let e = _mm_set_pd(f64::INFINITY, f64::NEG_INFINITY);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_max_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_max_epi64(a, b);
+ let e = _mm512_setr_epi64(7, 6, 5, 4, 4, 5, 6, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_max_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_max_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_max_epi64(a, 0b00001111, a, b);
+ let e = _mm512_setr_epi64(7, 6, 5, 4, 4, 5, 6, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_max_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_max_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_max_epi64(0b00001111, a, b);
+ let e = _mm512_setr_epi64(7, 6, 5, 4, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_max_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_max_epi64(a, b);
+ let e = _mm256_set_epi64x(3, 2, 2, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_max_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_mask_max_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_max_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(3, 2, 2, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_max_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_maskz_max_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_max_epi64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(3, 2, 2, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_max_epi64() {
+ let a = _mm_set_epi64x(2, 3);
+ let b = _mm_set_epi64x(3, 2);
+ let r = _mm_max_epi64(a, b);
+ let e = _mm_set_epi64x(3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_max_epi64() {
+ let a = _mm_set_epi64x(2, 3);
+ let b = _mm_set_epi64x(3, 2);
+ let r = _mm_mask_max_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_max_epi64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_max_epi64() {
+ let a = _mm_set_epi64x(2, 3);
+ let b = _mm_set_epi64x(3, 2);
+ let r = _mm_maskz_max_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_max_epi64(0b00000011, a, b);
+ let e = _mm_set_epi64x(3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_max_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_max_pd(a, b);
+ let e = _mm512_setr_pd(7., 6., 5., 4., 4., 5., 6., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_max_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_mask_max_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_max_pd(a, 0b00001111, a, b);
+ let e = _mm512_setr_pd(7., 6., 5., 4., 4., 5., 6., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_max_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_maskz_max_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_max_pd(0b00001111, a, b);
+ let e = _mm512_setr_pd(7., 6., 5., 4., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_max_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let b = _mm256_set_pd(3., 2., 1., 0.);
+ let r = _mm256_mask_max_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_max_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(3., 2., 2., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_max_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let b = _mm256_set_pd(3., 2., 1., 0.);
+ let r = _mm256_maskz_max_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_max_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(3., 2., 2., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_max_pd() {
+ let a = _mm_set_pd(2., 3.);
+ let b = _mm_set_pd(3., 2.);
+ let r = _mm_mask_max_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_max_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(3., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_max_pd() {
+ let a = _mm_set_pd(2., 3.);
+ let b = _mm_set_pd(3., 2.);
+ let r = _mm_maskz_max_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_max_pd(0b00000011, a, b);
+ let e = _mm_set_pd(3., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_max_epu64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_max_epu64(a, b);
+ let e = _mm512_setr_epi64(7, 6, 5, 4, 4, 5, 6, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_max_epu64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_max_epu64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_max_epu64(a, 0b00001111, a, b);
+ let e = _mm512_setr_epi64(7, 6, 5, 4, 4, 5, 6, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_max_epu64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_max_epu64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_max_epu64(0b00001111, a, b);
+ let e = _mm512_setr_epi64(7, 6, 5, 4, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_max_epu64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_max_epu64(a, b);
+ let e = _mm256_set_epi64x(3, 2, 2, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_max_epu64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_mask_max_epu64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_max_epu64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(3, 2, 2, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_max_epu64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_maskz_max_epu64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_max_epu64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(3, 2, 2, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_max_epu64() {
+ let a = _mm_set_epi64x(2, 3);
+ let b = _mm_set_epi64x(3, 2);
+ let r = _mm_max_epu64(a, b);
+ let e = _mm_set_epi64x(3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_max_epu64() {
+ let a = _mm_set_epi64x(2, 3);
+ let b = _mm_set_epi64x(3, 2);
+ let r = _mm_mask_max_epu64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_max_epu64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_max_epu64() {
+ let a = _mm_set_epi64x(2, 3);
+ let b = _mm_set_epi64x(3, 2);
+ let r = _mm_maskz_max_epu64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_max_epu64(0b00000011, a, b);
+ let e = _mm_set_epi64x(3, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_min_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_min_epi64(a, b);
+ let e = _mm512_setr_epi64(0, 1, 2, 3, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_min_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_min_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_min_epi64(a, 0b00001111, a, b);
+ let e = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_min_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_min_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_min_epi64(0b00001111, a, b);
+ let e = _mm512_setr_epi64(0, 1, 2, 3, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_min_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_min_epi64(a, b);
+ let e = _mm256_set_epi64x(0, 1, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_min_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_mask_min_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_min_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(0, 1, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_min_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_maskz_min_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_min_epi64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(0, 1, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_min_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_min_pd(a, b);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 3., 2., 1., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_min_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_mask_min_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_min_pd(a, 0b00001111, a, b);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_min_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_maskz_min_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_min_pd(0b00001111, a, b);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_min_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let b = _mm256_set_pd(3., 2., 1., 0.);
+ let r = _mm256_mask_min_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_min_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(0., 1., 1., 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_min_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let b = _mm256_set_pd(3., 2., 1., 0.);
+ let r = _mm256_maskz_min_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_min_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(0., 1., 1., 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_min_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(1., 0.);
+ let r = _mm_mask_min_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_min_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_min_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(1., 0.);
+ let r = _mm_maskz_min_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_min_pd(0b00000011, a, b);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_min_epu64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_min_epu64(a, b);
+ let e = _mm512_setr_epi64(0, 1, 2, 3, 3, 2, 1, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_min_epu64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_mask_min_epu64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_min_epu64(a, 0b00001111, a, b);
+ let e = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_min_epu64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+ let r = _mm512_maskz_min_epu64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_min_epu64(0b00001111, a, b);
+ let e = _mm512_setr_epi64(0, 1, 2, 3, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_min_epu64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_min_epu64(a, b);
+ let e = _mm256_set_epi64x(0, 1, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_min_epu64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_mask_min_epu64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_min_epu64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(0, 1, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_min_epu64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_maskz_min_epu64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_min_epu64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(0, 1, 1, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_min_epu64() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set_epi64x(1, 0);
+ let r = _mm_min_epu64(a, b);
+ let e = _mm_set_epi64x(0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_min_epu64() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set_epi64x(1, 0);
+ let r = _mm_mask_min_epu64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_min_epu64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_min_epu64() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set_epi64x(1, 0);
+ let r = _mm_maskz_min_epu64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_min_epu64(0b00000011, a, b);
+ let e = _mm_set_epi64x(0, 0);
+ assert_eq_m128i(r, e);
+ }
+
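+ // The sqrt inputs below are perfect squares, so the expected results are
+ // exact and can be compared bit-for-bit.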
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sqrt_pd() {
+ let a = _mm512_setr_pd(0., 1., 4., 9., 16., 25., 36., 49.);
+ let r = _mm512_sqrt_pd(a);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sqrt_pd() {
+ let a = _mm512_setr_pd(0., 1., 4., 9., 16., 25., 36., 49.);
+ let r = _mm512_mask_sqrt_pd(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_sqrt_pd(a, 0b00001111, a);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 16., 25., 36., 49.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sqrt_pd() {
+ let a = _mm512_setr_pd(0., 1., 4., 9., 16., 25., 36., 49.);
+ let r = _mm512_maskz_sqrt_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_sqrt_pd(0b00001111, a);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sqrt_pd() {
+ let a = _mm256_set_pd(0., 1., 4., 9.);
+ let r = _mm256_mask_sqrt_pd(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_sqrt_pd(a, 0b00001111, a);
+ let e = _mm256_set_pd(0., 1., 2., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sqrt_pd() {
+ let a = _mm256_set_pd(0., 1., 4., 9.);
+ let r = _mm256_maskz_sqrt_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_sqrt_pd(0b00001111, a);
+ let e = _mm256_set_pd(0., 1., 2., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sqrt_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let r = _mm_mask_sqrt_pd(a, 0, a);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_sqrt_pd(a, 0b00000011, a);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sqrt_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let r = _mm_maskz_sqrt_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_sqrt_pd(0b00000011, a);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
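+ // fmadd computes the fused `a * b + c` per lane; with `a` and `c` all
+ // ones, the expected result is simply `b + 1`.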
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmadd_pd() {
+ let a = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.);
+ let r = _mm512_fmadd_pd(a, b, c);
+ let e = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmadd_pd() {
+ let a = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.);
+ let r = _mm512_mask_fmadd_pd(a, 0, b, c);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fmadd_pd(a, 0b00001111, b, c);
+ let e = _mm512_setr_pd(1., 2., 3., 4., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmadd_pd() {
+ let a = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.);
+ let r = _mm512_maskz_fmadd_pd(0, a, b, c);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fmadd_pd(0b00001111, a, b, c);
+ let e = _mm512_setr_pd(1., 2., 3., 4., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmadd_pd() {
+ let a = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.);
+ let r = _mm512_mask3_fmadd_pd(a, b, c, 0);
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fmadd_pd(a, b, c, 0b00001111);
+ let e = _mm512_setr_pd(1., 2., 3., 4., 2., 2., 2., 2.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fmadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask_fmadd_pd(a, 0, b, c);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_fmadd_pd(a, 0b00001111, b, c);
+ let e = _mm256_set_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fmadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_maskz_fmadd_pd(0, a, b, c);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_fmadd_pd(0b00001111, a, b, c);
+ let e = _mm256_set_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fmadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask3_fmadd_pd(a, b, c, 0);
+ assert_eq_m256d(r, c);
+ let r = _mm256_mask3_fmadd_pd(a, b, c, 0b00001111);
+ let e = _mm256_set_pd(1., 2., 3., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fmadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask_fmadd_pd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmadd_pd(a, 0b00000011, b, c);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fmadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_maskz_fmadd_pd(0, a, b, c);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_fmadd_pd(0b00000011, a, b, c);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fmadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask3_fmadd_pd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmadd_pd(a, b, c, 0b00000011);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
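+ // fmsub computes `a * b - c` per lane, giving `b - 1` for these inputs.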
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_fmsub_pd(a, b, c);
+ let e = _mm512_setr_pd(-1., 0., 1., 2., 3., 4., 5., 6.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask_fmsub_pd(a, 0, b, c);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fmsub_pd(a, 0b00001111, b, c);
+ let e = _mm512_setr_pd(-1., 0., 1., 2., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_fmsub_pd(0, a, b, c);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fmsub_pd(0b00001111, a, b, c);
+ let e = _mm512_setr_pd(-1., 0., 1., 2., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.);
+ let r = _mm512_mask3_fmsub_pd(a, b, c, 0);
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fmsub_pd(a, b, c, 0b00001111);
+ let e = _mm512_setr_pd(-1., 0., 1., 2., 2., 2., 2., 2.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fmsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask_fmsub_pd(a, 0, b, c);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_fmsub_pd(a, 0b00001111, b, c);
+ let e = _mm256_set_pd(-1., 0., 1., 2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fmsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_maskz_fmsub_pd(0, a, b, c);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_fmsub_pd(0b00001111, a, b, c);
+ let e = _mm256_set_pd(-1., 0., 1., 2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fmsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask3_fmsub_pd(a, b, c, 0);
+ assert_eq_m256d(r, c);
+ let r = _mm256_mask3_fmsub_pd(a, b, c, 0b00001111);
+ let e = _mm256_set_pd(-1., 0., 1., 2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fmsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask_fmsub_pd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmsub_pd(a, 0b00000011, b, c);
+ let e = _mm_set_pd(-1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fmsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_maskz_fmsub_pd(0, a, b, c);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_fmsub_pd(0b00000011, a, b, c);
+ let e = _mm_set_pd(-1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fmsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask3_fmsub_pd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmsub_pd(a, b, c, 0b00000011);
+ let e = _mm_set_pd(-1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
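+ // fmaddsub alternates per lane: even-indexed lanes get `a * b - c` and
+ // odd-indexed lanes get `a * b + c`.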
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmaddsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_fmaddsub_pd(a, b, c);
+ let e = _mm512_setr_pd(-1., 2., 1., 4., 3., 6., 5., 8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmaddsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask_fmaddsub_pd(a, 0, b, c);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fmaddsub_pd(a, 0b00001111, b, c);
+ let e = _mm512_setr_pd(-1., 2., 1., 4., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmaddsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_fmaddsub_pd(0, a, b, c);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fmaddsub_pd(0b00001111, a, b, c);
+ let e = _mm512_setr_pd(-1., 2., 1., 4., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmaddsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.);
+ let r = _mm512_mask3_fmaddsub_pd(a, b, c, 0);
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fmaddsub_pd(a, b, c, 0b00001111);
+ let e = _mm512_setr_pd(-1., 2., 1., 4., 2., 2., 2., 2.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fmaddsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask_fmaddsub_pd(a, 0, b, c);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_fmaddsub_pd(a, 0b00001111, b, c);
+ let e = _mm256_set_pd(1., 0., 3., 2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fmaddsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_maskz_fmaddsub_pd(0, a, b, c);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_fmaddsub_pd(0b00001111, a, b, c);
+ let e = _mm256_set_pd(1., 0., 3., 2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fmaddsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask3_fmaddsub_pd(a, b, c, 0);
+ assert_eq_m256d(r, c);
+ let r = _mm256_mask3_fmaddsub_pd(a, b, c, 0b00001111);
+ let e = _mm256_set_pd(1., 0., 3., 2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fmaddsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask_fmaddsub_pd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmaddsub_pd(a, 0b00000011, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fmaddsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_maskz_fmaddsub_pd(0, a, b, c);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_fmaddsub_pd(0b00000011, a, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fmaddsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask3_fmaddsub_pd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmaddsub_pd(a, b, c, 0b00000011);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
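+ // fmsubadd is the mirror image of fmaddsub: even-indexed lanes get
+ // `a * b + c` and odd-indexed lanes get `a * b - c`.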
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmsubadd_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_fmsubadd_pd(a, b, c);
+ let e = _mm512_setr_pd(1., 0., 3., 2., 5., 4., 7., 6.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmsubadd_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask_fmsubadd_pd(a, 0, b, c);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fmsubadd_pd(a, 0b00001111, b, c);
+ let e = _mm512_setr_pd(1., 0., 3., 2., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmsubadd_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_fmsubadd_pd(0, a, b, c);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fmsubadd_pd(0b00001111, a, b, c);
+ let e = _mm512_setr_pd(1., 0., 3., 2., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmsubadd_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.);
+ let r = _mm512_mask3_fmsubadd_pd(a, b, c, 0);
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fmsubadd_pd(a, b, c, 0b00001111);
+ let e = _mm512_setr_pd(1., 0., 3., 2., 2., 2., 2., 2.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fmsubadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask_fmsubadd_pd(a, 0, b, c);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_fmsubadd_pd(a, 0b00001111, b, c);
+ let e = _mm256_set_pd(-1., 2., 1., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fmsubadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_maskz_fmsubadd_pd(0, a, b, c);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_fmsubadd_pd(0b00001111, a, b, c);
+ let e = _mm256_set_pd(-1., 2., 1., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fmsubadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask3_fmsubadd_pd(a, b, c, 0);
+ assert_eq_m256d(r, c);
+ let r = _mm256_mask3_fmsubadd_pd(a, b, c, 0b00001111);
+ let e = _mm256_set_pd(-1., 2., 1., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fmsubadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask_fmsubadd_pd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmsubadd_pd(a, 0b00000011, b, c);
+ let e = _mm_set_pd(-1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fmsubadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_maskz_fmsubadd_pd(0, a, b, c);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_fmsubadd_pd(0b00000011, a, b, c);
+ let e = _mm_set_pd(-1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fmsubadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask3_fmsubadd_pd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmsubadd_pd(a, b, c, 0b00000011);
+ let e = _mm_set_pd(-1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
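+ // fnmadd computes `-(a * b) + c` per lane, giving `1 - b` for these inputs.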
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fnmadd_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_fnmadd_pd(a, b, c);
+ let e = _mm512_setr_pd(1., 0., -1., -2., -3., -4., -5., -6.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fnmadd_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask_fnmadd_pd(a, 0, b, c);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fnmadd_pd(a, 0b00001111, b, c);
+ let e = _mm512_setr_pd(1., 0., -1., -2., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fnmadd_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_fnmadd_pd(0, a, b, c);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fnmadd_pd(0b00001111, a, b, c);
+ let e = _mm512_setr_pd(1., 0., -1., -2., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fnmadd_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.);
+ let r = _mm512_mask3_fnmadd_pd(a, b, c, 0);
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fnmadd_pd(a, b, c, 0b00001111);
+ let e = _mm512_setr_pd(1., 0., -1., -2., 2., 2., 2., 2.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fnmadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask_fnmadd_pd(a, 0, b, c);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_fnmadd_pd(a, 0b00001111, b, c);
+ let e = _mm256_set_pd(1., 0., -1., -2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fnmadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_maskz_fnmadd_pd(0, a, b, c);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_fnmadd_pd(0b00001111, a, b, c);
+ let e = _mm256_set_pd(1., 0., -1., -2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fnmadd_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask3_fnmadd_pd(a, b, c, 0);
+ assert_eq_m256d(r, c);
+ let r = _mm256_mask3_fnmadd_pd(a, b, c, 0b00001111);
+ let e = _mm256_set_pd(1., 0., -1., -2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fnmadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask_fnmadd_pd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fnmadd_pd(a, 0b00000011, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fnmadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_maskz_fnmadd_pd(0, a, b, c);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_fnmadd_pd(0b00000011, a, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fnmadd_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask3_fnmadd_pd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fnmadd_pd(a, b, c, 0b00000011);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
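+ // fnmsub computes `-(a * b) - c` per lane, giving `-b - 1` for these inputs.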
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fnmsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_fnmsub_pd(a, b, c);
+ let e = _mm512_setr_pd(-1., -2., -3., -4., -5., -6., -7., -8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fnmsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask_fnmsub_pd(a, 0, b, c);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fnmsub_pd(a, 0b00001111, b, c);
+ let e = _mm512_setr_pd(-1., -2., -3., -4., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fnmsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_fnmsub_pd(0, a, b, c);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fnmsub_pd(0b00001111, a, b, c);
+ let e = _mm512_setr_pd(-1., -2., -3., -4., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fnmsub_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let c = _mm512_setr_pd(1., 1., 1., 1., 2., 2., 2., 2.);
+ let r = _mm512_mask3_fnmsub_pd(a, b, c, 0);
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fnmsub_pd(a, b, c, 0b00001111);
+ let e = _mm512_setr_pd(-1., -2., -3., -4., 2., 2., 2., 2.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fnmsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask_fnmsub_pd(a, 0, b, c);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_fnmsub_pd(a, 0b00001111, b, c);
+ let e = _mm256_set_pd(-1., -2., -3., -4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fnmsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_maskz_fnmsub_pd(0, a, b, c);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_fnmsub_pd(0b00001111, a, b, c);
+ let e = _mm256_set_pd(-1., -2., -3., -4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask3_fnmsub_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set_pd(0., 1., 2., 3.);
+ let c = _mm256_set1_pd(1.);
+ let r = _mm256_mask3_fnmsub_pd(a, b, c, 0);
+ assert_eq_m256d(r, c);
+ let r = _mm256_mask3_fnmsub_pd(a, b, c, 0b00001111);
+ let e = _mm256_set_pd(-1., -2., -3., -4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fnmsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask_fnmsub_pd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fnmsub_pd(a, 0b00000011, b, c);
+ let e = _mm_set_pd(-1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fnmsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_maskz_fnmsub_pd(0, a, b, c);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_fnmsub_pd(0b00000011, a, b, c);
+ let e = _mm_set_pd(-1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask3_fnmsub_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set_pd(0., 1.);
+ let c = _mm_set1_pd(1.);
+ let r = _mm_mask3_fnmsub_pd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fnmsub_pd(a, b, c, 0b00000011);
+ let e = _mm_set_pd(-1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
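+ // rcp14 is an approximate reciprocal with a relative error of at most
+ // 2^-14; the tests pin the exact approximation these instructions return
+ // for 1/3 rather than the mathematically exact value.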
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rcp14_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_rcp14_pd(a);
+ let e = _mm512_set1_pd(0.3333320617675781);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rcp14_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_mask_rcp14_pd(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_rcp14_pd(a, 0b11110000, a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_pd(
+ 3., 3., 3., 3.,
+ 0.3333320617675781, 0.3333320617675781, 0.3333320617675781, 0.3333320617675781,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rcp14_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_maskz_rcp14_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_rcp14_pd(0b11110000, a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_pd(
+ 0., 0., 0., 0.,
+ 0.3333320617675781, 0.3333320617675781, 0.3333320617675781, 0.3333320617675781,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_rcp14_pd() {
+ let a = _mm256_set1_pd(3.);
+ let r = _mm256_rcp14_pd(a);
+ let e = _mm256_set1_pd(0.3333320617675781);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rcp14_pd() {
+ let a = _mm256_set1_pd(3.);
+ let r = _mm256_mask_rcp14_pd(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_rcp14_pd(a, 0b00001111, a);
+ let e = _mm256_set1_pd(0.3333320617675781);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rcp14_pd() {
+ let a = _mm256_set1_pd(3.);
+ let r = _mm256_maskz_rcp14_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_rcp14_pd(0b00001111, a);
+ let e = _mm256_set1_pd(0.3333320617675781);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_rcp14_pd() {
+ let a = _mm_set1_pd(3.);
+ let r = _mm_rcp14_pd(a);
+ let e = _mm_set1_pd(0.3333320617675781);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rcp14_pd() {
+ let a = _mm_set1_pd(3.);
+ let r = _mm_mask_rcp14_pd(a, 0, a);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_rcp14_pd(a, 0b00000011, a);
+ let e = _mm_set1_pd(0.3333320617675781);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rcp14_pd() {
+ let a = _mm_set1_pd(3.);
+ let r = _mm_maskz_rcp14_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_rcp14_pd(0b00000011, a);
+ let e = _mm_set1_pd(0.3333320617675781);
+ assert_eq_m128d(r, e);
+ }
+
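+ // rsqrt14 approximates 1/sqrt(x) with a relative error of at most 2^-14;
+ // 0.5773391723632813 is the approximation returned for 1/sqrt(3).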
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rsqrt14_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_rsqrt14_pd(a);
+ let e = _mm512_set1_pd(0.5773391723632813);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rsqrt14_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_mask_rsqrt14_pd(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_rsqrt14_pd(a, 0b11110000, a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_pd(
+ 3., 3., 3., 3.,
+ 0.5773391723632813, 0.5773391723632813, 0.5773391723632813, 0.5773391723632813,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rsqrt14_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_maskz_rsqrt14_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_rsqrt14_pd(0b11110000, a);
+ #[rustfmt::skip]
+ let e = _mm512_setr_pd(
+ 0., 0., 0., 0.,
+ 0.5773391723632813, 0.5773391723632813, 0.5773391723632813, 0.5773391723632813,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rsqrt14_pd() {
+ let a = _mm256_set1_pd(3.);
+ let r = _mm256_mask_rsqrt14_pd(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_rsqrt14_pd(a, 0b00001111, a);
+ let e = _mm256_set1_pd(0.5773391723632813);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rsqrt14_pd() {
+ let a = _mm256_set1_pd(3.);
+ let r = _mm256_maskz_rsqrt14_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_rsqrt14_pd(0b00001111, a);
+ let e = _mm256_set1_pd(0.5773391723632813);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rsqrt14_pd() {
+ let a = _mm_set1_pd(3.);
+ let r = _mm_mask_rsqrt14_pd(a, 0, a);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_rsqrt14_pd(a, 0b00000011, a);
+ let e = _mm_set1_pd(0.5773391723632813);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rsqrt14_pd() {
+ let a = _mm_set1_pd(3.);
+ let r = _mm_maskz_rsqrt14_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_rsqrt14_pd(0b00000011, a);
+ let e = _mm_set1_pd(0.5773391723632813);
+ assert_eq_m128d(r, e);
+ }
+
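+ // getexp extracts the unbiased exponent, i.e. floor(log2(|x|)), as a
+ // float; for 3.0 that is 1.0.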
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_getexp_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_getexp_pd(a);
+ let e = _mm512_set1_pd(1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_getexp_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_mask_getexp_pd(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_getexp_pd(a, 0b11110000, a);
+ let e = _mm512_setr_pd(3., 3., 3., 3., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_getexp_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_maskz_getexp_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_getexp_pd(0b11110000, a);
+ let e = _mm512_setr_pd(0., 0., 0., 0., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_getexp_pd() {
+ let a = _mm256_set1_pd(3.);
+ let r = _mm256_getexp_pd(a);
+ let e = _mm256_set1_pd(1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_getexp_pd() {
+ let a = _mm256_set1_pd(3.);
+ let r = _mm256_mask_getexp_pd(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_getexp_pd(a, 0b00001111, a);
+ let e = _mm256_set1_pd(1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_getexp_pd() {
+ let a = _mm256_set1_pd(3.);
+ let r = _mm256_maskz_getexp_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_getexp_pd(0b00001111, a);
+ let e = _mm256_set1_pd(1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_getexp_pd() {
+ let a = _mm_set1_pd(3.);
+ let r = _mm_getexp_pd(a);
+ let e = _mm_set1_pd(1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_getexp_pd() {
+ let a = _mm_set1_pd(3.);
+ let r = _mm_mask_getexp_pd(a, 0, a);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_getexp_pd(a, 0b00000011, a);
+ let e = _mm_set1_pd(1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_getexp_pd() {
+ let a = _mm_set1_pd(3.);
+ let r = _mm_maskz_getexp_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_getexp_pd(0b00000011, a);
+ let e = _mm_set1_pd(1.);
+ assert_eq_m128d(r, e);
+ }
+
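+ // roundscale rounds to the number of fraction bits given in IMM8[7:4],
+ // using the rounding mode in the low bits; an IMM8 of 0 rounds to an
+ // integer with round-to-nearest-even, so 1.1 becomes 1.0.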
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_roundscale_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_roundscale_pd::<0b00_00_00_00>(a);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_roundscale_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a);
+ let e = _mm512_set1_pd(1.1);
+ assert_eq_m512d(r, e);
+ let r = _mm512_mask_roundscale_pd::<0b00_00_00_00>(a, 0b11111111, a);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_roundscale_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_maskz_roundscale_pd::<0b00_00_00_00>(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_roundscale_pd::<0b00_00_00_00>(0b11111111, a);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_roundscale_pd() {
+ let a = _mm256_set1_pd(1.1);
+ let r = _mm256_roundscale_pd::<0b00_00_00_00>(a);
+ let e = _mm256_set1_pd(1.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_roundscale_pd() {
+ let a = _mm256_set1_pd(1.1);
+ let r = _mm256_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_roundscale_pd::<0b00_00_00_00>(a, 0b00001111, a);
+ let e = _mm256_set1_pd(1.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_roundscale_pd() {
+ let a = _mm256_set1_pd(1.1);
+ let r = _mm256_maskz_roundscale_pd::<0b00_00_00_00>(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_roundscale_pd::<0b00_00_00_00>(0b00001111, a);
+ let e = _mm256_set1_pd(1.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_roundscale_pd() {
+ let a = _mm_set1_pd(1.1);
+ let r = _mm_roundscale_pd::<0b00_00_00_00>(a);
+ let e = _mm_set1_pd(1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_roundscale_pd() {
+ let a = _mm_set1_pd(1.1);
+ let r = _mm_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a);
+ let e = _mm_set1_pd(1.1);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_roundscale_pd::<0b00_00_00_00>(a, 0b00000011, a);
+ let e = _mm_set1_pd(1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_roundscale_pd() {
+ let a = _mm_set1_pd(1.1);
+ let r = _mm_maskz_roundscale_pd::<0b00_00_00_00>(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_roundscale_pd::<0b00_00_00_00>(0b00000011, a);
+ let e = _mm_set1_pd(1.0);
+ assert_eq_m128d(r, e);
+ }
+
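+ // scalef computes `a * 2^floor(b)` per lane, so 1.0 scaled by 3.0 is 8.0.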
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_scalef_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_scalef_pd(a, b);
+ let e = _mm512_set1_pd(8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_scalef_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_mask_scalef_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_scalef_pd(a, 0b11110000, a, b);
+ let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_scalef_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_maskz_scalef_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_scalef_pd(0b11110000, a, b);
+ let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_scalef_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set1_pd(3.);
+ let r = _mm256_scalef_pd(a, b);
+ let e = _mm256_set1_pd(8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_scalef_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set1_pd(3.);
+ let r = _mm256_mask_scalef_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_scalef_pd(a, 0b00001111, a, b);
+ let e = _mm256_set1_pd(8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_scalef_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set1_pd(3.);
+ let r = _mm256_maskz_scalef_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_scalef_pd(0b00001111, a, b);
+ let e = _mm256_set1_pd(8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_scalef_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_scalef_pd(a, b);
+ let e = _mm_set1_pd(8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_scalef_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_scalef_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_scalef_pd(a, 0b00000011, a, b);
+ let e = _mm_set1_pd(8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_scalef_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_maskz_scalef_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_scalef_pd(0b00000011, a, b);
+ let e = _mm_set1_pd(8.);
+ assert_eq_m128d(r, e);
+ }
+
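+ // fixupimm classifies each source lane and patches special values using a
+ // 4-bit token looked up in the integer table `c` (see Intel's docs for the
+ // token encoding); the table used here rewrites NaN inputs to 0.0.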
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fixupimm_pd() {
+ let a = _mm512_set1_pd(f64::NAN);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_fixupimm_pd::<5>(a, b, c);
+ let e = _mm512_set1_pd(0.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fixupimm_pd() {
+ let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_mask_fixupimm_pd::<5>(a, 0b11110000, b, c);
+ let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fixupimm_pd() {
+ let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_maskz_fixupimm_pd::<5>(0b11110000, a, b, c);
+ let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_fixupimm_pd() {
+ let a = _mm256_set1_pd(f64::NAN);
+ let b = _mm256_set1_pd(f64::MAX);
+ let c = _mm256_set1_epi64x(i32::MAX as i64);
+ let r = _mm256_fixupimm_pd::<5>(a, b, c);
+ let e = _mm256_set1_pd(0.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_fixupimm_pd() {
+ let a = _mm256_set1_pd(f64::NAN);
+ let b = _mm256_set1_pd(f64::MAX);
+ let c = _mm256_set1_epi64x(i32::MAX as i64);
+ let r = _mm256_mask_fixupimm_pd::<5>(a, 0b00001111, b, c);
+ let e = _mm256_set1_pd(0.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_fixupimm_pd() {
+ let a = _mm256_set1_pd(f64::NAN);
+ let b = _mm256_set1_pd(f64::MAX);
+ let c = _mm256_set1_epi64x(i32::MAX as i64);
+ let r = _mm256_maskz_fixupimm_pd::<5>(0b00001111, a, b, c);
+ let e = _mm256_set1_pd(0.0);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_fixupimm_pd() {
+ let a = _mm_set1_pd(f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_fixupimm_pd::<5>(a, b, c);
+ let e = _mm_set1_pd(0.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_fixupimm_pd() {
+ let a = _mm_set1_pd(f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_mask_fixupimm_pd::<5>(a, 0b00000011, b, c);
+ let e = _mm_set1_pd(0.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_fixupimm_pd() {
+ let a = _mm_set1_pd(f64::NAN);
+ let b = _mm_set1_pd(f64::MAX);
+ let c = _mm_set1_epi64x(i32::MAX as i64);
+ let r = _mm_maskz_fixupimm_pd::<5>(0b00000011, a, b, c);
+ let e = _mm_set1_pd(0.0);
+ assert_eq_m128d(r, e);
+ }
+
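+ // ternarylogic evaluates an arbitrary three-input boolean function: IMM8
+ // is an 8-entry truth table indexed by the bit triple (a, b, c). IMM8 == 8
+ // (0b0000_1000) is `!a & b & c`, which is all zeros here because a, b and
+ // c have their set bits at different positions.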
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_ternarylogic_epi64() {
+ let a = _mm512_set1_epi64(1 << 2);
+ let b = _mm512_set1_epi64(1 << 1);
+ let c = _mm512_set1_epi64(1 << 0);
+ let r = _mm512_ternarylogic_epi64::<8>(a, b, c);
+ let e = _mm512_set1_epi64(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_ternarylogic_epi64() {
+ let src = _mm512_set1_epi64(1 << 2);
+ let a = _mm512_set1_epi64(1 << 1);
+ let b = _mm512_set1_epi64(1 << 0);
+ let r = _mm512_mask_ternarylogic_epi64::<8>(src, 0, a, b);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_ternarylogic_epi64::<8>(src, 0b11111111, a, b);
+ let e = _mm512_set1_epi64(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_ternarylogic_epi64() {
+ let a = _mm512_set1_epi64(1 << 2);
+ let b = _mm512_set1_epi64(1 << 1);
+ let c = _mm512_set1_epi64(1 << 0);
+ let r = _mm512_maskz_ternarylogic_epi64::<8>(0, a, b, c);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_ternarylogic_epi64::<8>(0b11111111, a, b, c);
+ let e = _mm512_set1_epi64(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_ternarylogic_epi64() {
+ let a = _mm256_set1_epi64x(1 << 2);
+ let b = _mm256_set1_epi64x(1 << 1);
+ let c = _mm256_set1_epi64x(1 << 0);
+ let r = _mm256_ternarylogic_epi64::<8>(a, b, c);
+ let e = _mm256_set1_epi64x(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_ternarylogic_epi64() {
+ let src = _mm256_set1_epi64x(1 << 2);
+ let a = _mm256_set1_epi64x(1 << 1);
+ let b = _mm256_set1_epi64x(1 << 0);
+ let r = _mm256_mask_ternarylogic_epi64::<8>(src, 0, a, b);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_ternarylogic_epi64::<8>(src, 0b00001111, a, b);
+ let e = _mm256_set1_epi64x(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_ternarylogic_epi64() {
+ let a = _mm256_set1_epi64x(1 << 2);
+ let b = _mm256_set1_epi64x(1 << 1);
+ let c = _mm256_set1_epi64x(1 << 0);
+ let r = _mm256_maskz_ternarylogic_epi64::<8>(0, a, b, c);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_ternarylogic_epi64::<8>(0b00001111, a, b, c);
+ let e = _mm256_set1_epi64x(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_ternarylogic_epi64() {
+ let a = _mm_set1_epi64x(1 << 2);
+ let b = _mm_set1_epi64x(1 << 1);
+ let c = _mm_set1_epi64x(1 << 0);
+ let r = _mm_ternarylogic_epi64::<8>(a, b, c);
+ let e = _mm_set1_epi64x(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_ternarylogic_epi64() {
+ let src = _mm_set1_epi64x(1 << 2);
+ let a = _mm_set1_epi64x(1 << 1);
+ let b = _mm_set1_epi64x(1 << 0);
+ let r = _mm_mask_ternarylogic_epi64::<8>(src, 0, a, b);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_ternarylogic_epi64::<8>(src, 0b00000011, a, b);
+ let e = _mm_set1_epi64x(0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_ternarylogic_epi64() {
+ let a = _mm_set1_epi64x(1 << 2);
+ let b = _mm_set1_epi64x(1 << 1);
+ let c = _mm_set1_epi64x(1 << 0);
+ let r = _mm_maskz_ternarylogic_epi64::<8>(0, a, b, c);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_ternarylogic_epi64::<8>(0b00000011, a, b, c);
+ let e = _mm_set1_epi64x(0);
+ assert_eq_m128i(r, e);
+ }
+
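+ // getmant extracts the mantissa, normalized here to the interval [1, 2)
+ // with the sign taken from the source; 10.0 == 1.25 * 2^3, so the
+ // expected mantissa is 1.25.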
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_getmant_pd() {
+ let a = _mm512_set1_pd(10.);
+ let r = _mm512_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a);
+ let e = _mm512_set1_pd(1.25);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_getmant_pd() {
+ let a = _mm512_set1_pd(10.);
+ let r = _mm512_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11110000, a);
+ let e = _mm512_setr_pd(10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_getmant_pd() {
+ let a = _mm512_set1_pd(10.);
+ let r = _mm512_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11110000, a);
+ let e = _mm512_setr_pd(0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_getmant_pd() {
+ let a = _mm256_set1_pd(10.);
+ let r = _mm256_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a);
+ let e = _mm256_set1_pd(1.25);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_getmant_pd() {
+ let a = _mm256_set1_pd(10.);
+ let r = _mm256_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b00001111, a);
+ let e = _mm256_set1_pd(1.25);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_getmant_pd() {
+ let a = _mm256_set1_pd(10.);
+ let r = _mm256_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00001111, a);
+ let e = _mm256_set1_pd(1.25);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_getmant_pd() {
+ let a = _mm_set1_pd(10.);
+ let r = _mm_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a);
+ let e = _mm_set1_pd(1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_getmant_pd() {
+ let a = _mm_set1_pd(10.);
+ let r = _mm_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b00000011, a);
+ let e = _mm_set1_pd(1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_getmant_pd() {
+ let a = _mm_set1_pd(10.);
+ let r = _mm_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_getmant_pd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00000011, a);
+ let e = _mm_set1_pd(1.25);
+ assert_eq_m128d(r, e);
+ }
+
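+ // The conversion tests below widen f32 to f64 (cvtps_pd) and narrow f64
+ // back to f32 (cvtpd_ps); the chosen values are exactly representable in
+ // both widths, so no rounding is involved.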
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtps_pd() {
+ let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvtps_pd(a);
+ let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtps_pd() {
+ let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm512_set1_pd(0.);
+ let r = _mm512_mask_cvtps_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_cvtps_pd(src, 0b00001111, a);
+ let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtps_pd() {
+ let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvtps_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_cvtps_pd(0b00001111, a);
+ let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtpslo_pd() {
+ let v2 = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 100., 100., 100., 100., 100., 100., 100., 100.,
+ );
+ let r = _mm512_cvtpslo_pd(v2);
+ let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtpslo_pd() {
+ let v2 = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 100., 100., 100., 100., 100., 100., 100., 100.,
+ );
+ let src = _mm512_set1_pd(0.);
+ let r = _mm512_mask_cvtpslo_pd(src, 0, v2);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_cvtpslo_pd(src, 0b00001111, v2);
+ let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtpd_ps() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvtpd_ps(a);
+ let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtpd_ps() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_ps(0.);
+ let r = _mm512_mask_cvtpd_ps(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm512_mask_cvtpd_ps(src, 0b00001111, a);
+ let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtpd_ps() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvtpd_ps(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm512_maskz_cvtpd_ps(0b00001111, a);
+ let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtpd_ps() {
+ let a = _mm256_set_pd(4., -5.5, 6., -7.5);
+ let src = _mm_set1_ps(0.);
+ let r = _mm256_mask_cvtpd_ps(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm256_mask_cvtpd_ps(src, 0b00001111, a);
+ let e = _mm_set_ps(4., -5.5, 6., -7.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtpd_ps() {
+ let a = _mm256_set_pd(4., -5.5, 6., -7.5);
+ let r = _mm256_maskz_cvtpd_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm256_maskz_cvtpd_ps(0b00001111, a);
+ let e = _mm_set_ps(4., -5.5, 6., -7.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtpd_ps() {
+ let a = _mm_set_pd(6., -7.5);
+ let src = _mm_set1_ps(0.);
+ let r = _mm_mask_cvtpd_ps(src, 0, a);
+ assert_eq_m128(r, src);
+ let r = _mm_mask_cvtpd_ps(src, 0b00000011, a);
+ let e = _mm_set_ps(0., 0., 6., -7.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtpd_ps() {
+ let a = _mm_set_pd(6., -7.5);
+ let r = _mm_maskz_cvtpd_ps(0, a);
+ assert_eq_m128(r, _mm_setzero_ps());
+ let r = _mm_maskz_cvtpd_ps(0b00000011, a);
+ let e = _mm_set_ps(0., 0., 6., -7.5);
+ assert_eq_m128(r, e);
+ }
+
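+ // Float-to-integer conversion rounds according to the current rounding
+ // mode (round-to-nearest-even by default), which is why -1.5 becomes -2
+ // and -3.5 becomes -4; the unsigned cvtpd_epu32 variants below round the
+ // same way.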
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvtpd_epi32(a);
+ let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm512_mask_cvtpd_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtpd_epi32(src, 0b11111111, a);
+ let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvtpd_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtpd_epi32(0b11111111, a);
+ let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtpd_epi32() {
+ let a = _mm256_set_pd(4., -5.5, 6., -7.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm256_mask_cvtpd_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtpd_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(4, -6, 6, -8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtpd_epi32() {
+ let a = _mm256_set_pd(4., -5.5, 6., -7.5);
+ let r = _mm256_maskz_cvtpd_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtpd_epi32(0b00001111, a);
+ let e = _mm_set_epi32(4, -6, 6, -8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtpd_epi32() {
+ let a = _mm_set_pd(6., -7.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvtpd_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtpd_epi32(src, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, -8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtpd_epi32() {
+ let a = _mm_set_pd(6., -7.5);
+ let r = _mm_maskz_cvtpd_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtpd_epi32(0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, -8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtpd_epu32() {
+ let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5);
+ let r = _mm512_cvtpd_epu32(a);
+ let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtpd_epu32() {
+ let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm512_mask_cvtpd_epu32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtpd_epu32(src, 0b11111111, a);
+ let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtpd_epu32() {
+ let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5);
+ let r = _mm512_maskz_cvtpd_epu32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtpd_epu32(0b11111111, a);
+ let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtpd_epu32() {
+ let a = _mm256_set_pd(4., 5.5, 6., 7.5);
+ let r = _mm256_cvtpd_epu32(a);
+ let e = _mm_set_epi32(4, 6, 6, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtpd_epu32() {
+ let a = _mm256_set_pd(4., 5.5, 6., 7.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm256_mask_cvtpd_epu32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtpd_epu32(src, 0b00001111, a);
+ let e = _mm_set_epi32(4, 6, 6, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtpd_epu32() {
+ let a = _mm256_set_pd(4., 5.5, 6., 7.5);
+ let r = _mm256_maskz_cvtpd_epu32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtpd_epu32(0b00001111, a);
+ let e = _mm_set_epi32(4, 6, 6, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtpd_epu32() {
+ let a = _mm_set_pd(6., 7.5);
+ let r = _mm_cvtpd_epu32(a);
+ let e = _mm_set_epi32(0, 0, 6, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtpd_epu32() {
+ let a = _mm_set_pd(6., 7.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvtpd_epu32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtpd_epu32(src, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, 8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtpd_epu32() {
+ let a = _mm_set_pd(6., 7.5);
+ let r = _mm_maskz_cvtpd_epu32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtpd_epu32(0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, 8);
+ assert_eq_m128i(r, e);
+ }
+
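+ // cvtpd_pslo converts the eight f64 lanes into the low eight f32 lanes of
+ // a __m512; the upper eight lanes of the result are always zeroed.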
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtpd_pslo() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvtpd_pslo(a);
+ let e = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtpd_pslo() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm512_set1_ps(0.);
+ let r = _mm512_mask_cvtpd_pslo(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_cvtpd_pslo(src, 0b00001111, a);
+ let e = _mm512_setr_ps(
+ 0., -1.5, 2., -3.5, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
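+ // The cvtepi{8,16,32}_epi64 tests below check sign extension. Only the
+ // lowest lanes of `a` participate (here 8..=15), and `src = -1` makes it
+ // visible which lanes the merge-masking variants leave untouched.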
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi8_epi64(a);
+ let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi64(-1);
+ let r = _mm512_mask_cvtepi8_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepi8_epi64(src, 0b00001111, a);
+ let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi8_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepi8_epi64(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi64x(-1);
+ let r = _mm256_mask_cvtepi8_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepi8_epi64(src, 0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepi8_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepi8_epi64(0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi64x(-1);
+ let r = _mm_mask_cvtepi8_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi8_epi64(src, 0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_cvtepi8_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi8_epi64(0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
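+ // The cvtepu{8,16,32}_epi64 tests check zero extension; with the
+ // non-negative inputs used here the results match the signed tests above.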
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepu8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepu8_epi64(a);
+ let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepu8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi64(-1);
+ let r = _mm512_mask_cvtepu8_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepu8_epi64(src, 0b00001111, a);
+ let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepu8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepu8_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepu8_epi64(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepu8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi64x(-1);
+ let r = _mm256_mask_cvtepu8_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepu8_epi64(src, 0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepu8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepu8_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepu8_epi64(0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepu8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi64x(-1);
+ let r = _mm_mask_cvtepu8_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepu8_epi64(src, 0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepu8_epi64() {
+ let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_cvtepu8_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepu8_epi64(0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi16_epi64(a);
+ let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi64(-1);
+ let r = _mm512_mask_cvtepi16_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepi16_epi64(src, 0b00001111, a);
+ let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi16_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepi16_epi64(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi64x(-1);
+ let r = _mm256_mask_cvtepi16_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepi16_epi64(src, 0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepi16_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepi16_epi64(0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi64x(-1);
+ let r = _mm_mask_cvtepi16_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi16_epi64(src, 0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_cvtepi16_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi16_epi64(0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepu16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepu16_epi64(a);
+ let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepu16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi64(-1);
+ let r = _mm512_mask_cvtepu16_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepu16_epi64(src, 0b00001111, a);
+ let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepu16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepu16_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepu16_epi64(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepu16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi64x(-1);
+ let r = _mm256_mask_cvtepu16_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepu16_epi64(src, 0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepu16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepu16_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepu16_epi64(0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepu16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi64x(-1);
+ let r = _mm_mask_cvtepu16_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepu16_epi64(src, 0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepu16_epi64() {
+ let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm_maskz_cvtepu16_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepu16_epi64(0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi32_epi64() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi32_epi64(a);
+ let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi32_epi64() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi64(-1);
+ let r = _mm512_mask_cvtepi32_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepi32_epi64(src, 0b00001111, a);
+ let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi32_epi64() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi32_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepi32_epi64(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi32_epi64() {
+ let a = _mm_set_epi32(8, 9, 10, 11);
+ let src = _mm256_set1_epi64x(-1);
+ let r = _mm256_mask_cvtepi32_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepi32_epi64(src, 0b00001111, a);
+ let e = _mm256_set_epi64x(8, 9, 10, 11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi32_epi64() {
+ let a = _mm_set_epi32(8, 9, 10, 11);
+ let r = _mm256_maskz_cvtepi32_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepi32_epi64(0b00001111, a);
+ let e = _mm256_set_epi64x(8, 9, 10, 11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi32_epi64() {
+ let a = _mm_set_epi32(8, 9, 10, 11);
+ let src = _mm_set1_epi64x(0);
+ let r = _mm_mask_cvtepi32_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi32_epi64(src, 0b00000011, a);
+ let e = _mm_set_epi64x(10, 11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi32_epi64() {
+ let a = _mm_set_epi32(8, 9, 10, 11);
+ let r = _mm_maskz_cvtepi32_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi32_epi64(0b00000011, a);
+ let e = _mm_set_epi64x(10, 11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepu32_epi64() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepu32_epi64(a);
+ let e = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepu32_epi64() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_epi64(-1);
+ let r = _mm512_mask_cvtepu32_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_cvtepu32_epi64(src, 0b00001111, a);
+ let e = _mm512_set_epi64(-1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepu32_epi64() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepu32_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_cvtepu32_epi64(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepu32_epi64() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let src = _mm256_set1_epi64x(-1);
+ let r = _mm256_mask_cvtepu32_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_cvtepu32_epi64(src, 0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepu32_epi64() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepu32_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_cvtepu32_epi64(0b00001111, a);
+ let e = _mm256_set_epi64x(12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepu32_epi64() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let src = _mm_set1_epi64x(-1);
+ let r = _mm_mask_cvtepu32_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepu32_epi64(src, 0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepu32_epi64() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let r = _mm_maskz_cvtepu32_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepu32_epi64(0b00000011, a);
+ let e = _mm_set_epi64x(14, 15);
+ assert_eq_m128i(r, e);
+ }
+
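+ // Every i32 is exactly representable as f64, so cvtepi32_pd involves no
+ // rounding.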
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi32_pd() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi32_pd(a);
+ let e = _mm512_set_pd(8., 9., 10., 11., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi32_pd() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_cvtepi32_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_cvtepi32_pd(src, 0b00001111, a);
+ let e = _mm512_set_pd(-1., -1., -1., -1., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi32_pd() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi32_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_cvtepi32_pd(0b00001111, a);
+ let e = _mm512_set_pd(0., 0., 0., 0., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let src = _mm256_set1_pd(-1.);
+ let r = _mm256_mask_cvtepi32_pd(src, 0, a);
+ assert_eq_m256d(r, src);
+ let r = _mm256_mask_cvtepi32_pd(src, 0b00001111, a);
+ let e = _mm256_set_pd(12., 13., 14., 15.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepi32_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_cvtepi32_pd(0b00001111, a);
+ let e = _mm256_set_pd(12., 13., 14., 15.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let src = _mm_set1_pd(-1.);
+ let r = _mm_mask_cvtepi32_pd(src, 0, a);
+ assert_eq_m128d(r, src);
+ let r = _mm_mask_cvtepi32_pd(src, 0b00000011, a);
+ let e = _mm_set_pd(14., 15.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let r = _mm_maskz_cvtepi32_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_cvtepi32_pd(0b00000011, a);
+ let e = _mm_set_pd(14., 15.);
+ assert_eq_m128d(r, e);
+ }
+
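+ // cvtepu32_pd treats the source lanes as unsigned; the conversion is
+ // likewise exact.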
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepu32_pd() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepu32_pd(a);
+ let e = _mm512_set_pd(8., 9., 10., 11., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepu32_pd() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_cvtepu32_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_cvtepu32_pd(src, 0b00001111, a);
+ let e = _mm512_set_pd(-1., -1., -1., -1., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepu32_pd() {
+ let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepu32_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_cvtepu32_pd(0b00001111, a);
+ let e = _mm512_set_pd(0., 0., 0., 0., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtepu32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let r = _mm256_cvtepu32_pd(a);
+ let e = _mm256_set_pd(12., 13., 14., 15.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepu32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let src = _mm256_set1_pd(-1.);
+ let r = _mm256_mask_cvtepu32_pd(src, 0, a);
+ assert_eq_m256d(r, src);
+ let r = _mm256_mask_cvtepu32_pd(src, 0b00001111, a);
+ let e = _mm256_set_pd(12., 13., 14., 15.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepu32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepu32_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_cvtepu32_pd(0b00001111, a);
+ let e = _mm256_set_pd(12., 13., 14., 15.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtepu32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let r = _mm_cvtepu32_pd(a);
+ let e = _mm_set_pd(14., 15.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepu32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let src = _mm_set1_pd(-1.);
+ let r = _mm_mask_cvtepu32_pd(src, 0, a);
+ assert_eq_m128d(r, src);
+ let r = _mm_mask_cvtepu32_pd(src, 0b00000011, a);
+ let e = _mm_set_pd(14., 15.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepu32_pd() {
+ let a = _mm_set_epi32(12, 13, 14, 15);
+ let r = _mm_maskz_cvtepu32_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_cvtepu32_pd(0b00000011, a);
+ let e = _mm_set_pd(14., 15.);
+ assert_eq_m128d(r, e);
+ }
+
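+ // The *lo_pd variants convert only the low eight i32 lanes of the
+ // __m512i source (here 8..=15); the upper half is ignored.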
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi32lo_pd() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi32lo_pd(a);
+ let e = _mm512_set_pd(8., 9., 10., 11., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi32lo_pd() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_cvtepi32lo_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_cvtepi32lo_pd(src, 0b00001111, a);
+ let e = _mm512_set_pd(-1., -1., -1., -1., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepu32lo_pd() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepu32lo_pd(a);
+ let e = _mm512_set_pd(8., 9., 10., 11., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepu32lo_pd() {
+ let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_cvtepu32lo_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_cvtepu32lo_pd(src, 0b00001111, a);
+ let e = _mm512_set_pd(-1., -1., -1., -1., 12., 13., 14., 15.);
+ assert_eq_m512d(r, e);
+ }
+
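+ // The cvtepi64_epi{32,16,8} family narrows by plain truncation, keeping
+ // the low bits of each lane; unused lanes of the 128-bit result are zeroed.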
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi64_epi32() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi64_epi32(a);
+ let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi64_epi32() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm256_set1_epi32(-1);
+ let r = _mm512_mask_cvtepi64_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtepi64_epi32(src, 0b00001111, a);
+ let e = _mm256_set_epi32(-1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi64_epi32() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi64_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtepi64_epi32(0b00001111, a);
+ let e = _mm256_set_epi32(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtepi64_epi32() {
+ let a = _mm256_set_epi64x(1, 2, 3, 4);
+ let r = _mm256_cvtepi64_epi32(a);
+ let e = _mm_set_epi32(1, 2, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi64_epi32() {
+ let a = _mm256_set_epi64x(1, 2, 3, 4);
+ let src = _mm_set1_epi32(0);
+ let r = _mm256_mask_cvtepi64_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtepi64_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(1, 2, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi64_epi32() {
+ let a = _mm256_set_epi64x(1, 2, 3, 4);
+ let r = _mm256_maskz_cvtepi64_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtepi64_epi32(0b00001111, a);
+ let e = _mm_set_epi32(1, 2, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtepi64_epi32() {
+ let a = _mm_set_epi64x(3, 4);
+ let r = _mm_cvtepi64_epi32(a);
+ let e = _mm_set_epi32(0, 0, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi64_epi32() {
+ let a = _mm_set_epi64x(3, 4);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvtepi64_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi64_epi32(src, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi64_epi32() {
+ let a = _mm_set_epi64x(3, 4);
+ let r = _mm_maskz_cvtepi64_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi64_epi32(0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi64_epi16() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi64_epi16(a);
+ let e = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi64_epi16() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set1_epi16(-1);
+ let r = _mm512_mask_cvtepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtepi64_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(-1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi64_epi16() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtepi64_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtepi64_epi16() {
+ let a = _mm256_set_epi64x(12, 13, 14, 15);
+ let r = _mm256_cvtepi64_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi64_epi16() {
+ let a = _mm256_set_epi64x(12, 13, 14, 15);
+ let src = _mm_set1_epi16(0);
+ let r = _mm256_mask_cvtepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtepi64_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi64_epi16() {
+ let a = _mm256_set_epi64x(12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtepi64_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtepi64_epi16() {
+ let a = _mm_set_epi64x(14, 15);
+ let r = _mm_cvtepi64_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi64_epi16() {
+ let a = _mm_set_epi64x(14, 15);
+ let src = _mm_set1_epi16(0);
+ let r = _mm_mask_cvtepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi64_epi16(src, 0b00000011, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi64_epi16() {
+ let a = _mm_set_epi64x(14, 15);
+ let r = _mm_maskz_cvtepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi64_epi16(0b00000011, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtepi64_epi8() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_cvtepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi64_epi8() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_mask_cvtepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtepi64_epi8(src, 0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtepi64_epi8() {
+ let a = _mm512_set_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ let r = _mm512_maskz_cvtepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtepi64_epi8(0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtepi64_epi8() {
+ let a = _mm256_set_epi64x(12, 13, 14, 15);
+ let r = _mm256_cvtepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi64_epi8() {
+ let a = _mm256_set_epi64x(12, 13, 14, 15);
+ let src = _mm_set1_epi8(0);
+ let r = _mm256_mask_cvtepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtepi64_epi8(src, 0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtepi64_epi8() {
+ let a = _mm256_set_epi64x(12, 13, 14, 15);
+ let r = _mm256_maskz_cvtepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtepi64_epi8(0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtepi64_epi8() {
+ let a = _mm_set_epi64x(14, 15);
+ let r = _mm_cvtepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi64_epi8() {
+ let a = _mm_set_epi64x(14, 15);
+ let src = _mm_set1_epi8(0);
+ let r = _mm_mask_cvtepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtepi64_epi8(src, 0b00000011, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtepi64_epi8() {
+ let a = _mm_set_epi64x(14, 15);
+ let r = _mm_maskz_cvtepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtepi64_epi8(0b00000011, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15);
+ assert_eq_m128i(r, e);
+ }
+
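+ // The cvtsepi64_* family narrows with signed saturation: i64::MIN and
+ // i64::MAX clamp to the destination type's MIN and MAX.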
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtsepi64_epi32() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let r = _mm512_cvtsepi64_epi32(a);
+ let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, i32::MIN, i32::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi64_epi32() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let src = _mm256_set1_epi32(-1);
+ let r = _mm512_mask_cvtsepi64_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtsepi64_epi32(src, 0b00001111, a);
+ let e = _mm256_set_epi32(-1, -1, -1, -1, 4, 5, i32::MIN, i32::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtsepi64_epi32() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let r = _mm512_maskz_cvtsepi64_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtsepi64_epi32(0b00001111, a);
+ let e = _mm256_set_epi32(0, 0, 0, 0, 4, 5, i32::MIN, i32::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtsepi64_epi32() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let r = _mm256_cvtsepi64_epi32(a);
+ let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi64_epi32() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let src = _mm_set1_epi32(-1);
+ let r = _mm256_mask_cvtsepi64_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtsepi64_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtsepi64_epi32() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let r = _mm256_maskz_cvtsepi64_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtsepi64_epi32(0b00001111, a);
+ let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtsepi64_epi32() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let r = _mm_cvtsepi64_epi32(a);
+ let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi64_epi32() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvtsepi64_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtsepi64_epi32(src, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtsepi64_epi32() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let r = _mm_maskz_cvtsepi64_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtsepi64_epi32(0b00000011, a);
+ let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtsepi64_epi16() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let r = _mm512_cvtsepi64_epi16(a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi64_epi16() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let src = _mm_set1_epi16(-1);
+ let r = _mm512_mask_cvtsepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtsepi64_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(-1, -1, -1, -1, 4, 5, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtsepi64_epi16() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let r = _mm512_maskz_cvtsepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtsepi64_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtsepi64_epi16() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let r = _mm256_cvtsepi64_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi64_epi16() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let src = _mm_set1_epi16(0);
+ let r = _mm256_mask_cvtsepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtsepi64_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtsepi64_epi16() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let r = _mm256_maskz_cvtsepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtsepi64_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtsepi64_epi16() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let r = _mm_cvtsepi64_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi64_epi16() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let src = _mm_set1_epi16(0);
+ let r = _mm_mask_cvtsepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtsepi64_epi16(src, 0b00000011, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtsepi64_epi16() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let r = _mm_maskz_cvtsepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtsepi64_epi16(0b00000011, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtsepi64_epi8() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let r = _mm512_cvtsepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, i8::MIN, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi64_epi8() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_mask_cvtsepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtsepi64_epi8(src, 0b00001111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ -1, -1, -1, -1,
+ 4, 5, i8::MIN, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtsepi64_epi8() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX);
+ let r = _mm512_maskz_cvtsepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtsepi64_epi8(0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtsepi64_epi8() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let r = _mm256_cvtsepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi64_epi8() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let src = _mm_set1_epi8(0);
+ let r = _mm256_mask_cvtsepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtsepi64_epi8(src, 0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtsepi64_epi8() {
+ let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX);
+ let r = _mm256_maskz_cvtsepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtsepi64_epi8(0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtsepi64_epi8() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let r = _mm_cvtsepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi64_epi8() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let src = _mm_set1_epi8(0);
+ let r = _mm_mask_cvtsepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtsepi64_epi8(src, 0b00000011, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtsepi64_epi8() {
+ let a = _mm_set_epi64x(i64::MIN, i64::MAX);
+ let r = _mm_maskz_cvtsepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtsepi64_epi8(0b00000011, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
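+ // The cvtusepi64_* family narrows with unsigned saturation: the source is
+ // treated as unsigned, so both i64::MIN (2^63 as u64) and i64::MAX exceed
+ // the destination range and clamp to all-ones, printed as -1 through the
+ // signed set/assert helpers.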
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtusepi64_epi32() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let r = _mm512_cvtusepi64_epi32(a);
+ let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi64_epi32() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let src = _mm256_set1_epi32(-1);
+ let r = _mm512_mask_cvtusepi64_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtusepi64_epi32(src, 0b00001111, a);
+ let e = _mm256_set_epi32(-1, -1, -1, -1, 4, 5, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtusepi64_epi32() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let r = _mm512_maskz_cvtusepi64_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtusepi64_epi32(0b00001111, a);
+ let e = _mm256_set_epi32(0, 0, 0, 0, 4, 5, -1, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtusepi64_epi32() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let r = _mm256_cvtusepi64_epi32(a);
+ let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi64_epi32() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let src = _mm_set1_epi32(0);
+ let r = _mm256_mask_cvtusepi64_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtusepi64_epi32(src, 0b00001111, a);
+ let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtusepi64_epi32() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let r = _mm256_maskz_cvtusepi64_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtusepi64_epi32(0b00001111, a);
+ let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtusepi64_epi32() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let r = _mm_cvtusepi64_epi32(a);
+ let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi64_epi32() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvtusepi64_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtusepi64_epi32(src, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtusepi64_epi32() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let r = _mm_maskz_cvtusepi64_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtusepi64_epi32(0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtusepi64_epi16() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let r = _mm512_cvtusepi64_epi16(a);
+ let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi64_epi16() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let src = _mm_set1_epi16(-1);
+ let r = _mm512_mask_cvtusepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtusepi64_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(-1, -1, -1, -1, 4, 5, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtusepi64_epi16() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let r = _mm512_maskz_cvtusepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtusepi64_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtusepi64_epi16() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let r = _mm256_cvtusepi64_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi64_epi16() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let src = _mm_set1_epi16(0);
+ let r = _mm256_mask_cvtusepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtusepi64_epi16(src, 0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtusepi64_epi16() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let r = _mm256_maskz_cvtusepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtusepi64_epi16(0b00001111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtusepi64_epi16() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let r = _mm_cvtusepi64_epi16(a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi64_epi16() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let src = _mm_set1_epi16(0);
+ let r = _mm_mask_cvtusepi64_epi16(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtusepi64_epi16(src, 0b00000011, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtusepi64_epi16() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let r = _mm_maskz_cvtusepi64_epi16(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtusepi64_epi16(0b00000011, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtusepi64_epi8() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let r = _mm512_cvtusepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi64_epi8() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_mask_cvtusepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm512_mask_cvtusepi64_epi8(src, 0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 4, 5, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtusepi64_epi8() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN);
+ let r = _mm512_maskz_cvtusepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm512_maskz_cvtusepi64_epi8(0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, -1, -1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvtusepi64_epi8() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let r = _mm256_cvtusepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi64_epi8() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let src = _mm_set1_epi8(0);
+ let r = _mm256_mask_cvtusepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvtusepi64_epi8(src, 0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtusepi64_epi8() {
+ let a = _mm256_set_epi64x(4, 5, 6, i64::MAX);
+ let r = _mm256_maskz_cvtusepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvtusepi64_epi8(0b00001111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvtusepi64_epi8() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let r = _mm_cvtusepi64_epi8(a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi64_epi8() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let src = _mm_set1_epi8(0);
+ let r = _mm_mask_cvtusepi64_epi8(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvtusepi64_epi8(src, 0b00000011, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvtusepi64_epi8() {
+ let a = _mm_set_epi64x(6, i64::MAX);
+ let r = _mm_maskz_cvtusepi64_epi8(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvtusepi64_epi8(0b00000011, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8);
+ assert_eq_m128i(r, e);
+ }
+
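+ // cvtt_roundpd_* truncates toward zero; _MM_FROUND_NO_EXC only suppresses
+ // floating-point exceptions (SAE). For the unsigned variant, negative
+ // inputs are out of range and yield 0xFFFF_FFFF, shown as -1 below.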
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtt_roundpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(a);
+ let e = _mm256_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtt_roundpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm512_mask_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtt_roundpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvtt_roundpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtt_roundpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm512_mask_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvtt_roundpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
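+ // cvttpd_epi32 truncates toward zero regardless of the MXCSR rounding
+ // mode: -1.5 -> -1 and -7.5 -> -7.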
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvttpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvttpd_epi32(a);
+ let e = _mm256_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvttpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm512_mask_cvttpd_epi32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvttpd_epi32(src, 0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvttpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvttpd_epi32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvttpd_epi32(0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvttpd_epi32() {
+ let a = _mm256_setr_pd(4., -5.5, 6., -7.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm256_mask_cvttpd_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvttpd_epi32(src, 0b00001111, a);
+ let e = _mm_setr_epi32(4, -5, 6, -7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvttpd_epi32() {
+ let a = _mm256_setr_pd(4., -5.5, 6., -7.5);
+ let r = _mm256_maskz_cvttpd_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvttpd_epi32(0b00001111, a);
+ let e = _mm_setr_epi32(4, -5, 6, -7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvttpd_epi32() {
+ let a = _mm_set_pd(6., -7.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvttpd_epi32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvttpd_epi32(src, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, -7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvttpd_epi32() {
+ let a = _mm_set_pd(6., -7.5);
+ let r = _mm_maskz_cvttpd_epi32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvttpd_epi32(0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, -7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvttpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvttpd_epu32(a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvttpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm512_mask_cvttpd_epu32(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvttpd_epu32(src, 0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvttpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvttpd_epu32(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvttpd_epu32(0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cvttpd_epu32() {
+ let a = _mm256_set_pd(4., 5.5, 6., 7.5);
+ let r = _mm256_cvttpd_epu32(a);
+ let e = _mm_set_epi32(4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvttpd_epu32() {
+ let a = _mm256_set_pd(4., 5.5, 6., 7.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm256_mask_cvttpd_epu32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm256_mask_cvttpd_epu32(src, 0b00001111, a);
+ let e = _mm_set_epi32(4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_cvttpd_epu32() {
+ let a = _mm256_set_pd(4., 5.5, 6., 7.5);
+ let r = _mm256_maskz_cvttpd_epu32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm256_maskz_cvttpd_epu32(0b00001111, a);
+ let e = _mm_set_epi32(4, 5, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cvttpd_epu32() {
+ let a = _mm_set_pd(6., 7.5);
+ let r = _mm_cvttpd_epu32(a);
+ let e = _mm_set_epi32(0, 0, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvttpd_epu32() {
+ let a = _mm_set_pd(6., 7.5);
+ let src = _mm_set1_epi32(0);
+ let r = _mm_mask_cvttpd_epu32(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_cvttpd_epu32(src, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_cvttpd_epu32() {
+ let a = _mm_set_pd(6., 7.5);
+ let r = _mm_maskz_cvttpd_epu32(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_cvttpd_epu32(0b00000011, a);
+ let e = _mm_set_epi32(0, 0, 6, 7);
+ assert_eq_m128i(r, e);
+ }
+
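+ // The `_round` arithmetic intrinsics take the rounding mode as a const
+ // generic: 7e-18 - 1 rounds to -1.0 under TO_NEAREST_INT but to
+ // -0.9999999999999999 (the next double toward zero) under TO_ZERO.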
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_add_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
+ let b = _mm512_set1_pd(-1.);
+ let r = _mm512_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -1.0);
+ assert_eq_m512d(r, e);
+ let r = _mm512_add_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999999999999);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_add_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
+ let b = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11110000, a, b,
+ );
+ let e = _mm512_setr_pd(8., 9.5, 10., 11.5, 11., 12.5, 13., -1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_add_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
+ let b = _mm512_set1_pd(-1.);
+ let r =
+ _mm512_maskz_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11110000, a, b,
+ );
+ let e = _mm512_setr_pd(0., 0., 0., 0., 11., 12.5, 13., -1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sub_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let r = _mm512_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -1.0);
+ assert_eq_m512d(r, e);
+ let r = _mm512_sub_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999999999999);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sub_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let r = _mm512_mask_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11110000, a, b,
+ );
+ let e = _mm512_setr_pd(8., 9.5, 10., 11.5, 11., 12.5, 13., -1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sub_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let r =
+ _mm512_maskz_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_sub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11110000, a, b,
+ );
+ let e = _mm512_setr_pd(0., 0., 0., 0., 11., 12.5, 13., -1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mul_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.);
+ let b = _mm512_set1_pd(0.1);
+ let r = _mm512_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_setr_pd(
+ 0.8,
+ 0.9500000000000001,
+ 1.,
+ 1.1500000000000001,
+ 1.2000000000000002,
+ 1.35,
+ 1.4000000000000001,
+ 0.,
+ );
+ assert_eq_m512d(r, e);
+ let r = _mm512_mul_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_setr_pd(0.8, 0.95, 1.0, 1.15, 1.2, 1.3499999999999999, 1.4, 0.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mul_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.);
+ let b = _mm512_set1_pd(0.1);
+ let r = _mm512_mask_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11110000, a, b,
+ );
+ let e = _mm512_setr_pd(
+ 8.,
+ 9.5,
+ 10.,
+ 11.5,
+ 1.2000000000000002,
+ 1.35,
+ 1.4000000000000001,
+ 0.,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mul_round_pd() {
+ let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.);
+ let b = _mm512_set1_pd(0.1);
+ let r =
+ _mm512_maskz_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_mul_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11110000, a, b,
+ );
+ let e = _mm512_setr_pd(
+ 0.,
+ 0.,
+ 0.,
+ 0.,
+ 1.2000000000000002,
+ 1.35,
+ 1.4000000000000001,
+ 0.,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_div_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_set1_pd(0.3333333333333333);
+ assert_eq_m512d(r, e);
+ let r = _mm512_div_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_set1_pd(0.3333333333333333);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_div_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_mask_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11110000, a, b,
+ );
+ let e = _mm512_setr_pd(
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 0.3333333333333333,
+ 0.3333333333333333,
+ 0.3333333333333333,
+ 0.3333333333333333,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_div_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r =
+ _mm512_maskz_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_div_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11110000, a, b,
+ );
+ let e = _mm512_setr_pd(
+ 0.,
+ 0.,
+ 0.,
+ 0.,
+ 0.3333333333333333,
+ 0.3333333333333333,
+ 0.3333333333333333,
+ 0.3333333333333333,
+ );
+ assert_eq_m512d(r, e);
+ }
+
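+ // sqrt(3) is irrational, so TO_POS_INF yields the next representable double
+ // above the TO_NEAREST result.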
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sqrt_round_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_set1_pd(1.7320508075688772);
+ assert_eq_m512d(r, e);
+ let r = _mm512_sqrt_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
+ let e = _mm512_set1_pd(1.7320508075688774);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sqrt_round_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r =
+ _mm512_mask_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11110000, a,
+ );
+ let e = _mm512_setr_pd(
+ 3.,
+ 3.,
+ 3.,
+ 3.,
+ 1.7320508075688772,
+ 1.7320508075688772,
+ 1.7320508075688772,
+ 1.7320508075688772,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sqrt_round_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r =
+ _mm512_maskz_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_sqrt_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11110000, a,
+ );
+ let e = _mm512_setr_pd(
+ 0.,
+ 0.,
+ 0.,
+ 0.,
+ 1.7320508075688772,
+ 1.7320508075688772,
+ 1.7320508075688772,
+ 1.7320508075688772,
+ );
+ assert_eq_m512d(r, e);
+ }
+
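+ // The fused multiply-add family computes a*b+c (and sign variants) with a
+ // single rounding at the end, so 7e-18*1 - 1 separates TO_NEAREST from
+ // TO_ZERO exactly as the add/sub tests above do.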
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_pd(-1.);
+ assert_eq_m512d(r, e);
+ let r = _mm512_fmadd_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_pd(-0.9999999999999999);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b00001111, b, c,
+ );
+ let e = _mm512_setr_pd(
+ -1.,
+ -1.,
+ -1.,
+ -1.,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_maskz_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00001111, a, b, c,
+ );
+ let e = _mm512_setr_pd(-1., -1., -1., -1., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_mask3_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b00001111,
+ );
+ let e = _mm512_setr_pd(-1., -1., -1., -1., -1., -1., -1., -1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_pd(-1.);
+ assert_eq_m512d(r, e);
+ let r = _mm512_fmsub_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_pd(-0.9999999999999999);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b00001111, b, c,
+ );
+ let e = _mm512_setr_pd(
+ -1.,
+ -1.,
+ -1.,
+ -1.,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00001111, a, b, c,
+ );
+ let e = _mm512_setr_pd(-1., -1., -1., -1., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask3_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b00001111,
+ );
+ let e = _mm512_setr_pd(-1., -1., -1., -1., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
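+ // fmaddsub alternates per lane: even-indexed lanes compute a*b-c and
+ // odd-indexed lanes a*b+c; fmsubadd is the mirror image.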
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmaddsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r =
+ _mm512_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_setr_pd(1., -1., 1., -1., 1., -1., 1., -1.);
+ assert_eq_m512d(r, e);
+ let r = _mm512_fmaddsub_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_setr_pd(
+ 1.,
+ -0.9999999999999999,
+ 1.,
+ -0.9999999999999999,
+ 1.,
+ -0.9999999999999999,
+ 1.,
+ -0.9999999999999999,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmaddsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b00001111, b, c,
+ );
+ let e = _mm512_setr_pd(
+ 1.,
+ -1.,
+ 1.,
+ -1.,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmaddsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_maskz_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00001111, a, b, c,
+ );
+ let e = _mm512_setr_pd(1., -1., 1., -1., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmaddsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_mask3_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fmaddsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b00001111,
+ );
+ let e = _mm512_setr_pd(1., -1., 1., -1., -1., -1., -1., -1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fmsubadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r =
+ _mm512_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_setr_pd(-1., 1., -1., 1., -1., 1., -1., 1.);
+ assert_eq_m512d(r, e);
+ let r = _mm512_fmsubadd_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_setr_pd(
+ -0.9999999999999999,
+ 1.,
+ -0.9999999999999999,
+ 1.,
+ -0.9999999999999999,
+ 1.,
+ -0.9999999999999999,
+ 1.,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fmsubadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b00001111, b, c,
+ );
+ let e = _mm512_setr_pd(
+ -1.,
+ 1.,
+ -1.,
+ 1.,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fmsubadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_maskz_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00001111, a, b, c,
+ );
+ let e = _mm512_setr_pd(-1., 1., -1., 1., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fmsubadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_mask3_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fmsubadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b00001111,
+ );
+ let e = _mm512_setr_pd(-1., 1., -1., 1., -1., -1., -1., -1.);
+ assert_eq_m512d(r, e);
+ }
+
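+ // fnmadd computes -(a*b)+c and fnmsub -(a*b)-c; again only the final
+ // rounding step differs between the two modes tested.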
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fnmadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(1.);
+ let r =
+ _mm512_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_pd(1.);
+ assert_eq_m512d(r, e);
+ let r = _mm512_fnmadd_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_pd(0.9999999999999999);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fnmadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b00001111, b, c,
+ );
+ let e = _mm512_setr_pd(
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fnmadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_maskz_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00001111, a, b, c,
+ );
+ let e = _mm512_setr_pd(1., 1., 1., 1., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fnmadd_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(1.);
+ let r = _mm512_mask3_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fnmadd_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b00001111,
+ );
+ let e = _mm512_setr_pd(1., 1., 1., 1., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fnmsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r =
+ _mm512_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_pd(1.);
+ assert_eq_m512d(r, e);
+ let r = _mm512_fnmsub_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
+ let e = _mm512_set1_pd(0.9999999999999999);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fnmsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_mask_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, b, c,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b00001111, b, c,
+ );
+ let e = _mm512_setr_pd(
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ 0.000000000000000007,
+ );
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fnmsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_maskz_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b, c,
+ );
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b00001111, a, b, c,
+ );
+ let e = _mm512_setr_pd(1., 1., 1., 1., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask3_fnmsub_round_pd() {
+ let a = _mm512_set1_pd(0.000000000000000007);
+ let b = _mm512_set1_pd(1.);
+ let c = _mm512_set1_pd(-1.);
+ let r = _mm512_mask3_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0,
+ );
+ assert_eq_m512d(r, c);
+ let r = _mm512_mask3_fnmsub_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, b, c, 0b00001111,
+ );
+ let e = _mm512_setr_pd(1., 1., 1., 1., -1., -1., -1., -1.);
+ assert_eq_m512d(r, e);
+ }
+
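+ // _MM_FROUND_CUR_DIRECTION uses whatever rounding mode MXCSR currently
+ // holds; max/min are exact, so the mode is irrelevant for these tests.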
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_max_round_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm512_setr_pd(7., 6., 5., 4., 4., 5., 6., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_max_round_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_mask_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0b00001111, a, b);
+ let e = _mm512_setr_pd(7., 6., 5., 4., 4., 5., 6., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_max_round_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_maskz_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_max_round_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a, b);
+ let e = _mm512_setr_pd(7., 6., 5., 4., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_min_round_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, b);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 3., 2., 1., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_min_round_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_mask_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0b00001111, a, b);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_min_round_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.);
+ let r = _mm512_maskz_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_min_round_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a, b);
+ let e = _mm512_setr_pd(0., 1., 2., 3., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
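+ // getexp extracts floor(log2(|x|)) as a double: for 3.0 that is 1.0.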
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_getexp_round_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e = _mm512_set1_pd(1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_getexp_round_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_mask_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11110000, a);
+ let e = _mm512_setr_pd(3., 3., 3., 3., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_getexp_round_pd() {
+ let a = _mm512_set1_pd(3.);
+ let r = _mm512_maskz_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_getexp_round_pd::<_MM_FROUND_CUR_DIRECTION>(0b11110000, a);
+ let e = _mm512_setr_pd(0., 0., 0., 0., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
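+ // roundscale rounds to 2^-IMM8[7:4] fractional bits; with IMM8 = 0 it
+ // rounds to the nearest integer, so 1.1 becomes 1.0.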
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_roundscale_round_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(a);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_roundscale_round_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_mask_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a);
+ let e = _mm512_set1_pd(1.1);
+ assert_eq_m512d(r, e);
+ let r = _mm512_mask_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_roundscale_round_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_maskz_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_roundscale_round_pd::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
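+ // scalef computes a * 2^floor(b): here 1 * 2^3 = 8.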
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_scalef_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_set1_pd(8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_scalef_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_mask_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0, a, b,
+ );
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ a, 0b11110000, a, b,
+ );
+ let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_scalef_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_maskz_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0, a, b,
+ );
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_scalef_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b11110000, a, b,
+ );
+ let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
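+ // fixupimm classifies each input lane and replaces it via a 4-bit token
+ // table packed in `c`; with the table used here the NaN lanes come out
+ // comparing equal to 0.0.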
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fixupimm_round_pd() {
+ let a = _mm512_set1_pd(f64::NAN);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_fixupimm_round_pd::<5, _MM_FROUND_CUR_DIRECTION>(a, b, c);
+ let e = _mm512_set1_pd(0.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fixupimm_round_pd() {
+ let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_mask_fixupimm_round_pd::<5, _MM_FROUND_CUR_DIRECTION>(a, 0b11110000, b, c);
+ let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fixupimm_round_pd() {
+ let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_maskz_fixupimm_round_pd::<5, _MM_FROUND_CUR_DIRECTION>(0b11110000, a, b, c);
+ let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
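+ // getmant extracts the mantissa normalized to [1, 2): 10.0 = 1.25 * 2^3.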
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_getmant_round_pd() {
+ let a = _mm512_set1_pd(10.);
+ let r = _mm512_getmant_round_pd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a);
+ let e = _mm512_set1_pd(1.25);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_getmant_round_pd() {
+ let a = _mm512_set1_pd(10.);
+ let r = _mm512_mask_getmant_round_pd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_getmant_round_pd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(a, 0b11110000, a);
+ let e = _mm512_setr_pd(10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_getmant_round_pd() {
+ let a = _mm512_set1_pd(10.);
+ let r = _mm512_maskz_getmant_round_pd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_getmant_round_pd::<
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ >(0b11110000, a);
+ let e = _mm512_setr_pd(0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25);
+ assert_eq_m512d(r, e);
+ }
+
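+ // Widening f32 -> f64 is exact, so the rounding mode only matters for the
+ // narrowing and float-to-integer conversions that follow.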
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundps_pd() {
+ let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundps_pd() {
+ let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm512_set1_pd(0.);
+ let r = _mm512_mask_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a);
+ let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundps_pd() {
+ let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a);
+ let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundpd_ps() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundpd_ps() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_ps(0.);
+ let r = _mm512_mask_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0, a);
+ assert_eq_m256(r, src);
+ let r = _mm512_mask_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a);
+ let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundpd_ps() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(0, a);
+ assert_eq_m256(r, _mm256_setzero_ps());
+ let r = _mm512_maskz_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a);
+ let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.);
+ assert_eq_m256(r, e);
+ }
+
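+ // Unlike the `cvtt` (truncating) tests earlier, `cvt` honors the rounding
+ // mode: under the default nearest-even direction -1.5 converts to -2 and
+ // -3.5 to -4.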
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm512_mask_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a);
+ let e = _mm256_setr_epi32(0, -2, 2, -4, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundpd_epi32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a);
+ let e = _mm256_setr_epi32(0, -2, 2, -4, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cvt_roundpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvt_roundpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let src = _mm256_set1_epi32(0);
+ let r = _mm512_mask_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_cvt_roundpd_epu32() {
+ let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+ let r = _mm512_maskz_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a);
+ let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
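+ // The `set` constructors take arguments highest lane first; the `setr`
+ // (reversed) variants take them lowest lane first.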
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setzero_pd() {
+ assert_eq_m512d(_mm512_setzero_pd(), _mm512_set1_pd(0.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set1_epi64() {
+ let r = _mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2);
+ assert_eq_m512i(r, _mm512_set1_epi64(2));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set1_pd() {
+ let expected = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.);
+ assert_eq_m512d(expected, _mm512_set1_pd(2.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set4_epi64() {
+ let r = _mm512_set_epi64(4, 3, 2, 1, 4, 3, 2, 1);
+ assert_eq_m512i(r, _mm512_set4_epi64(4, 3, 2, 1));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set4_pd() {
+ let r = _mm512_set_pd(4., 3., 2., 1., 4., 3., 2., 1.);
+ assert_eq_m512d(r, _mm512_set4_pd(4., 3., 2., 1.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setr4_epi64() {
+ let r = _mm512_set_epi64(4, 3, 2, 1, 4, 3, 2, 1);
+ assert_eq_m512i(r, _mm512_setr4_epi64(1, 2, 3, 4));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setr4_pd() {
+ let r = _mm512_set_pd(4., 3., 2., 1., 4., 3., 2., 1.);
+ assert_eq_m512d(r, _mm512_setr4_pd(1., 2., 3., 4.));
+ }
+
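+ // The compare intrinsics return a bitmask with bit i reflecting lane i;
+ // ordered comparisons such as lt/le are false whenever an operand is NaN.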
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmplt_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let m = _mm512_cmplt_pd_mask(a, b);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmplt_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let mask = 0b01100110;
+ let r = _mm512_mask_cmplt_pd_mask(mask, a, b);
+ assert_eq!(r, 0b00000100);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpnlt_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ assert_eq!(_mm512_cmpnlt_pd_mask(a, b), !_mm512_cmplt_pd_mask(a, b));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpnlt_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let mask = 0b01111010;
+ assert_eq!(_mm512_mask_cmpnlt_pd_mask(mask, a, b), 0b01111010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmple_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ assert_eq!(_mm512_cmple_pd_mask(a, b), 0b00100101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmple_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let mask = 0b01111010;
+ assert_eq!(_mm512_mask_cmple_pd_mask(mask, a, b), 0b00100000);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpnle_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let m = _mm512_cmpnle_pd_mask(b, a);
+ assert_eq!(m, 0b00001101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpnle_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let mask = 0b01100110;
+ let r = _mm512_mask_cmpnle_pd_mask(mask, b, a);
+ assert_eq!(r, 0b00000100);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpeq_pd_mask() {
+ let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.);
+ let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.);
+ let m = _mm512_cmpeq_pd_mask(b, a);
+ assert_eq!(m, 0b11001101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpeq_pd_mask() {
+ let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.);
+ let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmpeq_pd_mask(mask, b, a);
+ assert_eq!(r, 0b01001000);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpneq_pd_mask() {
+ let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.);
+ let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.);
+ let m = _mm512_cmpneq_pd_mask(b, a);
+ assert_eq!(m, 0b00110010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpneq_pd_mask() {
+ let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.);
+ let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmpneq_pd_mask(mask, b, a);
+ assert_eq!(r, 0b00110010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmp_pd_mask() {
+ let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let m = _mm512_cmp_pd_mask::<_CMP_LT_OQ>(a, b);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmp_pd_mask() {
+ let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let mask = 0b01100110;
+ let r = _mm512_mask_cmp_pd_mask::<_CMP_LT_OQ>(mask, a, b);
+ assert_eq!(r, 0b00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmp_pd_mask() {
+ let a = _mm256_set_pd(0., 1., -1., 13.);
+ let b = _mm256_set1_pd(1.);
+ let m = _mm256_cmp_pd_mask::<_CMP_LT_OQ>(a, b);
+ assert_eq!(m, 0b00001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_pd_mask() {
+ let a = _mm256_set_pd(0., 1., -1., 13.);
+ let b = _mm256_set1_pd(1.);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmp_pd_mask::<_CMP_LT_OQ>(mask, a, b);
+ assert_eq!(r, 0b00001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmp_pd_mask() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set1_pd(1.);
+ let m = _mm_cmp_pd_mask::<_CMP_LT_OQ>(a, b);
+ assert_eq!(m, 0b00000010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmp_pd_mask() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set1_pd(1.);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmp_pd_mask::<_CMP_LT_OQ>(mask, a, b);
+ assert_eq!(r, 0b00000010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmp_round_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let m = _mm512_cmp_round_pd_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmp_round_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.);
+ let b = _mm512_set1_pd(-1.);
+ let mask = 0b01100110;
+ let r = _mm512_mask_cmp_round_pd_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b);
+ assert_eq!(r, 0b00000100);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpord_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.);
+ #[rustfmt::skip]
+ let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.);
+ let m = _mm512_cmpord_pd_mask(a, b);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpord_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.);
+ #[rustfmt::skip]
+ let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.);
+ let mask = 0b11000011;
+ let m = _mm512_mask_cmpord_pd_mask(mask, a, b);
+ assert_eq!(m, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpunord_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.);
+ #[rustfmt::skip]
+ let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.);
+ let m = _mm512_cmpunord_pd_mask(a, b);
+ assert_eq!(m, 0b11111010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpunord_pd_mask() {
+ #[rustfmt::skip]
+ let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.);
+ #[rustfmt::skip]
+ let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.);
+ let mask = 0b00001111;
+ let m = _mm512_mask_cmpunord_pd_mask(mask, a, b);
+ assert_eq!(m, 0b00001010);
+ }
+
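+ // The `epu64` compares treat lanes as unsigned, so -1 (all bits set)
+ // compares as u64::MAX.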
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmplt_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let m = _mm512_cmplt_epu64_mask(a, b);
+ assert_eq!(m, 0b11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmplt_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmplt_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmplt_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, 2, 100);
+ let b = _mm256_set1_epi64x(2);
+ let r = _mm256_cmplt_epu64_mask(a, b);
+ assert_eq!(r, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmplt_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, 2, 100);
+ let b = _mm256_set1_epi64x(2);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmplt_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmplt_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(2);
+ let r = _mm_cmplt_epu64_mask(a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmplt_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(2);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmplt_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpgt_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let m = _mm512_cmpgt_epu64_mask(b, a);
+ assert_eq!(m, 0b11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpgt_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmpgt_epu64_mask(mask, b, a);
+ assert_eq!(r, 0b01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpgt_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_cmpgt_epu64_mask(a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpgt_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let b = _mm256_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpgt_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpgt_epu64_mask() {
+ let a = _mm_set_epi64x(1, 2);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_cmpgt_epu64_mask(a, b);
+ assert_eq!(r, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpgt_epu64_mask() {
+ let a = _mm_set_epi64x(1, 2);
+ let b = _mm_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpgt_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmple_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ assert_eq!(
+ _mm512_cmple_epu64_mask(a, b),
+ !_mm512_cmpgt_epu64_mask(a, b)
+ );
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmple_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b01111010;
+ assert_eq!(_mm512_mask_cmple_epu64_mask(mask, a, b), 0b01111010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmple_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, 2, 1);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_cmple_epu64_mask(a, b);
+ assert_eq!(r, 0b00001101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmple_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, 2, 1);
+ let b = _mm256_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmple_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00001101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmple_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_cmple_epu64_mask(a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmple_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmple_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpge_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ assert_eq!(
+ _mm512_cmpge_epu64_mask(a, b),
+ !_mm512_cmplt_epu64_mask(a, b)
+ );
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpge_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b11111111;
+ let r = _mm512_mask_cmpge_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00110000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpge_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, 2, u64::MAX as i64);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_cmpge_epu64_mask(a, b);
+ assert_eq!(r, 0b00000111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpge_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, 2, u64::MAX as i64);
+ let b = _mm256_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpge_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00000111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpge_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_cmpge_epu64_mask(a, b);
+ assert_eq!(r, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpge_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpge_epu64_mask(mask, a, b);
+ assert_eq!(r, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpeq_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100);
+ let m = _mm512_cmpeq_epu64_mask(b, a);
+ assert_eq!(m, 0b11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpeq_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmpeq_epu64_mask(mask, b, a);
+ assert_eq!(r, 0b01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpeq_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, u64::MAX as i64);
+ let b = _mm256_set_epi64x(0, 1, 13, 42);
+ let m = _mm256_cmpeq_epu64_mask(b, a);
+ assert_eq!(m, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpeq_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, u64::MAX as i64);
+ let b = _mm256_set_epi64x(0, 1, 13, 42);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpeq_epu64_mask(mask, b, a);
+ assert_eq!(r, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpeq_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set_epi64x(0, 1);
+ let m = _mm_cmpeq_epu64_mask(b, a);
+ assert_eq!(m, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpeq_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set_epi64x(0, 1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpeq_epu64_mask(mask, b, a);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpneq_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100);
+ let m = _mm512_cmpneq_epu64_mask(b, a);
+ assert_eq!(m, !_mm512_cmpeq_epu64_mask(b, a));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpneq_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, -100, 100);
+ let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmpneq_epu64_mask(mask, b, a);
+ assert_eq!(r, 0b00110010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpneq_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, u64::MAX as i64);
+ let b = _mm256_set_epi64x(0, 1, 13, 42);
+ let r = _mm256_cmpneq_epu64_mask(b, a);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpneq_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, u64::MAX as i64);
+ let b = _mm256_set_epi64x(0, 1, 13, 42);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpneq_epu64_mask(mask, b, a);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpneq_epu64_mask() {
+ let a = _mm_set_epi64x(-1, u64::MAX as i64);
+ let b = _mm_set_epi64x(13, 42);
+ let r = _mm_cmpneq_epu64_mask(b, a);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpneq_epu64_mask() {
+ let a = _mm_set_epi64x(-1, u64::MAX as i64);
+ let b = _mm_set_epi64x(13, 42);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpneq_epu64_mask(mask, b, a);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmp_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let m = _mm512_cmp_epu64_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b11001111);
+ }
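+
+ // Note: _mm512_cmp_epu64_mask picks its predicate through the const
+ // generic; _MM_CMPINT_LT is an unsigned less-than. With b broadcast to -1
+ // (u64::MAX when unsigned), every lane of a except -1 and u64::MAX
+ // themselves compares below it, hence 0b11001111.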
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmp_epu64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmp_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 100);
+ let b = _mm256_set1_epi64x(1);
+ let m = _mm256_cmp_epu64_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00001000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_epu64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 100);
+ let b = _mm256_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00001000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmp_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let m = _mm_cmp_epu64_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00000010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmp_epu64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00000010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmplt_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let m = _mm512_cmplt_epi64_mask(a, b);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmplt_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b01100110;
+ let r = _mm512_mask_cmplt_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00000100);
+ }
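+
+ // Note: the mask_ compare variants AND the comparison result with the
+ // incoming mask, e.g. above 0b00000101 (lanes below -1) & 0b01100110
+ // = 0b00000100.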
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmplt_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, -13);
+ let b = _mm256_set1_epi64x(-1);
+ let r = _mm256_cmplt_epi64_mask(a, b);
+ assert_eq!(r, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmplt_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, -13);
+ let b = _mm256_set1_epi64x(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmplt_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmplt_epi64_mask() {
+ let a = _mm_set_epi64x(-1, -13);
+ let b = _mm_set1_epi64x(-1);
+ let r = _mm_cmplt_epi64_mask(a, b);
+ assert_eq!(r, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmplt_epi64_mask() {
+ let a = _mm_set_epi64x(-1, -13);
+ let b = _mm_set1_epi64x(-1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmplt_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00000001);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpgt_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let m = _mm512_cmpgt_epi64_mask(b, a);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpgt_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b01100110;
+ let r = _mm512_mask_cmpgt_epi64_mask(mask, b, a);
+ assert_eq!(r, 0b00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpgt_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 13);
+ let b = _mm256_set1_epi64x(-1);
+ let r = _mm256_cmpgt_epi64_mask(a, b);
+ assert_eq!(r, 0b00001101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpgt_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 13);
+ let b = _mm256_set1_epi64x(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpgt_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00001101);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpgt_epi64_mask() {
+ let a = _mm_set_epi64x(0, -1);
+ let b = _mm_set1_epi64x(-1);
+ let r = _mm_cmpgt_epi64_mask(a, b);
+ assert_eq!(r, 0b00000010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpgt_epi64_mask() {
+ let a = _mm_set_epi64x(0, -1);
+ let b = _mm_set1_epi64x(-1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpgt_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00000010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmple_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ assert_eq!(
+ _mm512_cmple_epi64_mask(a, b),
+ !_mm512_cmpgt_epi64_mask(a, b)
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmple_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b01111010;
+ assert_eq!(_mm512_mask_cmple_epi64_mask(mask, a, b), 0b00110000);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmple_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, i64::MAX);
+ let b = _mm256_set1_epi64x(-1);
+ let r = _mm256_cmple_epi64_mask(a, b);
+ assert_eq!(r, 0b00000010)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmple_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, i64::MAX);
+ let b = _mm256_set1_epi64x(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmple_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00000010)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmple_epi64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_cmple_epi64_mask(a, b);
+ assert_eq!(r, 0b00000011)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmple_epi64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmple_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00000011)
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpge_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ assert_eq!(
+ _mm512_cmpge_epi64_mask(a, b),
+ !_mm512_cmplt_epi64_mask(a, b)
+ )
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpge_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b11111111;
+ let r = _mm512_mask_cmpge_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b11111010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpge_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, i64::MAX);
+ let b = _mm256_set1_epi64x(-1);
+ let r = _mm256_cmpge_epi64_mask(a, b);
+ assert_eq!(r, 0b00001111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpge_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, i64::MAX);
+ let b = _mm256_set1_epi64x(-1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpge_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00001111);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpge_epi64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(-1);
+ let r = _mm_cmpge_epi64_mask(a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpge_epi64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(-1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpge_epi64_mask(mask, a, b);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpeq_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100);
+ let m = _mm512_cmpeq_epi64_mask(b, a);
+ assert_eq!(m, 0b11001111);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpeq_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmpeq_epi64_mask(mask, b, a);
+ assert_eq!(r, 0b01001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpeq_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 13);
+ let b = _mm256_set_epi64x(0, 1, 13, 42);
+ let m = _mm256_cmpeq_epi64_mask(b, a);
+ assert_eq!(m, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpeq_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 13);
+ let b = _mm256_set_epi64x(0, 1, 13, 42);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpeq_epi64_mask(mask, b, a);
+ assert_eq!(r, 0b00001100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpeq_epi64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set_epi64x(0, 1);
+ let m = _mm_cmpeq_epi64_mask(b, a);
+ assert_eq!(m, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpeq_epi64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set_epi64x(0, 1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpeq_epi64_mask(mask, b, a);
+ assert_eq!(r, 0b00000011);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_set_epi64() {
+ let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m512i(r, _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0))
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_setr_epi64() {
+ let r = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0))
+ }
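+
+ // Note: _mm512_set_epi64 takes its arguments from the highest lane down,
+ // while _mm512_setr_epi64 ("r" for reverse order) takes them from lane 0
+ // up, so set(7, ..., 0) and setr(0, ..., 7) build the same vector.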
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmpneq_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100);
+ let m = _mm512_cmpneq_epi64_mask(b, a);
+ assert_eq!(m, !_mm512_cmpeq_epi64_mask(b, a));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmpneq_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, -100, 100);
+ let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100);
+ let mask = 0b01111010;
+ let r = _mm512_mask_cmpneq_epi64_mask(mask, b, a);
+ assert_eq!(r, 0b00110010)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmpneq_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 13);
+ let b = _mm256_set_epi64x(0, 1, 13, 42);
+ let r = _mm256_cmpneq_epi64_mask(b, a);
+ assert_eq!(r, 0b00000011)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmpneq_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 13);
+ let b = _mm256_set_epi64x(0, 1, 13, 42);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmpneq_epi64_mask(mask, b, a);
+ assert_eq!(r, 0b00000011)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmpneq_epi64_mask() {
+ let a = _mm_set_epi64x(-1, 13);
+ let b = _mm_set_epi64x(13, 42);
+ let r = _mm_cmpneq_epi64_mask(b, a);
+ assert_eq!(r, 0b00000011)
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmpneq_epi64_mask() {
+ let a = _mm_set_epi64x(-1, 13);
+ let b = _mm_set_epi64x(13, 42);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmpneq_epi64_mask(mask, b, a);
+ assert_eq!(r, 0b00000011)
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_cmp_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let m = _mm512_cmp_epi64_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00000101);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cmp_epi64_mask() {
+ let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100);
+ let b = _mm512_set1_epi64(-1);
+ let mask = 0b01100110;
+ let r = _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00000100);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_cmp_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 13);
+ let b = _mm256_set1_epi64x(1);
+ let m = _mm256_cmp_epi64_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cmp_epi64_mask() {
+ let a = _mm256_set_epi64x(0, 1, -1, 13);
+ let b = _mm256_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00001010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_cmp_epi64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let m = _mm_cmp_epi64_mask::<_MM_CMPINT_LT>(a, b);
+ assert_eq!(m, 0b00000010);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cmp_epi64_mask() {
+ let a = _mm_set_epi64x(0, 1);
+ let b = _mm_set1_epi64x(1);
+ let mask = 0b11111111;
+ let r = _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(mask, a, b);
+ assert_eq!(r, 0b00000010);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32gather_pd() {
+ let mut arr = [0f64; 128];
+ for i in 0..128 {
+ arr[i] = i as f64;
+ }
+ // A scale of 8 means each index counts in 8-byte steps
+ let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+ let r = _mm512_i32gather_pd::<8>(index, arr.as_ptr() as *const u8);
+ assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.));
+ }
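+
+ // Note: a gather loads lane i from base + index[i] * SCALE bytes; with
+ // SCALE = 8, index 16 is byte offset 128, i.e. arr[16] above.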
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32gather_pd() {
+ let mut arr = [0f64; 128];
+ for i in 0..128 {
+ arr[i] = i as f64;
+ }
+ let src = _mm512_set1_pd(2.);
+ let mask = 0b10101010;
+ let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+ // A scale of 8 means each index counts in 8-byte steps
+ let r = _mm512_mask_i32gather_pd::<8>(src, mask, index, arr.as_ptr() as *const u8);
+ assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.));
+ }
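+
+ // Note: lanes whose mask bit is 0 take their value from src rather than
+ // loading; 0b10101010 gathers only the odd lanes, so the even lanes of
+ // the result keep src's 2.0.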
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i64gather_pd() {
+ let mut arr = [0f64; 128];
+ for i in 0..128 {
+ arr[i] = i as f64;
+ }
+ // A scale of 8 means each index counts in 8-byte steps
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let r = _mm512_i64gather_pd::<8>(index, arr.as_ptr() as *const u8);
+ assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i64gather_pd() {
+ let mut arr = [0f64; 128];
+ for i in 0..128 {
+ arr[i] = i as f64;
+ }
+ let src = _mm512_set1_pd(2.);
+ let mask = 0b10101010;
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ // A scale of 8 means each index counts in 8-byte steps
+ let r = _mm512_mask_i64gather_pd::<8>(src, mask, index, arr.as_ptr() as *const u8);
+ assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i64gather_ps() {
+ let mut arr = [0f32; 128];
+ for i in 0..128 {
+ arr[i] = i as f32;
+ }
+ // A scale of 4 means each index counts in 4-byte steps
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let r = _mm512_i64gather_ps::<4>(index, arr.as_ptr() as *const u8);
+ assert_eq_m256(r, _mm256_setr_ps(0., 16., 32., 48., 64., 80., 96., 112.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i64gather_ps() {
+ let mut arr = [0f32; 128];
+ for i in 0..128 {
+ arr[i] = i as f32;
+ }
+ let src = _mm256_set1_ps(2.);
+ let mask = 0b10101010;
+ #[rustfmt::skip]
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ // A scale of 4 means each index counts in 4-byte steps
+ let r = _mm512_mask_i64gather_ps::<4>(src, mask, index, arr.as_ptr() as *const u8);
+ assert_eq_m256(r, _mm256_setr_ps(2., 16., 2., 48., 2., 80., 2., 112.));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A scale of 8 means each index counts in 8-byte steps
+ let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+ let r = _mm512_i32gather_epi64::<8>(index, arr.as_ptr() as *const u8);
+ assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ let src = _mm512_set1_epi64(2);
+ let mask = 0b10101010;
+ let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+ // A scale of 8 means each index counts in 8-byte steps
+ let r = _mm512_mask_i32gather_epi64::<8>(src, mask, index, arr.as_ptr() as *const u8);
+ assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A scale of 8 means each index counts in 8-byte steps
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let r = _mm512_i64gather_epi64::<8>(index, arr.as_ptr() as *const u8);
+ assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ let src = _mm512_set1_epi64(2);
+ let mask = 0b10101010;
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ // A scale of 8 means each index counts in 8-byte steps
+ let r = _mm512_mask_i64gather_epi64::<8>(src, mask, index, arr.as_ptr() as *const u8);
+ assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i64gather_epi32() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A scale of 8 means each index counts in 8-byte steps
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let r = _mm512_i64gather_epi32::<8>(index, arr.as_ptr() as *const u8);
+ assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i64gather_epi32() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ let src = _mm256_set1_epi32(2);
+ let mask = 0b10101010;
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ // A scale of 8 means each index counts in 8-byte steps
+ let r = _mm512_mask_i64gather_epi32::<8>(src, mask, index, arr.as_ptr() as *const u8);
+ assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32scatter_pd() {
+ let mut arr = [0f64; 128];
+ let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ // A scale of 8 means each index counts in 8-byte steps
+ _mm512_i32scatter_pd::<8>(arr.as_mut_ptr() as *mut u8, index, src);
+ let mut expected = [0f64; 128];
+ for i in 0..8 {
+ expected[i * 16] = (i + 1) as f64;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
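+
+ // Note: a scatter is the store-side mirror of a gather: lane i is written
+ // to base + index[i] * SCALE bytes, so the values 1..=8 above land at
+ // arr[0], arr[16], ..., arr[112], i.e. expected[i * 16] = i + 1.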
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32scatter_pd() {
+ let mut arr = [0f64; 128];
+ let mask = 0b10101010;
+ let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ // A scale of 8 means each index counts in 8-byte steps
+ _mm512_mask_i32scatter_pd::<8>(arr.as_mut_ptr() as *mut u8, mask, index, src);
+ let mut expected = [0f64; 128];
+ for i in 0..4 {
+ expected[i * 32 + 16] = 2. * (i + 1) as f64;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
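+
+ // Note: mask 0b10101010 stores only lanes 1, 3, 5 and 7, whose indices
+ // are 16, 48, 80 and 112 (= 32 * i + 16) and whose values are 2, 4, 6
+ // and 8 (= 2. * (i + 1)); every other element stays zero.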
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i64scatter_pd() {
+ let mut arr = [0f64; 128];
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ // A scale of 8 means each index counts in 8-byte steps
+ _mm512_i64scatter_pd::<8>(arr.as_mut_ptr() as *mut u8, index, src);
+ let mut expected = [0f64; 128];
+ for i in 0..8 {
+ expected[i * 16] = (i + 1) as f64;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i64scatter_pd() {
+ let mut arr = [0f64; 128];
+ let mask = 0b10101010;
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ // A scale of 8 means each index counts in 8-byte steps
+ _mm512_mask_i64scatter_pd::<8>(arr.as_mut_ptr() as *mut u8, mask, index, src);
+ let mut expected = [0f64; 128];
+ for i in 0..4 {
+ expected[i * 32 + 16] = 2. * (i + 1) as f64;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i64scatter_ps() {
+ let mut arr = [0f32; 128];
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ // A scale of 4 means each index counts in 4-byte steps
+ _mm512_i64scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, index, src);
+ let mut expected = [0f32; 128];
+ for i in 0..8 {
+ expected[i * 16] = (i + 1) as f32;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i64scatter_ps() {
+ let mut arr = [0f32; 128];
+ let mask = 0b10101010;
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ // A scale of 4 means each index counts in 4-byte steps
+ _mm512_mask_i64scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src);
+ let mut expected = [0f32; 128];
+ for i in 0..4 {
+ expected[i * 32 + 16] = 2. * (i + 1) as f32;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32scatter_epi64() {
+ let mut arr = [0i64; 128];
+ let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ // A scale of 8 means each index counts in 8-byte steps
+ _mm512_i32scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, index, src);
+ let mut expected = [0i64; 128];
+ for i in 0..8 {
+ expected[i * 16] = (i + 1) as i64;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32scatter_epi64() {
+ let mut arr = [0i64; 128];
+ let mask = 0b10101010;
+ let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ // A scale of 8 means each index counts in 8-byte steps
+ _mm512_mask_i32scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, mask, index, src);
+ let mut expected = [0i64; 128];
+ for i in 0..4 {
+ expected[i * 32 + 16] = 2 * (i + 1) as i64;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i64scatter_epi64() {
+ let mut arr = [0i64; 128];
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ // A scale of 8 means each index counts in 8-byte steps
+ _mm512_i64scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, index, src);
+ let mut expected = [0i64; 128];
+ for i in 0..8 {
+ expected[i * 16] = (i + 1) as i64;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i64scatter_epi64() {
+ let mut arr = [0i64; 128];
+ let mask = 0b10101010;
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ // A scale of 8 means each index counts in 8-byte steps
+ _mm512_mask_i64scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, mask, index, src);
+ let mut expected = [0i64; 128];
+ for i in 0..4 {
+ expected[i * 32 + 16] = 2 * (i + 1) as i64;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i64scatter_epi32() {
+ let mut arr = [0i32; 128];
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ // A scale of 4 means each index counts in 4-byte steps
+ _mm512_i64scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, index, src);
+ let mut expected = [0i32; 128];
+ for i in 0..8 {
+ expected[i * 16] = (i + 1) as i32;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i64scatter_epi32() {
+ let mut arr = [0i32; 128];
+ let mask = 0b10101010;
+ let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112);
+ let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ // A scale of 4 means each index counts in 4-byte steps
+ _mm512_mask_i64scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src);
+ let mut expected = [0i32; 128];
+ for i in 0..4 {
+ expected[i * 32 + 16] = 2 * (i + 1) as i32;
+ }
+ assert_eq!(&arr[..], &expected[..]);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rol_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 63, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let r = _mm512_rol_epi64::<1>(a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 0, 1 << 33, 1 << 33, 1 << 33,
+ 1 << 33, 1 << 33, 1 << 33, 1 << 33,
+ );
+ assert_eq_m512i(r, e);
+ }
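+
+ // Note: rol is a rotate, not a shift: the bit leaving position 63
+ // re-enters at position 0, which is why 1 << 63 rotated left by 1
+ // becomes 1 << 0.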
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rol_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 63, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let r = _mm512_mask_rol_epi64::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_rol_epi64::<1>(a, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 0, 1 << 33, 1 << 33, 1 << 33,
+ 1 << 33, 1 << 33, 1 << 33, 1 << 33,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rol_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 63,
+ );
+ let r = _mm512_maskz_rol_epi64::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_rol_epi64::<1>(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 33, 1 << 33, 1 << 33, 1 << 0);
+ assert_eq_m512i(r, e);
+ }
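+
+ // Note: the maskz_ variants zero the lanes whose mask bit is 0 instead of
+ // passing a source operand through; 0b00001111 above keeps only lanes
+ // 0..=3 of the rotated result.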
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_rol_epi64() {
+ let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32);
+ let r = _mm256_rol_epi64::<1>(a);
+ let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rol_epi64() {
+ let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32);
+ let r = _mm256_mask_rol_epi64::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_rol_epi64::<1>(a, 0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rol_epi64() {
+ let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32);
+ let r = _mm256_maskz_rol_epi64::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_rol_epi64::<1>(0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_rol_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let r = _mm_rol_epi64::<1>(a);
+ let e = _mm_set_epi64x(1 << 0, 1 << 33);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rol_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let r = _mm_mask_rol_epi64::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_rol_epi64::<1>(a, 0b00000011, a);
+ let e = _mm_set_epi64x(1 << 0, 1 << 33);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rol_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let r = _mm_maskz_rol_epi64::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_rol_epi64::<1>(0b00000011, a);
+ let e = _mm_set_epi64x(1 << 0, 1 << 33);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_ror_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 0, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let r = _mm512_ror_epi64::<1>(a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 63, 1 << 31, 1 << 31, 1 << 31,
+ 1 << 31, 1 << 31, 1 << 31, 1 << 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_ror_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 0, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let r = _mm512_mask_ror_epi64::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_ror_epi64::<1>(a, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 63, 1 << 31, 1 << 31, 1 << 31,
+ 1 << 31, 1 << 31, 1 << 31, 1 << 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_ror_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 0,
+ );
+ let r = _mm512_maskz_ror_epi64::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_ror_epi64::<1>(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 31, 1 << 31, 1 << 31, 1 << 63);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_ror_epi64() {
+ let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32);
+ let r = _mm256_ror_epi64::<1>(a);
+ let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_ror_epi64() {
+ let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32);
+ let r = _mm256_mask_ror_epi64::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_ror_epi64::<1>(a, 0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_ror_epi64() {
+ let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32);
+ let r = _mm256_maskz_ror_epi64::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_ror_epi64::<1>(0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_ror_epi64() {
+ let a = _mm_set_epi64x(1 << 0, 1 << 32);
+ let r = _mm_ror_epi64::<1>(a);
+ let e = _mm_set_epi64x(1 << 63, 1 << 31);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_ror_epi64() {
+ let a = _mm_set_epi64x(1 << 0, 1 << 32);
+ let r = _mm_mask_ror_epi64::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_ror_epi64::<1>(a, 0b00000011, a);
+ let e = _mm_set_epi64x(1 << 63, 1 << 31);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_ror_epi64() {
+ let a = _mm_set_epi64x(1 << 0, 1 << 32);
+ let r = _mm_maskz_ror_epi64::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_ror_epi64::<1>(0b00000011, a);
+ let e = _mm_set_epi64x(1 << 63, 1 << 31);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_slli_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 63, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let r = _mm512_slli_epi64::<1>(a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 1 << 33, 1 << 33, 1 << 33,
+ 1 << 33, 1 << 33, 1 << 33, 1 << 33,
+ );
+ assert_eq_m512i(r, e);
+ }
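+
+ // Note: unlike rol, slli is a plain shift: the bit leaving position 63 is
+ // discarded, so 1 << 63 shifted left by 1 becomes 0 instead of wrapping.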
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_slli_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 63, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let r = _mm512_mask_slli_epi64::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_slli_epi64::<1>(a, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 1 << 33, 1 << 33, 1 << 33,
+ 1 << 33, 1 << 33, 1 << 33, 1 << 33,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_slli_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 63,
+ );
+ let r = _mm512_maskz_slli_epi64::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_slli_epi64::<1>(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 33, 1 << 33, 1 << 33, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_slli_epi64() {
+ let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32);
+ let r = _mm256_mask_slli_epi64::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_slli_epi64::<1>(a, 0b00001111, a);
+ let e = _mm256_set_epi64x(0, 1 << 33, 1 << 33, 1 << 33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_slli_epi64() {
+ let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32);
+ let r = _mm256_maskz_slli_epi64::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_slli_epi64::<1>(0b00001111, a);
+ let e = _mm256_set_epi64x(0, 1 << 33, 1 << 33, 1 << 33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_slli_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let r = _mm_mask_slli_epi64::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_slli_epi64::<1>(a, 0b00000011, a);
+ let e = _mm_set_epi64x(0, 1 << 33);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_slli_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let r = _mm_maskz_slli_epi64::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_slli_epi64::<1>(0b00000011, a);
+ let e = _mm_set_epi64x(0, 1 << 33);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srli_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 0, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let r = _mm512_srli_epi64::<1>(a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 1 << 31, 1 << 31, 1 << 31,
+ 1 << 31, 1 << 31, 1 << 31, 1 << 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srli_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 0, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let r = _mm512_mask_srli_epi64::<1>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srli_epi64::<1>(a, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 1 << 31, 1 << 31, 1 << 31,
+ 1 << 31, 1 << 31, 1 << 31, 1 << 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srli_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 0,
+ );
+ let r = _mm512_maskz_srli_epi64::<1>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srli_epi64::<1>(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 31, 1 << 31, 1 << 31, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srli_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let r = _mm256_mask_srli_epi64::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srli_epi64::<1>(a, 0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srli_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let r = _mm256_maskz_srli_epi64::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srli_epi64::<1>(0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srli_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let r = _mm_mask_srli_epi64::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srli_epi64::<1>(a, 0b00000011, a);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srli_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let r = _mm_maskz_srli_epi64::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srli_epi64::<1>(0b00000011, a);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rolv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 63, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_rolv_epi64(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 32, 1 << 0, 1 << 34, 1 << 35,
+ 1 << 36, 1 << 37, 1 << 38, 1 << 39,
+ );
+ assert_eq_m512i(r, e);
+ }
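+
+ // Note: the "v" (variable) forms take a per-lane rotate count from the
+ // second vector rather than a single immediate; above, lane 7 is rotated
+ // by 0 while lane 6 rotates 1 << 63 by 1 to give 1 << 0.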
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rolv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 63, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_mask_rolv_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_rolv_epi64(a, 0b11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 32, 1 << 0, 1 << 34, 1 << 35,
+ 1 << 36, 1 << 37, 1 << 38, 1 << 39,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rolv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 62,
+ );
+ let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 2);
+ let r = _mm512_maskz_rolv_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_rolv_epi64(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 36, 1 << 37, 1 << 38, 1 << 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_rolv_epi64() {
+ let a = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 32, 1 << 32);
+ let b = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_rolv_epi64(a, b);
+ let e = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 34, 1 << 35);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rolv_epi64() {
+ let a = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 32, 1 << 32);
+ let b = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_mask_rolv_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_rolv_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 34, 1 << 35);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rolv_epi64() {
+ let a = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 32, 1 << 32);
+ let b = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_maskz_rolv_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_rolv_epi64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 34, 1 << 35);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_rolv_epi64() {
+ let a = _mm_set_epi64x(1 << 32, 1 << 63);
+ let b = _mm_set_epi64x(0, 1);
+ let r = _mm_rolv_epi64(a, b);
+ let e = _mm_set_epi64x(1 << 32, 1 << 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rolv_epi64() {
+ let a = _mm_set_epi64x(1 << 32, 1 << 63);
+ let b = _mm_set_epi64x(0, 1);
+ let r = _mm_mask_rolv_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_rolv_epi64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(1 << 32, 1 << 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rolv_epi64() {
+ let a = _mm_set_epi64x(1 << 32, 1 << 63);
+ let b = _mm_set_epi64x(0, 1);
+ let r = _mm_maskz_rolv_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_rolv_epi64(0b00000011, a, b);
+ let e = _mm_set_epi64x(1 << 32, 1 << 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_rorv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 0, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_rorv_epi64(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 32, 1 << 63, 1 << 30, 1 << 29,
+ 1 << 28, 1 << 27, 1 << 26, 1 << 25,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_rorv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 0, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_mask_rorv_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_rorv_epi64(a, 0b11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 32, 1 << 63, 1 << 30, 1 << 29,
+ 1 << 28, 1 << 27, 1 << 26, 1 << 25,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_rorv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 0,
+ );
+ let b = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 2);
+ let r = _mm512_maskz_rorv_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_rorv_epi64(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 28, 1 << 27, 1 << 26, 1 << 62);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_rorv_epi64() {
+ let a = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 32, 1 << 32);
+ let b = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_rorv_epi64(a, b);
+ let e = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 30, 1 << 29);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_rorv_epi64() {
+ let a = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 32, 1 << 32);
+ let b = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_mask_rorv_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_rorv_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 30, 1 << 29);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_rorv_epi64() {
+ let a = _mm256_set_epi64x(1 << 32, 1 << 0, 1 << 32, 1 << 32);
+ let b = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_maskz_rorv_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_rorv_epi64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(1 << 32, 1 << 63, 1 << 30, 1 << 29);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_rorv_epi64() {
+ let a = _mm_set_epi64x(1 << 32, 1 << 0);
+ let b = _mm_set_epi64x(0, 1);
+ let r = _mm_rorv_epi64(a, b);
+ let e = _mm_set_epi64x(1 << 32, 1 << 63);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_rorv_epi64() {
+ let a = _mm_set_epi64x(1 << 32, 1 << 0);
+ let b = _mm_set_epi64x(0, 1);
+ let r = _mm_mask_rorv_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_rorv_epi64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(1 << 32, 1 << 63);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_rorv_epi64() {
+ let a = _mm_set_epi64x(1 << 32, 1 << 0);
+ let b = _mm_set_epi64x(0, 1);
+ let r = _mm_maskz_rorv_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_rorv_epi64(0b00000011, a, b);
+ let e = _mm_set_epi64x(1 << 32, 1 << 63);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sllv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 63, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let count = _mm512_set_epi64(0, 2, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_sllv_epi64(a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 32, 0, 1 << 34, 1 << 35,
+ 1 << 36, 1 << 37, 1 << 38, 1 << 39,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sllv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 63, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_mask_sllv_epi64(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sllv_epi64(a, 0b11111111, a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 32, 1 << 33, 0, 1 << 35,
+ 1 << 36, 1 << 37, 1 << 38, 1 << 39,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sllv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 63,
+ );
+ let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 1);
+ let r = _mm512_maskz_sllv_epi64(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sllv_epi64(0b00001111, a, count);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 36, 1 << 37, 1 << 38, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sllv_epi64() {
+ let a = _mm256_set_epi64x(1 << 32, 1 << 32, 1 << 63, 1 << 32);
+ let count = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_mask_sllv_epi64(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sllv_epi64(a, 0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 32, 1 << 33, 0, 1 << 35);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sllv_epi64() {
+ let a = _mm256_set_epi64x(1 << 32, 1 << 32, 1 << 63, 1 << 32);
+ let count = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_maskz_sllv_epi64(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sllv_epi64(0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 32, 1 << 33, 0, 1 << 35);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sllv_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let count = _mm_set_epi64x(2, 3);
+ let r = _mm_mask_sllv_epi64(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sllv_epi64(a, 0b00000011, a, count);
+ let e = _mm_set_epi64x(0, 1 << 35);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sllv_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let count = _mm_set_epi64x(2, 3);
+ let r = _mm_maskz_sllv_epi64(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sllv_epi64(0b00000011, a, count);
+ let e = _mm_set_epi64x(0, 1 << 35);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srlv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 0, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_srlv_epi64(a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 32, 0, 1 << 30, 1 << 29,
+ 1 << 28, 1 << 27, 1 << 26, 1 << 25,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srlv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 0, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_mask_srlv_epi64(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srlv_epi64(a, 0b11111111, a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 32, 0, 1 << 30, 1 << 29,
+ 1 << 28, 1 << 27, 1 << 26, 1 << 25,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srlv_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 0,
+ );
+ let count = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_maskz_srlv_epi64(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srlv_epi64(0b00001111, a, count);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 28, 1 << 27, 1 << 26, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srlv_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm256_set1_epi64x(1);
+ let r = _mm256_mask_srlv_epi64(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srlv_epi64(a, 0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srlv_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm256_set1_epi64x(1);
+ let r = _mm256_maskz_srlv_epi64(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srlv_epi64(0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srlv_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set1_epi64x(1);
+ let r = _mm_mask_srlv_epi64(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srlv_epi64(a, 0b00000011, a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srlv_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set1_epi64x(1);
+ let r = _mm_maskz_srlv_epi64(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srlv_epi64(0b00000011, a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sll_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 63, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm512_sll_epi64(a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 1 << 33, 1 << 33, 1 << 33,
+ 1 << 33, 1 << 33, 1 << 33, 1 << 33,
+ );
+ assert_eq_m512i(r, e);
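+ // The shift amount comes only from the low 64-bit element of `count`;
+ // _mm_set_epi64x(1, 0) has a low qword of 0, so `a` is returned unchanged.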
+ let count = _mm_set_epi64x(1, 0);
+ let r = _mm512_sll_epi64(a, count);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sll_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 63, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm512_mask_sll_epi64(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sll_epi64(a, 0b11111111, a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 1 << 33, 1 << 33, 1 << 33,
+ 1 << 33, 1 << 33, 1 << 33, 1 << 33,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sll_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 63,
+ );
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm512_maskz_sll_epi64(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sll_epi64(0b00001111, a, count);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 33, 1 << 33, 1 << 33, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sll_epi64() {
+ let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm256_mask_sll_epi64(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sll_epi64(a, 0b00001111, a, count);
+ let e = _mm256_set_epi64x(0, 1 << 33, 1 << 33, 1 << 33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sll_epi64() {
+ let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm256_maskz_sll_epi64(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sll_epi64(0b00001111, a, count);
+ let e = _mm256_set_epi64x(0, 1 << 33, 1 << 33, 1 << 33);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sll_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm_mask_sll_epi64(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sll_epi64(a, 0b00000011, a, count);
+ let e = _mm_set_epi64x(0, 1 << 33);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sll_epi64() {
+ let a = _mm_set_epi64x(1 << 63, 1 << 32);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm_maskz_sll_epi64(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sll_epi64(0b00000011, a, count);
+ let e = _mm_set_epi64x(0, 1 << 33);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srl_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 0, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm512_srl_epi64(a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 1 << 31, 1 << 31, 1 << 31,
+ 1 << 31, 1 << 31, 1 << 31, 1 << 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srl_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 0, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ );
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm512_mask_srl_epi64(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srl_epi64(a, 0b11111111, a, count);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 1 << 31, 1 << 31, 1 << 31,
+ 1 << 31, 1 << 31, 1 << 31, 1 << 31,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srl_epi64() {
+ #[rustfmt::skip]
+ let a = _mm512_set_epi64(
+ 1 << 32, 1 << 32, 1 << 32, 1 << 32,
+ 1 << 32, 1 << 32, 1 << 32, 1 << 0,
+ );
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm512_maskz_srl_epi64(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srl_epi64(0b00001111, a, count);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 31, 1 << 31, 1 << 31, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srl_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm256_mask_srl_epi64(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srl_epi64(a, 0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srl_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm256_maskz_srl_epi64(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srl_epi64(0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srl_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm_mask_srl_epi64(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srl_epi64(a, 0b00000011, a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srl_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm_maskz_srl_epi64(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srl_epi64(0b00000011, a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_sra_epi64() {
+ let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16);
+ let count = _mm_set_epi64x(0, 2);
+ let r = _mm512_sra_epi64(a, count);
+ let e = _mm512_set_epi64(0, -2, 0, 0, 0, 0, 3, -4);
+ assert_eq_m512i(r, e);
+ }
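+
+ // Note: sra is an arithmetic shift, so the sign bit is replicated into
+ // the vacated positions: -8 >> 2 gives -2 and -16 >> 2 gives -4, where a
+ // logical srl would have produced large positive values.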
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_sra_epi64() {
+ let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16);
+ let count = _mm_set_epi64x(0, 2);
+ let r = _mm512_mask_sra_epi64(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_sra_epi64(a, 0b11111111, a, count);
+ let e = _mm512_set_epi64(0, -2, 0, 0, 0, 0, 3, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_sra_epi64() {
+ let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16);
+ let count = _mm_set_epi64x(0, 2);
+ let r = _mm512_maskz_sra_epi64(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_sra_epi64(0b00001111, a, count);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 3, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_sra_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm256_sra_epi64(a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_sra_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm256_mask_sra_epi64(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_sra_epi64(a, 0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_sra_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm256_maskz_sra_epi64(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_sra_epi64(0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_sra_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm_sra_epi64(a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_sra_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm_mask_sra_epi64(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_sra_epi64(a, 0b00000011, a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_sra_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set_epi64x(0, 1);
+ let r = _mm_maskz_sra_epi64(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_sra_epi64(0b00000011, a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srav_epi64() {
+ let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16);
+ let count = _mm512_set_epi64(2, 2, 0, 0, 0, 0, 2, 1);
+ let r = _mm512_srav_epi64(a, count);
+ let e = _mm512_set_epi64(0, -2, 0, 0, 0, 0, 3, -8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srav_epi64() {
+ let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16);
+ let count = _mm512_set_epi64(2, 2, 0, 0, 0, 0, 2, 1);
+ let r = _mm512_mask_srav_epi64(a, 0, a, count);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srav_epi64(a, 0b11111111, a, count);
+ let e = _mm512_set_epi64(0, -2, 0, 0, 0, 0, 3, -8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srav_epi64() {
+ let a = _mm512_set_epi64(1, -8, 0, 0, 0, 0, 15, -16);
+ let count = _mm512_set_epi64(2, 2, 0, 0, 0, 0, 2, 1);
+ let r = _mm512_maskz_srav_epi64(0, a, count);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srav_epi64(0b00001111, a, count);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 3, -8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_srav_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm256_set1_epi64x(1);
+ let r = _mm256_srav_epi64(a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srav_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm256_set1_epi64x(1);
+ let r = _mm256_mask_srav_epi64(a, 0, a, count);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srav_epi64(a, 0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srav_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let count = _mm256_set1_epi64x(1);
+ let r = _mm256_maskz_srav_epi64(0, a, count);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srav_epi64(0b00001111, a, count);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_srav_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set1_epi64x(1);
+ let r = _mm_srav_epi64(a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srav_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set1_epi64x(1);
+ let r = _mm_mask_srav_epi64(a, 0, a, count);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srav_epi64(a, 0b00000011, a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srav_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let count = _mm_set1_epi64x(1);
+ let r = _mm_maskz_srav_epi64(0, a, count);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srav_epi64(0b00000011, a, count);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
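+ // The `srai` forms take the shift count as a const generic rather than a
+ // vector; the shift is still arithmetic, so -4 >> 2 == -1 below.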
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_srai_epi64() {
+ let a = _mm512_set_epi64(1, -4, 15, 0, 0, 0, 0, -16);
+ let r = _mm512_srai_epi64::<2>(a);
+ let e = _mm512_set_epi64(0, -1, 3, 0, 0, 0, 0, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_srai_epi64() {
+ let a = _mm512_set_epi64(1, -4, 15, 0, 0, 0, 0, -16);
+ let r = _mm512_mask_srai_epi64::<2>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_srai_epi64::<2>(a, 0b11111111, a);
+ let e = _mm512_set_epi64(0, -1, 3, 0, 0, 0, 0, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_srai_epi64() {
+ let a = _mm512_set_epi64(1, -4, 15, 0, 0, 0, 0, -16);
+ let r = _mm512_maskz_srai_epi64::<2>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_srai_epi64::<2>(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, -4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_srai_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let r = _mm256_srai_epi64::<1>(a);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_srai_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let r = _mm256_mask_srai_epi64::<1>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_srai_epi64::<1>(a, 0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_srai_epi64() {
+ let a = _mm256_set_epi64x(1 << 5, 0, 0, 0);
+ let r = _mm256_maskz_srai_epi64::<1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_srai_epi64::<1>(0b00001111, a);
+ let e = _mm256_set_epi64x(1 << 4, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_srai_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let r = _mm_srai_epi64::<1>(a);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_srai_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let r = _mm_mask_srai_epi64::<1>(a, 0, a);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_srai_epi64::<1>(a, 0b00000011, a);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_srai_epi64() {
+ let a = _mm_set_epi64x(1 << 5, 0);
+ let r = _mm_maskz_srai_epi64::<1>(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_srai_epi64::<1>(0b00000011, a);
+ let e = _mm_set_epi64x(1 << 4, 0);
+ assert_eq_m128i(r, e);
+ }
+
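+ // `permute_pd` (vpermilpd with an immediate) uses one selector bit per
+ // element, choosing within each 128-bit lane; an all-ones immediate
+ // duplicates the high element of every pair, hence (1., 1., 3., 3., ...).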
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permute_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_permute_pd::<0b11_11_11_11>(a);
+ let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permute_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_mask_permute_pd::<0b11_11_11_11>(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_permute_pd::<0b11_11_11_11>(a, 0b11111111, a);
+ let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permute_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_maskz_permute_pd::<0b11_11_11_11>(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_permute_pd::<0b11_11_11_11>(0b11111111, a);
+ let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permute_pd() {
+ let a = _mm256_set_pd(3., 2., 1., 0.);
+ let r = _mm256_mask_permute_pd::<0b11_11>(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_permute_pd::<0b11_11>(a, 0b00001111, a);
+ let e = _mm256_set_pd(3., 3., 1., 1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permute_pd() {
+ let a = _mm256_set_pd(3., 2., 1., 0.);
+ let r = _mm256_maskz_permute_pd::<0b11_11>(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_permute_pd::<0b11_11>(0b00001111, a);
+ let e = _mm256_set_pd(3., 3., 1., 1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_permute_pd() {
+ let a = _mm_set_pd(1., 0.);
+ let r = _mm_mask_permute_pd::<0b11>(a, 0, a);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_permute_pd::<0b11>(a, 0b00000011, a);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_permute_pd() {
+ let a = _mm_set_pd(1., 0.);
+ let r = _mm_maskz_permute_pd::<0b11>(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_permute_pd::<0b11>(0b00000011, a);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
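+ // `permutex` uses 2-bit selectors within each 256-bit half, so
+ // 0b11_11_11_11 replicates the last element of each half:
+ // (3, 3, 3, 3, 7, 7, 7, 7) in `setr` order.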
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutex_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_permutex_epi64::<0b11_11_11_11>(a);
+ let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutex_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_mask_permutex_epi64::<0b11_11_11_11>(a, 0, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutex_epi64::<0b11_11_11_11>(a, 0b11111111, a);
+ let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutex_epi64() {
+ let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_maskz_permutex_epi64::<0b11_11_11_11>(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutex_epi64::<0b11_11_11_11>(0b11111111, a);
+ let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutex_epi64() {
+ let a = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_permutex_epi64::<0b11_11_11_11>(a);
+ let e = _mm256_set_epi64x(3, 3, 3, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutex_epi64() {
+ let a = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_mask_permutex_epi64::<0b11_11_11_11>(a, 0, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutex_epi64::<0b11_11_11_11>(a, 0b00001111, a);
+ let e = _mm256_set_epi64x(3, 3, 3, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutex_epi64() {
+ let a = _mm256_set_epi64x(3, 2, 1, 0);
+ let r = _mm256_maskz_permutex_epi64::<0b11_11_11_11>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutex_epi64::<0b11_11_11_11>(0b00001111, a);
+ let e = _mm256_set_epi64x(3, 3, 3, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutex_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_permutex_pd::<0b11_11_11_11>(a);
+ let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutex_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_mask_permutex_pd::<0b11_11_11_11>(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_permutex_pd::<0b11_11_11_11>(a, 0b11111111, a);
+ let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutex_pd() {
+ let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_maskz_permutex_pd::<0b11_11_11_11>(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_permutex_pd::<0b11_11_11_11>(0b11111111, a);
+ let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutex_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_permutex_pd::<0b11_11_11_11>(a);
+ let e = _mm256_set_pd(0., 0., 0., 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutex_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_mask_permutex_pd::<0b11_11_11_11>(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_permutex_pd::<0b11_11_11_11>(a, 0b00001111, a);
+ let e = _mm256_set_pd(0., 0., 0., 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutex_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_maskz_permutex_pd::<0b11_11_11_11>(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_permutex_pd::<0b11_11_11_11>(0b00001111, a);
+ let e = _mm256_set_pd(0., 0., 0., 0.);
+ assert_eq_m256d(r, e);
+ }
+
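+ // `permutevar_pd` (vpermilpd with a vector control) reads its selector from
+ // bit 1, not bit 0, of each control qword, so a control of 0b1 still
+ // selects the low element of each 128-bit lane.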
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutevar_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_set1_epi64(0b1);
+ let r = _mm512_permutevar_pd(a, b);
+ let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutevar_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_set1_epi64(0b1);
+ let r = _mm512_mask_permutevar_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_permutevar_pd(a, 0b11111111, a, b);
+ let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutevar_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let b = _mm512_set1_epi64(0b1);
+ let r = _mm512_maskz_permutevar_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_permutevar_pd(0b00001111, a, b);
+ let e = _mm512_set_pd(0., 0., 0., 0., 5., 5., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutevar_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let b = _mm256_set1_epi64x(0b1);
+ let r = _mm256_mask_permutevar_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_permutevar_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(1., 1., 3., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutevar_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let b = _mm256_set1_epi64x(0b1);
+ let r = _mm256_maskz_permutevar_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_permutevar_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(1., 1., 3., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_permutevar_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set1_epi64x(0b1);
+ let r = _mm_mask_permutevar_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_permutevar_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_permutevar_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set1_epi64x(0b1);
+ let r = _mm_maskz_permutevar_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_permutevar_pd(0b00000011, a, b);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ }
+
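+ // `permutexvar` indexes across the full vector. `_mm512_set_epi64` lists
+ // elements high-to-low, so index 1 refers to the value 6 here.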
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutexvar_epi64() {
+ let idx = _mm512_set1_epi64(1);
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_permutexvar_epi64(idx, a);
+ let e = _mm512_set1_epi64(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutexvar_epi64() {
+ let idx = _mm512_set1_epi64(1);
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_mask_permutexvar_epi64(a, 0, idx, a);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutexvar_epi64(a, 0b11111111, idx, a);
+ let e = _mm512_set1_epi64(6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutexvar_epi64() {
+ let idx = _mm512_set1_epi64(1);
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_maskz_permutexvar_epi64(0, idx, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutexvar_epi64(0b00001111, idx, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 6, 6, 6, 6);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutexvar_epi64() {
+ let idx = _mm256_set1_epi64x(1);
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_permutexvar_epi64(idx, a);
+ let e = _mm256_set1_epi64x(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutexvar_epi64() {
+ let idx = _mm256_set1_epi64x(1);
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_mask_permutexvar_epi64(a, 0, idx, a);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutexvar_epi64(a, 0b00001111, idx, a);
+ let e = _mm256_set1_epi64x(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutexvar_epi64() {
+ let idx = _mm256_set1_epi64x(1);
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_maskz_permutexvar_epi64(0, idx, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutexvar_epi64(0b00001111, idx, a);
+ let e = _mm256_set1_epi64x(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutexvar_pd() {
+ let idx = _mm512_set1_epi64(1);
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_permutexvar_pd(idx, a);
+ let e = _mm512_set1_pd(6.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutexvar_pd() {
+ let idx = _mm512_set1_epi64(1);
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_mask_permutexvar_pd(a, 0, idx, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_permutexvar_pd(a, 0b11111111, idx, a);
+ let e = _mm512_set1_pd(6.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutexvar_pd() {
+ let idx = _mm512_set1_epi64(1);
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_maskz_permutexvar_pd(0, idx, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_permutexvar_pd(0b00001111, idx, a);
+ let e = _mm512_set_pd(0., 0., 0., 0., 6., 6., 6., 6.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutexvar_pd() {
+ let idx = _mm256_set1_epi64x(1);
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_permutexvar_pd(idx, a);
+ let e = _mm256_set1_pd(2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutexvar_pd() {
+ let idx = _mm256_set1_epi64x(1);
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_mask_permutexvar_pd(a, 0, idx, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_permutexvar_pd(a, 0b00001111, idx, a);
+ let e = _mm256_set1_pd(2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutexvar_pd() {
+ let idx = _mm256_set1_epi64x(1);
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_maskz_permutexvar_pd(0, idx, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_permutexvar_pd(0b00001111, idx, a);
+ let e = _mm256_set1_pd(2.);
+ assert_eq_m256d(r, e);
+ }
+
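+ // For `permutex2var` the low index bits pick a lane and the next bit picks
+ // the source vector: bit 3 for 8 lanes, bit 2 for 4, bit 1 for 2. Indices
+ // of `1 << 3` therefore fetch lane 0 of `b`, i.e. 100.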
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutex2var_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm512_set1_epi64(100);
+ let r = _mm512_permutex2var_epi64(a, idx, b);
+ let e = _mm512_set_epi64(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutex2var_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm512_set1_epi64(100);
+ let r = _mm512_mask_permutex2var_epi64(a, 0, idx, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_permutex2var_epi64(a, 0b11111111, idx, b);
+ let e = _mm512_set_epi64(6, 100, 5, 100, 4, 100, 3, 100);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutex2var_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm512_set1_epi64(100);
+ let r = _mm512_maskz_permutex2var_epi64(0, a, idx, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_permutex2var_epi64(0b00001111, a, idx, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 4, 100, 3, 100);
+ assert_eq_m512i(r, e);
+ }
+
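+ // The `mask2` variant copies `idx` (not `a`) into lanes whose mask bit is
+ // clear, which is why a zero mask returns `idx` itself.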
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask2_permutex2var_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let idx = _mm512_set_epi64(1000, 1 << 3, 2000, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm512_set1_epi64(100);
+ let r = _mm512_mask2_permutex2var_epi64(a, idx, 0, b);
+ assert_eq_m512i(r, idx);
+ let r = _mm512_mask2_permutex2var_epi64(a, idx, 0b00001111, b);
+ let e = _mm512_set_epi64(1000, 1 << 3, 2000, 1 << 3, 4, 100, 3, 100);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutex2var_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2);
+ let b = _mm256_set1_epi64x(100);
+ let r = _mm256_permutex2var_epi64(a, idx, b);
+ let e = _mm256_set_epi64x(2, 100, 1, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutex2var_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2);
+ let b = _mm256_set1_epi64x(100);
+ let r = _mm256_mask_permutex2var_epi64(a, 0, idx, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_permutex2var_epi64(a, 0b00001111, idx, b);
+ let e = _mm256_set_epi64x(2, 100, 1, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutex2var_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2);
+ let b = _mm256_set1_epi64x(100);
+ let r = _mm256_maskz_permutex2var_epi64(0, a, idx, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_permutex2var_epi64(0b00001111, a, idx, b);
+ let e = _mm256_set_epi64x(2, 100, 1, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask2_permutex2var_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2);
+ let b = _mm256_set1_epi64x(100);
+ let r = _mm256_mask2_permutex2var_epi64(a, idx, 0, b);
+ assert_eq_m256i(r, idx);
+ let r = _mm256_mask2_permutex2var_epi64(a, idx, 0b00001111, b);
+ let e = _mm256_set_epi64x(2, 100, 1, 100);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_permutex2var_epi64() {
+ let a = _mm_set_epi64x(0, 1);
+ let idx = _mm_set_epi64x(1, 1 << 1);
+ let b = _mm_set1_epi64x(100);
+ let r = _mm_permutex2var_epi64(a, idx, b);
+ let e = _mm_set_epi64x(0, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_permutex2var_epi64() {
+ let a = _mm_set_epi64x(0, 1);
+ let idx = _mm_set_epi64x(1, 1 << 1);
+ let b = _mm_set1_epi64x(100);
+ let r = _mm_mask_permutex2var_epi64(a, 0, idx, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_permutex2var_epi64(a, 0b00000011, idx, b);
+ let e = _mm_set_epi64x(0, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_permutex2var_epi64() {
+ let a = _mm_set_epi64x(0, 1);
+ let idx = _mm_set_epi64x(1, 1 << 1);
+ let b = _mm_set1_epi64x(100);
+ let r = _mm_maskz_permutex2var_epi64(0, a, idx, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_permutex2var_epi64(0b00000011, a, idx, b);
+ let e = _mm_set_epi64x(0, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask2_permutex2var_epi64() {
+ let a = _mm_set_epi64x(0, 1);
+ let idx = _mm_set_epi64x(1, 1 << 1);
+ let b = _mm_set1_epi64x(100);
+ let r = _mm_mask2_permutex2var_epi64(a, idx, 0, b);
+ assert_eq_m128i(r, idx);
+ let r = _mm_mask2_permutex2var_epi64(a, idx, 0b00000011, b);
+ let e = _mm_set_epi64x(0, 100);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_permutex2var_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm512_set1_pd(100.);
+ let r = _mm512_permutex2var_pd(a, idx, b);
+ let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_permutex2var_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm512_set1_pd(100.);
+ let r = _mm512_mask_permutex2var_pd(a, 0, idx, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_permutex2var_pd(a, 0b11111111, idx, b);
+ let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_permutex2var_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm512_set1_pd(100.);
+ let r = _mm512_maskz_permutex2var_pd(0, a, idx, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_permutex2var_pd(0b00001111, a, idx, b);
+ let e = _mm512_set_pd(0., 0., 0., 0., 4., 100., 3., 100.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask2_permutex2var_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+ let b = _mm512_set1_pd(100.);
+ let r = _mm512_mask2_permutex2var_pd(a, idx, 0, b);
+ assert_eq_m512d(r, _mm512_castsi512_pd(idx));
+ let r = _mm512_mask2_permutex2var_pd(a, idx, 0b11111111, b);
+ let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_permutex2var_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2);
+ let b = _mm256_set1_pd(100.);
+ let r = _mm256_permutex2var_pd(a, idx, b);
+ let e = _mm256_set_pd(2., 100., 1., 100.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_permutex2var_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2);
+ let b = _mm256_set1_pd(100.);
+ let r = _mm256_mask_permutex2var_pd(a, 0, idx, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_permutex2var_pd(a, 0b00001111, idx, b);
+ let e = _mm256_set_pd(2., 100., 1., 100.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_permutex2var_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2);
+ let b = _mm256_set1_pd(100.);
+ let r = _mm256_maskz_permutex2var_pd(0, a, idx, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_permutex2var_pd(0b00001111, a, idx, b);
+ let e = _mm256_set_pd(2., 100., 1., 100.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask2_permutex2var_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let idx = _mm256_set_epi64x(1, 1 << 2, 2, 1 << 2);
+ let b = _mm256_set1_pd(100.);
+ let r = _mm256_mask2_permutex2var_pd(a, idx, 0, b);
+ assert_eq_m256d(r, _mm256_castsi256_pd(idx));
+ let r = _mm256_mask2_permutex2var_pd(a, idx, 0b00001111, b);
+ let e = _mm256_set_pd(2., 100., 1., 100.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_permutex2var_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let idx = _mm_set_epi64x(1, 1 << 1);
+ let b = _mm_set1_pd(100.);
+ let r = _mm_permutex2var_pd(a, idx, b);
+ let e = _mm_set_pd(0., 100.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_permutex2var_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let idx = _mm_set_epi64x(1, 1 << 1);
+ let b = _mm_set1_pd(100.);
+ let r = _mm_mask_permutex2var_pd(a, 0, idx, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_permutex2var_pd(a, 0b00000011, idx, b);
+ let e = _mm_set_pd(0., 100.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_permutex2var_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let idx = _mm_set_epi64x(1, 1 << 1);
+ let b = _mm_set1_pd(100.);
+ let r = _mm_maskz_permutex2var_pd(0, a, idx, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_permutex2var_pd(0b00000011, a, idx, b);
+ let e = _mm_set_pd(0., 100.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask2_permutex2var_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let idx = _mm_set_epi64x(1, 1 << 1);
+ let b = _mm_set1_pd(100.);
+ let r = _mm_mask2_permutex2var_pd(a, idx, 0, b);
+ assert_eq_m128d(r, _mm_castsi128_pd(idx));
+ let r = _mm_mask2_permutex2var_pd(a, idx, 0b00000011, b);
+ let e = _mm_set_pd(0., 100.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_shuffle_pd() {
+ let a = _mm256_set_pd(1., 4., 5., 8.);
+ let b = _mm256_set_pd(2., 3., 6., 7.);
+ let r = _mm256_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_shuffle_pd::<0b11_11_11_11>(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(2., 1., 6., 5.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_shuffle_pd() {
+ let a = _mm256_set_pd(1., 4., 5., 8.);
+ let b = _mm256_set_pd(2., 3., 6., 7.);
+ let r = _mm256_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_shuffle_pd::<0b11_11_11_11>(0b00001111, a, b);
+ let e = _mm256_set_pd(2., 1., 6., 5.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_shuffle_pd() {
+ let a = _mm_set_pd(1., 4.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_shuffle_pd::<0b11_11_11_11>(a, 0b00000011, a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_shuffle_pd() {
+ let a = _mm_set_pd(1., 4.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_shuffle_pd::<0b11_11_11_11>(0b00000011, a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
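+ // `shuffle_i64x2` moves whole 128-bit lanes: the low immediate fields pick
+ // lanes from `a` for the result's low half, and the high fields pick lanes
+ // from `b` for its high half.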
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_shuffle_i64x2() {
+ let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm512_shuffle_i64x2::<0b00_00_00_00>(a, b);
+ let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_shuffle_i64x2() {
+ let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm512_mask_shuffle_i64x2::<0b00_00_00_00>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_shuffle_i64x2::<0b00_00_00_00>(a, 0b11111111, a, b);
+ let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_shuffle_i64x2() {
+ let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16);
+ let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15);
+ let r = _mm512_maskz_shuffle_i64x2::<0b00_00_00_00>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_shuffle_i64x2::<0b00_00_00_00>(0b00001111, a, b);
+ let e = _mm512_setr_epi64(1, 4, 1, 4, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_shuffle_i64x2() {
+ let a = _mm256_set_epi64x(1, 4, 5, 8);
+ let b = _mm256_set_epi64x(2, 3, 6, 7);
+ let r = _mm256_shuffle_i64x2::<0b00>(a, b);
+ let e = _mm256_set_epi64x(6, 7, 5, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_shuffle_i64x2() {
+ let a = _mm256_set_epi64x(1, 4, 5, 8);
+ let b = _mm256_set_epi64x(2, 3, 6, 7);
+ let r = _mm256_mask_shuffle_i64x2::<0b00>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_shuffle_i64x2::<0b00>(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(6, 7, 5, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_shuffle_i64x2() {
+ let a = _mm256_set_epi64x(1, 4, 5, 8);
+ let b = _mm256_set_epi64x(2, 3, 6, 7);
+ let r = _mm256_maskz_shuffle_i64x2::<0b00>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_shuffle_i64x2::<0b00>(0b00001111, a, b);
+ let e = _mm256_set_epi64x(6, 7, 5, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_shuffle_f64x2() {
+ let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm512_shuffle_f64x2::<0b00_00_00_00>(a, b);
+ let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_shuffle_f64x2() {
+ let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm512_mask_shuffle_f64x2::<0b00_00_00_00>(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_shuffle_f64x2::<0b00_00_00_00>(a, 0b11111111, a, b);
+ let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_shuffle_f64x2() {
+ let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.);
+ let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.);
+ let r = _mm512_maskz_shuffle_f64x2::<0b00_00_00_00>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_shuffle_f64x2::<0b00_00_00_00>(0b00001111, a, b);
+ let e = _mm512_setr_pd(1., 4., 1., 4., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_shuffle_f64x2() {
+ let a = _mm256_set_pd(1., 4., 5., 8.);
+ let b = _mm256_set_pd(2., 3., 6., 7.);
+ let r = _mm256_shuffle_f64x2::<0b00>(a, b);
+ let e = _mm256_set_pd(6., 7., 5., 8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_shuffle_f64x2() {
+ let a = _mm256_set_pd(1., 4., 5., 8.);
+ let b = _mm256_set_pd(2., 3., 6., 7.);
+ let r = _mm256_mask_shuffle_f64x2::<0b00>(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_shuffle_f64x2::<0b00>(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(6., 7., 5., 8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_shuffle_f64x2() {
+ let a = _mm256_set_pd(1., 4., 5., 8.);
+ let b = _mm256_set_pd(2., 3., 6., 7.);
+ let r = _mm256_maskz_shuffle_f64x2::<0b00>(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_shuffle_f64x2::<0b00>(0b00001111, a, b);
+ let e = _mm256_set_pd(6., 7., 5., 8.);
+ assert_eq_m256d(r, e);
+ }
+
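+ // `movedup` duplicates the even-indexed (low) element of each pair of
+ // doubles, as the (1., 1., 3., 3., ...) pattern shows.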
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_movedup_pd() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm512_movedup_pd(a);
+ let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_movedup_pd() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm512_mask_movedup_pd(a, 0, a);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_movedup_pd(a, 0b11111111, a);
+ let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_movedup_pd() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm512_maskz_movedup_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_movedup_pd(0b00001111, a);
+ let e = _mm512_setr_pd(1., 1., 3., 3., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_movedup_pd() {
+ let a = _mm256_set_pd(1., 2., 3., 4.);
+ let r = _mm256_mask_movedup_pd(a, 0, a);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_movedup_pd(a, 0b00001111, a);
+ let e = _mm256_set_pd(2., 2., 4., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_movedup_pd() {
+ let a = _mm256_set_pd(1., 2., 3., 4.);
+ let r = _mm256_maskz_movedup_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_movedup_pd(0b00001111, a);
+ let e = _mm256_set_pd(2., 2., 4., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_movedup_pd() {
+ let a = _mm_set_pd(1., 2.);
+ let r = _mm_mask_movedup_pd(a, 0, a);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_movedup_pd(a, 0b00000011, a);
+ let e = _mm_set_pd(2., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_movedup_pd() {
+ let a = _mm_set_pd(1., 2.);
+ let r = _mm_maskz_movedup_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_movedup_pd(0b00000011, a);
+ let e = _mm_set_pd(2., 2.);
+ assert_eq_m128d(r, e);
+ }
+
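+ // The const generic selects which 256-bit half is replaced: `<1>` inserts
+ // `b` into the upper half and leaves the lower half of `a` intact.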
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_inserti64x4() {
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_setr_epi64x(17, 18, 19, 20);
+ let r = _mm512_inserti64x4::<1>(a, b);
+ let e = _mm512_setr_epi64(1, 2, 3, 4, 17, 18, 19, 20);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_inserti64x4() {
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_setr_epi64x(17, 18, 19, 20);
+ let r = _mm512_mask_inserti64x4::<1>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_inserti64x4::<1>(a, 0b11111111, a, b);
+ let e = _mm512_setr_epi64(1, 2, 3, 4, 17, 18, 19, 20);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_inserti64x4() {
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_setr_epi64x(17, 18, 19, 20);
+ let r = _mm512_maskz_inserti64x4::<1>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_inserti64x4::<1>(0b00001111, a, b);
+ let e = _mm512_setr_epi64(1, 2, 3, 4, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_insertf64x4() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_pd(17., 18., 19., 20.);
+ let r = _mm512_insertf64x4::<1>(a, b);
+ let e = _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_insertf64x4() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_pd(17., 18., 19., 20.);
+ let r = _mm512_mask_insertf64x4::<1>(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_insertf64x4::<1>(a, 0b11111111, a, b);
+ let e = _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_insertf64x4() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_pd(17., 18., 19., 20.);
+ let r = _mm512_maskz_insertf64x4::<1>(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_insertf64x4::<1>(0b00001111, a, b);
+ let e = _mm512_setr_pd(1., 2., 3., 4., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
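+ // Intel leaves the upper elements of these casts undefined; this
+ // implementation happens to fill them with -1, and the tests pin that down.
+ // The `zext` variants below guarantee zeroed upper elements instead.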
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castpd128_pd512() {
+ let a = _mm_setr_pd(17., 18.);
+ let r = _mm512_castpd128_pd512(a);
+ let e = _mm512_setr_pd(17., 18., -1., -1., -1., -1., -1., -1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castpd256_pd512() {
+ let a = _mm256_setr_pd(17., 18., 19., 20.);
+ let r = _mm512_castpd256_pd512(a);
+ let e = _mm512_setr_pd(17., 18., 19., 20., -1., -1., -1., -1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_zextpd128_pd512() {
+ let a = _mm_setr_pd(17., 18.);
+ let r = _mm512_zextpd128_pd512(a);
+ let e = _mm512_setr_pd(17., 18., 0., 0., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_zextpd256_pd512() {
+ let a = _mm256_setr_pd(17., 18., 19., 20.);
+ let r = _mm512_zextpd256_pd512(a);
+ let e = _mm512_setr_pd(17., 18., 19., 20., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castpd512_pd128() {
+ let a = _mm512_setr_pd(17., 18., -1., -1., -1., -1., -1., -1.);
+ let r = _mm512_castpd512_pd128(a);
+ let e = _mm_setr_pd(17., 18.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castpd512_pd256() {
+ let a = _mm512_setr_pd(17., 18., 19., 20., -1., -1., -1., -1.);
+ let r = _mm512_castpd512_pd256(a);
+ let e = _mm256_setr_pd(17., 18., 19., 20.);
+ assert_eq_m256d(r, e);
+ }
+
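+ // These casts only reinterpret bits: the f64 1.0 is 0x3FF0000000000000, so
+ // each double reads back as the f32 pair (0.0, 1.875) and as the i32 pair
+ // (0, 1072693248 = 0x3FF00000).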
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castpd_ps() {
+ let a = _mm512_set1_pd(1.);
+ let r = _mm512_castpd_ps(a);
+ let e = _mm512_set_ps(
+ 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0,
+ 1.875, 0.0,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castpd_si512() {
+ let a = _mm512_set1_pd(1.);
+ let r = _mm512_castpd_si512(a);
+ let e = _mm512_set_epi32(
+ 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248,
+ 0, 1072693248, 0, 1072693248, 0,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castsi128_si512() {
+ let a = _mm_setr_epi64x(17, 18);
+ let r = _mm512_castsi128_si512(a);
+ let e = _mm512_setr_epi64(17, 18, -1, -1, -1, -1, -1, -1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castsi256_si512() {
+ let a = _mm256_setr_epi64x(17, 18, 19, 20);
+ let r = _mm512_castsi256_si512(a);
+ let e = _mm512_setr_epi64(17, 18, 19, 20, -1, -1, -1, -1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_zextsi128_si512() {
+ let a = _mm_setr_epi64x(17, 18);
+ let r = _mm512_zextsi128_si512(a);
+ let e = _mm512_setr_epi64(17, 18, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_zextsi256_si512() {
+ let a = _mm256_setr_epi64x(17, 18, 19, 20);
+ let r = _mm512_zextsi256_si512(a);
+ let e = _mm512_setr_epi64(17, 18, 19, 20, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castsi512_si128() {
+ let a = _mm512_setr_epi64(17, 18, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_castsi512_si128(a);
+ let e = _mm_setr_epi64x(17, 18);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castsi512_si256() {
+ let a = _mm512_setr_epi64(17, 18, 19, 20, -1, -1, -1, -1);
+ let r = _mm512_castsi512_si256(a);
+ let e = _mm256_setr_epi64x(17, 18, 19, 20);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castsi512_ps() {
+ let a = _mm512_set1_epi64(1 << 62);
+ let r = _mm512_castsi512_ps(a);
+ let e = _mm512_set_ps(
+ 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_castsi512_pd() {
+ let a = _mm512_set1_epi64(1 << 62);
+ let r = _mm512_castsi512_pd(a);
+ let e = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.);
+ assert_eq_m512d(r, e);
+ }
+
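+ // vpbroadcastq replicates element 0 of its source. `setr` places 17 there,
+ // while the `set` constructors used in the 256/128-bit tests place 18
+ // there, which is why the expected values differ.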
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_broadcastq_epi64() {
+ let a = _mm_setr_epi64x(17, 18);
+ let r = _mm512_broadcastq_epi64(a);
+ let e = _mm512_set1_epi64(17);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_broadcastq_epi64() {
+ let src = _mm512_set1_epi64(18);
+ let a = _mm_setr_epi64x(17, 18);
+ let r = _mm512_mask_broadcastq_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_broadcastq_epi64(src, 0b11111111, a);
+ let e = _mm512_set1_epi64(17);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_broadcastq_epi64() {
+ let a = _mm_setr_epi64x(17, 18);
+ let r = _mm512_maskz_broadcastq_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_broadcastq_epi64(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 17, 17, 17, 17);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_broadcastq_epi64() {
+ let src = _mm256_set1_epi64x(18);
+ let a = _mm_set_epi64x(17, 18);
+ let r = _mm256_mask_broadcastq_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_broadcastq_epi64(src, 0b00001111, a);
+ let e = _mm256_set1_epi64x(18);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_broadcastq_epi64() {
+ let a = _mm_set_epi64x(17, 18);
+ let r = _mm256_maskz_broadcastq_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_broadcastq_epi64(0b00001111, a);
+ let e = _mm256_set1_epi64x(18);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_broadcastq_epi64() {
+ let src = _mm_set1_epi64x(18);
+ let a = _mm_set_epi64x(17, 18);
+ let r = _mm_mask_broadcastq_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_broadcastq_epi64(src, 0b00000011, a);
+ let e = _mm_set1_epi64x(18);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_broadcastq_epi64() {
+ let a = _mm_set_epi64x(17, 18);
+ let r = _mm_maskz_broadcastq_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_broadcastq_epi64(0b00000011, a);
+ let e = _mm_set1_epi64x(18);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_broadcastsd_pd() {
+ let a = _mm_set_pd(17., 18.);
+ let r = _mm512_broadcastsd_pd(a);
+ let e = _mm512_set1_pd(18.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_broadcastsd_pd() {
+ let src = _mm512_set1_pd(18.);
+ let a = _mm_set_pd(17., 18.);
+ let r = _mm512_mask_broadcastsd_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_broadcastsd_pd(src, 0b11111111, a);
+ let e = _mm512_set1_pd(18.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_broadcastsd_pd() {
+ let a = _mm_set_pd(17., 18.);
+ let r = _mm512_maskz_broadcastsd_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_broadcastsd_pd(0b00001111, a);
+ let e = _mm512_set_pd(0., 0., 0., 0., 18., 18., 18., 18.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_broadcastsd_pd() {
+ let src = _mm256_set1_pd(18.);
+ let a = _mm_set_pd(17., 18.);
+ let r = _mm256_mask_broadcastsd_pd(src, 0, a);
+ assert_eq_m256d(r, src);
+ let r = _mm256_mask_broadcastsd_pd(src, 0b00001111, a);
+ let e = _mm256_set1_pd(18.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_broadcastsd_pd() {
+ let a = _mm_set_pd(17., 18.);
+ let r = _mm256_maskz_broadcastsd_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_broadcastsd_pd(0b00001111, a);
+ let e = _mm256_set1_pd(18.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_broadcast_i64x4() {
+ let a = _mm256_set_epi64x(17, 18, 19, 20);
+ let r = _mm512_broadcast_i64x4(a);
+ let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_broadcast_i64x4() {
+ let src = _mm512_set1_epi64(18);
+ let a = _mm256_set_epi64x(17, 18, 19, 20);
+ let r = _mm512_mask_broadcast_i64x4(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_broadcast_i64x4(src, 0b11111111, a);
+ let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_broadcast_i64x4() {
+ let a = _mm256_set_epi64x(17, 18, 19, 20);
+ let r = _mm512_maskz_broadcast_i64x4(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_broadcast_i64x4(0b00001111, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 17, 18, 19, 20);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_broadcast_f64x4() {
+ let a = _mm256_set_pd(17., 18., 19., 20.);
+ let r = _mm512_broadcast_f64x4(a);
+ let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_broadcast_f64x4() {
+ let src = _mm512_set1_pd(18.);
+ let a = _mm256_set_pd(17., 18., 19., 20.);
+ let r = _mm512_mask_broadcast_f64x4(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_broadcast_f64x4(src, 0b11111111, a);
+ let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_broadcast_f64x4() {
+ let a = _mm256_set_pd(17., 18., 19., 20.);
+ let r = _mm512_maskz_broadcast_f64x4(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_broadcast_f64x4(0b00001111, a);
+ let e = _mm512_set_pd(0., 0., 0., 0., 17., 18., 19., 20.);
+ assert_eq_m512d(r, e);
+ }
+
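+ // `mask_blend` takes `b`'s element wherever the mask bit is set. Mask bit i
+ // maps to element i (the rightmost position in `set` notation), so
+ // 0b11110000 selects `b` for the upper half.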
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_blend_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let b = _mm512_set1_epi64(2);
+ let r = _mm512_mask_blend_epi64(0b11110000, a, b);
+ let e = _mm512_set_epi64(2, 2, 2, 2, 1, 1, 1, 1);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_blend_epi64() {
+ let a = _mm256_set1_epi64x(1);
+ let b = _mm256_set1_epi64x(2);
+ let r = _mm256_mask_blend_epi64(0b00001111, a, b);
+ let e = _mm256_set1_epi64x(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_blend_epi64() {
+ let a = _mm_set1_epi64x(1);
+ let b = _mm_set1_epi64x(2);
+ let r = _mm_mask_blend_epi64(0b00000011, a, b);
+ let e = _mm_set1_epi64x(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_blend_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(2.);
+ let r = _mm512_mask_blend_pd(0b11110000, a, b);
+ let e = _mm512_set_pd(2., 2., 2., 2., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_blend_pd() {
+ let a = _mm256_set1_pd(1.);
+ let b = _mm256_set1_pd(2.);
+ let r = _mm256_mask_blend_pd(0b00001111, a, b);
+ let e = _mm256_set1_pd(2.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_blend_pd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let r = _mm_mask_blend_pd(0b00000011, a, b);
+ let e = _mm_set1_pd(2.);
+ assert_eq_m128d(r, e);
+ }
+
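+ // `unpackhi_epi64` interleaves the high qword of each 128-bit lane,
+ // producing [a_hi, b_hi] per lane (low to high); `unpacklo` below does the
+ // same with the low qwords.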
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_unpackhi_epi64() {
+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_unpackhi_epi64(a, b);
+ let e = _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_unpackhi_epi64() {
+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_mask_unpackhi_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_unpackhi_epi64(a, 0b11111111, a, b);
+ let e = _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_unpackhi_epi64() {
+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_maskz_unpackhi_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_unpackhi_epi64(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 21, 5, 23, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_unpackhi_epi64() {
+ let a = _mm256_set_epi64x(1, 2, 3, 4);
+ let b = _mm256_set_epi64x(17, 18, 19, 20);
+ let r = _mm256_mask_unpackhi_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_unpackhi_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(17, 1, 19, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_unpackhi_epi64() {
+ let a = _mm256_set_epi64x(1, 2, 3, 4);
+ let b = _mm256_set_epi64x(17, 18, 19, 20);
+ let r = _mm256_maskz_unpackhi_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_unpackhi_epi64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(17, 1, 19, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_unpackhi_epi64() {
+ let a = _mm_set_epi64x(1, 2);
+ let b = _mm_set_epi64x(17, 18);
+ let r = _mm_mask_unpackhi_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_unpackhi_epi64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(17, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_unpackhi_epi64() {
+ let a = _mm_set_epi64x(1, 2);
+ let b = _mm_set_epi64x(17, 18);
+ let r = _mm_maskz_unpackhi_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_unpackhi_epi64(0b00000011, a, b);
+ let e = _mm_set_epi64x(17, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_unpackhi_pd() {
+ let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm512_unpackhi_pd(a, b);
+ let e = _mm512_set_pd(17., 1., 19., 3., 21., 5., 23., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_unpackhi_pd() {
+ let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm512_mask_unpackhi_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_unpackhi_pd(a, 0b11111111, a, b);
+ let e = _mm512_set_pd(17., 1., 19., 3., 21., 5., 23., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_unpackhi_pd() {
+ let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm512_maskz_unpackhi_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_unpackhi_pd(0b00001111, a, b);
+ let e = _mm512_set_pd(0., 0., 0., 0., 21., 5., 23., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_unpackhi_pd() {
+ let a = _mm256_set_pd(1., 2., 3., 4.);
+ let b = _mm256_set_pd(17., 18., 19., 20.);
+ let r = _mm256_mask_unpackhi_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_unpackhi_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(17., 1., 19., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_unpackhi_pd() {
+ let a = _mm256_set_pd(1., 2., 3., 4.);
+ let b = _mm256_set_pd(17., 18., 19., 20.);
+ let r = _mm256_maskz_unpackhi_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_unpackhi_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(17., 1., 19., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_unpackhi_pd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(17., 18.);
+ let r = _mm_mask_unpackhi_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_unpackhi_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(17., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_unpackhi_pd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(17., 18.);
+ let r = _mm_maskz_unpackhi_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_unpackhi_pd(0b00000011, a, b);
+ let e = _mm_set_pd(17., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_unpacklo_epi64() {
+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_unpacklo_epi64(a, b);
+ let e = _mm512_set_epi64(18, 2, 20, 4, 22, 6, 24, 8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_unpacklo_epi64() {
+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_mask_unpacklo_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_unpacklo_epi64(a, 0b11111111, a, b);
+ let e = _mm512_set_epi64(18, 2, 20, 4, 22, 6, 24, 8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_unpacklo_epi64() {
+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24);
+ let r = _mm512_maskz_unpacklo_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_unpacklo_epi64(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 22, 6, 24, 8);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_unpacklo_epi64() {
+ let a = _mm256_set_epi64x(1, 2, 3, 4);
+ let b = _mm256_set_epi64x(17, 18, 19, 20);
+ let r = _mm256_mask_unpacklo_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_unpacklo_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(18, 2, 20, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_unpacklo_epi64() {
+ let a = _mm256_set_epi64x(1, 2, 3, 4);
+ let b = _mm256_set_epi64x(17, 18, 19, 20);
+ let r = _mm256_maskz_unpacklo_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_unpacklo_epi64(0b00001111, a, b);
+ let e = _mm256_set_epi64x(18, 2, 20, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_unpacklo_epi64() {
+ let a = _mm_set_epi64x(1, 2);
+ let b = _mm_set_epi64x(17, 18);
+ let r = _mm_mask_unpacklo_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_unpacklo_epi64(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(18, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_unpacklo_epi64() {
+ let a = _mm_set_epi64x(1, 2);
+ let b = _mm_set_epi64x(17, 18);
+ let r = _mm_maskz_unpacklo_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_unpacklo_epi64(0b00000011, a, b);
+ let e = _mm_set_epi64x(18, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_unpacklo_pd() {
+ let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm512_unpacklo_pd(a, b);
+ let e = _mm512_set_pd(18., 2., 20., 4., 22., 6., 24., 8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_unpacklo_pd() {
+ let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm512_mask_unpacklo_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_unpacklo_pd(a, 0b11111111, a, b);
+ let e = _mm512_set_pd(18., 2., 20., 4., 22., 6., 24., 8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_unpacklo_pd() {
+ let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.);
+ let r = _mm512_maskz_unpacklo_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_unpacklo_pd(0b00001111, a, b);
+ let e = _mm512_set_pd(0., 0., 0., 0., 22., 6., 24., 8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_unpacklo_pd() {
+ let a = _mm256_set_pd(1., 2., 3., 4.);
+ let b = _mm256_set_pd(17., 18., 19., 20.);
+ let r = _mm256_mask_unpacklo_pd(a, 0, a, b);
+ assert_eq_m256d(r, a);
+ let r = _mm256_mask_unpacklo_pd(a, 0b00001111, a, b);
+ let e = _mm256_set_pd(18., 2., 20., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_unpacklo_pd() {
+ let a = _mm256_set_pd(1., 2., 3., 4.);
+ let b = _mm256_set_pd(17., 18., 19., 20.);
+ let r = _mm256_maskz_unpacklo_pd(0, a, b);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_unpacklo_pd(0b00001111, a, b);
+ let e = _mm256_set_pd(18., 2., 20., 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_unpacklo_pd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(17., 18.);
+ let r = _mm_mask_unpacklo_pd(a, 0, a, b);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_unpacklo_pd(a, 0b00000011, a, b);
+ let e = _mm_set_pd(18., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_unpacklo_pd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(17., 18.);
+ let r = _mm_maskz_unpacklo_pd(0, a, b);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_unpacklo_pd(0b00000011, a, b);
+ let e = _mm_set_pd(18., 2.);
+ assert_eq_m128d(r, e);
+ }
+
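+ // VALIGNQ concatenates a (high) and b (low) and shifts the pair right by
+ // IMM8 qwords. As the assertions below show, this version treats a 512-bit
+ // offset of 8 the same as 0 (IMM8 is reduced modulo 8 here), while the
+ // 256-bit form with IMM8 = 6 pulls elements from both sources.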
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_alignr_epi64() {
+ let a = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm512_set_epi64(16, 15, 14, 13, 12, 11, 10, 9);
+ let r = _mm512_alignr_epi64::<0>(a, b);
+ assert_eq_m512i(r, b);
+ let r = _mm512_alignr_epi64::<8>(a, b);
+ assert_eq_m512i(r, b);
+ let r = _mm512_alignr_epi64::<1>(a, b);
+ let e = _mm512_set_epi64(1, 16, 15, 14, 13, 12, 11, 10);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_alignr_epi64() {
+ let a = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm512_set_epi64(16, 15, 14, 13, 12, 11, 10, 9);
+ let r = _mm512_mask_alignr_epi64::<1>(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_alignr_epi64::<1>(a, 0b11111111, a, b);
+ let e = _mm512_set_epi64(1, 16, 15, 14, 13, 12, 11, 10);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_alignr_epi64() {
+ let a = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1);
+ let b = _mm512_set_epi64(16, 15, 14, 13, 12, 11, 10, 9);
+ let r = _mm512_maskz_alignr_epi64::<1>(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_alignr_epi64::<1>(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 13, 12, 11, 10);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_alignr_epi64() {
+ let a = _mm256_set_epi64x(4, 3, 2, 1);
+ let b = _mm256_set_epi64x(8, 7, 6, 5);
+ let r = _mm256_alignr_epi64::<0>(a, b);
+ let e = _mm256_set_epi64x(8, 7, 6, 5);
+ assert_eq_m256i(r, e);
+ let r = _mm256_alignr_epi64::<6>(a, b);
+ let e = _mm256_set_epi64x(6, 5, 4, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_alignr_epi64() {
+ let a = _mm256_set_epi64x(4, 3, 2, 1);
+ let b = _mm256_set_epi64x(8, 7, 6, 5);
+ let r = _mm256_mask_alignr_epi64::<1>(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_alignr_epi64::<0>(a, 0b00001111, a, b);
+ let e = _mm256_set_epi64x(8, 7, 6, 5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_alignr_epi64() {
+ let a = _mm256_set_epi64x(4, 3, 2, 1);
+ let b = _mm256_set_epi64x(8, 7, 6, 5);
+ let r = _mm256_maskz_alignr_epi64::<1>(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_alignr_epi64::<0>(0b00001111, a, b);
+ let e = _mm256_set_epi64x(8, 7, 6, 5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_alignr_epi64() {
+ let a = _mm_set_epi64x(2, 1);
+ let b = _mm_set_epi64x(4, 3);
+ let r = _mm_alignr_epi64::<0>(a, b);
+ let e = _mm_set_epi64x(4, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_alignr_epi64() {
+ let a = _mm_set_epi64x(2, 1);
+ let b = _mm_set_epi64x(4, 3);
+ let r = _mm_mask_alignr_epi64::<1>(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_alignr_epi64::<0>(a, 0b00000011, a, b);
+ let e = _mm_set_epi64x(4, 3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_alignr_epi64() {
+ let a = _mm_set_epi64x(2, 1);
+ let b = _mm_set_epi64x(4, 3);
+ let r = _mm_maskz_alignr_epi64::<1>(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_alignr_epi64::<0>(0b00000011, a, b);
+ let e = _mm_set_epi64x(4, 3);
+ assert_eq_m128i(r, e);
+ }
+
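+ // Bitwise AND/OR/XOR/ANDNOT: the masked variants select individual qwords;
+ // the _si512 aliases perform the same operation on the whole 512-bit vector.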
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_and_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_and_epi64(a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_and_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_mask_and_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_and_epi64(a, 0b01111111, a, b);
+ let e = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_and_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_maskz_and_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_and_epi64(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_and_epi64() {
+ let a = _mm256_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm256_set1_epi64x(1 << 0);
+ let r = _mm256_mask_and_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_and_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1 << 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_and_epi64() {
+ let a = _mm256_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm256_set1_epi64x(1 << 0);
+ let r = _mm256_maskz_and_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_and_epi64(0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1 << 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_and_epi64() {
+ let a = _mm_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm_set1_epi64x(1 << 0);
+ let r = _mm_mask_and_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_and_epi64(a, 0b00000011, a, b);
+ let e = _mm_set1_epi64x(1 << 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_and_epi64() {
+ let a = _mm_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm_set1_epi64x(1 << 0);
+ let r = _mm_maskz_and_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_and_epi64(0b00000011, a, b);
+ let e = _mm_set1_epi64x(1 << 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_and_si512() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_and_si512(a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_or_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_or_epi64(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_or_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_mask_or_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_or_epi64(a, 0b11111111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_or_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_maskz_or_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_or_epi64(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_or_epi64() {
+ let a = _mm256_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm256_set1_epi64x(1 << 13);
+ let r = _mm256_or_epi64(a, b);
+ let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_or_epi64() {
+ let a = _mm256_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm256_set1_epi64x(1 << 13);
+ let r = _mm256_mask_or_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_or_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_or_epi64() {
+ let a = _mm256_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm256_set1_epi64x(1 << 13);
+ let r = _mm256_maskz_or_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_or_epi64(0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_or_epi64() {
+ let a = _mm_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm_set1_epi64x(1 << 13);
+ let r = _mm_or_epi64(a, b);
+ let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_or_epi64() {
+ let a = _mm_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm_set1_epi64x(1 << 13);
+ let r = _mm_mask_or_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_or_epi64(a, 0b00000011, a, b);
+ let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_or_epi64() {
+ let a = _mm_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm_set1_epi64x(1 << 13);
+ let r = _mm_maskz_or_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_or_epi64(0b00000011, a, b);
+ let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_or_si512() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_or_si512(a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0,
+ 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_xor_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_xor_epi64(a, b);
+ let e = _mm512_set_epi64(1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_xor_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_mask_xor_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_xor_epi64(a, 0b11111111, a, b);
+ let e = _mm512_set_epi64(1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_xor_epi64() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_maskz_xor_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_xor_epi64(0b00001111, a, b);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_xor_epi64() {
+ let a = _mm256_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm256_set1_epi64x(1 << 13);
+ let r = _mm256_xor_epi64(a, b);
+ let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_xor_epi64() {
+ let a = _mm256_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm256_set1_epi64x(1 << 13);
+ let r = _mm256_mask_xor_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_xor_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_xor_epi64() {
+ let a = _mm256_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm256_set1_epi64x(1 << 13);
+ let r = _mm256_maskz_xor_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_xor_epi64(0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_xor_epi64() {
+ let a = _mm_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm_set1_epi64x(1 << 13);
+ let r = _mm_xor_epi64(a, b);
+ let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_xor_epi64() {
+ let a = _mm_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm_set1_epi64x(1 << 13);
+ let r = _mm_mask_xor_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_xor_epi64(a, 0b00000011, a, b);
+ let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_xor_epi64() {
+ let a = _mm_set1_epi64x(1 << 0 | 1 << 15);
+ let b = _mm_set1_epi64x(1 << 13);
+ let r = _mm_maskz_xor_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_xor_epi64(0b00000011, a, b);
+ let e = _mm_set1_epi64x(1 << 0 | 1 << 13 | 1 << 15);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_xor_si512() {
+ let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let b = _mm512_set_epi64(1 << 13, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
+ let r = _mm512_xor_si512(a, b);
+ let e = _mm512_set_epi64(1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_andnot_epi64() {
+ let a = _mm512_set1_epi64(0);
+ let b = _mm512_set1_epi64(1 << 3 | 1 << 4);
+ let r = _mm512_andnot_epi64(a, b);
+ let e = _mm512_set1_epi64(1 << 3 | 1 << 4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_andnot_epi64() {
+ let a = _mm512_set1_epi64(1 << 1 | 1 << 2);
+ let b = _mm512_set1_epi64(1 << 3 | 1 << 4);
+ let r = _mm512_mask_andnot_epi64(a, 0, a, b);
+ assert_eq_m512i(r, a);
+ let r = _mm512_mask_andnot_epi64(a, 0b11111111, a, b);
+ let e = _mm512_set1_epi64(1 << 3 | 1 << 4);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_andnot_epi64() {
+ let a = _mm512_set1_epi64(1 << 1 | 1 << 2);
+ let b = _mm512_set1_epi64(1 << 3 | 1 << 4);
+ let r = _mm512_maskz_andnot_epi64(0, a, b);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_andnot_epi64(0b00001111, a, b);
+ #[rustfmt::skip]
+ let e = _mm512_set_epi64(
+ 0, 0, 0, 0,
+ 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4,
+ );
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_andnot_epi64() {
+ let a = _mm256_set1_epi64x(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi64x(1 << 3 | 1 << 4);
+ let r = _mm256_mask_andnot_epi64(a, 0, a, b);
+ assert_eq_m256i(r, a);
+ let r = _mm256_mask_andnot_epi64(a, 0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1 << 3 | 1 << 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_andnot_epi64() {
+ let a = _mm256_set1_epi64x(1 << 1 | 1 << 2);
+ let b = _mm256_set1_epi64x(1 << 3 | 1 << 4);
+ let r = _mm256_maskz_andnot_epi64(0, a, b);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_andnot_epi64(0b00001111, a, b);
+ let e = _mm256_set1_epi64x(1 << 3 | 1 << 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_andnot_epi64() {
+ let a = _mm_set1_epi64x(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi64x(1 << 3 | 1 << 4);
+ let r = _mm_mask_andnot_epi64(a, 0, a, b);
+ assert_eq_m128i(r, a);
+ let r = _mm_mask_andnot_epi64(a, 0b00000011, a, b);
+ let e = _mm_set1_epi64x(1 << 3 | 1 << 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_andnot_epi64() {
+ let a = _mm_set1_epi64x(1 << 1 | 1 << 2);
+ let b = _mm_set1_epi64x(1 << 3 | 1 << 4);
+ let r = _mm_maskz_andnot_epi64(0, a, b);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_andnot_epi64(0b00000011, a, b);
+ let e = _mm_set1_epi64x(1 << 3 | 1 << 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_andnot_si512() {
+ let a = _mm512_set1_epi64(0);
+ let b = _mm512_set1_epi64(1 << 3 | 1 << 4);
+ let r = _mm512_andnot_si512(a, b);
+ let e = _mm512_set1_epi64(1 << 3 | 1 << 4);
+ assert_eq_m512i(r, e);
+ }
+
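+ // Horizontal reductions return a scalar; the masked forms substitute the
+ // operation's identity (0 for add, 1 for mul, all-ones for and, ...) for
+ // unselected lanes.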
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_add_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let e: i64 = _mm512_reduce_add_epi64(a);
+ assert_eq!(8, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_add_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let e: i64 = _mm512_mask_reduce_add_epi64(0b11110000, a);
+ assert_eq!(4, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_add_pd() {
+ let a = _mm512_set1_pd(1.);
+ let e: f64 = _mm512_reduce_add_pd(a);
+ assert_eq!(8., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_add_pd() {
+ let a = _mm512_set1_pd(1.);
+ let e: f64 = _mm512_mask_reduce_add_pd(0b11110000, a);
+ assert_eq!(4., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_mul_epi64() {
+ let a = _mm512_set1_epi64(2);
+ let e: i64 = _mm512_reduce_mul_epi64(a);
+ assert_eq!(256, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_mul_epi64() {
+ let a = _mm512_set1_epi64(2);
+ let e: i64 = _mm512_mask_reduce_mul_epi64(0b11110000, a);
+ assert_eq!(16, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_mul_pd() {
+ let a = _mm512_set1_pd(2.);
+ let e: f64 = _mm512_reduce_mul_pd(a);
+ assert_eq!(256., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_mul_pd() {
+ let a = _mm512_set1_pd(2.);
+ let e: f64 = _mm512_mask_reduce_mul_pd(0b11110000, a);
+ assert_eq!(16., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_max_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i64 = _mm512_reduce_max_epi64(a);
+ assert_eq!(7, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_max_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i64 = _mm512_mask_reduce_max_epi64(0b11110000, a);
+ assert_eq!(3, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_max_epu64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u64 = _mm512_reduce_max_epu64(a);
+ assert_eq!(7, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_max_epu64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u64 = _mm512_mask_reduce_max_epu64(0b11110000, a);
+ assert_eq!(3, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_max_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let e: f64 = _mm512_reduce_max_pd(a);
+ assert_eq!(7., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_max_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let e: f64 = _mm512_mask_reduce_max_pd(0b11110000, a);
+ assert_eq!(3., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_min_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i64 = _mm512_reduce_min_epi64(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_min_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i64 = _mm512_mask_reduce_min_epi64(0b11110000, a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_min_epu64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u64 = _mm512_reduce_min_epu64(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_min_epu64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u64 = _mm512_mask_reduce_min_epu64(0b11110000, a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_min_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let e: f64 = _mm512_reduce_min_pd(a);
+ assert_eq!(0., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_min_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let e: f64 = _mm512_mask_reduce_min_pd(0b11110000, a);
+ assert_eq!(0., e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_and_epi64() {
+ let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
+ let e: i64 = _mm512_reduce_and_epi64(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_and_epi64() {
+ let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
+ let e: i64 = _mm512_mask_reduce_and_epi64(0b11110000, a);
+ assert_eq!(1, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_or_epi64() {
+ let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
+ let e: i64 = _mm512_reduce_or_epi64(a);
+ assert_eq!(3, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_or_epi64() {
+ let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
+ let e: i64 = _mm512_mask_reduce_or_epi64(0b11110000, a);
+ assert_eq!(1, e);
+ }
+
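+ // extractf64x4/extracti64x4 pull out the 256-bit half selected by the const
+ // index; the mask/maskz forms then blend or zero within the 256-bit result.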
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_extractf64x4_pd() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm512_extractf64x4_pd::<1>(a);
+ let e = _mm256_setr_pd(5., 6., 7., 8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_extractf64x4_pd() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let src = _mm256_set1_pd(100.);
+ let r = _mm512_mask_extractf64x4_pd::<1>(src, 0, a);
+ assert_eq_m256d(r, src);
+ let r = _mm512_mask_extractf64x4_pd::<1>(src, 0b11111111, a);
+ let e = _mm256_setr_pd(5., 6., 7., 8.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_extractf64x4_pd() {
+ let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r = _mm512_maskz_extractf64x4_pd::<1>(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm512_maskz_extractf64x4_pd::<1>(0b00000001, a);
+ let e = _mm256_setr_pd(5., 0., 0., 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_extracti64x4_epi64() {
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm512_extracti64x4_epi64::<0x1>(a);
+ let e = _mm256_setr_epi64x(5, 6, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_extracti64x4_epi64() {
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let src = _mm256_set1_epi64x(100);
+ let r = _mm512_mask_extracti64x4_epi64::<0x1>(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm512_mask_extracti64x4_epi64::<0x1>(src, 0b11111111, a);
+ let e = _mm256_setr_epi64x(5, 6, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_extracti64x4_epi64() {
+ let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm512_maskz_extracti64x4_epi64::<0x1>(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm512_maskz_extracti64x4_epi64::<0x1>(0b00000001, a);
+ let e = _mm256_setr_epi64x(5, 0, 0, 0);
+ assert_eq_m256i(r, e);
+ }
+
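+ // compress packs the elements selected by the mask contiguously into the
+ // low lanes (filling the rest from src or with zeros); expand performs the
+ // inverse scatter.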
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_compress_epi64() {
+ let src = _mm512_set1_epi64(200);
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_mask_compress_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_compress_epi64(src, 0b01010101, a);
+ let e = _mm512_set_epi64(200, 200, 200, 200, 1, 3, 5, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_compress_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_maskz_compress_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_compress_epi64(0b01010101, a);
+ let e = _mm512_set_epi64(0, 0, 0, 0, 1, 3, 5, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_compress_epi64() {
+ let src = _mm256_set1_epi64x(200);
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_mask_compress_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_compress_epi64(src, 0b00000101, a);
+ let e = _mm256_set_epi64x(200, 200, 1, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_compress_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_maskz_compress_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_compress_epi64(0b00000101, a);
+ let e = _mm256_set_epi64x(0, 0, 1, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_compress_epi64() {
+ let src = _mm_set1_epi64x(200);
+ let a = _mm_set_epi64x(0, 1);
+ let r = _mm_mask_compress_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_compress_epi64(src, 0b00000001, a);
+ let e = _mm_set_epi64x(200, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_compress_epi64() {
+ let a = _mm_set_epi64x(0, 1);
+ let r = _mm_maskz_compress_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_compress_epi64(0b00000001, a);
+ let e = _mm_set_epi64x(0, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_compress_pd() {
+ let src = _mm512_set1_pd(200.);
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_mask_compress_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_compress_pd(src, 0b01010101, a);
+ let e = _mm512_set_pd(200., 200., 200., 200., 1., 3., 5., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_compress_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_maskz_compress_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_compress_pd(0b01010101, a);
+ let e = _mm512_set_pd(0., 0., 0., 0., 1., 3., 5., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_compress_pd() {
+ let src = _mm256_set1_pd(200.);
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_mask_compress_pd(src, 0, a);
+ assert_eq_m256d(r, src);
+ let r = _mm256_mask_compress_pd(src, 0b00000101, a);
+ let e = _mm256_set_pd(200., 200., 1., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_compress_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_maskz_compress_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_compress_pd(0b00000101, a);
+ let e = _mm256_set_pd(0., 0., 1., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_compress_pd() {
+ let src = _mm_set1_pd(200.);
+ let a = _mm_set_pd(0., 1.);
+ let r = _mm_mask_compress_pd(src, 0, a);
+ assert_eq_m128d(r, src);
+ let r = _mm_mask_compress_pd(src, 0b00000001, a);
+ let e = _mm_set_pd(200., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_compress_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let r = _mm_maskz_compress_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_compress_pd(0b00000001, a);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_expand_epi64() {
+ let src = _mm512_set1_epi64(200);
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_mask_expand_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_expand_epi64(src, 0b01010101, a);
+ let e = _mm512_set_epi64(200, 4, 200, 5, 200, 6, 200, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_expand_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm512_maskz_expand_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_expand_epi64(0b01010101, a);
+ let e = _mm512_set_epi64(0, 4, 0, 5, 0, 6, 0, 7);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_expand_epi64() {
+ let src = _mm256_set1_epi64x(200);
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_mask_expand_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_expand_epi64(src, 0b00000101, a);
+ let e = _mm256_set_epi64x(200, 2, 200, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_expand_epi64() {
+ let a = _mm256_set_epi64x(0, 1, 2, 3);
+ let r = _mm256_maskz_expand_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_expand_epi64(0b00000101, a);
+ let e = _mm256_set_epi64x(0, 2, 0, 3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_expand_epi64() {
+ let src = _mm_set1_epi64x(200);
+ let a = _mm_set_epi64x(0, 1);
+ let r = _mm_mask_expand_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_expand_epi64(src, 0b00000001, a);
+ let e = _mm_set_epi64x(200, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_expand_epi64() {
+ let a = _mm_set_epi64x(0, 1);
+ let r = _mm_maskz_expand_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_expand_epi64(0b00000001, a);
+ let e = _mm_set_epi64x(0, 1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_expand_pd() {
+ let src = _mm512_set1_pd(200.);
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_mask_expand_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_expand_pd(src, 0b01010101, a);
+ let e = _mm512_set_pd(200., 4., 200., 5., 200., 6., 200., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_expand_pd() {
+ let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+ let r = _mm512_maskz_expand_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_expand_pd(0b01010101, a);
+ let e = _mm512_set_pd(0., 4., 0., 5., 0., 6., 0., 7.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_expand_pd() {
+ let src = _mm256_set1_pd(200.);
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_mask_expand_pd(src, 0, a);
+ assert_eq_m256d(r, src);
+ let r = _mm256_mask_expand_pd(src, 0b00000101, a);
+ let e = _mm256_set_pd(200., 2., 200., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_expand_pd() {
+ let a = _mm256_set_pd(0., 1., 2., 3.);
+ let r = _mm256_maskz_expand_pd(0, a);
+ assert_eq_m256d(r, _mm256_setzero_pd());
+ let r = _mm256_maskz_expand_pd(0b00000101, a);
+ let e = _mm256_set_pd(0., 2., 0., 3.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_expand_pd() {
+ let src = _mm_set1_pd(200.);
+ let a = _mm_set_pd(0., 1.);
+ let r = _mm_mask_expand_pd(src, 0, a);
+ assert_eq_m128d(r, src);
+ let r = _mm_mask_expand_pd(src, 0b00000001, a);
+ let e = _mm_set_pd(200., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_expand_pd() {
+ let a = _mm_set_pd(0., 1.);
+ let r = _mm_maskz_expand_pd(0, a);
+ assert_eq_m128d(r, _mm_setzero_pd());
+ let r = _mm_maskz_expand_pd(0b00000001, a);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
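+ // The unaligned loadu/storeu intrinsics accept any pointer alignment.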
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_loadu_epi64() {
+ let a = &[4, 3, 2, 5, -8, -9, -64, -50];
+ let p = a.as_ptr();
+ let r = _mm512_loadu_epi64(black_box(p));
+ let e = _mm512_setr_epi64(4, 3, 2, 5, -8, -9, -64, -50);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_loadu_epi64() {
+ let a = &[4, 3, 2, 5];
+ let p = a.as_ptr();
+ let r = _mm256_loadu_epi64(black_box(p));
+ let e = _mm256_setr_epi64x(4, 3, 2, 5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_loadu_epi64() {
+ let a = &[4, 3];
+ let p = a.as_ptr();
+ let r = _mm_loadu_epi64(black_box(p));
+ let e = _mm_setr_epi64x(4, 3);
+ assert_eq_m128i(r, e);
+ }
+
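+ // Down-converting stores: cvtepi64 truncates each qword, cvtsepi64
+ // saturates to the signed range, cvtusepi64 saturates to the unsigned
+ // range; only the converted low bytes of the destination are written.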
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi64_storeu_epi16() {
+ let a = _mm512_set1_epi64(9);
+ let mut r = _mm_undefined_si128();
+ _mm512_mask_cvtepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set1_epi16(9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi64_storeu_epi16() {
+ let a = _mm256_set1_epi64x(9);
+ let mut r = _mm_set1_epi16(0);
+ _mm256_mask_cvtepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 9, 9, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi64_storeu_epi16() {
+ let a = _mm_set1_epi64x(9);
+ let mut r = _mm_set1_epi16(0);
+ _mm_mask_cvtepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi64_storeu_epi16() {
+ let a = _mm512_set1_epi64(i64::MAX);
+ let mut r = _mm_undefined_si128();
+ _mm512_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set1_epi16(i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi64_storeu_epi16() {
+ let a = _mm256_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi16(0);
+ _mm256_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi64_storeu_epi16() {
+ let a = _mm_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi16(0);
+ _mm_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi64_storeu_epi16() {
+ let a = _mm512_set1_epi64(i64::MAX);
+ let mut r = _mm_undefined_si128();
+ _mm512_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set1_epi16(u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi64_storeu_epi16() {
+ let a = _mm256_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi16(0);
+ _mm256_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi16(
+ 0, 0, 0, 0,
+ u16::MAX as i16, u16::MAX as i16, u16::MAX as i16, u16::MAX as i16,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi64_storeu_epi16() {
+ let a = _mm_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi16(0);
+ _mm_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi64_storeu_epi8() {
+ let a = _mm512_set1_epi64(9);
+ let mut r = _mm_set1_epi8(0);
+ _mm512_mask_cvtepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi64_storeu_epi8() {
+ let a = _mm256_set1_epi64x(9);
+ let mut r = _mm_set1_epi8(0);
+ _mm256_mask_cvtepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi64_storeu_epi8() {
+ let a = _mm_set1_epi64x(9);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi64_storeu_epi8() {
+ let a = _mm512_set1_epi64(i64::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm512_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi64_storeu_epi8() {
+ let a = _mm256_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm256_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ i8::MAX, i8::MAX, i8::MAX, i8::MAX,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi64_storeu_epi8() {
+ let a = _mm_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi64_storeu_epi8() {
+ let a = _mm512_set1_epi64(i64::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm512_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
+ u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi64_storeu_epi8() {
+ let a = _mm256_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm256_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi64_storeu_epi8() {
+ let a = _mm_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi8(0);
+ _mm_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ #[rustfmt::skip]
+ let e = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, u8::MAX as i8, u8::MAX as i8,
+ );
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtepi64_storeu_epi32() {
+ let a = _mm512_set1_epi64(9);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm256_set1_epi32(9);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtepi64_storeu_epi32() {
+ let a = _mm256_set1_epi64x(9);
+ let mut r = _mm_set1_epi32(0);
+ _mm256_mask_cvtepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi32(9, 9, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtepi64_storeu_epi32() {
+ let a = _mm_set1_epi64x(9);
+ let mut r = _mm_set1_epi32(0);
+ _mm_mask_cvtepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm_set_epi32(0, 0, 9, 9);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtsepi64_storeu_epi32() {
+ let a = _mm512_set1_epi64(i64::MAX);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm256_set1_epi32(i32::MAX);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtsepi64_storeu_epi32() {
+ let a = _mm256_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi32(0);
+ _mm256_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b00001111, a);
+ let e = _mm_set1_epi32(i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtsepi64_storeu_epi32() {
+ let a = _mm_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi32(0);
+ _mm_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, i32::MAX, i32::MAX);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_cvtusepi64_storeu_epi32() {
+ let a = _mm512_set1_epi64(i64::MAX);
+ let mut r = _mm256_undefined_si256();
+ _mm512_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a);
+ let e = _mm256_set1_epi32(u32::MAX as i32);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_cvtusepi64_storeu_epi32() {
+ let a = _mm256_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi32(0);
+ _mm256_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b00001111, a);
+ let e = _mm_set1_epi32(u32::MAX as i32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_cvtusepi64_storeu_epi32() {
+ let a = _mm_set1_epi64x(i64::MAX);
+ let mut r = _mm_set1_epi32(0);
+ _mm_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b00000011, a);
+ let e = _mm_set_epi32(0, 0, u32::MAX as i32, u32::MAX as i32);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_storeu_epi64() {
+ let a = _mm512_set1_epi64(9);
+ let mut r = _mm512_set1_epi64(0);
+ _mm512_storeu_epi64(&mut r as *mut _ as *mut i64, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_storeu_epi64() {
+ let a = _mm256_set1_epi64x(9);
+ let mut r = _mm256_set1_epi64x(0);
+ _mm256_storeu_epi64(&mut r as *mut _ as *mut i64, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_storeu_epi64() {
+ let a = _mm_set1_epi64x(9);
+ let mut r = _mm_set1_epi64x(0);
+ _mm_storeu_epi64(&mut r as *mut _ as *mut i64, a);
+ assert_eq_m128i(r, a);
+ }
+
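+ // The aligned load/store intrinsics require vector-width pointer alignment;
+ // the tests over-align everything to 64 bytes via #[repr(align(64))].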
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_load_epi64() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i64; 8], // 64 bytes
+ }
+ let a = Align {
+ data: [4, 3, 2, 5, -8, -9, -64, -50],
+ };
+ let p = (a.data).as_ptr();
+ let r = _mm512_load_epi64(black_box(p));
+ let e = _mm512_setr_epi64(4, 3, 2, 5, -8, -9, -64, -50);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_load_epi64() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i64; 4],
+ }
+ let a = Align { data: [4, 3, 2, 5] };
+ let p = (a.data).as_ptr();
+ let r = _mm256_load_epi64(black_box(p));
+ let e = _mm256_set_epi64x(5, 2, 3, 4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_load_epi64() {
+ #[repr(align(64))]
+ struct Align {
+ data: [i64; 2],
+ }
+ let a = Align { data: [4, 3] };
+ let p = (a.data).as_ptr();
+ let r = _mm_load_epi64(black_box(p));
+ let e = _mm_set_epi64x(3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_store_epi64() {
+ let a = _mm512_set1_epi64(9);
+ let mut r = _mm512_set1_epi64(0);
+ _mm512_store_epi64(&mut r as *mut _ as *mut i64, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_store_epi64() {
+ let a = _mm256_set1_epi64x(9);
+ let mut r = _mm256_set1_epi64x(0);
+ _mm256_store_epi64(&mut r as *mut _ as *mut i64, a);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_store_epi64() {
+ let a = _mm_set1_epi64x(9);
+ let mut r = _mm_set1_epi64x(0);
+ _mm_store_epi64(&mut r as *mut _ as *mut i64, a);
+ assert_eq_m128i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_load_pd() {
+ #[repr(align(64))]
+ struct Align {
+ data: [f64; 8], // 64 bytes
+ }
+ let a = Align {
+ data: [4., 3., 2., 5., -8., -9., -64., -50.],
+ };
+ let p = (a.data).as_ptr();
+ let r = _mm512_load_pd(black_box(p));
+ let e = _mm512_setr_pd(4., 3., 2., 5., -8., -9., -64., -50.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_store_pd() {
+ let a = _mm512_set1_pd(9.);
+ let mut r = _mm512_undefined_pd();
+ _mm512_store_pd(&mut r as *mut _ as *mut f64, a);
+ assert_eq_m512d(r, a);
+ }
+
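+ // test sets mask bit i when (a AND b) has any bit set in qword i;
+ // testn sets it when that AND is zero.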
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_test_epi64_mask() {
+ let a = _mm512_set1_epi64(1 << 0);
+ let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+ let r = _mm512_test_epi64_mask(a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_test_epi64_mask() {
+ let a = _mm512_set1_epi64(1 << 0);
+ let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+ let r = _mm512_mask_test_epi64_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_test_epi64_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_test_epi64_mask() {
+ let a = _mm256_set1_epi64x(1 << 0);
+ let b = _mm256_set1_epi64x(1 << 0 | 1 << 1);
+ let r = _mm256_test_epi64_mask(a, b);
+ let e: __mmask8 = 0b00001111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_test_epi64_mask() {
+ let a = _mm256_set1_epi64x(1 << 0);
+ let b = _mm256_set1_epi64x(1 << 0 | 1 << 1);
+ let r = _mm256_mask_test_epi64_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm256_mask_test_epi64_mask(0b00001111, a, b);
+ let e: __mmask8 = 0b00001111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_test_epi64_mask() {
+ let a = _mm_set1_epi64x(1 << 0);
+ let b = _mm_set1_epi64x(1 << 0 | 1 << 1);
+ let r = _mm_test_epi64_mask(a, b);
+ let e: __mmask8 = 0b00000011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_test_epi64_mask() {
+ let a = _mm_set1_epi64x(1 << 0);
+ let b = _mm_set1_epi64x(1 << 0 | 1 << 1);
+ let r = _mm_mask_test_epi64_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm_mask_test_epi64_mask(0b00000011, a, b);
+ let e: __mmask8 = 0b00000011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_testn_epi64_mask() {
+ let a = _mm512_set1_epi64(1 << 0);
+ let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+ let r = _mm512_testn_epi64_mask(a, b);
+ let e: __mmask8 = 0b00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_testn_epi64_mask() {
+ let a = _mm512_set1_epi64(1 << 0);
+ let b = _mm512_set1_epi64(1 << 1);
+ let r = _mm512_mask_testn_epi64_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_testn_epi64_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_testn_epi64_mask() {
+ let a = _mm256_set1_epi64x(1 << 0);
+ let b = _mm256_set1_epi64x(1 << 1);
+ let r = _mm256_testn_epi64_mask(a, b);
+ let e: __mmask8 = 0b00001111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_testn_epi64_mask() {
+ let a = _mm256_set1_epi64x(1 << 0);
+ let b = _mm256_set1_epi64x(1 << 1);
+ let r = _mm256_mask_testn_epi64_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm256_mask_testn_epi64_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b00001111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_testn_epi64_mask() {
+ let a = _mm_set1_epi64x(1 << 0);
+ let b = _mm_set1_epi64x(1 << 1);
+ let r = _mm_testn_epi64_mask(a, b);
+ let e: __mmask8 = 0b00000011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_testn_epi64_mask() {
+ let a = _mm_set1_epi64x(1 << 0);
+ let b = _mm_set1_epi64x(1 << 1);
+ let r = _mm_mask_testn_epi64_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm_mask_testn_epi64_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b00000011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_stream_pd() {
+ #[repr(align(64))]
+ struct Memory {
+ pub data: [f64; 8],
+ }
+ let a = _mm512_set1_pd(7.0);
+ let mut mem = Memory { data: [-1.0; 8] };
+
+ _mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
+ for i in 0..8 {
+ assert_eq!(mem.data[i], get_m512d(a, i));
+ }
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_stream_si512() {
+ #[repr(align(64))]
+ struct Memory {
+ pub data: [i64; 8],
+ }
+ let a = _mm512_set1_epi64(7);
+ let mut mem = Memory { data: [-1; 8] };
+
+ _mm512_stream_si512(&mut mem.data[0] as *mut i64, a);
+ for i in 0..8 {
+ assert_eq!(mem.data[i], get_m512i(a, i));
+ }
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_set1_epi64() {
+ let src = _mm512_set1_epi64(2);
+ let a: i64 = 11;
+ let r = _mm512_mask_set1_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_set1_epi64(src, 0b11111111, a);
+ let e = _mm512_set1_epi64(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_set1_epi64() {
+ let a: i64 = 11;
+ let r = _mm512_maskz_set1_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_set1_epi64(0b11111111, a);
+ let e = _mm512_set1_epi64(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_set1_epi64() {
+ let src = _mm256_set1_epi64x(2);
+ let a: i64 = 11;
+ let r = _mm256_mask_set1_epi64(src, 0, a);
+ assert_eq_m256i(r, src);
+ let r = _mm256_mask_set1_epi64(src, 0b00001111, a);
+ let e = _mm256_set1_epi64x(11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_maskz_set1_epi64() {
+ let a: i64 = 11;
+ let r = _mm256_maskz_set1_epi64(0, a);
+ assert_eq_m256i(r, _mm256_setzero_si256());
+ let r = _mm256_maskz_set1_epi64(0b00001111, a);
+ let e = _mm256_set1_epi64x(11);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_set1_epi64() {
+ let src = _mm_set1_epi64x(2);
+ let a: i64 = 11;
+ let r = _mm_mask_set1_epi64(src, 0, a);
+ assert_eq_m128i(r, src);
+ let r = _mm_mask_set1_epi64(src, 0b00000011, a);
+ let e = _mm_set1_epi64x(11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_maskz_set1_epi64() {
+ let a: i64 = 11;
+ let r = _mm_maskz_set1_epi64(0, a);
+ assert_eq_m128i(r, _mm_setzero_si128());
+ let r = _mm_maskz_set1_epi64(0b00000011, a);
+ let e = _mm_set1_epi64x(11);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtsd_i64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtsd_i64(a);
+ let e: i64 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtss_i64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtss_i64(a);
+ let e: i64 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundi64_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: i64 = 9;
+ let r = _mm_cvt_roundi64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsi64_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: i64 = 9;
+ let r = _mm_cvt_roundsi64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvti64_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: i64 = 9;
+ let r = _mm_cvti64_ss(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvti64_sd() {
+ let a = _mm_set_pd(1., -1.5);
+ let b: i64 = 9;
+ let r = _mm_cvti64_sd(a, b);
+ let e = _mm_set_pd(1., 9.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsd_si64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvt_roundsd_si64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: i64 = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsd_i64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvt_roundsd_i64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: i64 = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsd_u64() {
+ let a = _mm_set_pd(1., f64::MAX);
+ let r = _mm_cvt_roundsd_u64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: u64 = u64::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtsd_u64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtsd_u64(a);
+ let e: u64 = u64::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundss_i64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvt_roundss_i64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: i64 = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundss_si64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvt_roundss_si64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: i64 = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundss_u64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvt_roundss_u64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
+ let e: u64 = u64::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtss_u64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtss_u64(a);
+ let e: u64 = u64::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvttsd_i64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvttsd_i64(a);
+ let e: i64 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundsd_i64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtt_roundsd_i64::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: i64 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundsd_si64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtt_roundsd_si64::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: i64 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundsd_u64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvtt_roundsd_u64::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: u64 = u64::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvttsd_u64() {
+ let a = _mm_set_pd(1., -1.5);
+ let r = _mm_cvttsd_u64(a);
+ let e: u64 = u64::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvttss_i64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvttss_i64(a);
+ let e: i64 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundss_i64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtt_roundss_i64::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: i64 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundss_si64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtt_roundss_si64::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: i64 = -2;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtt_roundss_u64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvtt_roundss_u64::<_MM_FROUND_CUR_DIRECTION>(a);
+ let e: u64 = u64::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvttss_u64() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let r = _mm_cvttss_u64(a);
+ let e: u64 = u64::MAX;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtu64_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: u64 = 9;
+ let r = _mm_cvtu64_ss(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvtu64_sd() {
+ let a = _mm_set_pd(1., -1.5);
+ let b: u64 = 9;
+ let r = _mm_cvtu64_sd(a, b);
+ let e = _mm_set_pd(1., 9.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundu64_ss() {
+ let a = _mm_set_ps(0., -0.5, 1., -1.5);
+ let b: u64 = 9;
+ let r = _mm_cvt_roundu64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_ps(0., -0.5, 1., 9.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundu64_sd() {
+ let a = _mm_set_pd(1., -1.5);
+ let b: u64 = 9;
+ let r = _mm_cvt_roundu64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., 9.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundi64_sd() {
+ let a = _mm_set_pd(1., -1.5);
+ let b: i64 = 9;
+ let r = _mm_cvt_roundi64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., 9.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_cvt_roundsi64_sd() {
+ let a = _mm_set_pd(1., -1.5);
+ let b: i64 = 9;
+ let r = _mm_cvt_roundsi64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_set_pd(1., 9.);
+ assert_eq_m128d(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/bmi.rs b/library/stdarch/crates/core_arch/src/x86_64/bmi.rs
new file mode 100644
index 000000000..9f71a8d38
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/bmi.rs
@@ -0,0 +1,183 @@
+//! Bit Manipulation Instruction (BMI) Set 1.0.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Extracts bits in the range [`start`, `start` + `len`) from `a` into
+/// the least significant bits of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(bextr))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 {
+ _bextr2_u64(a, ((start & 0xff) | ((len & 0xff) << 8)) as u64)
+}
+
+/// Extracts bits of `a` specified by `control` into
+/// the least significant bits of the result.
+///
+/// Bits `[7:0]` of `control` specify the index of the first bit in the range
+/// to be extracted, and bits `[15:8]` specify the length of the range.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(bextr))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 {
+ x86_bmi_bextr_64(a, control)
+}
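+
+// Illustrative sketch (not part of the patch): `_bextr_u64` packs its
+// arguments into the control word consumed by `_bextr2_u64`, with the start
+// index in bits [7:0] and the length in bits [15:8]. Assuming `x: u64`, the
+// following two calls are therefore equivalent:
+//
+//     _bextr_u64(x, 4, 4)
+//     _bextr2_u64(x, 4 | (4 << 8))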
+
+/// Bitwise logical `AND` of inverted `a` with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_andn_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(andn))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 {
+ !a & b
+}
+
+/// Extracts the lowest set bit of `x`, zeroing all other bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsi_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsi))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsi_u64(x: u64) -> u64 {
+ x & x.wrapping_neg()
+}
+
+/// Gets a mask of all bits up to and including the lowest set bit.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsmsk_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsmsk))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
+ x ^ (x.wrapping_sub(1_u64))
+}
+
+/// Resets the lowest set bit of `x`.
+///
+/// If `x` is `0`, CF is set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsr_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(blsr))]
+#[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _blsr_u64(x: u64) -> u64 {
+ x & (x.wrapping_sub(1))
+}
+
+/// Counts the number of trailing (least significant) zero bits.
+///
+/// When the source operand is `0`, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
+ x.trailing_zeros() as u64
+}
+
+/// Counts the number of trailing (least significant) zero bits.
+///
+/// When the source operand is `0`, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_tzcnt_64)
+#[inline]
+#[target_feature(enable = "bmi1")]
+#[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_tzcnt_64(x: u64) -> i64 {
+ x.trailing_zeros() as i64
+}
+
+extern "C" {
+ #[link_name = "llvm.x86.bmi.bextr.64"]
+ fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::{x86::*, x86_64::*};
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_bextr_u64() {
+ let r = _bextr_u64(0b0101_0000u64, 4, 4);
+ assert_eq!(r, 0b0000_0101u64);
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_andn_u64() {
+ assert_eq!(_andn_u64(0, 0), 0);
+ assert_eq!(_andn_u64(0, 1), 1);
+ assert_eq!(_andn_u64(1, 0), 0);
+ assert_eq!(_andn_u64(1, 1), 0);
+
+ let r = _andn_u64(0b0000_0000u64, 0b0000_0000u64);
+ assert_eq!(r, 0b0000_0000u64);
+
+ let r = _andn_u64(0b0000_0000u64, 0b1111_1111u64);
+ assert_eq!(r, 0b1111_1111u64);
+
+ let r = _andn_u64(0b1111_1111u64, 0b0000_0000u64);
+ assert_eq!(r, 0b0000_0000u64);
+
+ let r = _andn_u64(0b1111_1111u64, 0b1111_1111u64);
+ assert_eq!(r, 0b0000_0000u64);
+
+ let r = _andn_u64(0b0100_0000u64, 0b0101_1101u64);
+ assert_eq!(r, 0b0001_1101u64);
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_blsi_u64() {
+ assert_eq!(_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_blsmsk_u64() {
+ let r = _blsmsk_u64(0b0011_0000u64);
+ assert_eq!(r, 0b0001_1111u64);
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_blsr_u64() {
+ // TODO: test the behavior when the input is `0`.
+ let r = _blsr_u64(0b0011_0000u64);
+ assert_eq!(r, 0b0010_0000u64);
+ }
+
+ #[simd_test(enable = "bmi1")]
+ unsafe fn test_tzcnt_u64() {
+ assert_eq!(_tzcnt_u64(0b0000_0001u64), 0u64);
+ assert_eq!(_tzcnt_u64(0b0000_0000u64), 64u64);
+ assert_eq!(_tzcnt_u64(0b1001_0000u64), 4u64);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs b/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs
new file mode 100644
index 000000000..356d95a3d
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/bmi2.rs
@@ -0,0 +1,139 @@
+//! Bit Manipulation Instruction (BMI) Set 2.0.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
+//! available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [wikipedia_bmi]:
+//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Unsigned multiply without affecting flags.
+///
+/// Unsigned multiplication of `a` and `b`, returning the low half of the
+/// 128-bit result and storing the high half in `hi`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mulx_u64)
+#[inline]
+#[cfg_attr(test, assert_instr(mul))]
+#[target_feature(enable = "bmi2")]
+#[cfg(not(target_arch = "x86"))] // calls an intrinsic
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 {
+ let result: u128 = (a as u128) * (b as u128);
+ *hi = (result >> 64) as u64;
+ result as u64
+}
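+
+// Illustrative sketch (values are assumptions, not part of the patch): the
+// full 128-bit product is split across the return value (low half) and `hi`:
+//
+//     let mut hi = 0u64;
+//     let lo = _mulx_u64(u64::MAX, 2, &mut hi); // product = 2^65 - 2
+//     assert_eq!((hi, lo), (1, u64::MAX - 1));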
+
+/// Zeroes the bits of `a` at positions `index` and above.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bzhi_u64)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(bzhi))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bzhi_u64(a: u64, index: u32) -> u64 {
+ x86_bmi2_bzhi_64(a, index as u64)
+}
+
+/// Scatters the contiguous low-order bits of `a` to the result at the
+/// positions specified by the `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pdep_u64)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(pdep))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 {
+ x86_bmi2_pdep_64(a, mask)
+}
+
+/// Gathers the bits of `a` specified by the `mask` into the contiguous
+/// low-order bit positions of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u64)
+#[inline]
+#[target_feature(enable = "bmi2")]
+#[cfg_attr(test, assert_instr(pext))]
+#[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
+ x86_bmi2_pext_64(a, mask)
+}
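+
+// Illustrative sketch (not part of the patch): `_pdep_u64` and `_pext_u64`
+// are inverses over a fixed mask, so extracting the bits just deposited
+// under the same mask recovers the original low-order bits:
+//
+//     assert_eq!(_pext_u64(_pdep_u64(0b1011, 0xF0F0), 0xF0F0), 0b1011);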
+
+extern "C" {
+ #[link_name = "llvm.x86.bmi.bzhi.64"]
+ fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64;
+ #[link_name = "llvm.x86.bmi.pdep.64"]
+ fn x86_bmi2_pdep_64(x: u64, y: u64) -> u64;
+ #[link_name = "llvm.x86.bmi.pext.64"]
+ fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
+}
+
+#[cfg(test)]
+mod tests {
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86_64::*;
+
+ #[simd_test(enable = "bmi2")]
+ unsafe fn test_pext_u64() {
+ let n = 0b1011_1110_1001_0011u64;
+
+ let m0 = 0b0110_0011_1000_0101u64;
+ let s0 = 0b0000_0000_0011_0101u64;
+
+ let m1 = 0b1110_1011_1110_1111u64;
+ let s1 = 0b0001_0111_0100_0011u64;
+
+ assert_eq!(_pext_u64(n, m0), s0);
+ assert_eq!(_pext_u64(n, m1), s1);
+ }
+
+ #[simd_test(enable = "bmi2")]
+ unsafe fn test_pdep_u64() {
+ let n = 0b1011_1110_1001_0011u64;
+
+ let m0 = 0b0110_0011_1000_0101u64;
+ let s0 = 0b0000_0010_0000_0101u64;
+
+ let m1 = 0b1110_1011_1110_1111u64;
+ let s1 = 0b1110_1001_0010_0011u64;
+
+ assert_eq!(_pdep_u64(n, m0), s0);
+ assert_eq!(_pdep_u64(n, m1), s1);
+ }
+
+ #[simd_test(enable = "bmi2")]
+ unsafe fn test_bzhi_u64() {
+ let n = 0b1111_0010u64;
+ let s = 0b0001_0010u64;
+ assert_eq!(_bzhi_u64(n, 5), s);
+ }
+
+ #[simd_test(enable = "bmi2")]
+ #[rustfmt::skip]
+ unsafe fn test_mulx_u64() {
+ let a: u64 = 9_223_372_036_854_775_800;
+ let b: u64 = 100;
+ let mut hi = 0;
+ let lo = _mulx_u64(a, b, &mut hi);
+ /*
+result = 922337203685477580000 =
+0b00110001_1111111111111111_1111111111111111_1111111111111111_1111110011100000
+ ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+ assert_eq!(
+ lo,
+ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64
+ );
+ assert_eq!(hi, 0b00110001u64);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/bswap.rs b/library/stdarch/crates/core_arch/src/x86_64/bswap.rs
new file mode 100644
index 000000000..90a209ce3
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/bswap.rs
@@ -0,0 +1,29 @@
+//! Byte swap intrinsics.
+
+#![allow(clippy::module_name_repetitions)]
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Returns an integer with the reversed byte order of `x`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bswap64)
+#[inline]
+#[cfg_attr(test, assert_instr(bswap))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _bswap64(x: i64) -> i64 {
+ x.swap_bytes()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_bswap64() {
+ unsafe {
+ assert_eq!(_bswap64(0x0EADBEEFFADECA0E), 0x0ECADEFAEFBEAD0E);
+ assert_eq!(_bswap64(0x0000000000000000), 0x0000000000000000);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/bt.rs b/library/stdarch/crates/core_arch/src/x86_64/bt.rs
new file mode 100644
index 000000000..53da9d02f
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/bt.rs
@@ -0,0 +1,135 @@
+use crate::arch::asm;
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+// x32 wants to use a 32-bit address size, but asm! defaults to using the full
+// register name (e.g. rax). We have to explicitly override the placeholder to
+// use the 32-bit register name in that case.
+#[cfg(target_pointer_width = "32")]
+macro_rules! bt {
+ ($inst:expr) => {
+ concat!($inst, " {b}, ({p:e})")
+ };
+}
+#[cfg(target_pointer_width = "64")]
+macro_rules! bt {
+ ($inst:expr) => {
+ concat!($inst, " {b}, ({p})")
+ };
+}
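+
+// For example (illustrative), with 64-bit pointers `bt!("btq")` expands to
+// the template string "btq {b}, ({p})", while on x32 it would expand to
+// "btq {b}, ({p:e})" so that the pointer placeholder is formatted as a
+// 32-bit register name.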
+
+/// Returns the bit in position `b` of the memory addressed by `p`.
+#[inline]
+#[cfg_attr(test, assert_instr(bt))]
+#[stable(feature = "simd_x86_bittest", since = "1.55.0")]
+pub unsafe fn _bittest64(p: *const i64, b: i64) -> u8 {
+ let r: u8;
+ asm!(
+ bt!("btq"),
+ "setc {r}",
+ p = in(reg) p,
+ b = in(reg) b,
+ r = out(reg_byte) r,
+ options(readonly, nostack, pure, att_syntax)
+ );
+ r
+}
+
+/// Returns the bit in position `b` of the memory addressed by `p`, then sets the bit to `1`.
+#[inline]
+#[cfg_attr(test, assert_instr(bts))]
+#[stable(feature = "simd_x86_bittest", since = "1.55.0")]
+pub unsafe fn _bittestandset64(p: *mut i64, b: i64) -> u8 {
+ let r: u8;
+ asm!(
+ bt!("btsq"),
+ "setc {r}",
+ p = in(reg) p,
+ b = in(reg) b,
+ r = out(reg_byte) r,
+ options(nostack, att_syntax)
+ );
+ r
+}
+
+/// Returns the bit in position `b` of the memory addressed by `p`, then resets that bit to `0`.
+#[inline]
+#[cfg_attr(test, assert_instr(btr))]
+#[stable(feature = "simd_x86_bittest", since = "1.55.0")]
+pub unsafe fn _bittestandreset64(p: *mut i64, b: i64) -> u8 {
+ let r: u8;
+ asm!(
+ bt!("btrq"),
+ "setc {r}",
+ p = in(reg) p,
+ b = in(reg) b,
+ r = out(reg_byte) r,
+ options(nostack, att_syntax)
+ );
+ r
+}
+
+/// Returns the bit in position `b` of the memory addressed by `p`, then inverts that bit.
+#[inline]
+#[cfg_attr(test, assert_instr(btc))]
+#[stable(feature = "simd_x86_bittest", since = "1.55.0")]
+pub unsafe fn _bittestandcomplement64(p: *mut i64, b: i64) -> u8 {
+ let r: u8;
+ asm!(
+ bt!("btcq"),
+ "setc {r}",
+ p = in(reg) p,
+ b = in(reg) b,
+ r = out(reg_byte) r,
+ options(nostack, att_syntax)
+ );
+ r
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86_64::*;
+
+ #[test]
+ fn test_bittest64() {
+ unsafe {
+ let a = 0b0101_0000i64;
+ assert_eq!(_bittest64(&a as _, 4), 1);
+ assert_eq!(_bittest64(&a as _, 5), 0);
+ }
+ }
+
+ #[test]
+ fn test_bittestandset64() {
+ unsafe {
+ let mut a = 0b0101_0000i64;
+ assert_eq!(_bittestandset64(&mut a as _, 4), 1);
+ assert_eq!(_bittestandset64(&mut a as _, 4), 1);
+ assert_eq!(_bittestandset64(&mut a as _, 5), 0);
+ assert_eq!(_bittestandset64(&mut a as _, 5), 1);
+ }
+ }
+
+ #[test]
+ fn test_bittestandreset64() {
+ unsafe {
+ let mut a = 0b0101_0000i64;
+ assert_eq!(_bittestandreset64(&mut a as _, 4), 1);
+ assert_eq!(_bittestandreset64(&mut a as _, 4), 0);
+ assert_eq!(_bittestandreset64(&mut a as _, 5), 0);
+ assert_eq!(_bittestandreset64(&mut a as _, 5), 0);
+ }
+ }
+
+ #[test]
+ fn test_bittestandcomplement64() {
+ unsafe {
+ let mut a = 0b0101_0000i64;
+ assert_eq!(_bittestandcomplement64(&mut a as _, 4), 1);
+ assert_eq!(_bittestandcomplement64(&mut a as _, 4), 0);
+ assert_eq!(_bittestandcomplement64(&mut a as _, 4), 1);
+ assert_eq!(_bittestandcomplement64(&mut a as _, 5), 0);
+ assert_eq!(_bittestandcomplement64(&mut a as _, 5), 1);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs b/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs
new file mode 100644
index 000000000..391daed20
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/cmpxchg16b.rs
@@ -0,0 +1,73 @@
+use crate::sync::atomic::Ordering;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Compares and exchanges 16 bytes (128 bits) of data atomically.
+///
+/// This intrinsic corresponds to the `cmpxchg16b` instruction on `x86_64`
+/// processors. It performs an atomic compare-and-swap, updating the `dst`
+/// memory location to `new` if the current value in memory equals `old`.
+///
+/// # Return value
+///
+/// This function returns the previous value at the memory location. If it is
+/// equal to `old` then the memory was updated to `new`.
+///
+/// # Memory Orderings
+///
+/// This atomic operation has the same memory-ordering semantics as
+/// `AtomicUsize::compare_exchange`, only operating on 16 bytes of memory
+/// instead of a pointer-sized value.
+///
+/// For more information on memory orderings here see the `compare_exchange`
+/// documentation for other `Atomic*` types in the standard library.
+///
+/// # Unsafety
+///
+/// This method is unsafe because it takes a raw pointer and will attempt to
+/// read and possibly write the memory at the pointer. The pointer must also be
+/// aligned on a 16-byte boundary.
+///
+/// This method also requires the `cmpxchg16b` CPU feature to be available at
+/// runtime to work correctly. If the CPU running the binary does not actually
+/// support `cmpxchg16b` and the program enters an execution path that
+/// eventually reaches this function, the behavior is undefined.
+///
+/// The `success` ordering must also be stronger than or equal to `failure`,
+/// or this function call is undefined behavior. See the `compare_exchange`
+/// documentation on the `Atomic*` types for more information. Where
+/// `compare_exchange` would panic on an invalid ordering pair, this function
+/// instead aborts the process with an undefined instruction.
+#[inline]
+#[cfg_attr(test, assert_instr(cmpxchg16b, success = Ordering::SeqCst, failure = Ordering::SeqCst))]
+#[target_feature(enable = "cmpxchg16b")]
+pub unsafe fn cmpxchg16b(
+ dst: *mut u128,
+ old: u128,
+ new: u128,
+ success: Ordering,
+ failure: Ordering,
+) -> u128 {
+ use crate::{intrinsics, sync::atomic::Ordering::*};
+
+ debug_assert!(dst as usize % 16 == 0);
+
+ let (val, _ok) = match (success, failure) {
+ (Acquire, Acquire) => intrinsics::atomic_cxchg_acq(dst, old, new),
+ (Release, Relaxed) => intrinsics::atomic_cxchg_rel(dst, old, new),
+ (AcqRel, Acquire) => intrinsics::atomic_cxchg_acqrel(dst, old, new),
+ (Relaxed, Relaxed) => intrinsics::atomic_cxchg_relaxed(dst, old, new),
+ (SeqCst, SeqCst) => intrinsics::atomic_cxchg(dst, old, new),
+ (Acquire, Relaxed) => intrinsics::atomic_cxchg_acq_failrelaxed(dst, old, new),
+ (AcqRel, Relaxed) => intrinsics::atomic_cxchg_acqrel_failrelaxed(dst, old, new),
+ (SeqCst, Relaxed) => intrinsics::atomic_cxchg_failrelaxed(dst, old, new),
+ (SeqCst, Acquire) => intrinsics::atomic_cxchg_failacq(dst, old, new),
+
+ // The above block is all copied from libcore, and this statement is
+ // also copied from libcore except that it's a panic in libcore and we
+ // have a little bit more of a lightweight panic here.
+ _ => crate::core_arch::x86::ud2(),
+ };
+ val
+}
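+
+// A minimal usage sketch (illustrative, not part of the patch): attempt to
+// swap a 16-byte value if it still holds the expected contents. `target` is
+// a hypothetical location that must be 16-byte aligned, and the
+// `cmpxchg16b` feature is assumed to have been verified at runtime.
+//
+//     let mut target: u128 = 1;
+//     let prev = unsafe {
+//         cmpxchg16b(&mut target, 1, 2, Ordering::SeqCst, Ordering::SeqCst)
+//     };
+//     assert_eq!(prev, 1); // the CAS succeeded, so `target` is now 2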
diff --git a/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs b/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs
new file mode 100644
index 000000000..d02702046
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/fxsr.rs
@@ -0,0 +1,112 @@
+//! FXSR floating-point context fast save and restore.
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.fxsave64"]
+ fn fxsave64(p: *mut u8);
+ #[link_name = "llvm.x86.fxrstor64"]
+ fn fxrstor64(p: *const u8);
+}
+
+/// Saves the `x87` FPU, `MMX` technology, `XMM`, and `MXCSR` registers to the
+/// 512-byte-long 16-byte-aligned memory region `mem_addr`.
+///
+/// A misaligned destination operand raises a general-protection (#GP) or an
+/// alignment check exception (#AC).
+///
+/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor].
+///
+/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
+/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave64)
+#[inline]
+#[target_feature(enable = "fxsr")]
+#[cfg_attr(test, assert_instr(fxsave64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _fxsave64(mem_addr: *mut u8) {
+ fxsave64(mem_addr)
+}
+
+/// Restores the `XMM`, `MMX`, `MXCSR`, and `x87` FPU registers from the
+/// 512-byte-long 16-byte-aligned memory region `mem_addr`.
+///
+/// The contents of this memory region should have been written to by a
+/// previous
+/// `_fxsave` or `_fxsave64` intrinsic.
+///
+/// A misaligned destination operand raises a general-protection (#GP) or an
+/// alignment check exception (#AC).
+///
+/// See [`FXSAVE`][fxsave] and [`FXRSTOR`][fxrstor].
+///
+/// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
+/// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxrstor64)
+#[inline]
+#[target_feature(enable = "fxsr")]
+#[cfg_attr(test, assert_instr(fxrstor64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _fxrstor64(mem_addr: *const u8) {
+ fxrstor64(mem_addr)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86_64::*;
+ use std::{cmp::PartialEq, fmt};
+ use stdarch_test::simd_test;
+
+ #[repr(align(16))]
+ struct FxsaveArea {
+ data: [u8; 512], // 512 bytes
+ }
+
+ impl FxsaveArea {
+ fn new() -> FxsaveArea {
+ FxsaveArea { data: [0; 512] }
+ }
+ fn ptr(&mut self) -> *mut u8 {
+ &mut self.data[0] as *mut _ as *mut u8
+ }
+ }
+
+ impl PartialEq<FxsaveArea> for FxsaveArea {
+ fn eq(&self, other: &FxsaveArea) -> bool {
+ for i in 0..self.data.len() {
+ if self.data[i] != other.data[i] {
+ return false;
+ }
+ }
+ true
+ }
+ }
+
+ impl fmt::Debug for FxsaveArea {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "[")?;
+ for i in 0..self.data.len() {
+ write!(f, "{}", self.data[i])?;
+ if i != self.data.len() - 1 {
+ write!(f, ", ")?;
+ }
+ }
+ write!(f, "]")
+ }
+ }
+
+ #[simd_test(enable = "fxsr")]
+ unsafe fn fxsave64() {
+ let mut a = FxsaveArea::new();
+ let mut b = FxsaveArea::new();
+
+ fxsr::_fxsave64(a.ptr());
+ fxsr::_fxrstor64(a.ptr());
+ fxsr::_fxsave64(b.ptr());
+ assert_eq!(a, b);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/macros.rs b/library/stdarch/crates/core_arch/src/x86_64/macros.rs
new file mode 100644
index 000000000..a3ea0e821
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/macros.rs
@@ -0,0 +1,36 @@
+//! Utility macros.
+
+// Helper struct used to trigger const-eval errors when the const generic
+// immediate value `IMM` is not a valid rounding-mode immediate.
+pub(crate) struct ValidateConstRound<const IMM: i32>;
+impl<const IMM: i32> ValidateConstRound<IMM> {
+ pub(crate) const VALID: () = {
+ assert!(
+ IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11,
+ "Invalid IMM value"
+ );
+ };
+}
+
+#[allow(unused)]
+macro_rules! static_assert_rounding {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::x86_64::macros::ValidateConstRound::<$imm>::VALID;
+ };
+}
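+
+// Usage sketch (illustrative; `_some_intrinsic` is a hypothetical name): a
+// rounding intrinsic with a const generic parameter invokes the macro so
+// that an invalid immediate fails during const evaluation instead of
+// producing a bad instruction encoding:
+//
+//     pub unsafe fn _some_intrinsic<const ROUNDING: i32>(a: __m128d) -> __m128d {
+//         static_assert_rounding!(ROUNDING);
+//         /* ... */
+//     }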
+
+// Helper struct used to trigger const-eval errors when the const generic
+// immediate value `IMM` is not a valid SAE (suppress-all-exceptions) immediate.
+pub(crate) struct ValidateConstSae<const IMM: i32>;
+impl<const IMM: i32> ValidateConstSae<IMM> {
+ pub(crate) const VALID: () = {
+ assert!(IMM == 4 || IMM == 8, "Invalid IMM value");
+ };
+}
+
+#[allow(unused)]
+macro_rules! static_assert_sae {
+ ($imm:ident) => {
+ let _ = $crate::core_arch::x86_64::macros::ValidateConstSae::<$imm>::VALID;
+ };
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/mod.rs b/library/stdarch/crates/core_arch/src/x86_64/mod.rs
new file mode 100644
index 000000000..461874ece
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/mod.rs
@@ -0,0 +1,55 @@
+//! `x86_64` intrinsics
+
+#[macro_use]
+mod macros;
+
+mod fxsr;
+pub use self::fxsr::*;
+
+mod sse;
+pub use self::sse::*;
+
+mod sse2;
+pub use self::sse2::*;
+
+mod sse41;
+pub use self::sse41::*;
+
+mod sse42;
+pub use self::sse42::*;
+
+mod xsave;
+pub use self::xsave::*;
+
+mod abm;
+pub use self::abm::*;
+
+mod avx;
+pub use self::avx::*;
+
+mod bmi;
+pub use self::bmi::*;
+
+mod bmi2;
+pub use self::bmi2::*;
+
+mod avx2;
+pub use self::avx2::*;
+
+mod avx512f;
+pub use self::avx512f::*;
+
+mod bswap;
+pub use self::bswap::*;
+
+mod rdrand;
+pub use self::rdrand::*;
+
+mod cmpxchg16b;
+pub use self::cmpxchg16b::*;
+
+mod adx;
+pub use self::adx::*;
+
+mod bt;
+pub use self::bt::*;
diff --git a/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs b/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs
new file mode 100644
index 000000000..e5ec933fb
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/rdrand.rs
@@ -0,0 +1,44 @@
+//! RDRAND and RDSEED instructions for returning random numbers from an Intel
+//! on-chip hardware random number generator which has been seeded by an
+//! on-chip entropy source.
+
+#![allow(clippy::module_name_repetitions)]
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ #[link_name = "llvm.x86.rdrand.64"]
+ fn x86_rdrand64_step() -> (u64, i32);
+ #[link_name = "llvm.x86.rdseed.64"]
+ fn x86_rdseed64_step() -> (u64, i32);
+}
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Reads a hardware-generated 64-bit random value and stores the result in
+/// `val`. Returns 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand64_step)
+#[inline]
+#[target_feature(enable = "rdrand")]
+#[cfg_attr(test, assert_instr(rdrand))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 {
+ let (v, flag) = x86_rdrand64_step();
+ *val = v;
+ flag
+}
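+
+// Usage sketch (illustrative): the instruction can transiently fail, so
+// callers typically retry until the returned flag reports success:
+//
+//     let mut v = 0u64;
+//     while _rdrand64_step(&mut v) == 0 {}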
+
+/// Reads a 64-bit NIST SP800-90B and SP800-90C compliant random value and
+/// stores it in `val`. Returns 1 if a random value was generated, and 0
+/// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed64_step)
+#[inline]
+#[target_feature(enable = "rdseed")]
+#[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _rdseed64_step(val: &mut u64) -> i32 {
+ let (v, flag) = x86_rdseed64_step();
+ *val = v;
+ flag
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse.rs b/library/stdarch/crates/core_arch/src/x86_64/sse.rs
new file mode 100644
index 000000000..ca6799c90
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse.rs
@@ -0,0 +1,148 @@
+//! `x86_64` Streaming SIMD Extensions (SSE)
+
+use crate::core_arch::x86::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sse.cvtss2si64"]
+ fn cvtss2si64(a: __m128) -> i64;
+ #[link_name = "llvm.x86.sse.cvttss2si64"]
+ fn cvttss2si64(a: __m128) -> i64;
+ #[link_name = "llvm.x86.sse.cvtsi642ss"]
+ fn cvtsi642ss(a: __m128, b: i64) -> __m128;
+}
+
+/// Converts the lowest 32 bit float in the input vector to a 64 bit integer.
+///
+/// The result is rounded according to the current rounding mode. If the
+/// result cannot be represented as a 64-bit integer, the result will be
+/// `0x8000_0000_0000_0000` (`i64::MIN`), or an invalid-operation
+/// floating-point exception will be raised if unmasked (see
+/// [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTSS2SI` instruction (with 64 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 {
+ cvtss2si64(a)
+}
+
+/// Converts the lowest 32 bit float in the input vector to a 64 bit integer
+/// with truncation.
+///
+/// The result is always rounded using truncation (round towards zero). If
+/// the result cannot be represented as a 64-bit integer, the result will be
+/// `0x8000_0000_0000_0000` (`i64::MIN`), or an invalid-operation
+/// floating-point exception will be raised if unmasked (see
+/// [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 {
+ cvttss2si64(a)
+}
+
+/// Converts a 64 bit integer to a 32 bit float. The result vector is the input
+/// vector `a` with the lowest 32 bit float replaced by the converted integer.
+///
+/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit
+/// input).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss)
+#[inline]
+#[target_feature(enable = "sse")]
+#[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 {
+ cvtsi642ss(a, b)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arch::x86_64::*;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cvtss_si64() {
+ let inputs = &[
+ (42.0f32, 42i64),
+ (-31.4, -31),
+ (-33.5, -34),
+ (-34.5, -34),
+ (4.0e10, 40_000_000_000),
+ (4.0e-10, 0),
+ (f32::NAN, i64::MIN),
+ (2147483500.1, 2147483520),
+ (9.223371e18, 9223370937343148032),
+ ];
+ for i in 0..inputs.len() {
+ let (xi, e) = inputs[i];
+ let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
+ let r = _mm_cvtss_si64(x);
+ assert_eq!(
+ e, r,
+ "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}",
+ i, x, r, e
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cvttss_si64() {
+ let inputs = &[
+ (42.0f32, 42i64),
+ (-31.4, -31),
+ (-33.5, -33),
+ (-34.5, -34),
+ (10.999, 10),
+ (-5.99, -5),
+ (4.0e10, 40_000_000_000),
+ (4.0e-10, 0),
+ (f32::NAN, i64::MIN),
+ (2147483500.1, 2147483520),
+ (9.223371e18, 9223370937343148032),
+ (9.223372e18, i64::MIN),
+ ];
+ for i in 0..inputs.len() {
+ let (xi, e) = inputs[i];
+ let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
+ let r = _mm_cvttss_si64(x);
+ assert_eq!(
+ e, r,
+ "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}",
+ i, x, r, e
+ );
+ }
+ }
+
+ #[simd_test(enable = "sse")]
+ unsafe fn test_mm_cvtsi64_ss() {
+ let inputs = &[
+ (4555i64, 4555.0f32),
+ (322223333, 322223330.0),
+ (-432, -432.0),
+ (-322223333, -322223330.0),
+ (9223372036854775807, 9.223372e18),
+ (-9223372036854775808, -9.223372e18),
+ ];
+
+ for i in 0..inputs.len() {
+ let (x, f) = inputs[i];
+ let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
+ let r = _mm_cvtsi64_ss(a, x);
+ let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
+ assert_eq_m128(e, r);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs
new file mode 100644
index 000000000..f487a067f
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs
@@ -0,0 +1,209 @@
+//! `x86_64`'s Streaming SIMD Extensions 2 (SSE2)
+
+use crate::{
+ core_arch::{simd_llvm::*, x86::*},
+ intrinsics,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sse2.cvtsd2si64"]
+ fn cvtsd2si64(a: __m128d) -> i64;
+ #[link_name = "llvm.x86.sse2.cvttsd2si64"]
+ fn cvttsd2si64(a: __m128d) -> i64;
+}
+
+/// Converts the lower double-precision (64-bit) floating-point element in
+/// `a` to a 64-bit integer.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 {
+ cvtsd2si64(a)
+}
+
+/// Alias for `_mm_cvtsd_si64`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 {
+ _mm_cvtsd_si64(a)
+}
+
+/// Converts the lower double-precision (64-bit) floating-point element in `a`
+/// to a 64-bit integer with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 {
+ cvttsd2si64(a)
+}
+
+/// Alias for `_mm_cvttsd_si64`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
+ _mm_cvttsd_si64(a)
+}
+
+/// Stores a 64-bit integer value in the specified memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(movnti))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
+ intrinsics::nontemporal_store(mem_addr, a);
+}
+
+/// Returns a vector whose lowest element is `a` and all higher elements are
+/// `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i {
+ _mm_set_epi64x(0, a)
+}
+
+/// Returns a vector whose lowest element is `a` and all higher elements are
+/// `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i {
+ _mm_cvtsi64_si128(a)
+}
+
+/// Returns the lowest element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 {
+ simd_extract(a.as_i64x2(), 0)
+}
+
+/// Returns the lowest element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 {
+ _mm_cvtsi128_si64(a)
+}
+
+/// Returns `a` with its lower element replaced by `b` after converting it to
+/// an `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d {
+ simd_insert(a, 0, b as f64)
+}
+
+/// Returns `a` with its lower element replaced by `b` after converting it to
+/// an `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd)
+#[inline]
+#[target_feature(enable = "sse2")]
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_cvtsi64x_sd(a: __m128d, b: i64) -> __m128d {
+ _mm_cvtsi64_sd(a, b)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arch::x86_64::*;
+ use std::boxed;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsd_si64() {
+ let r = _mm_cvtsd_si64(_mm_setr_pd(-2.0, 5.0));
+ assert_eq!(r, -2_i64);
+
+ let r = _mm_cvtsd_si64(_mm_setr_pd(f64::MAX, f64::MIN));
+ assert_eq!(r, i64::MIN);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsd_si64x() {
+ let r = _mm_cvtsd_si64x(_mm_setr_pd(f64::NAN, f64::NAN));
+ assert_eq!(r, i64::MIN);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvttsd_si64() {
+ let a = _mm_setr_pd(-1.1, 2.2);
+ let r = _mm_cvttsd_si64(a);
+ assert_eq!(r, -1_i64);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvttsd_si64x() {
+ let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
+ let r = _mm_cvttsd_si64x(a);
+ assert_eq!(r, i64::MIN);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_stream_si64() {
+ let a: i64 = 7;
+ let mut mem = boxed::Box::<i64>::new(-1);
+ _mm_stream_si64(&mut *mem as *mut i64, a);
+ assert_eq!(a, *mem);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsi64_si128() {
+ let r = _mm_cvtsi64_si128(5);
+ assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsi128_si64() {
+ let r = _mm_cvtsi128_si64(_mm_setr_epi64x(5, 0));
+ assert_eq!(r, 5);
+ }
+
+ #[simd_test(enable = "sse2")]
+ unsafe fn test_mm_cvtsi64_sd() {
+ let a = _mm_set1_pd(3.5);
+ let r = _mm_cvtsi64_sd(a, 5);
+ assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse41.rs b/library/stdarch/crates/core_arch/src/x86_64/sse41.rs
new file mode 100644
index 000000000..3d1ea0cf6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse41.rs
@@ -0,0 +1,62 @@
+//! `x86_64`'s Streaming SIMD Extensions 4.1 (SSE4.1)
+
+use crate::{
+ core_arch::{simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Extracts a 64-bit integer from `a`, selected with `IMM1`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(pextrq, IMM1 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_extract_epi64<const IMM1: i32>(a: __m128i) -> i64 {
+ static_assert_imm1!(IMM1);
+ simd_extract(a.as_i64x2(), IMM1 as u32)
+}
+
+/// Returns a copy of `a` with the 64-bit integer from `i` inserted at a
+/// location specified by `IMM1`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi64)
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[cfg_attr(test, assert_instr(pinsrq, IMM1 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_insert_epi64<const IMM1: i32>(a: __m128i, i: i64) -> __m128i {
+ static_assert_imm1!(IMM1);
+ transmute(simd_insert(a.as_i64x2(), IMM1 as u32, i))
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arch::x86_64::*;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_extract_epi64() {
+ let a = _mm_setr_epi64x(0, 1);
+ let r = _mm_extract_epi64::<1>(a);
+ assert_eq!(r, 1);
+ let r = _mm_extract_epi64::<0>(a);
+ assert_eq!(r, 0);
+ }
+
+ #[simd_test(enable = "sse4.1")]
+ unsafe fn test_mm_insert_epi64() {
+ let a = _mm_set1_epi64x(0);
+ let e = _mm_setr_epi64x(0, 32);
+ let r = _mm_insert_epi64::<1>(a, 32);
+ assert_eq_m128i(r, e);
+ let e = _mm_setr_epi64x(32, 0);
+ let r = _mm_insert_epi64::<0>(a, 32);
+ assert_eq_m128i(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse42.rs b/library/stdarch/crates/core_arch/src/x86_64/sse42.rs
new file mode 100644
index 000000000..6b5d087c1
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse42.rs
@@ -0,0 +1,37 @@
+//! `x86_64`'s Streaming SIMD Extensions 4.2 (SSE4.2)
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.sse42.crc32.64.64"]
+ fn crc32_64_64(crc: u64, v: u64) -> u64;
+}
+
+/// Starting with the initial value in `crc`, returns the accumulated
+/// CRC32-C value for the unsigned 64-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u64)
+#[inline]
+#[target_feature(enable = "sse4.2")]
+#[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_crc32_u64(crc: u64, v: u64) -> u64 {
+ crc32_64_64(crc, v)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arch::x86_64::*;
+
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "sse4.2")]
+ unsafe fn test_mm_crc32_u64() {
+ let crc = 0x7819dccd3e824;
+ let v = 0x2a22b845fed;
+ let i = _mm_crc32_u64(crc, v);
+ assert_eq!(i, 0xbb6cdc6c);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/xsave.rs b/library/stdarch/crates/core_arch/src/x86_64/xsave.rs
new file mode 100644
index 000000000..2afd3e433
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/xsave.rs
@@ -0,0 +1,227 @@
+//! `x86_64`'s `xsave` and `xsaveopt` target feature intrinsics
+
+#![allow(clippy::module_name_repetitions)]
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.xsave64"]
+ fn xsave64(p: *mut u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xrstor64"]
+ fn xrstor64(p: *const u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xsaveopt64"]
+ fn xsaveopt64(p: *mut u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xsavec64"]
+ fn xsavec64(p: *mut u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xsaves64"]
+ fn xsaves64(p: *mut u8, hi: u32, lo: u32);
+ #[link_name = "llvm.x86.xrstors64"]
+ fn xrstors64(p: *const u8, hi: u32, lo: u32);
+}
+
+/// Performs a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits `[62:0]` in `save_mask` and XCR0.
+/// `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of
+/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsave64)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xsave64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) {
+ xsave64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
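+
+// Illustrative sketch (not part of the patch): each bit of `save_mask`
+// selects a state component (bit 0 = x87, bit 1 = SSE, bit 2 = AVX, ...).
+// Saving only the x87 and SSE state into a hypothetical 64-byte-aligned
+// buffer `area` of sufficient size:
+//
+//     _xsave64(area.as_mut_ptr(), 0b11);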
+
+/// Performs a full or partial restore of the enabled processor states using
+/// the state information stored in memory at `mem_addr`.
+///
+/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstor64)
+#[inline]
+#[target_feature(enable = "xsave")]
+#[cfg_attr(test, assert_instr(xrstor64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) {
+ xrstor64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
+}
+
+/// Performs a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits `[62:0]` in `save_mask` and `XCR0`.
+/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize
+/// the manner in which data is saved. The performance of this instruction will
+/// be equal to or better than using the `XSAVE64` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaveopt64)
+#[inline]
+#[target_feature(enable = "xsave,xsaveopt")]
+#[cfg_attr(test, assert_instr(xsaveopt64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
+ xsaveopt64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Performs a full or partial save of the enabled processor states to memory
+/// at `mem_addr`.
+///
+/// `xsavec` differs from `xsave` in that it uses compaction and that it may
+/// use init optimization. State is saved based on bits `[62:0]` in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsavec64)
+#[inline]
+#[target_feature(enable = "xsave,xsavec")]
+#[cfg_attr(test, assert_instr(xsavec64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
+ xsavec64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Performs a full or partial save of the enabled processor states to memory
+/// at `mem_addr`.
+///
+/// `xsaves` differs from `xsave` in that it can save state components
+/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the
+/// modified optimization. State is saved based on bits `[62:0]` in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaves64)
+#[inline]
+#[target_feature(enable = "xsave,xsaves")]
+#[cfg_attr(test, assert_instr(xsaves64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) {
+ xsaves64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
+}
+
+/// Performs a full or partial restore of the enabled processor states using the
+/// state information stored in memory at `mem_addr`.
+///
+/// `xrstors` differs from `xrstor` in that it can restore state components
+/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore
+/// from an `xsave` area in which the extended region is in the standard form.
+/// State is restored based on bits `[62:0]` in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstors64)
+#[inline]
+#[target_feature(enable = "xsave,xsaves")]
+#[cfg_attr(test, assert_instr(xrstors64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {
+ xrstors64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
+}
+
+// FIXME: https://github.com/rust-lang/stdarch/issues/209
+// All these tests fail with Intel SDE.
+/*
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::x86::x86_64::xsave;
+ use stdarch_test::simd_test;
+ use std::fmt;
+
+ // FIXME: https://github.com/rust-lang/stdarch/issues/209
+ #[repr(align(64))]
+ struct XsaveArea {
+ // max size for 256-bit registers is 800 bytes:
+ // see https://software.intel.com/en-us/node/682996
+ // max size for 512-bit registers is 2560 bytes:
+ // FIXME: add source
+ data: [u8; 2560],
+ }
+
+ impl XsaveArea {
+ fn new() -> XsaveArea {
+ XsaveArea { data: [0; 2560] }
+ }
+ fn ptr(&mut self) -> *mut u8 {
+ &mut self.data[0] as *mut _ as *mut u8
+ }
+ }
+
+ impl PartialEq<XsaveArea> for XsaveArea {
+ fn eq(&self, other: &XsaveArea) -> bool {
+ for i in 0..self.data.len() {
+ if self.data[i] != other.data[i] {
+ return false;
+ }
+ }
+ true
+ }
+ }
+
+ impl fmt::Debug for XsaveArea {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "[")?;
+ for i in 0..self.data.len() {
+ write!(f, "{}", self.data[i])?;
+ if i != self.data.len() - 1 {
+ write!(f, ", ")?;
+ }
+ }
+ write!(f, "]")
+ }
+ }
+
+ #[simd_test(enable = "xsave")]
+ unsafe fn xsave64() {
+ let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+ let mut a = XsaveArea::new();
+ let mut b = XsaveArea::new();
+
+ xsave::_xsave64(a.ptr(), m);
+ xsave::_xrstor64(a.ptr(), m);
+ xsave::_xsave64(b.ptr(), m);
+ assert_eq!(a, b);
+ }
+
+ #[simd_test(enable = "xsave,xsaveopt")]
+ unsafe fn xsaveopt64() {
+ let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+ let mut a = XsaveArea::new();
+ let mut b = XsaveArea::new();
+
+ xsave::_xsaveopt64(a.ptr(), m);
+ xsave::_xrstor64(a.ptr(), m);
+ xsave::_xsaveopt64(b.ptr(), m);
+ assert_eq!(a, b);
+ }
+
+ #[simd_test(enable = "xsave,xsavec")]
+ unsafe fn xsavec64() {
+ let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+ let mut a = XsaveArea::new();
+ let mut b = XsaveArea::new();
+
+ xsave::_xsavec64(a.ptr(), m);
+ xsave::_xrstor64(a.ptr(), m);
+ xsave::_xsavec64(b.ptr(), m);
+ assert_eq!(a, b);
+ }
+
+ #[simd_test(enable = "xsave,xsaves")]
+ unsafe fn xsaves64() {
+ let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+ let mut a = XsaveArea::new();
+ let mut b = XsaveArea::new();
+
+ xsave::_xsaves64(a.ptr(), m);
+ xsave::_xrstors64(a.ptr(), m);
+ xsave::_xsaves64(b.ptr(), m);
+ assert_eq!(a, b);
+ }
+}
+*/
diff --git a/library/stdarch/crates/core_arch/tests/cpu-detection.rs b/library/stdarch/crates/core_arch/tests/cpu-detection.rs
new file mode 100644
index 000000000..61f5f0905
--- /dev/null
+++ b/library/stdarch/crates/core_arch/tests/cpu-detection.rs
@@ -0,0 +1,63 @@
+#![feature(stdsimd)]
+#![allow(clippy::unwrap_used, clippy::print_stdout, clippy::use_debug)]
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[macro_use]
+extern crate std_detect;
+
+#[test]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn x86_all() {
+ println!("sse: {:?}", is_x86_feature_detected!("sse"));
+ println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
+ println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
+ println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
+ println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
+ println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
+ println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
+ println!("avx: {:?}", is_x86_feature_detected!("avx"));
+ println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
+ println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
+ println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
+ println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
+ println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
+ println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
+ println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
+ println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
+ println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
+ println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
+ println!(
+ "avx512_vpopcntdq {:?}",
+ is_x86_feature_detected!("avx512vpopcntdq")
+ );
+ println!("avx512vbmi2 {:?}", is_x86_feature_detected!("avx512vbmi2"));
+ println!("avx512gfni {:?}", is_x86_feature_detected!("avx512gfni"));
+ println!("avx512vaes {:?}", is_x86_feature_detected!("avx512vaes"));
+ println!(
+ "avx512vpclmulqdq {:?}",
+ is_x86_feature_detected!("avx512vpclmulqdq")
+ );
+ println!("avx512vnni {:?}", is_x86_feature_detected!("avx512vnni"));
+ println!(
+ "avx512bitalg {:?}",
+ is_x86_feature_detected!("avx512bitalg")
+ );
+ println!("avx512bf16 {:?}", is_x86_feature_detected!("avx512bf16"));
+ println!(
+ "avx512vp2intersect {:?}",
+ is_x86_feature_detected!("avx512vp2intersect")
+ );
+ println!("f16c: {:?}", is_x86_feature_detected!("f16c"));
+ println!("fma: {:?}", is_x86_feature_detected!("fma"));
+ println!("abm: {:?}", is_x86_feature_detected!("abm"));
+ println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
+ println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
+ println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
+ println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
+ println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
+ println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
+ println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
+ println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
+ println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
+ println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
+}
diff --git a/library/stdarch/crates/intrinsic-test/Cargo.toml b/library/stdarch/crates/intrinsic-test/Cargo.toml
new file mode 100644
index 000000000..5fde23c9e
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "intrinsic-test"
+version = "0.1.0"
+authors = ["Jamie Cunliffe <Jamie.Cunliffe@arm.com>"]
+edition = "2018"
+
+[dependencies]
+lazy_static = "1.4.0"
+serde = { version = "1", features = ["derive"] }
+csv = "1.1"
+clap = "2.33.3"
+regex = "1.4.2"
+log = "0.4.11"
+pretty_env_logger = "0.4.0"
+rayon = "1.5.0"
+diff = "0.1.12"
+itertools = "0.10.1" \ No newline at end of file
diff --git a/library/stdarch/crates/intrinsic-test/README.md b/library/stdarch/crates/intrinsic-test/README.md
new file mode 100644
index 000000000..8a8ddab40
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/README.md
@@ -0,0 +1,24 @@
+Generate and run programs using equivalent C and Rust intrinsics, checking that
+each produces the same result from random inputs.
+
+# Usage
+```
+USAGE:
+ intrinsic-test [OPTIONS] <INPUT>
+
+FLAGS:
+ -h, --help Prints help information
+ -V, --version Prints version information
+
+OPTIONS:
+ --cppcompiler <CPPCOMPILER> The C++ compiler to use for compiling the c++ code [default: clang++]
+ --runner <RUNNER> Run the C programs under emulation with this command
+ --toolchain <TOOLCHAIN> The rust toolchain to use for building the rust code
+
+ARGS:
+ <INPUT> The input file containing the intrinsics
+```
+
+The input CSV is the Arm Neon tracking Google sheet
+(https://docs.google.com/spreadsheets/d/1MqW1g8c7tlhdRWQixgdWvR4uJHNZzCYAf4V0oHjZkwA/edit#gid=0)
+that contains the intrinsic list. The "done percentage" column should be
+renamed to "enabled".
+
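+For example, to test the AArch64 intrinsics under emulation (the CSV filename,
+runner command, and toolchain name here are illustrative):
+
+```
+intrinsic-test intrinsics.csv --runner qemu-aarch64 --toolchain nightly
+```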
diff --git a/library/stdarch/crates/intrinsic-test/missing_aarch64.txt b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt
new file mode 100644
index 000000000..56ec274b5
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt
@@ -0,0 +1,110 @@
+# Not implemented in stdarch yet
+vbfdot_f32
+vbfdot_lane_f32
+vbfdot_laneq_f32
+vbfdotq_f32
+vbfdotq_lane_f32
+vbfdotq_laneq_f32
+vbfmlalbq_f32
+vbfmlalbq_lane_f32
+vbfmlalbq_laneq_f32
+vbfmlaltq_f32
+vbfmlaltq_lane_f32
+vbfmlaltq_laneq_f32
+vbfmmlaq_f32
+vsudot_laneq_s32
+vsudot_lane_s32
+vsudotq_laneq_s32
+vsudotq_lane_s32
+vusdot_laneq_s32
+vusdot_lane_s32
+vusdotq_laneq_s32
+vusdotq_lane_s32
+vusdotq_s32
+vusdot_s32
+
+# Implemented in Clang but missing from CSV
+vcmla_f64
+vcmla_lane_f64
+vcmla_laneq_f64
+vcmlaq_lane_f64
+vcmlaq_laneq_f64
+vcmlaq_rot180_lane_f64
+vcmlaq_rot180_laneq_f64
+vcmlaq_rot270_lane_f64
+vcmlaq_rot270_laneq_f64
+vcmlaq_rot90_lane_f64
+vcmlaq_rot90_laneq_f64
+vcmla_rot180_f64
+vcmla_rot180_lane_f64
+vcmla_rot180_laneq_f64
+vcmla_rot270_f64
+vcmla_rot270_lane_f64
+vcmla_rot270_laneq_f64
+vcmla_rot90_f64
+vcmla_rot90_lane_f64
+vcmla_rot90_laneq_f64
+
+# Implemented in Clang and stdarch but missing from CSV
+vmov_n_p64
+vmovq_n_p64
+vreinterpret_f32_p64
+vreinterpret_p64_s64
+vreinterpretq_f32_p128
+vreinterpretq_f32_p64
+vreinterpretq_p128_p64
+vreinterpretq_p64_p128
+vtst_p16
+vtstq_p16
+
+# Missing from both Clang and stdarch
+vrnd32x_f64
+vrnd32xq_f64
+vrnd32z_f64
+vrnd32zq_f64
+vrnd64x_f64
+vrnd64xq_f64
+vrnd64z_f64
+vrnd64zq_f64
+
+# Takes too long to compile tests
+vcopyq_laneq_u8
+vcopyq_laneq_s8
+vcopyq_laneq_p8
+vcopyq_lane_u8
+vcopyq_lane_s8
+vcopyq_lane_p8
+vcopy_laneq_u8
+vcopy_laneq_s8
+vcopy_laneq_p8
+vcopy_lane_u8
+vcopy_lane_s8
+vcopy_lane_p8
+
+# QEMU 6.0 doesn't support these instructions
+vmmlaq_s32
+vmmlaq_u32
+vsm3partw1q_u32
+vsm3partw2q_u32
+vsm3ss1q_u32
+vsm3tt1aq_u32
+vsm3tt1bq_u32
+vsm3tt2aq_u32
+vsm3tt2bq_u32
+vsm4ekeyq_u32
+vsm4eq_u32
+vusmmlaq_s32
+
+# LLVM select error in debug builds
+vqshlu_n_s16
+vqshlu_n_s32
+vqshlu_n_s64
+vqshlu_n_s8
+vqshlub_n_s8
+vqshlud_n_s64
+vqshluh_n_s16
+vqshluq_n_s16
+vqshluq_n_s32
+vqshluq_n_s64
+vqshluq_n_s8
+vqshlus_n_s32
diff --git a/library/stdarch/crates/intrinsic-test/missing_arm.txt b/library/stdarch/crates/intrinsic-test/missing_arm.txt
new file mode 100644
index 000000000..bbc8de584
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/missing_arm.txt
@@ -0,0 +1,334 @@
+# Not implemented in stdarch yet
+vbfdot_f32
+vbfdot_lane_f32
+vbfdot_laneq_f32
+vbfdotq_f32
+vbfdotq_lane_f32
+vbfdotq_laneq_f32
+vbfmlalbq_f32
+vbfmlalbq_lane_f32
+vbfmlalbq_laneq_f32
+vbfmlaltq_f32
+vbfmlaltq_lane_f32
+vbfmlaltq_laneq_f32
+vbfmmlaq_f32
+vsudot_laneq_s32
+vsudot_lane_s32
+vsudotq_laneq_s32
+vsudotq_lane_s32
+vusdot_laneq_s32
+vusdot_lane_s32
+vusdotq_laneq_s32
+vusdotq_lane_s32
+vusdotq_s32
+vusdot_s32
+
+# Implemented in Clang and stdarch but missing from CSV
+vtst_p16
+vtstq_p16
+
+# QEMU 6.0 doesn't support these instructions
+vmmlaq_s32
+vmmlaq_u32
+vusmmlaq_s32
+
+# Implemented in Clang and stdarch for A64 only even though CSV claims A32 support
+__crc32d
+__crc32cd
+vaddq_p64
+vbsl_p64
+vbslq_p64
+vceq_p64
+vceqq_p64
+vceqz_p64
+vceqzq_p64
+vcombine_p64
+vcopy_lane_p64
+vcopy_laneq_p64
+vcopyq_lane_p64
+vcopyq_laneq_p64
+vcreate_p64
+vdup_lane_p64
+vdup_n_p64
+vdupq_lane_p64
+vdupq_n_p64
+vext_p64
+vextq_p64
+vget_high_p64
+vget_lane_p64
+vget_low_p64
+vgetq_lane_p64
+vmovn_high_s16
+vmovn_high_s32
+vmovn_high_s64
+vmovn_high_u16
+vmovn_high_u32
+vmovn_high_u64
+vmull_high_p64
+vmull_p64
+vreinterpret_p16_p64
+vreinterpret_p64_f32
+vreinterpret_p64_p16
+vreinterpret_p64_p8
+vreinterpret_p64_s16
+vreinterpret_p64_s32
+vreinterpret_p64_s8
+vreinterpret_p64_u16
+vreinterpret_p64_u32
+vreinterpret_p64_u64
+vreinterpret_p64_u8
+vreinterpret_p8_p64
+vreinterpretq_f64_u64
+vreinterpretq_p128_f32
+vreinterpretq_p128_p16
+vreinterpretq_p128_p8
+vreinterpretq_p128_s16
+vreinterpretq_p128_s32
+vreinterpretq_p128_s64
+vreinterpretq_p128_s8
+vreinterpretq_p128_u16
+vreinterpretq_p128_u32
+vreinterpretq_p128_u64
+vreinterpretq_p128_u8
+vreinterpretq_p16_p64
+vreinterpretq_p64_f32
+vreinterpretq_p64_p16
+vreinterpretq_p64_p8
+vreinterpretq_p64_s16
+vreinterpretq_p64_s32
+vreinterpretq_p64_s64
+vreinterpretq_p64_s8
+vreinterpretq_p64_u16
+vreinterpretq_p64_u32
+vreinterpretq_p64_u64
+vreinterpretq_p64_u8
+vreinterpretq_p8_p64
+vreinterpretq_s16_p64
+vreinterpretq_s32_p64
+vreinterpretq_s64_p64
+vreinterpretq_s8_p64
+vreinterpretq_u16_p64
+vreinterpretq_u32_p64
+vreinterpretq_u64_p64
+vreinterpretq_u8_p64
+vreinterpret_s16_p64
+vreinterpret_s32_p64
+vreinterpret_s64_p64
+vreinterpret_s8_p64
+vreinterpret_u16_p64
+vreinterpret_u32_p64
+vreinterpret_u64_p64
+vreinterpret_u8_p64
+vrndn_f64
+vrndnq_f64
+vset_lane_p64
+vsetq_lane_p64
+vsli_n_p64
+vsliq_n_p64
+vsri_n_p64
+vsriq_n_p64
+vtst_p64
+vtstq_p64
+
+# Present in Clang header but triggers an ICE due to lack of backend support.
+vcmla_f32
+vcmla_lane_f32
+vcmla_laneq_f32
+vcmla_rot180_f32
+vcmla_rot180_lane_f32
+vcmla_rot180_laneq_f32
+vcmla_rot270_f32
+vcmla_rot270_lane_f32
+vcmla_rot270_laneq_f32
+vcmla_rot90_f32
+vcmla_rot90_lane_f32
+vcmla_rot90_laneq_f32
+vcmlaq_f32
+vcmlaq_lane_f32
+vcmlaq_laneq_f32
+vcmlaq_rot180_f32
+vcmlaq_rot180_lane_f32
+vcmlaq_rot180_laneq_f32
+vcmlaq_rot270_f32
+vcmlaq_rot270_lane_f32
+vcmlaq_rot270_laneq_f32
+vcmlaq_rot90_f32
+vcmlaq_rot90_lane_f32
+vcmlaq_rot90_laneq_f32
+
+# Implemented in stdarch for A64 only, Clang support both A32/A64
+vadd_s64
+vadd_u64
+vcaddq_rot270_f32
+vcaddq_rot90_f32
+vcadd_rot270_f32
+vcadd_rot90_f32
+vcombine_f32
+vcombine_p16
+vcombine_p8
+vcombine_s16
+vcombine_s32
+vcombine_s64
+vcombine_s8
+vcombine_u16
+vcombine_u32
+vcombine_u64
+vcombine_u8
+vcvtaq_s32_f32
+vcvtaq_u32_f32
+vcvta_s32_f32
+vcvta_u32_f32
+vcvtmq_s32_f32
+vcvtmq_u32_f32
+vcvtm_s32_f32
+vcvtm_u32_f32
+vcvtnq_s32_f32
+vcvtnq_u32_f32
+vcvtn_s32_f32
+vcvtn_u32_f32
+vcvtpq_s32_f32
+vcvtpq_u32_f32
+vcvtp_s32_f32
+vcvtp_u32_f32
+vdot_lane_s32
+vdot_lane_u32
+vdotq_lane_s32
+vdotq_lane_u32
+vdotq_s32
+vdotq_u32
+vdot_s32
+vdot_u32
+vqdmulh_lane_s16
+vqdmulh_lane_s32
+vqdmulhq_lane_s16
+vqdmulhq_lane_s32
+vrnda_f32
+vrndaq_f32
+vrnd_f32
+vrndi_f32
+vrndiq_f32
+vrndm_f32
+vrndmq_f32
+vrndns_f32
+vrndp_f32
+vrndpq_f32
+vrndq_f32
+vrndx_f32
+vrndxq_f32
+
+# LLVM select error in debug builds
+vqrshrn_n_s16
+vqrshrn_n_s32
+vqrshrn_n_s64
+vqrshrn_n_u16
+vqrshrn_n_u32
+vqrshrn_n_u64
+vqrshrun_n_s16
+vqrshrun_n_s32
+vqrshrun_n_s64
+vqshrn_n_s16
+vqshrn_n_s32
+vqshrn_n_s64
+vqshrn_n_u16
+vqshrn_n_u32
+vqshrn_n_u64
+vqshrun_n_s16
+vqshrun_n_s32
+vqshrun_n_s64
+vrshrn_n_s16
+vrshrn_n_s32
+vrshrn_n_s64
+vrshrn_n_u16
+vrshrn_n_u32
+vrshrn_n_u64
+vshrq_n_u64
+vshr_n_u64
+
+# Failing tests: stdarch has incorrect results compared to Clang
+vqshlu_n_s16
+vqshlu_n_s32
+vqshlu_n_s64
+vqshlu_n_s8
+vqshluq_n_s16
+vqshluq_n_s32
+vqshluq_n_s64
+vqshluq_n_s8
+vsli_n_p16
+vsli_n_p8
+vsli_n_s16
+vsli_n_s32
+vsli_n_s64
+vsli_n_s8
+vsli_n_u16
+vsli_n_u32
+vsli_n_u64
+vsli_n_u8
+vsliq_n_p16
+vsliq_n_p8
+vsliq_n_s16
+vsliq_n_s32
+vsliq_n_s64
+vsliq_n_s8
+vsliq_n_u16
+vsliq_n_u32
+vsliq_n_u64
+vsliq_n_u8
+vsri_n_p16
+vsri_n_p8
+vsri_n_s16
+vsri_n_s32
+vsri_n_s64
+vsri_n_s8
+vsri_n_u16
+vsri_n_u32
+vsri_n_u64
+vsri_n_u8
+vsriq_n_p16
+vsriq_n_p8
+vsriq_n_s16
+vsriq_n_s32
+vsriq_n_s64
+vsriq_n_s8
+vsriq_n_u16
+vsriq_n_u32
+vsriq_n_u64
+vsriq_n_u8
+
+# These produce a different result on Clang depending on the optimization level.
+# This is definitely a bug in LLVM.
+vadd_f32
+vaddq_f32
+vcvt_s32_f32
+vcvt_u32_f32
+vcvtq_s32_f32
+vcvtq_u32_f32
+vfma_f32
+vfma_n_f32
+vfmaq_f32
+vfmaq_n_f32
+vfms_f32
+vfmsq_f32
+vmla_f32
+vmla_lane_f32
+vmla_n_f32
+vmlaq_f32
+vmlaq_lane_f32
+vmlaq_n_f32
+vmls_f32
+vmls_lane_f32
+vmls_n_f32
+vmlsq_f32
+vmlsq_lane_f32
+vmlsq_n_f32
+vmul_lane_f32
+vmul_n_f32
+vmulq_lane_f32
+vmulq_n_f32
diff --git a/library/stdarch/crates/intrinsic-test/src/acle_csv_parser.rs b/library/stdarch/crates/intrinsic-test/src/acle_csv_parser.rs
new file mode 100644
index 000000000..d7b066485
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/src/acle_csv_parser.rs
@@ -0,0 +1,319 @@
+use itertools::Itertools;
+use regex::Regex;
+use serde::Deserialize;
+
+use crate::argument::{Argument, ArgumentList, Constraint};
+use crate::intrinsic::Intrinsic;
+use crate::types::{IntrinsicType, TypeKind};
+
+pub fn get_acle_intrinsics(filename: &str) -> Vec<Intrinsic> {
+ let data = std::fs::read_to_string(filename).expect("Failed to open ACLE intrinsics file");
+
+ let data = data
+ .lines()
+ .filter_map(|l| {
+ (!(l.starts_with("<COMMENT>") || l.is_empty() || l.starts_with("<SECTION>")))
+ .then(|| l.replace("<HEADER>\t", ""))
+ })
+ .join("\n");
+
+ let mut csv_reader = csv::ReaderBuilder::new()
+ .delimiter(b'\t')
+ .from_reader(data.as_bytes());
+
+ let mut intrinsics: Vec<Intrinsic> = csv_reader
+ .deserialize()
+ .filter_map(|x: Result<ACLEIntrinsicLine, _>| x.ok().map(|i| i.into()))
+ .collect::<Vec<_>>();
+
+ // Intrinsics such as vshll_n_s8 exist twice in the ACLE with different constraints.
+ intrinsics.sort_by(|a, b| a.name.cmp(&b.name));
+ let (intrinsics, duplicates) = intrinsics.partition_dedup_by(|a, b| a.name == b.name);
+ for duplicate in duplicates {
+ let name = &duplicate.name;
+ let constraints = duplicate
+ .arguments
+ .args
+ .drain(..)
+ .filter(|a| a.has_constraint());
+ let intrinsic = intrinsics.iter_mut().find(|i| &i.name == name).unwrap();
+
+ for mut constraint in constraints {
+ let real_constraint = intrinsic
+ .arguments
+ .args
+ .iter_mut()
+ .find(|a| a.name == constraint.name)
+ .unwrap();
+ real_constraint
+ .constraints
+ .push(constraint.constraints.pop().unwrap());
+ }
+ }
+
+ intrinsics.to_vec()
+}
+
+impl Into<Intrinsic> for ACLEIntrinsicLine {
+ fn into(self) -> Intrinsic {
+ let signature = self.intrinsic;
+ let (ret_ty, remaining) = signature.split_once(' ').unwrap();
+
+ let results = type_from_c(ret_ty)
+ .unwrap_or_else(|_| panic!("Failed to parse return type: {}", ret_ty));
+
+ let (name, args) = remaining.split_once('(').unwrap();
+ let args = args.trim_end_matches(')');
+
+ // Typo in ACLE data
+ let args = args.replace("int16x8q_t", "int16x8_t");
+
+ let arg_prep = self.argument_preparation.as_str();
+ let args = args
+ .split(',')
+ .enumerate()
+ .map(move |(idx, arg)| {
+ let arg = arg.trim();
+ if arg.starts_with("__builtin_constant_p") {
+ handle_constraint(idx, arg, arg_prep)
+ } else {
+ from_c(idx, arg)
+ }
+ })
+ .collect();
+ let arguments = ArgumentList { args };
+ let a64_only = match &*self.supported_architectures {
+ "A64" => true,
+ "v7/A32/A64" | "A32/A64" => false,
+ _ => panic!("Invalid supported architectures"),
+ };
+
+ Intrinsic {
+ name: name.to_string(),
+ arguments,
+ results,
+ a64_only,
+ }
+ }
+}
+
+fn handle_constraint(idx: usize, arg: &str, prep: &str) -> Argument {
+ let prep = prep.replace(' ', "");
+
+ let name = arg
+ .trim_start_matches("__builtin_constant_p")
+ .trim_start_matches(|ref c| c == &' ' || c == &'(')
+ .trim_end_matches(')')
+ .to_string();
+
+ let ty = IntrinsicType::Type {
+ constant: true,
+ kind: TypeKind::Int,
+ bit_len: Some(32),
+ simd_len: None,
+ vec_len: None,
+ };
+
+ let constraints = prep
+ .split(';')
+ .find_map(|p| handle_range_constraint(&name, p).or_else(|| handle_eq_constraint(&name, p)))
+ .map(|c| vec![c])
+ .unwrap_or_default();
+
+ Argument {
+ pos: idx,
+ name,
+ ty,
+ constraints,
+ }
+}
+
+fn handle_range_constraint(name: &str, data: &str) -> Option<Constraint> {
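+ // Matches constraints of the form `0<=name<=7` from the "Argument
+ // preparation" column; the CSV bounds are inclusive, so 1 is added to
+ // the upper bound to build an exclusive `Range`.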
+ lazy_static! {
+ static ref RANGE_CONSTRAINT: Regex =
+ Regex::new(r#"([0-9]+)<=([[:alnum:]]+)<=([0-9]+)"#).unwrap();
+ }
+
+ let captures = RANGE_CONSTRAINT.captures(data)?;
+ if captures.get(2).map(|c| c.as_str() == name).unwrap_or(false) {
+ match (captures.get(1), captures.get(3)) {
+ (Some(start), Some(end)) => {
+ let start = start.as_str().parse::<i64>().unwrap();
+ let end = end.as_str().parse::<i64>().unwrap() + 1;
+ Some(Constraint::Range(start..end))
+ }
+ _ => panic!("Invalid constraint"),
+ }
+ } else {
+ None
+ }
+}
+
+fn handle_eq_constraint(name: &str, data: &str) -> Option<Constraint> {
+ lazy_static! {
+ static ref EQ_CONSTRAINT: Regex = Regex::new(r#"([[:alnum:]]+)==([0-9]+)"#).unwrap();
+ }
+ let captures = EQ_CONSTRAINT.captures(data)?;
+ if captures.get(1).map(|c| c.as_str() == name).unwrap_or(false) {
+ captures
+ .get(2)
+ .map(|c| Constraint::Equal(c.as_str().parse::<i64>().unwrap()))
+ } else {
+ None
+ }
+}
+
+fn from_c(pos: usize, s: &str) -> Argument {
+ let name_index = s
+ .chars()
+ .rev()
+ .take_while(|c| c != &'*' && c != &' ')
+ .count();
+
+ let name_start = s.len() - name_index;
+ let name = s[name_start..].to_string();
+ let s = s[..name_start].trim();
+
+ Argument {
+ pos,
+ name,
+ ty: type_from_c(s).unwrap_or_else(|_| panic!("Failed to parse type: {}", s)),
+ constraints: vec![],
+ }
+}
+
+fn type_from_c(s: &str) -> Result<IntrinsicType, String> {
+ const CONST_STR: &str = "const ";
+
+ if let Some(s) = s.strip_suffix('*') {
+ let (s, constant) = if s.ends_with(CONST_STR) {
+ (&s[..s.len() - (CONST_STR.len() + 1)], true)
+ } else {
+ (s, false)
+ };
+
+ let s = s.trim_end();
+
+ Ok(IntrinsicType::Ptr {
+ constant,
+ child: Box::new(type_from_c(s)?),
+ })
+ } else {
+ // [const ]TYPE[{bitlen}[x{simdlen}[x{vec_len}]]][_t]
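+ // e.g. `int32x4x2_t` parses as kind=Int, bit_len=32, simd_len=4, vec_len=2.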
+
+ let (mut s, constant) = if let Some(s) = s.strip_prefix(CONST_STR) {
+ (s, true)
+ } else {
+ (s, false)
+ };
+ s = s.strip_suffix("_t").unwrap_or(s);
+
+ let mut parts = s.split('x'); // [[{bitlen}], [{simdlen}], [{vec_len}] ]
+
+ let start = parts.next().ok_or("Impossible to parse type")?;
+
+ if let Some(digit_start) = start.find(|c: char| c.is_ascii_digit()) {
+ let (arg_kind, bit_len) = start.split_at(digit_start);
+
+ let arg_kind = arg_kind.parse::<TypeKind>()?;
+ let bit_len = bit_len.parse::<u32>().map_err(|err| err.to_string())?;
+
+ let simd_len = parts.next().and_then(|part| part.parse::<u32>().ok());
+ let vec_len = parts.next().and_then(|part| part.parse::<u32>().ok());
+
+ Ok(IntrinsicType::Type {
+ constant,
+ kind: arg_kind,
+ bit_len: Some(bit_len),
+ simd_len,
+ vec_len,
+ })
+ } else {
+ Ok(IntrinsicType::Type {
+ constant,
+ kind: start.parse::<TypeKind>()?,
+ bit_len: None,
+ simd_len: None,
+ vec_len: None,
+ })
+ }
+ }
+}
+
+#[derive(Deserialize, Debug, PartialEq, Clone)]
+struct ACLEIntrinsicLine {
+ #[serde(rename = "Intrinsic")]
+ intrinsic: String,
+ #[serde(rename = "Argument preparation")]
+ argument_preparation: String,
+ #[serde(rename = "AArch64 Instruction")]
+ aarch64_instruction: String,
+ #[serde(rename = "Result")]
+ result: String,
+ #[serde(rename = "Supported architectures")]
+ supported_architectures: String,
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use crate::argument::Argument;
+ use crate::types::{IntrinsicType, TypeKind};
+
+ #[test]
+ fn parse_simd() {
+ let expected = Argument {
+ pos: 0,
+ name: "a".into(),
+ ty: IntrinsicType::Type {
+ constant: false,
+ kind: TypeKind::Int,
+ bit_len: Some(32),
+ simd_len: Some(4),
+ vec_len: None,
+ },
+ constraints: vec![],
+ };
+ let actual = from_c(0, "int32x4_t a");
+ assert_eq!(expected, actual);
+ }
+
+ #[test]
+ fn parse_simd_with_vec() {
+ let expected = Argument {
+ pos: 0,
+ name: "a".into(),
+ ty: IntrinsicType::Type {
+ constant: false,
+ kind: TypeKind::Int,
+ bit_len: Some(32),
+ simd_len: Some(4),
+ vec_len: Some(2),
+ },
+ constraints: vec![],
+ };
+ let actual = from_c(0, "int32x4x2_t a");
+ assert_eq!(expected, actual);
+ }
+
+ #[test]
+ fn test_ptr() {
+ let expected = Argument {
+ pos: 0,
+ name: "ptr".into(),
+ ty: crate::types::IntrinsicType::Ptr {
+ constant: true,
+ child: Box::new(IntrinsicType::Type {
+ constant: false,
+ kind: TypeKind::Int,
+ bit_len: Some(8),
+ simd_len: None,
+ vec_len: None,
+ }),
+ },
+ constraints: vec![],
+ };
+ let actual = from_c(0, "int8_t const *ptr");
+ assert_eq!(expected, actual);
+ }
+}
diff --git a/library/stdarch/crates/intrinsic-test/src/argument.rs b/library/stdarch/crates/intrinsic-test/src/argument.rs
new file mode 100644
index 000000000..f4cb77992
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/src/argument.rs
@@ -0,0 +1,139 @@
+use std::ops::Range;
+
+use crate::types::IntrinsicType;
+use crate::Language;
+
+/// An argument for the intrinsic.
+#[derive(Debug, PartialEq, Clone)]
+pub struct Argument {
+ /// The argument's index in the intrinsic function call.
+ pub pos: usize,
+ /// The argument name.
+ pub name: String,
+ /// The type of the argument.
+ pub ty: IntrinsicType,
+ /// Any constraints that are on this argument
+ pub constraints: Vec<Constraint>,
+}
+
+#[derive(Debug, PartialEq, Clone)]
+pub enum Constraint {
+ Equal(i64),
+ Range(Range<i64>),
+}
+
+impl Constraint {
+ pub fn to_range(&self) -> Range<i64> {
+ match self {
+ Constraint::Equal(eq) => *eq..*eq + 1,
+ Constraint::Range(range) => range.clone(),
+ }
+ }
+}
+
+impl Argument {
+ fn to_c_type(&self) -> String {
+ self.ty.c_type()
+ }
+
+ fn is_simd(&self) -> bool {
+ self.ty.is_simd()
+ }
+
+ pub fn is_ptr(&self) -> bool {
+ self.ty.is_ptr()
+ }
+
+ pub fn has_constraint(&self) -> bool {
+ !self.constraints.is_empty()
+ }
+}
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct ArgumentList {
+ pub args: Vec<Argument>,
+}
+
+impl ArgumentList {
+ /// Converts the argument list into the call parameters for a C function call.
+ /// e.g. this would generate something like `a, &b, c`
+ pub fn as_call_param_c(&self) -> String {
+ self.args
+ .iter()
+ .map(|arg| match arg.ty {
+ IntrinsicType::Ptr { .. } => {
+ format!("&{}", arg.name)
+ }
+ IntrinsicType::Type { .. } => arg.name.clone(),
+ })
+ .collect::<Vec<String>>()
+ .join(", ")
+ }
+
+ /// Converts the argument list into the call parameters for a Rust function.
+ /// e.g. this would generate something like `a, b, c`
+ pub fn as_call_param_rust(&self) -> String {
+ self.args
+ .iter()
+ .filter(|a| !a.has_constraint())
+ .map(|arg| arg.name.clone())
+ .collect::<Vec<String>>()
+ .join(", ")
+ }
+
+ pub fn as_constraint_parameters_rust(&self) -> String {
+ self.args
+ .iter()
+ .filter(|a| a.has_constraint())
+ .map(|arg| arg.name.clone())
+ .collect::<Vec<String>>()
+ .join(", ")
+ }
+
+ /// Creates a line that initializes this argument for C code.
+ /// e.g. `int32x2_t a = { 0x1, 0x2 };`
+ pub fn init_random_values_c(&self, pass: usize) -> String {
+ self.iter()
+ .filter_map(|arg| {
+ (!arg.has_constraint()).then(|| {
+ format!(
+ "{ty} {name} = {{ {values} }};",
+ ty = arg.to_c_type(),
+ name = arg.name,
+ values = arg.ty.populate_random(pass, &Language::C)
+ )
+ })
+ })
+ .collect::<Vec<_>>()
+ .join("\n ")
+ }
+
+ /// Creates a line that initializes this argument for Rust code.
+ /// e.g. `let a = transmute([0x1, 0x2]);`
+ pub fn init_random_values_rust(&self, pass: usize) -> String {
+ self.iter()
+ .filter_map(|arg| {
+ (!arg.has_constraint()).then(|| {
+ if arg.is_simd() {
+ format!(
+ "let {name} = ::std::mem::transmute([{values}]);",
+ name = arg.name,
+ values = arg.ty.populate_random(pass, &Language::Rust),
+ )
+ } else {
+ format!(
+ "let {name} = {value};",
+ name = arg.name,
+ value = arg.ty.populate_random(pass, &Language::Rust)
+ )
+ }
+ })
+ })
+ .collect::<Vec<_>>()
+ .join("\n ")
+ }
+
+ pub fn iter(&self) -> std::slice::Iter<'_, Argument> {
+ self.args.iter()
+ }
+}
diff --git a/library/stdarch/crates/intrinsic-test/src/intrinsic.rs b/library/stdarch/crates/intrinsic-test/src/intrinsic.rs
new file mode 100644
index 000000000..2b7130440
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/src/intrinsic.rs
@@ -0,0 +1,125 @@
+use crate::types::{IntrinsicType, TypeKind};
+
+use super::argument::ArgumentList;
+
+/// An intrinsic
+#[derive(Debug, PartialEq, Clone)]
+pub struct Intrinsic {
+ /// The function name of this intrinsic.
+ pub name: String,
+
+ /// Any arguments for this intrinsic.
+ pub arguments: ArgumentList,
+
+ /// The return type of this intrinsic.
+ pub results: IntrinsicType,
+
+ /// Whether this intrinsic is only available on A64.
+ pub a64_only: bool,
+}
+
+impl Intrinsic {
+ /// Generates a `std::cout` expression for the intrinsic's result that
+ /// will match the Rust debug output format for the return type.
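+ /// e.g. for a `uint32x4_t` return value this prints a line like
+ /// `Result <name>-<pass>: uint32x4_t(1, 2, 3, 4)`, which mirrors the
+ /// output of Rust's `{:?}` formatting for the same value.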
+ pub fn print_result_c(&self, index: usize, additional: &str) -> String {
+ let lanes = if self.results.num_vectors() > 1 {
+ (0..self.results.num_vectors())
+ .map(|vector| {
+ format!(
+ r#""{ty}(" << {lanes} << ")""#,
+ ty = self.results.c_single_vector_type(),
+ lanes = (0..self.results.num_lanes())
+ .map(move |idx| -> std::string::String {
+ format!(
+ "{cast}{lane_fn}(__return_value.val[{vector}], {lane})",
+ cast = self.results.c_promotion(),
+ lane_fn = self.results.get_lane_function(),
+ lane = idx,
+ vector = vector,
+ )
+ })
+ .collect::<Vec<_>>()
+ .join(r#" << ", " << "#)
+ )
+ })
+ .collect::<Vec<_>>()
+ .join(r#" << ", " << "#)
+ } else if self.results.num_lanes() > 1 {
+ (0..self.results.num_lanes())
+ .map(|idx| -> std::string::String {
+ format!(
+ "{cast}{lane_fn}(__return_value, {lane})",
+ cast = self.results.c_promotion(),
+ lane_fn = self.results.get_lane_function(),
+ lane = idx
+ )
+ })
+ .collect::<Vec<_>>()
+ .join(r#" << ", " << "#)
+ } else {
+ format!(
+ "{promote}cast<{cast}>(__return_value)",
+ cast = match self.results.kind() {
+ TypeKind::Float if self.results.inner_size() == 32 => "float".to_string(),
+ TypeKind::Float if self.results.inner_size() == 64 => "double".to_string(),
+ TypeKind::Int => format!("int{}_t", self.results.inner_size()),
+ TypeKind::UInt => format!("uint{}_t", self.results.inner_size()),
+ TypeKind::Poly => format!("poly{}_t", self.results.inner_size()),
+ ty => todo!("print_result_c - Unknown type: {:#?}", ty),
+ },
+ promote = self.results.c_promotion(),
+ )
+ };
+
+ format!(
+ r#"std::cout << "Result {additional}-{idx}: {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#,
+ ty = if self.results.is_simd() {
+ format!("{}(", self.results.c_type())
+ } else {
+ String::from("")
+ },
+ close = if self.results.is_simd() { ")" } else { "" },
+ lanes = lanes,
+ additional = additional,
+ idx = index,
+ )
+ }
+
+ pub fn generate_pass_rust(&self, index: usize, additional: &str) -> String {
+ let constraints = self.arguments.as_constraint_parameters_rust();
+ let constraints = if !constraints.is_empty() {
+ format!("::<{}>", constraints)
+ } else {
+ constraints
+ };
+
+ format!(
+ r#"
+ unsafe {{
+ {initialized_args}
+ let res = {intrinsic_call}{const}({args});
+ println!("Result {additional}-{idx}: {{:.150?}}", res);
+ }}"#,
+ initialized_args = self.arguments.init_random_values_rust(index),
+ intrinsic_call = self.name,
+ args = self.arguments.as_call_param_rust(),
+ additional = additional,
+ idx = index,
+ const = constraints,
+ )
+ }
+
+ pub fn generate_pass_c(&self, index: usize, additional: &str) -> String {
+ format!(
+ r#" {{
+ {initialized_args}
+ auto __return_value = {intrinsic_call}({args});
+ {print_result}
+ }}"#,
+ initialized_args = self.arguments.init_random_values_c(index),
+ intrinsic_call = self.name,
+ args = self.arguments.as_call_param_c(),
+ print_result = self.print_result_c(index, additional)
+ )
+ }
+}
diff --git a/library/stdarch/crates/intrinsic-test/src/main.rs b/library/stdarch/crates/intrinsic-test/src/main.rs
new file mode 100644
index 000000000..1b58da2fd
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/src/main.rs
@@ -0,0 +1,479 @@
+#![feature(slice_partition_dedup)]
+#[macro_use]
+extern crate lazy_static;
+#[macro_use]
+extern crate log;
+
+use std::fs::File;
+use std::io::Write;
+use std::process::Command;
+
+use clap::{App, Arg};
+use intrinsic::Intrinsic;
+use itertools::Itertools;
+use rayon::prelude::*;
+use types::TypeKind;
+
+use crate::acle_csv_parser::get_acle_intrinsics;
+use crate::argument::Argument;
+
+mod acle_csv_parser;
+mod argument;
+mod intrinsic;
+mod types;
+mod values;
+
+#[derive(Debug, PartialEq)]
+pub enum Language {
+ Rust,
+ C,
+}
+
+fn gen_code_c(intrinsic: &Intrinsic, constraints: &[&Argument], name: String) -> String {
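+ // Recurse through the constrained arguments, fixing each one to every
+ // value in its range (one nested C block per value); once no
+ // constraints remain, emit the 19 test passes for this combination.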
+ if let Some((current, constraints)) = constraints.split_last() {
+ let range = current
+ .constraints
+ .iter()
+ .map(|c| c.to_range())
+ .flat_map(|r| r.into_iter());
+
+ range
+ .map(|i| {
+ format!(
+ r#" {{
+ {ty} {name} = {val};
+{pass}
+ }}"#,
+ name = current.name,
+ ty = current.ty.c_type(),
+ val = i,
+ pass = gen_code_c(intrinsic, constraints, format!("{}-{}", name, i))
+ )
+ })
+ .collect()
+ } else {
+ (1..20)
+ .map(|idx| intrinsic.generate_pass_c(idx, &name))
+ .collect::<Vec<_>>()
+ .join("\n")
+ }
+}
+
+fn generate_c_program(header_files: &[&str], intrinsic: &Intrinsic) -> String {
+ let constraints = intrinsic
+ .arguments
+ .iter()
+ .filter(|i| i.has_constraint())
+ .collect_vec();
+
+ format!(
+ r#"{header_files}
+#include <iostream>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
+
+template<typename T1, typename T2> T1 cast(T2 x) {{
+ static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same");
+ T1 ret = 0;
+ memcpy(&ret, &x, sizeof(T1));
+ return ret;
+}}
+
+#ifdef __aarch64__
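+// poly128_t has no standard stream operator, so print it in decimal by
+// repeated division by 10 and reverse the collected digits.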
+std::ostream& operator<<(std::ostream& os, poly128_t value) {{
+ std::stringstream temp;
+ do {{
+ int n = value % 10;
+ value /= 10;
+ temp << n;
+ }} while (value != 0);
+ std::string tempstr(temp.str());
+ std::string res(tempstr.rbegin(), tempstr.rend());
+ os << res;
+ return os;
+}}
+#endif
+
+int main(int argc, char **argv) {{
+{passes}
+ return 0;
+}}"#,
+ header_files = header_files
+ .iter()
+ .map(|header| format!("#include <{}>", header))
+ .collect::<Vec<_>>()
+ .join("\n"),
+ passes = gen_code_c(intrinsic, constraints.as_slice(), Default::default()),
+ )
+}
+
+fn gen_code_rust(intrinsic: &Intrinsic, constraints: &[&Argument], name: String) -> String {
+ if let Some((current, constraints)) = constraints.split_last() {
+ let range = current
+ .constraints
+ .iter()
+ .map(|c| c.to_range())
+ .flat_map(|r| r.into_iter());
+
+ range
+ .map(|i| {
+ format!(
+ r#" {{
+ const {name}: {ty} = {val};
+{pass}
+ }}"#,
+ name = current.name,
+ ty = current.ty.rust_type(),
+ val = i,
+ pass = gen_code_rust(intrinsic, constraints, format!("{}-{}", name, i))
+ )
+ })
+ .collect()
+ } else {
+ (1..20)
+ .map(|idx| intrinsic.generate_pass_rust(idx, &name))
+ .collect::<Vec<_>>()
+ .join("\n")
+ }
+}
+
+fn generate_rust_program(intrinsic: &Intrinsic, a32: bool) -> String {
+ let constraints = intrinsic
+ .arguments
+ .iter()
+ .filter(|i| i.has_constraint())
+ .collect_vec();
+
+ format!(
+ r#"#![feature(simd_ffi)]
+#![feature(link_llvm_intrinsics)]
+#![feature(stdsimd)]
+#![allow(overflowing_literals)]
+#![allow(non_upper_case_globals)]
+use core_arch::arch::{target_arch}::*;
+
+fn main() {{
+{passes}
+}}
+"#,
+ target_arch = if a32 { "arm" } else { "aarch64" },
+ passes = gen_code_rust(intrinsic, &constraints, Default::default())
+ )
+}
+
+fn compile_c(c_filename: &str, intrinsic: &Intrinsic, compiler: &str, a32: bool) -> bool {
+ let flags = std::env::var("CPPFLAGS").unwrap_or("".into());
+
+ let output = Command::new("sh")
+ .arg("-c")
+ .arg(format!(
+ "{cpp} {cppflags} {arch_flags} -Wno-narrowing -O2 -target {target} -o c_programs/{intrinsic} {filename}",
+ target = if a32 { "armv7-unknown-linux-gnueabihf" } else { "aarch64-unknown-linux-gnu" },
+ arch_flags = if a32 { "-march=armv8.6-a+crypto+crc+dotprod" } else { "-march=armv8.6-a+crypto+sha3+crc+dotprod" },
+ filename = c_filename,
+ intrinsic = intrinsic.name,
+ cpp = compiler,
+ cppflags = flags,
+ ))
+ .output();
+ if let Ok(output) = output {
+ if output.status.success() {
+ true
+ } else {
+ error!(
+ "Failed to compile code for intrinsic: {}\n\nstdout:\n{}\n\nstderr:\n{}",
+ intrinsic.name,
+ std::str::from_utf8(&output.stdout).unwrap_or(""),
+ std::str::from_utf8(&output.stderr).unwrap_or("")
+ );
+ false
+ }
+ } else {
+ error!("Command failed: {:#?}", output);
+ false
+ }
+}
+
+fn build_c(intrinsics: &Vec<Intrinsic>, compiler: &str, a32: bool) -> bool {
+ let _ = std::fs::create_dir("c_programs");
+ intrinsics
+ .par_iter()
+ .map(|i| {
+ let c_filename = format!(r#"c_programs/{}.cpp"#, i.name);
+ let mut file = File::create(&c_filename).unwrap();
+
+ let c_code = generate_c_program(&["arm_neon.h", "arm_acle.h"], &i);
+ file.write_all(c_code.into_bytes().as_slice()).unwrap();
+ compile_c(&c_filename, &i, compiler, a32)
+ })
+ .find_any(|x| !x)
+ .is_none()
+}
+
+fn build_rust(intrinsics: &Vec<Intrinsic>, toolchain: &str, a32: bool) -> bool {
+ intrinsics.iter().for_each(|i| {
+ let rust_dir = format!(r#"rust_programs/{}"#, i.name);
+ let _ = std::fs::create_dir_all(&rust_dir);
+ let rust_filename = format!(r#"{}/main.rs"#, rust_dir);
+ let mut file = File::create(&rust_filename).unwrap();
+
+ let c_code = generate_rust_program(&i, a32);
+ file.write_all(c_code.into_bytes().as_slice()).unwrap();
+ });
+
+ let mut cargo = File::create("rust_programs/Cargo.toml").unwrap();
+ cargo
+ .write_all(
+ format!(
+ r#"[package]
+name = "intrinsic-test"
+version = "{version}"
+authors = ["{authors}"]
+edition = "2018"
+[workspace]
+[dependencies]
+core_arch = {{ path = "../crates/core_arch" }}
+{binaries}"#,
+ version = env!("CARGO_PKG_VERSION"),
+ authors = env!("CARGO_PKG_AUTHORS"),
+ binaries = intrinsics
+ .iter()
+ .map(|i| {
+ format!(
+ r#"[[bin]]
+name = "{intrinsic}"
+path = "{intrinsic}/main.rs""#,
+ intrinsic = i.name
+ )
+ })
+ .collect::<Vec<_>>()
+ .join("\n")
+ )
+ .into_bytes()
+ .as_slice(),
+ )
+ .unwrap();
+
+ let output = Command::new("sh")
+ .current_dir("rust_programs")
+ .arg("-c")
+ .arg(format!(
+ "cargo {toolchain} build --target {target}",
+ toolchain = toolchain,
+ target = if a32 {
+ "armv7-unknown-linux-gnueabihf"
+ } else {
+ "aarch64-unknown-linux-gnu"
+ },
+ ))
+ .env("RUSTFLAGS", "-Cdebuginfo=0")
+ .output();
+ if let Ok(output) = output {
+ if output.status.success() {
+ true
+ } else {
+ error!(
+ "Failed to compile code for intrinsics\n\nstdout:\n{}\n\nstderr:\n{}",
+ std::str::from_utf8(&output.stdout).unwrap_or(""),
+ std::str::from_utf8(&output.stderr).unwrap_or("")
+ );
+ false
+ }
+ } else {
+ error!("Command failed: {:#?}", output);
+ false
+ }
+}
+
+fn main() {
+ pretty_env_logger::init();
+
+ let matches = App::new("Intrinsic test tool")
+ .about("Generates Rust and C programs for intrinsics and compares the output")
+ .arg(
+ Arg::with_name("INPUT")
+ .help("The input file containing the intrinsics")
+ .required(true)
+ .index(1),
+ )
+ .arg(
+ Arg::with_name("TOOLCHAIN")
+ .takes_value(true)
+ .long("toolchain")
+ .help("The rust toolchain to use for building the rust code"),
+ )
+ .arg(
+ Arg::with_name("CPPCOMPILER")
+ .takes_value(true)
+ .default_value("clang++")
+ .long("cppcompiler")
+ .help("The C++ compiler to use for compiling the c++ code"),
+ )
+ .arg(
+ Arg::with_name("RUNNER")
+ .takes_value(true)
+ .long("runner")
+ .help("Run the C programs under emulation with this command"),
+ )
+ .arg(
+ Arg::with_name("SKIP")
+ .takes_value(true)
+ .long("skip")
+ .help("Filename for a list of intrinsics to skip (one per line)"),
+ )
+ .arg(
+ Arg::with_name("A32")
+ .takes_value(false)
+ .long("a32")
+ .help("Run tests for A32 instrinsics instead of A64"),
+ )
+ .get_matches();
+
+ let filename = matches.value_of("INPUT").unwrap();
+ let toolchain = matches
+ .value_of("TOOLCHAIN")
+ .map_or("".into(), |t| format!("+{}", t));
+
+ let cpp_compiler = matches.value_of("CPPCOMPILER").unwrap();
+ let c_runner = matches.value_of("RUNNER").unwrap_or("");
+ let skip = if let Some(filename) = matches.value_of("SKIP") {
+ let data = std::fs::read_to_string(&filename).expect("Failed to open file");
+ data.lines()
+ .map(str::trim)
+ .filter(|s| !s.contains('#'))
+ .map(String::from)
+ .collect_vec()
+ } else {
+ Default::default()
+ };
+ let a32 = matches.is_present("A32");
+
+ let intrinsics = get_acle_intrinsics(filename);
+
+ let mut intrinsics = intrinsics
+ .into_iter()
+ // Not sure how we would compare intrinsics that return void.
+ .filter(|i| i.results.kind() != TypeKind::Void)
+ .filter(|i| i.results.kind() != TypeKind::BFloat)
+ .filter(|i| !(i.results.kind() == TypeKind::Float && i.results.inner_size() == 16))
+ .filter(|i| !i.arguments.iter().any(|a| a.ty.kind() == TypeKind::BFloat))
+ .filter(|i| {
+ !i.arguments
+ .iter()
+ .any(|a| a.ty.kind() == TypeKind::Float && a.ty.inner_size() == 16)
+ })
+ // Skip pointers for now, we would probably need to look at the return
+ // type to work out how many elements we need to point to.
+ .filter(|i| !i.arguments.iter().any(|a| a.is_ptr()))
+ .filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128))
+ .filter(|i| !skip.contains(&i.name))
+ .filter(|i| !(a32 && i.a64_only))
+ .collect::<Vec<_>>();
+ intrinsics.dedup();
+
+ if !build_c(&intrinsics, cpp_compiler, a32) {
+ std::process::exit(2);
+ }
+
+ if !build_rust(&intrinsics, &toolchain, a32) {
+ std::process::exit(3);
+ }
+
+ if !compare_outputs(&intrinsics, &toolchain, &c_runner, a32) {
+ std::process::exit(1)
+ }
+}
+
+enum FailureReason {
+ RunC(String),
+ RunRust(String),
+ Difference(String, String, String),
+}
+
+fn compare_outputs(intrinsics: &Vec<Intrinsic>, toolchain: &str, runner: &str, a32: bool) -> bool {
+ let intrinsics = intrinsics
+ .par_iter()
+ .filter_map(|intrinsic| {
+ let c = Command::new("sh")
+ .arg("-c")
+ .arg(format!(
+ "{runner} ./c_programs/{intrinsic}",
+ runner = runner,
+ intrinsic = intrinsic.name,
+ ))
+ .output();
+ let rust = Command::new("sh")
+ .current_dir("rust_programs")
+ .arg("-c")
+ .arg(format!(
+ "cargo {toolchain} run --target {target} --bin {intrinsic}",
+ intrinsic = intrinsic.name,
+ toolchain = toolchain,
+ target = if a32 {
+ "armv7-unknown-linux-gnueabihf"
+ } else {
+ "aarch64-unknown-linux-gnu"
+ },
+ ))
+ .env("RUSTFLAGS", "-Cdebuginfo=0")
+ .output();
+
+ let (c, rust) = match (c, rust) {
+ (Ok(c), Ok(rust)) => (c, rust),
+ a => panic!("{:#?}", a),
+ };
+
+ if !c.status.success() {
+ error!("Failed to run C program for intrinsic {}", intrinsic.name);
+ return Some(FailureReason::RunC(intrinsic.name.clone()));
+ }
+
+ if !rust.status.success() {
+ error!(
+ "Failed to run rust program for intrinsic {}",
+ intrinsic.name
+ );
+ return Some(FailureReason::RunRust(intrinsic.name.clone()));
+ }
+
+ info!("Comparing intrinsic: {}", intrinsic.name);
+
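+ // Normalize before comparing: lowercase both outputs and strip the
+ // sign from NaNs, since C and Rust render them differently.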
+ let c = std::str::from_utf8(&c.stdout)
+ .unwrap()
+ .to_lowercase()
+ .replace("-nan", "nan");
+ let rust = std::str::from_utf8(&rust.stdout)
+ .unwrap()
+ .to_lowercase()
+ .replace("-nan", "nan");
+
+ if c == rust {
+ None
+ } else {
+ Some(FailureReason::Difference(intrinsic.name.clone(), c, rust))
+ }
+ })
+ .collect::<Vec<_>>();
+
+ intrinsics.iter().for_each(|reason| match reason {
+ FailureReason::Difference(intrinsic, c, rust) => {
+ println!("Difference for intrinsic: {}", intrinsic);
+ let diff = diff::lines(c, rust);
+ diff.iter().for_each(|diff| match diff {
+ diff::Result::Left(c) => println!("C: {}", c),
+ diff::Result::Right(rust) => println!("Rust: {}", rust),
+ diff::Result::Both(_, _) => (),
+ });
+ println!("****************************************************************");
+ }
+ FailureReason::RunC(intrinsic) => {
+ println!("Failed to run C program for intrinsic {}", intrinsic)
+ }
+ FailureReason::RunRust(intrinsic) => {
+ println!("Failed to run rust program for intrinsic {}", intrinsic)
+ }
+ });
+ println!("{} differences found", intrinsics.len());
+ intrinsics.is_empty()
+}
diff --git a/library/stdarch/crates/intrinsic-test/src/types.rs b/library/stdarch/crates/intrinsic-test/src/types.rs
new file mode 100644
index 000000000..e51e61649
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/src/types.rs
@@ -0,0 +1,436 @@
+use std::fmt;
+use std::str::FromStr;
+
+use crate::values::values_for_pass;
+use crate::Language;
+
+#[derive(Debug, PartialEq, Copy, Clone)]
+pub enum TypeKind {
+ BFloat,
+ Float,
+ Int,
+ UInt,
+ Poly,
+ Void,
+}
+
+impl FromStr for TypeKind {
+ type Err = String;
+
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
+ match s {
+ "bfloat" => Ok(Self::BFloat),
+ "float" => Ok(Self::Float),
+ "int" => Ok(Self::Int),
+ "poly" => Ok(Self::Poly),
+ "uint" | "unsigned" => Ok(Self::UInt),
+ "void" => Ok(Self::Void),
+ _ => Err(format!("Impossible to parse argument kind {}", s)),
+ }
+ }
+}
+
+impl fmt::Display for TypeKind {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(
+ f,
+ "{}",
+ match self {
+ Self::BFloat => "bfloat",
+ Self::Float => "float",
+ Self::Int => "int",
+ Self::UInt => "uint",
+ Self::Poly => "poly",
+ Self::Void => "void",
+ }
+ )
+ }
+}
+
+impl TypeKind {
+ /// Gets the type part of a C typedef for a type that's in the form of {type}{size}_t.
+ pub fn c_prefix(&self) -> &str {
+ match self {
+ Self::Float => "float",
+ Self::Int => "int",
+ Self::UInt => "uint",
+ Self::Poly => "poly",
+ _ => unreachable!("Not used: {:#?}", self),
+ }
+ }
+
+ /// Gets the Rust prefix for the type kind, i.e. i, u, f.
+ pub fn rust_prefix(&self) -> &str {
+ match self {
+ Self::Float => "f",
+ Self::Int => "i",
+ Self::UInt => "u",
+ Self::Poly => "u",
+ _ => unreachable!("Unused type kind: {:#?}", self),
+ }
+ }
+}
+
+#[derive(Debug, PartialEq, Clone)]
+pub enum IntrinsicType {
+ Ptr {
+ constant: bool,
+ child: Box<IntrinsicType>,
+ },
+ Type {
+ constant: bool,
+ kind: TypeKind,
+ /// The bit length of this type (e.g. 32 for u32).
+ bit_len: Option<u32>,
+
+ /// Length of the SIMD vector (i.e. 4 for uint32x4_t). A value of `None`
+ /// means this is not a SIMD type. A `None` can be assumed to be 1,
+ /// although in some places a distinction is needed between `u64` and
+ /// `uint64x1_t`, which this signals.
+ simd_len: Option<u32>,
+
+ /// The number of rows for SIMD matrices (i.e. 2 for uint8x8x2_t).
+ /// A value of `None` represents a type that does not contain any
+ /// rows encoded in the type (e.g. uint8x8_t).
+ /// A value of `None` can be assumed to be 1 though.
+ vec_len: Option<u32>,
+ },
+}
+
+impl IntrinsicType {
+ /// Get the TypeKind for this type, recursing into pointers.
+ pub fn kind(&self) -> TypeKind {
+ match *self {
+ IntrinsicType::Ptr { ref child, .. } => child.kind(),
+ IntrinsicType::Type { kind, .. } => kind,
+ }
+ }
+
+ /// Get the size of a single element inside this type, recursing into
+ /// pointers, i.e. a pointer to a u16 would be 16 rather than the size
+ /// of a pointer.
+ pub fn inner_size(&self) -> u32 {
+ match *self {
+ IntrinsicType::Ptr { ref child, .. } => child.inner_size(),
+ IntrinsicType::Type {
+ bit_len: Some(bl), ..
+ } => bl,
+ _ => unreachable!(""),
+ }
+ }
+
+ pub fn num_lanes(&self) -> u32 {
+ match *self {
+ IntrinsicType::Ptr { ref child, .. } => child.num_lanes(),
+ IntrinsicType::Type {
+ simd_len: Some(sl), ..
+ } => sl,
+ _ => 1,
+ }
+ }
+
+ pub fn num_vectors(&self) -> u32 {
+ match *self {
+ IntrinsicType::Ptr { ref child, .. } => child.num_vectors(),
+ IntrinsicType::Type {
+ vec_len: Some(vl), ..
+ } => vl,
+ _ => 1,
+ }
+ }
+
+ /// Determine if the type is a SIMD type; this will treat a type such as
+ /// `uint64x1` as SIMD.
+ pub fn is_simd(&self) -> bool {
+ match *self {
+ IntrinsicType::Ptr { ref child, .. } => child.is_simd(),
+ IntrinsicType::Type {
+ simd_len: None,
+ vec_len: None,
+ ..
+ } => false,
+ _ => true,
+ }
+ }
+
+ pub fn is_ptr(&self) -> bool {
+ match *self {
+ IntrinsicType::Ptr { .. } => true,
+ IntrinsicType::Type { .. } => false,
+ }
+ }
+
+ #[allow(unused)]
+ fn c_scalar_type(&self) -> String {
+ format!(
+ "{prefix}{bits}_t",
+ prefix = self.kind().c_prefix(),
+ bits = self.inner_size()
+ )
+ }
+
+ fn rust_scalar_type(&self) -> String {
+ format!(
+ "{prefix}{bits}",
+ prefix = self.kind().rust_prefix(),
+ bits = self.inner_size()
+ )
+ }
+
+ /// Gets a string containing the typename for this type in C format.
+ pub fn c_type(&self) -> String {
+ match self {
+ IntrinsicType::Ptr { child, .. } => child.c_type(),
+ IntrinsicType::Type {
+ constant,
+ kind,
+ bit_len: Some(bit_len),
+ simd_len: None,
+ vec_len: None,
+ ..
+ } => format!(
+ "{}{}{}_t",
+ if *constant { "const " } else { "" },
+ kind.c_prefix(),
+ bit_len
+ ),
+ IntrinsicType::Type {
+ kind,
+ bit_len: Some(bit_len),
+ simd_len: Some(simd_len),
+ vec_len: None,
+ ..
+ } => format!("{}{}x{}_t", kind.c_prefix(), bit_len, simd_len),
+ IntrinsicType::Type {
+ kind,
+ bit_len: Some(bit_len),
+ simd_len: Some(simd_len),
+ vec_len: Some(vec_len),
+ ..
+ } => format!("{}{}x{}x{}_t", kind.c_prefix(), bit_len, simd_len, vec_len),
+ _ => todo!("{:#?}", self),
+ }
+ }
+
+ pub fn c_single_vector_type(&self) -> String {
+ match self {
+ IntrinsicType::Ptr { child, .. } => child.c_single_vector_type(),
+ IntrinsicType::Type {
+ kind,
+ bit_len: Some(bit_len),
+ simd_len: Some(simd_len),
+ vec_len: Some(_),
+ ..
+ } => format!("{}{}x{}_t", kind.c_prefix(), bit_len, simd_len),
+ _ => unreachable!("Shouldn't be called on this type"),
+ }
+ }
+
+ pub fn rust_type(&self) -> String {
+ match self {
+ IntrinsicType::Ptr { child, .. } => child.c_type(),
+ IntrinsicType::Type {
+ kind,
+ bit_len: Some(bit_len),
+ simd_len: None,
+ vec_len: None,
+ ..
+ } => format!("{}{}", kind.rust_prefix(), bit_len),
+ IntrinsicType::Type {
+ kind,
+ bit_len: Some(bit_len),
+ simd_len: Some(simd_len),
+ vec_len: None,
+ ..
+ } => format!("{}{}x{}_t", kind.c_prefix(), bit_len, simd_len),
+ IntrinsicType::Type {
+ kind,
+ bit_len: Some(bit_len),
+ simd_len: Some(simd_len),
+ vec_len: Some(vec_len),
+ ..
+ } => format!("{}{}x{}x{}_t", kind.c_prefix(), bit_len, simd_len, vec_len),
+ _ => todo!("{:#?}", self),
+ }
+ }
+
+ /// Gets a cast for this type if it needs promotion.
+ /// This is required for 8 bit types: they are typedefs of `char`, so
+ /// printing them via `std::cout` would emit a character, meaning a
+ /// value of 0 would be printed as a null byte.
+ ///
+ /// This is also needed for polynomial types because we want them to be
+ /// printed as unsigned integers to match Rust's `Debug` impl.
+ pub fn c_promotion(&self) -> &str {
+ match *self {
+ IntrinsicType::Type {
+ kind,
+ bit_len: Some(bit_len),
+ ..
+ } if bit_len == 8 => match kind {
+ TypeKind::Int => "(int)",
+ TypeKind::UInt => "(unsigned int)",
+ TypeKind::Poly => "(unsigned int)(uint8_t)",
+ _ => "",
+ },
+ IntrinsicType::Type {
+ kind: TypeKind::Poly,
+ bit_len: Some(bit_len),
+ ..
+ } => match bit_len {
+ 8 => unreachable!("handled above"),
+ 16 => "(uint16_t)",
+ 32 => "(uint32_t)",
+ 64 => "(uint64_t)",
+ 128 => "",
+ _ => panic!("invalid bit_len"),
+ },
+ _ => "",
+ }
+ }
+
+ /// Generates a comma-separated list of values that can be used to
+ /// initialize an argument for the intrinsic call.
+ /// This is deterministic, based on the pass number.
+ ///
+ /// * `pass`: The pass index, i.e. the iteration index for the call to an intrinsic
+ ///
+ /// Returns a string such as
+ /// * `0x1, 0x7F, 0xFF` if `language` is `Language::C`
+ /// * `0x1 as _, 0x7F as _, 0xFF as _` if `language` is `Language::Rust`
+ pub fn populate_random(&self, pass: usize, language: &Language) -> String {
+ match self {
+ IntrinsicType::Ptr { child, .. } => child.populate_random(pass, language),
+ IntrinsicType::Type {
+ bit_len: Some(bit_len),
+ kind,
+ simd_len,
+ vec_len,
+ ..
+ } if kind == &TypeKind::Int || kind == &TypeKind::UInt || kind == &TypeKind::Poly => (0
+ ..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1)))
+ .map(|i| {
+ format!(
+ "{}{}",
+ values_for_pass(*bit_len, i, pass),
+ match language {
+ &Language::Rust => format!(" as {ty} ", ty = self.rust_scalar_type()),
+ &Language::C => String::from(""),
+ }
+ )
+ })
+ .collect::<Vec<_>>()
+ .join(","),
+ IntrinsicType::Type {
+ kind: TypeKind::Float,
+ bit_len: Some(32),
+ simd_len,
+ vec_len,
+ ..
+ } => (0..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1)))
+ .map(|i| {
+ format!(
+ "{}({})",
+ match language {
+ &Language::Rust => "f32::from_bits",
+ &Language::C => "cast<float, uint32_t>",
+ },
+ values_for_pass(32, i, pass),
+ )
+ })
+ .collect::<Vec<_>>()
+ .join(","),
+ IntrinsicType::Type {
+ kind: TypeKind::Float,
+ bit_len: Some(64),
+ simd_len,
+ vec_len,
+ ..
+ } => (0..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1)))
+ .map(|i| {
+ format!(
+ "{}({}{})",
+ match language {
+ &Language::Rust => "f64::from_bits",
+ &Language::C => "cast<double, uint64_t>",
+ },
+ values_for_pass(64, i, pass),
+ match language {
+ &Language::Rust => " as u64",
+ &Language::C => "",
+ }
+ )
+ })
+ .collect::<Vec<_>>()
+ .join(","),
+ _ => unreachable!("populate random: {:#?}", self),
+ }
+ }
+
+ /// Determines the load function for this type.
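+ /// e.g. `vld1q_u32` for a `uint32x4_t`.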
+ #[allow(unused)]
+ pub fn get_load_function(&self) -> String {
+ match self {
+ IntrinsicType::Ptr { child, .. } => child.get_load_function(),
+ IntrinsicType::Type {
+ kind: k,
+ bit_len: Some(bl),
+ simd_len,
+ vec_len,
+ ..
+ } => {
+ let quad = if (simd_len.unwrap_or(1) * bl) > 64 {
+ "q"
+ } else {
+ ""
+ };
+ format!(
+ "vld{len}{quad}_{type}{size}",
+ type = match k {
+ TypeKind::UInt => "u",
+ TypeKind::Int => "s",
+ TypeKind::Float => "f",
+ TypeKind::Poly => "p",
+ x => todo!("get_load_function TypeKind: {:#?}", x),
+ },
+ size = bl,
+ quad = quad,
+ len = vec_len.unwrap_or(1),
+ )
+ }
+ _ => todo!("get_load_function IntrinsicType: {:#?}", self),
+ }
+ }
+
+ /// Determines the get lane function for this type.
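+ /// e.g. `vgetq_lane_u32` for a `uint32x4_t`.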
+ pub fn get_lane_function(&self) -> String {
+ match self {
+ IntrinsicType::Ptr { child, .. } => child.get_lane_function(),
+ IntrinsicType::Type {
+ kind: k,
+ bit_len: Some(bl),
+ simd_len,
+ ..
+ } => {
+ let quad = if (simd_len.unwrap_or(1) * bl) > 64 {
+ "q"
+ } else {
+ ""
+ };
+ format!(
+ "vget{quad}_lane_{type}{size}",
+ type = match k {
+ TypeKind::UInt => "u",
+ TypeKind::Int => "s",
+ TypeKind::Float => "f",
+ TypeKind::Poly => "p",
+ x => todo!("get_load_function TypeKind: {:#?}", x),
+ },
+ size = bl,
+ quad = quad,
+ )
+ }
+ _ => todo!("get_lane_function IntrinsicType: {:#?}", self),
+ }
+ }
+}
diff --git a/library/stdarch/crates/intrinsic-test/src/values.rs b/library/stdarch/crates/intrinsic-test/src/values.rs
new file mode 100644
index 000000000..4565edca0
--- /dev/null
+++ b/library/stdarch/crates/intrinsic-test/src/values.rs
@@ -0,0 +1,126 @@
+/// Gets a hex constant value for a single lane in a deterministic way.
+/// * `bits`: The number of bits for the type; only 8, 16, 32, 64 are valid values
+/// * `simd`: The index of the simd lane we are generating for
+/// * `pass`: The index of the pass we are generating the values for
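+/// e.g. `values_for_pass(8, 0, 1)` yields `"0x1"` (`VALUES_8[1]`).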
+pub fn values_for_pass(bits: u32, simd: u32, pass: usize) -> String {
+ let index = pass + (simd as usize);
+
+ if bits == 8 {
+ format!("{:#X}", VALUES_8[index % VALUES_8.len()])
+ } else if bits == 16 {
+ format!("{:#X}", VALUES_16[index % VALUES_16.len()])
+ } else if bits == 32 {
+ format!("{:#X}", VALUES_32[index % VALUES_32.len()])
+ } else if bits == 64 {
+ format!("{:#X}", VALUES_64[index % VALUES_64.len()])
+ } else {
+ panic!("Unknown size: {}", bits);
+ }
+}
+
+pub const VALUES_8: &[u8] = &[
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0xf0, 0x80, 0x3b, 0xff,
+];
+
+pub const VALUES_16: &[u16] = &[
+ 0x0000, // 0.0
+ 0x0400, // The smallest normal value.
+ 0x37ff, // The value just below 0.5.
+ 0x3800, // 0.5
+ 0x3801, // The value just above 0.5.
+ 0x3bff, // The value just below 1.0.
+ 0x3c00, // 1.0
+ 0x3c01, // The value just above 1.0.
+ 0x3e00, // 1.5
+ 0x4900, // 10
+ 0x7bff, // The largest finite value.
+ 0x7c00, // Infinity.
+ // NaNs.
+ // - Quiet NaNs
+ 0x7f23, 0x7e00, // - Signalling NaNs
+ 0x7d23, 0x7c01, // Subnormals.
+ // - A recognisable bit pattern.
+ 0x0012, // - The largest subnormal value.
+ 0x03ff, // - The smallest subnormal value.
+ 0x0001, // The same values again, but negated.
+ 0x8000, 0x8400, 0xb7ff, 0xb800, 0xb801, 0xbbff, 0xbc00, 0xbc01, 0xbe00, 0xc900, 0xfbff, 0xfc00,
+ 0xff23, 0xfe00, 0xfd23, 0xfc01, 0x8012, 0x83ff, 0x8001,
+];
+
+pub const VALUES_32: &[u32] = &[
+ // Simple values.
+ 0x00000000, // 0.0
+ 0x00800000, // The smallest normal value.
+ 0x3effffff, // The value just below 0.5.
+ 0x3f000000, // 0.5
+ 0x3f000001, // The value just above 0.5.
+ 0x3f7fffff, // The value just below 1.0.
+ 0x3f800000, // 1.0
+ 0x3f800001, // The value just above 1.0.
+ 0x3fc00000, // 1.5
+ 0x41200000, // 10
+ 0x7f7fffff, // The largest finite value.
+ 0x7f800000, // Infinity.
+ // NaNs.
+ // - Quiet NaNs
+ 0x7fd23456, 0x7fc00000, // - Signalling NaNs
+ 0x7f923456, 0x7f800001, // Subnormals.
+ // - A recognisable bit pattern.
+ 0x00123456, // - The largest subnormal value.
+ 0x007fffff, // - The smallest subnormal value.
+ 0x00000001, // The same values again, but negated.
+ 0x80000000, 0x80800000, 0xbeffffff, 0xbf000000, 0xbf000001, 0xbf7fffff, 0xbf800000, 0xbf800001,
+ 0xbfc00000, 0xc1200000, 0xff7fffff, 0xff800000, 0xffd23456, 0xffc00000, 0xff923456, 0xff800001,
+ 0x80123456, 0x807fffff, 0x80000001,
+];
+
+pub const VALUES_64: &[u64] = &[
+ // Simple values.
+ 0x0000000000000000, // 0.0
+ 0x0010000000000000, // The smallest normal value.
+ 0x3fdfffffffffffff, // The value just below 0.5.
+ 0x3fe0000000000000, // 0.5
+ 0x3fe0000000000001, // The value just above 0.5.
+ 0x3fefffffffffffff, // The value just below 1.0.
+ 0x3ff0000000000000, // 1.0
+ 0x3ff0000000000001, // The value just above 1.0.
+ 0x3ff8000000000000, // 1.5
+ 0x4024000000000000, // 10
+ 0x7fefffffffffffff, // The largest finite value.
+ 0x7ff0000000000000, // Infinity.
+ // NaNs.
+ // - Quiet NaNs
+ 0x7ff923456789abcd,
+ 0x7ff8000000000000,
+ // - Signalling NaNs
+ 0x7ff123456789abcd,
+    0x7ff0000000000001,
+ // Subnormals.
+ // - A recognisable bit pattern.
+ 0x000123456789abcd,
+ // - The largest subnormal value.
+ 0x000fffffffffffff,
+ // - The smallest subnormal value.
+ 0x0000000000000001,
+ // The same values again, but negated.
+ 0x8000000000000000,
+ 0x8010000000000000,
+ 0xbfdfffffffffffff,
+ 0xbfe0000000000000,
+ 0xbfe0000000000001,
+ 0xbfefffffffffffff,
+ 0xbff0000000000000,
+ 0xbff0000000000001,
+ 0xbff8000000000000,
+ 0xc024000000000000,
+ 0xffefffffffffffff,
+ 0xfff0000000000000,
+ 0xfff923456789abcd,
+ 0xfff8000000000000,
+ 0xfff123456789abcd,
+    0xfff0000000000001,
+ 0x800123456789abcd,
+ 0x800fffffffffffff,
+ 0x8000000000000001,
+];
diff --git a/library/stdarch/crates/simd-test-macro/Cargo.toml b/library/stdarch/crates/simd-test-macro/Cargo.toml
new file mode 100644
index 000000000..c3ecf981e
--- /dev/null
+++ b/library/stdarch/crates/simd-test-macro/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "simd-test-macro"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+edition = "2018"
+
+[lib]
+proc-macro = true
+test = false
+
+[dependencies]
+proc-macro2 = "1.0"
+quote = "1.0"
diff --git a/library/stdarch/crates/simd-test-macro/src/lib.rs b/library/stdarch/crates/simd-test-macro/src/lib.rs
new file mode 100644
index 000000000..9d81a4c5e
--- /dev/null
+++ b/library/stdarch/crates/simd-test-macro/src/lib.rs
@@ -0,0 +1,151 @@
+//! Implementation of the `#[simd_test]` macro
+//!
+//! This macro expands to a `#[test]` function which checks at run time
+//! whether the local machine supports the required target features before
+//! calling the inner test function.
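+//!
+//! Illustrative usage (an editor's sketch, not taken from the upstream docs);
+//! the annotated function is `unsafe` because the expansion applies
+//! `#[target_feature]` to it:
+//!
+//! ```ignore
+//! #[simd_test(enable = "avx2")]
+//! unsafe fn my_avx2_test() {
+//!     // Runs only when AVX2 is detected at run time; otherwise the
+//!     // generated #[test] skips via `stdarch_test::assert_skip_test_ok`.
+//! }
+//! ```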
+#![deny(rust_2018_idioms)]
+
+#[macro_use]
+extern crate quote;
+
+use proc_macro2::{Delimiter, Ident, Literal, Span, TokenStream, TokenTree};
+use quote::ToTokens;
+use std::env;
+
+fn string(s: &str) -> TokenTree {
+ Literal::string(s).into()
+}
+
+#[proc_macro_attribute]
+pub fn simd_test(
+ attr: proc_macro::TokenStream,
+ item: proc_macro::TokenStream,
+) -> proc_macro::TokenStream {
+ let tokens = TokenStream::from(attr).into_iter().collect::<Vec<_>>();
+ if tokens.len() != 3 {
+ panic!("expected #[simd_test(enable = \"feature\")]");
+ }
+ match &tokens[0] {
+ TokenTree::Ident(tt) if *tt == "enable" => {}
+ _ => panic!("expected #[simd_test(enable = \"feature\")]"),
+ }
+ match &tokens[1] {
+ TokenTree::Punct(tt) if tt.as_char() == '=' => {}
+ _ => panic!("expected #[simd_test(enable = \"feature\")]"),
+ }
+ let enable_feature = match &tokens[2] {
+ TokenTree::Literal(tt) => tt.to_string(),
+ _ => panic!("expected #[simd_test(enable = \"feature\")]"),
+ };
+ let enable_feature = enable_feature.trim_start_matches('"').trim_end_matches('"');
+ let target_features: Vec<String> = enable_feature
+ .replace('+', "")
+ .split(',')
+ .map(String::from)
+ .collect();
+
+ let enable_feature = string(enable_feature);
+ let item = TokenStream::from(item);
+ let name = find_name(item.clone());
+
+ let name: TokenStream = name
+ .to_string()
+ .parse()
+ .unwrap_or_else(|_| panic!("failed to parse name: {}", name.to_string()));
+
+ let target = env::var("TARGET").expect(
+ "TARGET environment variable should be set for rustc (e.g. TARGET=x86_64-apple-darwin cargo test)"
+ );
+ let mut force_test = false;
+ let macro_test = match target
+ .split('-')
+ .next()
+ .unwrap_or_else(|| panic!("target triple contained no \"-\": {}", target))
+ {
+ "i686" | "x86_64" | "i586" => "is_x86_feature_detected",
+ "arm" | "armv7" => "is_arm_feature_detected",
+ "aarch64" => "is_aarch64_feature_detected",
+ maybe_riscv if maybe_riscv.starts_with("riscv") => "is_riscv_feature_detected",
+ "powerpc" | "powerpcle" => "is_powerpc_feature_detected",
+ "powerpc64" | "powerpc64le" => "is_powerpc64_feature_detected",
+ "mips" | "mipsel" | "mipsisa32r6" | "mipsisa32r6el" => {
+ // FIXME:
+ // On MIPS CI run-time feature detection always returns false due
+ // to this qemu bug: https://bugs.launchpad.net/qemu/+bug/1754372
+ //
+ // This is a workaround to force the MIPS tests to always run on
+ // CI.
+ force_test = true;
+ "is_mips_feature_detected"
+ }
+ "mips64" | "mips64el" | "mipsisa64r6" | "mipsisa64r6el" => {
+ // FIXME: see above
+ force_test = true;
+ "is_mips64_feature_detected"
+ }
+ t => panic!("unknown target: {}", t),
+ };
+ let macro_test = Ident::new(macro_test, Span::call_site());
+
+ let mut cfg_target_features = TokenStream::new();
+ for feature in target_features {
+ let q = quote_spanned! {
+ proc_macro2::Span::call_site() =>
+ #macro_test!(#feature) &&
+ };
+ q.to_tokens(&mut cfg_target_features);
+ }
+ let q = quote! { true };
+ q.to_tokens(&mut cfg_target_features);
+
+ let test_norun = std::env::var("STDSIMD_TEST_NORUN").is_ok();
+ let maybe_ignore = if test_norun {
+ quote! { #[ignore] }
+ } else {
+ TokenStream::new()
+ };
+
+ let ret: TokenStream = quote_spanned! {
+ proc_macro2::Span::call_site() =>
+ #[allow(non_snake_case)]
+ #[test]
+ #maybe_ignore
+ fn #name() {
+ if #force_test | (#cfg_target_features) {
+ let v = unsafe { #name() };
+ return v;
+ } else {
+ ::stdarch_test::assert_skip_test_ok(stringify!(#name));
+ }
+
+ #[target_feature(enable = #enable_feature)]
+ #item
+ }
+ };
+ ret.into()
+}
+
+fn find_name(item: TokenStream) -> Ident {
+ let mut tokens = item.into_iter();
+ while let Some(tok) = tokens.next() {
+ if let TokenTree::Ident(word) = tok {
+ if word == "fn" {
+ break;
+ }
+ }
+ }
+
+ fn get_ident(tt: TokenTree) -> Option<Ident> {
+ match tt {
+ TokenTree::Ident(i) => Some(i),
+ TokenTree::Group(g) if g.delimiter() == Delimiter::None => {
+ get_ident(g.stream().into_iter().next()?)
+ }
+ _ => None,
+ }
+ }
+
+ tokens
+ .next()
+ .and_then(get_ident)
+ .expect("failed to find function name")
+}
diff --git a/library/stdarch/crates/std_detect/Cargo.toml b/library/stdarch/crates/std_detect/Cargo.toml
new file mode 100644
index 000000000..1ca0d9c5d
--- /dev/null
+++ b/library/stdarch/crates/std_detect/Cargo.toml
@@ -0,0 +1,45 @@
+[package]
+name = "std_detect"
+version = "0.1.5"
+authors = [
+ "Alex Crichton <alex@alexcrichton.com>",
+ "Andrew Gallant <jamslam@gmail.com>",
+ "Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>",
+]
+description = "`std::detect` - Rust's standard library run-time CPU feature detection."
+homepage = "https://github.com/rust-lang/stdarch"
+repository = "https://github.com/rust-lang/stdarch"
+readme = "README.md"
+keywords = ["std", "run-time", "feature", "detection"]
+categories = ["hardware-support"]
+license = "MIT OR Apache-2.0"
+edition = "2018"
+
+[badges]
+is-it-maintained-issue-resolution = { repository = "rust-lang/stdarch" }
+is-it-maintained-open-issues = { repository = "rust-lang/stdarch" }
+maintenance = { status = "experimental" }
+
+[dependencies]
+libc = { version = "0.2", optional = true, default-features = false }
+cfg-if = "0.1.10"
+
+# When built as part of libstd
+core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
+compiler_builtins = { version = "0.1.2", optional = true }
+alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }
+
+[dev-dependencies]
+auxv = "0.3.3"
+cupid = "0.6.0"
+
+[features]
+default = [ "std_detect_dlsym_getauxval", "std_detect_file_io" ]
+std_detect_file_io = [ "libc" ]
+std_detect_dlsym_getauxval = [ "libc" ]
+std_detect_env_override = [ "libc" ]
+rustc-dep-of-std = [
+ "core",
+ "compiler_builtins",
+ "alloc",
+]
diff --git a/library/stdarch/crates/std_detect/LICENSE-APACHE b/library/stdarch/crates/std_detect/LICENSE-APACHE
new file mode 100644
index 000000000..16fe87b06
--- /dev/null
+++ b/library/stdarch/crates/std_detect/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/library/stdarch/crates/std_detect/LICENSE-MIT b/library/stdarch/crates/std_detect/LICENSE-MIT
new file mode 100644
index 000000000..52d82415d
--- /dev/null
+++ b/library/stdarch/crates/std_detect/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2017 The Rust Project Developers
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/library/stdarch/crates/std_detect/README.md b/library/stdarch/crates/std_detect/README.md
new file mode 100644
index 000000000..bea7d941a
--- /dev/null
+++ b/library/stdarch/crates/std_detect/README.md
@@ -0,0 +1,73 @@
+`std::detect` - Rust's standard library run-time CPU feature detection
+=======
+
+The private `std::detect` module implements run-time feature detection in Rust's
+standard library. This allows detecting whether the CPU the binary runs on
+supports certain features, like SIMD instructions.
+
+# Usage
+
+`std::detect` APIs are available as part of `libstd`. Prefer using them via the
+standard library rather than through this crate. Unstable features of
+`std::detect` are available on nightly Rust behind the `feature(stdsimd)`
+feature-gate.
+
+If you need run-time feature detection in `#[no_std]` environments, the Rust
+`core` library cannot help you. By design, Rust `core` is platform independent,
+but performing run-time feature detection requires a certain level of
+cooperation from the platform.
+
+You can instead manually include `std_detect` as a dependency to get run-time
+feature detection support similar to the one offered by Rust's standard
+library. We intend to make `std_detect` more flexible and configurable in this
+regard to better serve the needs of `#[no_std]` targets.
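+
+As a minimal sketch (not part of the original README), the stable detection
+macros exposed through `libstd` are used like this:
+
+```rust
+// `is_x86_feature_detected!` is only defined on x86/x86_64 targets.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn describe_cpu() {
+    if is_x86_feature_detected!("avx2") {
+        println!("AVX2 is available on this CPU");
+    }
+}
+```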
+
+# Features
+
+* `std_detect_dlsym_getauxval` (enabled by default, requires `libc`): Enable to
+use `libc::dlsym` to query whether [`getauxval`] is linked into the binary. When
+this is not the case, this feature allows other fallback methods to perform
+run-time feature detection. When this feature is disabled, `std_detect` assumes
+that [`getauxval`] is linked to the binary. If that is not the case, the
+behavior is undefined.
+
+* `std_detect_file_io` (enabled by default, requires `std`): Enable to perform run-time feature
+detection using file APIs (e.g. `/proc/cpuinfo`) if other more performant
+methods fail. This feature requires `libstd` as a dependency, preventing the
+crate from being used in applications in which `std` is not available.
+
+[`getauxval`]: http://man7.org/linux/man-pages/man3/getauxval.3.html
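+
+For instance, a consumer that wants to avoid the file I/O fallback could
+depend on the crate like this (an illustrative snippet, not from the original
+README):
+
+```toml
+[dependencies]
+std_detect = { version = "0.1", default-features = false, features = ["std_detect_dlsym_getauxval"] }
+```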
+
+# Platform support
+
+* All `x86`/`x86_64` targets are supported on all platforms by querying the
+ `cpuid` instruction directly for the features supported by the hardware and
+  the operating system. `std_detect` assumes that the binary is a user-space
+ application. If you need raw support for querying `cpuid`, consider using the
+ [`cupid`](https://crates.io/crates/cupid) crate.
+
+* Linux:
+  * `arm{32,64}`, `mips{32,64}{,el}`, `powerpc{32,64}{,le}`: `std_detect`
+ supports these on Linux by querying ELF auxiliary vectors (using `getauxval`
+ when available), and if that fails, by querying `/proc/cpuinfo`.
+ * `arm64`: partial support for doing run-time feature detection by directly
+ querying `mrs` is implemented for Linux >= 4.11, but not enabled by default.
+
+* FreeBSD:
+ * `arm64`: run-time feature detection is implemented by directly querying `mrs`.
+
+# License
+
+This project is licensed under either of
+
+ * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
+ http://www.apache.org/licenses/LICENSE-2.0)
+ * MIT license ([LICENSE-MIT](LICENSE-MIT) or
+ http://opensource.org/licenses/MIT)
+
+at your option.
+
+# Contribution
+
+Unless you explicitly state otherwise, any contribution intentionally submitted
+for inclusion in `std_detect` by you, as defined in the Apache-2.0 license,
+shall be dual licensed as above, without any additional terms or conditions.
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs
new file mode 100644
index 000000000..f32f961ae
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/aarch64.rs
@@ -0,0 +1,152 @@
+//! AArch64 run-time features.
+
+features! {
+ @TARGET: aarch64;
+ @CFG: target_arch = "aarch64";
+ @MACRO_NAME: is_aarch64_feature_detected;
+ @MACRO_ATTRS:
+ /// This macro tests, at runtime, whether an `aarch64` feature is enabled on aarch64 platforms.
+    /// Currently most features are only supported on Linux-based platforms.
+ ///
+ /// This macro takes one argument which is a string literal of the feature being tested for.
+ /// The feature names are mostly taken from their FEAT_* definitions in the [ARM Architecture
+ /// Reference Manual][docs].
+ ///
+ /// ## Supported arguments
+ ///
+    /// * `"asimd"` or `"neon"` - FEAT_AdvSIMD
+ /// * `"pmull"` - FEAT_PMULL
+ /// * `"fp"` - FEAT_FP
+ /// * `"fp16"` - FEAT_FP16
+ /// * `"sve"` - FEAT_SVE
+ /// * `"crc"` - FEAT_CRC
+ /// * `"lse"` - FEAT_LSE
+ /// * `"lse2"` - FEAT_LSE2
+ /// * `"rdm"` - FEAT_RDM
+ /// * `"rcpc"` - FEAT_LRCPC
+ /// * `"rcpc2"` - FEAT_LRCPC2
+ /// * `"dotprod"` - FEAT_DotProd
+ /// * `"tme"` - FEAT_TME
+ /// * `"fhm"` - FEAT_FHM
+ /// * `"dit"` - FEAT_DIT
+ /// * `"flagm"` - FEAT_FLAGM
+ /// * `"ssbs"` - FEAT_SSBS
+ /// * `"sb"` - FEAT_SB
+ /// * `"paca"` - FEAT_PAuth (address authentication)
+    /// * `"pacg"` - FEAT_PAuth (generic authentication)
+ /// * `"dpb"` - FEAT_DPB
+ /// * `"dpb2"` - FEAT_DPB2
+ /// * `"sve2"` - FEAT_SVE2
+ /// * `"sve2-aes"` - FEAT_SVE2_AES
+ /// * `"sve2-sm4"` - FEAT_SVE2_SM4
+ /// * `"sve2-sha3"` - FEAT_SVE2_SHA3
+ /// * `"sve2-bitperm"` - FEAT_SVE2_BitPerm
+ /// * `"frintts"` - FEAT_FRINTTS
+ /// * `"i8mm"` - FEAT_I8MM
+ /// * `"f32mm"` - FEAT_F32MM
+ /// * `"f64mm"` - FEAT_F64MM
+ /// * `"bf16"` - FEAT_BF16
+ /// * `"rand"` - FEAT_RNG
+ /// * `"bti"` - FEAT_BTI
+ /// * `"mte"` - FEAT_MTE
+ /// * `"jsconv"` - FEAT_JSCVT
+ /// * `"fcma"` - FEAT_FCMA
+ /// * `"aes"` - FEAT_AES
+ /// * `"sha2"` - FEAT_SHA1 & FEAT_SHA256
+ /// * `"sha3"` - FEAT_SHA512 & FEAT_SHA3
+ /// * `"sm4"` - FEAT_SM3 & FEAT_SM4
+ ///
+ /// [docs]: https://developer.arm.com/documentation/ddi0487/latest
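+    ///
+    /// An illustrative sketch (not from the upstream docs); the two helper
+    /// functions are placeholders:
+    ///
+    /// ```ignore
+    /// if is_aarch64_feature_detected!("neon") {
+    ///     // SAFETY: guarded by the run-time check above.
+    ///     unsafe { neon_accelerated_path() }
+    /// } else {
+    ///     scalar_fallback()
+    /// }
+    /// ```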
+ #[stable(feature = "simd_aarch64", since = "1.60.0")]
+ @BIND_FEATURE_NAME: "asimd"; "neon";
+ @NO_RUNTIME_DETECTION: "ras";
+ @NO_RUNTIME_DETECTION: "v8.1a";
+ @NO_RUNTIME_DETECTION: "v8.2a";
+ @NO_RUNTIME_DETECTION: "v8.3a";
+ @NO_RUNTIME_DETECTION: "v8.4a";
+ @NO_RUNTIME_DETECTION: "v8.5a";
+ @NO_RUNTIME_DETECTION: "v8.6a";
+ @NO_RUNTIME_DETECTION: "v8.7a";
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] asimd: "neon";
+ /// FEAT_AdvSIMD (Advanced SIMD/NEON)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] pmull: "pmull";
+ /// FEAT_PMULL (Polynomial Multiply)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] fp: "fp";
+ /// FEAT_FP (Floating point support)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] fp16: "fp16";
+ /// FEAT_FP16 (Half-float support)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve: "sve";
+ /// FEAT_SVE (Scalable Vector Extension)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] crc: "crc";
+ /// FEAT_CRC32 (Cyclic Redundancy Check)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] lse: "lse";
+ /// FEAT_LSE (Large System Extension - atomics)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] lse2: "lse2";
+ /// FEAT_LSE2 (unaligned and register-pair atomics)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] rdm: "rdm";
+ /// FEAT_RDM (Rounding Doubling Multiply - ASIMDRDM)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] rcpc: "rcpc";
+ /// FEAT_LRCPC (Release consistent Processor consistent)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] rcpc2: "rcpc2";
+ /// FEAT_LRCPC2 (RCPC with immediate offsets)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] dotprod: "dotprod";
+ /// FEAT_DotProd (Vector Dot-Product - ASIMDDP)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] tme: "tme";
+ /// FEAT_TME (Transactional Memory Extensions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] fhm: "fhm";
+ /// FEAT_FHM (fp16 multiplication instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] dit: "dit";
+ /// FEAT_DIT (Data Independent Timing instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] flagm: "flagm";
+ /// FEAT_FLAGM (flag manipulation instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] ssbs: "ssbs";
+ /// FEAT_SSBS (speculative store bypass safe)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sb: "sb";
+ /// FEAT_SB (speculation barrier)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] paca: "paca";
+ /// FEAT_PAuth (address authentication)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] pacg: "pacg";
+ /// FEAT_PAuth (generic authentication)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] dpb: "dpb";
+ /// FEAT_DPB (aka dcpop - data cache clean to point of persistence)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] dpb2: "dpb2";
+ /// FEAT_DPB2 (aka dcpodp - data cache clean to point of deep persistence)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2: "sve2";
+ /// FEAT_SVE2 (Scalable Vector Extension 2)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2_aes: "sve2-aes";
+ /// FEAT_SVE_AES (SVE2 AES crypto)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2_sm4: "sve2-sm4";
+ /// FEAT_SVE_SM4 (SVE2 SM4 crypto)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2_sha3: "sve2-sha3";
+ /// FEAT_SVE_SHA3 (SVE2 SHA3 crypto)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sve2_bitperm: "sve2-bitperm";
+ /// FEAT_SVE_BitPerm (SVE2 bit permutation instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] frintts: "frintts";
+ /// FEAT_FRINTTS (float to integer rounding instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] i8mm: "i8mm";
+ /// FEAT_I8MM (integer matrix multiplication, plus ASIMD support)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] f32mm: "f32mm";
+ /// FEAT_F32MM (single-precision matrix multiplication)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] f64mm: "f64mm";
+ /// FEAT_F64MM (double-precision matrix multiplication)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] bf16: "bf16";
+ /// FEAT_BF16 (BFloat16 type, plus MM instructions, plus ASIMD support)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] rand: "rand";
+ /// FEAT_RNG (Random Number Generator)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] bti: "bti";
+ /// FEAT_BTI (Branch Target Identification)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] mte: "mte";
+ /// FEAT_MTE (Memory Tagging Extension)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] jsconv: "jsconv";
+ /// FEAT_JSCVT (JavaScript float conversion instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] fcma: "fcma";
+ /// FEAT_FCMA (float complex number operations)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] aes: "aes";
+ /// FEAT_AES (AES instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sha2: "sha2";
+ /// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sha3: "sha3";
+ /// FEAT_SHA512 & FEAT_SHA3 (SHA2-512 & SHA3 instructions)
+ @FEATURE: #[stable(feature = "simd_aarch64", since = "1.60.0")] sm4: "sm4";
+ /// FEAT_SM3 & FEAT_SM4 (SM3 & SM4 instructions)
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/arm.rs b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs
new file mode 100644
index 000000000..897dc314c
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs
@@ -0,0 +1,28 @@
+//! Run-time feature detection on ARM AArch32.
+
+features! {
+ @TARGET: arm;
+ @CFG: target_arch = "arm";
+ @MACRO_NAME: is_arm_feature_detected;
+ @MACRO_ATTRS:
+ /// Checks if `arm` feature is enabled.
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ @NO_RUNTIME_DETECTION: "v7";
+ @NO_RUNTIME_DETECTION: "vfp2";
+ @NO_RUNTIME_DETECTION: "vfp3";
+ @NO_RUNTIME_DETECTION: "vfp4";
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] neon: "neon";
+ /// ARM Advanced SIMD (NEON) - Aarch32
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] pmull: "pmull";
+ /// Polynomial Multiply
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] crc: "crc";
+ /// CRC32 (Cyclic Redundancy Check)
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] crypto: "crypto";
+ /// Crypto: AES + PMULL + SHA1 + SHA256. Prefer using the individual features where possible.
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] aes: "aes";
+ /// FEAT_AES (AES instructions)
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] sha2: "sha2";
+ /// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions)
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] i8mm: "i8mm";
+ /// FEAT_I8MM
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/mips.rs b/library/stdarch/crates/std_detect/src/detect/arch/mips.rs
new file mode 100644
index 000000000..ae27d0093
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/mips.rs
@@ -0,0 +1,12 @@
+//! Run-time feature detection on MIPS.
+
+features! {
+ @TARGET: mips;
+ @CFG: target_arch = "mips";
+ @MACRO_NAME: is_mips_feature_detected;
+ @MACRO_ATTRS:
+ /// Checks if `mips` feature is enabled.
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] msa: "msa";
+ /// MIPS SIMD Architecture (MSA)
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs b/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs
new file mode 100644
index 000000000..7182ec2da
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/mips64.rs
@@ -0,0 +1,12 @@
+//! Run-time feature detection on MIPS64.
+
+features! {
+ @TARGET: mips64;
+ @CFG: target_arch = "mips64";
+ @MACRO_NAME: is_mips64_feature_detected;
+ @MACRO_ATTRS:
+ /// Checks if `mips64` feature is enabled.
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] msa: "msa";
+ /// MIPS SIMD Architecture (MSA)
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/mod.rs b/library/stdarch/crates/std_detect/src/detect/arch/mod.rs
new file mode 100644
index 000000000..81a1f23e8
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/mod.rs
@@ -0,0 +1,56 @@
+#![allow(dead_code)]
+
+use cfg_if::cfg_if;
+
+// Export the macros for all supported architectures.
+#[macro_use]
+mod x86;
+#[macro_use]
+mod arm;
+#[macro_use]
+mod aarch64;
+#[macro_use]
+mod riscv;
+#[macro_use]
+mod powerpc;
+#[macro_use]
+mod powerpc64;
+#[macro_use]
+mod mips;
+#[macro_use]
+mod mips64;
+
+cfg_if! {
+ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ pub use x86::*;
+ } else if #[cfg(target_arch = "arm")] {
+ pub use arm::*;
+ } else if #[cfg(target_arch = "aarch64")] {
+ pub use aarch64::*;
+ } else if #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] {
+ pub use riscv::*;
+ } else if #[cfg(target_arch = "powerpc")] {
+ pub use powerpc::*;
+ } else if #[cfg(target_arch = "powerpc64")] {
+ pub use powerpc64::*;
+ } else if #[cfg(target_arch = "mips")] {
+ pub use mips::*;
+ } else if #[cfg(target_arch = "mips64")] {
+ pub use mips64::*;
+ } else {
+ // Unimplemented architecture:
+ #[doc(hidden)]
+ pub(crate) enum Feature {
+ Null
+ }
+ #[doc(hidden)]
+ pub mod __is_feature_detected {}
+
+ impl Feature {
+ #[doc(hidden)]
+ pub(crate) fn from_str(_s: &str) -> Result<Feature, ()> { Err(()) }
+ #[doc(hidden)]
+ pub(crate) fn to_str(self) -> &'static str { "" }
+ }
+ }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs b/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs
new file mode 100644
index 000000000..d135cd95d
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/powerpc.rs
@@ -0,0 +1,16 @@
+//! Run-time feature detection on PowerPC.
+
+features! {
+ @TARGET: powerpc;
+ @CFG: target_arch = "powerpc";
+ @MACRO_NAME: is_powerpc_feature_detected;
+ @MACRO_ATTRS:
+ /// Checks if `powerpc` feature is enabled.
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] altivec: "altivec";
+ /// Altivec
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] vsx: "vsx";
+ /// VSX
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] power8: "power8";
+ /// Power8
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs b/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs
new file mode 100644
index 000000000..773afd6ce
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/powerpc64.rs
@@ -0,0 +1,16 @@
+//! Run-time feature detection on PowerPC64.
+
+features! {
+ @TARGET: powerpc64;
+ @CFG: target_arch = "powerpc64";
+ @MACRO_NAME: is_powerpc64_feature_detected;
+ @MACRO_ATTRS:
+    /// Checks if `powerpc64` feature is enabled.
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] altivec: "altivec";
+ /// Altivec
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] vsx: "vsx";
+ /// VSX
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] power8: "power8";
+ /// Power8
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/riscv.rs b/library/stdarch/crates/std_detect/src/detect/arch/riscv.rs
new file mode 100644
index 000000000..5ea36e7c1
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/riscv.rs
@@ -0,0 +1,206 @@
+//! Run-time feature detection on RISC-V.
+
+features! {
+ @TARGET: riscv;
+ @CFG: any(target_arch = "riscv32", target_arch = "riscv64");
+ @MACRO_NAME: is_riscv_feature_detected;
+ @MACRO_ATTRS:
+ /// A macro to test at *runtime* whether instruction sets are available on
+ /// RISC-V platforms.
+ ///
+    /// The RISC-V standard defines base instruction sets and extension sets.
+    /// The base sets are RV32I, RV64I, RV32E or RV128I. Any RISC-V platform
+    /// must support one base set and may support multiple extension sets.
+ ///
+    /// Any RISC-V standard instruction set can be in one of three states:
+    /// ratified, frozen or draft. The version and status of the current standard
+    /// instruction sets can be found in the preface section of the [ISA manual].
+ ///
+    /// Platforms may define and support their own custom instruction sets with
+    /// the ISA prefix X. These sets are highly platform specific and should be
+    /// detected with their own platform support crates.
+ ///
+ /// # Unprivileged Specification
+ ///
+ /// The supported ratified RISC-V instruction sets are as follows:
+ ///
+ /// * RV32I: `"rv32i"`
+ /// * Zifencei: `"zifencei"`
+ /// * Zihintpause: `"zihintpause"`
+ /// * RV64I: `"rv64i"`
+ /// * M: `"m"`
+ /// * A: `"a"`
+ /// * Zicsr: `"zicsr"`
+ /// * Zicntr: `"zicntr"`
+ /// * Zihpm: `"zihpm"`
+ /// * F: `"f"`
+ /// * D: `"d"`
+ /// * Q: `"q"`
+ /// * C: `"c"`
+ ///
+    /// There are also bases and extensions marked as standard instruction sets,
+    /// but they are in the frozen or draft state. These instruction sets are also
+    /// reserved by this macro and can be detected on future platforms.
+ ///
+ /// Frozen RISC-V instruction sets:
+ ///
+ /// * Zfinx: `"zfinx"`
+ /// * Zdinx: `"zdinx"`
+ /// * Zhinx: `"zhinx"`
+ /// * Zhinxmin: `"zhinxmin"`
+ /// * Ztso: `"ztso"`
+ ///
+ /// Draft RISC-V instruction sets:
+ ///
+ /// * RV32E: `"rv32e"`
+ /// * RV128I: `"rv128i"`
+ /// * Zfh: `"zfh"`
+ /// * Zfhmin: `"zfhmin"`
+ /// * B: `"b"`
+ /// * J: `"j"`
+ /// * P: `"p"`
+ /// * V: `"v"`
+ /// * Zam: `"zam"`
+ ///
+ /// Defined by Privileged Specification:
+ ///
+ /// * Supervisor: `"s"`
+ /// * Svnapot: `"svnapot"`
+ /// * Svpbmt: `"svpbmt"`
+ /// * Svinval: `"svinval"`
+ /// * Hypervisor: `"h"`
+ ///
+ /// # RISC-V Bit-Manipulation ISA-extensions
+ ///
+    /// That specification defines the following extensions:
+ ///
+ /// * Zba: `"zba"`
+ /// * Zbb: `"zbb"`
+ /// * Zbc: `"zbc"`
+ /// * Zbs: `"zbs"`
+ ///
+ /// # RISC-V Cryptography Extensions
+ ///
+ /// These extensions are defined in Volume I, Scalar & Entropy Source
+ /// Instructions:
+ ///
+ /// * Zbkb: `"zbkb"`
+ /// * Zbkc: `"zbkc"`
+ /// * Zbkx: `"zbkx"`
+ /// * Zknd: `"zknd"`
+ /// * Zkne: `"zkne"`
+ /// * Zknh: `"zknh"`
+ /// * Zksed: `"zksed"`
+ /// * Zksh: `"zksh"`
+ /// * Zkr: `"zkr"`
+ /// * Zkn: `"zkn"`
+ /// * Zks: `"zks"`
+ /// * Zk: `"zk"`
+ /// * Zkt: `"zkt"`
+ ///
+ /// [ISA manual]: https://github.com/riscv/riscv-isa-manual/
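+    ///
+    /// Extensions are tested one name at a time; an illustrative sketch (not
+    /// from the upstream docs):
+    ///
+    /// ```ignore
+    /// if is_riscv_feature_detected!("m") && is_riscv_feature_detected!("c") {
+    ///     // Integer multiply/divide and compressed instructions available.
+    /// }
+    /// ```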
+ #[unstable(feature = "stdsimd", issue = "27731")]
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] rv32i: "rv32i";
+ /// RV32I Base Integer Instruction Set
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zifencei: "zifencei";
+ /// "Zifencei" Instruction-Fetch Fence
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zihintpause: "zihintpause";
+ /// "Zihintpause" Pause Hint
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] rv64i: "rv64i";
+ /// RV64I Base Integer Instruction Set
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] m: "m";
+ /// "M" Standard Extension for Integer Multiplication and Division
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] a: "a";
+ /// "A" Standard Extension for Atomic Instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zicsr: "zicsr";
+ /// "Zicsr", Control and Status Register (CSR) Instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zicntr: "zicntr";
+ /// "Zicntr", Standard Extension for Base Counters and Timers
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zihpm: "zihpm";
+ /// "Zihpm", Standard Extension for Hardware Performance Counters
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] f: "f";
+ /// "F" Standard Extension for Single-Precision Floating-Point
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] d: "d";
+ /// "D" Standard Extension for Double-Precision Floating-Point
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] q: "q";
+ /// "Q" Standard Extension for Quad-Precision Floating-Point
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] c: "c";
+ /// "C" Standard Extension for Compressed Instructions
+
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zfinx: "zfinx";
+ /// "Zfinx" Standard Extension for Single-Precision Floating-Point in Integer Registers
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zdinx: "zdinx";
+ /// "Zdinx" Standard Extension for Double-Precision Floating-Point in Integer Registers
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zhinx: "zhinx";
+ /// "Zhinx" Standard Extension for Half-Precision Floating-Point in Integer Registers
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zhinxmin: "zhinxmin";
+ /// "Zhinxmin" Standard Extension for Minimal Half-Precision Floating-Point in Integer Registers
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] ztso: "ztso";
+ /// "Ztso" Standard Extension for Total Store Ordering
+
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] rv32e: "rv32e";
+ /// RV32E Base Integer Instruction Set
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] rv128i: "rv128i";
+ /// RV128I Base Integer Instruction Set
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zfh: "zfh";
+ /// "Zfh" Standard Extension for 16-Bit Half-Precision Floating-Point
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zfhmin: "zfhmin";
+ /// "Zfhmin" Standard Extension for Minimal Half-Precision Floating-Point Support
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] b: "b";
+ /// "B" Standard Extension for Bit Manipulation
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] j: "j";
+ /// "J" Standard Extension for Dynamically Translated Languages
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] p: "p";
+ /// "P" Standard Extension for Packed-SIMD Instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] v: "v";
+ /// "V" Standard Extension for Vector Operations
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zam: "zam";
+ /// "Zam" Standard Extension for Misaligned Atomics
+
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] s: "s";
+ /// Supervisor-Level ISA
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] svnapot: "svnapot";
+ /// "Svnapot" Standard Extension for NAPOT Translation Contiguity
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] svpbmt: "svpbmt";
+ /// "Svpbmt" Standard Extension for Page-Based Memory Types
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] svinval: "svinval";
+ /// "Svinval" Standard Extension for Fine-Grained Address-Translation Cache Invalidation
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] h: "h";
+ /// Hypervisor Extension
+
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zba: "zba";
+ /// "Zba" Standard Extension for Address Generation Instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zbb: "zbb";
+ /// "Zbb" Standard Extension for Basic Bit-Manipulation
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zbc: "zbc";
+ /// "Zbc" Standard Extension for Carry-less Multiplication
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zbs: "zbs";
+ /// "Zbs" Standard Extension for Single-Bit instructions
+
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zbkb: "zbkb";
+ /// "Zbkb" Standard Extension for Bitmanip instructions for Cryptography
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zbkc: "zbkc";
+ /// "Zbkc" Standard Extension for Carry-less multiply instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zbkx: "zbkx";
+ /// "Zbkx" Standard Extension for Crossbar permutation instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zknd: "zknd";
+ /// "Zknd" Standard Extension for NIST Suite: AES Decryption
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zkne: "zkne";
+ /// "Zkne" Standard Extension for NIST Suite: AES Encryption
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zknh: "zknh";
+ /// "Zknh" Standard Extension for NIST Suite: Hash Function Instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zksed: "zksed";
+ /// "Zksed" Standard Extension for ShangMi Suite: SM4 Block Cipher Instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zksh: "zksh";
+ /// "Zksh" Standard Extension for ShangMi Suite: SM3 Hash Function Instructions
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zkr: "zkr";
+ /// "Zkr" Standard Extension for Entropy Source Extension
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zkn: "zkn";
+ /// "Zkn" Standard Extension for NIST Algorithm Suite
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zks: "zks";
+ /// "Zks" Standard Extension for ShangMi Algorithm Suite
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zk: "zk";
+ /// "Zk" Standard Extension for Standard scalar cryptography extension
+ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] zkt: "zkt";
+ /// "Zkt" Standard Extension for Data Independent Execution Latency
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/arch/x86.rs b/library/stdarch/crates/std_detect/src/detect/arch/x86.rs
new file mode 100644
index 000000000..893e1a887
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/arch/x86.rs
@@ -0,0 +1,197 @@
+//! This module implements minimal run-time feature detection for x86.
+//!
+//! The features are detected using the `detect_features` function below.
+//! This function uses the CPUID instruction to read the feature flags from the
+//! CPU and encodes them in a `usize` where each bit position represents
+//! whether a feature is available (bit is set) or unavailable (bit is cleared).
+//!
+//! The enum `Feature` is used to map bit positions to feature names, and the
+//! `__crate::detect::check_for!` macro is used to map string literals (e.g.,
+//! "avx") to these bit positions (e.g., `Feature::avx`).
+//!
+//! The run-time feature detection is performed by the
+//! `__crate::detect::check_for(Feature) -> bool` function. On its first call,
+//! this function queries the CPU for the available features and stores them
+//! in a global `AtomicUsize` variable. Subsequent queries are answered by just
+//! checking whether the feature bit in this global variable is set or cleared.
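+//!
+//! A conceptual sketch of that scheme (an editor's illustration, not part of
+//! the upstream module; the real cache lives in `crate::detect::cache`):
+//!
+//! ```ignore
+//! static FEATURES: AtomicUsize = AtomicUsize::new(0); // 0 = uninitialized
+//!
+//! fn check_for(bit: u32) -> bool {
+//!     let mut cached = FEATURES.load(Ordering::Relaxed);
+//!     if cached == 0 {
+//!         // First call: query CPUID once; `detect_features` also sets a
+//!         // sentinel bit so that an initialized cache is never zero.
+//!         cached = detect_features();
+//!         FEATURES.store(cached, Ordering::Relaxed);
+//!     }
+//!     cached & (1 << bit) != 0
+//! }
+//! ```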
+
+features! {
+ @TARGET: x86;
+ @CFG: any(target_arch = "x86", target_arch = "x86_64");
+ @MACRO_NAME: is_x86_feature_detected;
+ @MACRO_ATTRS:
+ /// A macro to test at *runtime* whether a CPU feature is available on
+ /// x86/x86-64 platforms.
+ ///
+ /// This macro is provided in the standard library and will detect at runtime
+ /// whether the specified CPU feature is detected. This does **not** resolve at
+ /// compile time unless the specified feature is already enabled for the entire
+ /// crate. Runtime detection currently relies mostly on the `cpuid` instruction.
+ ///
+ /// This macro only takes one argument which is a string literal of the feature
+ /// being tested for. The feature names supported are the lowercase versions of
+ /// the ones defined by Intel in [their documentation][docs].
+ ///
+ /// ## Supported arguments
+ ///
+ /// This macro supports the same names that `#[target_feature]` supports. Unlike
+ /// `#[target_feature]`, however, this macro does not support names separated
+ /// with a comma. Instead testing for multiple features must be done through
+ /// separate macro invocations for now.
+ ///
+ /// Supported arguments are:
+ ///
+ /// * `"aes"`
+ /// * `"pclmulqdq"`
+ /// * `"rdrand"`
+ /// * `"rdseed"`
+ /// * `"tsc"`
+ /// * `"mmx"`
+ /// * `"sse"`
+ /// * `"sse2"`
+ /// * `"sse3"`
+ /// * `"ssse3"`
+ /// * `"sse4.1"`
+ /// * `"sse4.2"`
+ /// * `"sse4a"`
+ /// * `"sha"`
+ /// * `"avx"`
+ /// * `"avx2"`
+ /// * `"avx512f"`
+ /// * `"avx512cd"`
+ /// * `"avx512er"`
+ /// * `"avx512pf"`
+ /// * `"avx512bw"`
+ /// * `"avx512dq"`
+ /// * `"avx512vl"`
+ /// * `"avx512ifma"`
+ /// * `"avx512vbmi"`
+ /// * `"avx512vpopcntdq"`
+ /// * `"avx512vbmi2"`
+ /// * `"avx512gfni"`
+ /// * `"avx512vaes"`
+ /// * `"avx512vpclmulqdq"`
+ /// * `"avx512vnni"`
+ /// * `"avx512bitalg"`
+ /// * `"avx512bf16"`
+ /// * `"avx512vp2intersect"`
+ /// * `"f16c"`
+ /// * `"fma"`
+ /// * `"bmi1"`
+ /// * `"bmi2"`
+ /// * `"abm"`
+ /// * `"lzcnt"`
+ /// * `"tbm"`
+ /// * `"popcnt"`
+ /// * `"fxsr"`
+ /// * `"xsave"`
+ /// * `"xsaveopt"`
+ /// * `"xsaves"`
+ /// * `"xsavec"`
+ /// * `"cmpxchg16b"`
+ /// * `"adx"`
+ /// * `"rtm"`
+ ///
+ /// [docs]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
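+    ///
+    /// For example, testing for both AVX2 and FMA takes two separate
+    /// invocations (an illustrative sketch, not from the upstream docs):
+    ///
+    /// ```ignore
+    /// if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
+    ///     // Both features are available at run time.
+    /// }
+    /// ```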
+ #[stable(feature = "simd_x86", since = "1.27.0")]
+ @BIND_FEATURE_NAME: "abm"; "lzcnt"; // abm is a synonym for lzcnt
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] aes: "aes";
+ /// AES (Advanced Encryption Standard New Instructions AES-NI)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] pclmulqdq: "pclmulqdq";
+ /// CLMUL (Carry-less Multiplication)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] rdrand: "rdrand";
+ /// RDRAND
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] rdseed: "rdseed";
+ /// RDSEED
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] tsc: "tsc";
+ /// TSC (Time Stamp Counter)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] mmx: "mmx";
+ /// MMX (MultiMedia eXtensions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse: "sse";
+ /// SSE (Streaming SIMD Extensions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse2: "sse2";
+ /// SSE2 (Streaming SIMD Extensions 2)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse3: "sse3";
+ /// SSE3 (Streaming SIMD Extensions 3)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] ssse3: "ssse3";
+ /// SSSE3 (Supplemental Streaming SIMD Extensions 3)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse4_1: "sse4.1";
+ /// SSE4.1 (Streaming SIMD Extensions 4.1)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse4_2: "sse4.2";
+ /// SSE4.2 (Streaming SIMD Extensions 4.2)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sse4a: "sse4a";
+ /// SSE4a (Streaming SIMD Extensions 4a)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] sha: "sha";
+ /// SHA
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx: "avx";
+ /// AVX (Advanced Vector Extensions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx2: "avx2";
+ /// AVX2 (Advanced Vector Extensions 2)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512f: "avx512f" ;
+ /// AVX-512 F (Foundation)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512cd: "avx512cd" ;
+ /// AVX-512 CD (Conflict Detection Instructions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512er: "avx512er";
+    /// AVX-512 ER (Exponential and Reciprocal Instructions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512pf: "avx512pf";
+ /// AVX-512 PF (Prefetch Instructions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512bw: "avx512bw";
+ /// AVX-512 BW (Byte and Word Instructions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512dq: "avx512dq";
+ /// AVX-512 DQ (Doubleword and Quadword)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vl: "avx512vl";
+ /// AVX-512 VL (Vector Length Extensions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512ifma: "avx512ifma";
+ /// AVX-512 IFMA (Integer Fused Multiply Add)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vbmi: "avx512vbmi";
+ /// AVX-512 VBMI (Vector Byte Manipulation Instructions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vpopcntdq: "avx512vpopcntdq";
+ /// AVX-512 VPOPCNTDQ (Vector Population Count Doubleword and
+ /// Quadword)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vbmi2: "avx512vbmi2";
+ /// AVX-512 VBMI2 (Additional byte, word, dword and qword capabilities)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512gfni: "avx512gfni";
+ /// AVX-512 GFNI (Galois Field New Instruction)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vaes: "avx512vaes";
+ /// AVX-512 VAES (Vector AES instruction)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vpclmulqdq: "avx512vpclmulqdq";
+ /// AVX-512 VPCLMULQDQ (Vector PCLMULQDQ instructions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vnni: "avx512vnni";
+ /// AVX-512 VNNI (Vector Neural Network Instructions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512bitalg: "avx512bitalg";
+ /// AVX-512 BITALG (Support for VPOPCNT\[B,W\] and VPSHUFBITQMB)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512bf16: "avx512bf16";
+ /// AVX-512 BF16 (BFLOAT16 instructions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vp2intersect: "avx512vp2intersect";
+ /// AVX-512 P2INTERSECT
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c";
+ /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma";
+ /// FMA (Fused Multiply Add)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] bmi1: "bmi1" ;
+ /// BMI1 (Bit Manipulation Instructions 1)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] bmi2: "bmi2" ;
+ /// BMI2 (Bit Manipulation Instructions 2)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] lzcnt: "lzcnt";
+ /// ABM (Advanced Bit Manipulation) / LZCNT (Leading Zero Count)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] tbm: "tbm";
+ /// TBM (Trailing Bit Manipulation)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] popcnt: "popcnt";
+ /// POPCNT (Population Count)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fxsr: "fxsr";
+ /// FXSR (Floating-point context fast save and restore)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] xsave: "xsave";
+ /// XSAVE (Save Processor Extended States)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] xsaveopt: "xsaveopt";
+ /// XSAVEOPT (Save Processor Extended States Optimized)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] xsaves: "xsaves";
+ /// XSAVES (Save Processor Extended States Supervisor)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] xsavec: "xsavec";
+ /// XSAVEC (Save Processor Extended States Compacted)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] cmpxchg16b: "cmpxchg16b";
+ /// CMPXCHG16B (16-byte compare-and-swap instruction)
+ @FEATURE: #[stable(feature = "simd_x86_adx", since = "1.33.0")] adx: "adx";
+ /// ADX (Intel Multi-Precision Add-Carry Instruction Extensions)
+ @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] rtm: "rtm";
+ /// RTM (Intel Restricted Transactional Memory)
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/bit.rs b/library/stdarch/crates/std_detect/src/detect/bit.rs
new file mode 100644
index 000000000..6f06c5523
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/bit.rs
@@ -0,0 +1,9 @@
+//! Bit manipulation utilities.
+
+/// Tests the `bit` of `x`.
+#[allow(dead_code)]
+#[inline]
+pub(crate) fn test(x: usize, bit: u32) -> bool {
+ debug_assert!(bit < usize::BITS, "bit index out-of-bounds");
+ x & (1 << bit) != 0
+}
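
A quick illustration of what `bit::test` computes (synthetic value; this snippet is editorial and not part of the patch):

    fn main() {
        let hwcap: usize = 1 << 12; // e.g. the NEON bit in 32-bit ARM's AT_HWCAP
        assert!(hwcap & (1 << 12) != 0); // what bit::test(hwcap, 12) evaluates
        assert!(hwcap & (1 << 11) == 0); // bit::test(hwcap, 11) would be false
    }
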
diff --git a/library/stdarch/crates/std_detect/src/detect/cache.rs b/library/stdarch/crates/std_detect/src/detect/cache.rs
new file mode 100644
index 000000000..d01a5ea24
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/cache.rs
@@ -0,0 +1,194 @@
+//! Caches run-time feature detection so that it only needs to be computed
+//! once.
+
+#![allow(dead_code)] // not used on all platforms
+
+use core::sync::atomic::Ordering;
+
+use core::sync::atomic::AtomicUsize;
+
+/// Sets the `bit` of `x`.
+#[inline]
+const fn set_bit(x: u64, bit: u32) -> u64 {
+ x | 1 << bit
+}
+
+/// Tests the `bit` of `x`.
+#[inline]
+const fn test_bit(x: u64, bit: u32) -> bool {
+ x & (1 << bit) != 0
+}
+
+/// Unsets the `bit` of `x`.
+#[inline]
+const fn unset_bit(x: u64, bit: u32) -> u64 {
+ x & !(1 << bit)
+}
+
+/// Maximum number of features that can be cached.
+const CACHE_CAPACITY: u32 = 62;
+
+/// This type is used to initialize the cache.
+#[derive(Copy, Clone)]
+pub(crate) struct Initializer(u64);
+
+#[allow(clippy::use_self)]
+impl Default for Initializer {
+ fn default() -> Self {
+ Initializer(0)
+ }
+}
+
+// NOTE: the `debug_assert!` calls below catch attempts to add more features
+// than fit in our cache.
+impl Initializer {
+ /// Tests the `bit` of the cache.
+ #[inline]
+ pub(crate) fn test(self, bit: u32) -> bool {
+ debug_assert!(
+ bit < CACHE_CAPACITY,
+ "too many features, time to increase the cache size!"
+ );
+ test_bit(self.0, bit)
+ }
+
+ /// Sets the `bit` of the cache.
+ #[inline]
+ pub(crate) fn set(&mut self, bit: u32) {
+ debug_assert!(
+ bit < CACHE_CAPACITY,
+ "too many features, time to increase the cache size!"
+ );
+ let v = self.0;
+ self.0 = set_bit(v, bit);
+ }
+
+ /// Unsets the `bit` of the cache.
+ #[inline]
+ pub(crate) fn unset(&mut self, bit: u32) {
+ debug_assert!(
+ bit < CACHE_CAPACITY,
+ "too many features, time to increase the cache size!"
+ );
+ let v = self.0;
+ self.0 = unset_bit(v, bit);
+ }
+}
+
+/// This global variable is a cache of the features supported by the CPU.
+// Note: on 64-bit targets, we only use the first slot
+static CACHE: [Cache; 2] = [Cache::uninitialized(), Cache::uninitialized()];
+
+/// Feature cache with capacity for `size_of::<usize>() * 8 - 1` features.
+///
+/// Note: 0 is used to represent an uninitialized cache, and (at least) the most
+/// significant bit is set on any cache which has been initialized.
+///
+/// Note: we use `Relaxed` atomic operations, because we are only interested in
+/// the effects of operations on a single memory location. That is, we only need
+/// "modification order", and not the full-blown "happens before".
+struct Cache(AtomicUsize);
+
+impl Cache {
+ const CAPACITY: u32 = (core::mem::size_of::<usize>() * 8 - 1) as u32;
+ const MASK: usize = (1 << Cache::CAPACITY) - 1;
+ const INITIALIZED_BIT: usize = 1usize << Cache::CAPACITY;
+
+ /// Creates an uninitialized cache.
+ #[allow(clippy::declare_interior_mutable_const)]
+ const fn uninitialized() -> Self {
+ Cache(AtomicUsize::new(0))
+ }
+
+ /// Is the `bit` in the cache set? Returns `None` if the cache has not been initialized.
+ #[inline]
+ pub(crate) fn test(&self, bit: u32) -> Option<bool> {
+ let cached = self.0.load(Ordering::Relaxed);
+ if cached == 0 {
+ None
+ } else {
+ Some(test_bit(cached as u64, bit))
+ }
+ }
+
+ /// Initializes the cache.
+ #[inline]
+ fn initialize(&self, value: usize) -> usize {
+ debug_assert_eq!((value & !Cache::MASK), 0);
+ self.0
+ .store(value | Cache::INITIALIZED_BIT, Ordering::Relaxed);
+ value
+ }
+}
+
+cfg_if::cfg_if! {
+ if #[cfg(feature = "std_detect_env_override")] {
+ #[inline]
+ fn initialize(mut value: Initializer) -> Initializer {
+ let env = unsafe {
+ libc::getenv(b"RUST_STD_DETECT_UNSTABLE\0".as_ptr() as *const libc::c_char)
+ };
+ if !env.is_null() {
+ let len = unsafe { libc::strlen(env) };
+ let env = unsafe { core::slice::from_raw_parts(env as *const u8, len) };
+ if let Ok(disable) = core::str::from_utf8(env) {
+ for v in disable.split(" ") {
+ let _ = super::Feature::from_str(v).map(|v| value.unset(v as u32));
+ }
+ }
+ }
+ do_initialize(value);
+ value
+ }
+ } else {
+ #[inline]
+ fn initialize(value: Initializer) -> Initializer {
+ do_initialize(value);
+ value
+ }
+ }
+}
+
+#[inline]
+fn do_initialize(value: Initializer) {
+ CACHE[0].initialize((value.0) as usize & Cache::MASK);
+ CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK);
+}
+
+// We only have to detect features once, and it's fairly costly, so hint to LLVM
+// that it should assume that cache hits are more common than misses (which is
+// the point of caching). It's possibly unfortunate that this function needs to
+// reach across modules like this to call `os::detect_features`, but it produces
+// the best code out of several attempted variants.
+//
+// The `Initializer` that the cache was initialized with is returned, so that
+// the caller can call `test()` on it without having to load the value from the
+// cache again.
+#[cold]
+fn detect_and_initialize() -> Initializer {
+ initialize(super::os::detect_features())
+}
+
+/// Tests the `bit` of the storage. If the storage has not been initialized,
+/// initializes it with the result of `os::detect_features()`.
+///
+/// On its first invocation, it detects the CPU features and caches them in the
+/// `CACHE` global variable as an `AtomicU64`.
+///
+/// It uses the `Feature` variant to index into this variable as a bitset. If
+/// the bit is set, the feature is enabled, and otherwise it is disabled.
+///
+/// If the `std_detect_env_override` feature is enabled, this looks for the
+/// environment variable `RUST_STD_DETECT_UNSTABLE` and uses its content to
+/// disable features that would otherwise have been detected.
+#[inline]
+pub(crate) fn test(bit: u32) -> bool {
+ let (relative_bit, idx) = if bit < Cache::CAPACITY {
+ (bit, 0)
+ } else {
+ (bit - Cache::CAPACITY, 1)
+ };
+ CACHE[idx]
+ .test(relative_bit)
+ .unwrap_or_else(|| detect_and_initialize().test(bit))
+}
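
Worth spelling out is how a global feature bit is routed to one of the two cache words above. A minimal sketch of the arithmetic in `test` (assumes a 64-bit `usize`, so `Cache::CAPACITY == 63`; illustrative only, not part of the patch):

    fn route(bit: u32) -> (usize, u32) {
        const CAPACITY: u32 = usize::BITS - 1; // 63 on 64-bit targets
        if bit < CAPACITY { (0, bit) } else { (1, bit - CAPACITY) }
    }

    fn main() {
        assert_eq!(route(5), (0, 5));  // early feature bits live in CACHE[0]
        assert_eq!(route(70), (1, 7)); // bit 70 -> CACHE[1], relative bit 7
    }
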
diff --git a/library/stdarch/crates/std_detect/src/detect/macros.rs b/library/stdarch/crates/std_detect/src/detect/macros.rs
new file mode 100644
index 000000000..7548c9780
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/macros.rs
@@ -0,0 +1,153 @@
+#[allow(unused)]
+macro_rules! features {
+ (
+ @TARGET: $target:ident;
+ @CFG: $cfg:meta;
+ @MACRO_NAME: $macro_name:ident;
+ @MACRO_ATTRS: $(#[$macro_attrs:meta])*
+ $(@BIND_FEATURE_NAME: $bind_feature:tt; $feature_impl:tt; )*
+ $(@NO_RUNTIME_DETECTION: $nort_feature:tt; )*
+ $(@FEATURE: #[$stability_attr:meta] $feature:ident: $feature_lit:tt; $(#[$feature_comment:meta])*)*
+ ) => {
+ #[macro_export]
+ $(#[$macro_attrs])*
+ #[allow_internal_unstable(stdsimd_internal, stdsimd)]
+ #[cfg($cfg)]
+ #[doc(cfg($cfg))]
+ macro_rules! $macro_name {
+ $(
+ ($feature_lit) => {
+ cfg!(target_feature = $feature_lit) ||
+ $crate::detect::__is_feature_detected::$feature()
+ };
+ )*
+ $(
+ ($bind_feature) => { $macro_name!($feature_impl) };
+ )*
+ $(
+ ($nort_feature) => {
+ compile_error!(
+ concat!(
+ stringify!($nort_feature),
+ " feature cannot be detected at run-time"
+ )
+ )
+ };
+ )*
+ ($t:tt,) => {
+ $macro_name!($t);
+ };
+ ($t:tt) => {
+ compile_error!(
+ concat!(
+ concat!("unknown ", stringify!($target)),
+ concat!(" target feature: ", $t)
+ )
+ )
+ };
+ }
+
+ $(#[$macro_attrs])*
+ #[macro_export]
+ #[cfg(not($cfg))]
+ #[doc(cfg($cfg))]
+ macro_rules! $macro_name {
+ $(
+ ($feature_lit) => {
+ compile_error!(
+ concat!(
+ r#"This macro cannot be used on the current target.
+ You can prevent it from being used in other architectures by
+ guarding it behind a cfg("#,
+ stringify!($cfg),
+ ")."
+ )
+ )
+ };
+ )*
+ $(
+ ($bind_feature) => { $macro_name!($feature_impl) };
+ )*
+ $(
+ ($nort_feature) => {
+ compile_error!(
+ concat!(
+ stringify!($nort_feature),
+ " feature cannot be detected at run-time"
+ )
+ )
+ };
+ )*
+ ($t:tt,) => {
+ $macro_name!($t);
+ };
+ ($t:tt) => {
+ compile_error!(
+ concat!(
+ concat!("unknown ", stringify!($target)),
+ concat!(" target feature: ", $t)
+ )
+ )
+ };
+ }
+
+ /// Each variant denotes a position in a bitset for a particular feature.
+ ///
+ /// PLEASE: do not use this, it is an implementation detail subject
+ /// to change.
+ #[doc(hidden)]
+ #[allow(non_camel_case_types)]
+ #[derive(Copy, Clone)]
+ #[repr(u8)]
+ #[unstable(feature = "stdsimd_internal", issue = "none")]
+ #[cfg($cfg)]
+ pub(crate) enum Feature {
+ $(
+ $(#[$feature_comment])*
+ $feature,
+ )*
+
+ // Do not add variants after last:
+ _last
+ }
+
+ #[cfg($cfg)]
+ impl Feature {
+ pub(crate) fn to_str(self) -> &'static str {
+ match self {
+ $(Feature::$feature => $feature_lit,)*
+ Feature::_last => unreachable!(),
+ }
+ }
+ #[cfg(feature = "std_detect_env_override")]
+ pub(crate) fn from_str(s: &str) -> Result<Feature, ()> {
+ match s {
+ $($feature_lit => Ok(Feature::$feature),)*
+ _ => Err(())
+ }
+ }
+ }
+
+ /// Each function performs run-time feature detection for a single
+ /// feature. This allows us to use stability attributes on a per-feature
+ /// basis.
+ ///
+ /// PLEASE: do not use this, it is an implementation detail subject
+ /// to change.
+ #[doc(hidden)]
+ #[cfg($cfg)]
+ pub mod __is_feature_detected {
+ $(
+
+ /// PLEASE: do not use this, it is an implementation detail
+ /// subject to change.
+ #[inline]
+ #[doc(hidden)]
+ #[$stability_attr]
+ pub fn $feature() -> bool {
+ $crate::detect::check_for($crate::detect::Feature::$feature)
+ }
+ )*
+ }
+ };
+}
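
To make the macro above concrete: for a feature declared as `@FEATURE: ... avx2: "avx2";`, the generated macro arm short-circuits on the compile-time `cfg!` before falling back to the run-time lookup. A hand-expanded sketch, with a hypothetical stub in place of the real `$crate::detect::__is_feature_detected` path (not the literal expansion):

    mod detect_stub {
        pub fn avx2() -> bool { false } // stands in for __is_feature_detected::avx2
    }

    macro_rules! is_x86_feature_detected_sketch {
        ("avx2") => {
            // compile-time short-circuit first, run-time check second
            cfg!(target_feature = "avx2") || crate::detect_stub::avx2()
        };
    }

    fn main() {
        println!("avx2: {}", is_x86_feature_detected_sketch!("avx2"));
    }
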
diff --git a/library/stdarch/crates/std_detect/src/detect/mod.rs b/library/stdarch/crates/std_detect/src/detect/mod.rs
new file mode 100644
index 000000000..2bca84ca1
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/mod.rs
@@ -0,0 +1,104 @@
+//! This module implements run-time feature detection.
+//!
+//! The `is_{arch}_feature_detected!("feature-name")` macros take the name of a
+//! feature as a string-literal, and return a boolean indicating whether the
+//! feature is enabled at run-time or not.
+//!
+//! These macros do two things:
+//! * map the string-literal into an integer stored as a `Feature` enum,
+//! * call a `os::check_for(x: Feature)` function that returns `true` if the
+//! feature is enabled.
+//!
+//! The `Feature` enums are also implemented in the `arch/{target_arch}.rs`
+//! modules.
+//!
+//! The `check_for` functions are, in general, operating-system dependent. Most
+//! architectures do not allow user-space programs to query the feature bits
+//! due to security concerns (x86 is the big exception). These functions are
+//! implemented in the `os/{target_os}.rs` modules.
+
+use cfg_if::cfg_if;
+
+#[macro_use]
+mod macros;
+
+mod arch;
+
+// This module needs to be public because the `is_{arch}_feature_detected!`
+// macros expand calls to items within it in user crates.
+#[doc(hidden)]
+pub use self::arch::__is_feature_detected;
+
+pub(crate) use self::arch::Feature;
+
+mod bit;
+mod cache;
+
+cfg_if! {
+ if #[cfg(miri)] {
+ // When running under miri all target-features that are not enabled at
+ // compile-time are reported as disabled at run-time.
+ //
+ // For features for which `cfg(target_feature)` returns true,
+ // this run-time detection logic is never called.
+ #[path = "os/other.rs"]
+ mod os;
+ } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ // On x86/x86_64 no OS specific functionality is required.
+ #[path = "os/x86.rs"]
+ mod os;
+ } else if #[cfg(all(target_os = "linux", feature = "libc"))] {
+ #[path = "os/linux/mod.rs"]
+ mod os;
+ } else if #[cfg(all(target_os = "freebsd", feature = "libc"))] {
+ #[cfg(target_arch = "aarch64")]
+ #[path = "os/aarch64.rs"]
+ mod aarch64;
+ #[path = "os/freebsd/mod.rs"]
+ mod os;
+ } else if #[cfg(all(target_os = "windows", target_arch = "aarch64"))] {
+ #[path = "os/windows/aarch64.rs"]
+ mod os;
+ } else {
+ #[path = "os/other.rs"]
+ mod os;
+ }
+}
+
+/// Performs run-time feature detection.
+#[inline]
+#[allow(dead_code)]
+fn check_for(x: Feature) -> bool {
+ cache::test(x as u32)
+}
+
+/// Returns an `Iterator<Item=(&'static str, bool)>` where
+/// `Item.0` is the feature name, and `Item.1` is a `bool` which
+/// is `true` if the feature is supported by the host and `false` otherwise.
+#[unstable(feature = "stdsimd", issue = "27731")]
+pub fn features() -> impl Iterator<Item = (&'static str, bool)> {
+ cfg_if! {
+ if #[cfg(any(
+ target_arch = "x86",
+ target_arch = "x86_64",
+ target_arch = "arm",
+ target_arch = "aarch64",
+ target_arch = "riscv32",
+ target_arch = "riscv64",
+ target_arch = "powerpc",
+ target_arch = "powerpc64",
+ target_arch = "mips",
+ target_arch = "mips64",
+ ))] {
+ (0_u8..Feature::_last as u8).map(|discriminant: u8| {
+ #[allow(bindings_with_variant_name)] // RISC-V has Feature::f
+ let f: Feature = unsafe { core::mem::transmute(discriminant) };
+ let name: &'static str = f.to_str();
+ let enabled: bool = check_for(f);
+ (name, enabled)
+ })
+ } else {
+ None.into_iter()
+ }
+ }
+}
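
From the caller's side, the usual pattern is to pair the detection macro with a `#[target_feature]` function. A sketch of user code (not part of this patch; assumes an x86/x86_64 build target):

    fn sum(xs: &[f32]) -> f32 {
        #[target_feature(enable = "avx2")]
        unsafe fn sum_avx2(xs: &[f32]) -> f32 {
            xs.iter().sum() // a real version would use core::arch AVX2 intrinsics
        }
        if is_x86_feature_detected!("avx2") {
            unsafe { sum_avx2(xs) } // sound: support was verified at run-time above
        } else {
            xs.iter().sum()
        }
    }

    fn main() {
        assert_eq!(sum(&[1.0, 2.0, 3.0]), 6.0);
    }
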
diff --git a/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs
new file mode 100644
index 000000000..e0e62ee33
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/aarch64.rs
@@ -0,0 +1,104 @@
+//! Run-time feature detection for AArch64 on any OS that emulates the `mrs` instruction.
+//!
+//! On FreeBSD >= 12.0, Linux >= 4.11 and other operating systems, it is possible to use
+//! privileged system registers from userspace to check CPU feature support.
+//!
+//! AArch64 system registers ID_AA64ISAR0_EL1, ID_AA64PFR0_EL1, ID_AA64ISAR1_EL1
+//! have bits dedicated to features like AdvSIMD, CRC32, AES, atomics (LSE), etc.
+//! Each part of the register indicates the level of support for a certain feature, e.g.
+//! when ID_AA64ISAR0_EL1\[7:4\] is >= 1, AES is supported; when it's >= 2, PMULL is supported.
+//!
+//! For proper support of [SoCs where different cores have different capabilities](https://medium.com/@jadr2ddude/a-big-little-problem-a-tale-of-big-little-gone-wrong-e7778ce744bb),
+//! the OS has to always report only the features supported by all cores, like [FreeBSD does](https://reviews.freebsd.org/D17137#393947).
+//!
+//! References:
+//!
+//! - [Zircon implementation](https://fuchsia.googlesource.com/zircon/+/master/kernel/arch/arm64/feature.cpp)
+//! - [Linux documentation](https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt)
+
+use crate::detect::{cache, Feature};
+use core::arch::asm;
+
+/// Try to read the features from the system registers.
+///
+/// This will cause SIGILL if the current OS does not emulate the `mrs` instruction.
+pub(crate) fn detect_features() -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+
+ {
+ let mut enable_feature = |f, enable| {
+ if enable {
+ value.set(f as u32);
+ }
+ };
+
+ // ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0
+ let aa64isar0: u64;
+ unsafe {
+ asm!(
+ "mrs {}, ID_AA64ISAR0_EL1",
+ out(reg) aa64isar0,
+ options(pure, nomem, preserves_flags, nostack)
+ );
+ }
+
+ enable_feature(Feature::pmull, bits_shift(aa64isar0, 7, 4) >= 2);
+ enable_feature(Feature::tme, bits_shift(aa64isar0, 27, 24) == 1);
+ enable_feature(Feature::lse, bits_shift(aa64isar0, 23, 20) >= 1);
+ enable_feature(Feature::crc, bits_shift(aa64isar0, 19, 16) >= 1);
+
+ // ID_AA64PFR0_EL1 - Processor Feature Register 0
+ let aa64pfr0: u64;
+ unsafe {
+ asm!(
+ "mrs {}, ID_AA64PFR0_EL1",
+ out(reg) aa64pfr0,
+ options(pure, nomem, preserves_flags, nostack)
+ );
+ }
+
+ let fp = bits_shift(aa64pfr0, 19, 16) < 0xF;
+ let fphp = bits_shift(aa64pfr0, 19, 16) >= 1;
+ let asimd = bits_shift(aa64pfr0, 23, 20) < 0xF;
+ let asimdhp = bits_shift(aa64pfr0, 23, 20) >= 1;
+ enable_feature(Feature::fp, fp);
+ enable_feature(Feature::fp16, fphp);
+ // SIMD support requires float support - if half-floats are
+ // supported, it also requires half-float support:
+ enable_feature(Feature::asimd, fp && asimd && (!fphp | asimdhp));
+ // SIMD extensions require SIMD support:
+ enable_feature(Feature::aes, asimd && bits_shift(aa64isar0, 7, 4) >= 1);
+ let sha1 = bits_shift(aa64isar0, 11, 8) >= 1;
+ let sha2 = bits_shift(aa64isar0, 15, 12) >= 1;
+ enable_feature(Feature::sha2, asimd && sha1 && sha2);
+ enable_feature(Feature::rdm, asimd && bits_shift(aa64isar0, 31, 28) >= 1);
+ enable_feature(
+ Feature::dotprod,
+ asimd && bits_shift(aa64isar0, 47, 44) >= 1,
+ );
+ enable_feature(Feature::sve, asimd && bits_shift(aa64pfr0, 35, 32) >= 1);
+
+ // ID_AA64ISAR1_EL1 - Instruction Set Attribute Register 1
+ let aa64isar1: u64;
+ unsafe {
+ asm!(
+ "mrs {}, ID_AA64ISAR1_EL1",
+ out(reg) aa64isar1,
+ options(pure, nomem, preserves_flags, nostack)
+ );
+ }
+
+ // Check for either APA or API field
+ enable_feature(Feature::paca, bits_shift(aa64isar1, 11, 4) >= 1);
+ enable_feature(Feature::rcpc, bits_shift(aa64isar1, 23, 20) >= 1);
+ // Check for either GPA or GPI field
+ enable_feature(Feature::pacg, bits_shift(aa64isar1, 31, 24) >= 1);
+ }
+
+ value
+}
+
+#[inline]
+fn bits_shift(x: u64, high: usize, low: usize) -> u64 {
+ (x >> low) & ((1 << (high - low + 1)) - 1)
+}
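
A worked example of `bits_shift` on a synthetic register value (illustrative, not part of the patch): the AES field of `ID_AA64ISAR0_EL1` lives in bits [7:4], and a value of 2 there means AES plus PMULL.

    fn bits_shift(x: u64, high: usize, low: usize) -> u64 {
        (x >> low) & ((1 << (high - low + 1)) - 1)
    }

    fn main() {
        let aa64isar0: u64 = 0b0010_0000; // synthetic: AES field [7:4] = 2
        assert_eq!(bits_shift(aa64isar0, 7, 4), 2); // >= 2 -> PMULL supported
    }
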
diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs
new file mode 100644
index 000000000..7d972b373
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/aarch64.rs
@@ -0,0 +1,21 @@
+//! Run-time feature detection for AArch64 on FreeBSD.
+
+pub(crate) use super::super::aarch64::detect_features;
+
+#[cfg(test)]
+mod tests {
+ #[test]
+ fn dump() {
+ println!("asimd: {:?}", is_aarch64_feature_detected!("asimd"));
+ println!("pmull: {:?}", is_aarch64_feature_detected!("pmull"));
+ println!("fp: {:?}", is_aarch64_feature_detected!("fp"));
+ println!("fp16: {:?}", is_aarch64_feature_detected!("fp16"));
+ println!("sve: {:?}", is_aarch64_feature_detected!("sve"));
+ println!("crc: {:?}", is_aarch64_feature_detected!("crc"));
+ println!("lse: {:?}", is_aarch64_feature_detected!("lse"));
+ println!("rdm: {:?}", is_aarch64_feature_detected!("rdm"));
+ println!("rcpc: {:?}", is_aarch64_feature_detected!("rcpc"));
+ println!("dotprod: {:?}", is_aarch64_feature_detected!("dotprod"));
+ println!("tme: {:?}", is_aarch64_feature_detected!("tme"));
+ }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/arm.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/arm.rs
new file mode 100644
index 000000000..4c9d763b4
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/arm.rs
@@ -0,0 +1,21 @@
+//! Run-time feature detection for ARM on FreeBSD
+
+use super::auxvec;
+use crate::detect::{cache, Feature};
+
+/// Try to read the features from the auxiliary vector
+pub(crate) fn detect_features() -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+ let enable_feature = |value: &mut cache::Initializer, f, enable| {
+ if enable {
+ value.set(f as u32);
+ }
+ };
+
+ if let Ok(auxv) = auxvec::auxv() {
+ enable_feature(&mut value, Feature::neon, auxv.hwcap & 0x00001000 != 0);
+ enable_feature(&mut value, Feature::pmull, auxv.hwcap2 & 0x00000002 != 0);
+ return value;
+ }
+ value
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/auxvec.rs
new file mode 100644
index 000000000..29fcc8cb0
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/auxvec.rs
@@ -0,0 +1,102 @@
+//! Parses ELF auxiliary vectors.
+#![cfg_attr(
+ any(
+ target_arch = "aarch64",
+ target_arch = "arm",
+ target_arch = "powerpc64",
+ target_arch = "riscv64"
+ ),
+ allow(dead_code)
+)]
+
+/// Key to access the CPU Hardware capabilities bitfield.
+pub(crate) const AT_HWCAP: usize = 25;
+/// Key to access the CPU Hardware capabilities 2 bitfield.
+pub(crate) const AT_HWCAP2: usize = 26;
+
+/// Cache HWCAP bitfields of the ELF Auxiliary Vector.
+///
+/// If an entry cannot be read, all the bits in the bitfield are set to zero.
+/// This should be interpreted as all the features being disabled.
+#[derive(Debug, Copy, Clone)]
+pub(crate) struct AuxVec {
+ pub hwcap: usize,
+ pub hwcap2: usize,
+}
+
+/// ELF Auxiliary Vector
+///
+/// The auxiliary vector is a memory region in a running ELF program's stack
+/// composed of (key: usize, value: usize) pairs.
+///
+/// The keys used in the aux vector are platform dependent. For FreeBSD, they are
+/// defined in [sys/elf_common.h][elf_common_h]. The hardware capabilities of a given
+/// CPU can be queried with the `AT_HWCAP` and `AT_HWCAP2` keys.
+///
+/// Note that run-time feature detection is not invoked for features that can
+/// be detected at compile-time.
+///
+/// [elf_common_h]: https://svnweb.freebsd.org/base/release/12.0.0/sys/sys/elf_common.h?revision=341707
+pub(crate) fn auxv() -> Result<AuxVec, ()> {
+ if let Ok(hwcap) = archauxv(AT_HWCAP) {
+ if let Ok(hwcap2) = archauxv(AT_HWCAP2) {
+ if hwcap != 0 && hwcap2 != 0 {
+ return Ok(AuxVec { hwcap, hwcap2 });
+ }
+ }
+ }
+ Err(())
+}
+
+/// Tries to read the `key` from the auxiliary vector; returns `Ok(0)` if the
+/// key is not found.
+fn archauxv(key: usize) -> Result<usize, ()> {
+ use core::mem;
+
+ #[derive(Copy, Clone)]
+ #[repr(C)]
+ pub struct Elf_Auxinfo {
+ pub a_type: usize,
+ pub a_un: unnamed,
+ }
+ #[derive(Copy, Clone)]
+ #[repr(C)]
+ pub union unnamed {
+ pub a_val: libc::c_long,
+ pub a_ptr: *mut libc::c_void,
+ pub a_fcn: Option<unsafe extern "C" fn() -> ()>,
+ }
+
+ let mut auxv: [Elf_Auxinfo; 27] = [Elf_Auxinfo {
+ a_type: 0,
+ a_un: unnamed { a_val: 0 },
+ }; 27];
+
+ let mut len: libc::c_uint = mem::size_of_val(&auxv) as libc::c_uint;
+
+ unsafe {
+ let mut mib = [
+ libc::CTL_KERN,
+ libc::KERN_PROC,
+ libc::KERN_PROC_AUXV,
+ libc::getpid(),
+ ];
+
+ let ret = libc::sysctl(
+ mib.as_mut_ptr(),
+ mib.len() as u32,
+ &mut auxv as *mut _ as *mut _,
+ &mut len as *mut _ as *mut _,
+ 0 as *mut libc::c_void,
+ 0,
+ );
+
+ if ret != -1 {
+ for i in 0..auxv.len() {
+ if auxv[i].a_type == key {
+ return Ok(auxv[i].a_un.a_val as usize);
+ }
+ }
+ }
+ }
+ Ok(0)
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs
new file mode 100644
index 000000000..ade7fb626
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/mod.rs
@@ -0,0 +1,22 @@
+//! Run-time feature detection on FreeBSD
+
+mod auxvec;
+
+cfg_if::cfg_if! {
+ if #[cfg(target_arch = "aarch64")] {
+ mod aarch64;
+ pub(crate) use self::aarch64::detect_features;
+ } else if #[cfg(target_arch = "arm")] {
+ mod arm;
+ pub(crate) use self::arm::detect_features;
+ } else if #[cfg(target_arch = "powerpc64")] {
+ mod powerpc;
+ pub(crate) use self::powerpc::detect_features;
+ } else {
+ use crate::detect::cache;
+ /// Performs run-time feature detection.
+ pub(crate) fn detect_features() -> cache::Initializer {
+ cache::Initializer::default()
+ }
+ }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/freebsd/powerpc.rs b/library/stdarch/crates/std_detect/src/detect/os/freebsd/powerpc.rs
new file mode 100644
index 000000000..6bfab631a
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/freebsd/powerpc.rs
@@ -0,0 +1,21 @@
+//! Run-time feature detection for PowerPC on FreeBSD.
+
+use super::auxvec;
+use crate::detect::{cache, Feature};
+
+pub(crate) fn detect_features() -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+ let enable_feature = |value: &mut cache::Initializer, f, enable| {
+ if enable {
+ value.set(f as u32);
+ }
+ };
+
+ if let Ok(auxv) = auxvec::auxv() {
+ enable_feature(&mut value, Feature::altivec, auxv.hwcap & 0x10000000 != 0);
+ enable_feature(&mut value, Feature::vsx, auxv.hwcap & 0x00000080 != 0);
+ enable_feature(&mut value, Feature::power8, auxv.hwcap2 & 0x80000000 != 0);
+ return value;
+ }
+ value
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs
new file mode 100644
index 000000000..b6a2e5218
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs
@@ -0,0 +1,290 @@
+//! Run-time feature detection for AArch64 on Linux.
+
+use super::auxvec;
+use crate::detect::{bit, cache, Feature};
+
+/// Try to read the features from the auxiliary vector, and if that fails, try
+/// to read them from /proc/cpuinfo.
+pub(crate) fn detect_features() -> cache::Initializer {
+ if let Ok(auxv) = auxvec::auxv() {
+ let hwcap: AtHwcap = auxv.into();
+ return hwcap.cache();
+ }
+ #[cfg(feature = "std_detect_file_io")]
+ if let Ok(c) = super::cpuinfo::CpuInfo::new() {
+ let hwcap: AtHwcap = c.into();
+ return hwcap.cache();
+ }
+ cache::Initializer::default()
+}
+
+/// These values are part of the platform-specific [asm/hwcap.h][hwcap].
+///
+/// The names match those used for cpuinfo.
+///
+/// [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm64/include/uapi/asm/hwcap.h
+struct AtHwcap {
+ fp: bool, // 0
+ asimd: bool, // 1
+ // evtstrm: bool, // 2 No LLVM support
+ aes: bool, // 3
+ pmull: bool, // 4
+ sha1: bool, // 5
+ sha2: bool, // 6
+ crc32: bool, // 7
+ atomics: bool, // 8
+ fphp: bool, // 9
+ asimdhp: bool, // 10
+ // cpuid: bool, // 11 No LLVM support
+ asimdrdm: bool, // 12
+ jscvt: bool, // 13
+ fcma: bool, // 14
+ lrcpc: bool, // 15
+ dcpop: bool, // 16
+ sha3: bool, // 17
+ sm3: bool, // 18
+ sm4: bool, // 19
+ asimddp: bool, // 20
+ sha512: bool, // 21
+ sve: bool, // 22
+ fhm: bool, // 23
+ dit: bool, // 24
+ uscat: bool, // 25
+ ilrcpc: bool, // 26
+ flagm: bool, // 27
+ ssbs: bool, // 28
+ sb: bool, // 29
+ paca: bool, // 30
+ pacg: bool, // 31
+ dcpodp: bool, // 32
+ sve2: bool, // 33
+ sveaes: bool, // 34
+ // svepmull: bool, // 35 No LLVM support
+ svebitperm: bool, // 36
+ svesha3: bool, // 37
+ svesm4: bool, // 38
+ // flagm2: bool, // 39 No LLVM support
+ frint: bool, // 40
+ // svei8mm: bool, // 41 See i8mm feature
+ svef32mm: bool, // 42
+ svef64mm: bool, // 43
+ // svebf16: bool, // 44 See bf16 feature
+ i8mm: bool, // 45
+ bf16: bool, // 46
+ // dgh: bool, // 47 No LLVM support
+ rng: bool, // 48
+ bti: bool, // 49
+ mte: bool, // 50
+}
+
+impl From<auxvec::AuxVec> for AtHwcap {
+ /// Reads AtHwcap from the auxiliary vector.
+ fn from(auxv: auxvec::AuxVec) -> Self {
+ AtHwcap {
+ fp: bit::test(auxv.hwcap, 0),
+ asimd: bit::test(auxv.hwcap, 1),
+ // evtstrm: bit::test(auxv.hwcap, 2),
+ aes: bit::test(auxv.hwcap, 3),
+ pmull: bit::test(auxv.hwcap, 4),
+ sha1: bit::test(auxv.hwcap, 5),
+ sha2: bit::test(auxv.hwcap, 6),
+ crc32: bit::test(auxv.hwcap, 7),
+ atomics: bit::test(auxv.hwcap, 8),
+ fphp: bit::test(auxv.hwcap, 9),
+ asimdhp: bit::test(auxv.hwcap, 10),
+ // cpuid: bit::test(auxv.hwcap, 11),
+ asimdrdm: bit::test(auxv.hwcap, 12),
+ jscvt: bit::test(auxv.hwcap, 13),
+ fcma: bit::test(auxv.hwcap, 14),
+ lrcpc: bit::test(auxv.hwcap, 15),
+ dcpop: bit::test(auxv.hwcap, 16),
+ sha3: bit::test(auxv.hwcap, 17),
+ sm3: bit::test(auxv.hwcap, 18),
+ sm4: bit::test(auxv.hwcap, 19),
+ asimddp: bit::test(auxv.hwcap, 20),
+ sha512: bit::test(auxv.hwcap, 21),
+ sve: bit::test(auxv.hwcap, 22),
+ fhm: bit::test(auxv.hwcap, 23),
+ dit: bit::test(auxv.hwcap, 24),
+ uscat: bit::test(auxv.hwcap, 25),
+ ilrcpc: bit::test(auxv.hwcap, 26),
+ flagm: bit::test(auxv.hwcap, 27),
+ ssbs: bit::test(auxv.hwcap, 28),
+ sb: bit::test(auxv.hwcap, 29),
+ paca: bit::test(auxv.hwcap, 30),
+ pacg: bit::test(auxv.hwcap, 31),
+ dcpodp: bit::test(auxv.hwcap, 32),
+ sve2: bit::test(auxv.hwcap, 33),
+ sveaes: bit::test(auxv.hwcap, 34),
+ // svepmull: bit::test(auxv.hwcap, 35),
+ svebitperm: bit::test(auxv.hwcap, 36),
+ svesha3: bit::test(auxv.hwcap, 37),
+ svesm4: bit::test(auxv.hwcap, 38),
+ // flagm2: bit::test(auxv.hwcap, 39),
+ frint: bit::test(auxv.hwcap, 40),
+ // svei8mm: bit::test(auxv.hwcap, 41),
+ svef32mm: bit::test(auxv.hwcap, 42),
+ svef64mm: bit::test(auxv.hwcap, 43),
+ // svebf16: bit::test(auxv.hwcap, 44),
+ i8mm: bit::test(auxv.hwcap, 45),
+ bf16: bit::test(auxv.hwcap, 46),
+ // dgh: bit::test(auxv.hwcap, 47),
+ rng: bit::test(auxv.hwcap, 48),
+ bti: bit::test(auxv.hwcap, 49),
+ mte: bit::test(auxv.hwcap, 50),
+ }
+ }
+}
+
+#[cfg(feature = "std_detect_file_io")]
+impl From<super::cpuinfo::CpuInfo> for AtHwcap {
+ /// Reads AtHwcap from /proc/cpuinfo.
+ fn from(c: super::cpuinfo::CpuInfo) -> Self {
+ let f = &c.field("Features");
+ AtHwcap {
+ // 64-bit names. FIXME: In 32-bit compatibility mode /proc/cpuinfo will
+ // map some of the 64-bit names to some 32-bit feature names. This does not
+ // cover that yet.
+ fp: f.has("fp"),
+ asimd: f.has("asimd"),
+ // evtstrm: f.has("evtstrm"),
+ aes: f.has("aes"),
+ pmull: f.has("pmull"),
+ sha1: f.has("sha1"),
+ sha2: f.has("sha2"),
+ crc32: f.has("crc32"),
+ atomics: f.has("atomics"),
+ fphp: f.has("fphp"),
+ asimdhp: f.has("asimdhp"),
+ // cpuid: f.has("cpuid"),
+ asimdrdm: f.has("asimdrdm"),
+ jscvt: f.has("jscvt"),
+ fcma: f.has("fcma"),
+ lrcpc: f.has("lrcpc"),
+ dcpop: f.has("dcpop"),
+ sha3: f.has("sha3"),
+ sm3: f.has("sm3"),
+ sm4: f.has("sm4"),
+ asimddp: f.has("asimddp"),
+ sha512: f.has("sha512"),
+ sve: f.has("sve"),
+ fhm: f.has("asimdfhm"),
+ dit: f.has("dit"),
+ uscat: f.has("uscat"),
+ ilrcpc: f.has("ilrcpc"),
+ flagm: f.has("flagm"),
+ ssbs: f.has("ssbs"),
+ sb: f.has("sb"),
+ paca: f.has("paca"),
+ pacg: f.has("pacg"),
+ dcpodp: f.has("dcpodp"),
+ sve2: f.has("sve2"),
+ sveaes: f.has("sveaes"),
+ // svepmull: f.has("svepmull"),
+ svebitperm: f.has("svebitperm"),
+ svesha3: f.has("svesha3"),
+ svesm4: f.has("svesm4"),
+ // flagm2: f.has("flagm2"),
+ frint: f.has("frint"),
+ // svei8mm: f.has("svei8mm"),
+ svef32mm: f.has("svef32mm"),
+ svef64mm: f.has("svef64mm"),
+ // svebf16: f.has("svebf16"),
+ i8mm: f.has("i8mm"),
+ bf16: f.has("bf16"),
+ // dgh: f.has("dgh"),
+ rng: f.has("rng"),
+ bti: f.has("bti"),
+ mte: f.has("mte"),
+ }
+ }
+}
+
+impl AtHwcap {
+ /// Initializes the cache from the feature bits.
+ ///
+ /// The feature dependencies here come directly from LLVM's feature definitions:
+ /// https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64.td
+ fn cache(self) -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+ {
+ let mut enable_feature = |f, enable| {
+ if enable {
+ value.set(f as u32);
+ }
+ };
+
+ enable_feature(Feature::fp, self.fp);
+ // Half-float support requires float support
+ enable_feature(Feature::fp16, self.fp && self.fphp);
+ // FHM (fp16fml in LLVM) requires half float support
+ enable_feature(Feature::fhm, self.fphp && self.fhm);
+ enable_feature(Feature::pmull, self.pmull);
+ enable_feature(Feature::crc, self.crc32);
+ enable_feature(Feature::lse, self.atomics);
+ enable_feature(Feature::lse2, self.uscat);
+ enable_feature(Feature::rcpc, self.lrcpc);
+ // RCPC2 (rcpc-immo in LLVM) requires RCPC support
+ enable_feature(Feature::rcpc2, self.ilrcpc && self.lrcpc);
+ enable_feature(Feature::dit, self.dit);
+ enable_feature(Feature::flagm, self.flagm);
+ enable_feature(Feature::ssbs, self.ssbs);
+ enable_feature(Feature::sb, self.sb);
+ enable_feature(Feature::paca, self.paca);
+ enable_feature(Feature::pacg, self.pacg);
+ enable_feature(Feature::dpb, self.dcpop);
+ enable_feature(Feature::dpb2, self.dcpodp);
+ enable_feature(Feature::rand, self.rng);
+ enable_feature(Feature::bti, self.bti);
+ enable_feature(Feature::mte, self.mte);
+ // jsconv requires float support
+ enable_feature(Feature::jsconv, self.jscvt && self.fp);
+ enable_feature(Feature::rdm, self.asimdrdm);
+ enable_feature(Feature::dotprod, self.asimddp);
+ enable_feature(Feature::frintts, self.frint);
+
+ // FEAT_I8MM & FEAT_BF16 also include optional SVE components which linux exposes
+ // separately. We ignore that distinction here.
+ enable_feature(Feature::i8mm, self.i8mm);
+ enable_feature(Feature::bf16, self.bf16);
+
+ // ASIMD support requires float support - if half-floats are
+ // supported, it also requires half-float support:
+ let asimd = self.fp && self.asimd && (!self.fphp | self.asimdhp);
+ enable_feature(Feature::asimd, asimd);
+ // ASIMD extensions require ASIMD support:
+ enable_feature(Feature::fcma, self.fcma && asimd);
+ enable_feature(Feature::sve, self.sve && asimd);
+
+ // SVE extensions require SVE & ASIMD
+ enable_feature(Feature::f32mm, self.svef32mm && self.sve && asimd);
+ enable_feature(Feature::f64mm, self.svef64mm && self.sve && asimd);
+
+ // Cryptographic extensions require ASIMD
+ enable_feature(Feature::aes, self.aes && asimd);
+ enable_feature(Feature::sha2, self.sha1 && self.sha2 && asimd);
+ // SHA512/SHA3 require SHA1 & SHA256
+ enable_feature(
+ Feature::sha3,
+ self.sha512 && self.sha3 && self.sha1 && self.sha2 && asimd,
+ );
+ enable_feature(Feature::sm4, self.sm3 && self.sm4 && asimd);
+
+ // SVE2 requires SVE
+ let sve2 = self.sve2 && self.sve && asimd;
+ enable_feature(Feature::sve2, sve2);
+ // SVE2 extensions require SVE2 and crypto features
+ enable_feature(Feature::sve2_aes, self.sveaes && sve2 && self.aes);
+ enable_feature(
+ Feature::sve2_sm4,
+ self.svesm4 && sve2 && self.sm3 && self.sm4,
+ );
+ enable_feature(
+ Feature::sve2_sha3,
+ self.svesha3 && sve2 && self.sha512 && self.sha3 && self.sha1 && self.sha2,
+ );
+ enable_feature(Feature::sve2_bitperm, self.svebitperm && self.sve2);
+ }
+ value
+ }
+}
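
The ASIMD gate in `cache()` above deserves a second look: ASIMD requires FP, and if half-precision FP (`fphp`) is reported then the SIMD side (`asimdhp`) must match it, or fp16 SIMD code could run on hardware without it. A truth-check sketch (synthetic flags, not part of the patch):

    fn asimd_ok(fp: bool, asimd: bool, fphp: bool, asimdhp: bool) -> bool {
        fp && asimd && (!fphp | asimdhp)
    }

    fn main() {
        assert!(asimd_ok(true, true, false, false)); // plain FP + SIMD is enough
        assert!(!asimd_ok(true, true, true, false)); // fphp without asimdhp: rejected
    }
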
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs
new file mode 100644
index 000000000..7383e487f
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs
@@ -0,0 +1,79 @@
+//! Run-time feature detection for ARM on Linux.
+
+use super::auxvec;
+use crate::detect::{bit, cache, Feature};
+
+/// Try to read the features from the auxiliary vector, and if that fails, try
+/// to read them from /proc/cpuinfo.
+pub(crate) fn detect_features() -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+ let enable_feature = |value: &mut cache::Initializer, f, enable| {
+ if enable {
+ value.set(f as u32);
+ }
+ };
+
+ // The values are part of the platform-specific [asm/hwcap.h][hwcap]
+ //
+ // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm/include/uapi/asm/hwcap.h
+ if let Ok(auxv) = auxvec::auxv() {
+ enable_feature(&mut value, Feature::neon, bit::test(auxv.hwcap, 12));
+ enable_feature(&mut value, Feature::pmull, bit::test(auxv.hwcap2, 1));
+ enable_feature(&mut value, Feature::crc, bit::test(auxv.hwcap2, 4));
+ enable_feature(
+ &mut value,
+ Feature::crypto,
+ bit::test(auxv.hwcap2, 0)
+ && bit::test(auxv.hwcap2, 1)
+ && bit::test(auxv.hwcap2, 2)
+ && bit::test(auxv.hwcap2, 3),
+ );
+ enable_feature(&mut value, Feature::aes, bit::test(auxv.hwcap2, 0));
+ // SHA2 requires SHA1 & SHA2 features
+ enable_feature(
+ &mut value,
+ Feature::sha2,
+ bit::test(auxv.hwcap2, 2) && bit::test(auxv.hwcap2, 3),
+ );
+ return value;
+ }
+
+ #[cfg(feature = "std_detect_file_io")]
+ if let Ok(c) = super::cpuinfo::CpuInfo::new() {
+ enable_feature(
+ &mut value,
+ Feature::neon,
+ c.field("Features").has("neon") && !has_broken_neon(&c),
+ );
+ enable_feature(&mut value, Feature::pmull, c.field("Features").has("pmull"));
+ enable_feature(&mut value, Feature::crc, c.field("Features").has("crc32"));
+ enable_feature(
+ &mut value,
+ Feature::crypto,
+ c.field("Features").has("aes")
+ && c.field("Features").has("pmull")
+ && c.field("Features").has("sha1")
+ && c.field("Features").has("sha2"),
+ );
+ enable_feature(&mut value, Feature::aes, c.field("Features").has("aes"));
+ enable_feature(
+ &mut value,
+ Feature::sha2,
+ c.field("Features").has("sha1") && c.field("Features").has("sha2"),
+ );
+ return value;
+ }
+ value
+}
+
+/// Is the CPU known to have a broken NEON unit?
+///
+/// See https://crbug.com/341598.
+#[cfg(feature = "std_detect_file_io")]
+fn has_broken_neon(cpuinfo: &super::cpuinfo::CpuInfo) -> bool {
+ cpuinfo.field("CPU implementer") == "0x51"
+ && cpuinfo.field("CPU architecture") == "7"
+ && cpuinfo.field("CPU variant") == "0x1"
+ && cpuinfo.field("CPU part") == "0x04d"
+ && cpuinfo.field("CPU revision") == "0"
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
new file mode 100644
index 000000000..e6447d0cd
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
@@ -0,0 +1,366 @@
+//! Parses ELF auxiliary vectors.
+#![allow(dead_code)]
+
+pub(crate) const AT_NULL: usize = 0;
+
+/// Key to access the CPU Hardware capabilities bitfield.
+pub(crate) const AT_HWCAP: usize = 16;
+/// Key to access the CPU Hardware capabilities 2 bitfield.
+#[cfg(any(
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+))]
+pub(crate) const AT_HWCAP2: usize = 26;
+
+/// Cache HWCAP bitfields of the ELF Auxiliary Vector.
+///
+/// If an entry cannot be read, all the bits in the bitfield are set to zero.
+/// This should be interpreted as all the features being disabled.
+#[derive(Debug, Copy, Clone)]
+pub(crate) struct AuxVec {
+ pub hwcap: usize,
+ #[cfg(any(
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+ ))]
+ pub hwcap2: usize,
+}
+
+/// ELF Auxiliary Vector
+///
+/// The auxiliary vector is a memory region in a running ELF program's stack
+/// composed of (key: usize, value: usize) pairs.
+///
+/// The keys used in the aux vector are platform dependent. For Linux, they are
+/// defined in [linux/auxvec.h][auxvec_h]. The hardware capabilities of a given
+/// CPU can be queried with the `AT_HWCAP` and `AT_HWCAP2` keys.
+///
+/// There is no perfect way of reading the auxiliary vector.
+///
+/// - If the `std_detect_dlsym_getauxval` cargo feature is enabled, this will use
+/// `getauxval` if it is linked into the binary, and otherwise proceed to a fallback
+/// implementation. When `std_detect_dlsym_getauxval` is disabled, this will assume
+/// that `getauxval` is linked into the binary; if that is not the case, the behavior
+/// is undefined.
+/// - Otherwise, if the `std_detect_file_io` cargo feature is enabled, it will
+/// try to read `/proc/self/auxv`.
+/// - If that fails, this function returns an error.
+///
+/// Note that run-time feature detection is not invoked for features that can
+/// be detected at compile-time. Also note that if this function returns an
+/// error, cpuinfo can still (and will) be used to try to perform run-time
+/// feature detection on some platforms.
+///
+/// For more information about when `getauxval` is available check the great
+/// [`auxv` crate documentation][auxv_docs].
+///
+/// [auxvec_h]: https://github.com/torvalds/linux/blob/master/include/uapi/linux/auxvec.h
+/// [auxv_docs]: https://docs.rs/auxv/0.3.3/auxv/
+pub(crate) fn auxv() -> Result<AuxVec, ()> {
+ #[cfg(feature = "std_detect_dlsym_getauxval")]
+ {
+ // Try to call a dynamically-linked getauxval function.
+ if let Ok(hwcap) = getauxval(AT_HWCAP) {
+ // Targets with only AT_HWCAP:
+ #[cfg(any(
+ target_arch = "aarch64",
+ target_arch = "riscv32",
+ target_arch = "riscv64",
+ target_arch = "mips",
+ target_arch = "mips64"
+ ))]
+ {
+ if hwcap != 0 {
+ return Ok(AuxVec { hwcap });
+ }
+ }
+
+ // Targets with AT_HWCAP and AT_HWCAP2:
+ #[cfg(any(
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+ ))]
+ {
+ if let Ok(hwcap2) = getauxval(AT_HWCAP2) {
+ if hwcap != 0 && hwcap2 != 0 {
+ return Ok(AuxVec { hwcap, hwcap2 });
+ }
+ }
+ }
+ drop(hwcap);
+ }
+ }
+
+ #[cfg(not(feature = "std_detect_dlsym_getauxval"))]
+ {
+ // Targets with only AT_HWCAP:
+ #[cfg(any(
+ target_arch = "aarch64",
+ target_arch = "riscv32",
+ target_arch = "riscv64",
+ target_arch = "mips",
+ target_arch = "mips64"
+ ))]
+ {
+ let hwcap = unsafe { libc::getauxval(AT_HWCAP as libc::c_ulong) as usize };
+ if hwcap != 0 {
+ return Ok(AuxVec { hwcap });
+ }
+ }
+
+ // Targets with AT_HWCAP and AT_HWCAP2:
+ #[cfg(any(
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+ ))]
+ {
+ let hwcap = unsafe { libc::getauxval(AT_HWCAP as libc::c_ulong) as usize };
+ let hwcap2 = unsafe { libc::getauxval(AT_HWCAP2 as libc::c_ulong) as usize };
+ if hwcap != 0 && hwcap2 != 0 {
+ return Ok(AuxVec { hwcap, hwcap2 });
+ }
+ }
+ }
+
+ #[cfg(feature = "std_detect_file_io")]
+ {
+ // If calling getauxval fails, try to read the auxiliary vector from
+ // its file:
+ auxv_from_file("/proc/self/auxv")
+ }
+ #[cfg(not(feature = "std_detect_file_io"))]
+ {
+ Err(())
+ }
+}
+
+/// Tries to read the `key` from the auxiliary vector by calling the
+/// dynamically-linked `getauxval` function. If the function is not linked,
+/// this function returns `Err`.
+#[cfg(feature = "std_detect_dlsym_getauxval")]
+fn getauxval(key: usize) -> Result<usize, ()> {
+ use libc;
+ pub type F = unsafe extern "C" fn(usize) -> usize;
+ unsafe {
+ let ptr = libc::dlsym(libc::RTLD_DEFAULT, "getauxval\0".as_ptr() as *const _);
+ if ptr.is_null() {
+ return Err(());
+ }
+
+ let ffi_getauxval: F = core::mem::transmute(ptr);
+ Ok(ffi_getauxval(key))
+ }
+}
+
+/// Tries to read the auxiliary vector from the `file`. If this fails, this
+/// function returns `Err`.
+#[cfg(feature = "std_detect_file_io")]
+fn auxv_from_file(file: &str) -> Result<AuxVec, ()> {
+ let file = super::read_file(file)?;
+
+ // See <https://github.com/torvalds/linux/blob/v3.19/include/uapi/linux/auxvec.h>.
+ //
+ // The auxiliary vector contains at most 32 (key,value) fields: from
+ // `AT_EXECFN = 31` to `AT_NULL = 0`. That is, a buffer of
+ // 2*32 `usize` elements is enough to read the whole vector.
+ let mut buf = [0_usize; 64];
+ // Copy at most the buffer's size, and never more than the file provides.
+ let len = core::mem::size_of_val(&buf).min(file.len());
+ unsafe {
+ core::ptr::copy_nonoverlapping(file.as_ptr(), buf.as_mut_ptr() as *mut u8, len);
+ }
+
+ auxv_from_buf(&buf)
+}
+
+/// Tries to interpret the `buffer` as an auxiliary vector. If that fails, this
+/// function returns `Err`.
+#[cfg(feature = "std_detect_file_io")]
+fn auxv_from_buf(buf: &[usize; 64]) -> Result<AuxVec, ()> {
+ // Targets with only AT_HWCAP:
+ #[cfg(any(
+ target_arch = "aarch64",
+ target_arch = "riscv32",
+ target_arch = "riscv64",
+ target_arch = "mips",
+ target_arch = "mips64",
+ ))]
+ {
+ for el in buf.chunks(2) {
+ match el[0] {
+ AT_NULL => break,
+ AT_HWCAP => return Ok(AuxVec { hwcap: el[1] }),
+ _ => (),
+ }
+ }
+ }
+ // Targets with AT_HWCAP and AT_HWCAP2:
+ #[cfg(any(
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+ ))]
+ {
+ let mut hwcap = None;
+ let mut hwcap2 = None;
+ for el in buf.chunks(2) {
+ match el[0] {
+ AT_NULL => break,
+ AT_HWCAP => hwcap = Some(el[1]),
+ AT_HWCAP2 => hwcap2 = Some(el[1]),
+ _ => (),
+ }
+ }
+
+ if let (Some(hwcap), Some(hwcap2)) = (hwcap, hwcap2) {
+ return Ok(AuxVec { hwcap, hwcap2 });
+ }
+ }
+ drop(buf);
+ Err(())
+}
+
+#[cfg(test)]
+mod tests {
+ extern crate auxv as auxv_crate;
+ use super::*;
+
+ // Reads the Auxiliary Vector key from /proc/self/auxv
+ // using the auxv crate.
+ #[cfg(feature = "std_detect_file_io")]
+ fn auxv_crate_getprocfs(key: usize) -> Option<usize> {
+ use self::auxv_crate::procfs::search_procfs_auxv;
+ use self::auxv_crate::AuxvType;
+ let k = key as AuxvType;
+ match search_procfs_auxv(&[k]) {
+ Ok(v) => Some(v[&k] as usize),
+ Err(_) => None,
+ }
+ }
+
+ // Reads the Auxiliary Vector key from getauxval()
+ // using the auxv crate.
+ #[cfg(not(any(target_arch = "mips", target_arch = "mips64")))]
+ fn auxv_crate_getauxval(key: usize) -> Option<usize> {
+ use self::auxv_crate::getauxval::Getauxval;
+ use self::auxv_crate::AuxvType;
+ let q = auxv_crate::getauxval::NativeGetauxval {};
+ match q.getauxval(key as AuxvType) {
+ Ok(v) => Some(v as usize),
+ Err(_) => None,
+ }
+ }
+
+ // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv
+ // does not always contain the AT_HWCAP key under qemu.
+ #[cfg(any(
+ target_arch = "aarch64",
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+ ))]
+ #[test]
+ fn auxv_crate() {
+ let v = auxv();
+ if let Some(hwcap) = auxv_crate_getauxval(AT_HWCAP) {
+ let rt_hwcap = v.expect("failed to find hwcap key").hwcap;
+ assert_eq!(rt_hwcap, hwcap);
+ }
+
+ // Targets with AT_HWCAP and AT_HWCAP2:
+ #[cfg(any(
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+ ))]
+ {
+ if let Some(hwcap2) = auxv_crate_getauxval(AT_HWCAP2) {
+ let rt_hwcap2 = v.expect("failed to find hwcap2 key").hwcap2;
+ assert_eq!(rt_hwcap2, hwcap2);
+ }
+ }
+ }
+
+ #[test]
+ fn auxv_dump() {
+ if let Ok(auxvec) = auxv() {
+ println!("{:?}", auxvec);
+ } else {
+ println!("both getauxval() and reading /proc/self/auxv failed!");
+ }
+ }
+
+ #[cfg(feature = "std_detect_file_io")]
+ cfg_if::cfg_if! {
+ if #[cfg(target_arch = "arm")] {
+ #[test]
+ fn linux_rpi3() {
+ let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-rpi3.auxv");
+ println!("file: {}", file);
+ let v = auxv_from_file(file).unwrap();
+ assert_eq!(v.hwcap, 4174038);
+ assert_eq!(v.hwcap2, 16);
+ }
+
+ #[test]
+ #[should_panic]
+ fn linux_macos_vb() {
+ let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv");
+ println!("file: {}", file);
+ let v = auxv_from_file(file).unwrap();
+ // This file is incomplete (it contains hwcap but not hwcap2); we
+ // want to fall back to /proc/cpuinfo in this case, so reading
+ // should fail. assert_eq!(v.hwcap, 126614527);
+ // assert_eq!(v.hwcap2, 0);
+ let _ = v;
+ }
+ } else if #[cfg(target_arch = "aarch64")] {
+ #[test]
+ fn linux_x64() {
+ let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-x64-i7-6850k.auxv");
+ println!("file: {}", file);
+ let v = auxv_from_file(file).unwrap();
+ assert_eq!(v.hwcap, 3219913727);
+ }
+ }
+ }
+
+ #[test]
+ #[cfg(feature = "std_detect_file_io")]
+ fn auxv_dump_procfs() {
+ if let Ok(auxvec) = auxv_from_file("/proc/self/auxv") {
+ println!("{:?}", auxvec);
+ } else {
+ println!("reading /proc/self/auxv failed!");
+ }
+ }
+
+ #[cfg(any(
+ target_arch = "aarch64",
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+ ))]
+ #[test]
+ #[cfg(feature = "std_detect_file_io")]
+ fn auxv_crate_procfs() {
+ let v = auxv();
+ if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) {
+ assert_eq!(v.unwrap().hwcap, hwcap);
+ }
+
+ // Targets with AT_HWCAP and AT_HWCAP2:
+ #[cfg(any(
+ target_arch = "arm",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+ ))]
+ {
+ if let Some(hwcap2) = auxv_crate_getprocfs(AT_HWCAP2) {
+ assert_eq!(v.unwrap().hwcap2, hwcap2);
+ }
+ }
+ }
+}
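
The (key, value) scan in `auxv_from_buf` can be exercised against a synthetic buffer; `AT_HWCAP = 16` and `AT_NULL = 0` as above, with a made-up hwcap value (editorial sketch, not part of the patch):

    fn main() {
        const AT_NULL: usize = 0;
        const AT_HWCAP: usize = 16;
        let buf: [usize; 6] = [3, 0x1000, AT_HWCAP, 1 << 12, AT_NULL, 0];
        let mut hwcap = None;
        for el in buf.chunks(2) {
            match el[0] {
                AT_NULL => break,
                AT_HWCAP => hwcap = Some(el[1]),
                _ => (),
            }
        }
        assert_eq!(hwcap, Some(1 << 12)); // bit 12: the NEON bit on 32-bit ARM
    }
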
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/cpuinfo.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/cpuinfo.rs
new file mode 100644
index 000000000..48a5c9728
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/cpuinfo.rs
@@ -0,0 +1,331 @@
+//! Parses /proc/cpuinfo
+#![cfg_attr(not(target_arch = "arm"), allow(dead_code))]
+
+use alloc::string::String;
+
+/// cpuinfo
+pub(crate) struct CpuInfo {
+ raw: String,
+}
+
+impl CpuInfo {
+ /// Reads /proc/cpuinfo into CpuInfo.
+ pub(crate) fn new() -> Result<Self, ()> {
+ let raw = super::read_file("/proc/cpuinfo")?;
+ Ok(Self {
+ raw: String::from_utf8(raw).map_err(|_| ())?,
+ })
+ }
+ /// Returns the value of the cpuinfo `field`.
+ pub(crate) fn field(&self, field: &str) -> CpuInfoField<'_> {
+ for l in self.raw.lines() {
+ if l.trim().starts_with(field) {
+ return CpuInfoField::new(l.split(": ").nth(1));
+ }
+ }
+ CpuInfoField(None)
+ }
+
+ /// Returns the `raw` contents of `/proc/cpuinfo`
+ #[cfg(test)]
+ fn raw(&self) -> &String {
+ &self.raw
+ }
+
+ #[cfg(test)]
+ fn from_str(other: &str) -> Result<Self, ()> {
+ Ok(Self {
+ raw: String::from(other),
+ })
+ }
+}
+
+/// Field of cpuinfo
+#[derive(Debug)]
+pub(crate) struct CpuInfoField<'a>(Option<&'a str>);
+
+impl<'a> PartialEq<&'a str> for CpuInfoField<'a> {
+ fn eq(&self, other: &&'a str) -> bool {
+ match self.0 {
+ None => other.is_empty(),
+ Some(f) => f == other.trim(),
+ }
+ }
+}
+
+impl<'a> CpuInfoField<'a> {
+ pub(crate) fn new<'b>(v: Option<&'b str>) -> CpuInfoField<'b> {
+ match v {
+ None => CpuInfoField::<'b>(None),
+ Some(f) => CpuInfoField::<'b>(Some(f.trim())),
+ }
+ }
+ /// Does the field exist?
+ #[cfg(test)]
+ pub(crate) fn exists(&self) -> bool {
+ self.0.is_some()
+ }
+ /// Does the field contain `other`?
+ pub(crate) fn has(&self, other: &str) -> bool {
+ match self.0 {
+ None => other.is_empty(),
+ Some(f) => {
+ let other = other.trim();
+ for v in f.split(' ') {
+ if v == other {
+ return true;
+ }
+ }
+ false
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn raw_dump() {
+ let cpuinfo = CpuInfo::new().unwrap();
+ if cpuinfo.field("vendor_id") == "GenuineIntel" {
+ assert!(cpuinfo.field("flags").exists());
+ assert!(!cpuinfo.field("vendor33_id").exists());
+ assert!(cpuinfo.field("flags").has("sse"));
+ assert!(!cpuinfo.field("flags").has("avx314"));
+ }
+ println!("{}", cpuinfo.raw());
+ }
+
+ const CORE_DUO_T6500: &str = r"processor : 0
+vendor_id : GenuineIntel
+cpu family : 6
+model : 23
+model name : Intel(R) Core(TM)2 Duo CPU T6500 @ 2.10GHz
+stepping : 10
+microcode : 0xa0b
+cpu MHz : 1600.000
+cache size : 2048 KB
+physical id : 0
+siblings : 2
+core id : 0
+cpu cores : 2
+apicid : 0
+initial apicid : 0
+fdiv_bug : no
+hlt_bug : no
+f00f_bug : no
+coma_bug : no
+fpu : yes
+fpu_exception : yes
+cpuid level : 13
+wp : yes
+flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 xsave lahf_lm dtherm
+bogomips : 4190.43
+clflush size : 64
+cache_alignment : 64
+address sizes : 36 bits physical, 48 bits virtual
+power management:
+";
+
+ #[test]
+ fn core_duo_t6500() {
+ let cpuinfo = CpuInfo::from_str(CORE_DUO_T6500).unwrap();
+ assert_eq!(cpuinfo.field("vendor_id"), "GenuineIntel");
+ assert_eq!(cpuinfo.field("cpu family"), "6");
+ assert_eq!(cpuinfo.field("model"), "23");
+ assert_eq!(
+ cpuinfo.field("model name"),
+ "Intel(R) Core(TM)2 Duo CPU T6500 @ 2.10GHz"
+ );
+ assert_eq!(
+ cpuinfo.field("flags"),
+ "fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 xsave lahf_lm dtherm"
+ );
+ assert!(cpuinfo.field("flags").has("fpu"));
+ assert!(cpuinfo.field("flags").has("dtherm"));
+ assert!(cpuinfo.field("flags").has("sse2"));
+ assert!(!cpuinfo.field("flags").has("avx"));
+ }
+
+ const ARM_CORTEX_A53: &str = r"Processor : AArch64 Processor rev 3 (aarch64)
+ processor : 0
+ processor : 1
+ processor : 2
+ processor : 3
+ processor : 4
+ processor : 5
+ processor : 6
+ processor : 7
+ Features : fp asimd evtstrm aes pmull sha1 sha2 crc32
+ CPU implementer : 0x41
+ CPU architecture: AArch64
+ CPU variant : 0x0
+ CPU part : 0xd03
+ CPU revision : 3
+
+ Hardware : HiKey Development Board
+ ";
+
+ #[test]
+ fn arm_cortex_a53() {
+ let cpuinfo = CpuInfo::from_str(ARM_CORTEX_A53).unwrap();
+ assert_eq!(
+ cpuinfo.field("Processor"),
+ "AArch64 Processor rev 3 (aarch64)"
+ );
+ assert_eq!(
+ cpuinfo.field("Features"),
+ "fp asimd evtstrm aes pmull sha1 sha2 crc32"
+ );
+ assert!(cpuinfo.field("Features").has("pmull"));
+ assert!(!cpuinfo.field("Features").has("neon"));
+ assert!(cpuinfo.field("Features").has("asimd"));
+ }
+
+ const ARM_CORTEX_A57: &str = r"Processor : Cortex A57 Processor rev 1 (aarch64)
+processor : 0
+processor : 1
+processor : 2
+processor : 3
+Features : fp asimd aes pmull sha1 sha2 crc32 wp half thumb fastmult vfp edsp neon vfpv3 tlsi vfpv4 idiva idivt
+CPU implementer : 0x41
+CPU architecture: 8
+CPU variant : 0x1
+CPU part : 0xd07
+CPU revision : 1";
+
+ #[test]
+ fn arm_cortex_a57() {
+ let cpuinfo = CpuInfo::from_str(ARM_CORTEX_A57).unwrap();
+ assert_eq!(
+ cpuinfo.field("Processor"),
+ "Cortex A57 Processor rev 1 (aarch64)"
+ );
+ assert_eq!(
+ cpuinfo.field("Features"),
+ "fp asimd aes pmull sha1 sha2 crc32 wp half thumb fastmult vfp edsp neon vfpv3 tlsi vfpv4 idiva idivt"
+ );
+ assert!(cpuinfo.field("Features").has("pmull"));
+ assert!(cpuinfo.field("Features").has("neon"));
+ assert!(cpuinfo.field("Features").has("asimd"));
+ }
+
+ const RISCV_RV64GC: &str = r"processor : 0
+hart : 3
+isa : rv64imafdc
+mmu : sv39
+uarch : sifive,u74-mc
+
+processor : 1
+hart : 1
+isa : rv64imafdc
+mmu : sv39
+uarch : sifive,u74-mc
+
+processor : 2
+hart : 2
+isa : rv64imafdc
+mmu : sv39
+uarch : sifive,u74-mc
+
+processor : 3
+hart : 4
+isa : rv64imafdc
+mmu : sv39
+uarch : sifive,u74-mc";
+
+ #[test]
+ fn riscv_rv64gc() {
+ let cpuinfo = CpuInfo::from_str(RISCV_RV64GC).unwrap();
+ assert_eq!(cpuinfo.field("isa"), "rv64imafdc");
+ assert_eq!(cpuinfo.field("mmu"), "sv39");
+ assert_eq!(cpuinfo.field("uarch"), "sifive,u74-mc");
+ }
+
+ const POWER8E_POWERKVM: &str = r"processor : 0
+cpu : POWER8E (raw), altivec supported
+clock : 3425.000000MHz
+revision : 2.1 (pvr 004b 0201)
+
+processor : 1
+cpu : POWER8E (raw), altivec supported
+clock : 3425.000000MHz
+revision : 2.1 (pvr 004b 0201)
+
+processor : 2
+cpu : POWER8E (raw), altivec supported
+clock : 3425.000000MHz
+revision : 2.1 (pvr 004b 0201)
+
+processor : 3
+cpu : POWER8E (raw), altivec supported
+clock : 3425.000000MHz
+revision : 2.1 (pvr 004b 0201)
+
+timebase : 512000000
+platform : pSeries
+model : IBM pSeries (emulated by qemu)
+machine : CHRP IBM pSeries (emulated by qemu)";
+
+ #[test]
+ fn power8_powerkvm() {
+ let cpuinfo = CpuInfo::from_str(POWER8E_POWERKVM).unwrap();
+ assert_eq!(cpuinfo.field("cpu"), "POWER8E (raw), altivec supported");
+
+ assert!(cpuinfo.field("cpu").has("altivec"));
+ }
+
+ const POWER5P: &str = r"processor : 0
+cpu : POWER5+ (gs)
+clock : 1900.098000MHz
+revision : 2.1 (pvr 003b 0201)
+
+processor : 1
+cpu : POWER5+ (gs)
+clock : 1900.098000MHz
+revision : 2.1 (pvr 003b 0201)
+
+processor : 2
+cpu : POWER5+ (gs)
+clock : 1900.098000MHz
+revision : 2.1 (pvr 003b 0201)
+
+processor : 3
+cpu : POWER5+ (gs)
+clock : 1900.098000MHz
+revision : 2.1 (pvr 003b 0201)
+
+processor : 4
+cpu : POWER5+ (gs)
+clock : 1900.098000MHz
+revision : 2.1 (pvr 003b 0201)
+
+processor : 5
+cpu : POWER5+ (gs)
+clock : 1900.098000MHz
+revision : 2.1 (pvr 003b 0201)
+
+processor : 6
+cpu : POWER5+ (gs)
+clock : 1900.098000MHz
+revision : 2.1 (pvr 003b 0201)
+
+processor : 7
+cpu : POWER5+ (gs)
+clock : 1900.098000MHz
+revision : 2.1 (pvr 003b 0201)
+
+timebase : 237331000
+platform : pSeries
+machine : CHRP IBM,9133-55A";
+
+ #[test]
+ fn power5p() {
+ let cpuinfo = CpuInfo::from_str(POWER5P).unwrap();
+ assert_eq!(cpuinfo.field("cpu"), "POWER5+ (gs)");
+
+ assert!(!cpuinfo.field("cpu").has("altivec"));
+ }
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs
new file mode 100644
index 000000000..9c030f41a
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/mips.rs
@@ -0,0 +1,25 @@
+//! Run-time feature detection for MIPS on Linux.
+
+use super::auxvec;
+use crate::detect::{bit, cache, Feature};
+
+/// Try to read the features from the auxiliary vector, and if that fails, try
+/// to read them from `/proc/cpuinfo`.
+pub(crate) fn detect_features() -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+ let enable_feature = |value: &mut cache::Initializer, f, enable| {
+ if enable {
+ value.set(f as u32);
+ }
+ };
+
+ // The values are part of the platform-specific [asm/hwcap.h][hwcap]
+ //
+ // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/mips/include/uapi/asm/hwcap.h
+ if let Ok(auxv) = auxvec::auxv() {
+ enable_feature(&mut value, Feature::msa, bit::test(auxv.hwcap, 1));
+ return value;
+ }
+ // TODO: fall back via `cpuinfo`.
+ value
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs
new file mode 100644
index 000000000..a49a72783
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/mod.rs
@@ -0,0 +1,64 @@
+//! Run-time feature detection on Linux
+//!
+#[cfg(feature = "std_detect_file_io")]
+use alloc::vec::Vec;
+
+mod auxvec;
+
+#[cfg(feature = "std_detect_file_io")]
+mod cpuinfo;
+
+#[cfg(feature = "std_detect_file_io")]
+fn read_file(path: &str) -> Result<Vec<u8>, ()> {
+ let mut path = Vec::from(path.as_bytes());
+ path.push(0);
+
+ unsafe {
+ let file = libc::open(path.as_ptr() as *const libc::c_char, libc::O_RDONLY);
+ if file == -1 {
+ return Err(());
+ }
+
+ let mut data = Vec::new();
+ loop {
+ data.reserve(4096);
+ let spare = data.spare_capacity_mut();
+ match libc::read(file, spare.as_mut_ptr() as *mut _, spare.len()) {
+ -1 => {
+ libc::close(file);
+ return Err(());
+ }
+ 0 => break,
+ n => data.set_len(data.len() + n as usize),
+ }
+ }
+
+ libc::close(file);
+ Ok(data)
+ }
+}
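Editor's note: `read_file` avoids `std` (the crate is `#![no_std]`), so it grows the buffer with `reserve`/`spare_capacity_mut` and only commits bytes with `set_len` after `libc::read` reports how many were written. For illustration, the same loop shape with a safe `std::io::Read` source (not the crate's code):

```rust
// Illustrative, std-based analogue of the read loop above.
use std::io::Read;

fn read_all(mut src: impl Read) -> std::io::Result<Vec<u8>> {
    let mut data = Vec::new();
    let mut buf = [0u8; 4096];
    loop {
        match src.read(&mut buf)? {
            0 => break, // end of file, like the `0 => break` arm above
            n => data.extend_from_slice(&buf[..n]),
        }
    }
    Ok(data)
}

fn main() -> std::io::Result<()> {
    let data = read_all(&[1u8, 2, 3][..])?;
    assert_eq!(data, vec![1, 2, 3]);
    Ok(())
}
```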
+
+cfg_if::cfg_if! {
+ if #[cfg(target_arch = "aarch64")] {
+ mod aarch64;
+ pub(crate) use self::aarch64::detect_features;
+ } else if #[cfg(target_arch = "arm")] {
+ mod arm;
+ pub(crate) use self::arm::detect_features;
+ } else if #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] {
+ mod riscv;
+ pub(crate) use self::riscv::detect_features;
+ } else if #[cfg(any(target_arch = "mips", target_arch = "mips64"))] {
+ mod mips;
+ pub(crate) use self::mips::detect_features;
+ } else if #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] {
+ mod powerpc;
+ pub(crate) use self::powerpc::detect_features;
+ } else {
+ use crate::detect::cache;
+ /// Performs run-time feature detection.
+ pub(crate) fn detect_features() -> cache::Initializer {
+ cache::Initializer::default()
+ }
+ }
+}
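Editor's note: the `cfg_if!` block compiles exactly one `detect_features` per target, falling back to a no-op initializer elsewhere. A tiny sketch of the same dispatch pattern (assuming the `cfg-if` crate as a dependency):

```rust
// Minimal sketch of compile-time dispatch with cfg_if, as used above.
cfg_if::cfg_if! {
    if #[cfg(target_arch = "x86_64")] {
        fn arch_name() -> &'static str { "x86_64" }
    } else if #[cfg(target_arch = "aarch64")] {
        fn arch_name() -> &'static str { "aarch64" }
    } else {
        fn arch_name() -> &'static str { "other" }
    }
}

fn main() {
    println!("compiled for: {}", arch_name());
}
```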
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs
new file mode 100644
index 000000000..c3308e815
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/powerpc.rs
@@ -0,0 +1,36 @@
+//! Run-time feature detection for PowerPC on Linux.
+
+use super::auxvec;
+use crate::detect::{cache, Feature};
+
+/// Try to read the features from the auxiliary vector, and if that fails, try
+/// to read them from /proc/cpuinfo.
+pub(crate) fn detect_features() -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+ let enable_feature = |value: &mut cache::Initializer, f, enable| {
+ if enable {
+ value.set(f as u32);
+ }
+ };
+
+ // The values are part of the platform-specific [asm/cputable.h][cputable]
+ //
+ // [cputable]: https://github.com/torvalds/linux/blob/master/arch/powerpc/include/uapi/asm/cputable.h
+ if let Ok(auxv) = auxvec::auxv() {
+ // note: the PowerPC values are the mask to do the test (instead of the
+ // index of the bit to test like in ARM and Aarch64)
+ enable_feature(&mut value, Feature::altivec, auxv.hwcap & 0x10000000 != 0);
+ enable_feature(&mut value, Feature::vsx, auxv.hwcap & 0x00000080 != 0);
+ enable_feature(&mut value, Feature::power8, auxv.hwcap2 & 0x80000000 != 0);
+ return value;
+ }
+
+ // PowerPC's /proc/cpuinfo lacks a proper Feature field,
+ // but `altivec` support is indicated in the `cpu` field.
+ #[cfg(feature = "std_detect_file_io")]
+ if let Ok(c) = super::cpuinfo::CpuInfo::new() {
+ enable_feature(&mut value, Feature::altivec, c.field("cpu").has("altivec"));
+ return value;
+ }
+ value
+}
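Editor's note: the comment above about PowerPC using masks rather than bit indices is easy to verify concretely; the AltiVec mask `0x1000_0000` is just bit 28 spelled out, so both styles test the same thing:

```rust
// Mask-style (PowerPC) vs. index-style (ARM/AArch64) hwcap tests are equivalent.
fn main() {
    let hwcap: usize = 0x1000_0000; // pretend AT_HWCAP with only AltiVec set
    let by_mask = hwcap & 0x1000_0000 != 0; // PowerPC convention: value is a mask
    let by_index = hwcap & (1 << 28) != 0; // ARM convention: value is a bit index
    assert_eq!(by_mask, by_index);
}
```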
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/riscv.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/riscv.rs
new file mode 100644
index 000000000..1ec06959a
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/riscv.rs
@@ -0,0 +1,73 @@
+//! Run-time feature detection for RISC-V on Linux.
+
+use super::auxvec;
+use crate::detect::{bit, cache, Feature};
+
+/// Read list of supported features from the auxiliary vector.
+pub(crate) fn detect_features() -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+ let enable_feature = |value: &mut cache::Initializer, feature, enable| {
+ if enable {
+ value.set(feature as u32);
+ }
+ };
+ let enable_features = |value: &mut cache::Initializer, feature_slice: &[Feature], enable| {
+ if enable {
+ for feature in feature_slice {
+ value.set(*feature as u32);
+ }
+ }
+ };
+
+ // The values are part of the platform-specific [asm/hwcap.h][hwcap]
+ //
+ // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/riscv/include/asm/hwcap.h
+ let auxv = auxvec::auxv().expect("read auxvec"); // should not fail on RISC-V platform
+ enable_feature(
+ &mut value,
+ Feature::a,
+ bit::test(auxv.hwcap, (b'a' - b'a').into()),
+ );
+ enable_feature(
+ &mut value,
+ Feature::c,
+ bit::test(auxv.hwcap, (b'c' - b'a').into()),
+ );
+ enable_features(
+ &mut value,
+ &[Feature::d, Feature::f, Feature::zicsr],
+ bit::test(auxv.hwcap, (b'd' - b'a').into()),
+ );
+ enable_features(
+ &mut value,
+ &[Feature::f, Feature::zicsr],
+ bit::test(auxv.hwcap, (b'f' - b'a').into()),
+ );
+ let has_i = bit::test(auxv.hwcap, (b'i' - b'a').into());
+ // If future RV128I is supported, implement with `enable_feature` here
+ #[cfg(target_pointer_width = "64")]
+ enable_feature(&mut value, Feature::rv64i, has_i);
+ #[cfg(target_pointer_width = "32")]
+ enable_feature(&mut value, Feature::rv32i, has_i);
+ #[cfg(target_pointer_width = "32")]
+ enable_feature(
+ &mut value,
+ Feature::rv32e,
+ bit::test(auxv.hwcap, (b'e' - b'a').into()),
+ );
+ enable_feature(
+ &mut value,
+ Feature::h,
+ bit::test(auxv.hwcap, (b'h' - b'a').into()),
+ );
+ enable_feature(
+ &mut value,
+ Feature::m,
+ bit::test(auxv.hwcap, (b'm' - b'a').into()),
+ );
+    // FIXME: the auxiliary vector does not expose supervisor feature support, but that
+    // mode may be useful to detect when Rust is used to write Linux kernel modules.
+    // A mechanism other than the auxiliary vector would be needed to detect those features.
+
+ value
+}
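Editor's note: on RISC-V, single-letter ISA extensions map to `AT_HWCAP` bit `letter - 'a'`, which is what all the `b'x' - b'a'` expressions above compute. A quick sanity check of that mapping:

```rust
// The single-letter RISC-V extension 'x' lives at hwcap bit (x - 'a').
fn hwcap_bit(ext: u8) -> u32 {
    u32::from(ext - b'a')
}

fn main() {
    assert_eq!(hwcap_bit(b'a'), 0); // atomics
    assert_eq!(hwcap_bit(b'c'), 2); // compressed instructions
    assert_eq!(hwcap_bit(b'f'), 5); // single-precision float
    assert_eq!(hwcap_bit(b'm'), 12); // integer multiply/divide
}
```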
diff --git a/library/stdarch/crates/std_detect/src/detect/os/other.rs b/library/stdarch/crates/std_detect/src/detect/os/other.rs
new file mode 100644
index 000000000..091fafc4e
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/other.rs
@@ -0,0 +1,8 @@
+//! Other operating systems
+
+use crate::detect::cache;
+
+#[allow(dead_code)]
+pub(crate) fn detect_features() -> cache::Initializer {
+ cache::Initializer::default()
+}
diff --git a/library/stdarch/crates/std_detect/src/detect/os/windows/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/windows/aarch64.rs
new file mode 100644
index 000000000..051ad6d1b
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/windows/aarch64.rs
@@ -0,0 +1,59 @@
+//! Run-time feature detection for Aarch64 on Windows.
+
+use crate::detect::{cache, Feature};
+
+/// Try to read the features using IsProcessorFeaturePresent.
+pub(crate) fn detect_features() -> cache::Initializer {
+ type DWORD = u32;
+ type BOOL = i32;
+
+ const FALSE: BOOL = 0;
+    // The following Microsoft document isn't updated for aarch64:
+    // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
+    // These constants are defined in winnt.h of the Windows SDK.
+ const PF_ARM_NEON_INSTRUCTIONS_AVAILABLE: u32 = 19;
+ const PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE: u32 = 30;
+ const PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE: u32 = 31;
+
+ extern "system" {
+ pub fn IsProcessorFeaturePresent(ProcessorFeature: DWORD) -> BOOL;
+ }
+
+ let mut value = cache::Initializer::default();
+ {
+ let mut enable_feature = |f, enable| {
+ if enable {
+ value.set(f as u32);
+ }
+ };
+
+        // Some features, such as Feature::fp, may be supported by the current
+        // CPU even though there is no OS API to detect them.
+        // Also, we require an unsafe block for the extern "system" calls.
+ unsafe {
+ enable_feature(
+ Feature::asimd,
+ IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != FALSE,
+ );
+ enable_feature(
+ Feature::crc,
+ IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != FALSE,
+ );
+            // PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE covers aes, sha1, sha2,
+            // and pmull support.
+ enable_feature(
+ Feature::aes,
+ IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != FALSE,
+ );
+ enable_feature(
+ Feature::pmull,
+ IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != FALSE,
+ );
+ enable_feature(
+ Feature::sha2,
+ IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != FALSE,
+ );
+ }
+ }
+ value
+}
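Editor's note: the same query pattern can be exercised outside the crate. A minimal, Windows-only sketch that binds `IsProcessorFeaturePresent` directly and tests the NEON flag (constant copied from the code above):

```rust
// Windows-only sketch: query one PF_* flag via IsProcessorFeaturePresent.
#[cfg(windows)]
fn main() {
    const PF_ARM_NEON_INSTRUCTIONS_AVAILABLE: u32 = 19;
    extern "system" {
        fn IsProcessorFeaturePresent(feature: u32) -> i32;
    }
    // Returns 0 (FALSE) when the feature is unsupported or unknown.
    let neon = unsafe { IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0 };
    println!("neon: {neon}");
}

#[cfg(not(windows))]
fn main() {}
```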
diff --git a/library/stdarch/crates/std_detect/src/detect/os/x86.rs b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
new file mode 100644
index 000000000..ea5f595ec
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
@@ -0,0 +1,273 @@
+//! x86 run-time feature detection is OS independent.
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+use core::mem;
+
+use crate::detect::{bit, cache, Feature};
+
+/// Run-time feature detection on x86 works by using the CPUID instruction.
+///
+/// The [CPUID Wikipedia page][wiki_cpuid] contains
+/// all the information about which flags to set to query which values, and in
+/// which registers these are reported.
+///
+/// The definitive references are:
+/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+/// Instruction Set Reference, A-Z][intel64_ref].
+/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+/// System Instructions][amd64_ref].
+///
+/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
+/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+#[allow(clippy::similar_names)]
+pub(crate) fn detect_features() -> cache::Initializer {
+ let mut value = cache::Initializer::default();
+
+ // If the x86 CPU does not support the CPUID instruction then it is too
+ // old to support any of the currently-detectable features.
+ if !has_cpuid() {
+ return value;
+ }
+
+ // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
+ // has `cpuid` support.
+
+    // 0. EAX = 0: Basic Information:
+    //   - EAX returns the "Highest Function Parameter", that is, the maximum
+    //     leaf value for subsequent calls of `cpuid` in the range
+    //     [0, 0x8000_0000].
+    //   - The vendor ID is stored as 12 ASCII bytes, returned in EBX, EDX,
+    //     and ECX (in that order):
+ let (max_basic_leaf, vendor_id) = unsafe {
+ let CpuidResult {
+ eax: max_basic_leaf,
+ ebx,
+ ecx,
+ edx,
+ } = __cpuid(0);
+ let vendor_id: [[u8; 4]; 3] = [
+ mem::transmute(ebx),
+ mem::transmute(edx),
+ mem::transmute(ecx),
+ ];
+ let vendor_id: [u8; 12] = mem::transmute(vendor_id);
+ (max_basic_leaf, vendor_id)
+ };
+
+ if max_basic_leaf < 1 {
+        // Some early Intel 486 CPUs do not implement CPUID leaf 1.
+ return value;
+ }
+
+ // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
+ // Contains information about most x86 features.
+ let CpuidResult {
+ ecx: proc_info_ecx,
+ edx: proc_info_edx,
+ ..
+ } = unsafe { __cpuid(0x0000_0001_u32) };
+
+    // EAX = 7, ECX = 0: Queries "Extended Features";
+    // Contains information about bmi, bmi2, and avx2 support.
+    let (extended_features_ebx, extended_features_ecx, extended_features_edx) =
+        if max_basic_leaf >= 7 {
+            let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
+            (ebx, ecx, edx)
+        } else {
+            (0, 0, 0) // CPUID does not support "Extended Features"
+        };
+
+ // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
+ // - EAX returns the max leaf value for extended information, that is,
+ // `cpuid` calls in range [0x8000_0000; u32::MAX]:
+ let CpuidResult {
+ eax: extended_max_basic_leaf,
+ ..
+ } = unsafe { __cpuid(0x8000_0000_u32) };
+
+ // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature
+ // Bits"
+ let extended_proc_info_ecx = if extended_max_basic_leaf >= 1 {
+ let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
+ ecx
+ } else {
+ 0
+ };
+
+ {
+ // borrows value till the end of this scope:
+ let mut enable = |r, rb, f| {
+ if bit::test(r as usize, rb) {
+ value.set(f as u32);
+ }
+ };
+
+ enable(proc_info_ecx, 0, Feature::sse3);
+ enable(proc_info_ecx, 1, Feature::pclmulqdq);
+ enable(proc_info_ecx, 9, Feature::ssse3);
+ enable(proc_info_ecx, 13, Feature::cmpxchg16b);
+ enable(proc_info_ecx, 19, Feature::sse4_1);
+ enable(proc_info_ecx, 20, Feature::sse4_2);
+ enable(proc_info_ecx, 23, Feature::popcnt);
+ enable(proc_info_ecx, 25, Feature::aes);
+ enable(proc_info_ecx, 29, Feature::f16c);
+ enable(proc_info_ecx, 30, Feature::rdrand);
+ enable(extended_features_ebx, 18, Feature::rdseed);
+ enable(extended_features_ebx, 19, Feature::adx);
+ enable(extended_features_ebx, 11, Feature::rtm);
+ enable(proc_info_edx, 4, Feature::tsc);
+ enable(proc_info_edx, 23, Feature::mmx);
+ enable(proc_info_edx, 24, Feature::fxsr);
+ enable(proc_info_edx, 25, Feature::sse);
+ enable(proc_info_edx, 26, Feature::sse2);
+ enable(extended_features_ebx, 29, Feature::sha);
+
+ enable(extended_features_ebx, 3, Feature::bmi1);
+ enable(extended_features_ebx, 8, Feature::bmi2);
+
+ // `XSAVE` and `AVX` support:
+ let cpu_xsave = bit::test(proc_info_ecx as usize, 26);
+ if cpu_xsave {
+ // 0. Here the CPU supports `XSAVE`.
+
+ // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
+ // supports saving the state of the AVX/AVX2 vector registers on
+ // context-switches, see:
+ //
+ // - [intel: is avx enabled?][is_avx_enabled],
+ // - [mozilla: sse.cpp][mozilla_sse_cpp].
+ //
+ // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
+ // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
+ let cpu_osxsave = bit::test(proc_info_ecx as usize, 27);
+
+ if cpu_osxsave {
+ // 2. The OS must have signaled the CPU that it supports saving and
+ // restoring the:
+ //
+ // * SSE -> `XCR0.SSE[1]`
+ // * AVX -> `XCR0.AVX[2]`
+ // * AVX-512 -> `XCR0.AVX-512[7:5]`.
+ //
+ // by setting the corresponding bits of `XCR0` to `1`.
+ //
+ // This is safe because the CPU supports `xsave`
+ // and the OS has set `osxsave`.
+ let xcr0 = unsafe { _xgetbv(0) };
+ // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
+ let os_avx_support = xcr0 & 6 == 6;
+ // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
+ let os_avx512_support = xcr0 & 224 == 224;
+
+ // Only if the OS and the CPU support saving/restoring the AVX
+ // registers we enable `xsave` support:
+ if os_avx_support {
+ // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
+ // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
+ // Developer’s Manual, Volume 1: Basic Architecture":
+ //
+ // "Software enables the XSAVE feature set by setting
+ // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
+ // instruction). If this bit is 0, execution of any of XGETBV,
+ // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
+ // causes an invalid-opcode exception (#UD)"
+ //
+ enable(proc_info_ecx, 26, Feature::xsave);
+
+ // For `xsaveopt`, `xsavec`, and `xsaves` we need to query:
+ // Processor Extended State Enumeration Sub-leaf (EAX = 0DH,
+ // ECX = 1):
+ if max_basic_leaf >= 0xd {
+ let CpuidResult {
+ eax: proc_extended_state1_eax,
+ ..
+ } = unsafe { __cpuid_count(0xd_u32, 1) };
+ enable(proc_extended_state1_eax, 0, Feature::xsaveopt);
+ enable(proc_extended_state1_eax, 1, Feature::xsavec);
+ enable(proc_extended_state1_eax, 3, Feature::xsaves);
+ }
+
+ // FMA (uses 256-bit wide registers):
+ enable(proc_info_ecx, 12, Feature::fma);
+
+ // And AVX/AVX2:
+ enable(proc_info_ecx, 28, Feature::avx);
+ enable(extended_features_ebx, 5, Feature::avx2);
+
+ // For AVX-512 the OS also needs to support saving/restoring
+ // the extended state, only then we enable AVX-512 support:
+ if os_avx512_support {
+ enable(extended_features_ebx, 16, Feature::avx512f);
+ enable(extended_features_ebx, 17, Feature::avx512dq);
+ enable(extended_features_ebx, 21, Feature::avx512ifma);
+ enable(extended_features_ebx, 26, Feature::avx512pf);
+ enable(extended_features_ebx, 27, Feature::avx512er);
+ enable(extended_features_ebx, 28, Feature::avx512cd);
+ enable(extended_features_ebx, 30, Feature::avx512bw);
+ enable(extended_features_ebx, 31, Feature::avx512vl);
+ enable(extended_features_ecx, 1, Feature::avx512vbmi);
+ enable(extended_features_ecx, 5, Feature::avx512bf16);
+ enable(extended_features_ecx, 6, Feature::avx512vbmi2);
+ enable(extended_features_ecx, 8, Feature::avx512gfni);
+                        // Note: VP2INTERSECT is reported in EDX bit 8 of leaf 7
+                        // (ECX bit 8 is GFNI), hence extended_features_edx here.
+                        enable(extended_features_edx, 8, Feature::avx512vp2intersect);
+ enable(extended_features_ecx, 9, Feature::avx512vaes);
+ enable(extended_features_ecx, 10, Feature::avx512vpclmulqdq);
+ enable(extended_features_ecx, 11, Feature::avx512vnni);
+ enable(extended_features_ecx, 12, Feature::avx512bitalg);
+ enable(extended_features_ecx, 14, Feature::avx512vpopcntdq);
+ }
+ }
+ }
+ }
+
+ // This detects ABM on AMD CPUs and LZCNT on Intel CPUs.
+    // On Intel CPUs with popcnt, lzcnt implements the
+ // "missing part" of ABM, so we map both to the same
+ // internal feature.
+ //
+ // The `is_x86_feature_detected!("lzcnt")` macro then
+ // internally maps to Feature::abm.
+ enable(extended_proc_info_ecx, 5, Feature::lzcnt);
+
+    // Hygon Dhyana originates from AMD technology and shares most of the architecture with
+    // AMD's family 17h, but has a different CPU Vendor ID ("HygonGenuine") and family series
+    // number (family 18h).
+    //
+    // For CPUID feature bits, Hygon Dhyana (family 18h) shares the same definitions with AMD
+    // family 17h.
+ //
+ // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf.
+ // Related Hygon kernel patch can be found on
+ // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn
+ if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" {
+ // These features are available on AMD arch CPUs:
+ enable(extended_proc_info_ecx, 6, Feature::sse4a);
+ enable(extended_proc_info_ecx, 21, Feature::tbm);
+ }
+ }
+
+    // Unfortunately, some Skylake chips erroneously report support for BMI1 and
+    // BMI2 without actual support. These chips don't support AVX, and it seems
+    // that all Intel chips with genuine BMI support also support AVX (other
+    // vendors were not checked), so we can disable these flags on chips that
+    // don't also report support for AVX.
+ //
+ // It's possible this will pessimize future chips that do support BMI and
+ // not AVX, but this seems minor compared to a hard crash you get when
+ // executing an unsupported instruction (to put it another way, it's safe
+ // for us to under-report CPU features, but not to over-report them). Still,
+ // to limit any impact this may have in the future, we only do this for
+ // Intel chips, as it's a bug only present in their chips.
+ //
+ // This bug is documented as `SKL052` in the errata section of this document:
+ // http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/desktop-6th-gen-core-family-spec-update.pdf
+ if vendor_id == *b"GenuineIntel" && !value.test(Feature::avx as u32) {
+ value.unset(Feature::bmi1 as u32);
+ value.unset(Feature::bmi2 as u32);
+ }
+
+ value
+}
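Editor's note: two details above are easy to verify in isolation: the EBX/EDX/ECX ordering of the 12-byte vendor string, and the `XCR0` masks (`0b110` for SSE+AVX state, `0b1110_0000` for the three AVX-512 state components). A small sketch with hard-coded register values instead of real `cpuid`/`xgetbv` results, so it runs anywhere:

```rust
// Check the vendor-string layout and the XCR0 masks used above.
fn main() {
    // "GenuineIntel" comes back as EBX = "Genu", EDX = "ineI", ECX = "ntel".
    let (ebx, edx, ecx) = (
        u32::from_le_bytes(*b"Genu"),
        u32::from_le_bytes(*b"ineI"),
        u32::from_le_bytes(*b"ntel"),
    );
    let mut vendor = [0u8; 12];
    vendor[0..4].copy_from_slice(&ebx.to_le_bytes());
    vendor[4..8].copy_from_slice(&edx.to_le_bytes());
    vendor[8..12].copy_from_slice(&ecx.to_le_bytes());
    assert_eq!(&vendor, b"GenuineIntel");

    // XCR0 with SSE (bit 1), AVX (bit 2) and AVX-512 state (bits 5..=7) set.
    let xcr0: u64 = 0b1110_0110;
    assert!(xcr0 & 6 == 6); // OS saves/restores SSE and AVX registers
    assert!(xcr0 & 224 == 224); // OS saves/restores the AVX-512 state too
}
```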
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv
new file mode 100644
index 000000000..0538e661f
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-rpi3.auxv
Binary files differ
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv
new file mode 100644
index 000000000..6afe1b3b4
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv
Binary files differ
diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv
new file mode 100644
index 000000000..75abc02d1
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv
Binary files differ
diff --git a/library/stdarch/crates/std_detect/src/lib.rs b/library/stdarch/crates/std_detect/src/lib.rs
new file mode 100644
index 000000000..c0e0de0dd
--- /dev/null
+++ b/library/stdarch/crates/std_detect/src/lib.rs
@@ -0,0 +1,34 @@
+//! Run-time feature detection for the Rust standard library.
+//!
+//! To detect whether a feature is enabled in the system running the binary,
+//! use the appropriate macro for the target:
+//!
+//! * `x86` and `x86_64`: [`is_x86_feature_detected`]
+//! * `arm`: [`is_arm_feature_detected`]
+//! * `aarch64`: [`is_aarch64_feature_detected`]
+//! * `riscv`: [`is_riscv_feature_detected`]
+//! * `mips`: [`is_mips_feature_detected`]
+//! * `mips64`: [`is_mips64_feature_detected`]
+//! * `powerpc`: [`is_powerpc_feature_detected`]
+//! * `powerpc64`: [`is_powerpc64_feature_detected`]
+
+#![unstable(feature = "stdsimd", issue = "27731")]
+#![feature(staged_api, stdsimd, doc_cfg, allow_internal_unstable)]
+#![deny(rust_2018_idioms)]
+#![allow(clippy::shadow_reuse)]
+#![deny(clippy::missing_inline_in_public_items)]
+#![cfg_attr(test, allow(unused_imports))]
+#![no_std]
+
+#[cfg(test)]
+#[macro_use]
+extern crate std;
+
+// rust-lang/rust#83888: removing `extern crate` gives an error mentioning `vec_spare_capacity`.
+#[cfg_attr(feature = "std_detect_file_io", allow(unused_extern_crates))]
+#[cfg(feature = "std_detect_file_io")]
+extern crate alloc;
+
+#[doc(hidden)]
+#[unstable(feature = "stdsimd", issue = "27731")]
+pub mod detect;
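Editor's note: typical call sites look like the following; the macros expand to a cached run-time check, so they are cheap to use at branch points. This sketch uses the stable `is_x86_feature_detected!` that `std` re-exports from this crate:

```rust
// Branch on a cached run-time feature check.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn main() {
    if is_x86_feature_detected!("avx2") {
        println!("AVX2 available: take the wide path");
    } else {
        println!("no AVX2: take the scalar path");
    }
}

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
fn main() {}
```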
diff --git a/library/stdarch/crates/std_detect/tests/cpu-detection.rs b/library/stdarch/crates/std_detect/tests/cpu-detection.rs
new file mode 100644
index 000000000..ca8bf28f4
--- /dev/null
+++ b/library/stdarch/crates/std_detect/tests/cpu-detection.rs
@@ -0,0 +1,164 @@
+#![feature(stdsimd)]
+#![allow(clippy::unwrap_used, clippy::use_debug, clippy::print_stdout)]
+#![cfg(any(
+ target_arch = "arm",
+ target_arch = "aarch64",
+ target_arch = "x86",
+ target_arch = "x86_64",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+))]
+
+#[macro_use]
+extern crate std_detect;
+
+#[test]
+fn all() {
+ for (f, e) in std_detect::detect::features() {
+ println!("{}: {}", f, e);
+ }
+}
+
+#[test]
+#[cfg(all(target_arch = "arm", any(target_os = "linux", target_os = "android")))]
+fn arm_linux() {
+ println!("neon: {}", is_arm_feature_detected!("neon"));
+ println!("pmull: {}", is_arm_feature_detected!("pmull"));
+ println!("crc: {}", is_arm_feature_detected!("crc"));
+ println!("crypto: {}", is_arm_feature_detected!("crypto"));
+ println!("aes: {}", is_arm_feature_detected!("aes"));
+ println!("sha2: {}", is_arm_feature_detected!("sha2"));
+}
+
+#[test]
+#[cfg(all(
+ target_arch = "aarch64",
+ any(target_os = "linux", target_os = "android")
+))]
+fn aarch64_linux() {
+ println!("asimd: {}", is_aarch64_feature_detected!("asimd"));
+ println!("neon: {}", is_aarch64_feature_detected!("neon"));
+ println!("pmull: {}", is_aarch64_feature_detected!("pmull"));
+ println!("fp: {}", is_aarch64_feature_detected!("fp"));
+ println!("fp16: {}", is_aarch64_feature_detected!("fp16"));
+ println!("sve: {}", is_aarch64_feature_detected!("sve"));
+ println!("crc: {}", is_aarch64_feature_detected!("crc"));
+ println!("lse: {}", is_aarch64_feature_detected!("lse"));
+ println!("lse2: {}", is_aarch64_feature_detected!("lse2"));
+ println!("rdm: {}", is_aarch64_feature_detected!("rdm"));
+ println!("rcpc: {}", is_aarch64_feature_detected!("rcpc"));
+ println!("rcpc2: {}", is_aarch64_feature_detected!("rcpc2"));
+ println!("dotprod: {}", is_aarch64_feature_detected!("dotprod"));
+ println!("tme: {}", is_aarch64_feature_detected!("tme"));
+ println!("fhm: {}", is_aarch64_feature_detected!("fhm"));
+ println!("dit: {}", is_aarch64_feature_detected!("dit"));
+ println!("flagm: {}", is_aarch64_feature_detected!("flagm"));
+ println!("ssbs: {}", is_aarch64_feature_detected!("ssbs"));
+ println!("sb: {}", is_aarch64_feature_detected!("sb"));
+ println!("paca: {}", is_aarch64_feature_detected!("paca"));
+ println!("pacg: {}", is_aarch64_feature_detected!("pacg"));
+ println!("dpb: {}", is_aarch64_feature_detected!("dpb"));
+ println!("dpb2: {}", is_aarch64_feature_detected!("dpb2"));
+ println!("sve2: {}", is_aarch64_feature_detected!("sve2"));
+ println!("sve2-aes: {}", is_aarch64_feature_detected!("sve2-aes"));
+ println!("sve2-sm4: {}", is_aarch64_feature_detected!("sve2-sm4"));
+ println!("sve2-sha3: {}", is_aarch64_feature_detected!("sve2-sha3"));
+ println!(
+ "sve2-bitperm: {}",
+ is_aarch64_feature_detected!("sve2-bitperm")
+ );
+ println!("frintts: {}", is_aarch64_feature_detected!("frintts"));
+ println!("i8mm: {}", is_aarch64_feature_detected!("i8mm"));
+ println!("f32mm: {}", is_aarch64_feature_detected!("f32mm"));
+ println!("f64mm: {}", is_aarch64_feature_detected!("f64mm"));
+ println!("bf16: {}", is_aarch64_feature_detected!("bf16"));
+ println!("rand: {}", is_aarch64_feature_detected!("rand"));
+ println!("bti: {}", is_aarch64_feature_detected!("bti"));
+ println!("mte: {}", is_aarch64_feature_detected!("mte"));
+ println!("jsconv: {}", is_aarch64_feature_detected!("jsconv"));
+ println!("fcma: {}", is_aarch64_feature_detected!("fcma"));
+ println!("aes: {}", is_aarch64_feature_detected!("aes"));
+ println!("sha2: {}", is_aarch64_feature_detected!("sha2"));
+ println!("sha3: {}", is_aarch64_feature_detected!("sha3"));
+ println!("sm4: {}", is_aarch64_feature_detected!("sm4"));
+}
+
+#[test]
+#[cfg(all(target_arch = "powerpc", target_os = "linux"))]
+fn powerpc_linux() {
+ println!("altivec: {}", is_powerpc_feature_detected!("altivec"));
+ println!("vsx: {}", is_powerpc_feature_detected!("vsx"));
+ println!("power8: {}", is_powerpc_feature_detected!("power8"));
+}
+
+#[test]
+#[cfg(all(target_arch = "powerpc64", target_os = "linux"))]
+fn powerpc64_linux() {
+ println!("altivec: {}", is_powerpc64_feature_detected!("altivec"));
+ println!("vsx: {}", is_powerpc64_feature_detected!("vsx"));
+ println!("power8: {}", is_powerpc64_feature_detected!("power8"));
+}
+
+#[test]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn x86_all() {
+ println!("aes: {:?}", is_x86_feature_detected!("aes"));
+ println!("pcmulqdq: {:?}", is_x86_feature_detected!("pclmulqdq"));
+ println!("rdrand: {:?}", is_x86_feature_detected!("rdrand"));
+ println!("rdseed: {:?}", is_x86_feature_detected!("rdseed"));
+ println!("tsc: {:?}", is_x86_feature_detected!("tsc"));
+ println!("mmx: {:?}", is_x86_feature_detected!("mmx"));
+ println!("sse: {:?}", is_x86_feature_detected!("sse"));
+ println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
+ println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
+ println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
+ println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
+ println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
+ println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
+ println!("sha: {:?}", is_x86_feature_detected!("sha"));
+ println!("avx: {:?}", is_x86_feature_detected!("avx"));
+ println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
+ println!("avx512f: {:?}", is_x86_feature_detected!("avx512f"));
+ println!("avx512cd: {:?}", is_x86_feature_detected!("avx512cd"));
+ println!("avx512er: {:?}", is_x86_feature_detected!("avx512er"));
+ println!("avx512pf: {:?}", is_x86_feature_detected!("avx512pf"));
+ println!("avx512bw: {:?}", is_x86_feature_detected!("avx512bw"));
+ println!("avx512dq: {:?}", is_x86_feature_detected!("avx512dq"));
+ println!("avx512vl: {:?}", is_x86_feature_detected!("avx512vl"));
+ println!("avx512ifma: {:?}", is_x86_feature_detected!("avx512ifma"));
+ println!("avx512vbmi: {:?}", is_x86_feature_detected!("avx512vbmi"));
+ println!(
+ "avx512vpopcntdq: {:?}",
+ is_x86_feature_detected!("avx512vpopcntdq")
+ );
+ println!("avx512vbmi2 {:?}", is_x86_feature_detected!("avx512vbmi2"));
+ println!("avx512gfni {:?}", is_x86_feature_detected!("avx512gfni"));
+ println!("avx512vaes {:?}", is_x86_feature_detected!("avx512vaes"));
+ println!(
+ "avx512vpclmulqdq {:?}",
+ is_x86_feature_detected!("avx512vpclmulqdq")
+ );
+ println!("avx512vnni {:?}", is_x86_feature_detected!("avx512vnni"));
+ println!(
+ "avx512bitalg {:?}",
+ is_x86_feature_detected!("avx512bitalg")
+ );
+ println!("avx512bf16 {:?}", is_x86_feature_detected!("avx512bf16"));
+ println!(
+ "avx512vp2intersect {:?}",
+ is_x86_feature_detected!("avx512vp2intersect")
+ );
+ println!("f16c: {:?}", is_x86_feature_detected!("f16c"));
+ println!("fma: {:?}", is_x86_feature_detected!("fma"));
+ println!("bmi1: {:?}", is_x86_feature_detected!("bmi1"));
+ println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
+ println!("abm: {:?}", is_x86_feature_detected!("abm"));
+ println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
+ println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
+ println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
+ println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
+ println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
+ println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
+ println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
+ println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
+}
diff --git a/library/stdarch/crates/std_detect/tests/macro_trailing_commas.rs b/library/stdarch/crates/std_detect/tests/macro_trailing_commas.rs
new file mode 100644
index 000000000..cd597af73
--- /dev/null
+++ b/library/stdarch/crates/std_detect/tests/macro_trailing_commas.rs
@@ -0,0 +1,51 @@
+#![feature(stdsimd)]
+#![allow(clippy::unwrap_used, clippy::use_debug, clippy::print_stdout)]
+
+#[cfg(any(
+ target_arch = "arm",
+ target_arch = "aarch64",
+ target_arch = "x86",
+ target_arch = "x86_64",
+ target_arch = "powerpc",
+ target_arch = "powerpc64"
+))]
+#[macro_use]
+extern crate std_detect;
+
+#[test]
+#[cfg(all(target_arch = "arm", any(target_os = "linux", target_os = "android")))]
+fn arm_linux() {
+ let _ = is_arm_feature_detected!("neon");
+ let _ = is_arm_feature_detected!("neon",);
+}
+
+#[test]
+#[cfg(all(
+ target_arch = "aarch64",
+ any(target_os = "linux", target_os = "android")
+))]
+fn aarch64_linux() {
+ let _ = is_aarch64_feature_detected!("fp");
+ let _ = is_aarch64_feature_detected!("fp",);
+}
+
+#[test]
+#[cfg(all(target_arch = "powerpc", target_os = "linux"))]
+fn powerpc_linux() {
+ let _ = is_powerpc_feature_detected!("altivec");
+ let _ = is_powerpc_feature_detected!("altivec",);
+}
+
+#[test]
+#[cfg(all(target_arch = "powerpc64", target_os = "linux"))]
+fn powerpc64_linux() {
+ let _ = is_powerpc64_feature_detected!("altivec");
+ let _ = is_powerpc64_feature_detected!("altivec",);
+}
+
+#[test]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn x86_all() {
+ let _ = is_x86_feature_detected!("sse");
+ let _ = is_x86_feature_detected!("sse",);
+}
diff --git a/library/stdarch/crates/std_detect/tests/x86-specific.rs b/library/stdarch/crates/std_detect/tests/x86-specific.rs
new file mode 100644
index 000000000..59e9a62fd
--- /dev/null
+++ b/library/stdarch/crates/std_detect/tests/x86-specific.rs
@@ -0,0 +1,158 @@
+#![feature(stdsimd)]
+#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+
+extern crate cupid;
+#[macro_use]
+extern crate std_detect;
+
+#[test]
+fn dump() {
+ println!("aes: {:?}", is_x86_feature_detected!("aes"));
+ println!("pclmulqdq: {:?}", is_x86_feature_detected!("pclmulqdq"));
+ println!("rdrand: {:?}", is_x86_feature_detected!("rdrand"));
+ println!("rdseed: {:?}", is_x86_feature_detected!("rdseed"));
+ println!("tsc: {:?}", is_x86_feature_detected!("tsc"));
+ println!("sse: {:?}", is_x86_feature_detected!("sse"));
+ println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
+ println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
+ println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
+ println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
+ println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
+ println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
+ println!("sha: {:?}", is_x86_feature_detected!("sha"));
+ println!("avx: {:?}", is_x86_feature_detected!("avx"));
+ println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
+ println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
+ println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
+ println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
+ println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
+ println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
+ println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
+ println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
+ println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
+ println!("avx512vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
+ println!(
+ "avx512_vpopcntdq {:?}",
+ is_x86_feature_detected!("avx512vpopcntdq")
+ );
+ println!("avx512vbmi2 {:?}", is_x86_feature_detected!("avx512vbmi2"));
+ println!("avx512gfni {:?}", is_x86_feature_detected!("avx512gfni"));
+ println!("avx512vaes {:?}", is_x86_feature_detected!("avx512vaes"));
+ println!(
+ "avx512vpclmulqdq {:?}",
+ is_x86_feature_detected!("avx512vpclmulqdq")
+ );
+ println!("avx512vnni {:?}", is_x86_feature_detected!("avx512vnni"));
+ println!(
+ "avx512bitalg {:?}",
+ is_x86_feature_detected!("avx512bitalg")
+ );
+ println!("avx512bf16 {:?}", is_x86_feature_detected!("avx512bf16"));
+ println!(
+ "avx512vp2intersect {:?}",
+ is_x86_feature_detected!("avx512vp2intersect")
+ );
+ println!("fma: {:?}", is_x86_feature_detected!("fma"));
+ println!("abm: {:?}", is_x86_feature_detected!("abm"));
+ println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
+ println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
+ println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
+ println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
+ println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
+ println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
+ println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
+ println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
+ println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
+ println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
+ println!("cmpxchg16b: {:?}", is_x86_feature_detected!("cmpxchg16b"));
+ println!("adx: {:?}", is_x86_feature_detected!("adx"));
+ println!("rtm: {:?}", is_x86_feature_detected!("rtm"));
+}
+
+#[cfg(feature = "std_detect_env_override")]
+#[test]
+fn env_override_no_avx() {
+ if let Ok(disable) = std::env::var("RUST_STD_DETECT_UNSTABLE") {
+ let information = cupid::master().unwrap();
+ for d in disable.split(" ") {
+ match d {
+ "avx" => {
+ if information.avx() {
+ assert_ne!(is_x86_feature_detected!("avx"), information.avx())
+ }
+ }
+ "avx2" => {
+ if information.avx2() {
+ assert_ne!(is_x86_feature_detected!("avx2"), information.avx2())
+ }
+ }
+ _ => {}
+ }
+ }
+ }
+}
+
+#[test]
+fn compare_with_cupid() {
+ let information = cupid::master().unwrap();
+ assert_eq!(is_x86_feature_detected!("aes"), information.aesni());
+ assert_eq!(
+ is_x86_feature_detected!("pclmulqdq"),
+ information.pclmulqdq()
+ );
+ assert_eq!(is_x86_feature_detected!("rdrand"), information.rdrand());
+ assert_eq!(is_x86_feature_detected!("rdseed"), information.rdseed());
+ assert_eq!(is_x86_feature_detected!("tsc"), information.tsc());
+ assert_eq!(is_x86_feature_detected!("sse"), information.sse());
+ assert_eq!(is_x86_feature_detected!("sse2"), information.sse2());
+ assert_eq!(is_x86_feature_detected!("sse3"), information.sse3());
+ assert_eq!(is_x86_feature_detected!("ssse3"), information.ssse3());
+ assert_eq!(is_x86_feature_detected!("sse4.1"), information.sse4_1());
+ assert_eq!(is_x86_feature_detected!("sse4.2"), information.sse4_2());
+ assert_eq!(is_x86_feature_detected!("sse4a"), information.sse4a());
+ assert_eq!(is_x86_feature_detected!("sha"), information.sha());
+ assert_eq!(is_x86_feature_detected!("avx"), information.avx());
+ assert_eq!(is_x86_feature_detected!("avx2"), information.avx2());
+ assert_eq!(is_x86_feature_detected!("avx512f"), information.avx512f());
+ assert_eq!(is_x86_feature_detected!("avx512cd"), information.avx512cd());
+ assert_eq!(is_x86_feature_detected!("avx512er"), information.avx512er());
+ assert_eq!(is_x86_feature_detected!("avx512pf"), information.avx512pf());
+ assert_eq!(is_x86_feature_detected!("avx512bw"), information.avx512bw());
+ assert_eq!(is_x86_feature_detected!("avx512dq"), information.avx512dq());
+ assert_eq!(is_x86_feature_detected!("avx512vl"), information.avx512vl());
+ assert_eq!(
+ is_x86_feature_detected!("avx512ifma"),
+ information.avx512_ifma()
+ );
+ assert_eq!(
+ is_x86_feature_detected!("avx512vbmi"),
+ information.avx512_vbmi()
+ );
+ assert_eq!(
+ is_x86_feature_detected!("avx512vpopcntdq"),
+ information.avx512_vpopcntdq()
+ );
+ assert_eq!(is_x86_feature_detected!("fma"), information.fma());
+ assert_eq!(is_x86_feature_detected!("bmi1"), information.bmi1());
+ assert_eq!(is_x86_feature_detected!("bmi2"), information.bmi2());
+ assert_eq!(is_x86_feature_detected!("popcnt"), information.popcnt());
+ assert_eq!(is_x86_feature_detected!("abm"), information.lzcnt());
+ assert_eq!(is_x86_feature_detected!("tbm"), information.tbm());
+ assert_eq!(is_x86_feature_detected!("lzcnt"), information.lzcnt());
+ assert_eq!(is_x86_feature_detected!("xsave"), information.xsave());
+ assert_eq!(is_x86_feature_detected!("xsaveopt"), information.xsaveopt());
+ assert_eq!(
+ is_x86_feature_detected!("xsavec"),
+ information.xsavec_and_xrstor()
+ );
+ assert_eq!(
+ is_x86_feature_detected!("xsaves"),
+ information.xsaves_xrstors_and_ia32_xss()
+ );
+ assert_eq!(
+ is_x86_feature_detected!("cmpxchg16b"),
+ information.cmpxchg16b(),
+ );
+ assert_eq!(is_x86_feature_detected!("adx"), information.adx(),);
+ assert_eq!(is_x86_feature_detected!("rtm"), information.rtm(),);
+}
diff --git a/library/stdarch/crates/stdarch-gen/Cargo.toml b/library/stdarch/crates/stdarch-gen/Cargo.toml
new file mode 100644
index 000000000..b339672f4
--- /dev/null
+++ b/library/stdarch/crates/stdarch-gen/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "stdarch-gen"
+version = "0.1.0"
+authors = ["Heinz Gies <heinz@licenser.net>"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/library/stdarch/crates/stdarch-gen/README.md b/library/stdarch/crates/stdarch-gen/README.md
new file mode 100644
index 000000000..54b602cdd
--- /dev/null
+++ b/library/stdarch/crates/stdarch-gen/README.md
@@ -0,0 +1,11 @@
+# Neon intrinsic code generator
+
+A small tool for quickly generating intrinsics for the NEON architecture.
+
+The specification for the intrinsics can be found in `neon.spec`.
+
+To re-generate the code, run the following from the root of the `stdarch` crate:
+
+```
+OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen -- crates/stdarch-gen/neon.spec
+``` \ No newline at end of file
diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec
new file mode 100644
index 000000000..68a50fbe9
--- /dev/null
+++ b/library/stdarch/crates/stdarch-gen/neon.spec
@@ -0,0 +1,7560 @@
+// ARM Neon intrinsic specification.
+//
+// This file contains the specification for a number of
+// intrinsics that allows us to generate them along with
+// their test cases.
+//
+// A note on the syntax of this file - it is not parsed very intelligently!
+//
+// # Comments
+// Comments start with AT LEAST two, or four or more, slashes, so `//` is a
+// comment and `///////` is too. (Exactly three slashes introduce a section.)
+//
+// # Sections
+// Sections start with EXACTLY three slashes followed
+// by AT LEAST one space. Sections are used for two things:
+//
+// 1) they serve as the doc comment for the given intrinsics.
+// 2) they reset all variables (name, fn, etc.)
+//
+// # Variables
+//
+// name - The prefix of the function, suffixes are auto
+// generated by the type they get passed.
+//
+// fn - The function to call in rust-land.
+//
+// aarch64 - The intrinsic to check on aarch64 architecture.
+// If this is given but no arm intrinsic is provided,
+// the function will exclusively be generated for
+// aarch64.
+//            This is used to generate both aarch64-specific and
+//            shared intrinsics by first specifying only the aarch64
+//            variant and then the arm variant.
+//
+// arm - The ARMv7 intrinsic used to check arm code
+//       generation. All neon functions available in arm are
+//       also available in aarch64. If no aarch64 intrinsic was
+//       set, they are assumed to be the same.
+//       Intrinsics ending with a `.` will have a size suffix
+//       added (such as `i8` or `i64`) that is not sign-specific.
+//       Intrinsics ending with a `.s` will have a size suffix
+//       added (such as `s8` or `u64`) that is sign-specific.
+//
+// a - First input for tests, it gets scaled to the size of
+// the type.
+//
+// b - Second input for tests, it gets scaled to the size of
+// the type.
+//
+// # Special values
+//
+// TRUE - 'true' all bits are set to 1
+// FALSE - 'false' all bits are set to 0
+// FF - same as 'true'
+// MIN - minimal value (either 0 or the lowest negative number)
+// MAX - maximal value (prone to overflow)
+//
+// # validate <values>
+// Validates a and b against the expected result of the test.
+// The special values 'TRUE' and 'FALSE' can be used to
+// represent the correct NEON representation of true or
+// false values. It too gets scaled to the type.
+//
+// Validate needs to be called before generate as it sets
+// up the rules for validation that get generated for each
+// type.
+//
+// # generate <types>
+// The generate command generates the intrinsics; it uses the
+// variables that have been set and can be called multiple times,
+// overwriting some of the variables in between.
+
+/// Vector bitwise and
+name = vand
+fn = simd_and
+arm = vand
+aarch64 = and
+a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
+b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
+validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
+b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+generate int*_t, uint*_t, int64x*_t, uint64x*_t
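Editor's note: the `generate` line above expands the `vand` entry into concrete intrinsics such as `vand_s8`, with doc comment, test vectors, and `assert_instr` checks derived from the fields above. A hedged usage sketch of the end result (aarch64 only, assuming a toolchain where the NEON intrinsics and `is_aarch64_feature_detected!` are stable):

```rust
// Using one generated intrinsic from the `vand` entry above.
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::*;
    if std::arch::is_aarch64_feature_detected!("neon") {
        unsafe {
            let a = vdup_n_s8(0x0F); // all lanes = 0x0F
            let b = vdup_n_s8(0x55); // all lanes = 0x55
            let r = vand_s8(a, b); // lane-wise bitwise AND
            assert_eq!(vget_lane_s8::<0>(r), 0x05);
        }
    }
}

#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```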
+
+/// Vector bitwise or (immediate, inclusive)
+name = vorr
+fn = simd_or
+arm = vorr
+aarch64 = orr
+a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
+b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
+generate int*_t, uint*_t, int64x*_t, uint64x*_t
+
+
+/// Vector bitwise exclusive or (vector)
+name = veor
+fn = simd_xor
+arm = veor
+aarch64 = eor
+a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
+b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
+generate int*_t, uint*_t, int64x*_t, uint64x*_t
+
+/// Three-way exclusive OR
+name = veor3
+a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
+b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+c = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
+target = sha3
+
+aarch64 = eor3
+link-aarch64 = llvm.aarch64.crypto.eor3s._EXT_
+generate int8x16_t, int16x8_t, int32x4_t, int64x2_t
+link-aarch64 = llvm.aarch64.crypto.eor3u._EXT_
+generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t
+
+////////////////////
+// Absolute difference between the arguments
+////////////////////
+
+/// Absolute difference between the arguments
+name = vabd
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+validate 15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15
+
+arm = vabd.s
+aarch64 = sabd
+link-arm = vabds._EXT_
+link-aarch64 = sabd._EXT_
+generate int*_t
+
+arm = vabd.s
+aarch64 = uabd
+link-arm = vabdu._EXT_
+link-aarch64 = uabd._EXT_
+generate uint*_t
+
+/// Absolute difference between the arguments (floating-point)
+name = vabd
+a = 1.0, 2.0, 5.0, -4.0
+b = 9.0, 3.0, 2.0, 8.0
+validate 8.0, 1.0, 3.0, 12.0
+
+aarch64 = fabd
+link-aarch64 = fabd._EXT_
+generate float64x*_t
+
+arm = vabd.s
+aarch64 = fabd
+link-arm = vabds._EXT_
+link-aarch64 = fabd._EXT_
+generate float*_t
+
+/// Floating-point absolute difference
+name = vabd
+multi_fn = simd_extract, {vabd-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 1.0
+b = 9.0
+validate 8.0
+
+aarch64 = fabd
+generate f32, f64
+
+////////////////////
+// Absolute difference Long
+////////////////////
+
+/// Unsigned Absolute difference Long
+name = vabdl
+multi_fn = simd_cast, {vabd-unsigned-noext, a, b}
+a = 1, 2, 3, 4, 4, 3, 2, 1
+b = 10, 10, 10, 10, 10, 10, 10, 10
+validate 9, 8, 7, 6, 6, 7, 8, 9
+
+arm = vabdl.s
+aarch64 = uabdl
+generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
+
+/// Signed Absolute difference Long
+name = vabdl
+multi_fn = simd_cast, c:uint8x8_t, {vabd-signed-noext, a, b}
+multi_fn = simd_cast, c
+a = 1, 2, 3, 4, 4, 3, 2, 1
+b = 10, 10, 10, 10, 10, 10, 10, 10
+validate 9, 8, 7, 6, 6, 7, 8, 9
+
+arm = vabdl.s
+aarch64 = sabdl
+generate int8x8_t:int8x8_t:int16x8_t
+
+/// Signed Absolute difference Long
+name = vabdl
+multi_fn = simd_cast, c:uint16x4_t, {vabd-signed-noext, a, b}
+multi_fn = simd_cast, c
+a = 1, 2, 11, 12
+b = 10, 10, 10, 10
+validate 9, 8, 1, 2
+
+arm = vabdl.s
+aarch64 = sabdl
+generate int16x4_t:int16x4_t:int32x4_t
+
+/// Signed Absolute difference Long
+name = vabdl
+multi_fn = simd_cast, c:uint32x2_t, {vabd-signed-noext, a, b}
+multi_fn = simd_cast, c
+a = 1, 11
+b = 10, 10
+validate 9, 1
+
+arm = vabdl.s
+aarch64 = sabdl
+generate int32x2_t:int32x2_t:int64x2_t
+
+/// Unsigned Absolute difference Long
+name = vabdl_high
+no-q
+multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_cast, {vabd_u8, c, d}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
+validate 1, 0, 1, 2, 3, 4, 5, 6
+
+aarch64 = uabdl
+generate uint8x16_t:uint8x16_t:uint16x8_t
+
+/// Unsigned Absolute difference Long
+name = vabdl_high
+no-q
+multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_cast, {vabd_u16, c, d}
+a = 1, 2, 3, 4, 8, 9, 11, 12
+b = 10, 10, 10, 10, 10, 10, 10, 10
+validate 2, 1, 1, 2
+
+aarch64 = uabdl
+generate uint16x8_t:uint16x8_t:uint32x4_t
+
+/// Unsigned Absolute difference Long
+name = vabdl_high
+no-q
+multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_cast, {vabd_u32, c, d}
+a = 1, 2, 3, 4
+b = 10, 10, 10, 10
+validate 7, 6
+
+aarch64 = uabdl
+generate uint32x4_t:uint32x4_t:uint64x2_t
+
+/// Signed Absolute difference Long
+name = vabdl_high
+no-q
+multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d}
+multi_fn = simd_cast, e
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
+validate 1, 0, 1, 2, 3, 4, 5, 6
+
+aarch64 = sabdl
+generate int8x16_t:int8x16_t:int16x8_t
+
+/// Signed Absolute difference Long
+name = vabdl_high
+no-q
+multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d}
+multi_fn = simd_cast, e
+a = 1, 2, 3, 4, 9, 10, 11, 12
+b = 10, 10, 10, 10, 10, 10, 10, 10
+validate 1, 0, 1, 2
+
+aarch64 = sabdl
+generate int16x8_t:int16x8_t:int32x4_t
+
+/// Signed Absolute difference Long
+name = vabdl_high
+no-q
+multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
+multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d}
+multi_fn = simd_cast, e
+a = 1, 2, 3, 4
+b = 10, 10, 10, 10
+validate 7, 6
+
+aarch64 = sabdl
+generate int32x4_t:int32x4_t:int64x2_t
+
+////////////////////
+// equality
+////////////////////
+
+/// Compare bitwise Equal (vector)
+name = vceq
+fn = simd_eq
+a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
+b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX
+b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN
+validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE
+
+aarch64 = cmeq
+generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t
+
+arm = vceq.
+generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t
+
+/// Floating-point compare equal
+name = vceq
+fn = simd_eq
+a = 1.2, 3.4, 5.6, 7.8
+b = 1.2, 3.4, 5.6, 7.8
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = fcmeq
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+arm = vceq.
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Compare bitwise equal
+name = vceq
+multi_fn = transmute, {vceq-in_ntt-noext, {transmute, a}, {transmute, b}}
+a = 1
+b = 2
+validate 0
+
+aarch64 = cmp
+generate i64:u64, u64
+
+/// Floating-point compare equal
+name = vceq
+multi_fn = simd_extract, {vceq-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 1.
+b = 2.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+/// Signed compare bitwise equal to zero
+name = vceqz
+fn = simd_eq
+a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
+fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
+
+aarch64 = cmeq
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t
+
+/// Unsigned compare bitwise equal to zero
+name = vceqz
+fn = simd_eq
+a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
+fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
+
+aarch64 = cmeq
+generate uint*_t, uint64x*_t
+
+/// Floating-point compare bitwise equal to zero
+name = vceqz
+fn = simd_eq
+a = 0.0, 1.2, 3.4, 5.6
+fixed = 0.0, 0.0, 0.0, 0.0
+validate TRUE, FALSE, FALSE, FALSE
+
+aarch64 = fcmeq
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+/// Compare bitwise equal to zero
+name = vceqz
+multi_fn = transmute, {vceqz-in_ntt-noext, {transmute, a}}
+a = 1
+validate 0
+
+aarch64 = cmp
+generate i64:u64, u64
+
+/// Floating-point compare bitwise equal to zero
+name = vceqz
+multi_fn = simd_extract, {vceqz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+a = 1.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+/// Signed compare bitwise Test bits nonzero
+name = vtst
+multi_fn = simd_and, c:in_t, a, b
+multi_fn = fixed, d:in_t
+multi_fn = simd_ne, c, transmute(d)
+a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
+b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
+fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmtst
+generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t
+
+arm = vtst
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly16x4_t:uint16x4_t, poly16x8_t:uint16x8_t
+
+/// Unsigned compare bitwise Test bits nonzero
+name = vtst
+multi_fn = simd_and, c:in_t, a, b
+multi_fn = fixed, d:in_t
+multi_fn = simd_ne, c, transmute(d)
+a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
+b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
+fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmtst
+generate uint64x*_t
+
+arm = vtst
+generate uint*_t
+
+/// Compare bitwise test bits nonzero
+name = vtst
+multi_fn = transmute, {vtst-in_ntt-noext, {transmute, a}, {transmute, b}}
+a = 0
+b = 0
+validate 0
+
+aarch64 = tst
+generate i64:i64:u64, u64
+
+/// Signed saturating accumulate of unsigned value
+name = vuqadd
+out-suffix
+a = 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
+b = 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
+validate 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8
+
+aarch64 = suqadd
+link-aarch64 = suqadd._EXT_
+generate i32:u32:i32, i64:u64:i64
+
+/// Signed saturating accumulate of unsigned value
+name = vuqadd
+out-suffix
+multi_fn = simd_extract, {vuqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 1
+b = 2
+validate 3
+
+aarch64 = suqadd
+generate i8:u8:i8, i16:u16:i16
+
+////////////////////
+// Floating-point absolute value
+////////////////////
+
+/// Floating-point absolute value
+name = vabs
+fn = simd_fabs
+a = -0.1, -2.2, -3.3, -6.6
+validate 0.1, 2.2, 3.3, 6.6
+aarch64 = fabs
+generate float64x1_t:float64x1_t, float64x2_t:float64x2_t
+
+arm = vabs
+generate float32x2_t:float32x2_t, float32x4_t:float32x4_t
+
+////////////////////
+// greater than
+////////////////////
+
+/// Compare signed greater than
+name = vcgt
+fn = simd_gt
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+aarch64 = cmgt
+generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
+
+arm = vcgt.s
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t
+
+/// Compare unsigned higher
+name = vcgt
+fn = simd_gt
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmhi
+generate uint64x*_t
+
+arm = vcgt.s
+generate uint*_t
+
+/// Floating-point compare greater than
+name = vcgt
+fn = simd_gt
+a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
+b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = fcmgt
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+arm = vcgt.s
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Compare greater than
+name = vcgt
+multi_fn = transmute, {vcgt-in_ntt-noext, {transmute, a}, {transmute, b}}
+a = 1
+b = 2
+validate 0
+
+aarch64 = cmp
+generate i64:u64, u64
+
+/// Floating-point compare greater than
+name = vcgt
+multi_fn = simd_extract, {vcgt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 1.
+b = 2.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+////////////////////
+// Less than
+////////////////////
+
+/// Compare signed less than
+name = vclt
+fn = simd_lt
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+aarch64 = cmgt
+generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
+
+arm = vcgt.s
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t
+
+/// Compare unsigned less than
+name = vclt
+fn = simd_lt
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmhi
+generate uint64x*_t
+
+arm = vcgt.s
+generate uint*_t
+
+/// Floating-point compare less than
+name = vclt
+fn = simd_lt
+a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
+b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = fcmgt
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+arm = vcgt.s
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Compare less than
+name = vclt
+multi_fn = transmute, {vclt-in_ntt-noext, {transmute, a}, {transmute, b}}
+a = 2
+b = 1
+validate 0
+
+aarch64 = cmp
+generate i64:u64, u64
+
+/// Floating-point compare less than
+name = vclt
+multi_fn = simd_extract, {vclt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 2.
+b = 1.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+////////////////////
+// Less than or equal
+////////////////////
+
+/// Compare signed less than or equal
+name = vcle
+fn = simd_le
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmge
+generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
+
+arm = vcge.s
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t
+
+/// Compare greater than or equal
+name = vcge
+multi_fn = transmute, {vcge-in_ntt-noext, {transmute, a}, {transmute, b}}
+a = 1
+b = 2
+validate 0
+
+aarch64 = cmp
+generate i64:u64, u64
+
+/// Floating-point compare greater than or equal
+name = vcge
+multi_fn = simd_extract, {vcge-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 1.
+b = 2.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+/// Compare unsigned less than or equal
+name = vcle
+fn = simd_le
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmhs
+generate uint64x*_t
+
+arm = vcge.s
+generate uint*_t
+
+/// Floating-point compare less than or equal
+name = vcle
+fn = simd_le
+a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
+b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+aarch64 = fcmge
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+arm = vcge.s
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Compare less than or equal
+name = vcle
+multi_fn = transmute, {vcle-in_ntt-noext, {transmute, a}, {transmute, b}}
+a = 2
+b = 1
+validate 0
+
+aarch64 = cmp
+generate i64:u64, u64
+
+/// Floating-point compare less than or equal
+name = vcle
+multi_fn = simd_extract, {vcle-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 2.
+b = 1.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+////////////////////
+// Greater than or equal
+////////////////////
+
+/// Compare signed greater than or equal
+name = vcge
+fn = simd_ge
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmge
+generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
+
+arm = vcge.s
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t
+
+/// Compare unsigned greater than or equal
+name = vcge
+fn = simd_ge
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmhs
+generate uint64x*_t
+
+arm = vcge.s
+generate uint*_t
+
+/// Floating-point compare greater than or equal
+name = vcge
+fn = simd_ge
+a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
+b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
+validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = fcmge
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+arm = vcge.s
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Compare signed greater than or equal to zero
+name = vcgez
+fn = simd_ge
+a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
+fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmge
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
+
+/// Floating-point compare greater than or equal to zero
+name = vcgez
+fn = simd_ge
+a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
+fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+validate FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = fcmge
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+/// Compare signed greater than or equal to zero
+name = vcgez
+multi_fn = transmute, {vcgez-in_ntt-noext, {transmute, a}}
+a = -1
+validate 0
+
+aarch64 = eor
+generate i64:u64
+
+/// Floating-point compare greater than or equal to zero
+name = vcgez
+multi_fn = simd_extract, {vcgez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+a = -1.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+/// Compare signed greater than zero
+name = vcgtz
+fn = simd_gt
+a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
+fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = cmgt
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
+
+/// Floating-point compare greater than zero
+name = vcgtz
+fn = simd_gt
+a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
+fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
+
+aarch64 = fcmgt
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+/// Compare signed greater than zero
+name = vcgtz
+multi_fn = transmute, {vcgtz-in_ntt-noext, {transmute, a}}
+a = -1
+validate 0
+
+aarch64 = cmp
+generate i64:u64
+
+/// Floating-point compare greater than zero
+name = vcgtz
+multi_fn = simd_extract, {vcgtz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+a = -1.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+/// Compare signed less than or equal to zero
+name = vclez
+fn = simd_le
+a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
+fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
+
+aarch64 = cmgt
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
+
+/// Floating-point compare less than or equal to zero
+name = vclez
+fn = simd_le
+a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
+fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
+
+aarch64 = fcmle
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+/// Compare less than or equal to zero
+name = vclez
+multi_fn = transmute, {vclez-in_ntt-noext, {transmute, a}}
+a = 2
+validate 0
+
+aarch64 = cmp
+generate i64:u64
+
+/// Floating-point compare less than or equal to zero
+name = vclez
+multi_fn = simd_extract, {vclez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+a = 2.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+/// Compare signed less than zero
+name = vcltz
+fn = simd_lt
+a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
+fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
+
+aarch64 = cmlt
+generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
+
+/// Floating-point compare less than zero
+name = vcltz
+fn = simd_lt
+a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
+fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+validate TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
+
+aarch64 = fcmlt
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+/// Compare less than zero
+name = vcltz
+multi_fn = transmute, {vcltz-in_ntt-noext, {transmute, a}}
+a = 2
+validate 0
+
+aarch64 = asr
+generate i64:u64
+
+/// Floating-point compare less than zero
+name = vcltz
+multi_fn = simd_extract, {vcltz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+a = 2.
+validate 0
+
+aarch64 = fcmp
+generate f32:u32, f64:u64
+
+/// Count leading sign bits
+name = vcls
+a = MIN, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
+validate 0, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0
+
+arm = vcls.s
+aarch64 = cls
+link-arm = vcls._EXT_
+link-aarch64 = cls._EXT_
+generate int*_t
+
+/// Count leading sign bits
+name = vcls
+multi_fn = transmute, {vcls-signed-noext, {transmute, a}}
+a = MIN, MAX, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
+validate BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1
+
+arm = vcls
+aarch64 = cls
+generate uint8x8_t:int8x8_t, uint8x16_t:int8x16_t, uint16x4_t:int16x4_t, uint16x8_t:int16x8_t, uint32x2_t:int32x2_t, uint32x4_t:int32x4_t
+
+/// Count leading zero bits
+name = vclz
+multi_fn = self-signed-ext, a
+a = MIN, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
+validate 0, 0, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 1
+
+arm = vclz.
+aarch64 = clz
+generate int*_t
+
+/// Count leading zero bits
+name = vclz
+multi_fn = transmute, {self-signed-ext, transmute(a)}
+a = MIN, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
+validate BITS, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0
+
+arm = vclz.
+aarch64 = clz
+generate uint*_t
+
+/// Floating-point absolute compare greater than
+name = vcagt
+a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
+b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
+validate !0, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE
+
+aarch64 = facgt
+link-aarch64 = facgt._EXT2_._EXT_
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64
+
+arm = vacgt.s
+link-arm = vacgt._EXT2_._EXT_
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Floating-point absolute compare greater than or equal
+name = vcage
+a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
+b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
+validate !0, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE
+
+aarch64 = facge
+link-aarch64 = facge._EXT2_._EXT_
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64
+
+arm = vacge.s
+link-arm = vacge._EXT2_._EXT_
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Floating-point absolute compare less than
+name = vcalt
+multi_fn = vcagt-self-noext, b, a
+a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
+b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
+validate 0, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE
+
+aarch64 = facgt
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64
+
+arm = vacgt.s
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
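+
+// The multi_fn above just swaps the operands, so e.g. (illustrative):
+//     pub unsafe fn vcalt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t {
+//         vcagt_f32(b, a)    // |a| < |b|  <=>  |b| > |a|
+//     }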
+
+/// Floating-point absolute compare less than or equal
+name = vcale
+multi_fn = vcage-self-noext , b, a
+a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
+b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
+validate 0, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE
+
+aarch64 = facge
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64
+
+arm = vacge.s
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Insert vector element from another vector element
+name = vcopy
+lane-suffixes
+constn = LANE1:LANE2
+multi_fn = static_assert_imm-in0_exp_len-LANE1
+multi_fn = static_assert_imm-in_exp_len-LANE2
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 0:1
+validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = mov
+generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x2_t, int32x4_t, int64x2_t
+generate uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x2_t, uint32x4_t, uint64x2_t
+generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t
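+
+// Illustration of the matchn/ins expansion above: in the concatenated pair
+// [a, b] shuffle indices 0..=7 address `a` and 8..=15 address `b`, so e.g.
+// vcopy_lane_s8::<0, 1>(a, b) lowers to the match arm
+//     simd_shuffle8!(a, b, [9, 1, 2, 3, 4, 5, 6, 7])
+// i.e. lane 0 of the result is replaced by lane 1 of `b`.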
+
+/// Insert vector element from another vector element
+name = vcopy
+lane-suffixes
+constn = LANE1:LANE2
+multi_fn = static_assert_imm-in0_exp_len-LANE1
+multi_fn = static_assert_imm-in_exp_len-LANE2
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+a = 1., 2., 3., 4.
+b = 0., 0.5, 0., 0.
+n = 0:1
+validate 0.5, 2., 3., 4.
+
+aarch64 = mov
+generate float32x2_t, float32x4_t, float64x2_t
+
+/// Insert vector element from another vector element
+name = vcopy
+lane-suffixes
+constn = LANE1:LANE2
+multi_fn = static_assert_imm-in0_exp_len-LANE1
+multi_fn = static_assert_imm-in_exp_len-LANE2
+multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 0:1
+validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = mov
+generate int8x8_t:int8x16_t:int8x8_t, int16x4_t:int16x8_t:int16x4_t, int32x2_t:int32x4_t:int32x2_t
+generate uint8x8_t:uint8x16_t:uint8x8_t, uint16x4_t:uint16x8_t:uint16x4_t, uint32x2_t:uint32x4_t:uint32x2_t
+generate poly8x8_t:poly8x16_t:poly8x8_t, poly16x4_t:poly16x8_t:poly16x4_t
+
+/// Insert vector element from another vector element
+name = vcopy
+lane-suffixes
+constn = LANE1:LANE2
+multi_fn = static_assert_imm-in0_exp_len-LANE1
+multi_fn = static_assert_imm-in_exp_len-LANE2
+multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
+a = 1., 2., 3., 4.
+b = 0., 0.5, 0., 0.
+n = 0:1
+validate 0.5, 2., 3., 4.
+
+aarch64 = mov
+generate float32x2_t:float32x4_t:float32x2_t
+
+/// Insert vector element from another vector element
+name = vcopy
+lane-suffixes
+constn = LANE1:LANE2
+multi_fn = static_assert_imm-in0_exp_len-LANE1
+multi_fn = static_assert_imm-in_exp_len-LANE2
+multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 0:1
+validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = mov
+generate int8x16_t:int8x8_t:int8x16_t, int16x8_t:int16x4_t:int16x8_t, int32x4_t:int32x2_t:int32x4_t
+generate uint8x16_t:uint8x8_t:uint8x16_t, uint16x8_t:uint16x4_t:uint16x8_t, uint32x4_t:uint32x2_t:uint32x4_t
+generate poly8x16_t:poly8x8_t:poly8x16_t, poly16x8_t:poly16x4_t:poly16x8_t
+
+/// Insert vector element from another vector element
+name = vcopy
+lane-suffixes
+constn = LANE1:LANE2
+multi_fn = static_assert_imm-in0_exp_len-LANE1
+multi_fn = static_assert_imm-in_exp_len-LANE2
+multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1:0
+validate 1, MAX
+
+aarch64 = mov
+generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t
+
+/// Insert vector element from another vector element
+name = vcopy
+lane-suffixes
+constn = LANE1:LANE2
+multi_fn = static_assert_imm-in0_exp_len-LANE1
+multi_fn = static_assert_imm-in_exp_len-LANE2
+multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
+a = 1., 2., 3., 4.
+b = 0.5, 0., 0., 0.
+n = 1:0
+validate 1., 0.5, 3., 4.
+
+aarch64 = mov
+generate float32x4_t:float32x2_t:float32x4_t
+generate float64x2_t:float64x1_t:float64x2_t
+
+/// Create a vector from a 64-bit pattern
+name = vcreate
+out-suffix
+multi_fn = transmute, a
+a = 1
+validate 1, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = nop
+arm = nop
+generate u64:int8x8_t, u64:int16x4_t, u64:int32x2_t, u64:int64x1_t
+generate u64:uint8x8_t, u64:uint16x4_t, u64:uint32x2_t, u64:uint64x1_t
+generate u64:poly8x8_t, u64:poly16x4_t
+target = aes
+generate u64:poly64x1_t
+
+/// Create a vector from a 64-bit pattern
+name = vcreate
+out-suffix
+multi_fn = transmute, a
+a = 0
+validate 0., 0.
+
+aarch64 = nop
+generate u64:float64x1_t
+arm = nop
+generate u64:float32x2_t
+
+/// Fixed-point convert to floating-point
+name = vcvt
+double-suffixes
+fn = simd_cast
+a = 1, 2, 3, 4
+validate 1., 2., 3., 4.
+
+aarch64 = scvtf
+generate int64x1_t:float64x1_t, int64x2_t:float64x2_t
+aarch64 = ucvtf
+generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t
+
+arm = vcvt
+aarch64 = scvtf
+generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
+aarch64 = ucvtf
+generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t
+
+/// Floating-point convert to higher precision long
+name = vcvt
+double-suffixes
+fn = simd_cast
+a = -1.2, 1.2
+validate -1.2f32 as f64, 1.2f32 as f64
+
+aarch64 = fcvtl
+generate float32x2_t:float64x2_t
+
+/// Floating-point convert to higher precision long
+name = vcvt_high
+noq-double-suffixes
+multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3]
+multi_fn = simd_cast, b
+a = -1.2, 1.2, 2.3, 3.4
+validate 2.3f32 as f64, 3.4f32 as f64
+
+aarch64 = fcvtl
+generate float32x4_t:float64x2_t
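+
+// Sketch of the expansion above: take the upper two f32 lanes, then widen
+// (illustrative):
+//     pub unsafe fn vcvt_high_f64_f32(a: float32x4_t) -> float64x2_t {
+//         let b: float32x2_t = simd_shuffle2!(a, a, [2, 3]);
+//         simd_cast(b)
+//     }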
+
+/// Floating-point convert to lower precision narrow
+name = vcvt
+double-suffixes
+fn = simd_cast
+a = -1.2, 1.2
+validate -1.2f64 as f32, 1.2f64 as f32
+
+aarch64 = fcvtn
+generate float64x2_t:float32x2_t
+
+/// Floating-point convert to lower precision narrow
+name = vcvt_high
+noq-double-suffixes
+multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3]
+a = -1.2, 1.2
+b = -2.3, 3.4
+validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32
+
+aarch64 = fcvtn
+generate float32x2_t:float64x2_t:float32x4_t
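+
+// Sketch of the expansion above: narrow `b` to f32 and append it to `a`
+// (illustrative):
+//     pub unsafe fn vcvt_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t {
+//         simd_shuffle4!(a, simd_cast(b), [0, 1, 2, 3])
+//     }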
+
+/// Floating-point convert to lower precision narrow, rounding to odd
+name = vcvtx
+double-suffixes
+a = -1.0, 2.0
+validate -1.0, 2.0
+
+aarch64 = fcvtxn
+link-aarch64 = fcvtxn._EXT2_._EXT_
+generate float64x2_t:float32x2_t
+
+/// Floating-point convert to lower precision narrow, rounding to odd
+name = vcvtx
+double-suffixes
+multi_fn = simd_extract, {vcvtx-_f32_f64-noext, {vdupq_n-in_ntt-noext, a}}, 0
+a = -1.0
+validate -1.0
+
+aarch64 = fcvtxn
+generate f64:f32
+
+/// Floating-point convert to lower precision narrow, rounding to odd
+name = vcvtx_high
+noq-double-suffixes
+multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
+a = -1.0, 2.0
+b = -3.0, 4.0
+validate -1.0, 2.0, -3.0, 4.0
+
+aarch64 = fcvtxn
+generate float32x2_t:float64x2_t:float32x4_t
+
+/// Fixed-point convert to floating-point
+name = vcvt
+double-n-suffixes
+constn = N
+multi_fn = static_assert-N-1-bits
+a = 1, 2, 3, 4
+n = 2
+validate 0.25, 0.5, 0.75, 1.
+arm-aarch64-separate
+
+aarch64 = scvtf
+link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
+const-aarch64 = N
+generate int64x1_t:float64x1_t, int64x2_t:float64x2_t, i32:f32, i64:f64
+
+aarch64 = ucvtf
+link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
+const-aarch64 = N
+generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t, u32:f32, u64:f64
+
+aarch64 = scvtf
+link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
+arm = vcvt
+link-arm = vcvtfxs2fp._EXT2_._EXT_
+const-arm = N:i32
+generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
+
+aarch64 = ucvtf
+link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
+arm = vcvt
+link-arm = vcvtfxu2fp._EXT2_._EXT_
+const-arm = N:i32
+generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t
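+
+// Usage note: N is the number of fraction bits, so each lane is scaled by
+// 2^-N; with N = 2 the inputs 1, 2, 3, 4 become 0.25, 0.5, 0.75, 1.0 as in
+// the test vectors above.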
+
+/// Floating-point convert to fixed-point, rounding toward zero
+name = vcvt
+double-n-suffixes
+constn = N
+multi_fn = static_assert-N-1-bits
+a = 0.25, 0.5, 0.75, 1.
+n = 2
+validate 1, 2, 3, 4
+arm-aarch64-separate
+
+aarch64 = fcvtzs
+link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
+const-aarch64 = N
+generate float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64
+
+aarch64 = fcvtzu
+link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
+const-aarch64 = N
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64
+
+aarch64 = fcvtzs
+link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
+arm = vcvt
+link-arm = vcvtfp2fxs._EXT2_._EXT_
+const-arm = N:i32
+generate float32x2_t:int32x2_t, float32x4_t:int32x4_t
+
+aarch64 = fcvtzu
+link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
+arm = vcvt
+link-arm = vcvtfp2fxu._EXT2_._EXT_
+const-arm = N:i32
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Fixed-point convert to floating-point
+name = vcvt
+double-suffixes
+multi_fn = a as out_t
+a = 1
+validate 1.
+
+aarch64 = scvtf
+generate i32:f32, i64:f64
+aarch64 = ucvtf
+generate u32:f32, u64:f64
+
+/// Floating-point convert to fixed-point, rounding toward zero
+name = vcvt
+double-suffixes
+multi_fn = a as out_t
+a = 1.
+validate 1
+
+aarch64 = fcvtzs
+generate f32:i32, f64:i64
+aarch64 = fcvtzu
+generate f32:u32, f64:u64
+
+/// Floating-point convert to signed fixed-point, rounding toward zero
+name = vcvt
+double-suffixes
+link-aarch64 = llvm.fptosi.sat._EXT2_._EXT_
+a = -1.1, 2.1, -2.9, 3.9
+validate -1, 2, -2, 3
+
+aarch64 = fcvtzs
+generate float64x1_t:int64x1_t, float64x2_t:int64x2_t
+
+link-arm = llvm.fptosi.sat._EXT2_._EXT_
+arm = vcvt
+generate float32x2_t:int32x2_t, float32x4_t:int32x4_t
+
+/// Floating-point convert to unsigned fixed-point, rounding toward zero
+name = vcvt
+double-suffixes
+link-aarch64 = llvm.fptoui.sat._EXT2_._EXT_
+a = 1.1, 2.1, 2.9, 3.9
+validate 1, 2, 2, 3
+
+aarch64 = fcvtzu
+generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+link-arm = llvm.fptoui.sat._EXT2_._EXT_
+arm = vcvt
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to away
+name = vcvta
+double-suffixes
+a = -1.1, 2.1, -2.9, 3.9
+validate -1, 2, -3, 4
+
+aarch64 = fcvtas
+link-aarch64 = fcvtas._EXT2_._EXT_
+generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t
+
+/// Floating-point convert to integer, rounding to nearest with ties to away
+name = vcvta
+double-suffixes
+a = 2.9
+validate 3
+
+aarch64 = fcvtas
+link-aarch64 = fcvtas._EXT2_._EXT_
+generate f32:i32, f64:i64
+
+aarch64 = fcvtau
+link-aarch64 = fcvtau._EXT2_._EXT_
+generate f32:u32, f64:u64
+
+/// Floating-point convert to signed integer, rounding to nearest with ties to even
+name = vcvtn
+double-suffixes
+a = -1.5, 2.1, -2.9, 3.9
+validate -2, 2, -3, 4
+
+aarch64 = fcvtns
+link-aarch64 = fcvtns._EXT2_._EXT_
+generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64
+
+/// Floating-point convert to signed integer, rounding toward minus infinity
+name = vcvtm
+double-suffixes
+a = -1.1, 2.1, -2.9, 3.9
+validate -2, 2, -3, 3
+
+aarch64 = fcvtms
+link-aarch64 = fcvtms._EXT2_._EXT_
+generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64
+
+/// Floating-point convert to signed integer, rounding toward plus infinity
+name = vcvtp
+double-suffixes
+a = -1.1, 2.1, -2.9, 3.9
+validate -1, 3, -2, 4
+
+aarch64 = fcvtps
+link-aarch64 = fcvtps._EXT2_._EXT_
+generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
+name = vcvta
+double-suffixes
+a = 1.1, 2.1, 2.9, 3.9
+validate 1, 2, 3, 4
+
+aarch64 = fcvtau
+link-aarch64 = fcvtau._EXT2_._EXT_
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
+
+/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
+name = vcvtn
+double-suffixes
+a = 1.5, 2.1, 2.9, 3.9
+validate 2, 2, 3, 4
+
+aarch64 = fcvtnu
+link-aarch64 = fcvtnu._EXT2_._EXT_
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64
+
+/// Floating-point convert to unsigned integer, rounding toward minus infinity
+name = vcvtm
+double-suffixes
+a = 1.1, 2.1, 2.9, 3.9
+validate 1, 2, 2, 3
+
+aarch64 = fcvtmu
+link-aarch64 = fcvtmu._EXT2_._EXT_
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64
+
+/// Floating-point convert to unsigned integer, rounding toward plus infinity
+name = vcvtp
+double-suffixes
+a = 1.1, 2.1, 2.9, 3.9
+validate 2, 3, 3, 4
+
+aarch64 = fcvtpu
+link-aarch64 = fcvtpu._EXT2_._EXT_
+generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
+a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
+n = HFLEN
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
+aarch64 = dup
+generate poly64x2_t, poly64x1_t:poly64x2_t
+
+arm = vdup.l
+generate int*_t
+generate int8x16_t:int8x8_t, int16x8_t:int16x4_t, int32x4_t:int32x2_t
+generate int8x8_t:int8x16_t, int16x4_t:int16x8_t, int32x2_t:int32x4_t
+
+generate uint*_t
+generate uint8x16_t:uint8x8_t, uint16x8_t:uint16x4_t, uint32x4_t:uint32x2_t
+generate uint8x8_t:uint8x16_t, uint16x4_t:uint16x8_t, uint32x2_t:uint32x4_t
+
+generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
+generate poly8x16_t:poly8x8_t, poly16x8_t:poly16x4_t
+generate poly8x8_t:poly8x16_t, poly16x4_t:poly16x8_t
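+
+// Illustration: the dup shuffle repeats lane N across every output lane,
+// e.g. vdup_lane_s32::<1>(a) lowers to simd_shuffle2!(a, a, [1, 1]).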
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
+a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
+n = HFLEN
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
+aarch64 = dup
+arm = vmov
+generate int64x2_t, int64x1_t:int64x2_t, uint64x2_t, uint64x1_t:uint64x2_t
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
+a = 1., 1., 1., 4.
+n = HFLEN
+validate 1., 1., 1., 1.
+
+aarch64 = dup
+generate float64x2_t, float64x1_t:float64x2_t
+
+arm = vdup.l
+generate float*_t, float32x4_t:float32x2_t, float32x2_t:float32x4_t
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = a
+a = 0
+n = HFLEN
+validate 0
+
+aarch64 = nop
+generate poly64x1_t
+
+arm = nop
+generate int64x1_t, uint64x1_t
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = a
+a = 0.
+n = HFLEN
+validate 0.
+
+aarch64 = nop
+generate float64x1_t
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = transmute--<element_t _>, {simd_extract, a, N as u32}
+a = 0, 1
+n = HFLEN
+validate 1
+
+aarch64 = nop
+generate poly64x2_t:poly64x1_t
+
+arm = vmov
+generate int64x2_t:int64x1_t, uint64x2_t:uint64x1_t
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = transmute--<element_t _>, {simd_extract, a, N as u32}
+a = 0., 1.
+n = HFLEN
+validate 1.
+
+aarch64 = nop
+generate float64x2_t:float64x1_t
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_extract, a, N as u32
+a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
+n = HFLEN
+validate 1
+
+aarch64 = nop
+generate int8x8_t:i8, int8x16_t:i8, int16x4_t:i16, int16x8_t:i16, int32x2_t:i32, int32x4_t:i32, int64x1_t:i64, int64x2_t:i64
+generate uint8x8_t:u8, uint8x16_t:u8, uint16x4_t:u16, uint16x8_t:u16, uint32x2_t:u32, uint32x4_t:u32, uint64x1_t:u64, uint64x2_t:u64
+generate poly8x8_t:p8, poly8x16_t:p8, poly16x4_t:p16, poly16x8_t:p16
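+
+// Illustration: the scalar b/h/s/d forms reduce to a plain lane read,
+// e.g. vdupb_lane_s8::<2>(a) is simd_extract(a, 2).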
+
+/// Set all vector lanes to the same value
+name = vdup
+lane-suffixes
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_extract, a, N as u32
+a = 1., 1., 1., 4.
+n = HFLEN
+validate 1.
+
+aarch64 = nop
+generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64
+
+/// Extract vector from pair of vectors
+name = vext
+constn = N
+multi_fn = static_assert_imm-out_exp_len-N
+multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
+a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
+b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
+n = HFLEN
+validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19
+
+arm = "vext.8"
+aarch64 = ext
+generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
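+
+// Illustration: for N = 3 on int8x8_t the match arm is
+//     simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10])
+// i.e. eight consecutive lanes taken from the pair [a, b] starting at
+// lane N (indices 8..=15 address `b`).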
+
+/// Extract vector from pair of vectors
+name = vext
+constn = N
+multi_fn = static_assert_imm-out_exp_len-N
+multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
+a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
+b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
+n = HFLEN
+validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19
+
+aarch64 = ext
+generate poly64x2_t
+
+arm = vmov
+generate int64x2_t, uint64x2_t
+
+/// Extract vector from pair of vectors
+name = vext
+constn = N
+multi_fn = static_assert_imm-out_exp_len-N
+multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
+a = 0., 2., 2., 3.
+b = 3., 4., 5., 6.
+n = HFLEN
+validate 2., 3., 3., 4.
+
+aarch64 = ext
+generate float64x2_t
+
+arm = "vext.8"
+generate float*_t
+
+/// Multiply-add to accumulator
+name = vmla
+multi_fn = simd_add, a, {simd_mul, b, c}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+arm = vmla.
+aarch64 = mla
+generate int*_t, uint*_t
+
+/// Floating-point multiply-add to accumulator
+name = vmla
+multi_fn = simd_add, a, {simd_mul, b, c}
+a = 0., 1., 2., 3.
+b = 2., 2., 2., 2.
+c = 3., 3., 3., 3.
+validate 6., 7., 8., 9.
+
+aarch64 = fmul
+generate float64x*_t
+
+arm = vmla.
+generate float*_t
+
+/// Vector multiply accumulate with scalar
+name = vmla
+n-suffix
+multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+aarch64 = mla
+arm = vmla.
+generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
+generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t
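+
+// Illustration: the _n form just splats the scalar and defers to vmla,
+// e.g. vmla_n_s16(a, b, c) is vmla_s16(a, b, vdup_n_s16(c)).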
+
+/// Vector multiply accumulate with scalar
+name = vmla
+n-suffix
+multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
+a = 0., 1., 2., 3.
+b = 2., 2., 2., 2.
+c = 3.
+validate 6., 7., 8., 9.
+
+aarch64 = fmul
+arm = vmla.
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
+
+/// Vector multiply accumulate with scalar
+name = vmla
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+aarch64 = mla
+arm = vmla.
+generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
+generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
+generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
+generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
+
+/// Vector multiply accumulate with scalar
+name = vmla
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+a = 0., 1., 2., 3.
+b = 2., 2., 2., 2.
+c = 0., 3., 0., 0.
+n = 1
+validate 6., 7., 8., 9.
+
+aarch64 = fmul
+arm = vmla.
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
+/// Signed multiply-add long
+name = vmlal
+multi_fn = simd_add, a, {vmull-self-noext, b, c}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+arm = vmlal.s
+aarch64 = smlal
+generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
+
+/// Unsigned multiply-add long
+name = vmlal
+multi_fn = simd_add, a, {vmull-self-noext, b, c}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+arm = vmlal.s
+aarch64 = umlal
+generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
+
+/// Vector widening multiply accumulate with scalar
+name = vmlal
+n-suffix
+multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+arm = vmlal.s
+aarch64 = smlal
+generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
+aarch64 = umlal
+generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t
+
+/// Vector widening multiply accumulate with scalar
+name = vmlal_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+arm = vmlal.s
+aarch64 = smlal
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
+generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
+aarch64 = umlal
+generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
+generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t
+
+/// Signed multiply-add long
+name = vmlal_high
+no-q
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
+multi_fn = vmlal-noqself-noext, a, b, c
+a = 8, 7, 6, 5, 4, 3, 2, 1
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = smlal2
+generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
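+
+// Sketch of the _high expansion above: fixed-half-right selects the upper
+// halves of b and c before the widening multiply-add (illustrative):
+//     pub unsafe fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
+//         let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+//         let c: int8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+//         vmlal_s8(a, b, c)
+//     }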
+
+/// Unsigned multiply-add long
+name = vmlal_high
+no-q
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
+multi_fn = vmlal-noqself-noext, a, b, c
+a = 8, 7, 6, 5, 4, 3, 2, 1
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = umlal2
+generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
+
+/// Multiply-add long
+name = vmlal_high_n
+no-q
+multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
+a = 8, 7, 6, 5, 4, 3, 2, 1
+b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+c = 2
+validate 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = smlal2
+generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
+aarch64 = umlal2
+generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t
+
+/// Multiply-add long
+name = vmlal_high_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+a = 8, 7, 6, 5, 4, 3, 2, 1
+b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = smlal2
+generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
+generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+aarch64 = umlal2
+generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
+generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
+
+/// Multiply-subtract from accumulator
+name = vmls
+multi_fn = simd_sub, a, {simd_mul, b, c}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+arm = vmls.
+aarch64 = mls
+generate int*_t, uint*_t
+
+/// Floating-point multiply-subtract from accumulator
+name = vmls
+multi_fn = simd_sub, a, {simd_mul, b, c}
+a = 6., 7., 8., 9.
+b = 2., 2., 2., 2.
+c = 3., 3., 3., 3.
+validate 0., 1., 2., 3.
+
+aarch64 = fmul
+generate float64x*_t
+
+arm = vmls.
+generate float*_t
+
+/// Vector multiply subtract with scalar
+name = vmls
+n-suffix
+multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = mls
+arm = vmls.
+generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
+generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t
+
+/// Vector multiply subtract with scalar
+name = vmls
+n-suffix
+multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
+a = 6., 7., 8., 9.
+b = 2., 2., 2., 2.
+c = 3.
+validate 0., 1., 2., 3.
+
+aarch64 = fmul
+arm = vmls.
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
+
+/// Vector multiply subtract with scalar
+name = vmls
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = mls
+arm = vmls.
+generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
+generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
+generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
+generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
+
+/// Vector multiply subtract with scalar
+name = vmls
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+a = 6., 7., 8., 9.
+b = 2., 2., 2., 2.
+c = 0., 3., 0., 0.
+n = 1
+validate 0., 1., 2., 3.
+
+aarch64 = fmul
+arm = vmls.
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
+/// Signed multiply-subtract long
+name = vmlsl
+multi_fn = simd_sub, a, {vmull-self-noext, b, c}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+arm = vmlsl.s
+aarch64 = smlsl
+generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
+
+/// Unsigned multiply-subtract long
+name = vmlsl
+multi_fn = simd_sub, a, {vmull-self-noext, b, c}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+arm = vmlsl.s
+aarch64 = umlsl
+generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
+
+/// Vector widening multiply subtract with scalar
+name = vmlsl
+n-suffix
+multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+arm = vmlsl.s
+aarch64 = smlsl
+generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
+aarch64 = umlsl
+generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t
+
+/// Vector widening multiply subtract with scalar
+name = vmlsl_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+arm = vmlsl.s
+aarch64 = smlsl
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
+generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
+aarch64 = umlsl
+generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
+generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t
+
+/// Signed multiply-subtract long
+name = vmlsl_high
+no-q
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
+multi_fn = vmlsl-noqself-noext, a, b, c
+a = 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 14, 13, 12, 11, 10, 9, 8, 7
+
+aarch64 = smlsl2
+generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+
+/// Unsigned multiply-subtract long
+name = vmlsl_high
+no-q
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
+multi_fn = vmlsl-noqself-noext, a, b, c
+a = 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 14, 13, 12, 11, 10, 9, 8, 7
+
+aarch64 = umlsl2
+generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
+
+/// Multiply-subtract long
+name = vmlsl_high_n
+no-q
+multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
+a = 14, 15, 16, 17, 18, 19, 20, 21
+b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+c = 2
+validate 14, 13, 12, 11, 10, 9, 8, 7
+
+aarch64 = smlsl2
+generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
+aarch64 = umlsl2
+generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t
+
+/// Multiply-subtract long
+name = vmlsl_high_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
+a = 14, 15, 16, 17, 18, 19, 20, 21
+b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 14, 13, 12, 11, 10, 9, 8, 7
+
+aarch64 = smlsl2
+generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
+generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+aarch64 = umlsl2
+generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
+generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
+
+/// Extract narrow
+name = vmovn_high
+no-q
+multi_fn = simd_cast, c:in_t0, b
+multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len}
+a = 0, 1, 2, 3, 2, 3, 4, 5
+b = 2, 3, 4, 5, 12, 13, 14, 15
+validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15
+
+aarch64 = xtn2
+generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
+generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
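+
+// Sketch of the expansion above: narrow `b`, then concatenate it after `a`
+// (illustrative):
+//     pub unsafe fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t {
+//         let c: int8x8_t = simd_cast(b);
+//         simd_shuffle16!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+//     }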
+
+/// Negate
+name = vneg
+fn = simd_neg
+a = 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8
+validate 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8
+
+aarch64 = neg
+generate int64x*_t
+
+arm = vneg.s
+generate int*_t
+
+/// Negate
+name = vneg
+multi_fn = a.wrapping_neg()
+a = 1
+validate -1
+
+aarch64 = neg
+generate i64
+
+/// Negate
+name = vneg
+fn = simd_neg
+a = 0., 1., -1., 2., -2., 3., -3., 4.
+validate 0., -1., 1., -2., 2., -3., 3., -4.
+
+aarch64 = fneg
+generate float64x*_t
+
+arm = vneg.s
+generate float*_t
+
+/// Signed saturating negate
+name = vqneg
+a = MIN, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7
+validate MAX, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7
+link-arm = vqneg._EXT_
+link-aarch64 = sqneg._EXT_
+
+aarch64 = sqneg
+generate int64x*_t
+
+arm = vqneg.s
+generate int*_t
+
+/// Signed saturating negate
+name = vqneg
+multi_fn = simd_extract, {vqneg-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+a = 1
+validate -1
+
+aarch64 = sqneg
+generate i8, i16, i32, i64
+
+/// Saturating subtract
+name = vqsub
+a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26
+
+arm = vqsub.s
+aarch64 = uqsub
+link-arm = llvm.usub.sat._EXT_
+link-aarch64 = uqsub._EXT_
+generate uint*_t, uint64x*_t
+
+arm = vqsub.s
+aarch64 = sqsub
+link-arm = llvm.ssub.sat._EXT_
+link-aarch64 = sqsub._EXT_
+generate int*_t, int64x*_t
+
+/// Saturating subtract
+name = vqsub
+multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = simd_extract, {vqsub-in_ntt-noext, a, b}, 0
+a = 42
+b = 1
+validate 41
+
+aarch64 = sqsub
+generate i8, i16
+aarch64 = uqsub
+generate u8, u16
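+
+// Sketch of the dup/extract idiom above for the narrow scalar types
+// (illustrative):
+//     pub unsafe fn vqsubb_s8(a: i8, b: i8) -> i8 {
+//         let a: int8x8_t = vdup_n_s8(a);
+//         let b: int8x8_t = vdup_n_s8(b);
+//         simd_extract(vqsub_s8(a, b), 0)
+//     }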
+
+/// Saturating subtract
+name = vqsub
+a = 42
+b = 1
+validate 41
+
+aarch64 = uqsub
+link-aarch64 = uqsub._EXT_
+generate u32, u64
+
+aarch64 = sqsub
+link-aarch64 = sqsub._EXT_
+generate i32, i64
+
+/// Halving add
+name = vhadd
+a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29
+
+arm = vhadd.s
+aarch64 = uhadd
+link-aarch64 = uhadd._EXT_
+link-arm = vhaddu._EXT_
+generate uint*_t
+
+arm = vhadd.s
+aarch64 = shadd
+link-aarch64 = shadd._EXT_
+link-arm = vhadds._EXT_
+generate int*_t
+
+/// Reverse bit order
+name = vrbit
+a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120
+
+aarch64 = rbit
+link-aarch64 = rbit._EXT_
+generate int8x8_t, int8x16_t
+
+/// Reverse bit order
+name = vrbit
+multi_fn = transmute, {vrbit-signed-noext, transmute(a)}
+a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120
+
+aarch64 = rbit
+generate uint8x8_t, uint8x16_t, poly8x8_t, poly8x16_t
+
+/// Rounding halving add
+name = vrhadd
+a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29
+
+arm = vrhadd.s
+aarch64 = urhadd
+link-arm = vrhaddu._EXT_
+link-aarch64 = urhadd._EXT_
+generate uint*_t
+
+arm = vrhadd.s
+aarch64 = srhadd
+link-arm = vrhadds._EXT_
+link-aarch64 = srhadd._EXT_
+generate int*_t
+
+/// Floating-point round to integral exact, using current rounding mode
+name = vrndx
+a = -1.5, 0.5, 1.5, 2.5
+validate -2.0, 0.0, 2.0, 2.0
+
+aarch64 = frintx
+link-aarch64 = llvm.rint._EXT_
+generate float*_t, float64x*_t
+
+/// Floating-point round to integral, to nearest with ties to away
+name = vrnda
+a = -1.5, 0.5, 1.5, 2.5
+validate -2.0, 1.0, 2.0, 3.0
+
+aarch64 = frinta
+link-aarch64 = llvm.round._EXT_
+generate float*_t, float64x*_t
+
+/// Floating-point round to integral, to nearest with ties to even
+name = vrndn
+a = -1.5, 0.5, 1.5, 2.5
+validate -2.0, 0.0, 2.0, 2.0
+
+link-aarch64 = frintn._EXT_
+aarch64 = frintn
+generate float64x*_t
+
+target = fp-armv8
+arm = vrintn
+link-arm = vrintn._EXT_
+generate float*_t
+
+/// Floating-point round to integral, to nearest with ties to even
+name = vrndn
+a = -1.5
+validate -2.0
+
+aarch64 = frintn
+link-aarch64 = llvm.roundeven._EXT_
+generate f32
+
+/// Floating-point round to integral, toward minus infinity
+name = vrndm
+a = -1.5, 0.5, 1.5, 2.5
+validate -2.0, 0.0, 1.0, 2.0
+
+aarch64 = frintm
+link-aarch64 = llvm.floor._EXT_
+generate float*_t, float64x*_t
+
+/// Floating-point round to integral, toward plus infinity
+name = vrndp
+a = -1.5, 0.5, 1.5, 2.5
+validate -1.0, 1.0, 2.0, 3.0
+
+aarch64 = frintp
+link-aarch64 = llvm.ceil._EXT_
+generate float*_t, float64x*_t
+
+/// Floating-point round to integral, toward zero
+name = vrnd
+a = -1.5, 0.5, 1.5, 2.5
+validate -1.0, 0.0, 1.0, 2.0
+
+aarch64 = frintz
+link-aarch64 = llvm.trunc._EXT_
+generate float*_t, float64x*_t
+
+/// Floating-point round to integral, using current rounding mode
+name = vrndi
+a = -1.5, 0.5, 1.5, 2.5
+validate -2.0, 0.0, 2.0, 2.0
+
+aarch64 = frinti
+link-aarch64 = llvm.nearbyint._EXT_
+generate float*_t, float64x*_t
+
+/// Saturating add
+name = vqadd
+a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58
+
+arm = vqadd.s
+aarch64 = uqadd
+link-arm = llvm.uadd.sat._EXT_
+link-aarch64 = uqadd._EXT_
+generate uint*_t, uint64x*_t
+
+arm = vqadd.s
+aarch64 = sqadd
+link-arm = llvm.sadd.sat._EXT_
+link-aarch64 = sqadd._EXT_
+generate int*_t, int64x*_t
+
+/// Saturating add
+name = vqadd
+multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = simd_extract, {vqadd-in_ntt-noext, a, b}, 0
+a = 42
+b = 1
+validate 43
+
+aarch64 = sqadd
+generate i8, i16
+aarch64 = uqadd
+generate u8, u16
+
+/// Saturating add
+name = vqadd
+a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58
+
+aarch64 = uqadd
+link-aarch64 = uqadd._EXT_
+generate u32, u64
+
+aarch64 = sqadd
+link-aarch64 = sqadd._EXT_
+generate i32, i64
+
+/// Load multiple single-element structures to one, two, three, or four registers
+name = vld1
+out-suffix
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+load_fn
+
+aarch64 = ld1
+link-aarch64 = ld1x2._EXT2_
+arm = vld1
+link-arm = vld1x2._EXT2_
+generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t
+generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t
+
+link-aarch64 = ld1x3._EXT2_
+link-arm = vld1x3._EXT2_
+generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t
+generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t, *const i64:int64x2x3_t
+
+link-aarch64 = ld1x4._EXT2_
+link-arm = vld1x4._EXT2_
+generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t
+generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t, *const i64:int64x2x4_t
+
+/// Load multiple single-element structures to one, two, three, or four registers
+name = vld1
+out-suffix
+multi_fn = transmute, {vld1-outsigned-noext, transmute(a)}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+
+load_fn
+aarch64 = ld1
+arm = vld1
+generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
+generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t
+generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t
+generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t, *const u64:uint64x2x3_t
+generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t
+generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t, *const u64:uint64x2x4_t
+generate *const p8:poly8x8x2_t, *const p8:poly8x8x3_t, *const p8:poly8x8x4_t
+generate *const p8:poly8x16x2_t, *const p8:poly8x16x3_t, *const p8:poly8x16x4_t
+generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4_t
+generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t
+target = aes
+generate *const p64:poly64x1x2_t
+arm = nop
+generate *const p64:poly64x1x3_t, *const p64:poly64x1x4_t
+generate *const p64:poly64x2x2_t, *const p64:poly64x2x3_t, *const p64:poly64x2x4_t
+
+/// Load multiple single-element structures to one, two, three, or four registers
+name = vld1
+out-suffix
+a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
+validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
+load_fn
+
+aarch64 = ld1
+link-aarch64 = ld1x2._EXT2_
+generate *const f64:float64x1x2_t, *const f64:float64x2x2_t
+
+link-aarch64 = ld1x3._EXT2_
+generate *const f64:float64x1x3_t, *const f64:float64x2x3_t
+
+link-aarch64 = ld1x4._EXT2_
+generate *const f64:float64x1x4_t, *const f64:float64x2x4_t
+
+arm = vld1
+link-aarch64 = ld1x2._EXT2_
+link-arm = vld1x2._EXT2_
+generate *const f32:float32x2x2_t, *const f32:float32x4x2_t
+
+link-aarch64 = ld1x3._EXT2_
+link-arm = vld1x3._EXT2_
+generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
+
+link-aarch64 = ld1x4._EXT2_
+link-arm = vld1x4._EXT2_
+generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
+
+/// Load multiple 2-element structures to two registers
+name = vld2
+out-nox
+a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
+validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld2
+link-aarch64 = ld2._EXTv2_
+generate *const i64:int64x2x2_t
+
+arm = vld2
+link-arm = vld2._EXTpi82_
+generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t
+generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t
+arm = nop
+aarch64 = nop
+generate *const i64:int64x1x2_t
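+
+// Usage sketch (comment only, not spec input): vld2 de-interleaves, splitting even-
+// and odd-indexed elements into the two result registers:
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let src = [0i16, 100, 1, 101, 2, 102, 3, 103];
+//         let r = vld2_s16(src.as_ptr());
+//         // r.0 = 0, 1, 2, 3   r.1 = 100, 101, 102, 103
+//     }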
+
+/// Load multiple 2-element structures to two registers
+name = vld2
+out-nox
+multi_fn = transmute, {vld2-outsignednox-noext, transmute(a)}
+a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
+validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+load_fn
+
+aarch64 = ld2
+generate *const u64:uint64x2x2_t
+target = aes
+generate *const p64:poly64x2x2_t
+
+target = default
+arm = vld2
+generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t
+generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t
+generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t
+arm = nop
+aarch64 = nop
+generate *const u64:uint64x1x2_t
+target = aes
+generate *const p64:poly64x1x2_t
+
+/// Load multiple 2-element structures to two registers
+name = vld2
+out-nox
+a = 0., 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9.
+validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
+load_fn
+arm-aarch64-separate
+
+aarch64 = nop
+link-aarch64 = ld2._EXTv2_
+generate *const f64:float64x1x2_t
+aarch64 = ld2
+generate *const f64:float64x2x2_t
+
+arm = vld2
+link-arm = vld2._EXTpi82_
+generate *const f32:float32x2x2_t, *const f32:float32x4x2_t
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+name = vld2
+out-dup-nox
+a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld2r
+link-aarch64 = ld2r._EXT2_
+generate *const i64:int64x2x2_t
+
+arm = vld2
+link-arm = vld2dup._EXTpi82_
+generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t
+generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t
+arm = nop
+generate *const i64:int64x1x2_t
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+name = vld2
+out-dup-nox
+multi_fn = transmute, {vld2-outsigneddupnox-noext, transmute(a)}
+a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+
+aarch64 = ld2r
+generate *const u64:uint64x2x2_t
+target = aes
+generate *const p64:poly64x2x2_t
+
+target = default
+arm = vld2
+generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t
+generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t
+generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t
+arm = nop
+generate *const u64:uint64x1x2_t
+target = aes
+generate *const p64:poly64x1x2_t
+
+/// Load single 2-element structure and replicate to all lanes of two registers
+name = vld2
+out-dup-nox
+a = 0., 1., 1., 2., 3., 1., 4., 3., 5.
+validate 1., 1., 1., 1., 1., 1., 1., 1.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld2r
+link-aarch64 = ld2r._EXT2_
+generate *const f64:float64x1x2_t, *const f64:float64x2x2_t
+
+arm = vld2
+link-arm = vld2dup._EXTpi82_
+generate *const f32:float32x2x2_t, *const f32:float32x4x2_t
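+
+// Usage sketch (comment only, not spec input): the dup form reads one 2-element
+// structure and broadcasts it to every lane of both registers:
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let src = [1.0f32, 2.0];
+//         let r = vld2_dup_f32(src.as_ptr());
+//         // r.0 = 1.0, 1.0   r.1 = 2.0, 2.0
+//     }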
+
+/// Load multiple 2-element structures to two registers
+name = vld2
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+n = 0
+validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld2
+const-aarch64 = LANE
+link-aarch64 = ld2lane._EXTpi82_
+generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t
+
+arm = vld2
+const-arm = LANE
+link-arm = vld2lane._EXTpi82_
+generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t
+generate *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t
+
+/// Load multiple 2-element structures to two registers
+name = vld2
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vld2-outsignedlanenox-::<LANE>, transmute(a), transmute(b)}
+constn = LANE
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+n = 0
+validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+load_fn
+
+aarch64 = ld2
+const-aarch64 = LANE
+
+target = aes
+generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t
+
+target = default
+generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t
+generate *const p8:poly8x16x2_t:poly8x16x2_t
+
+arm = vld2
+const-arm = LANE
+generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t
+generate *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t
+generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t
+generate *const p16:poly16x8x2_t:poly16x8x2_t
+
+/// Load multiple 2-element structures to two registers
+name = vld2
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0., 1., 2., 3., 4., 5., 6., 7., 8.
+b = 0., 2., 2., 14., 2., 16., 17., 18.
+n = 0
+validate 1., 2., 2., 14., 2., 16., 17., 18.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld2
+const-aarch64 = LANE
+link-aarch64 = ld2lane._EXTpi82_
+generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t
+
+arm = vld2
+const-arm = LANE
+link-arm = vld2lane._EXTpi82_
+generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t
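+
+// Usage sketch (comment only, not spec input): the lane form loads a single 2-element
+// structure into lane LANE of an existing register pair, leaving the other lanes intact:
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let init = float32x2x2_t(vdup_n_f32(0.), vdup_n_f32(0.));
+//         let src = [7.0f32, 8.0];
+//         let r = vld2_lane_f32::<1>(src.as_ptr(), init);
+//         // lane 1 of r.0 is 7.0, lane 1 of r.1 is 8.0; lane 0 stays 0.0
+//     }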
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-nox
+a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48
+validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3
+link-aarch64 = ld3._EXTv2_
+generate *const i64:int64x2x3_t
+
+arm = vld3
+link-arm = vld3._EXTpi82_
+generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t
+generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t
+arm = nop
+aarch64 = nop
+generate *const i64:int64x1x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-nox
+multi_fn = transmute, {vld3-outsignednox-noext, transmute(a)}
+a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48
+validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+load_fn
+
+aarch64 = ld3
+generate *const u64:uint64x2x3_t
+target = aes
+generate *const p64:poly64x2x3_t
+
+target = default
+arm = vld3
+generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t
+generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t
+generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t
+arm = nop
+aarch64 = nop
+generate *const u64:uint64x1x3_t
+target = aes
+generate *const p64:poly64x1x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-nox
+a = 0., 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8.
+validate 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8.
+load_fn
+arm-aarch64-separate
+
+aarch64 = nop
+link-aarch64 = ld3._EXTv2_
+generate *const f64:float64x1x3_t
+aarch64 = ld3
+generate *const f64:float64x2x3_t
+
+arm = vld3
+link-arm = vld3._EXTpi82_
+generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+name = vld3
+out-dup-nox
+a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3r
+link-aarch64 = ld3r._EXT2_
+generate *const i64:int64x2x3_t
+
+arm = vld3
+link-arm = vld3dup._EXTpi82_
+generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t
+generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t
+arm = nop
+generate *const i64:int64x1x3_t
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+name = vld3
+out-dup-nox
+multi_fn = transmute, {vld3-outsigneddupnox-noext, transmute(a)}
+a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+
+aarch64 = ld3r
+generate *const u64:uint64x2x3_t
+target = aes
+generate *const p64:poly64x2x3_t
+
+target = default
+arm = vld3
+generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t
+generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t
+generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t
+arm = nop
+generate *const u64:uint64x1x3_t
+target = aes
+generate *const p64:poly64x1x3_t
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+name = vld3
+out-dup-nox
+a = 0., 1., 1., 1., 3., 1., 4., 3., 5., 1., 4., 3., 5.
+validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3r
+link-aarch64 = ld3r._EXT2_
+generate *const f64:float64x1x3_t, *const f64:float64x2x3_t
+
+arm = vld3
+link-arm = vld3dup._EXTpi82_
+generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+n = 0
+validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3
+const-aarch64 = LANE
+link-aarch64 = ld3lane._EXTpi82_
+generate *const i8:int8x16x3_t:int8x16x3_t, *const i64:int64x1x3_t:int64x1x3_t, *const i64:int64x2x3_t:int64x2x3_t
+
+arm = vld3
+const-arm = LANE
+link-arm = vld3lane._EXTpi82_
+generate *const i8:int8x8x3_t:int8x8x3_t, *const i16:int16x4x3_t:int16x4x3_t, *const i32:int32x2x3_t:int32x2x3_t
+generate *const i16:int16x8x3_t:int16x8x3_t, *const i32:int32x4x3_t:int32x4x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vld3-outsignedlanenox-::<LANE>, transmute(a), transmute(b)}
+constn = LANE
+a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+n = 0
+validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+load_fn
+
+aarch64 = ld3
+const-aarch64 = LANE
+target = aes
+generate *const p64:poly64x1x3_t:poly64x1x3_t, *const p64:poly64x2x3_t:poly64x2x3_t
+target = default
+generate *const p8:poly8x16x3_t:poly8x16x3_t, *const u8:uint8x16x3_t:uint8x16x3_t, *const u64:uint64x1x3_t:uint64x1x3_t, *const u64:uint64x2x3_t:uint64x2x3_t
+
+arm = vld3
+const-arm = LANE
+generate *const u8:uint8x8x3_t:uint8x8x3_t, *const u16:uint16x4x3_t:uint16x4x3_t, *const u32:uint32x2x3_t:uint32x2x3_t
+generate *const u16:uint16x8x3_t:uint16x8x3_t, *const u32:uint32x4x3_t:uint32x4x3_t
+generate *const p8:poly8x8x3_t:poly8x8x3_t, *const p16:poly16x4x3_t:poly16x4x3_t
+generate *const p16:poly16x8x3_t:poly16x8x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0., 1., 2., 2., 4., 5., 6., 7., 8., 5., 6., 7., 8.
+b = 0., 2., 2., 14., 9., 16., 17., 18., 5., 6., 7., 8.
+n = 0
+validate 1., 2., 2., 14., 2., 16., 17., 18., 2., 6., 7., 8.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3
+const-aarch64 = LANE
+link-aarch64 = ld3lane._EXTpi82_
+generate *const f64:float64x1x3_t:float64x1x3_t, *const f64:float64x2x3_t:float64x2x3_t
+
+arm = vld3
+const-arm = LANE
+link-arm = vld3lane._EXTpi82_
+generate *const f32:float32x2x3_t:float32x2x3_t, *const f32:float32x4x3_t:float32x4x3_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-nox
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4
+link-aarch64 = ld4._EXTv2_
+generate *const i64:int64x2x4_t
+
+arm = vld4
+link-arm = vld4._EXTpi82_
+generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t
+generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t
+aarch64 = nop
+arm = nop
+generate *const i64:int64x1x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-nox
+multi_fn = transmute, {vld4-outsignednox-noext, transmute(a)}
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+load_fn
+
+aarch64 = ld4
+generate *const u64:uint64x2x4_t
+target = aes
+generate *const p64:poly64x2x4_t
+
+target = default
+arm = vld4
+generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t
+generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t
+generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t
+aarch64 = nop
+arm = nop
+generate *const u64:uint64x1x4_t
+target = aes
+generate *const p64:poly64x1x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-nox
+a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 15., 16.
+validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 15., 6., 8., 8., 16.
+load_fn
+arm-aarch64-separate
+
+aarch64 = nop
+link-aarch64 = ld4._EXTv2_
+generate *const f64:float64x1x4_t
+aarch64 = ld4
+generate *const f64:float64x2x4_t
+
+arm = vld4
+link-arm = vld4._EXTpi82_
+generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+name = vld4
+out-dup-nox
+a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4r
+link-aarch64 = ld4r._EXT2_
+generate *const i64:int64x2x4_t
+
+arm = vld4
+link-arm = vld4dup._EXTpi82_
+generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t
+generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t
+arm = nop
+generate *const i64:int64x1x4_t
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+name = vld4
+out-dup-nox
+multi_fn = transmute, {vld4-outsigneddupnox-noext, transmute(a)}
+a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+
+aarch64 = ld4r
+generate *const u64:uint64x2x4_t
+target = aes
+generate *const p64:poly64x2x4_t
+
+target = default
+arm = vld4
+generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t
+generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t
+generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t
+arm = nop
+generate *const u64:uint64x1x4_t
+target = aes
+generate *const p64:poly64x1x4_t
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+name = vld4
+out-dup-nox
+a = 0., 1., 1., 1., 1., 6., 4., 3., 5., 7., 4., 3., 5., 8., 4., 3., 5., 9., 4., 3., 5.
+validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4r
+link-aarch64 = ld4r._EXT2_
+generate *const f64:float64x1x4_t, *const f64:float64x2x4_t
+
+arm = vld4
+link-arm = vld4dup._EXTpi82_
+generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+n = 0
+validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4
+const-aarch64 = LANE
+link-aarch64 = ld4lane._EXTpi82_
+generate *const i8:int8x16x4_t:int8x16x4_t, *const i64:int64x1x4_t:int64x1x4_t, *const i64:int64x2x4_t:int64x2x4_t
+
+arm = vld4
+const-arm = LANE
+link-arm = vld4lane._EXTpi82_
+generate *const i8:int8x8x4_t:int8x8x4_t, *const i16:int16x4x4_t:int16x4x4_t, *const i32:int32x2x4_t:int32x2x4_t
+generate *const i16:int16x8x4_t:int16x8x4_t, *const i32:int32x4x4_t:int32x4x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vld4-outsignedlanenox-::<LANE>, transmute(a), transmute(b)}
+constn = LANE
+a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+n = 0
+validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+load_fn
+
+aarch64 = ld4
+const-aarch64 = LANE
+target = aes
+generate *const p64:poly64x1x4_t:poly64x1x4_t, *const p64:poly64x2x4_t:poly64x2x4_t
+target = default
+generate *const p8:poly8x16x4_t:poly8x16x4_t, *const u8:uint8x16x4_t:uint8x16x4_t, *const u64:uint64x1x4_t:uint64x1x4_t, *const u64:uint64x2x4_t:uint64x2x4_t
+
+arm = vld4
+const-arm = LANE
+generate *const u8:uint8x8x4_t:uint8x8x4_t, *const u16:uint16x4x4_t:uint16x4x4_t, *const u32:uint32x2x4_t:uint32x2x4_t
+generate *const u16:uint16x8x4_t:uint16x8x4_t, *const u32:uint32x4x4_t:uint32x4x4_t
+generate *const p8:poly8x8x4_t:poly8x8x4_t, *const p16:poly16x4x4_t:poly16x4x4_t
+generate *const p16:poly16x8x4_t:poly16x8x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0., 1., 2., 2., 2., 5., 6., 7., 8., 5., 6., 7., 8., 1., 4., 3., 5.
+b = 0., 2., 2., 2., 2., 16., 2., 18., 5., 6., 7., 8., 1., 4., 3., 5.
+n = 0
+validate 1., 2., 2., 2., 2., 16., 2., 18., 2., 6., 7., 8., 2., 4., 3., 5.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4
+const-aarch64 = LANE
+link-aarch64 = ld4lane._EXTpi82_
+generate *const f64:float64x1x4_t:float64x1x4_t, *const f64:float64x2x4_t:float64x2x4_t
+
+arm = vld4
+const-arm = LANE
+link-arm = vld4lane._EXTpi82_
+generate *const f32:float32x2x4_t:float32x2x4_t, *const f32:float32x4x4_t:float32x4x4_t
+
+/// Store multiple single-element structures from one, two, three, or four registers
+name = vst1
+in1-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = *a, {simd_extract, b, LANE as u32}
+constn = LANE
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+n = 0
+validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+
+aarch64 = nop
+arm = nop
+generate *mut i8:int8x8_t:void, *mut i16:int16x4_t:void, *mut i32:int32x2_t:void, *mut i64:int64x1_t:void
+generate *mut i8:int8x16_t:void, *mut i16:int16x8_t:void, *mut i32:int32x4_t:void, *mut i64:int64x2_t:void
+generate *mut u8:uint8x8_t:void, *mut u16:uint16x4_t:void, *mut u32:uint32x2_t:void, *mut u64:uint64x1_t:void
+generate *mut u8:uint8x16_t:void, *mut u16:uint16x8_t:void, *mut u32:uint32x4_t:void, *mut u64:uint64x2_t:void
+generate *mut p8:poly8x8_t:void, *mut p16:poly16x4_t:void, *mut p8:poly8x16_t:void, *mut p16:poly16x8_t:void
+target = aes
+generate *mut p64:poly64x1_t:void, *mut p64:poly64x2_t:void
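+
+// Usage sketch (comment only, not spec input): vst1*_lane writes a single lane back
+// to memory:
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let v = vdup_n_u32(7);
+//         let mut out = 0u32;
+//         vst1_lane_u32::<0>(&mut out, v);
+//         assert_eq!(out, 7);
+//     }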
+
+/// Store multiple single-element structures from one, two, three, or four registers
+name = vst1
+in1-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = *a, {simd_extract, b, LANE as u32}
+constn = LANE
+a = 0., 1., 2., 3., 4., 5., 6., 7., 8.
+n = 0
+validate 1., 0., 0., 0., 0., 0., 0., 0.
+store_fn
+
+aarch64 = nop
+generate *mut f64:float64x1_t:void, *mut f64:float64x2_t:void
+
+arm = nop
+generate *mut f32:float32x2_t:void, *mut f32:float32x4_t:void
+
+/// Store multiple single-element structures from one, two, three, or four registers
+name = vst1
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+store_fn
+arm-aarch64-separate
+
+aarch64 = st1
+link-aarch64 = st1x2._EXT3_
+arm = vst1
+link-arm = vst1x2._EXTr3_
+generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void
+generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void
+
+link-aarch64 = st1x3._EXT3_
+link-arm = vst1x3._EXTr3_
+generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void
+generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void
+
+link-aarch64 = st1x4._EXT3_
+link-arm = vst1x4._EXTr3_
+generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void
+generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void
+
+/// Store multiple single-element structures from one, two, three, or four registers
+name = vst1
+multi_fn = vst1-signed-noext, transmute(a), transmute(b)
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+
+store_fn
+aarch64 = st1
+arm = vst1
+generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void
+generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void, *mut u64:uint64x2x2_t:void
+generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void
+generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void, *mut u64:uint64x2x3_t:void
+generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void
+generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void, *mut u64:uint64x2x4_t:void
+generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t:void
+generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void
+generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void
+generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, *mut p16:poly16x8x4_t:void
+target = aes
+generate *mut p64:poly64x1x2_t:void
+arm = nop
+generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x1x4_t:void
+generate *mut p64:poly64x2x2_t:void, *mut p64:poly64x2x3_t:void, *mut p64:poly64x2x4_t:void
+
+/// Store multiple single-element structures from one, two, three, or four registers
+name = vst1
+a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
+validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st1
+link-aarch64 = st1x2._EXT3_
+generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
+
+link-aarch64 = st1x3._EXT3_
+generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
+
+link-aarch64 = st1x4._EXT3_
+generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
+
+arm = vst1
+link-aarch64 = st1x2._EXT3_
+link-arm = vst1x2._EXTr3_
+generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
+
+link-aarch64 = st1x3._EXT3_
+link-arm = vst1x3._EXTr3_
+generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
+
+link-aarch64 = st1x4._EXT3_
+link-arm = vst1x4._EXTr3_
+generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-nox
+a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
+store_fn
+arm-aarch64-separate
+
+aarch64 = st2
+link-aarch64 = st2._EXTpi8_
+generate *mut i64:int64x2x2_t:void
+
+arm = vst2
+link-arm = vst2._EXTpi8r_
+generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void
+generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void
+arm = nop
+aarch64 = nop
+generate *mut i64:int64x1x2_t:void
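+
+// Usage sketch (comment only, not spec input): vst2 interleaves the two source
+// registers on the way out, the inverse of vld2:
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let pair = int16x4x2_t(vdup_n_s16(1), vdup_n_s16(2));
+//         let mut buf = [0i16; 8];
+//         vst2_s16(buf.as_mut_ptr(), pair);
+//         assert_eq!(buf, [1, 2, 1, 2, 1, 2, 1, 2]);
+//     }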
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+multi_fn = transmute, {vst2-in1signednox-noext, transmute(a), transmute(b)}
+in1-nox
+a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
+store_fn
+
+aarch64 = st2
+generate *mut u64:uint64x2x2_t:void
+target = aes
+generate *mut p64:poly64x2x2_t:void
+
+target = default
+arm = vst2
+generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void
+generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void
+generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void
+arm = nop
+aarch64 = nop
+generate *mut u64:uint64x1x2_t:void
+target = aes
+generate *mut p64:poly64x1x2_t:void
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-nox
+a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
+validate 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st1
+link-aarch64 = st2._EXTpi8_
+generate *mut f64:float64x1x2_t:void
+aarch64 = st2
+generate *mut f64:float64x2x2_t:void
+
+arm = vst2
+link-arm = vst2._EXTpi8r_
+generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+n = 0
+validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+arm-aarch64-separate
+
+aarch64 = st2
+link-aarch64 = st2lane._EXTpi8_
+const-aarch64 = LANE
+generate *mut i8:int8x16x2_t:void, *mut i64:int64x1x2_t:void, *mut i64:int64x2x2_t:void
+
+arm = vst2
+link-arm = vst2lane._EXTpi8r_
+const-arm = LANE
+generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void
+generate *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vst2-in1signedlanenox-::<LANE>, transmute(a), transmute(b)}
+a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+n = 0
+validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+
+aarch64 = st2
+generate *mut u8:uint8x16x2_t:void, *mut u64:uint64x1x2_t:void, *mut u64:uint64x2x2_t:void, *mut p8:poly8x16x2_t:void
+target = aes
+generate *mut p64:poly64x1x2_t:void, *mut p64:poly64x2x2_t:void
+
+target = default
+arm = vst2
+generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void
+generate *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void
+generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p16:poly16x8x2_t:void
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
+n = 0
+validate 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st2
+link-aarch64 = st2lane._EXTpi8_
+const-aarch64 = LANE
+generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
+
+arm = vst2
+link-arm = vst2lane._EXTpi8r_
+const-arm = LANE
+generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-nox
+a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48
+store_fn
+arm-aarch64-separate
+
+aarch64 = st3
+link-aarch64 = st3._EXTpi8_
+generate *mut i64:int64x2x3_t:void
+
+arm = vst3
+link-arm = vst3._EXTpi8r_
+generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void
+generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void
+arm = nop
+aarch64 = nop
+generate *mut i64:int64x1x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+multi_fn = transmute, {vst3-in1signednox-noext, transmute(a), transmute(b)}
+in1-nox
+a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48
+store_fn
+
+aarch64 = st3
+generate *mut u64:uint64x2x3_t:void
+target = aes
+generate *mut p64:poly64x2x3_t:void
+
+target = default
+arm = vst3
+generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void
+generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void
+generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p8:poly8x16x3_t:void, *mut p16:poly16x8x3_t:void
+arm = nop
+aarch64 = nop
+generate *mut u64:uint64x1x3_t:void
+target = aes
+generate *mut p64:poly64x1x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-nox
+a = 0., 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8., 13., 14., 15., 16.
+validate 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8., 2., 13., 13., 4.
+store_fn
+arm-aarch64-separate
+
+aarch64 = nop
+link-aarch64 = st3._EXTpi8_
+generate *mut f64:float64x1x3_t:void
+aarch64 = st3
+generate *mut f64:float64x2x3_t:void
+
+arm = vst3
+link-arm = vst3._EXTpi8r_
+generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+n = 0
+validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+arm-aarch64-separate
+
+aarch64 = st3
+link-aarch64 = st3lane._EXTpi8_
+const-aarch64 = LANE
+generate *mut i8:int8x16x3_t:void, *mut i64:int64x1x3_t:void, *mut i64:int64x2x3_t:void
+
+arm = vst3
+link-arm = vst3lane._EXTpi8r_
+const-arm = LANE
+generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void
+generate *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vst3-in1signedlanenox-::<LANE>, transmute(a), transmute(b)}
+a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+n = 0
+validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+
+aarch64 = st3
+generate *mut u8:uint8x16x3_t:void, *mut u64:uint64x1x3_t:void, *mut u64:uint64x2x3_t:void, *mut p8:poly8x16x3_t:void
+target = aes
+generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x2x3_t:void
+
+target = default
+arm = vst3
+generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void
+generate *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void
+generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x8x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
+n = 0
+validate 1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st3
+link-aarch64 = st3lane._EXTpi8_
+const-aarch64 = LANE
+generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
+
+arm = vst3
+link-arm = vst3lane._EXTpi8r_
+const-arm = LANE
+generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-nox
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+store_fn
+arm-aarch64-separate
+
+aarch64 = st4
+link-aarch64 = st4._EXTpi8_
+generate *mut i64:int64x2x4_t:void
+
+arm = vst4
+link-arm = vst4._EXTpi8r_
+generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void
+generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void
+arm = nop
+aarch64 = nop
+generate *mut i64:int64x1x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+multi_fn = transmute, {vst4-in1signednox-noext, transmute(a), transmute(b)}
+in1-nox
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+store_fn
+
+aarch64 = st4
+generate *mut u64:uint64x2x4_t:void
+target = aes
+generate *mut p64:poly64x2x4_t:void
+
+target = default
+arm = vst4
+generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void
+generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void
+generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p8:poly8x16x4_t:void, *mut p16:poly16x8x4_t:void
+arm = nop
+aarch64 = nop
+generate *mut u64:uint64x1x4_t:void
+target = aes
+generate *mut p64:poly64x1x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-nox
+a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.
+validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.
+store_fn
+arm-aarch64-separate
+
+aarch64 = nop
+link-aarch64 = st4._EXTpi8_
+generate *mut f64:float64x1x4_t:void
+aarch64 = st4
+generate *mut f64:float64x2x4_t:void
+
+arm = vst4
+link-arm = vst4._EXTpi8r_
+generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+n = 0
+validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+arm-aarch64-separate
+
+aarch64 = st4
+link-aarch64 = st4lane._EXTpi8_
+const-aarch64 = LANE
+generate *mut i8:int8x16x4_t:void, *mut i64:int64x1x4_t:void, *mut i64:int64x2x4_t:void
+
+arm = vst4
+link-arm = vst4lane._EXTpi8r_
+const-arm = LANE
+generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void
+generate *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vst4-in1signedlanenox-::<LANE>, transmute(a), transmute(b)}
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+n = 0
+validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+
+aarch64 = st4
+generate *mut u8:uint8x16x4_t:void, *mut u64:uint64x1x4_t:void, *mut u64:uint64x2x4_t:void, *mut p8:poly8x16x4_t:void
+target = aes
+generate *mut p64:poly64x1x4_t:void, *mut p64:poly64x2x4_t:void
+
+target = default
+arm = vst4
+generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void
+generate *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void
+generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p16:poly16x8x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.
+n = 0
+validate 1., 2., 2., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st4
+link-aarch64 = st4lane._EXTpi8_
+const-aarch64 = LANE
+generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
+
+arm = vst4
+link-arm = vst4lane._EXTpi8r_
+const-arm = LANE
+generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
+
+/// Dot product index form with signed and unsigned integers
+name = vsudot
+out-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = simd_shuffle-in_len-!, c:unsigned, c, c, {base-4-LANE}
+multi_fn = vsudot-outlane-_, a, b, c
+a = 1, 2, 1, 2
+b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+n = 0
+validate 31, 72, 31, 72
+target = dotprod
+
+aarch64 = sudot
+link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:int8x8_t:uint8x8_t:int32x2_t
+// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot
+//generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t, int32x2_t:int8x8_t:uint8x16_t:int32x2_t
+link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:int8x16_t:uint8x16_t:int32x4_t
+// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot
+//generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t, int32x4_t:int8x16_t:uint8x16_t:int32x4_t
+
+/// Multiply
+name = vmul
+a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32
+arm = vmul.
+aarch64 = mul
+fn = simd_mul
+generate int*_t, uint*_t
+
+/// Polynomial multiply
+name = vmul
+a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48
+
+aarch64 = pmul
+link-aarch64 = pmul._EXT_
+arm = vmul
+link-arm = vmulp._EXT_
+generate poly8x8_t, poly8x16_t
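+
+// Usage sketch (comment only, not spec input): polynomial multiply is carry-less
+// (GF(2) polynomial arithmetic), so 3 * 3 = 0b11 clmul 0b11 = 0b101 = 5, not 9:
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let r = vmul_p8(vdup_n_p8(3), vdup_n_p8(3));
+//         assert_eq!(vget_lane_p8::<0>(r), 5);
+//     }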
+
+/// Multiply
+name = vmul
+fn = simd_mul
+a = 1.0, 2.0, 1.0, 2.0
+b = 2.0, 3.0, 4.0, 5.0
+validate 2.0, 6.0, 4.0, 10.0
+
+aarch64 = fmul
+generate float64x*_t
+
+arm = vmul.
+generate float*_t
+
+/// Vector multiply by scalar
+name = vmul
+out-n-suffix
+multi_fn = simd_mul, a, {vdup-nout-noext, b}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 2
+validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
+
+arm = vmul
+aarch64 = mul
+generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t
+generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t
+
+/// Vector multiply by scalar
+name = vmul
+out-n-suffix
+multi_fn = simd_mul, a, {vdup-nout-noext, b}
+a = 1., 2., 3., 4.
+b = 2.
+validate 2., 4., 6., 8.
+
+aarch64 = fmul
+generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t
+
+arm = vmul
+generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t
+
+/// Multiply
+name = vmul
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
+
+aarch64 = mul
+arm = vmul
+generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t
+generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t
+generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
+generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
+
+/// Floating-point multiply
+name = vmul
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_mul, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}}
+a = 1., 2., 3., 4.
+b = 2., 0., 0., 0.
+n = 0
+validate 2., 4., 6., 8.
+
+aarch64 = fmul
+generate float64x1_t, float64x1_t:float64x2_t:float64x1_t
+
+/// Floating-point multiply
+name = vmul
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
+a = 1., 2., 3., 4.
+b = 2., 0., 0., 0.
+n = 0
+validate 2., 4., 6., 8.
+
+aarch64 = fmul
+generate float64x2_t:float64x1_t:float64x2_t, float64x2_t
+
+arm = vmul
+generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
+/// Floating-point multiply
+name = vmuls_lane
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_extract, b:f32, b, LANE as u32
+multi_fn = a * b
+a = 1.
+b = 2., 0., 0., 0.
+n = 0
+validate 2.
+aarch64 = fmul
+generate f32:float32x2_t:f32, f32:float32x4_t:f32
+
+/// Floating-point multiply
+name = vmuld_lane
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_extract, b:f64, b, LANE as u32
+multi_fn = a * b
+a = 1.
+b = 2., 0.
+n = 0
+validate 2.
+aarch64 = fmul
+generate f64:float64x1_t:f64, f64:float64x2_t:f64
+
+/// Signed multiply long
+name = vmull
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
+validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32
+
+arm = vmull.s
+aarch64 = smull
+link-arm = vmulls._EXT_
+link-aarch64 = smull._EXT_
+generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
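+
+// Usage sketch (comment only, not spec input): vmull widens, so the full product of
+// two i8 lanes fits in the i16 result without overflow:
+//     use core::arch::aarch64::*;
+//     unsafe {
+//         let wide: int16x8_t = vmull_s8(vdup_n_s8(100), vdup_n_s8(100));
+//         assert_eq!(vgetq_lane_s16::<0>(wide), 10_000); // would wrap in i8
+//     }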
+
+/// Signed multiply long
+name = vmull_high
+no-q
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = vmull-noqself-noext, a, b
+a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
+fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 9, 20, 11, 24, 13, 28, 15, 32
+
+aarch64 = smull2
+generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
+
+/// Unsigned multiply long
+name = vmull
+a = 1, 2, 3, 4, 5, 6, 7, 8
+b = 1, 2, 1, 2, 1, 2, 1, 2
+validate 1, 4, 3, 8, 5, 12, 7, 16
+
+arm = vmull.s
+aarch64 = umull
+link-arm = vmullu._EXT_
+link-aarch64 = umull._EXT_
+generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
+
+/// Unsigned multiply long
+name = vmull_high
+no-q
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = vmull-noqself-noext, a, b
+a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
+fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 9, 20, 11, 24, 13, 28, 15, 32
+
+aarch64 = umull2
+generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t
+
+/// Polynomial multiply long
+name = vmull
+a = 1, 2, 3, 4, 5, 6, 7, 8
+b = 1, 3, 1, 3, 1, 3, 1, 3
+validate 1, 6, 3, 12, 5, 10, 7, 24
+
+arm = vmull.s
+aarch64 = pmull
+link-arm = vmullp._EXT_
+link-aarch64 = pmull._EXT_
+generate poly8x8_t:poly8x8_t:poly16x8_t
+
+/// Polynomial multiply long
+name = vmull
+no-q
+a = 15
+b = 3
+validate 17
+target = aes
+
+aarch64 = pmull
+link-aarch64 = pmull64:p64:p64:p64:int8x16_t
+// Because of the support status of llvm, vmull_p64 is currently only available on aarch64;
+// the arm variant below stays disabled
+// arm = vmull
+// link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t
+generate p64:p64:p128
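+
+// Usage sketch (comment only, not spec input; needs the aes target feature declared
+// above): vmull_p64 is a 64x64 -> 128-bit carry-less multiply, the GHASH/CRC building
+// block; matching the test vector, 15 clmul 3 = 0b1111 clmul 0b11 = 0b10001 = 17:
+//     use core::arch::aarch64::*;
+//     unsafe { assert_eq!(vmull_p64(15, 3), 17); }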
+
+/// Polynomial multiply long
+name = vmull_high
+no-q
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = vmull-noqself-noext, a, b
+a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
+fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 9, 30, 11, 20, 13, 18, 15, 48
+
+aarch64 = pmull
+generate poly8x16_t:poly8x16_t:poly16x8_t
+
+/// Polynomial multiply long
+name = vmull_high
+no-q
+multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1}
+a = 1, 15
+b = 1, 3
+validate 17
+target = aes
+
+aarch64 = pmull
+generate poly64x2_t:poly64x2_t:p128
+
+/// Vector long multiply with scalar
+name = vmull_n
+no-q
+multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b}
+a = 1, 2, 3, 4, 5, 6, 7, 8
+b = 2
+validate 2, 4, 6, 8, 10, 12, 14, 16
+
+arm = vmull
+aarch64 = smull
+generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
+aarch64 = umull
+generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t
+
+/// Vector long multiply by scalar
+name = vmull_lane
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
+
+arm = vmull
+aarch64 = smull
+generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t
+generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t
+aarch64 = umull
+generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t
+generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t
+
+/// Multiply long
+name = vmull_high_n
+no-q
+multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b}
+a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
+b = 2
+validate 18, 20, 22, 24, 26, 28, 30, 32
+
+aarch64 = smull2
+generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
+aarch64 = umull2
+generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t
+
+/// Multiply long
+name = vmull_high_lane
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
+a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
+b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 18, 20, 22, 24, 26, 28, 30, 32
+
+aarch64 = smull2
+generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t
+generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t
+aarch64 = umull2
+generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t
+generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t
+
+/// Floating-point multiply extended
+name = vmulx
+a = 1., 2., 3., 4.
+b = 2., 2., 2., 2.
+validate 2., 4., 6., 8.
+
+aarch64 = fmulx
+link-aarch64 = fmulx._EXT_
+generate float*_t, float64x*_t
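+// Note, illustrative only: FMULX behaves like an ordinary multiply except that
+// 0.0 * ±infinity returns ±2.0 instead of NaN. A hypothetical scalar sketch of
+// that semantics:
+//
+//     fn fmulx(a: f32, b: f32) -> f32 {
+//         if (a == 0.0 && b.is_infinite()) || (a.is_infinite() && b == 0.0) {
+//             2.0_f32.copysign(a.signum() * b.signum()) // the only divergence
+//         } else {
+//             a * b
+//         }
+//     }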
+
+/// Floating-point multiply extended
+name = vmulx
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = vmulx-in0-noext, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}}
+a = 1.
+b = 2., 0.
+n = 0
+validate 2.
+
+aarch64 = fmulx
+generate float64x1_t, float64x1_t:float64x2_t:float64x1_t
+
+/// Floating-point multiply extended
+name = vmulx
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
+a = 1., 2., 3., 4.
+b = 2., 0., 0., 0.
+n = 0
+validate 2., 4., 6., 8.
+
+aarch64 = fmulx
+generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t
+generate float64x2_t:float64x1_t:float64x2_t, float64x2_t
+
+/// Floating-point multiply extended
+name = vmulx
+a = 2.
+b = 3.
+validate 6.
+
+aarch64 = fmulx
+link-aarch64 = fmulx._EXT_
+generate f32, f64
+
+/// Floating-point multiply extended
+name = vmulx
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32}
+
+a = 2.
+b = 3., 0., 0., 0.
+n = 0
+validate 6.
+
+aarch64 = fmulx
+generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64
+
+/// Floating-point fused Multiply-Add to accumulator (vector)
+name = vfma
+multi_fn = vfma-self-_, b, c, a
+a = 8.0, 18.0, 12.0, 10.0
+b = 6.0, 4.0, 7.0, 8.0
+c = 2.0, 3.0, 4.0, 5.0
+validate 20.0, 30.0, 40.0, 50.0
+
+link-aarch64 = llvm.fma._EXT_
+aarch64 = fmadd
+generate float64x1_t
+aarch64 = fmla
+generate float64x2_t
+
+target = vfp4
+arm = vfma
+link-arm = llvm.fma._EXT_
+generate float*_t
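+// Worked check, illustrative only: the intrinsic's first argument a is the
+// accumulator, so lane 0 above computes fma(b, c, a) with a single rounding,
+// i.e. Rust's mul_add: 6.0_f32.mul_add(2.0, 8.0) == 20.0, matching `validate`.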
+
+/// Floating-point fused Multiply-Add to accumulator (vector)
+name = vfma
+n-suffix
+multi_fn = vfma-self-noext, a, b, {vdup-nselfvfp4-noext, c}
+a = 2.0, 3.0, 4.0, 5.0
+b = 6.0, 4.0, 7.0, 8.0
+c = 8.0
+validate 50.0, 35.0, 60.0, 69.0
+
+aarch64 = fmadd
+generate float64x1_t:float64x1_t:f64:float64x1_t
+aarch64 = fmla
+generate float64x2_t:float64x2_t:f64:float64x2_t
+
+target = vfp4
+arm = vfma
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
+
+/// Floating-point fused multiply-add to accumulator
+name = vfma
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
+a = 2., 3., 4., 5.
+b = 6., 4., 7., 8.
+c = 2., 0., 0., 0.
+n = 0
+validate 14., 11., 18., 21.
+
+aarch64 = fmla
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+aarch64 = fmadd
+generate float64x1_t
+aarch64 = fmla
+generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
+
+/// Floating-point fused multiply-add to accumulator
+name = vfma
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = simd_extract, c:out_t, c, LANE as u32
+multi_fn = vfma-in2lane-_, b, c, a
+a = 2.
+b = 6.
+c = 3., 0., 0., 0.
+n = 0
+validate 20.
+
+aarch64 = fmla
+link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32
+generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
+link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64
+aarch64 = fmadd
+generate f64:f64:float64x1_t:f64
+aarch64 = fmla
+generate f64:f64:float64x2_t:f64
+
+/// Floating-point fused multiply-subtract from accumulator
+name = vfms
+multi_fn = simd_neg, b:in_t, b
+multi_fn = vfma-self-noext, a, b, c
+a = 20.0, 30.0, 40.0, 50.0
+b = 6.0, 4.0, 7.0, 8.0
+c = 2.0, 3.0, 4.0, 5.0
+validate 8.0, 18.0, 12.0, 10.0
+
+aarch64 = fmsub
+generate float64x1_t
+aarch64 = fmls
+generate float64x2_t
+
+target = vfp4
+arm = vfms
+generate float*_t
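+// Worked check, illustrative only: vfms negates b and reuses the vfma path,
+// so lane 0 above computes (-6.0_f32).mul_add(2.0, 20.0) == 8.0, matching
+// `validate`.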
+
+/// Floating-point fused Multiply-Subtract from accumulator (vector)
+name = vfms
+n-suffix
+multi_fn = vfms-self-noext, a, b, {vdup-nselfvfp4-noext, c}
+a = 50.0, 35.0, 60.0, 69.0
+b = 6.0, 4.0, 7.0, 8.0
+c = 8.0
+validate 2.0, 3.0, 4.0, 5.0
+
+aarch64 = fmsub
+generate float64x1_t:float64x1_t:f64:float64x1_t
+aarch64 = fmls
+generate float64x2_t:float64x2_t:f64:float64x2_t
+
+target = vfp4
+arm = vfms
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
+
+/// Floating-point fused multiply-subtract from accumulator
+name = vfms
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
+a = 14., 11., 18., 21.
+b = 6., 4., 7., 8.
+c = 2., 0., 0., 0.
+n = 0
+validate 2., 3., 4., 5.
+
+aarch64 = fmls
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+aarch64 = fmsub
+generate float64x1_t
+aarch64 = fmls
+generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
+
+/// Floating-point fused multiply-subtract from accumulator
+name = vfms
+in2-lane-suffixes
+constn = LANE
+multi_fn = vfma-in2lane-::<LANE>, a, -b, c
+a = 14.
+b = 6.
+c = 2., 0., 0., 0.
+n = 0
+validate 2.
+
+aarch64 = fmls
+generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
+aarch64 = fmsub
+generate f64:f64:float64x1_t:f64
+aarch64 = fmls
+generate f64:f64:float64x2_t:f64
+
+/// Divide
+name = vdiv
+fn = simd_div
+a = 2.0, 6.0, 4.0, 10.0
+b = 1.0, 2.0, 1.0, 2.0
+validate 2.0, 3.0, 4.0, 5.0
+
+aarch64 = fdiv
+generate float*_t, float64x*_t
+
+/// Subtract
+name = vsub
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
+validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+arm = vsub.
+aarch64 = sub
+fn = simd_sub
+generate int*_t, uint*_t, int64x*_t, uint64x*_t
+
+/// Subtract
+name = vsub
+fn = simd_sub
+a = 1.0, 4.0, 3.0, 8.0
+b = 1.0, 2.0, 3.0, 4.0
+validate 0.0, 2.0, 0.0, 4.0
+
+aarch64 = fsub
+generate float64x*_t
+
+arm = vsub.
+generate float*_t
+
+/// Subtract
+name = vsub
+multi_fn = a.wrapping_sub(b)
+a = 3
+b = 2
+validate 1
+
+aarch64 = nop
+generate i64, u64
+
+/// Add
+name = vadd
+multi_fn = a.wrapping_add(b)
+a = 1
+b = 2
+validate 3
+
+aarch64 = nop
+generate i64, u64
+
+/// Bitwise exclusive OR
+name = vadd
+multi_fn = simd_xor, a, b
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+validate 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17
+
+aarch64 = nop
+arm = nop
+generate poly8x8_t, poly16x4_t, poly8x16_t, poly16x8_t, poly64x1_t, poly64x2_t
+
+/// Bitwise exclusive OR
+name = vaddq
+no-q
+multi_fn = a ^ b
+a = 16
+b = 1
+validate 17
+
+aarch64 = nop
+arm = nop
+generate p128
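+// Note, illustrative only: polynomial addition over GF(2) is carry-less, i.e.
+// plain XOR, which is why these vadd entries use simd_xor / `a ^ b` and the
+// scalar test validates 16 ^ 1 == 17.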
+
+/// Floating-point add across vector
+name = vaddv
+a = 1., 2., 0., 0.
+validate 3.
+
+aarch64 = faddp
+link-aarch64 = faddv._EXT2_._EXT_
+generate float32x2_t:f32, float32x4_t:f32, float64x2_t:f64
+
+/// Signed Add Long across Vector
+name = vaddlv
+a = 1, 2, 3, 4
+validate 10
+
+aarch64 = saddlv
+link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_
+generate int16x4_t:i32
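+// Illustrative scalar equivalent, not spec input: every lane is widened before
+// the sum, so the narrow element type cannot overflow:
+//
+//     let sum: i32 = [1i16, 2, 3, 4].iter().map(|&x| x as i32).sum(); // 10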
+
+/// Signed Add Long across Vector
+name = vaddlv
+a = 1, 2, 3, 4, 5, 6, 7, 8
+validate 36
+
+aarch64 = saddlv
+link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_
+generate int16x8_t:i32
+
+/// Signed Add Long across Vector
+name = vaddlv
+a = 1, 2
+validate 3
+
+aarch64 = saddlp
+link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_
+generate int32x2_t:i64
+
+/// Signed Add Long across Vector
+name = vaddlv
+a = 1, 2, 3, 4
+validate 10
+
+aarch64 = saddlv
+link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_
+generate int32x4_t:i64
+
+/// Unsigned Add Long across Vector
+name = vaddlv
+a = 1, 2, 3, 4
+validate 10
+
+aarch64 = uaddlv
+link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_
+generate uint16x4_t:u32
+
+/// Unsigned Add Long across Vector
+name = vaddlv
+a = 1, 2, 3, 4, 5, 6, 7, 8
+validate 36
+
+aarch64 = uaddlv
+link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_
+generate uint16x8_t:u32
+
+/// Unsigned Add Long across Vector
+name = vaddlv
+a = 1, 2
+validate 3
+
+aarch64 = uaddlp
+link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_
+generate uint32x2_t:u64
+
+/// Unsigned Add Long across Vector
+name = vaddlv
+a = 1, 2, 3, 4
+validate 10
+
+aarch64 = uaddlv
+link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_
+generate uint32x4_t:u64
+
+/// Subtract returning high narrow
+name = vsubhn
+no-q
+multi_fn = fixed, c:in_t
+multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)}
+a = MAX, MIN, 1, 1, MAX, MIN, 1, 1
+b = 1, 0, 0, 0, 1, 0, 0, 0
+fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS
+validate MAX, MIN, 0, 0, MAX, MIN, 0, 0
+
+arm = vsubhn
+aarch64 = subhn
+generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
+generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
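+// Worked check, illustrative only: subhn keeps the high half of the wide
+// difference (HFBITS is half the element width). For the first i16 lane:
+//
+//     ((i16::MAX - 1) >> 8) as i8 == 127, i.e. i8::MAX
+//
+// and MIN - 0 narrows to i8::MIN, matching `validate`.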
+
+/// Subtract returning high narrow
+name = vsubhn_high
+no-q
+multi_fn = vsubhn-noqself-noext, d:in_t0, b, c
+multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len}
+a = MAX, 0, MAX, 0, MAX, 0, MAX, 0
+b = MAX, 1, MAX, 1, MAX, 1, MAX, 1
+c = 1, 0, 1, 0, 1, 0, 1, 0
+validate MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0
+
+arm = vsubhn
+aarch64 = subhn2
+generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t
+generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t
+
+/// Halving subtract
+name = vhsub
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
+validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
+
+arm = vhsub.s
+aarch64 = uhsub
+link-arm = vhsubu._EXT_
+link-aarch64 = uhsub._EXT_
+generate uint*_t
+
+arm = vhsub.s
+aarch64 = shsub
+link-arm = vhsubs._EXT_
+link-aarch64 = shsub._EXT_
+generate int*_t
+
+/// Signed Subtract Wide
+name = vsubw
+no-q
+multi_fn = simd_sub, a, {simd_cast, b}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
+validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+arm = vsubw
+aarch64 = ssubw
+generate int16x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int64x2_t
+
+/// Unsigned Subtract Wide
+name = vsubw
+no-q
+multi_fn = simd_sub, a, {simd_cast, b}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
+validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+arm = vsubw
+aarch64 = usubw
+generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint64x2_t
+
+/// Signed Subtract Wide
+name = vsubw_high
+no-q
+multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_sub, a, {simd_cast, c}
+a = 8, 9, 10, 12, 13, 14, 15, 16
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
+validate 0, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = ssubw
+generate int16x8_t:int8x16_t:int16x8_t
+
+/// Signed Subtract Wide
+name = vsubw_high
+no-q
+multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_sub, a, {simd_cast, c}
+a = 8, 9, 10, 11
+b = 0, 1, 2, 3, 8, 9, 10, 11
+validate 0, 0, 0, 0
+
+aarch64 = ssubw
+generate int32x4_t:int16x8_t:int32x4_t
+
+/// Signed Subtract Wide
+name = vsubw_high
+no-q
+multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3]
+multi_fn = simd_sub, a, {simd_cast, c}
+a = 8, 9
+b = 6, 7, 8, 9
+validate 0, 0
+
+aarch64 = ssubw
+generate int64x2_t:int32x4_t:int64x2_t
+
+/// Unsigned Subtract Wide
+name = vsubw_high
+no-q
+multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_sub, a, {simd_cast, c}
+a = 8, 9, 10, 11, 12, 13, 14, 15
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 0, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = usubw
+generate uint16x8_t:uint8x16_t:uint16x8_t
+
+/// Unsigned Subtract Wide
+name = vsubw_high
+no-q
+multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_sub, a, {simd_cast, c}
+a = 8, 9, 10, 11
+b = 0, 1, 2, 3, 8, 9, 10, 11
+validate 0, 0, 0, 0
+
+aarch64 = usubw
+generate uint32x4_t:uint16x8_t:uint32x4_t
+
+/// Unsigned Subtract Wide
+name = vsubw_high
+no-q
+multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_sub, a, {simd_cast, c}
+a = 8, 9
+b = 6, 7, 8, 9
+validate 0, 0
+
+aarch64 = usubw
+generate uint64x2_t:uint32x4_t:uint64x2_t
+
+/// Signed Subtract Long
+name = vsubl
+no-q
+multi_fn = simd_cast, c:out_t, a
+multi_fn = simd_cast, d:out_t, b
+multi_fn = simd_sub, c, d
+
+a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+arm = vsubl
+aarch64 = ssubl
+generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
+
+/// Unsigned Subtract Long
+name = vsubl
+no-q
+multi_fn = simd_cast, c:out_t, a
+multi_fn = simd_cast, d:out_t, b
+multi_fn = simd_sub, c, d
+
+a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+arm = vsubl
+aarch64 = usubl
+generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
+
+/// Signed Subtract Long
+name = vsubl_high
+no-q
+multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_cast, d:out_t, c
+multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_cast, f:out_t, e
+multi_fn = simd_sub, d, f
+
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
+validate 6, 7, 8, 9, 10, 11, 12, 13
+
+aarch64 = ssubl
+generate int8x16_t:int8x16_t:int16x8_t
+
+/// Signed Subtract Long
+name = vsubl_high
+no-q
+multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_cast, d:out_t, c
+multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_cast, f:out_t, e
+multi_fn = simd_sub, d, f
+
+a = 8, 9, 10, 11, 12, 13, 14, 15
+b = 6, 6, 6, 6, 8, 8, 8, 8
+validate 4, 5, 6, 7
+
+aarch64 = ssubl
+generate int16x8_t:int16x8_t:int32x4_t
+
+/// Signed Subtract Long
+name = vsubl_high
+no-q
+multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
+multi_fn = simd_cast, d:out_t, c
+multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3]
+multi_fn = simd_cast, f:out_t, e
+multi_fn = simd_sub, d, f
+
+a = 12, 13, 14, 15
+b = 6, 6, 8, 8
+validate 6, 7
+
+aarch64 = ssubl
+generate int32x4_t:int32x4_t:int64x2_t
+
+/// Unsigned Subtract Long
+name = vsubl_high
+no-q
+multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_cast, d:out_t, c
+multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_cast, f:out_t, e
+multi_fn = simd_sub, d, f
+
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
+validate 6, 7, 8, 9, 10, 11, 12, 13
+
+aarch64 = usubl
+generate uint8x16_t:uint8x16_t:uint16x8_t
+
+/// Unsigned Subtract Long
+name = vsubl_high
+no-q
+multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_cast, d:out_t, c
+multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_cast, f:out_t, e
+multi_fn = simd_sub, d, f
+
+a = 8, 9, 10, 11, 12, 13, 14, 15
+b = 6, 6, 6, 6, 8, 8, 8, 8
+validate 4, 5, 6, 7
+
+aarch64 = usubl
+generate uint16x8_t:uint16x8_t:uint32x4_t
+
+/// Unsigned Subtract Long
+name = vsubl_high
+no-q
+multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
+multi_fn = simd_cast, d:out_t, c
+multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_cast, f:out_t, e
+multi_fn = simd_sub, d, f
+
+a = 12, 13, 14, 15
+b = 6, 6, 8, 8
+validate 6, 7
+
+aarch64 = usubl
+generate uint32x4_t:uint32x4_t:uint64x2_t
+
+/// Bit clear and exclusive OR
+name = vbcax
+a = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0
+b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+c = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+validate 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
+target = sha3
+
+aarch64 = bcax
+link-aarch64 = llvm.aarch64.crypto.bcaxs._EXT_
+generate int8x16_t, int16x8_t, int32x4_t, int64x2_t
+link-aarch64 = llvm.aarch64.crypto.bcaxu._EXT_
+generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t
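+// Worked check, illustrative only: BCAX computes a ^ (b & !c). With c = 1 in
+// every lane, bit 0 of b is cleared first, e.g. lane 2 above:
+// 1 ^ (2 & !1) == 1 ^ 2 == 3, matching `validate`.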
+
+/// Floating-point complex add
+name = vcadd_rot270
+no-q
+a = 1., -1., 1., -1.
+b = -1., 1., -1., 1.
+validate 2., 0., 2., 0.
+target = fcma
+
+aarch64 = fcadd
+link-aarch64 = vcadd.rot270._EXT_
+generate float32x2_t
+name = vcaddq_rot270
+generate float32x4_t, float64x2_t
+
+/// Floating-point complex add
+name = vcadd_rot90
+no-q
+a = 1., -1., 1., -1.
+b = -1., 1., -1., 1.
+validate 0., -2., 0., -2.
+target = fcma
+
+aarch64 = fcadd
+link-aarch64 = vcadd.rot90._EXT_
+generate float32x2_t
+name = vcaddq_rot90
+generate float32x4_t, float64x2_t
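+// Note, illustrative only: lanes hold interleaved complex numbers (re, im) and
+// FCADD rotates the second operand before adding:
+//
+//     rot90:  (a_re - b_im, a_im + b_re) == (1. - 1., -1. + -1.) == (0., -2.)
+//     rot270: (a_re + b_im, a_im - b_re) == (1. + 1., -1. - -1.) == (2.,  0.)
+//
+// matching the two `validate` lines above.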
+
+/// Floating-point complex multiply accumulate
+name = vcmla
+a = 1., -1., 1., -1.
+b = -1., 1., -1., 1.
+c = 1., 1., -1., -1.
+validate 0., -2., 2., 0.
+target = fcma
+
+aarch64 = fcmla
+link-aarch64 = vcmla.rot0._EXT_
+generate float32x2_t, float32x4_t, float64x2_t
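+// Worked check, illustrative only: FCMLA with rotation 0 accumulates the
+// partial product formed from the real part of b, i.e.
+// (a_re + b_re * c_re, a_im + b_re * c_im); the first complex lane above is
+// (1. + -1. * 1., -1. + -1. * 1.) == (0., -2.), matching `validate`.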
+
+/// Floating-point complex multiply accumulate
+name = vcmla_rot90
+rot-suffix
+a = 1., 1., 1., 1.
+b = 1., -1., 1., -1.
+c = 1., 1., 1., 1.
+validate 2., 0., 2., 0.
+target = fcma
+
+aarch64 = fcmla
+link-aarch64 = vcmla.rot90._EXT_
+generate float32x2_t, float32x4_t, float64x2_t
+
+/// Floating-point complex multiply accumulate
+name = vcmla_rot180
+rot-suffix
+a = 1., 1., 1., 1.
+b = 1., -1., 1., -1.
+c = 1., 1., 1., 1.
+validate 0., 0., 0., 0.
+target = fcma
+
+aarch64 = fcmla
+link-aarch64 = vcmla.rot180._EXT_
+generate float32x2_t, float32x4_t, float64x2_t
+
+/// Floating-point complex multiply accumulate
+name = vcmla_rot270
+rot-suffix
+a = 1., 1., 1., 1.
+b = 1., -1., 1., -1.
+c = 1., 1., 1., 1.
+validate 0., 2., 0., 2.
+target = fcma
+
+aarch64 = fcmla
+link-aarch64 = vcmla.rot270._EXT_
+generate float32x2_t, float32x4_t, float64x2_t
+
+/// Floating-point complex multiply accumulate
+name = vcmla
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_rot-LANE
+multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE}
+multi_fn = vcmla-self-noext, a, b, c
+a = 1., -1., 1., -1.
+b = -1., 1., -1., 1.
+c = 1., 1., -1., -1.
+n = 0
+validate 0., -2., 0., -2.
+target = fcma
+
+aarch64 = fcmla
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t
+generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
+/// Floating-point complex multiply accumulate
+name = vcmla_rot90
+rot-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_rot-LANE
+multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE}
+multi_fn = vcmla_rot90-rot-noext, a, b, c
+a = 1., -1., 1., -1.
+b = -1., 1., -1., 1.
+c = 1., 1., -1., -1.
+n = 0
+validate 0., 0., 0., 0.
+target = fcma
+
+aarch64 = fcmla
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t
+generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
+/// Floating-point complex multiply accumulate
+name = vcmla_rot180
+rot-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_rot-LANE
+multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE}
+multi_fn = vcmla_rot180-rot-noext, a, b, c
+a = 1., -1., 1., -1.
+b = -1., 1., -1., 1.
+c = 1., 1., -1., -1.
+n = 0
+validate 2., 0., 2., 0.
+target = fcma
+
+aarch64 = fcmla
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t
+generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
+/// Floating-point complex multiply accumulate
+name = vcmla_rot270
+rot-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_rot-LANE
+multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {base-2-LANE}
+multi_fn = vcmla_rot270-rot-noext, a, b, c
+a = 1., -1., 1., -1.
+b = -1., 1., -1., 1.
+c = 1., 1., -1., -1.
+n = 0
+validate 2., -2., 2., -2.
+target = fcma
+
+aarch64 = fcmla
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t
+generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
+/// Dot product arithmetic
+name = vdot
+out-suffix
+a = 1, 2, 1, 2
+b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+validate 31, 176, 31, 176
+target = dotprod
+
+aarch64 = sdot
+link-aarch64 = sdot._EXT_._EXT3_
+generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
+
+aarch64 = udot
+link-aarch64 = udot._EXT_._EXT3_
+generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
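+// Worked check, illustrative only: each 32-bit lane accumulates a 4-way dot
+// product of its 8-bit sub-vectors:
+//
+//     1 + (1*1 + 2*2 + 3*3 + 4*4) == 31
+//     2 + (5*5 + 6*6 + 7*7 + 8*8) == 176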
+
+/// Dot product arithmetic
+name = vdot
+out-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_dot-LANE
+multi_fn = simd_shuffle-in_len-!, c:in_t, c, c, {base-4-LANE}
+multi_fn = vdot-out-noext, a, b, c
+a = 1, 2, 1, 2
+b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+n = 0
+validate 31, 72, 31, 72
+target = dotprod
+
+aarch64 = sdot
+generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t
+generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
+
+aarch64 = udot
+generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
+generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
+
+/// Maximum (vector)
+name = vmax
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+validate 16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16
+
+arm = vmax
+aarch64 = smax
+link-arm = vmaxs._EXT_
+link-aarch64 = smax._EXT_
+generate int*_t
+
+arm = vmax
+aarch64 = umax
+link-arm = vmaxu._EXT_
+link-aarch64 = umax._EXT_
+generate uint*_t
+
+/// Maximum (vector)
+name = vmax
+a = 1.0, -2.0, 3.0, -4.0
+b = 0.0, 3.0, 2.0, 8.0
+validate 1.0, 3.0, 3.0, 8.0
+
+aarch64 = fmax
+link-aarch64 = fmax._EXT_
+generate float64x*_t
+
+arm = vmax
+aarch64 = fmax
+link-arm = vmaxs._EXT_
+link-aarch64 = fmax._EXT_
+generate float*_t
+
+/// Floating-point Maximum Number (vector)
+name = vmaxnm
+a = 1.0, 2.0, 3.0, -4.0
+b = 8.0, 16.0, -1.0, 6.0
+validate 8.0, 16.0, 3.0, 6.0
+
+aarch64 = fmaxnm
+link-aarch64 = fmaxnm._EXT_
+generate float64x*_t
+
+target = fp-armv8
+arm = vmaxnm
+aarch64 = fmaxnm
+link-arm = vmaxnm._EXT_
+link-aarch64 = fmaxnm._EXT_
+generate float*_t
+
+/// Floating-point maximum number across vector
+name = vmaxnmv
+a = 1., 2., 0., 1.
+validate 2.
+
+aarch64 = fmaxnmp
+link-aarch64 = fmaxnmv._EXT2_._EXT_
+generate float32x2_t:f32, float64x2_t:f64
+aarch64 = fmaxnmv
+generate float32x4_t:f32
+
+/// Floating-point Maximum Number Pairwise (vector)
+name = vpmaxnm
+a = 1.0, 2.0
+b = 6.0, -3.0
+validate 2.0, 6.0
+aarch64 = fmaxnmp
+link-aarch64 = fmaxnmp._EXT_
+generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t
+
+/// Floating-point Maximum Number Pairwise (vector)
+name = vpmaxnm
+a = 1.0, 2.0, 3.0, -4.0
+b = 8.0, 16.0, -1.0, 6.0
+validate 2.0, 3.0, 16.0, 6.0
+aarch64 = fmaxnmp
+link-aarch64 = fmaxnmp._EXT_
+generate float32x4_t:float32x4_t:float32x4_t
+
+/// Floating-point maximum number pairwise
+name = vpmaxnm
+out-suffix
+a = 1., 2.
+validate 2.
+
+aarch64 = fmaxnmp
+link-aarch64 = fmaxnmv._EXT2_._EXT_
+generate float32x2_t:f32
+name = vpmaxnmq
+generate float64x2_t:f64
+
+/// Floating-point maximum pairwise
+name = vpmax
+out-suffix
+a = 1., 2.
+validate 2.
+
+aarch64 = fmaxp
+link-aarch64 = fmaxv._EXT2_._EXT_
+generate float32x2_t:f32
+name = vpmaxq
+generate float64x2_t:f64
+
+/// Minimum (vector)
+name = vmin
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+validate 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1
+
+arm = vmin
+aarch64 = smin
+link-arm = vmins._EXT_
+link-aarch64 = smin._EXT_
+generate int*_t
+
+arm = vmin
+aarch64 = umin
+link-arm = vminu._EXT_
+link-aarch64 = umin._EXT_
+generate uint*_t
+
+/// Minimum (vector)
+name = vmin
+a = 1.0, -2.0, 3.0, -4.0
+b = 0.0, 3.0, 2.0, 8.0
+validate 0.0, -2.0, 2.0, -4.0
+
+aarch64 = fmin
+link-aarch64 = fmin._EXT_
+generate float64x*_t
+
+arm = vmin
+aarch64 = fmin
+link-arm = vmins._EXT_
+link-aarch64 = fmin._EXT_
+generate float*_t
+
+/// Floating-point Minimum Number (vector)
+name = vminnm
+a = 1.0, 2.0, 3.0, -4.0
+b = 8.0, 16.0, -1.0, 6.0
+validate 1.0, 2.0, -1.0, -4.0
+
+aarch64 = fminnm
+link-aarch64 = fminnm._EXT_
+generate float64x*_t
+
+target = fp-armv8
+arm = vminnm
+aarch64 = fminnm
+link-arm = vminnm._EXT_
+link-aarch64 = fminnm._EXT_
+generate float*_t
+
+/// Floating-point minimum number across vector
+name = vminnmv
+a = 1., 0., 2., 3.
+validate 0.
+
+aarch64 = fminnmp
+link-aarch64 = fminnmv._EXT2_._EXT_
+generate float32x2_t:f32, float64x2_t:f64
+aarch64 = fminnmv
+generate float32x4_t:f32
+
+/// Vector move long (high half)
+name = vmovl_high
+no-q
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen}
+multi_fn = vmovl-noqself-noext, a
+a = 1, 2, 3, 4, 3, 4, 5, 6, 3, 4, 5, 6, 7, 8, 9, 10
+validate 3, 4, 5, 6, 7, 8, 9, 10
+
+aarch64 = sxtl2
+generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t
+
+aarch64 = uxtl2
+generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t
+
+/// Floating-point add pairwise
+name = vpadd
+a = 1., 2., 3., 4.
+b = 3., 4., 5., 6.
+validate 3., 7., 7., 11.
+
+aarch64 = faddp
+link-aarch64 = faddp._EXT_
+generate float32x4_t, float64x2_t
+
+arm = vpadd
+link-arm = vpadd._EXT_
+generate float32x2_t
+
+/// Floating-point add pairwise
+name = vpadd
+out-suffix
+multi_fn = simd_extract, a1:out_t, a, 0
+multi_fn = simd_extract, a2:out_t, a, 1
+multi_fn = a1 + a2
+a = 1., 2.
+validate 3.
+
+aarch64 = nop
+generate float32x2_t:f32, float64x2_t:f64
+
+/// Floating-point Minimum Number Pairwise (vector)
+name = vpminnm
+a = 1.0, 2.0
+b = 6.0, -3.0
+validate 1.0, -3.0
+
+aarch64 = fminnmp
+link-aarch64 = fminnmp._EXT_
+generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t
+
+/// Floating-point Minimum Number Pairwise (vector)
+name = vpminnm
+a = 1.0, 2.0, 3.0, -4.0
+b = 8.0, 16.0, -1.0, 6.0
+validate 1.0, -4.0, 8.0, -1.0
+aarch64 = fminnmp
+link-aarch64 = fminnmp._EXT_
+generate float32x4_t:float32x4_t:float32x4_t
+
+/// Floating-point minimum number pairwise
+name = vpminnm
+out-suffix
+a = 1., 2.
+validate 1.
+
+aarch64 = fminnmp
+link-aarch64 = fminnmv._EXT2_._EXT_
+generate float32x2_t:f32
+name = vpminnmq
+generate float64x2_t:f64
+
+/// Floating-point minimum pairwise
+name = vpmin
+out-suffix
+a = 1., 2.
+validate 1.
+
+aarch64 = fminp
+link-aarch64 = fminv._EXT2_._EXT_
+generate float32x2_t:f32
+name = vpminq
+generate float64x2_t:f64
+
+/// Signed saturating doubling multiply long
+name = vqdmull
+a = 0, 1, 2, 3, 4, 5, 6, 7
+b = 1, 2, 3, 4, 5, 6, 7, 8
+validate 0, 4, 12, 24, 40, 60, 84, 108
+
+aarch64 = sqdmull
+link-aarch64 = sqdmull._EXT2_
+arm = vqdmull
+link-arm = vqdmull._EXT2_
+generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
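+// Illustrative scalar equivalent, not spec input (hypothetical helper), for
+// the i16 -> i32 lanes:
+//
+//     fn qdmull16(a: i16, b: i16) -> i32 {
+//         let wide = 2i64 * a as i64 * b as i64; // exceeds i32 range only for
+//                                                // a == b == i16::MIN
+//         wide.clamp(i32::MIN as i64, i32::MAX as i64) as i32
+//     }
+//
+// e.g. qdmull16(3, 4) == 24, the fourth validated lane above.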
+
+/// Signed saturating doubling multiply long
+name = vqdmull
+multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = simd_extract, {vqdmull-in_ntt-noext, a, b}, 0
+a = 2
+b = 3
+validate 12
+
+aarch64 = sqdmull
+generate i16:i16:i32
+
+/// Signed saturating doubling multiply long
+name = vqdmull
+a = 2
+b = 3
+validate 12
+
+aarch64 = sqdmull
+link-aarch64 = sqdmulls.scalar
+generate i32:i32:i64
+
+/// Vector saturating doubling long multiply with scalar
+name = vqdmull_n
+no-q
+multi_fn = vqdmull-in_ntt-noext, a, {vdup_n-in_ntt-noext, b}
+a = 2, 4, 6, 8
+b = 2
+validate 8, 16, 24, 32
+
+aarch64 = sqdmull
+arm = vqdmull
+generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
+
+/// Signed saturating doubling multiply long
+name = vqdmull_high
+no-q
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen}
+multi_fn = vqdmull-noqself-noext, a, b
+a = 0, 1, 4, 5, 4, 5, 6, 7
+b = 1, 2, 5, 6, 5, 6, 7, 8
+validate 40, 60, 84, 112
+
+aarch64 = sqdmull2
+generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
+
+/// Signed saturating doubling multiply long
+name = vqdmull_high_n
+no-q
+multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len}
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = vqdmull-in_ntt-noext, a, b
+a = 0, 2, 8, 10, 8, 10, 12, 14
+b = 2
+validate 32, 40, 48, 56
+
+aarch64 = sqdmull2
+generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
+
+/// Vector saturating doubling long multiply by scalar
+name = vqdmull_lane
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32}
+multi_fn = vqdmull-noqself-noext, a, b
+a = 1, 2, 3, 4
+b = 0, 2, 2, 0, 2, 0, 0, 0
+n = HFLEN
+validate 4, 8, 12, 16
+
+aarch64 = sqdmull
+generate int16x4_t:int16x8_t:int32x4_t, int32x2_t:int32x4_t:int64x2_t
+
+arm = vqdmull
+generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
+
+/// Signed saturating doubling multiply long
+name = vqdmullh_lane
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_extract, b:in_t0, b, N as u32
+multi_fn = vqdmullh-noqself-noext, a, b
+a = 2
+b = 0, 2, 2, 0, 2, 0, 0, 0
+n = HFLEN
+validate 8
+
+aarch64 = sqdmull
+generate i16:int16x4_t:i32, i16:int16x8_t:i32
+
+/// Signed saturating doubling multiply long
+name = vqdmulls_lane
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_extract, b:in_t0, b, N as u32
+multi_fn = vqdmulls-noqself-noext, a, b
+a = 2
+b = 0, 2, 2, 0, 2, 0, 0, 0
+n = HFLEN
+validate 8
+
+aarch64 = sqdmull
+generate i32:int32x2_t:i64, i32:int32x4_t:i64
+
+/// Signed saturating doubling multiply long
+name = vqdmull_high_lane
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len}
+multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32}
+multi_fn = vqdmull-self-noext, a, b
+a = 0, 1, 4, 5, 4, 5, 6, 7
+b = 0, 2, 2, 0, 2, 0, 0, 0
+n = HFLEN
+validate 16, 20, 24, 28
+
+aarch64 = sqdmull2
+generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t
+
+/// Signed saturating doubling multiply long
+name = vqdmull_high_lane
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32}
+multi_fn = vqdmull-noqself-noext, a, b
+a = 0, 1, 4, 5, 4, 5, 6, 7
+b = 0, 2, 2, 0, 2, 0, 0, 0
+n = HFLEN
+validate 16, 20, 24, 28
+
+aarch64 = sqdmull2
+generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
+
+/// Signed saturating doubling multiply-add long
+name = vqdmlal
+multi_fn = vqadd-out-noext, a, {vqdmull-self-noext, b, c}
+a = 1, 1, 1, 1
+b = 1, 2, 3, 4
+c = 2, 2, 2, 2
+validate 5, 9, 13, 17
+
+aarch64 = sqdmlal
+arm = vqdmlal
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
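+// Worked check, illustrative only: vqdmlal saturating-adds the doubled product
+// to the accumulator, so lane 0 above is 1 + 2*1*2 == 5, matching `validate`.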
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+name = vqdmlal
+n-suffix
+multi_fn = vqadd-out-noext, a, {vqdmull_n-self-noext, b, c}
+a = 1, 1, 1, 1
+b = 1, 2, 3, 4
+c = 2
+validate 5, 9, 13, 17
+
+aarch64 = sqdmlal
+arm = vqdmlal
+generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
+
+/// Signed saturating doubling multiply-add long
+name = vqdmlal_high
+no-q
+multi_fn = vqadd-out-noext, a, {vqdmull_high-noqself-noext, b, c}
+a = 1, 2, 3, 4
+b = 0, 1, 4, 5, 4, 5, 6, 7
+c = 1, 2, 5, 6, 5, 6, 7, 8
+validate 41, 62, 87, 116
+
+aarch64 = sqdmlal2
+generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+
+/// Signed saturating doubling multiply-add long
+name = vqdmlal_high_n
+no-q
+multi_fn = vqadd-out-noext, a, {vqdmull_high_n-noqself-noext, b, c}
+a = 1, 2, 3, 4
+b = 0, 2, 8, 10, 8, 10, 12, 14
+c = 2
+validate 33, 42, 51, 60
+
+aarch64 = sqdmlal2
+generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+name = vqdmlal_lane
+in2-suffix
+constn = N
+multi_fn = static_assert_imm-in2_exp_len-N
+multi_fn = vqadd-out-noext, a, {vqdmull_lane-in2-::<N>, b, c}
+a = 1, 2, 3, 4
+b = 1, 2, 3, 4
+c = 0, 2, 2, 0, 2, 0, 0, 0
+n = HFLEN
+validate 5, 10, 15, 20
+
+aarch64 = sqdmlal
+generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
+
+arm = vqdmlal
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
+
+/// Signed saturating doubling multiply-add long
+name = vqdmlal_high_lane
+in2-suffix
+constn = N
+multi_fn = static_assert_imm-in2_exp_len-N
+multi_fn = vqadd-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c}
+a = 1, 2, 3, 4
+b = 0, 1, 4, 5, 4, 5, 6, 7
+c = 0, 2, 0, 0, 0, 0, 0, 0
+n = 1
+validate 17, 22, 27, 32
+
+aarch64 = sqdmlal2
+generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+
+/// Signed saturating doubling multiply-add long
+name = vqdmlal
+multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c}
+multi_fn = vqadd-out-noext, a, {simd_extract, x, 0}
+a = 1
+b = 1
+c = 2
+validate 5
+
+aarch64 = sqdmull
+generate i32:i16:i16:i32, i64:i32:i32:i64
+
+/// Signed saturating doubling multiply-add long
+name = vqdmlalh_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vqdmlal-self-noext, a, b, {simd_extract, c, LANE as u32}
+a = 1
+b = 1
+c = 2, 1, 1, 1, 1, 1, 1, 1
+n = 0
+validate 5
+
+aarch64 = sqdmlal
+generate i32:i16:int16x4_t:i32, i32:i16:int16x8_t:i32
+name = vqdmlals_lane
+aarch64 = sqdmull
+generate i64:i32:int32x2_t:i64, i64:i32:int32x4_t:i64
+
+/// Signed saturating doubling multiply-subtract long
+name = vqdmlsl
+multi_fn = vqsub-out-noext, a, {vqdmull-self-noext, b, c}
+a = 3, 7, 11, 15
+b = 1, 2, 3, 4
+c = 2, 2, 2, 2
+validate -1, -1, -1, -1
+
+aarch64 = sqdmlsl
+arm = vqdmlsl
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
+
+/// Vector widening saturating doubling multiply subtract with scalar
+name = vqdmlsl
+n-suffix
+multi_fn = vqsub-out-noext, a, {vqdmull_n-self-noext, b, c}
+a = 3, 7, 11, 15
+b = 1, 2, 3, 4
+c = 2
+validate -1, -1, -1, -1
+
+aarch64 = sqdmlsl
+arm = vqdmlsl
+generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
+
+/// Signed saturating doubling multiply-subtract long
+name = vqdmlsl_high
+no-q
+multi_fn = vqsub-out-noext, a, {vqdmull_high-noqself-noext, b, c}
+a = 39, 58, 81, 108
+b = 0, 1, 4, 5, 4, 5, 6, 7
+c = 1, 2, 5, 6, 5, 6, 7, 8
+validate -1, -2, -3, -4
+
+aarch64 = sqdmlsl2
+generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+
+/// Signed saturating doubling multiply-subtract long
+name = vqdmlsl_high_n
+no-q
+multi_fn = vqsub-out-noext, a, {vqdmull_high_n-noqself-noext, b, c}
+a = 31, 38, 45, 52
+b = 0, 2, 8, 10, 8, 10, 12, 14
+c = 2
+validate -1, -2, -3, -4
+
+aarch64 = sqdmlsl2
+generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
+
+/// Vector widening saturating doubling multiply subtract with scalar
+name = vqdmlsl_lane
+in2-suffix
+constn = N
+multi_fn = static_assert_imm-in2_exp_len-N
+multi_fn = vqsub-out-noext, a, {vqdmull_lane-in2-::<N>, b, c}
+a = 3, 6, 9, 12
+b = 1, 2, 3, 4
+c = 0, 2, 2, 0, 2, 0, 0, 0
+n = HFLEN
+validate -1, -2, -3, -4
+
+aarch64 = sqdmlsl
+generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
+
+arm = vqdmlsl
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
+
+/// Signed saturating doubling multiply-subtract long
+name = vqdmlsl_high_lane
+in2-suffix
+constn = N
+multi_fn = static_assert_imm-in2_exp_len-N
+multi_fn = vqsub-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c}
+a = 15, 18, 21, 24
+b = 0, 1, 4, 5, 4, 5, 6, 7
+c = 0, 2, 0, 0, 0, 0, 0, 0
+n = 1
+validate -1, -2, -3, -4
+
+aarch64 = sqdmlsl2
+generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+
+/// Signed saturating doubling multiply-subtract long
+name = vqdmlsl
+multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c}
+multi_fn = vqsub-out-noext, a, {simd_extract, x, 0}
+a = 10
+b = 1
+c = 2
+validate 6
+
+aarch64 = sqdmull
+generate i32:i16:i16:i32, i64:i32:i32:i64
+
+/// Signed saturating doubling multiply-subtract long
+name = vqdmlslh_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vqdmlsl-self-noext, a, b, {simd_extract, c, LANE as u32}
+a = 10
+b = 1
+c = 2, 1, 1, 1, 1, 1, 1, 1
+n = 0
+validate 6
+
+aarch64 = sqdmlsl
+generate i32:i16:int16x4_t:i32, i32:i16:int16x8_t:i32
+name = vqdmlsls_lane
+aarch64 = sqdmull
+generate i64:i32:int32x2_t:i64, i64:i32:int32x4_t:i64
+
+/// Signed saturating doubling multiply returning high half
+name = vqdmulh
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+b = 2, 2, 2, 2, 2, 2, 2, 2
+validate 1, 1, 1, 1, 1, 1, 1, 1
+
+aarch64 = sqdmulh
+link-aarch64 = sqdmulh._EXT_
+arm = vqdmulh
+link-arm = vqdmulh._EXT_
+generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
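+// Illustrative scalar equivalent, not spec input (hypothetical helper): double
+// the product and keep the high half, saturating the one overflow case
+// (a == b == i16::MIN):
+//
+//     fn qdmulh16(a: i16, b: i16) -> i16 {
+//         let wide = 2i64 * a as i64 * b as i64;
+//         (wide >> 16).clamp(i16::MIN as i64, i16::MAX as i64) as i16
+//     }
+//
+// qdmulh16(i16::MAX, 2) == 1, matching `validate`.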
+
+/// Signed saturating doubling multiply returning high half
+name = vqdmulh
+multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = simd_extract, {vqdmulh-in_ntt-noext, a, b}, 0
+a = 1
+b = 2
+validate 0
+
+aarch64 = sqdmulh
+generate i16, i32
+
+/// Vector saturating doubling multiply high with scalar
+name = vqdmulh_n
+out-suffix
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = vqdmulh-out-noext, a, b
+a = MAX, MAX, MAX, MAX
+b = 2
+validate 1, 1, 1, 1
+
+aarch64 = sqdmulh
+arm = vqdmulh
+generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t
+
+/// Vector saturating doubling multiply high with scalar
+name = vqdmulhq_n
+no-q
+multi_fn = vdupq_n-in_ntt-noext, b:out_t, b
+multi_fn = vqdmulh-out-noext, a, b
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+b = 2
+validate 1, 1, 1, 1, 1, 1, 1, 1
+
+aarch64 = sqdmulh
+arm = vqdmulh
+generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t
+
+/// Signed saturating doubling multiply returning high half
+name = vqdmulhh_lane
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_extract, b:in_t0, b, N as u32
+multi_fn = vqdmulhh-out_ntt-noext, a, b
+a = 2
+b = 0, 0, MAX, 0, 0, 0, 0, 0
+n = 2
+validate 1
+
+aarch64 = sqdmulh
+generate i16:int16x4_t:i16, i16:int16x8_t:i16
+
+/// Signed saturating doubling multiply returning high half
+name = vqdmulhs_lane
+constn = N
+multi_fn = static_assert_imm-in_exp_len-N
+multi_fn = simd_extract, b:in_t0, b, N as u32
+multi_fn = vqdmulhs-out_ntt-noext, a, b
+a = 2
+b = 0, MAX, 0, 0
+n = 1
+validate 1
+
+aarch64 = sqdmulh
+generate i32:int32x2_t:i32, i32:int32x4_t:i32
+
+/// Vector saturating doubling multiply high by scalar
+name = vqdmulh
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vqdmulh-out-noext, a, {vdup-nout-noext, {simd_extract, b, LANE as u32}}
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+b = 2, 1, 1, 1, 1, 1, 1, 1
+n = 0
+validate 1, 1, 1, 1, 1, 1, 1, 1
+
+aarch64 = sqdmulh
+generate int16x4_t, int16x8_t:int16x4_t:int16x8_t
+generate int32x2_t, int32x4_t:int32x2_t:int32x4_t
+arm = vqdmulh
+generate int16x8_t, int16x4_t:int16x8_t:int16x4_t
+generate int32x4_t, int32x2_t:int32x4_t:int32x2_t
+
+/// Signed saturating extract narrow
+name = vqmovn
+no-q
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+
+aarch64 = sqxtn
+link-aarch64 = sqxtn._EXT2_
+arm = vqmovn
+link-arm = vqmovns._EXT2_
+generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
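+// Illustrative scalar equivalent, not spec input: qmovn clamps to the narrow
+// type's range before truncating, e.g. i16 -> i8:
+//
+//     let narrowed = i16::MAX.clamp(i8::MIN as i16, i8::MAX as i16) as i8; // 127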
+
+/// Unsigned saturating extract narrow
+name = vqmovn
+no-q
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+
+aarch64 = uqxtn
+link-aarch64 = uqxtn._EXT2_
+arm = vqmovn
+link-arm = vqmovnu._EXT2_
+generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
+
+/// Saturating extract narrow
+name = vqmovn
+multi_fn = simd_extract, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0
+a = 1
+validate 1
+
+aarch64 = sqxtn
+generate i16:i8, i32:i16
+aarch64 = uqxtn
+generate u16:u8, u32:u16
+
+/// Saturating extract narrow
+name = vqmovn
+a = 1
+validate 1
+
+aarch64 = sqxtn
+link-aarch64 = scalar.sqxtn._EXT2_._EXT_
+generate i64:i32
+
+aarch64 = uqxtn
+link-aarch64 = scalar.uqxtn._EXT2_._EXT_
+generate u64:u32
+
+/// Saturating extract narrow
+name = vqmovn_high
+no-q
+multi_fn = simd_shuffle-out_len-!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len}
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+
+aarch64 = sqxtn2
+generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
+aarch64 = uqxtn2
+generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
+
+/// Signed saturating extract unsigned narrow
+name = vqmovun
+no-q
+a = -1, -1, -1, -1, -1, -1, -1, -1
+validate 0, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = sqxtun
+link-aarch64 = sqxtun._EXT2_
+arm = vqmovun
+link-arm = vqmovnsu._EXT2_
+generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t
+
+/// Signed saturating extract unsigned narrow
+name = vqmovun
+multi_fn = simd_extract, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0
+a = 1
+validate 1
+
+aarch64 = sqxtun
+generate i16:u8, i32:u16, i64:u32
+
+/// Signed saturating extract unsigned narrow
+name = vqmovun_high
+no-q
+multi_fn = simd_shuffle-out_len-!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len}
+a = 0, 0, 0, 0, 0, 0, 0, 0
+b = -1, -1, -1, -1, -1, -1, -1, -1
+validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = sqxtun2
+generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t
+
+/// Signed saturating rounding doubling multiply returning high half
+name = vqrdmulh
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+b = 2, 2, 2, 2, 2, 2, 2, 2
+validate 2, 2, 2, 2, 2, 2, 2, 2
+
+aarch64 = sqrdmulh
+link-aarch64 = sqrdmulh._EXT_
+arm = vqrdmulh
+link-arm = vqrdmulh._EXT_
+generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
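+// Worked check, illustrative only: vqrdmulh adds a rounding constant of
+// 1 << 15 before taking the high half, hence for the lanes above:
+//
+//     (2 * 32767 * 2 + (1 << 15)) >> 16 == 2   // vqdmulh would give 1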
+
+/// Signed saturating rounding doubling multiply returning high half
+name = vqrdmulh
+multi_fn = simd_extract, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 1
+b = 2
+validate 0
+
+aarch64 = sqrdmulh
+generate i16, i32
+
+/// Vector saturating rounding doubling multiply high with scalar
+name = vqrdmulh
+out-n-suffix
+multi_fn = vqrdmulh-out-noext, a, {vdup-nout-noext, b}
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+b = 2
+validate 2, 2, 2, 2, 2, 2, 2, 2
+
+aarch64 = sqrdmulh
+arm = vqrdmulh
+generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t
+
+/// Vector rounding saturating doubling multiply high by scalar
+name = vqrdmulh
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32}
+multi_fn = vqrdmulh-out-noext, a, b
+a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+b = 0, 2, 0, 0, 0, 0, 0, 0
+n = 1
+validate 2, 2, 2, 2, 2, 2, 2, 2
+
+aarch64 = sqrdmulh
+arm = vqrdmulh
+generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t
+generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t
+
+/// Signed saturating rounding doubling multiply returning high half
+name = vqrdmulh
+lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = vqrdmulh-out-noext, a, {simd_extract, b, LANE as u32}
+a = 1
+b = 0, 2, 0, 0, 0, 0, 0, 0
+n = 1
+validate 0
+
+aarch64 = sqrdmulh
+generate i16:int16x4_t:i16, i16:int16x8_t:i16, i32:int32x2_t:i32, i32:int32x4_t:i32
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+name = vqrdmlah
+a = 1, 1, 1, 1, 1, 1, 1, 1
+b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+c = 2, 2, 2, 2, 2, 2, 2, 2
+validate 3, 3, 3, 3, 3, 3, 3, 3
+
+aarch64 = sqrdmlah
+link-aarch64 = sqrdmlah._EXT_
+target = rdm
+generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+name = vqrdmlah
+multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c
+multi_fn = simd_extract, {vqrdmlah-in_ntt-noext, a, b, c}, 0
+a = 1
+b = 1
+c = 2
+validate 1
+
+aarch64 = sqrdmlah
+target = rdm
+generate i16, i32
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+name = vqrdmlah
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = vqrdmlah-out-noext, a, b, c
+a = 1, 1, 1, 1, 1, 1, 1, 1
+b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+c = 0, 2, 0, 0, 0, 0, 0, 0
+n = 1
+validate 3, 3, 3, 3, 3, 3, 3, 3
+
+aarch64 = sqrdmlah
+target = rdm
+generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
+generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
+
+/// Signed saturating rounding doubling multiply accumulate returning high half
+name = vqrdmlah
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vqrdmlah-self-noext, a, b, {simd_extract, c, LANE as u32}
+a = 1
+b = 1
+c = 0, 2, 0, 0, 0, 0, 0, 0
+n = 1
+validate 1
+
+aarch64 = sqrdmlah
+target = rdm
+generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+name = vqrdmlsh
+link-aarch64 = sqrdmlsh._EXT_
+a = 1, 1, 1, 1, 1, 1, 1, 1
+b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+c = 2, 2, 2, 2, 2, 2, 2, 2
+validate -1, -1, -1, -1, -1, -1, -1, -1
+
+aarch64 = sqrdmlsh
+target = rdm
+generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+name = vqrdmlsh
+multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c
+multi_fn = simd_extract, {vqrdmlsh-in_ntt-noext, a, b, c}, 0
+a = 1
+b = 1
+c = 2
+validate 1
+
+aarch64 = sqrdmlsh
+target = rdm
+generate i16, i32
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+name = vqrdmlsh
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = vqrdmlsh-out-noext, a, b, c
+a = 1, 1, 1, 1, 1, 1, 1, 1
+b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
+c = 0, 2, 0, 0, 0, 0, 0, 0
+n = 1
+validate -1, -1, -1, -1, -1, -1, -1, -1
+
+aarch64 = sqrdmlsh
+target = rdm
+generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
+generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
+
+/// Signed saturating rounding doubling multiply subtract returning high half
+name = vqrdmlsh
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vqrdmlsh-self-noext, a, b, {simd_extract, c, LANE as u32}
+a = 1
+b = 1
+c = 0, 2, 0, 0, 0, 0, 0, 0
+n = 1
+validate 1
+
+aarch64 = sqrdmlsh
+target = rdm
+generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32
+
+/// Signed saturating rounding shift left
+name = vqrshl
+a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+validate 8, MIN, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+
+aarch64 = sqrshl
+link-aarch64 = sqrshl._EXT_
+generate i32, i64
+
+arm = vqrshl
+link-arm = vqrshifts._EXT_
+generate int*_t, int64x*_t
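+// Note, illustrative only: a positive shift amount shifts left with
+// saturation, so the lanes above give 2 << 2 == 8 while MIN and MAX clamp to
+// themselves; rounding only matters for negative (rightward) shift amounts.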
+
+/// Signed saturating rounding shift left
+name = vqrshl
+multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = simd_extract, {vqrshl-in_ntt-noext, a, b}, 0
+a = 1
+b = 2
+validate 4
+
+aarch64 = sqrshl
+generate i8, i16
+
+/// Unsigned saturating rounding shift left
+name = vqrshl
+out-suffix
+a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+validate 8, 0, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+
+aarch64 = uqrshl
+link-aarch64 = uqrshl._EXT_
+generate u32:i32:u32, u64:i64:u64
+
+arm = vqrshl
+link-arm = vqrshiftu._EXT_
+generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
+generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
+
+/// Unsigned saturating rounding shift left
+name = vqrshl
+out-suffix
+multi_fn = vdup_n-out_ntt-noext, a:out_ntt, a
+multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
+multi_fn = simd_extract, {vqrshl-out_ntt-noext, a, b}, 0
+a = 1
+b = 2
+validate 4
+
+aarch64 = uqrshl
+generate u8:i8:u8, u16:i16:u16
+
+/// Signed saturating rounded shift right narrow
+name = vqrshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+a = MIN, 4, 8, 12, 16, 20, 24, 28
+n = 2
+validate MIN, 1, 2, 3, 4, 5, 6, 7
+
+aarch64 = sqrshrn
+link-aarch64 = sqrshrn._EXT2_
+const-aarch64 = N
+
+arm = vqrshrn
+link-arm = vqrshiftns._EXT2_
+const-arm = -N as ttn
+arm-aarch64-separate
+generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
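+// Illustrative scalar equivalent, not spec input (hypothetical helper), for
+// the i16 -> i8 case:
+//
+//     fn qrshrn16<const N: i32>(a: i16) -> i8 {
+//         let rounded = (a as i32 + (1 << (N - 1))) >> N; // round, then shift
+//         rounded.clamp(i8::MIN as i32, i8::MAX as i32) as i8
+//     }
+//
+// qrshrn16::<2>(4) == 1, and qrshrn16::<2>(i16::MIN) saturates to i8::MIN,
+// matching `validate`.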
+
+/// Signed saturating rounded shift right narrow
+name = vqrshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
+multi_fn = simd_extract, {vqrshrn_n-in_ntt-::<N>, a}, 0
+a = 4
+n = 2
+validate 1
+
+aarch64 = sqrshrn
+generate i16:i8, i32:i16, i64:i32
+
+/// Signed saturating rounded shift right narrow
+name = vqrshrn_high
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+a = 0, 1, 2, 3, 2, 3, 6, 7
+b = 8, 12, 24, 28, 48, 52, 56, 60
+n = 2
+validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15
+
+aarch64 = sqrshrn2
+generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
+
+/// Unsigned saturating rounded shift right narrow
+name = vqrshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+a = MIN, 4, 8, 12, 16, 20, 24, 28
+n = 2
+validate 0, 1, 2, 3, 4, 5, 6, 7
+
+aarch64 = uqrshrn
+link-aarch64 = uqrshrn._EXT2_
+const-aarch64 = N
+
+arm = vqrshrn
+link-arm = vqrshiftnu._EXT2_
+const-arm = -N as ttn
+arm-aarch64-separate
+generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
+
+/// Unsigned saturating rounded shift right narrow
+name = vqrshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
+multi_fn = simd_extract, {vqrshrn_n-in_ntt-::<N>, a}, 0
+a = 4
+n = 2
+validate 1
+
+aarch64 = uqrshrn
+generate u16:u8, u32:u16, u64:u32
+
+/// Unsigned saturating rounded shift right narrow
+name = vqrshrn_high
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+a = 0, 1, 2, 3, 2, 3, 6, 7
+b = 8, 12, 24, 28, 48, 52, 56, 60
+n = 2
+validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15
+
+aarch64 = uqrshrn2
+generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
+
+/// Signed saturating rounded shift right unsigned narrow
+name = vqrshrun
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+a = 0, 4, 8, 12, 16, 20, 24, 28
+n = 2
+validate 0, 1, 2, 3, 4, 5, 6, 7
+
+aarch64 = sqrshrun
+link-aarch64 = sqrshrun._EXT2_
+const-aarch64 = N
+
+arm = vqrshrun
+link-arm = vqrshiftnsu._EXT2_
+const-arm = -N as ttn
+arm-aarch64-separate
+generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t
+
+/// Signed saturating rounded shift right unsigned narrow
+name = vqrshrun
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
+multi_fn = simd_extract, {vqrshrun_n-in_ntt-::<N>, a}, 0
+a = 4
+n = 2
+validate 1
+
+aarch64 = sqrshrun
+generate i16:u8, i32:u16, i64:u32
+
+/// Signed saturating rounded shift right unsigned narrow
+name = vqrshrun_high
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
+a = 0, 1, 2, 3, 2, 3, 6, 7
+b = 8, 12, 24, 28, 48, 52, 56, 60
+n = 2
+validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15
+
+aarch64 = sqrshrun2
+generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t
+
+/// Signed saturating shift left
+name = vqshl
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+
+aarch64 = sqshl
+link-aarch64 = sqshl._EXT_
+generate i64
+
+arm = vqshl
+link-arm = vqshifts._EXT_
+generate int*_t, int64x*_t
+
+/// Signed saturating shift left
+name = vqshl
+multi_fn = vqshl-in_ntt-noext, c:in_ntt, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}
+multi_fn = simd_extract, c, 0
+a = 1
+b = 2
+validate 4
+
+aarch64 = sqshl
+generate i8, i16, i32
+
+/// Unsigned saturating shift left
+name = vqshl
+out-suffix
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+
+aarch64 = uqshl
+link-aarch64 = uqshl._EXT_
+generate u64:i64:u64
+
+arm = vqshl
+link-arm = vqshiftu._EXT_
+generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
+generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
+
+/// Unsigned saturating shift left
+name = vqshl
+out-suffix
+multi_fn = vqshl-out_ntt-noext, c:out_ntt, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}
+multi_fn = simd_extract, c, 0
+a = 1
+b = 2
+validate 4
+
+aarch64 = uqshl
+generate u8:i8:u8, u16:i16:u16, u32:i32:u32
+
+/// Signed saturating shift left
+name = vqshl
+n-suffix
+constn = N
+multi_fn = static_assert_imm-out_bits_exp_len-N
+multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N as _}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+n = 2
+validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+
+aarch64 = sqshl
+arm = vqshl
+generate int*_t, int64x*_t
+
+/// Signed saturating shift left
+name = vqshl
+n-suffix
+constn = N
+multi_fn = static_assert_imm-out_bits_exp_len-N
+multi_fn = simd_extract, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0
+a = 1
+n = 2
+validate 4
+
+aarch64 = sqshl
+generate i8, i16, i32, i64
+
+/// Unsigned saturating shift left
+name = vqshl
+n-suffix
+constn = N
+multi_fn = static_assert_imm-out_bits_exp_len-N
+multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N as _}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+n = 2
+validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+
+aarch64 = uqshl
+arm = vqshl
+generate uint*_t, uint64x*_t
+
+/// Unsigned saturating shift left
+name = vqshl
+n-suffix
+constn = N
+multi_fn = static_assert_imm-out_bits_exp_len-N
+multi_fn = simd_extract, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0
+a = 1
+n = 2
+validate 4
+
+aarch64 = uqshl
+generate u8, u16, u32, u64
+
+/// Signed saturating shift left unsigned
+name = vqshlu
+n-suffix
+constn = N
+multi_fn = static_assert_imm-out_bits_exp_len-N
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+n = 2
+validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+arm-aarch64-separate
+
+aarch64 = sqshlu
+link-aarch64 = sqshlu._EXT_
+const-aarch64 = {dup-in_len-N as ttn}
+arm = vqshlu
+link-arm = vqshiftsu._EXT_
+const-arm = N as ttn
+generate int8x8_t:uint8x8_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t
+generate int8x16_t:uint8x16_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t
+
+/// Signed saturating shift left unsigned
+name = vqshlu
+n-suffix
+constn = N
+multi_fn = static_assert_imm-out_bits_exp_len-N
+multi_fn = simd_extract, {vqshlu_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0
+a = 1
+n = 2
+validate 4
+
+aarch64 = sqshlu
+generate i8:u8, i16:u16, i32:u32, i64:u64
+
+/// Signed saturating shift right narrow
+name = vqshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+a = 0, 4, 8, 12, 16, 20, 24, 28
+n = 2
+validate 0, 1, 2, 3, 4, 5, 6, 7
+arm-aarch64-separate
+
+aarch64 = sqshrn
+link-aarch64 = sqshrn._EXT2_
+const-aarch64 = N
+generate i64:i32
+
+arm = vqshrn
+link-arm = vqshiftns._EXT2_
+const-arm = -N as ttn
+generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
+
+/// Signed saturating shift right narrow
+name = vqshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_extract, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0
+a = 4
+n = 2
+validate 1
+
+aarch64 = sqshrn
+generate i16:i8, i32:i16
+
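+// NOTE: per lane, saturating shift-right-narrow is an arithmetic right shift
+// followed by saturation to the half-width type. A hedged sketch (helper name
+// illustrative only):
+//
+//     fn sqshrn_lane(a: i16, n: u32) -> i8 {
+//         (a >> n).clamp(i8::MIN as i16, i8::MAX as i16) as i8
+//     }
+//
+//     // sqshrn_lane(4, 2) == 1, matching the a = 4, n = 2, validate 1 row above.
+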
+/// Signed saturating shift right narrow
+name = vqshrn_high
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+a = 0, 1, 8, 9, 8, 9, 10, 11
+b = 32, 36, 40, 44, 48, 52, 56, 60
+n = 2
+validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = sqshrn2
+generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
+
+/// Unsigned saturating shift right narrow
+name = vqshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+a = 0, 4, 8, 12, 16, 20, 24, 28
+n = 2
+validate 0, 1, 2, 3, 4, 5, 6, 7
+arm-aarch64-separate
+
+aarch64 = uqshrn
+link-aarch64 = uqshrn._EXT2_
+const-aarch64 = N
+generate u64:u32
+
+arm = vqshrn
+link-arm = vqshiftnu._EXT2_
+const-arm = -N as ttn
+generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
+
+/// Unsigned saturating shift right narrow
+name = vqshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_extract, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0
+a = 4
+n = 2
+validate 1
+
+aarch64 = uqshrn
+generate u16:u8, u32:u16
+
+/// Unsigned saturating shift right narrow
+name = vqshrn_high
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+a = 0, 1, 8, 9, 8, 9, 10, 11
+b = 32, 36, 40, 44, 48, 52, 56, 60
+n = 2
+validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = uqshrn2
+generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
+
+/// Signed saturating shift right unsigned narrow
+name = vqshrun
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+a = 0, 4, 8, 12, 16, 20, 24, 28
+n = 2
+validate 0, 1, 2, 3, 4, 5, 6, 7
+arm-aarch64-separate
+
+aarch64 = sqshrun
+link-aarch64 = sqshrun._EXT2_
+const-aarch64 = N
+
+arm = vqshrun
+link-arm = vqshiftnsu._EXT2_
+const-arm = -N as ttn
+generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t
+
+/// Signed saturating shift right unsigned narrow
+name = vqshrun
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_extract, {vqshrun_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0
+a = 4
+n = 2
+validate 1
+
+aarch64 = sqshrun
+generate i16:u8, i32:u16, i64:u32
+
+/// Signed saturating shift right unsigned narrow
+name = vqshrun_high
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
+a = 0, 1, 8, 9, 8, 9, 10, 11
+b = 32, 36, 40, 44, 48, 52, 56, 60
+n = 2
+validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = sqshrun2
+generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t
+
+/// Unsigned saturating accumulate of signed value
+name = vsqadd
+out-suffix
+multi_fn = simd_extract, {vsqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+a = 2
+b = 2
+validate 4
+
+aarch64 = usqadd
+generate u8:i8:u8, u16:i16:u16
+
+/// Unsigned saturating accumulate of signed value
+name = vsqadd
+out-suffix
+a = 2
+b = 2
+validate 4
+
+aarch64 = usqadd
+link-aarch64 = usqadd._EXT_
+generate u32:i32:u32, u64:i64:u64
+
+/// Calculates the square root of each lane.
+name = vsqrt
+fn = simd_fsqrt
+a = 4.0, 9.0, 16.0, 25.0
+validate 2.0, 3.0, 4.0, 5.0
+
+aarch64 = fsqrt
+generate float*_t, float64x*_t
+
+/// Reciprocal square-root estimate.
+name = vrsqrte
+a = 1.0, 2.0, 3.0, 4.0
+validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375
+
+aarch64 = frsqrte
+link-aarch64 = frsqrte._EXT_
+generate float64x*_t, f32, f64
+
+arm = vrsqrte
+link-arm = vrsqrte._EXT_
+generate float*_t
+
+/// Unsigned reciprocal square root estimate
+name = vrsqrte
+a = 1, 2, 3, 4
+validate 4294967295, 4294967295, 4294967295, 4294967295
+
+aarch64 = ursqrte
+link-aarch64 = ursqrte._EXT_
+arm = vrsqrte
+link-arm = vrsqrte._EXT_
+generate uint32x2_t, uint32x4_t
+
+/// Floating-point reciprocal square root step
+name = vrsqrts
+a = 1.0, 2.0, 3.0, 4.0
+b = 1.0, 2.0, 3.0, 4.0
+validate 1., -0.5, -3.0, -6.5
+
+aarch64 = frsqrts
+link-aarch64 = frsqrts._EXT_
+generate float64x*_t, f32, f64
+
+arm = vrsqrts
+link-arm = vrsqrts._EXT_
+generate float*_t
+
+/// Reciprocal estimate.
+name = vrecpe
+a = 4.0, 3.0, 2.0, 1.0
+validate 0.24951171875, 0.3330078125, 0.4990234375, 0.998046875
+
+aarch64 = frecpe
+link-aarch64 = frecpe._EXT_
+generate float64x*_t, f32, f64
+
+arm = vrecpe
+link-arm = vrecpe._EXT_
+generate float*_t
+
+/// Unsigned reciprocal estimate
+name = vrecpe
+a = 4, 3, 2, 1
+validate 4294967295, 4294967295, 4294967295, 4294967295
+
+aarch64 = urecpe
+link-aarch64 = urecpe._EXT_
+arm = vrecpe
+link-arm = vrecpe._EXT_
+generate uint32x2_t, uint32x4_t
+
+/// Floating-point reciprocal step
+name = vrecps
+a = 4.0, 3.0, 2.0, 1.0
+b = 4.0, 3.0, 2.0, 1.0
+validate -14., -7., -2., 1.
+
+aarch64 = frecps
+link-aarch64 = frecps._EXT_
+generate float64x*_t, f32, f64
+
+arm = vrecps
+link-arm = vrecps._EXT_
+generate float*_t
+
+/// Floating-point reciprocal exponent
+name = vrecpx
+a = 4.0
+validate 0.5
+
+aarch64 = frecpx
+link-aarch64 = frecpx._EXT_
+generate f32, f64
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = nop
+generate poly64x1_t:int64x1_t, poly64x1_t:uint64x1_t, int64x1_t:poly64x1_t, uint64x1_t:poly64x1_t
+generate poly64x2_t:int64x2_t, poly64x2_t:uint64x2_t, int64x2_t:poly64x2_t, uint64x2_t:poly64x2_t
+
+arm = nop
+generate uint8x8_t:int8x8_t, poly8x8_t:int8x8_t, poly16x4_t:int16x4_t, uint16x4_t:int16x4_t, uint32x2_t:int32x2_t, uint64x1_t:int64x1_t
+generate uint8x16_t:int8x16_t, poly8x16_t:int8x16_t, poly16x8_t:int16x8_t, uint16x8_t:int16x8_t, uint32x4_t:int32x4_t, uint64x2_t:int64x2_t
+generate poly8x8_t:uint8x8_t, int8x8_t:uint8x8_t, poly16x4_t:uint16x4_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t
+generate poly8x16_t:uint8x16_t, int8x16_t:uint8x16_t, poly16x8_t:uint16x8_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t
+generate int8x8_t:poly8x8_t, uint8x8_t:poly8x8_t, int16x4_t:poly16x4_t, uint16x4_t:poly16x4_t
+generate int8x16_t:poly8x16_t, uint8x16_t:poly8x16_t, int16x8_t:poly16x8_t, uint16x8_t:poly16x8_t
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0, 1, 2, 3, 4, 5, 6, 7
+validate 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
+
+aarch64 = nop
+arm = nop
+generate int16x4_t:int8x8_t, uint16x4_t:int8x8_t, poly16x4_t:int8x8_t, int32x2_t:int16x4_t, uint32x2_t:int16x4_t, int64x1_t:int32x2_t, uint64x1_t:int32x2_t
+generate int16x8_t:int8x16_t, uint16x8_t:int8x16_t, poly16x8_t:int8x16_t, int32x4_t:int16x8_t, uint32x4_t:int16x8_t, int64x2_t:int32x4_t, uint64x2_t:int32x4_t
+generate poly16x4_t:uint8x8_t, int16x4_t:uint8x8_t, uint16x4_t:uint8x8_t, int32x2_t:uint16x4_t, uint32x2_t:uint16x4_t, int64x1_t:uint32x2_t, uint64x1_t:uint32x2_t
+generate poly16x8_t:uint8x16_t, int16x8_t:uint8x16_t, uint16x8_t:uint8x16_t, int32x4_t:uint16x8_t, uint32x4_t:uint16x8_t, int64x2_t:uint32x4_t, uint64x2_t:uint32x4_t
+generate poly16x4_t:poly8x8_t, int16x4_t:poly8x8_t, uint16x4_t:poly8x8_t, int32x2_t:poly16x4_t, uint32x2_t:poly16x4_t
+generate poly16x8_t:poly8x16_t, int16x8_t:poly8x16_t, uint16x8_t:poly8x16_t, int32x4_t:poly16x8_t, uint32x4_t:poly16x8_t
+target = aes
+generate poly64x1_t:int32x2_t, poly64x1_t:uint32x2_t
+generate poly64x2_t:int32x4_t, poly64x2_t:uint32x4_t
+generate p128:int64x2_t, p128:uint64x2_t, p128:poly64x2_t
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
+validate 0, 1, 2, 3, 4, 5, 6, 7
+
+aarch64 = nop
+arm = nop
+generate poly8x8_t:int16x4_t, int8x8_t:int16x4_t, uint8x8_t:int16x4_t, poly16x4_t:int32x2_t, int16x4_t:int32x2_t, uint16x4_t:int32x2_t, int32x2_t:int64x1_t, uint32x2_t:int64x1_t
+generate poly8x16_t:int16x8_t, int8x16_t:int16x8_t, uint8x16_t:int16x8_t, poly16x8_t:int32x4_t, int16x8_t:int32x4_t, uint16x8_t:int32x4_t, int32x4_t:int64x2_t, uint32x4_t:int64x2_t
+generate poly8x8_t:uint16x4_t, int8x8_t:uint16x4_t, uint8x8_t:uint16x4_t, poly16x4_t:uint32x2_t, int16x4_t:uint32x2_t, uint16x4_t:uint32x2_t, int32x2_t:uint64x1_t, uint32x2_t:uint64x1_t
+generate poly8x16_t:uint16x8_t, int8x16_t:uint16x8_t, uint8x16_t:uint16x8_t, poly16x8_t:uint32x4_t, int16x8_t:uint32x4_t, uint16x8_t:uint32x4_t, int32x4_t:uint64x2_t, uint32x4_t:uint64x2_t
+generate poly8x8_t:poly16x4_t, int8x8_t:poly16x4_t, uint8x8_t:poly16x4_t
+generate poly8x16_t:poly16x8_t, int8x16_t:poly16x8_t, uint8x16_t:poly16x8_t
+target = aes
+generate int32x2_t:poly64x1_t, uint32x2_t:poly64x1_t
+generate int32x4_t:poly64x2_t, uint32x4_t:poly64x2_t
+generate int64x2_t:p128, uint64x2_t:p128, poly64x2_t:p128
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0, 1, 2, 3
+validate 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+
+aarch64 = nop
+arm = nop
+generate int32x2_t:int8x8_t, uint32x2_t:int8x8_t, int64x1_t:int16x4_t, uint64x1_t:int16x4_t
+generate int32x4_t:int8x16_t, uint32x4_t:int8x16_t, int64x2_t:int16x8_t, uint64x2_t:int16x8_t
+generate int32x2_t:uint8x8_t, uint32x2_t:uint8x8_t, int64x1_t:uint16x4_t, uint64x1_t:uint16x4_t
+generate int32x4_t:uint8x16_t, uint32x4_t:uint8x16_t, int64x2_t:uint16x8_t, uint64x2_t:uint16x8_t
+generate int32x2_t:poly8x8_t, uint32x2_t:poly8x8_t, int64x1_t:poly16x4_t, uint64x1_t:poly16x4_t
+generate int32x4_t:poly8x16_t, uint32x4_t:poly8x16_t, int64x2_t:poly16x8_t, uint64x2_t:poly16x8_t
+target = aes
+generate poly64x1_t:int16x4_t, poly64x1_t:uint16x4_t, poly64x1_t:poly16x4_t
+generate poly64x2_t:int16x8_t, poly64x2_t:uint16x8_t, poly64x2_t:poly16x8_t
+generate p128:int32x4_t, p128:uint32x4_t
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+validate 0, 1, 2, 3
+
+aarch64 = nop
+arm = nop
+generate poly8x8_t:int32x2_t, int8x8_t:int32x2_t, uint8x8_t:int32x2_t, poly16x4_t:int64x1_t, int16x4_t:int64x1_t, uint16x4_t:int64x1_t
+generate poly8x16_t:int32x4_t, int8x16_t:int32x4_t, uint8x16_t:int32x4_t, poly16x8_t:int64x2_t, int16x8_t:int64x2_t, uint16x8_t:int64x2_t
+generate poly8x8_t:uint32x2_t, int8x8_t:uint32x2_t, uint8x8_t:uint32x2_t, poly16x4_t:uint64x1_t, int16x4_t:uint64x1_t, uint16x4_t:uint64x1_t
+generate poly8x16_t:uint32x4_t, int8x16_t:uint32x4_t, uint8x16_t:uint32x4_t, poly16x8_t:uint64x2_t, int16x8_t:uint64x2_t, uint16x8_t:uint64x2_t
+target = aes
+generate poly16x4_t:poly64x1_t, int16x4_t:poly64x1_t, uint16x4_t:poly64x1_t
+generate poly16x8_t:poly64x2_t, int16x8_t:poly64x2_t, uint16x8_t:poly64x2_t
+generate int32x4_t:p128, uint32x4_t:p128
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0, 1
+validate 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = nop
+arm = nop
+generate int64x1_t:int8x8_t, uint64x1_t:int8x8_t, int64x1_t:uint8x8_t, uint64x1_t:uint8x8_t, int64x1_t:poly8x8_t, uint64x1_t:poly8x8_t
+generate int64x2_t:int8x16_t, uint64x2_t:int8x16_t, int64x2_t:uint8x16_t, uint64x2_t:uint8x16_t, int64x2_t:poly8x16_t, uint64x2_t:poly8x16_t
+target = aes
+generate poly64x1_t:int8x8_t, poly64x1_t:uint8x8_t, poly64x1_t:poly8x8_t
+generate poly64x2_t:int8x16_t, poly64x2_t:uint8x16_t, poly64x2_t:poly8x16_t
+generate p128:int16x8_t, p128:uint16x8_t, p128:poly16x8_t
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+validate 0, 1
+
+aarch64 = nop
+arm = nop
+generate poly8x8_t:int64x1_t, int8x8_t:int64x1_t, uint8x8_t:int64x1_t, poly8x8_t:uint64x1_t, int8x8_t:uint64x1_t, uint8x8_t:uint64x1_t
+generate poly8x16_t:int64x2_t, int8x16_t:int64x2_t, uint8x16_t:int64x2_t, poly8x16_t:uint64x2_t, int8x16_t:uint64x2_t, uint8x16_t:uint64x2_t
+target = aes
+generate poly8x8_t:poly64x1_t, int8x8_t:poly64x1_t, uint8x8_t:poly64x1_t
+generate poly8x16_t:poly64x2_t, int8x16_t:poly64x2_t, uint8x16_t:poly64x2_t
+generate int16x8_t:p128, uint16x8_t:p128, poly16x8_t:p128
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate 1
+target = aes
+
+aarch64 = nop
+arm = nop
+generate int8x16_t:p128, uint8x16_t:p128, poly8x16_t:p128
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 1
+validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+target = aes
+
+aarch64 = nop
+arm = nop
+generate p128:int8x16_t, p128:uint8x16_t, p128:poly8x16_t
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0., 0., 0., 0., 0., 0., 0., 0.
+validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = nop
+generate float64x1_t:int8x8_t, float64x1_t:int16x4_t, float64x1_t:int32x2_t, float64x1_t:int64x1_t
+generate float64x2_t:int8x16_t, float64x2_t:int16x8_t, float64x2_t:int32x4_t, float64x2_t:int64x2_t
+generate float64x1_t:uint8x8_t, float64x1_t:uint16x4_t, float64x1_t:uint32x2_t, float64x1_t:uint64x1_t
+generate float64x2_t:uint8x16_t, float64x2_t:uint16x8_t, float64x2_t:uint32x4_t, float64x2_t:uint64x2_t
+generate float64x1_t:poly8x8_t, float64x1_t:poly16x4_t, float32x2_t:poly64x1_t, float64x1_t:poly64x1_t
+generate float64x2_t:poly8x16_t, float64x2_t:poly16x8_t, float32x4_t:poly64x2_t, float64x2_t:poly64x2_t
+generate float64x2_t:p128
+
+arm = nop
+generate float32x2_t:int8x8_t, float32x2_t:int16x4_t, float32x2_t:int32x2_t, float32x2_t:int64x1_t
+generate float32x4_t:int8x16_t, float32x4_t:int16x8_t, float32x4_t:int32x4_t, float32x4_t:int64x2_t
+generate float32x2_t:uint8x8_t, float32x2_t:uint16x4_t, float32x2_t:uint32x2_t, float32x2_t:uint64x1_t
+generate float32x4_t:uint8x16_t, float32x4_t:uint16x8_t, float32x4_t:uint32x4_t, float32x4_t:uint64x2_t
+generate float32x2_t:poly8x8_t, float32x2_t:poly16x4_t
+generate float32x4_t:poly8x16_t, float32x4_t:poly16x8_t
+generate float32x4_t:p128
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+validate 0., 0., 0., 0., 0., 0., 0., 0.
+
+aarch64 = nop
+generate int8x8_t:float64x1_t, int16x4_t:float64x1_t, int32x2_t:float64x1_t, int64x1_t:float64x1_t
+generate int8x16_t:float64x2_t, int16x8_t:float64x2_t, int32x4_t:float64x2_t, int64x2_t:float64x2_t
+generate poly8x8_t:float64x1_t, uint16x4_t:float64x1_t, uint32x2_t:float64x1_t, uint64x1_t:float64x1_t
+generate poly8x16_t:float64x2_t, uint16x8_t:float64x2_t, uint32x4_t:float64x2_t, uint64x2_t:float64x2_t
+generate uint8x8_t:float64x1_t, poly16x4_t:float64x1_t, poly64x1_t:float64x1_t, poly64x1_t:float32x2_t
+generate uint8x16_t:float64x2_t, poly16x8_t:float64x2_t, poly64x2_t:float64x2_t, poly64x2_t:float32x4_t
+generate p128:float64x2_t
+
+arm = nop
+generate int8x8_t:float32x2_t, int16x4_t:float32x2_t, int32x2_t:float32x2_t, int64x1_t:float32x2_t
+generate int8x16_t:float32x4_t, int16x8_t:float32x4_t, int32x4_t:float32x4_t, int64x2_t:float32x4_t
+generate uint8x8_t:float32x2_t, uint16x4_t:float32x2_t, uint32x2_t:float32x2_t, uint64x1_t:float32x2_t
+generate uint8x16_t:float32x4_t, uint16x8_t:float32x4_t, uint32x4_t:float32x4_t, uint64x2_t:float32x4_t
+generate poly8x8_t:float32x2_t, poly16x4_t:float32x2_t
+generate poly8x16_t:float32x4_t, poly16x8_t:float32x4_t
+generate p128:float32x4_t
+
+/// Vector reinterpret cast operation
+name = vreinterpret
+double-suffixes
+fn = transmute
+a = 0., 0., 0., 0., 0., 0., 0., 0.
+validate 0., 0., 0., 0., 0., 0., 0., 0.
+
+aarch64 = nop
+generate float32x2_t:float64x1_t, float64x1_t:float32x2_t
+generate float32x4_t:float64x2_t, float64x2_t:float32x4_t
+
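+// NOTE: every vreinterpret entry lowers to `transmute`: the bits are reused
+// unchanged and only the type changes, which is why each pairing above is a
+// no-op (nop) at the instruction level. A minimal Rust analogue of the idea:
+//
+//     let a: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+//     let b: [u16; 4] = unsafe { core::mem::transmute(a) };
+//     // on a little-endian target: b == [0x0100, 0x0302, 0x0504, 0x0706]
+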
+/// Signed rounding shift left
+name = vrshl
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+
+aarch64 = srshl
+link-aarch64 = srshl._EXT_
+generate i64
+
+arm = vrshl
+link-arm = vrshifts._EXT_
+generate int*_t, int64x*_t
+
+/// Unsigned rounding shift left
+name = vrshl
+out-suffix
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+
+aarch64 = urshl
+link-aarch64 = urshl._EXT_
+generate u64:i64:u64
+
+arm = vrshl
+link-arm = vrshiftu._EXT_
+generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
+generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
+
+/// Signed rounding shift right
+name = vrshr
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N) as _}
+a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = srshr
+arm = vrshr
+generate int*_t, int64x*_t
+
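+// NOTE: "rounding" here means the bias 1 << (N - 1) is added before shifting,
+// so each lane computes roughly (a + (1 << (N - 1))) >> N. A sketch, widening
+// to i128 so the bias cannot overflow (helper name illustrative):
+//
+//     fn srshr_lane(a: i64, n: u32) -> i64 {
+//         debug_assert!((1..=64).contains(&n));
+//         (((a as i128) + (1i128 << (n - 1))) >> n) as i64
+//     }
+//
+//     // srshr_lane(4, 2) == 1, matching the validate rows above.
+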
+/// Signed rounding shift right
+name = vrshr
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = vrshl-self-noext, a, -N as i64
+a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = srshr
+generate i64
+
+/// Unsigned rounding shift right
+name = vrshr
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N) as _}
+a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = urshr
+arm = vrshr
+generate uint*_t, uint64x*_t
+
+/// Unsigned rounding shift right
+name = vrshr
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = vrshl-self-noext, a, -N as i64
+a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = urshr
+generate u64
+
+/// Rounding shift right narrow
+name = vrshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+arm-aarch64-separate
+
+aarch64 = rshrn
+link-aarch64 = rshrn._EXT2_
+const-aarch64 = N
+
+arm = vrshrn
+link-arm = vrshiftn._EXT2_
+const-arm = -N as ttn
+generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
+
+/// Rounding shift right narrow
+name = vrshrn
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = transmute, {vrshrn_n-noqsigned-::<N>, transmute(a)}
+a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = rshrn
+arm = vrshrn
+generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
+
+/// Rounding shift right narrow
+name = vrshrn_high
+noq-n-suffix
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+a = 0, 1, 8, 9, 8, 9, 10, 11
+b = 32, 36, 40, 44, 48, 52, 56, 60
+n = 2
+validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = rshrn2
+generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
+generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
+
+/// Signed rounding shift right and accumulate
+name = vrsra
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = simd_add, a, {vrshr-nself-::<N>, b}
+a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+
+aarch64 = srsra
+arm = vrsra
+generate int*_t, int64x*_t
+
+/// Unsigned rounding shift right and accumulate
+name = vrsra
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = simd_add, a, {vrshr-nself-::<N>, b}
+a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+
+aarch64 = ursra
+arm = vrsra
+generate uint*_t, uint64x*_t
+
+/// Signed rounding shift right and accumulate.
+name = vrsra
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = vrshr-nself-::<N>, b:in_t, b
+multi_fn = a.wrapping_add(b)
+a = 1
+b = 4
+n = 2
+validate 2
+
+aarch64 = srsra
+generate i64
+
+/// Unsigned rounding shift right and accumulate.
+name = vrsra
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = vrshr-nself-::<N>, b:in_t, b
+multi_fn = a.wrapping_add(b)
+a = 1
+b = 4
+n = 2
+validate 2
+
+aarch64 = ursra
+generate u64
+
+/// Rounding subtract returning high narrow
+name = vrsubhn
+no-q
+a = MAX, MIN, 0, 4, 5, 6, 7, 8
+b = 1, 2, 3, 4, 5, 6, 7, 8
+validate MIN, MIN, 0, 0, 0, 0, 0, 0
+
+aarch64 = rsubhn
+link-aarch64 = rsubhn._EXT2_
+arm = vrsubhn
+link-arm = vrsubhn._EXT2_
+generate int16x8_t:int16x8_t:int8x8_t, int32x4_t:int32x4_t:int16x4_t, int64x2_t:int64x2_t:int32x2_t
+
+/// Rounding subtract returning high narrow
+name = vrsubhn
+no-q
+multi_fn = transmute, {vrsubhn-noqsigned-noext, {transmute, a}, {transmute, b}}
+a = MAX, MIN, 3, 4, 5, 6, 7, 8
+b = 1, 2, 3, 4, 5, 6, 7, 8
+validate 0, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = rsubhn
+arm = vrsubhn
+generate uint16x8_t:uint16x8_t:uint8x8_t, uint32x4_t:uint32x4_t:uint16x4_t, uint64x2_t:uint64x2_t:uint32x2_t
+
+/// Rounding subtract returning high narrow
+name = vrsubhn_high
+no-q
+multi_fn = vrsubhn-noqself-noext, x:in_t0, b, c
+multi_fn = simd_shuffle-out_len-!, a, x, {asc-0-out_len}
+a = 1, 2, 0, 0, 0, 0, 0, 0
+b = 1, 2, 3, 4, 5, 6, 7, 8
+c = 1, 2, 3, 4, 5, 6, 7, 8
+validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+aarch64 = rsubhn2
+generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t
+generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t
+
+/// Insert vector element from another vector element
+name = vset_lane
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_insert, b, LANE as u32, a
+a = 1
+b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+n = 0
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = nop
+arm = nop
+generate i8:int8x8_t:int8x8_t, i16:int16x4_t:int16x4_t
+generate i32:int32x2_t:int32x2_t, i64:int64x1_t:int64x1_t
+generate u8:uint8x8_t:uint8x8_t, u16:uint16x4_t:uint16x4_t
+generate u32:uint32x2_t:uint32x2_t, u64:uint64x1_t:uint64x1_t
+generate p8:poly8x8_t:poly8x8_t, p16:poly16x4_t:poly16x4_t
+
+target = aes
+generate p64:poly64x1_t:poly64x1_t
+
+/// Insert vector element from another vector element
+name = vsetq_lane
+no-q
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_insert, b, LANE as u32, a
+a = 1
+b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+n = 0
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+aarch64 = nop
+arm = nop
+generate i8:int8x16_t:int8x16_t, i16:int16x8_t:int16x8_t
+generate i32:int32x4_t:int32x4_t, i64:int64x2_t:int64x2_t
+generate u8:uint8x16_t:uint8x16_t, u16:uint16x8_t:uint16x8_t
+generate u32:uint32x4_t:uint32x4_t, u64:uint64x2_t:uint64x2_t
+generate p8:poly8x16_t:poly8x16_t, p16:poly16x8_t:poly16x8_t
+
+target = aes
+generate p64:poly64x2_t:poly64x2_t
+
+/// Insert vector element from another vector element
+name = vset_lane
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_insert, b, LANE as u32, a
+a = 1.
+b = 0., 2., 3., 4.
+n = 0
+validate 1., 2., 3., 4.
+
+aarch64 = nop
+generate f64:float64x1_t:float64x1_t
+
+arm = nop
+generate f32:float32x2_t:float32x2_t
+
+/// Insert vector element from another vector element
+name = vsetq_lane
+no-q
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = simd_insert, b, LANE as u32, a
+a = 1.
+b = 0., 2., 3., 4.
+n = 0
+validate 1., 2., 3., 4.
+
+aarch64 = nop
+generate f64:float64x2_t:float64x2_t
+
+arm = nop
+generate f32:float32x4_t:float32x4_t
+
+/// Signed Shift left
+name = vshl
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+
+aarch64 = sshl
+link-aarch64 = sshl._EXT_
+arm = vshl
+link-arm = vshifts._EXT_
+generate int*_t, int64x*_t
+
+/// Signed Shift left
+name = vshl
+multi_fn = transmute, {vshl-in_ntt-noext, transmute(a), transmute(b)}
+a = 1
+b = 2
+validate 4
+
+aarch64 = sshl
+generate i64
+
+/// Unsigned Shift left
+name = vshl
+out-suffix
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+
+aarch64 = ushl
+link-aarch64 = ushl._EXT_
+arm = vshl
+link-arm = vshiftu._EXT_
+generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
+generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
+
+/// Unsigned Shift left
+out-suffix
+name = vshl
+multi_fn = transmute, {vshl-out_ntt-noext, transmute(a), transmute(b)}
+a = 1
+b = 2
+validate 4
+
+aarch64 = ushl
+generate u64:i64:u64
+
+/// Shift left
+name = vshl
+n-suffix
+constn = N
+multi_fn = static_assert_imm-out_bits_exp_len-N
+multi_fn = simd_shl, a, {vdup-nself-noext, N as _}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+n = 2
+validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+
+arm = vshl
+aarch64 = shl
+generate int*_t, uint*_t, int64x*_t, uint64x*_t
+
+/// Shift left long
+name = vshll
+n-suffix
+constn = N
+multi_fn = static_assert-N-0-bits
+multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N as _}
+a = 1, 2, 3, 4, 5, 6, 7, 8
+n = 2
+validate 4, 8, 12, 16, 20, 24, 28, 32
+
+arm = vshll.s
+aarch64 = sshll
+generate int8x8_t:int16x8_t, int16x4_t:int32x4_t, int32x2_t:int64x2_t
+aarch64 = ushll
+generate uint8x8_t:uint16x8_t, uint16x4_t:uint32x4_t, uint32x2_t:uint64x2_t
+
+/// Shift left long
+name = vshll_high_n
+no-q
+constn = N
+multi_fn = static_assert-N-0-bits
+multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen}
+multi_fn = vshll_n-noqself-::<N>, b
+a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8
+n = 2
+validate 4, 8, 12, 16, 20, 24, 28, 32
+
+aarch64 = sshll2
+generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t
+aarch64 = ushll2
+generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t
+
+/// Shift right
+name = vshr
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = fix_right_shift_imm-N-bits
+multi_fn = simd_shr, a, {vdup-nself-noext, n as _}
+a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+arm = vshr.s
+aarch64 = sshr
+generate int*_t, int64x*_t
+aarch64 = ushr
+generate uint*_t, uint64x*_t
+
+/// Shift right narrow
+name = vshrn_n
+no-q
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N as _}}
+a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
+arm = vshrn.
+aarch64 = shrn
+generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
+generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
+
+/// Shift right narrow
+name = vshrn_high_n
+no-q
+constn = N
+multi_fn = static_assert-N-1-halfbits
+multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+a = 1, 2, 5, 6, 5, 6, 7, 8
+b = 20, 24, 28, 32, 52, 56, 60, 64
+n = 2
+validate 1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16
+
+aarch64 = shrn2
+generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
+generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
+
+/// Signed shift right and accumulate
+name = vsra
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = simd_add, a, {vshr-nself-::<N>, b}
+a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+
+aarch64 = ssra
+arm = vsra
+generate int*_t, int64x*_t
+
+/// Unsigned shift right and accumulate
+name = vsra
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = simd_add, a, {vshr-nself-::<N>, b}
+a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
+n = 2
+validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+
+aarch64 = usra
+arm = vsra
+generate uint*_t, uint64x*_t
+
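+// NOTE: the accumulate forms are simply "shift right, then add". A scalar
+// sketch (helper name illustrative; the vector forms add lane-wise):
+//
+//     fn usra_lane(a: u64, b: u64, n: u32) -> u64 {
+//         a.wrapping_add(b >> n)
+//     }
+//
+//     // usra_lane(1, 4, 2) == 2, matching the validate rows above.
+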
+/// SM3PARTW1
+name = vsm3partw1
+a = 1, 2, 3, 4
+b = 1, 2, 3, 4
+c = 1, 2, 3, 4
+validate 2147549312, 3221323968, 131329, 2684362752
+target = sm4
+
+aarch64 = sm3partw1
+link-aarch64 = llvm.aarch64.crypto.sm3partw1
+generate uint32x4_t
+
+/// SM3PARTW2
+name = vsm3partw2
+a = 1, 2, 3, 4
+b = 1, 2, 3, 4
+c = 1, 2, 3, 4
+validate 128, 256, 384, 1077977696
+target = sm4
+
+aarch64 = sm3partw2
+link-aarch64 = llvm.aarch64.crypto.sm3partw2
+generate uint32x4_t
+
+/// SM3SS1
+name = vsm3ss1
+a = 1, 2, 3, 4
+b = 1, 2, 3, 4
+c = 1, 2, 3, 4
+validate 0, 0, 0, 2098176
+target = sm4
+
+aarch64 = sm3ss1
+link-aarch64 = llvm.aarch64.crypto.sm3ss1
+generate uint32x4_t
+
+/// SM4 key
+name = vsm4ekey
+a = 1, 2, 3, 4
+b = 1, 2, 3, 4
+validate 1784948604, 136020997, 2940231695, 3789947679
+target = sm4
+
+aarch64 = sm4ekey
+link-aarch64 = llvm.aarch64.crypto.sm4ekey
+generate uint32x4_t
+
+/// SM4 encode
+name = vsm4e
+a = 1, 2, 3, 4
+b = 1, 2, 3, 4
+validate 1093874472, 3616769504, 3878330411, 2765298765
+target = sm4
+
+aarch64 = sm4e
+link-aarch64 = llvm.aarch64.crypto.sm4e
+generate uint32x4_t
+
+/// Rotate and exclusive OR
+name = vrax1
+a = 1, 2
+b = 3, 4
+validate 7, 10
+target = sha3
+
+aarch64 = rax1
+link-aarch64 = llvm.aarch64.crypto.rax1
+generate uint64x2_t
+
+/// SHA512 hash update part 1
+name = vsha512h
+a = 1, 2
+b = 3, 4
+c = 5, 6
+validate 11189044327219203, 7177611956453380
+target = sha3
+
+aarch64 = sha512h
+link-aarch64 = llvm.aarch64.crypto.sha512h
+generate uint64x2_t
+
+/// SHA512 hash update part 2
+name = vsha512h2
+a = 1, 2
+b = 3, 4
+c = 5, 6
+validate 5770237651009406214, 349133864969
+target = sha3
+
+aarch64 = sha512h2
+link-aarch64 = llvm.aarch64.crypto.sha512h2
+generate uint64x2_t
+
+/// SHA512 schedule update 0
+name = vsha512su0
+a = 1, 2
+b = 3, 4
+validate 144115188075855874, 9439544818968559619
+target = sha3
+
+aarch64 = sha512su0
+link-aarch64 = llvm.aarch64.crypto.sha512su0
+generate uint64x2_t
+
+/// SHA512 schedule update 1
+name = vsha512su1
+a = 1, 2
+b = 3, 4
+c = 5, 6
+validate 105553116266526, 140737488355368
+target = sha3
+
+aarch64 = sha512su1
+link-aarch64 = llvm.aarch64.crypto.sha512su1
+generate uint64x2_t
+
+/// Floating-point round to 32-bit integer, using current rounding mode
+name = vrnd32x
+a = 1.1, 1.9, -1.7, -2.3
+validate 1.0, 2.0, -2.0, -2.0
+target = frintts
+
+aarch64 = frint32x
+link-aarch64 = frint32x._EXT_
+generate float32x2_t, float32x4_t
+
+/// Floating-point round to 32-bit integer toward zero
+name = vrnd32z
+a = 1.1, 1.9, -1.7, -2.3
+validate 1.0, 1.0, -1.0, -2.0
+target = frintts
+
+aarch64 = frint32z
+link-aarch64 = frint32z._EXT_
+generate float32x2_t, float32x4_t
+
+/// Floating-point round to 64-bit integer, using current rounding mode
+name = vrnd64x
+a = 1.1, 1.9, -1.7, -2.3
+validate 1.0, 2.0, -2.0, -2.0
+target = frintts
+
+aarch64 = frint64x
+link-aarch64 = frint64x._EXT_
+generate float32x2_t, float32x4_t
+
+/// Floating-point round to 64-bit integer toward zero
+name = vrnd64z
+a = 1.1, 1.9, -1.7, -2.3
+validate 1.0, 1.0, -1.0, -2.0
+target = frintts
+
+aarch64 = frint64z
+link-aarch64 = frint64z._EXT_
+generate float32x2_t, float32x4_t
+
+/// Transpose elements
+name = vtrn
+multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len}
+multi_fn = simd_shuffle-in_len-!, b1:in_t, a, b, {transpose-2-in_len}
+multi_fn = transmute, (a1, b1)
+a = 0, 2, 2, 6, 2, 10, 6, 14, 2, 18, 6, 22, 10, 26, 14, 30
+b = 1, 3, 3, 7, 3, 1, 7, 15, 3, 19, 7, 23, 1, 27, 15, 31
+validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15, 2, 3, 6, 7, 10, 1, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+
+aarch64 = trn
+arm = vtrn
+generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t, int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t
+generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t, uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t
+generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t, poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t
+aarch64 = zip
+generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t
+
+/// Transpose elements
+name = vtrn
+multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len}
+multi_fn = simd_shuffle-in_len-!, b1:in_t, a, b, {transpose-2-in_len}
+multi_fn = transmute, (a1, b1)
+a = 0., 2., 2., 6.
+b = 1., 3., 3., 7.
+validate 0., 1., 2., 3., 2., 3., 6., 7.
+
+aarch64 = zip
+arm = vtrn
+generate float32x2_t:float32x2_t:float32x2x2_t
+aarch64 = trn
+generate float32x4_t:float32x4_t:float32x4x2_t
+
+/// Transpose vectors
+name = vtrn1
+multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
+a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+
+aarch64 = trn1
+generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
+
+aarch64 = zip1
+generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
+
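+// NOTE: TRN1 keeps the even-indexed lanes of both inputs, pairing lane 2*i of
+// `a` with lane 2*i of `b`; TRN2 does the same with the odd-indexed lanes. A
+// sketch for 8 lanes (helper name illustrative):
+//
+//     fn trn1(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
+//         let mut r = [0i8; 8];
+//         for i in 0..4 {
+//             r[2 * i] = a[2 * i];
+//             r[2 * i + 1] = b[2 * i];
+//         }
+//         r
+//     }
+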
+/// Transpose vectors
+name = vtrn1
+multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
+a = 0., 2., 4., 6., 8., 10., 12., 14.
+b = 1., 3., 5., 7., 9., 11., 13., 15.
+validate 0., 1., 4., 5., 8., 9., 12., 13.
+
+aarch64 = trn1
+generate float32x4_t
+
+aarch64 = zip1
+generate float32x2_t, float64x2_t
+
+/// Transpose vectors
+name = vtrn2
+multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
+a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+
+aarch64 = trn2
+generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
+
+aarch64 = zip2
+generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
+
+/// Transpose vectors
+name = vtrn2
+multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
+a = 0., 2., 4., 6., 8., 10., 12., 14.
+b = 1., 3., 5., 7., 9., 11., 13., 15.
+validate 2., 3., 6., 7., 10., 11., 14., 15.
+
+aarch64 = trn2
+generate float32x4_t
+
+aarch64 = zip2
+generate float32x2_t, float64x2_t
+
+/// Zip vectors
+name = vzip
+multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {zip-1-in_len}
+multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {zip-2-in_len}
+multi_fn = transmute, (a0, b0)
+a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+
+aarch64 = zip
+arm = vzip
+generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t
+generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t
+generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t
+arm = vtrn
+generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t
+aarch64 = ext
+arm = vorr
+generate int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t
+generate uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t
+generate poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t
+
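+// NOTE: vzip returns both halves of the interleave: the first result zips the
+// low halves of `a` and `b`, the second zips the high halves. A sketch for
+// 8 lanes (helper name illustrative):
+//
+//     fn zip(a: [i8; 8], b: [i8; 8]) -> ([i8; 8], [i8; 8]) {
+//         let (mut lo, mut hi) = ([0i8; 8], [0i8; 8]);
+//         for i in 0..4 {
+//             lo[2 * i] = a[i];
+//             lo[2 * i + 1] = b[i];
+//             hi[2 * i] = a[i + 4];
+//             hi[2 * i + 1] = b[i + 4];
+//         }
+//         (lo, hi)
+//     }
+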
+/// Zip vectors
+name = vzip
+multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {zip-1-in_len}
+multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {zip-2-in_len}
+multi_fn = transmute, (a0, b0)
+a = 1., 2., 3., 4.
+b = 5., 6., 7., 8.
+validate 1., 5., 2., 6., 3., 7., 4., 8.
+
+aarch64 = zip
+arm = vtrn
+generate float32x2_t:float32x2_t:float32x2x2_t
+aarch64 = ext
+arm = vorr
+generate float32x4_t:float32x4_t:float32x4x2_t
+
+/// Zip vectors
+name = vzip1
+multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
+a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = zip1
+generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t
+
+/// Zip vectors
+name = vzip1
+multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
+a = 0., 2., 4., 6., 8., 10., 12., 14.
+b = 1., 3., 5., 7., 9., 11., 13., 15.
+validate 0., 1., 2., 3., 4., 5., 6., 7.
+
+aarch64 = zip1
+generate float32x2_t, float32x4_t, float64x2_t
+
+/// Zip vectors
+name = vzip2
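+// NOTE: the step instructions implement Newton-Raphson refinement: per lane,
+// FRECPS returns 2 - a*b and FRSQRTS returns (3 - a*b) / 2, which matches the
+// validate rows above (e.g. a = b = 4.0 gives -14.0 and -6.5 respectively).
+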
+multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
+a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30
+b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31
+validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+
+aarch64 = zip2
+generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t
+
+/// Zip vectors
+name = vzip2
+multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
+a = 0., 8., 8., 10., 8., 10., 12., 14.
+b = 1., 9., 9., 11., 9., 11., 13., 15.
+validate 8., 9., 10., 11., 12., 13., 14., 15.
+
+aarch64 = zip2
+generate float32x2_t, float32x4_t, float64x2_t
+
+/// Unzip vectors
+name = vuzp
+multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {unzip-1-in_len}
+multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {unzip-2-in_len}
+multi_fn = transmute, (a0, b0)
+a = 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 15, 8, 16
+b = 2, 3, 3, 8, 3, 15, 8, 16, 3, 29, 8, 30, 15, 31, 16, 32
+validate 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16, 2, 3, 3, 8, 3, 8, 15, 16, 3, 8, 15, 16, 29, 30, 31, 32
+
+aarch64 = uzp
+arm = vuzp
+generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t, int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t
+generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t, uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t
+generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t, poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t
+aarch64 = zip
+arm = vtrn
+generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t
+
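+// NOTE: vuzp is the inverse of vzip: the first result gathers the even-indexed
+// lanes of `a` then `b`, the second the odd-indexed lanes. A sketch for
+// 8 lanes (helper name illustrative):
+//
+//     fn uzp(a: [i8; 8], b: [i8; 8]) -> ([i8; 8], [i8; 8]) {
+//         let (mut even, mut odd) = ([0i8; 8], [0i8; 8]);
+//         for i in 0..4 {
+//             even[i] = a[2 * i];
+//             even[i + 4] = b[2 * i];
+//             odd[i] = a[2 * i + 1];
+//             odd[i + 4] = b[2 * i + 1];
+//         }
+//         (even, odd)
+//     }
+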
+/// Unzip vectors
+name = vuzp
+multi_fn = simd_shuffle-in_len-!, a0:in_t, a, b, {unzip-1-in_len}
+multi_fn = simd_shuffle-in_len-!, b0:in_t, a, b, {unzip-2-in_len}
+multi_fn = transmute, (a0, b0)
+a = 1., 2., 2., 4.
+b = 2., 6., 6., 8.
+validate 1., 2., 2., 6., 2., 4., 6., 8.
+
+aarch64 = zip
+arm = vtrn
+generate float32x2_t:float32x2_t:float32x2x2_t
+aarch64 = uzp
+arm = vuzp
+generate float32x4_t:float32x4_t:float32x4x2_t
+
+/// Unzip vectors
+name = vuzp1
+multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
+a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0
+b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0
+validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16
+
+aarch64 = uzp1
+generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
+
+aarch64 = zip1
+generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
+
+/// Unzip vectors
+name = vuzp1
+multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
+a = 0., 8., 1., 9., 4., 12., 5., 13.
+b = 1., 10., 3., 11., 6., 14., 7., 15.
+validate 0., 1., 1., 3., 4., 5., 6., 7.
+
+aarch64 = uzp1
+generate float32x4_t
+
+aarch64 = zip1
+generate float32x2_t, float64x2_t
+
+/// Unzip vectors
+name = vuzp2
+multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
+a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24
+b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32
+validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32
+
+aarch64 = uzp2
+generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
+
+aarch64 = zip2
+generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
+
+/// Unzip vectors
+name = vuzp2
+multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
+a = 0., 8., 1., 9., 4., 12., 5., 13.
+b = 2., 9., 3., 11., 6., 14., 7., 15.
+validate 8., 9., 9., 11., 12., 13., 14., 15.
+
+aarch64 = uzp2
+generate float32x4_t
+
+aarch64 = zip2
+generate float32x2_t, float64x2_t
+
+////////////////////
+// Unsigned Absolute difference and Accumulate Long
+////////////////////
+
+/// Unsigned Absolute difference and Accumulate Long
+name = vabal
+multi_fn = vabd-unsigned-noext, b, c, d:in_t
+multi_fn = simd_add, a, {simd_cast, d}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
+validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20
+
+arm = vabal.s
+aarch64 = uabal
+generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
+
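+// NOTE: per lane this is acc + |b - c|, with the absolute difference widened
+// before accumulation. A scalar sketch (helper name illustrative; u8::abs_diff
+// is stable Rust):
+//
+//     fn uabal_lane(acc: u16, b: u8, c: u8) -> u16 {
+//         acc.wrapping_add(b.abs_diff(c) as u16)
+//     }
+//
+//     // uabal_lane(1, 1, 10) == 10, matching lane 0 of the rows above.
+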
+/// Unsigned Absolute difference and Accumulate Long
+name = vabal_high
+no-q
+multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = vabd_u8, d, e, f:uint8x8_t
+multi_fn = simd_add, a, {simd_cast, f}
+a = 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
+validate 20, 20, 20, 20, 20, 20, 20, 20
+
+aarch64 = uabal
+generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t
+
+/// Unsigned Absolute difference and Accumulate Long
+name = vabal_high
+no-q
+multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7]
+multi_fn = vabd_u16, d, e, f:uint16x4_t
+multi_fn = simd_add, a, {simd_cast, f}
+a = 9, 10, 11, 12
+b = 1, 2, 3, 4, 9, 10, 11, 12
+c = 10, 10, 10, 10, 20, 0, 2, 4
+validate 20, 20, 20, 20
+
+aarch64 = uabal
+generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
+
+/// Unsigned Absolute difference and Accumulate Long
+name = vabal_high
+no-q
+multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3]
+multi_fn = vabd_u32, d, e, f:uint32x2_t
+multi_fn = simd_add, a, {simd_cast, f}
+a = 15, 16
+b = 1, 2, 15, 16
+c = 10, 10, 10, 12
+validate 20, 20
+
+aarch64 = uabal
+generate uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
+
+////////////////////
+// Signed Absolute difference and Accumulate Long
+////////////////////
+
+/// Signed Absolute difference and Accumulate Long
+name = vabal
+multi_fn = vabd-signed-noext, b, c, d:int8x8_t
+multi_fn = simd_cast, e:uint8x8_t, d
+multi_fn = simd_add, a, {simd_cast, e}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
+validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20
+
+arm = vabal.s
+aarch64 = sabal
+generate int16x8_t:int8x8_t:int8x8_t:int16x8_t
+
+/// Signed Absolute difference and Accumulate Long
+name = vabal
+multi_fn = vabd-signed-noext, b, c, d:int16x4_t
+multi_fn = simd_cast, e:uint16x4_t, d
+multi_fn = simd_add, a, {simd_cast, e}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
+validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20
+
+arm = vabal.s
+aarch64 = sabal
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t
+
+/// Signed Absolute difference and Accumulate Long
+name = vabal
+multi_fn = vabd-signed-noext, b, c, d:int32x2_t
+multi_fn = simd_cast, e:uint32x2_t, d
+multi_fn = simd_add, a, {simd_cast, e}
+a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
+validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20
+
+arm = vabal.s
+aarch64 = sabal
+generate int64x2_t:int32x2_t:int32x2_t:int64x2_t
+
+/// Signed Absolute difference and Accumulate Long
+name = vabal_high
+no-q
+multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = vabd_s8, d, e, f:int8x8_t
+multi_fn = simd_cast, f:uint8x8_t, f
+multi_fn = simd_add, a, {simd_cast, f}
+a = 9, 10, 11, 12, 13, 14, 15, 16
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
+validate 20, 20, 20, 20, 20, 20, 20, 20
+
+aarch64 = sabal
+generate int16x8_t:int8x16_t:int8x16_t:int16x8_t
+
+/// Signed Absolute difference and Accumulate Long
+name = vabal_high
+no-q
+multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7]
+multi_fn = vabd_s16, d, e, f:int16x4_t
+multi_fn = simd_cast, f:uint16x4_t, f
+multi_fn = simd_add, a, {simd_cast, f}
+a = 9, 10, 11, 12
+b = 1, 2, 3, 4, 9, 10, 11, 12
+c = 10, 10, 10, 10, 20, 0, 2, 4
+validate 20, 20, 20, 20
+
+aarch64 = sabal
+generate int32x4_t:int16x8_t:int16x8_t:int32x4_t
+
+/// Signed Absolute difference and Accumulate Long
+name = vabal_high
+no-q
+multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3]
+multi_fn = vabd_s32, d, e, f:int32x2_t
+multi_fn = simd_cast, f:uint32x2_t, f
+multi_fn = simd_add, a, {simd_cast, f}
+a = 15, 16
+b = 1, 2, 15, 16
+c = 10, 10, 10, 12
+validate 20, 20
+
+aarch64 = sabal
+generate int64x2_t:int32x4_t:int32x4_t:int64x2_t
+
+////////////////////
+// Signed saturating Absolute value
+////////////////////
+
+/// Signed saturating Absolute value
+name = vqabs
+a = MIN, MAX, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5
+validate MAX, MAX, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5
+
+arm = vqabs.s
+aarch64 = sqabs
+link-arm = vqabs._EXT_
+link-aarch64 = sqabs._EXT_
+generate int*_t
+
+/// Signed saturating Absolute value
+name = vqabs
+a = MIN, -7
+validate MAX, 7
+
+aarch64 = sqabs
+link-aarch64 = sqabs._EXT_
+generate int64x*_t
+
+/// Signed saturating absolute value
+name = vqabs
+multi_fn = simd_extract, {vqabs-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+a = -7
+validate 7
+
+aarch64 = sqabs
+generate i8:i8, i16:i16
+
+/// Signed saturating absolute value
+name = vqabs
+a = -7
+validate 7
+
+aarch64 = sqabs
+link-aarch64 = sqabs._EXT_
+generate i32:i32, i64:i64
+
+/// Shift left and insert
+name = vsli
+n-suffix
+constn = N
+multi_fn = static_assert-N-0-63
+multi_fn = transmute, {vsli_n-in_ntt-::<N>, transmute(a), transmute(b)}
+a = 333
+b = 2042
+n = 2
+validate 8169
+
+aarch64 = sli
+generate i64, u64
+
+/// Shift right and insert
+name = vsri
+n-suffix
+constn = N
+multi_fn = static_assert-N-1-bits
+multi_fn = transmute, {vsri_n-in_ntt-::<N>, transmute(a), transmute(b)}
+a = 333
+b = 2042
+n = 2
+validate 510
+
+aarch64 = sri
+generate i64, u64
diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs
new file mode 100644
index 000000000..a2ae250a7
--- /dev/null
+++ b/library/stdarch/crates/stdarch-gen/src/main.rs
@@ -0,0 +1,3391 @@
+use self::Suffix::*;
+use self::TargetFeature::*;
+use std::env;
+use std::fs::File;
+use std::io::prelude::*;
+use std::io::{self, BufReader};
+use std::path::PathBuf;
+
+const IN: &str = "neon.spec";
+const ARM_OUT: &str = "generated.rs";
+const AARCH64_OUT: &str = "generated.rs";
+
+const UINT_TYPES: [&str; 6] = [
+ "uint8x8_t",
+ "uint8x16_t",
+ "uint16x4_t",
+ "uint16x8_t",
+ "uint32x2_t",
+ "uint32x4_t",
+];
+
+const UINT_TYPES_64: [&str; 2] = ["uint64x1_t", "uint64x2_t"];
+
+const INT_TYPES: [&str; 6] = [
+ "int8x8_t",
+ "int8x16_t",
+ "int16x4_t",
+ "int16x8_t",
+ "int32x2_t",
+ "int32x4_t",
+];
+
+const INT_TYPES_64: [&str; 2] = ["int64x1_t", "int64x2_t"];
+
+const FLOAT_TYPES: [&str; 2] = [
+ //"float8x8_t", not supported by rust
+ //"float8x16_t", not supported by rust
+ //"float16x4_t", not supported by rust
+ //"float16x8_t", not supported by rust
+ "float32x2_t",
+ "float32x4_t",
+];
+
+const FLOAT_TYPES_64: [&str; 2] = [
+ //"float8x8_t", not supported by rust
+ //"float8x16_t", not supported by rust
+ //"float16x4_t", not supported by rust
+ //"float16x8_t", not supported by rust
+ "float64x1_t",
+ "float64x2_t",
+];
+
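+/// Returns the total number of lanes a type name encodes: "int8x8_t" -> 8,
+/// "int16x4x2_t" -> 8 (4 lanes x 2 registers), scalars like "i64" -> 1.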
+fn type_len(t: &str) -> usize {
+    let s: Vec<_> = t.split('x').collect();
+ if s.len() == 2 {
+ match &s[1][0..2] {
+ "1_" => 1,
+ "2_" => 2,
+ "4_" => 4,
+ "8_" => 8,
+ "16" => 16,
+ _ => panic!("unknown type: {}", t),
+ }
+ } else if s.len() == 3 {
+ s[1].parse::<usize>().unwrap() * type_sub_len(t)
+ } else {
+ 1
+ }
+}
+
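+/// Returns the tuple arity of a structured type: "int8x8x2_t" -> 2; plain
+/// vector and scalar types -> 1.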
+fn type_sub_len(t: &str) -> usize {
+ let s: Vec<_> = t.split('x').collect();
+ if s.len() != 3 {
+ 1
+ } else {
+ match s[2] {
+ "2_t" => 2,
+ "3_t" => 3,
+ "4_t" => 4,
+ _ => panic!("unknown type len: {}", t),
+ }
+ }
+}
+
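+/// Returns the width of one lane in bits (8, 16, 32 or 64).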
+fn type_bits(t: &str) -> usize {
+ match t {
+ "int8x8_t" | "int8x16_t" | "uint8x8_t" | "uint8x16_t" | "poly8x8_t" | "poly8x16_t"
+ | "i8" | "u8" => 8,
+ "int16x4_t" | "int16x8_t" | "uint16x4_t" | "uint16x8_t" | "poly16x4_t" | "poly16x8_t"
+ | "i16" | "u16" => 16,
+ "int32x2_t" | "int32x4_t" | "uint32x2_t" | "uint32x4_t" | "i32" | "u32" | "float32x2_t"
+ | "float32x4_t" | "f32" => 32,
+ "int64x1_t" | "int64x2_t" | "uint64x1_t" | "uint64x2_t" | "poly64x1_t" | "poly64x2_t"
+ | "i64" | "u64" | "float64x1_t" | "float64x2_t" | "f64" => 64,
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
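+/// Returns log2 of the lane count (divided by `base_len`), i.e. the number of
+/// bits a lane-index immediate such as LANE may use.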
+fn type_exp_len(t: &str, base_len: usize) -> usize {
+ let t = type_to_sub_type(t);
+ let len = type_len(&t) / base_len;
+ match len {
+ 1 => 0,
+ 2 => 1,
+ 4 => 2,
+ 8 => 3,
+ 16 => 4,
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
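+/// Returns log2 of the lane width in bits (i8 -> 3, ..., i64 -> 6), used to
+/// bound shift-amount immediates like N.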
+fn type_bits_exp_len(t: &str) -> usize {
+ match t {
+ "int8x8_t" | "int8x16_t" | "uint8x8_t" | "uint8x16_t" | "poly8x8_t" | "poly8x16_t"
+ | "i8" | "u8" => 3,
+ "int16x4_t" | "int16x8_t" | "uint16x4_t" | "uint16x8_t" | "poly16x4_t" | "poly16x8_t"
+ | "i16" | "u16" => 4,
+ "int32x2_t" | "int32x4_t" | "uint32x2_t" | "uint32x4_t" | "i32" | "u32" => 5,
+ "int64x1_t" | "int64x2_t" | "uint64x1_t" | "uint64x2_t" | "poly64x1_t" | "poly64x2_t"
+ | "i64" | "u64" => 6,
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
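+/// Maps a type name to its intrinsic-name suffix, e.g. "int8x8_t" -> "_s8";
+/// a leading "q" marks 128-bit vectors and "b"/"h"/"s"/"d" mark scalars.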
+fn type_to_suffix(t: &str) -> &str {
+ match t {
+ "int8x8_t" => "_s8",
+ "int8x16_t" => "q_s8",
+ "int16x4_t" => "_s16",
+ "int16x8_t" => "q_s16",
+ "int32x2_t" => "_s32",
+ "int32x4_t" => "q_s32",
+ "int64x1_t" => "_s64",
+ "int64x2_t" => "q_s64",
+ "uint8x8_t" => "_u8",
+ "uint8x16_t" => "q_u8",
+ "uint16x4_t" => "_u16",
+ "uint16x8_t" => "q_u16",
+ "uint32x2_t" => "_u32",
+ "uint32x4_t" => "q_u32",
+ "uint64x1_t" => "_u64",
+ "uint64x2_t" => "q_u64",
+ "float16x4_t" => "_f16",
+ "float16x8_t" => "q_f16",
+ "float32x2_t" => "_f32",
+ "float32x4_t" => "q_f32",
+ "float64x1_t" => "_f64",
+ "float64x2_t" => "q_f64",
+ "poly8x8_t" => "_p8",
+ "poly8x16_t" => "q_p8",
+ "poly16x4_t" => "_p16",
+ "poly16x8_t" => "q_p16",
+ "poly64x1_t" => "_p64",
+ "poly64x2_t" => "q_p64",
+ "int8x8x2_t" => "_s8_x2",
+ "int8x8x3_t" => "_s8_x3",
+ "int8x8x4_t" => "_s8_x4",
+ "int16x4x2_t" => "_s16_x2",
+ "int16x4x3_t" => "_s16_x3",
+ "int16x4x4_t" => "_s16_x4",
+ "int32x2x2_t" => "_s32_x2",
+ "int32x2x3_t" => "_s32_x3",
+ "int32x2x4_t" => "_s32_x4",
+ "int64x1x2_t" => "_s64_x2",
+ "int64x1x3_t" => "_s64_x3",
+ "int64x1x4_t" => "_s64_x4",
+ "uint8x8x2_t" => "_u8_x2",
+ "uint8x8x3_t" => "_u8_x3",
+ "uint8x8x4_t" => "_u8_x4",
+ "uint16x4x2_t" => "_u16_x2",
+ "uint16x4x3_t" => "_u16_x3",
+ "uint16x4x4_t" => "_u16_x4",
+ "uint32x2x2_t" => "_u32_x2",
+ "uint32x2x3_t" => "_u32_x3",
+ "uint32x2x4_t" => "_u32_x4",
+ "uint64x1x2_t" => "_u64_x2",
+ "uint64x1x3_t" => "_u64_x3",
+ "uint64x1x4_t" => "_u64_x4",
+ "poly8x8x2_t" => "_p8_x2",
+ "poly8x8x3_t" => "_p8_x3",
+ "poly8x8x4_t" => "_p8_x4",
+ "poly16x4x2_t" => "_p16_x2",
+ "poly16x4x3_t" => "_p16_x3",
+ "poly16x4x4_t" => "_p16_x4",
+ "poly64x1x2_t" => "_p64_x2",
+ "poly64x1x3_t" => "_p64_x3",
+ "poly64x1x4_t" => "_p64_x4",
+ "float32x2x2_t" => "_f32_x2",
+ "float32x2x3_t" => "_f32_x3",
+ "float32x2x4_t" => "_f32_x4",
+ "float64x1x2_t" => "_f64_x2",
+ "float64x1x3_t" => "_f64_x3",
+ "float64x1x4_t" => "_f64_x4",
+ "int8x16x2_t" => "q_s8_x2",
+ "int8x16x3_t" => "q_s8_x3",
+ "int8x16x4_t" => "q_s8_x4",
+ "int16x8x2_t" => "q_s16_x2",
+ "int16x8x3_t" => "q_s16_x3",
+ "int16x8x4_t" => "q_s16_x4",
+ "int32x4x2_t" => "q_s32_x2",
+ "int32x4x3_t" => "q_s32_x3",
+ "int32x4x4_t" => "q_s32_x4",
+ "int64x2x2_t" => "q_s64_x2",
+ "int64x2x3_t" => "q_s64_x3",
+ "int64x2x4_t" => "q_s64_x4",
+ "uint8x16x2_t" => "q_u8_x2",
+ "uint8x16x3_t" => "q_u8_x3",
+ "uint8x16x4_t" => "q_u8_x4",
+ "uint16x8x2_t" => "q_u16_x2",
+ "uint16x8x3_t" => "q_u16_x3",
+ "uint16x8x4_t" => "q_u16_x4",
+ "uint32x4x2_t" => "q_u32_x2",
+ "uint32x4x3_t" => "q_u32_x3",
+ "uint32x4x4_t" => "q_u32_x4",
+ "uint64x2x2_t" => "q_u64_x2",
+ "uint64x2x3_t" => "q_u64_x3",
+ "uint64x2x4_t" => "q_u64_x4",
+ "poly8x16x2_t" => "q_p8_x2",
+ "poly8x16x3_t" => "q_p8_x3",
+ "poly8x16x4_t" => "q_p8_x4",
+ "poly16x8x2_t" => "q_p16_x2",
+ "poly16x8x3_t" => "q_p16_x3",
+ "poly16x8x4_t" => "q_p16_x4",
+ "poly64x2x2_t" => "q_p64_x2",
+ "poly64x2x3_t" => "q_p64_x3",
+ "poly64x2x4_t" => "q_p64_x4",
+ "float32x4x2_t" => "q_f32_x2",
+ "float32x4x3_t" => "q_f32_x3",
+ "float32x4x4_t" => "q_f32_x4",
+ "float64x2x2_t" => "q_f64_x2",
+ "float64x2x3_t" => "q_f64_x3",
+ "float64x2x4_t" => "q_f64_x4",
+ "i8" => "b_s8",
+ "i16" => "h_s16",
+ "i32" => "s_s32",
+ "i64" => "d_s64",
+ "u8" => "b_u8",
+ "u16" => "h_u16",
+ "u32" => "s_u32",
+ "u64" => "d_u64",
+ "f32" => "s_f32",
+ "f64" => "d_f64",
+ "p8" => "b_p8",
+ "p16" => "h_p16",
+ "p128" => "q_p128",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
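+/// Derives the `_dup_` suffix from a type, e.g. "int8x16_t" -> "q_dup_s8".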
+fn type_to_dup_suffix(t: &str) -> String {
+ let s: Vec<_> = type_to_suffix(t).split('_').collect();
+ assert_eq!(s.len(), 2);
+ format!("{}_dup_{}", s[0], s[1])
+}
+
+fn type_to_lane_suffix(t: &str) -> String {
+ let s: Vec<_> = type_to_suffix(t).split('_').collect();
+ assert_eq!(s.len(), 2);
+ format!("{}_lane_{}", s[0], s[1])
+}
+
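+/// Suffix for the `_n_` (immediate-argument) intrinsic variants, e.g.
+/// "int16x8_t" -> "q_n_s16".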
+fn type_to_n_suffix(t: &str) -> &str {
+ match t {
+ "int8x8_t" => "_n_s8",
+ "int8x16_t" => "q_n_s8",
+ "int16x4_t" => "_n_s16",
+ "int16x8_t" => "q_n_s16",
+ "int32x2_t" => "_n_s32",
+ "int32x4_t" => "q_n_s32",
+ "int64x1_t" => "_n_s64",
+ "int64x2_t" => "q_n_s64",
+ "uint8x8_t" => "_n_u8",
+ "uint8x16_t" => "q_n_u8",
+ "uint16x4_t" => "_n_u16",
+ "uint16x8_t" => "q_n_u16",
+ "uint32x2_t" => "_n_u32",
+ "uint32x4_t" => "q_n_u32",
+ "uint64x1_t" => "_n_u64",
+ "uint64x2_t" => "q_n_u64",
+ "float16x4_t" => "_n_f16",
+ "float16x8_t" => "q_n_f16",
+ "float32x2_t" => "_n_f32",
+ "float32x4_t" => "q_n_f32",
+ "float64x1_t" => "_n_f64",
+ "float64x2_t" => "q_n_f64",
+ "poly8x8_t" => "_n_p8",
+ "poly8x16_t" => "q_n_p8",
+ "poly16x4_t" => "_n_p16",
+ "poly16x8_t" => "q_n_p16",
+ "poly64x1_t" => "_n_p64",
+ "poly64x2_t" => "q_n_p64",
+ "i8" => "b_n_s8",
+ "i16" => "h_n_s16",
+ "i32" => "s_n_s32",
+ "i64" => "d_n_s64",
+ "u8" => "b_n_u8",
+ "u16" => "h_n_u16",
+ "u32" => "s_n_u32",
+ "u64" => "d_n_u64",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
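+/// Like `type_to_n_suffix`, but without the 128-bit "q" marker.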
+fn type_to_noq_n_suffix(t: &str) -> &str {
+ match t {
+ "int8x8_t" | "int8x16_t" => "_n_s8",
+ "int16x4_t" | "int16x8_t" => "_n_s16",
+ "int32x2_t" | "int32x4_t" => "_n_s32",
+ "int64x1_t" | "int64x2_t" => "_n_s64",
+ "uint8x8_t" | "uint8x16_t" => "_n_u8",
+ "uint16x4_t" | "uint16x8_t" => "_n_u16",
+ "uint32x2_t" | "uint32x4_t" => "_n_u32",
+ "uint64x1_t" | "uint64x2_t" => "_n_u64",
+ "float16x4_t" | "float16x8_t" => "_n_f16",
+ "float32x2_t" | "float32x4_t" => "_n_f32",
+ "float64x1_t" | "float64x2_t" => "_n_f64",
+ "poly8x8_t" | "poly8x16_t" => "_n_p8",
+ "poly16x4_t" | "poly16x8_t" => "_n_p16",
+ "poly64x1_t" | "poly64x2_t" => "_n_p64",
+ "i8" => "b_n_s8",
+ "i16" => "h_n_s16",
+ "i32" => "s_n_s32",
+ "i64" => "d_n_s64",
+ "u8" => "b_n_u8",
+ "u16" => "h_n_u16",
+ "u32" => "s_n_u32",
+ "u64" => "d_n_u64",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
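+/// Builds a `_lane` suffix from the output and input types; with
+/// `re_to_out` the element part is taken from the output type instead
+/// of the input type.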
+fn type_to_lane_suffixes(out_t: &str, in_t: &str, re_to_out: bool) -> String {
+ let mut s = String::new();
+ let suf = type_to_suffix(out_t);
+ if !suf.starts_with('_') {
+ s.push_str(&suf[0..1]);
+ }
+ s.push_str("_lane");
+ if !re_to_out {
+ s.push_str(type_to_suffix(in_t));
+ } else {
+ if type_to_suffix(in_t).starts_with('q') {
+ s.push_str("q");
+ }
+ s.push_str(type_to_noq_suffix(out_t));
+ }
+ s
+}
+
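+/// Suffix for the complex-rotation intrinsics, keeping "q" next to the
+/// base name, e.g. ("vcmla_rot90", "q_f32") -> "vcmlaq_rot90_f32".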
+fn type_to_rot_suffix(c_name: &str, suf: &str) -> String {
+ let ns: Vec<_> = c_name.split('_').collect();
+ assert_eq!(ns.len(), 2);
+ if suf.starts_with("q") {
+ format!("{}q_{}{}", ns[0], ns[1], &suf[1..])
+ } else {
+ format!("{}{}", c_name, suf)
+ }
+}
+
+fn type_to_signed(t: &str) -> String {
+ t.replace("uint", "int").replace("poly", "int")
+}
+
+fn type_to_unsigned(t: &str) -> String {
+ if t.contains("uint") {
+ return t.to_string();
+ }
+ t.replace("int", "uint").replace("poly", "uint")
+}
+
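+/// The three `*_double_*` helpers below build two-type suffixes (output
+/// type first), e.g. type_to_double_suffixes("int16x8_t", "int8x16_t")
+/// == "q_s16_s8".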
+fn type_to_double_suffixes(out_t: &str, in_t: &str) -> String {
+ let mut s = String::new();
+ let suf = type_to_suffix(in_t);
+ if suf.starts_with('q') && type_to_suffix(out_t).starts_with('q') {
+ s.push_str("q");
+ }
+ if !suf.starts_with('_') && !suf.starts_with('q') {
+ s.push_str(&suf[0..1]);
+ }
+ s.push_str(type_to_noq_suffix(out_t));
+ s.push_str(type_to_noq_suffix(in_t));
+ s
+}
+
+fn type_to_double_n_suffixes(out_t: &str, in_t: &str) -> String {
+ let mut s = String::new();
+ let suf = type_to_suffix(in_t);
+ if suf.starts_with('q') && type_to_suffix(out_t).starts_with('q') {
+ s.push_str("q");
+ }
+ if !suf.starts_with('_') && !suf.starts_with('q') {
+ s.push_str(&suf[0..1]);
+ }
+ s.push_str("_n");
+ s.push_str(type_to_noq_suffix(out_t));
+ s.push_str(type_to_noq_suffix(in_t));
+ s
+}
+
+fn type_to_noq_double_suffixes(out_t: &str, in_t: &str) -> String {
+ format!("{}{}", type_to_noq_suffix(out_t), type_to_noq_suffix(in_t))
+}
+
+fn type_to_noq_suffix(t: &str) -> &str {
+ match t {
+ "int8x8_t" | "int8x16_t" | "i8" => "_s8",
+ "int16x4_t" | "int16x8_t" | "i16" => "_s16",
+ "int32x2_t" | "int32x4_t" | "i32" => "_s32",
+ "int64x1_t" | "int64x2_t" | "i64" => "_s64",
+ "uint8x8_t" | "uint8x16_t" | "u8" => "_u8",
+ "uint16x4_t" | "uint16x8_t" | "u16" => "_u16",
+ "uint32x2_t" | "uint32x4_t" | "u32" => "_u32",
+ "uint64x1_t" | "uint64x2_t" | "u64" => "_u64",
+ "float16x4_t" | "float16x8_t" => "_f16",
+ "float32x2_t" | "float32x4_t" | "f32" => "_f32",
+ "float64x1_t" | "float64x2_t" | "f64" => "_f64",
+ "poly8x8_t" | "poly8x16_t" => "_p8",
+ "poly16x4_t" | "poly16x8_t" => "_p16",
+ "poly64x1_t" | "poly64x2_t" | "p64" => "_p64",
+ "p128" => "_p128",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
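+/// Selects how an intrinsic's name suffix is derived from its input and
+/// output types, via the `type_to_*suffix*` helpers above.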
+#[derive(Clone, Copy)]
+enum Suffix {
+ Normal,
+ Double,
+ NoQ,
+ NoQDouble,
+ NSuffix,
+ DoubleN,
+ NoQNSuffix,
+ OutSuffix,
+ OutNSuffix,
+ OutNox,
+ In1Nox,
+ OutDupNox,
+ OutLaneNox,
+ In1LaneNox,
+ Lane,
+ In2,
+ In2Lane,
+ OutLane,
+ Rot,
+ RotLane,
+}
+
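+/// Target-feature gate for an intrinsic; translated into
+/// `#[target_feature(enable = ...)]` strings by the generators.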
+#[derive(Clone, Copy)]
+enum TargetFeature {
+ Default,
+ ArmV7,
+ Vfp4,
+ FPArmV8,
+ AES,
+ FCMA,
+ Dotprod,
+ I8MM,
+ SHA3,
+ RDM,
+ SM4,
+ FTTS,
+}
+
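+/// Distinguishes plain intrinsics from loads and stores, which take
+/// pointer arguments and use dedicated test generators.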
+#[derive(Clone, Copy)]
+enum Fntype {
+ Normal,
+ Load,
+ Store,
+}
+
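+/// Maps a NEON type to the internal SIMD type used in generated tests,
+/// e.g. "uint16x8_t" -> "u16x8"; tuple types collapse to their element
+/// vector.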
+fn type_to_global_type(t: &str) -> &str {
+ match t {
+ "int8x8_t" | "int8x8x2_t" | "int8x8x3_t" | "int8x8x4_t" => "i8x8",
+ "int8x16_t" | "int8x16x2_t" | "int8x16x3_t" | "int8x16x4_t" => "i8x16",
+ "int16x4_t" | "int16x4x2_t" | "int16x4x3_t" | "int16x4x4_t" => "i16x4",
+ "int16x8_t" | "int16x8x2_t" | "int16x8x3_t" | "int16x8x4_t" => "i16x8",
+ "int32x2_t" | "int32x2x2_t" | "int32x2x3_t" | "int32x2x4_t" => "i32x2",
+ "int32x4_t" | "int32x4x2_t" | "int32x4x3_t" | "int32x4x4_t" => "i32x4",
+ "int64x1_t" | "int64x1x2_t" | "int64x1x3_t" | "int64x1x4_t" => "i64x1",
+ "int64x2_t" | "int64x2x2_t" | "int64x2x3_t" | "int64x2x4_t" => "i64x2",
+ "uint8x8_t" | "uint8x8x2_t" | "uint8x8x3_t" | "uint8x8x4_t" => "u8x8",
+ "uint8x16_t" | "uint8x16x2_t" | "uint8x16x3_t" | "uint8x16x4_t" => "u8x16",
+ "uint16x4_t" | "uint16x4x2_t" | "uint16x4x3_t" | "uint16x4x4_t" => "u16x4",
+ "uint16x8_t" | "uint16x8x2_t" | "uint16x8x3_t" | "uint16x8x4_t" => "u16x8",
+ "uint32x2_t" | "uint32x2x2_t" | "uint32x2x3_t" | "uint32x2x4_t" => "u32x2",
+ "uint32x4_t" | "uint32x4x2_t" | "uint32x4x3_t" | "uint32x4x4_t" => "u32x4",
+ "uint64x1_t" | "uint64x1x2_t" | "uint64x1x3_t" | "uint64x1x4_t" => "u64x1",
+ "uint64x2_t" | "uint64x2x2_t" | "uint64x2x3_t" | "uint64x2x4_t" => "u64x2",
+ "float16x4_t" => "f16x4",
+ "float16x8_t" => "f16x8",
+ "float32x2_t" | "float32x2x2_t" | "float32x2x3_t" | "float32x2x4_t" => "f32x2",
+ "float32x4_t" | "float32x4x2_t" | "float32x4x3_t" | "float32x4x4_t" => "f32x4",
+ "float64x1_t" | "float64x1x2_t" | "float64x1x3_t" | "float64x1x4_t" => "f64",
+ "float64x2_t" | "float64x2x2_t" | "float64x2x3_t" | "float64x2x4_t" => "f64x2",
+ "poly8x8_t" | "poly8x8x2_t" | "poly8x8x3_t" | "poly8x8x4_t" => "i8x8",
+ "poly8x16_t" | "poly8x16x2_t" | "poly8x16x3_t" | "poly8x16x4_t" => "i8x16",
+ "poly16x4_t" | "poly16x4x2_t" | "poly16x4x3_t" | "poly16x4x4_t" => "i16x4",
+ "poly16x8_t" | "poly16x8x2_t" | "poly16x8x3_t" | "poly16x8x4_t" => "i16x8",
+ "poly64x1_t" | "poly64x1x2_t" | "poly64x1x3_t" | "poly64x1x4_t" => "i64x1",
+ "poly64x2_t" | "poly64x2x2_t" | "poly64x2x3_t" | "poly64x2x4_t" => "i64x2",
+ "i8" => "i8",
+ "i16" => "i16",
+ "i32" => "i32",
+ "i64" => "i64",
+ "u8" => "u8",
+ "u16" => "u16",
+ "u32" => "u32",
+ "u64" => "u64",
+ "f32" => "f32",
+ "f64" => "f64",
+ "p8" => "p8",
+ "p16" => "p16",
+ "p64" => "p64",
+ "p128" => "p128",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
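+/// Strips the tuple arity from a type, e.g. "int8x8x2_t" -> "int8x8_t".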
+fn type_to_sub_type(t: &str) -> String {
+ let s: Vec<_> = t.split('x').collect();
+ match s.len() {
+ 2 => String::from(t),
+ 3 => format!("{}x{}_t", s[0], s[1]),
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
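+/// Element type of a vector, or pointee of a pointer parameter, e.g.
+/// "uint16x8_t" -> "u16" and "*const i8" -> "i8".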
+fn type_to_native_type(t: &str) -> String {
+ let s: Vec<_> = t.split('x').collect();
+ match s.len() {
+ 1 => {
+ assert!(t.contains("*const") || t.contains("*mut"));
+ let sub: Vec<_> = t.split(' ').collect();
+ String::from(sub[1])
+ }
+ 2 | 3 => match &s[0][0..3] {
+ "int" => format!("i{}", &s[0][3..]),
+ "uin" => format!("u{}", &s[0][4..]),
+ "flo" => format!("f{}", &s[0][5..]),
+ "pol" => format!("u{}", &s[0][4..]),
+ _ => panic!("unknown type: {}", t),
+ },
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
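+// Inverse mappings: element type back to its 64-bit vector, to its
+// 128-bit ("long") vector, and a 128-bit vector to its 64-bit half.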
+fn native_type_to_type(t: &str) -> &str {
+ match t {
+ "i8" => "int8x8_t",
+ "i16" => "int16x4_t",
+ "i32" => "int32x2_t",
+ "i64" => "int64x1_t",
+ "u8" => "uint8x8_t",
+ "u16" => "uint16x4_t",
+ "u32" => "uint32x2_t",
+ "u64" => "uint64x1_t",
+ "f16" => "float16x4_t",
+ "f32" => "float32x2_t",
+ "f64" => "float64x1_t",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
+fn native_type_to_long_type(t: &str) -> &str {
+ match t {
+ "i8" => "int8x16_t",
+ "i16" => "int16x8_t",
+ "i32" => "int32x4_t",
+ "i64" => "int64x2_t",
+ "u8" => "uint8x16_t",
+ "u16" => "uint16x8_t",
+ "u32" => "uint32x4_t",
+ "u64" => "uint64x2_t",
+ "f16" => "float16x8_t",
+ "f32" => "float32x4_t",
+ "f64" => "float64x2_t",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
+fn type_to_half(t: &str) -> &str {
+ match t {
+ "int8x16_t" => "int8x8_t",
+ "int16x8_t" => "int16x4_t",
+ "int32x4_t" => "int32x2_t",
+ "int64x2_t" => "int64x1_t",
+ "uint8x16_t" => "uint8x8_t",
+ "uint16x8_t" => "uint16x4_t",
+ "uint32x4_t" => "uint32x2_t",
+ "uint64x2_t" => "uint64x1_t",
+ "poly8x16_t" => "poly8x8_t",
+ "poly16x8_t" => "poly16x4_t",
+ "float32x4_t" => "float32x2_t",
+ "float64x2_t" => "float64x1_t",
+ _ => panic!("unknown half type for {}", t),
+ }
+}
+
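+/// Renders an ascending shuffle-index literal, e.g. asc(1, 4) ==
+/// "[1, 2, 3, 4]".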
+fn asc(start: i32, len: usize) -> String {
+ let mut s = String::from("[");
+ for i in 0..len {
+ if i != 0 {
+ s.push_str(", ");
+ }
+ let n = start + i as i32;
+ s.push_str(&n.to_string());
+ }
+ s.push_str("]");
+ s
+}
+
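+// Shuffle-index tables for the transpose/zip/unzip intrinsics: each
+// function returns the simd_shuffle index literal for the given lane
+// count.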
+fn transpose1(x: usize) -> &'static str {
+ match x {
+ 2 => "[0, 2]",
+ 4 => "[0, 4, 2, 6]",
+ 8 => "[0, 8, 2, 10, 4, 12, 6, 14]",
+ 16 => "[0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]",
+ _ => panic!("unknown transpose order of len {}", x),
+ }
+}
+
+fn transpose2(x: usize) -> &'static str {
+ match x {
+ 2 => "[1, 3]",
+ 4 => "[1, 5, 3, 7]",
+ 8 => "[1, 9, 3, 11, 5, 13, 7, 15]",
+ 16 => "[1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31]",
+ _ => panic!("unknown transpose order of len {}", x),
+ }
+}
+
+fn zip1(x: usize) -> &'static str {
+ match x {
+ 2 => "[0, 2]",
+ 4 => "[0, 4, 1, 5]",
+ 8 => "[0, 8, 1, 9, 2, 10, 3, 11]",
+ 16 => "[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]",
+ _ => panic!("unknown zip order of len {}", x),
+ }
+}
+
+fn zip2(x: usize) -> &'static str {
+ match x {
+ 2 => "[1, 3]",
+ 4 => "[2, 6, 3, 7]",
+ 8 => "[4, 12, 5, 13, 6, 14, 7, 15]",
+ 16 => "[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]",
+ _ => panic!("unknown zip order of len {}", x),
+ }
+}
+
+fn unzip1(x: usize) -> &'static str {
+ match x {
+ 2 => "[0, 2]",
+ 4 => "[0, 2, 4, 6]",
+ 8 => "[0, 2, 4, 6, 8, 10, 12, 14]",
+ 16 => "[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]",
+ _ => panic!("unknown unzip order of len {}", x),
+ }
+}
+
+fn unzip2(x: usize) -> &'static str {
+ match x {
+ 2 => "[1, 3]",
+ 4 => "[1, 3, 5, 7]",
+ 8 => "[1, 3, 5, 7, 9, 11, 13, 15]",
+ 16 => "[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]",
+ _ => panic!("unknown unzip order of len {}", x),
+ }
+}
+
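+/// Renders the type annotation and initializer of a test value, e.g.
+/// ": u16x8 = u16x8::new(...)", mapping symbolic values via `map_val`.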
+fn values(t: &str, vs: &[String]) -> String {
+ if vs.len() == 1 && !t.contains('x') {
+ format!(": {} = {}", t, vs[0])
+ } else if vs.len() == 1 && type_to_global_type(t) == "f64" {
+ format!(": {} = {}", type_to_global_type(t), vs[0])
+ } else {
+ let s: Vec<_> = t.split('x').collect();
+ if s.len() == 3 {
+ format!(
+ ": [{}; {}] = [{}]",
+ type_to_native_type(t),
+ type_len(t),
+ vs.iter()
+ .map(|v| map_val(type_to_global_type(t), v))
+ //.map(|v| format!("{}{}", v, type_to_native_type(t)))
+ .collect::<Vec<_>>()
+ .join(", ")
+ )
+ } else {
+ format!(
+ ": {} = {}::new({})",
+ type_to_global_type(t),
+ type_to_global_type(t),
+ vs.iter()
+ .map(|v| map_val(type_to_global_type(t), v))
+ //.map(|v| format!("{}{}", v, type_to_native_type(t)))
+ .collect::<Vec<_>>()
+ .join(", ")
+ )
+ }
+ }
+}
+
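+// Per-type constants substituted for the symbolic test values
+// (MAX, MIN, TRUE, FF, BITS, ...) by `map_val` below.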
+fn max_val(t: &str) -> &'static str {
+ match &t[..3] {
+ "u8x" => "0xFF",
+ "u16" => "0xFF_FF",
+ "u32" => "0xFF_FF_FF_FF",
+ "u64" => "0xFF_FF_FF_FF_FF_FF_FF_FF",
+ "i8x" => "0x7F",
+ "i16" => "0x7F_FF",
+ "i32" => "0x7F_FF_FF_FF",
+ "i64" => "0x7F_FF_FF_FF_FF_FF_FF_FF",
+ "f32" => "3.40282347e+38",
+ "f64" => "1.7976931348623157e+308",
+ _ => panic!("No TRUE for type {}", t),
+ }
+}
+
+fn min_val(t: &str) -> &'static str {
+ match &t[..3] {
+ "u8x" => "0",
+ "u16" => "0",
+ "u32" => "0",
+ "u64" => "0",
+ "i8x" => "-128",
+ "i16" => "-32768",
+ "i32" => "-2147483648",
+ "i64" => "-9223372036854775808",
+ "f32" => "-3.40282347e+38",
+ "f64" => "-1.7976931348623157e+308",
+ _ => panic!("No TRUE for type {}", t),
+ }
+}
+
+fn true_val(t: &str) -> &'static str {
+ match &t[..3] {
+ "u8x" => "0xFF",
+ "u16" => "0xFF_FF",
+ "u32" => "0xFF_FF_FF_FF",
+ "u64" => "0xFF_FF_FF_FF_FF_FF_FF_FF",
+ _ => panic!("No TRUE for type {}", t),
+ }
+}
+
+fn ff_val(t: &str) -> &'static str {
+ match &t[..3] {
+ "u8x" => "0xFF",
+ "u16" => "0xFF_FF",
+ "u32" => "0xFF_FF_FF_FF",
+ "u64" => "0xFF_FF_FF_FF_FF_FF_FF_FF",
+ "i8x" => "0xFF",
+ "i16" => "0xFF_FF",
+ "i32" => "0xFF_FF_FF_FF",
+ "i64" => "0xFF_FF_FF_FF_FF_FF_FF_FF",
+ _ => panic!("No TRUE for type {}", t),
+ }
+}
+
+fn false_val(_t: &str) -> &'static str {
+ "0"
+}
+
+fn bits(t: &str) -> &'static str {
+ match &t[..3] {
+ "u8x" => "8",
+ "u16" => "16",
+ "u32" => "32",
+ "u64" => "64",
+ "i8x" => "8",
+ "i16" => "16",
+ "i32" => "32",
+ "i64" => "64",
+ "p8x" => "8",
+ "p16" => "16",
+ "p64" => "64",
+ _ => panic!("Unknown bits for type {}", t),
+ }
+}
+
+fn bits_minus_one(t: &str) -> &'static str {
+ match &t[..3] {
+ "u8x" => "7",
+ "u16" => "15",
+ "u32" => "31",
+ "u64" => "63",
+ "i8x" => "7",
+ "i16" => "15",
+ "i32" => "31",
+ "i64" => "63",
+ "p8x" => "7",
+ "p16" => "15",
+ "p64" => "63",
+ _ => panic!("Unknown bits for type {}", t),
+ }
+}
+
+fn half_bits(t: &str) -> &'static str {
+ match &t[..3] {
+ "u8x" => "4",
+ "u16" => "8",
+ "u32" => "16",
+ "u64" => "32",
+ "i8x" => "4",
+ "i16" => "8",
+ "i32" => "16",
+ "i64" => "32",
+ "p8x" => "4",
+ "p16" => "8",
+ "p64" => "32",
+ _ => panic!("Unknown bits for type {}", t),
+ }
+}
+
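+// Lane count (and half the lane count) as string literals, used for the
+// LEN and HFLEN placeholders.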
+fn type_len_str(t: &str) -> &'static str {
+ match t {
+ "int8x8_t" => "8",
+ "int8x16_t" => "16",
+ "int16x4_t" => "4",
+ "int16x8_t" => "8",
+ "int32x2_t" => "2",
+ "int32x4_t" => "4",
+ "int64x1_t" => "1",
+ "int64x2_t" => "2",
+ "uint8x8_t" => "8",
+ "uint8x16_t" => "16",
+ "uint16x4_t" => "4",
+ "uint16x8_t" => "8",
+ "uint32x2_t" => "2",
+ "uint32x4_t" => "4",
+ "uint64x1_t" => "1",
+ "uint64x2_t" => "2",
+ "float16x4_t" => "4",
+ "float16x8_t" => "8",
+ "float32x2_t" => "2",
+ "float32x4_t" => "4",
+ "float64x1_t" => "1",
+ "float64x2_t" => "2",
+ "poly8x8_t" => "8",
+ "poly8x16_t" => "16",
+ "poly16x4_t" => "4",
+ "poly16x8_t" => "8",
+ "poly64x1_t" => "1",
+ "poly64x2_t" => "2",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
+fn type_half_len_str(t: &str) -> &'static str {
+ match t {
+ "int8x8_t" => "4",
+ "int8x16_t" => "8",
+ "int16x4_t" => "2",
+ "int16x8_t" => "4",
+ "int32x2_t" => "1",
+ "int32x4_t" => "2",
+ "int64x1_t" => "0",
+ "int64x2_t" => "1",
+ "uint8x8_t" => "4",
+ "uint8x16_t" => "8",
+ "uint16x4_t" => "2",
+ "uint16x8_t" => "4",
+ "uint32x2_t" => "1",
+ "uint32x4_t" => "2",
+ "uint64x1_t" => "0",
+ "uint64x2_t" => "1",
+ "float16x4_t" => "2",
+ "float16x8_t" => "4",
+ "float32x2_t" => "1",
+ "float32x4_t" => "2",
+ "float64x1_t" => "0",
+ "float64x2_t" => "1",
+ "poly8x8_t" => "4",
+ "poly8x16_t" => "8",
+ "poly16x4_t" => "2",
+ "poly16x8_t" => "4",
+ "poly64x1_t" => "0",
+ "poly64x2_t" => "1",
+ _ => panic!("unknown type: {}", t),
+ }
+}
+
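+/// Expands a symbolic test value ("MAX", "BITS", ...) to a concrete
+/// literal for the given type; anything else passes through unchanged.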
+fn map_val<'v>(t: &str, v: &'v str) -> &'v str {
+ match v {
+ "FALSE" => false_val(t),
+ "TRUE" => true_val(t),
+ "MAX" => max_val(t),
+ "MIN" => min_val(t),
+ "FF" => ff_val(t),
+ "BITS" => bits(t),
+ "BITS_M1" => bits_minus_one(t),
+ "HFBITS" => half_bits(t),
+ "LEN" => type_len_str(t),
+ "HFLEN" => type_half_len_str(t),
+ o => o,
+ }
+}
+
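+/// Builds the type segment of an LLVM intrinsic link name, e.g. "v8i16"
+/// for "uint16x8_t". For tuple types, `v` selects a typed vector-pointer
+/// suffix, `pi8` forces a plain ".p0i8" suffix, and `r` swaps the two
+/// dot-separated segments.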
+fn type_to_ext(t: &str, v: bool, r: bool, pi8: bool) -> String {
+ if !t.contains('x') {
+ return t.replace("u", "i");
+ }
+ let native = type_to_native_type(t);
+ let sub_ext = match type_sub_len(t) {
+ 1 => String::new(),
+ _ if v => format!(
+ ".p0v{}{}",
+ type_len(&type_to_sub_type(t)),
+ native
+ ),
+ _ if pi8 => String::from(".p0i8"),
+ _ => format!(".p0{}", native),
+ };
+ let sub_type = match &native[0..1] {
+ "i" | "f" => native,
+ "u" => native.replace("u", "i"),
+ _ => panic!("unknown type: {}", t),
+ };
+ let ext = format!(
+ "v{}{}{}",
+ type_len(&type_to_sub_type(t)),
+ sub_type,
+ sub_ext
+ );
+ if r {
+ let ss: Vec<_> = ext.split('.').collect();
+ if ss.len() != 2 {
+ ext
+ } else {
+ format!("{}.{}", ss[1], ss[0])
+ }
+ } else {
+ ext
+ }
+}
+
+fn ext(s: &str, in_t: &[&str; 3], out_t: &str) -> String {
+ s.replace("_EXT_", &type_to_ext(in_t[0], false, false, false))
+ .replace("_EXT2_", &type_to_ext(out_t, false, false, false))
+ .replace("_EXT3_", &type_to_ext(in_t[1], false, false, false))
+ .replace("_EXT4_", &type_to_ext(in_t[2], false, false, false))
+ .replace("_EXTr3_", &type_to_ext(in_t[1], false, true, false))
+ .replace("_EXTv2_", &type_to_ext(out_t, true, false, false))
+ .replace("_EXTpi8_", &type_to_ext(in_t[1], false, false, true))
+ .replace("_EXTpi82_", &type_to_ext(out_t, false, false, true))
+ .replace("_EXTpi8r_", &type_to_ext(in_t[1], false, true, true))
+}
+
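+// is_vldx/is_vstx detect the multi-register vld2/3/4 and vst2/3/4
+// intrinsics on signed and float element types, which link to LLVM
+// through a pointer argument.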
+fn is_vldx(name: &str) -> bool {
+ let s: Vec<_> = name.split('_').collect();
+ &name[0..3] == "vld"
+ && name[3..4].parse::<i32>().unwrap() > 1
+ && (s.last().unwrap().starts_with("s") || s.last().unwrap().starts_with("f"))
+}
+
+fn is_vstx(name: &str) -> bool {
+ let s: Vec<_> = name.split('_').collect();
+ s.len() == 2
+ && &name[0..3] == "vst"
+ && name[3..4].parse::<i32>().unwrap() > 1
+ && (s[1].starts_with("s") || s[1].starts_with("f"))
+}
+
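+/// Emits the AArch64-only definition of one intrinsic (doc comment,
+/// attributes, optional extern "unadjusted" link block, and body)
+/// together with its test; returns `(function, test)`.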
+#[allow(clippy::too_many_arguments)]
+fn gen_aarch64(
+ current_comment: &str,
+ current_fn: &Option<String>,
+ current_name: &str,
+ current_aarch64: &Option<String>,
+ link_aarch64: &Option<String>,
+ const_aarch64: &Option<String>,
+ constn: &Option<String>,
+ in_t: &[&str; 3],
+ out_t: &str,
+ current_tests: &[(
+ Vec<String>,
+ Vec<String>,
+ Vec<String>,
+ Option<String>,
+ Vec<String>,
+ )],
+ suffix: Suffix,
+ para_num: i32,
+ target: TargetFeature,
+ fixed: &Vec<String>,
+ multi_fn: &Vec<String>,
+ fn_type: Fntype,
+) -> (String, String) {
+ let name = match suffix {
+ Normal => format!("{}{}", current_name, type_to_suffix(in_t[1])),
+ NoQ => format!("{}{}", current_name, type_to_noq_suffix(in_t[1])),
+ Double => format!(
+ "{}{}",
+ current_name,
+ type_to_double_suffixes(out_t, in_t[1])
+ ),
+ NoQDouble => format!(
+ "{}{}",
+ current_name,
+ type_to_noq_double_suffixes(out_t, in_t[1])
+ ),
+ NSuffix => format!("{}{}", current_name, type_to_n_suffix(in_t[1])),
+ DoubleN => format!(
+ "{}{}",
+ current_name,
+ type_to_double_n_suffixes(out_t, in_t[1])
+ ),
+ NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])),
+ OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
+ OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)),
+ OutNox => format!(
+ "{}{}",
+ current_name,
+ type_to_suffix(&type_to_sub_type(out_t))
+ ),
+ In1Nox => format!(
+ "{}{}",
+ current_name,
+ type_to_suffix(&type_to_sub_type(in_t[1]))
+ ),
+ OutDupNox => format!(
+ "{}{}",
+ current_name,
+ type_to_dup_suffix(&type_to_sub_type(out_t))
+ ),
+ OutLaneNox => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffix(&type_to_sub_type(out_t))
+ ),
+ In1LaneNox => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffix(&type_to_sub_type(in_t[1]))
+ ),
+ Lane => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffixes(out_t, in_t[1], false)
+ ),
+ In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
+ In2Lane => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffixes(out_t, in_t[2], false)
+ ),
+ OutLane => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffixes(out_t, in_t[2], true)
+ ),
+ Rot => type_to_rot_suffix(current_name, type_to_suffix(out_t)),
+ RotLane => type_to_rot_suffix(current_name, &type_to_lane_suffixes(out_t, in_t[2], false)),
+ };
+ let current_target = match target {
+ Default => "neon",
+ ArmV7 => "neon",
+ Vfp4 => "neon",
+ FPArmV8 => "neon",
+ AES => "neon,aes",
+ FCMA => "neon,fcma",
+ Dotprod => "neon,dotprod",
+ I8MM => "neon,i8mm",
+ SHA3 => "neon,sha3",
+ RDM => "rdm",
+ SM4 => "neon,sm4",
+ FTTS => "neon,frintts",
+ };
+ let current_fn = if let Some(current_fn) = current_fn.clone() {
+ if link_aarch64.is_some() {
+ panic!("[{}] Can't specify link and fn at the same time.", name)
+ }
+ current_fn
+ } else if link_aarch64.is_some() {
+ format!("{}_", name)
+ } else {
+ if multi_fn.is_empty() {
+ panic!(
+ "[{}] Either a (multi) fn or a link-aarch64 must be specified.",
+ name
+ )
+ }
+ String::new()
+ };
+ let current_aarch64 = current_aarch64.clone().unwrap();
+ let mut link_t: Vec<String> = vec![
+ in_t[0].to_string(),
+ in_t[1].to_string(),
+ in_t[2].to_string(),
+ out_t.to_string(),
+ ];
+ let mut ext_c = String::new();
+ if let Some(mut link_aarch64) = link_aarch64.clone() {
+ if link_aarch64.contains(":") {
+ let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect();
+ assert_eq!(links.len(), 5);
+ link_aarch64 = links[0].to_string();
+ link_t = vec![
+ links[1].clone(),
+ links[2].clone(),
+ links[3].clone(),
+ links[4].clone(),
+ ];
+ }
+ let link_aarch64 = if link_aarch64.starts_with("llvm") {
+ ext(&link_aarch64, in_t, out_t)
+ } else {
+ let mut link = String::from("llvm.aarch64.neon.");
+ link.push_str(&link_aarch64);
+ ext(&link, in_t, out_t)
+ };
+ let (ext_inputs, ext_output) = {
+ if const_aarch64.is_some() {
+ if !matches!(fn_type, Fntype::Normal) {
+ let ptr_type = match fn_type {
+ Fntype::Load => "*const i8",
+ Fntype::Store => "*mut i8",
+ _ => panic!("unsupported fn type"),
+ };
+ let sub = type_to_sub_type(in_t[1]);
+ (
+ match type_sub_len(in_t[1]) {
+ 1 => format!("a: {}, n: i64, ptr: {}", sub, ptr_type),
+ 2 => format!("a: {}, b: {}, n: i64, ptr: {}", sub, sub, ptr_type),
+ 3 => format!(
+ "a: {}, b: {}, c: {}, n: i64, ptr: {}",
+ sub, sub, sub, ptr_type
+ ),
+ 4 => format!(
+ "a: {}, b: {}, c: {}, d: {}, n: i64, ptr: {}",
+ sub, sub, sub, sub, ptr_type
+ ),
+ _ => panic!("unsupported type: {}", in_t[1]),
+ },
+ if out_t != "void" {
+ format!(" -> {}", out_t)
+ } else {
+ String::new()
+ },
+ )
+ } else {
+ (
+ match para_num {
+ 1 => format!("a: {}, n: i32", in_t[0]),
+ 2 => format!("a: {}, b: {}, n: i32", in_t[0], in_t[1]),
+ 3 => format!("a: {}, b: {}, c: {}, n: i32", in_t[0], in_t[1], in_t[2]),
+ _ => unimplemented!("unknown para_num"),
+ },
+ format!(" -> {}", out_t),
+ )
+ }
+ } else if matches!(fn_type, Fntype::Store) {
+ let sub = type_to_sub_type(in_t[1]);
+ let ptr_type = if is_vstx(&name) {
+ "i8".to_string()
+ } else {
+ type_to_native_type(in_t[1])
+ };
+ let subs = match type_sub_len(in_t[1]) {
+ 1 => format!("a: {}", sub),
+ 2 => format!("a: {}, b: {}", sub, sub),
+ 3 => format!("a: {}, b: {}, c: {}", sub, sub, sub),
+ 4 => format!("a: {}, b: {}, c: {}, d: {}", sub, sub, sub, sub),
+ _ => panic!("unsupported type: {}", in_t[1]),
+ };
+ (format!("{}, ptr: *mut {}", subs, ptr_type), String::new())
+ } else if is_vldx(&name) {
+ let ptr_type = if name.contains("dup") {
+ type_to_native_type(out_t)
+ } else {
+ type_to_sub_type(out_t)
+ };
+ (
+ format!("ptr: *const {}", ptr_type),
+ format!(" -> {}", out_t),
+ )
+ } else {
+ (
+ match para_num {
+ 1 => format!("a: {}", link_t[0]),
+ 2 => format!("a: {}, b: {}", link_t[0], link_t[1]),
+ 3 => format!("a: {}, b: {}, c: {}", link_t[0], link_t[1], link_t[2]),
+ _ => unimplemented!("unknown para_num"),
+ },
+ format!(" -> {}", link_t[3]),
+ )
+ }
+ };
+ ext_c = format!(
+ r#"#[allow(improper_ctypes)]
+ extern "unadjusted" {{
+ #[cfg_attr(target_arch = "aarch64", link_name = "{}")]
+ fn {}({}){};
+ }}
+ "#,
+ link_aarch64, current_fn, ext_inputs, ext_output,
+ );
+ };
+ let const_declare = if let Some(constn) = constn {
+ if constn.contains(":") {
+ let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
+ assert_eq!(constns.len(), 2);
+ format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1])
+ } else {
+ format!(r#"<const {}: i32>"#, constn)
+ }
+ } else {
+ String::new()
+ };
+ let multi_calls = if !multi_fn.is_empty() {
+ let mut calls = String::new();
+ for i in 0..multi_fn.len() {
+ if i > 0 {
+ calls.push_str("\n ");
+ }
+ calls.push_str(&get_call(
+ &multi_fn[i],
+ current_name,
+ &const_declare,
+ in_t,
+ out_t,
+ fixed,
+ None,
+ true,
+ ));
+ }
+ calls
+ } else {
+ String::new()
+ };
+ let const_assert = if let Some(constn) = constn {
+ if constn.contains(":") {
+ let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
+ let const_test = current_tests[0].3.as_ref().unwrap();
+ let const_tests: Vec<_> = const_test.split(':').map(|v| v.to_string()).collect();
+ assert_eq!(constns.len(), 2);
+ assert_eq!(const_tests.len(), 2);
+ format!(
+ r#", {} = {}, {} = {}"#,
+ constns[0],
+ map_val(in_t[1], &const_tests[0]),
+ constns[1],
+ map_val(in_t[1], &const_tests[1]),
+ )
+ } else {
+ format!(
+ r#", {} = {}"#,
+ constn,
+ map_val(in_t[1], current_tests[0].3.as_ref().unwrap())
+ )
+ }
+ } else {
+ String::new()
+ };
+ let const_legacy = if let Some(constn) = constn {
+ if constn.contains(":") {
+ format!(
+ "\n#[rustc_legacy_const_generics({}, {})]",
+ para_num - 1,
+ para_num + 1
+ )
+ } else {
+ format!("\n#[rustc_legacy_const_generics({})]", para_num)
+ }
+ } else {
+ String::new()
+ };
+ let fn_decl = {
+ let fn_output = if out_t == "void" {
+ String::new()
+ } else {
+ format!("-> {} ", out_t)
+ };
+ let fn_inputs = match para_num {
+ 1 => format!("(a: {})", in_t[0]),
+ 2 => format!("(a: {}, b: {})", in_t[0], in_t[1]),
+ 3 => format!("(a: {}, b: {}, c: {})", in_t[0], in_t[1], in_t[2]),
+ _ => panic!("unsupported parameter number"),
+ };
+ format!(
+ "pub unsafe fn {}{}{} {}",
+ name, const_declare, fn_inputs, fn_output
+ )
+ };
+ let call_params = {
+ if let (Some(const_aarch64), Some(_)) = (const_aarch64, link_aarch64) {
+ if !matches!(fn_type, Fntype::Normal) {
+ let subs = match type_sub_len(in_t[1]) {
+ 1 => "b",
+ 2 => "b.0, b.1",
+ 3 => "b.0, b.1, b.2",
+ 4 => "b.0, b.1, b.2, b.3",
+ _ => panic!("unsupported type: {}", in_t[1]),
+ };
+ format!(
+ r#"{}
+ {}{}({}, {} as i64, a as _)"#,
+ multi_calls,
+ ext_c,
+ current_fn,
+ subs,
+ constn.as_deref().unwrap()
+ )
+ } else {
+ match para_num {
+ 1 => format!(
+ r#"{}
+ {}{}(a, {})"#,
+ multi_calls, ext_c, current_fn, const_aarch64
+ ),
+ 2 => format!(
+ r#"{}
+ {}{}(a, b, {})"#,
+ multi_calls, ext_c, current_fn, const_aarch64
+ ),
+ _ => String::new(),
+ }
+ }
+ } else if link_aarch64.is_some() && matches!(fn_type, Fntype::Store) {
+ let cast = if is_vstx(&name) { " as _" } else { "" };
+ match type_sub_len(in_t[1]) {
+ 1 => format!(r#"{}{}(b, a{})"#, ext_c, current_fn, cast),
+ 2 => format!(r#"{}{}(b.0, b.1, a{})"#, ext_c, current_fn, cast),
+ 3 => format!(r#"{}{}(b.0, b.1, b.2, a{})"#, ext_c, current_fn, cast),
+ 4 => format!(r#"{}{}(b.0, b.1, b.2, b.3, a{})"#, ext_c, current_fn, cast),
+ _ => panic!("unsupported type: {}", in_t[1]),
+ }
+ } else if link_aarch64.is_some() && is_vldx(&name) {
+ format!(r#"{}{}(a as _)"#, ext_c, current_fn,)
+ } else {
+ let trans: [&str; 2] = if link_t[3] != out_t {
+ ["transmute(", ")"]
+ } else {
+ ["", ""]
+ };
+ match (multi_calls.len(), para_num, fixed.len()) {
+ (0, 1, 0) => format!(r#"{}{}{}(a){}"#, ext_c, trans[0], current_fn, trans[1]),
+ (0, 1, _) => {
+ let fixed: Vec<String> =
+ fixed.iter().take(type_len(in_t[0])).cloned().collect();
+ format!(
+ r#"let b{};
+ {}{}{}(a, transmute(b)){}"#,
+ values(in_t[0], &fixed),
+ ext_c,
+ trans[0],
+ current_fn,
+ trans[1],
+ )
+ }
+ (0, 2, _) => format!(r#"{}{}{}(a, b){}"#, ext_c, trans[0], current_fn, trans[1],),
+ (0, 3, _) => format!(r#"{}{}(a, b, c)"#, ext_c, current_fn,),
+ (_, 1, _) => format!(r#"{}{}"#, ext_c, multi_calls,),
+ (_, 2, _) => format!(r#"{}{}"#, ext_c, multi_calls,),
+ (_, 3, _) => format!(r#"{}{}"#, ext_c, multi_calls,),
+ (_, _, _) => String::new(),
+ }
+ }
+ };
+ let stable = match target {
+ Default | ArmV7 | Vfp4 | FPArmV8 | AES => {
+ String::from("\n#[stable(feature = \"neon_intrinsics\", since = \"1.59.0\")]")
+ }
+ RDM => String::from("\n#[stable(feature = \"rdm_intrinsics\", since = \"1.62.0\")]"),
+ _ => String::new(),
+ };
+ let function = format!(
+ r#"
+{}
+#[inline]
+#[target_feature(enable = "{}")]
+#[cfg_attr(test, assert_instr({}{}))]{}{}
+{}{{
+ {}
+}}
+"#,
+ current_comment,
+ current_target,
+ current_aarch64,
+ const_assert,
+ const_legacy,
+ stable,
+ fn_decl,
+ call_params
+ );
+ let test_target = match target {
+ I8MM => "neon,i8mm",
+ SM4 => "neon,sm4",
+ SHA3 => "neon,sha3",
+ FTTS => "neon,frintts",
+ _ => "neon",
+ };
+ let test = match fn_type {
+ Fntype::Normal => gen_test(
+ &name,
+ in_t,
+ &out_t,
+ current_tests,
+ [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
+ type_len(out_t),
+ para_num,
+ test_target,
+ ),
+ Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)),
+ Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])),
+ };
+ (function, test)
+}
+
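+/// Test generator for load intrinsics: loads from the deliberately
+/// unaligned pointer `a[1..].as_ptr()` and compares the transmuted
+/// result against `e`.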
+fn gen_load_test(
+ name: &str,
+ in_t: &[&str; 3],
+ out_t: &str,
+ current_tests: &[(
+ Vec<String>,
+ Vec<String>,
+ Vec<String>,
+ Option<String>,
+ Vec<String>,
+ )],
+ type_len: usize,
+) -> String {
+ let mut test = format!(
+ r#"
+ #[simd_test(enable = "neon")]
+ unsafe fn test_{}() {{"#,
+ name,
+ );
+ for (a, b, _, n, e) in current_tests {
+ let a: Vec<String> = a.iter().take(type_len + 1).cloned().collect();
+ let e: Vec<String> = e.iter().take(type_len).cloned().collect();
+ let has_b = !b.is_empty();
+ let has_n = n.is_some();
+ let mut input = String::from("[");
+ for i in 0..type_len + 1 {
+ if i != 0 {
+ input.push_str(", ");
+ }
+ input.push_str(&a[i])
+ }
+ input.push_str("]");
+ let output = |v: &Vec<String>| {
+ let mut output = String::from("[");
+ for i in 0..type_sub_len(out_t) {
+ if i != 0 {
+ output.push_str(", ");
+ }
+ let sub_len = type_len / type_sub_len(out_t);
+ if type_to_global_type(out_t) != "f64" {
+ let mut sub_output = format!("{}::new(", type_to_global_type(out_t));
+ for j in 0..sub_len {
+ if j != 0 {
+ sub_output.push_str(", ");
+ }
+ sub_output.push_str(&v[i * sub_len + j]);
+ }
+ sub_output.push_str(")");
+ output.push_str(&sub_output);
+ } else {
+ output.push_str(&v[i]);
+ }
+ }
+ output.push_str("]");
+ output
+ };
+ let input_b = if has_b {
+ let b: Vec<String> = b.iter().take(type_len).cloned().collect();
+ format!(
+ r#"
+ let b: [{}; {}] = {};"#,
+ type_to_global_type(in_t[1]),
+ type_sub_len(in_t[1]),
+ output(&b),
+ )
+ } else {
+ String::new()
+ };
+ let t = format!(
+ r#"
+ let a: [{}; {}] = {};{}
+ let e: [{}; {}] = {};
+ let r: [{}; {}] = transmute({}{}(a[1..].as_ptr(){}));
+ assert_eq!(r, e);
+"#,
+ type_to_native_type(out_t),
+ type_len + 1,
+ input,
+ input_b,
+ type_to_global_type(out_t),
+ type_sub_len(out_t),
+ output(&e),
+ type_to_global_type(out_t),
+ type_sub_len(out_t),
+ name,
+ if has_n {
+ format!("::<{}>", n.as_deref().unwrap())
+ } else {
+ String::new()
+ },
+ if has_b { ", transmute(b)" } else { "" },
+ );
+ test.push_str(&t);
+ }
+ test.push_str(" }\n");
+ test
+}
+
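+/// Test generator for store intrinsics: stores through `r.as_mut_ptr()`
+/// and compares the written buffer against `e`.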
+fn gen_store_test(
+ name: &str,
+ in_t: &[&str; 3],
+ _out_t: &str,
+ current_tests: &[(
+ Vec<String>,
+ Vec<String>,
+ Vec<String>,
+ Option<String>,
+ Vec<String>,
+ )],
+ type_len: usize,
+) -> String {
+ let mut test = format!(
+ r#"
+ #[simd_test(enable = "neon")]
+ unsafe fn test_{}() {{"#,
+ name,
+ );
+ for (a, _, _, constn, e) in current_tests {
+ let a: Vec<String> = a.iter().take(type_len + 1).cloned().collect();
+ let e: Vec<String> = e.iter().take(type_len).cloned().collect();
+ let mut input = String::from("[");
+ for i in 0..type_len + 1 {
+ if i != 0 {
+ input.push_str(", ");
+ }
+ input.push_str(&a[i])
+ }
+ input.push_str("]");
+ let mut output = String::from("[");
+ for i in 0..type_len {
+ if i != 0 {
+ output.push_str(", ");
+ }
+ output.push_str(&e[i])
+ }
+ output.push_str("]");
+ let const_n = constn
+ .as_deref()
+ .map_or(String::new(), |n| format!("::<{}>", n));
+ let t = format!(
+ r#"
+ let a: [{}; {}] = {};
+ let e: [{}; {}] = {};
+ let mut r: [{}; {}] = [0{}; {}];
+ {}{}(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
+ assert_eq!(r, e);
+"#,
+ type_to_native_type(in_t[1]),
+ type_len + 1,
+ input,
+ type_to_native_type(in_t[1]),
+ type_len,
+ output,
+ type_to_native_type(in_t[1]),
+ type_len,
+ type_to_native_type(in_t[1]),
+ type_len,
+ name,
+ const_n,
+ );
+ test.push_str(&t);
+ }
+ test.push_str(" }\n");
+ test
+}
+
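+/// Test generator for ordinary intrinsics taking one to three vector
+/// arguments, with optional const generics.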
+fn gen_test(
+ name: &str,
+ in_t: &[&str; 3],
+ out_t: &str,
+ current_tests: &[(
+ Vec<String>,
+ Vec<String>,
+ Vec<String>,
+ Option<String>,
+ Vec<String>,
+ )],
+ len_in: [usize; 3],
+ len_out: usize,
+ para_num: i32,
+ target: &str,
+) -> String {
+ let mut test = format!(
+ r#"
+ #[simd_test(enable = "{}")]
+ unsafe fn test_{}() {{"#,
+ target, name,
+ );
+ for (a, b, c, n, e) in current_tests {
+ let a: Vec<String> = a.iter().take(len_in[0]).cloned().collect();
+ let b: Vec<String> = b.iter().take(len_in[1]).cloned().collect();
+ let c: Vec<String> = c.iter().take(len_in[2]).cloned().collect();
+ let e: Vec<String> = e.iter().take(len_out).cloned().collect();
+ let const_value = if let Some(constn) = n {
+ if constn.contains(":") {
+ let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
+ format!(
+ r#"::<{}, {}>"#,
+ map_val(in_t[1], &constns[0]),
+ map_val(in_t[1], &constns[1])
+ )
+ } else {
+ format!(r#"::<{}>"#, map_val(in_t[1], constn))
+ }
+ } else {
+ String::new()
+ };
+ let r_type = match type_sub_len(out_t) {
+ 1 => type_to_global_type(out_t).to_string(),
+ _ => format!("[{}; {}]", type_to_native_type(out_t), type_len(out_t)),
+ };
+ let t = {
+ match para_num {
+ 1 => {
+ format!(
+ r#"
+ let a{};
+ let e{};
+ let r: {} = transmute({}{}(transmute(a)));
+ assert_eq!(r, e);
+"#,
+ values(in_t[0], &a),
+ values(out_t, &e),
+ r_type,
+ name,
+ const_value
+ )
+ }
+ 2 => {
+ format!(
+ r#"
+ let a{};
+ let b{};
+ let e{};
+ let r: {} = transmute({}{}(transmute(a), transmute(b)));
+ assert_eq!(r, e);
+"#,
+ values(in_t[0], &a),
+ values(in_t[1], &b),
+ values(out_t, &e),
+ r_type,
+ name,
+ const_value
+ )
+ }
+ 3 => {
+ format!(
+ r#"
+ let a{};
+ let b{};
+ let c{};
+ let e{};
+ let r: {} = transmute({}{}(transmute(a), transmute(b), transmute(c)));
+ assert_eq!(r, e);
+"#,
+ values(in_t[0], &a),
+ values(in_t[1], &b),
+ values(in_t[2], &c),
+ values(out_t, &e),
+ r_type,
+ name,
+ const_value
+ )
+ }
+ _ => {
+ panic!("unsupported para_num: {}", para_num)
+ }
+ }
+ };
+
+ test.push_str(&t);
+ }
+ test.push_str(" }\n");
+ test
+}
+
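+/// Emits the ARM + AArch64 definition of one intrinsic. With `separate`,
+/// two `#[cfg(target_arch)]`-gated functions are produced because the
+/// two targets need different link signatures; otherwise one function
+/// serves both.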
+#[allow(clippy::too_many_arguments)]
+fn gen_arm(
+ current_comment: &str,
+ current_fn: &Option<String>,
+ current_name: &str,
+ current_arm: &str,
+ link_arm: &Option<String>,
+ current_aarch64: &Option<String>,
+ link_aarch64: &Option<String>,
+ const_arm: &Option<String>,
+ const_aarch64: &Option<String>,
+ constn: &Option<String>,
+ in_t: &[&str; 3],
+ out_t: &str,
+ current_tests: &[(
+ Vec<String>,
+ Vec<String>,
+ Vec<String>,
+ Option<String>,
+ Vec<String>,
+ )],
+ suffix: Suffix,
+ para_num: i32,
+ target: TargetFeature,
+ fixed: &Vec<String>,
+ multi_fn: &Vec<String>,
+ fn_type: Fntype,
+ separate: bool,
+) -> (String, String) {
+ let name = match suffix {
+ Normal => format!("{}{}", current_name, type_to_suffix(in_t[1])),
+ NoQ => format!("{}{}", current_name, type_to_noq_suffix(in_t[1])),
+ Double => format!(
+ "{}{}",
+ current_name,
+ type_to_double_suffixes(out_t, in_t[1])
+ ),
+ NoQDouble => format!(
+ "{}{}",
+ current_name,
+ type_to_noq_double_suffixes(out_t, in_t[1])
+ ),
+ NSuffix => format!("{}{}", current_name, type_to_n_suffix(in_t[1])),
+ DoubleN => format!(
+ "{}{}",
+ current_name,
+ type_to_double_n_suffixes(out_t, in_t[1])
+ ),
+ NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])),
+ OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
+ OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)),
+ OutNox => format!(
+ "{}{}",
+ current_name,
+ type_to_suffix(&type_to_sub_type(out_t))
+ ),
+ In1Nox => format!(
+ "{}{}",
+ current_name,
+ type_to_suffix(&type_to_sub_type(in_t[1]))
+ ),
+ OutDupNox => format!(
+ "{}{}",
+ current_name,
+ type_to_dup_suffix(&type_to_sub_type(out_t))
+ ),
+ OutLaneNox => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffix(&type_to_sub_type(out_t))
+ ),
+ In1LaneNox => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffix(&type_to_sub_type(in_t[1]))
+ ),
+ Lane => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffixes(out_t, in_t[1], false)
+ ),
+ In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
+ In2Lane => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffixes(out_t, in_t[2], false)
+ ),
+ OutLane => format!(
+ "{}{}",
+ current_name,
+ type_to_lane_suffixes(out_t, in_t[2], true)
+ ),
+ Rot => type_to_rot_suffix(current_name, type_to_suffix(out_t)),
+ RotLane => type_to_rot_suffix(current_name, &type_to_lane_suffixes(out_t, in_t[2], false)),
+ };
+ let current_aarch64 = current_aarch64
+ .clone()
+ .unwrap_or_else(|| current_arm.to_string());
+ let current_target_aarch64 = match target {
+ Default => "neon",
+ ArmV7 => "neon",
+ Vfp4 => "neon",
+ FPArmV8 => "neon",
+ AES => "neon,aes",
+ FCMA => "neon,fcma",
+ Dotprod => "neon,dotprod",
+ I8MM => "neon,i8mm",
+ SHA3 => "neon,sha3",
+ RDM => "rdm",
+ SM4 => "neon,sm4",
+ FTTS => "neon,frintts",
+ };
+ let current_target_arm = match target {
+ Default => "v7",
+ ArmV7 => "v7",
+ Vfp4 => "vfp4",
+ FPArmV8 => "fp-armv8,v8",
+ AES => "aes,v8",
+ FCMA => "v8", // v8.3a
+ Dotprod => "v8", // v8.2a
+ I8MM => "v8,i8mm",
+ RDM => unreachable!(),
+ SM4 => unreachable!(),
+ SHA3 => unreachable!(),
+ FTTS => unreachable!(),
+ };
+ let current_fn = if let Some(current_fn) = current_fn.clone() {
+ if link_aarch64.is_some() || link_arm.is_some() {
+ panic!(
+ "[{}] Can't specify link and function at the same time. {} / {:?} / {:?}",
+ name, current_fn, link_aarch64, link_arm
+ )
+ }
+ current_fn
+ } else if link_aarch64.is_some() || link_arm.is_some() {
+ format!("{}_", name)
+ } else {
+ if multi_fn.is_empty() {
+ panic!(
+ "[{}] Either fn, or link-arm and link-aarch64, must be specified.",
+ name
+ )
+ }
+ String::new()
+ };
+ let mut ext_c = String::new();
+ let mut ext_c_arm = if multi_fn.is_empty() || link_arm.is_none() {
+ String::new()
+ } else {
+ String::from(
+ r#"
+ "#,
+ )
+ };
+ let mut ext_c_aarch64 = if multi_fn.is_empty() || link_aarch64.is_none() {
+ String::new()
+ } else {
+ String::from(
+ r#"
+ "#,
+ )
+ };
+ let mut link_arm_t: Vec<String> = vec![
+ in_t[0].to_string(),
+ in_t[1].to_string(),
+ in_t[2].to_string(),
+ out_t.to_string(),
+ ];
+ let mut link_aarch64_t: Vec<String> = vec![
+ in_t[0].to_string(),
+ in_t[1].to_string(),
+ in_t[2].to_string(),
+ out_t.to_string(),
+ ];
+ if let (Some(mut link_arm), Some(mut link_aarch64)) = (link_arm.clone(), link_aarch64.clone()) {
+ if link_arm.contains(":") {
+ let links: Vec<_> = link_arm.split(':').map(|v| v.to_string()).collect();
+ assert_eq!(links.len(), 5);
+ link_arm = links[0].to_string();
+ link_arm_t = vec![
+ links[1].clone(),
+ links[2].clone(),
+ links[3].clone(),
+ links[4].clone(),
+ ];
+ }
+ if link_aarch64.contains(":") {
+ let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect();
+ assert_eq!(links.len(), 5);
+ link_aarch64 = links[0].to_string();
+ link_aarch64_t = vec![
+ links[1].clone(),
+ links[2].clone(),
+ links[3].clone(),
+ links[4].clone(),
+ ];
+ }
+ let link_arm = if link_arm.starts_with("llvm") {
+ ext(&link_arm, in_t, out_t)
+ } else {
+ let mut link = String::from("llvm.arm.neon.");
+ link.push_str(&link_arm);
+ ext(&link, in_t, out_t)
+ };
+ let link_aarch64 = if link_aarch64.starts_with("llvm") {
+ ext(&link_aarch64, in_t, out_t)
+ } else {
+ let mut link = String::from("llvm.aarch64.neon.");
+ link.push_str(&link_aarch64);
+ ext(&link, in_t, out_t)
+ };
+ if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] {
+ ext_c = format!(
+ r#"#[allow(improper_ctypes)]
+ extern "unadjusted" {{
+ #[cfg_attr(target_arch = "arm", link_name = "{}")]
+ #[cfg_attr(target_arch = "aarch64", link_name = "{}")]
+ fn {}({}) -> {};
+ }}
+"#,
+ link_arm,
+ link_aarch64,
+ current_fn,
+ match para_num {
+ 1 => format!("a: {}", in_t[0]),
+ 2 => format!("a: {}, b: {}", in_t[0], in_t[1]),
+ 3 => format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2]),
+ _ => unimplemented!("unknown para_num"),
+ },
+ out_t
+ );
+ };
+ let (arm_ext_inputs, arm_ext_output) = {
+ if let Some(const_arm) = const_arm {
+ if !matches!(fn_type, Fntype::Normal) {
+ let ptr_type = match fn_type {
+ Fntype::Load => "*const i8",
+ Fntype::Store => "*mut i8",
+ _ => panic!("unsupported fn type"),
+ };
+ let sub_type = type_to_sub_type(in_t[1]);
+ let inputs = match type_sub_len(in_t[1]) {
+ 1 => format!("a: {}", sub_type),
+ 2 => format!("a: {}, b: {}", sub_type, sub_type,),
+ 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,),
+ 4 => format!(
+ "a: {}, b: {}, c: {}, d: {}",
+ sub_type, sub_type, sub_type, sub_type,
+ ),
+ _ => panic!("unknown type: {}", in_t[1]),
+ };
+ let out = if out_t == "void" {
+ String::new()
+ } else {
+ format!(" -> {}", out_t)
+ };
+ (
+ format!("ptr: {}, {}, n: i32, size: i32", ptr_type, inputs),
+ out,
+ )
+ } else {
+ let (_, const_type) = if const_arm.contains(":") {
+ let consts: Vec<_> =
+ const_arm.split(':').map(|v| v.trim().to_string()).collect();
+ (consts[0].clone(), consts[1].clone())
+ } else {
+ (
+ const_arm.to_string(),
+ in_t[para_num as usize - 1].to_string(),
+ )
+ };
+ (
+ match para_num {
+ 1 => format!("a: {}, n: {}", in_t[0], const_type),
+ 2 => format!("a: {}, b: {}, n: {}", in_t[0], in_t[1], const_type),
+ 3 => format!(
+ "a: {}, b: {}, c: {}, n: {}",
+ in_t[0], in_t[1], in_t[2], const_type
+ ),
+ _ => unimplemented!("unknown para_num"),
+ },
+ format!(" -> {}", out_t),
+ )
+ }
+ } else if out_t != link_arm_t[3] {
+ (
+ match para_num {
+ 1 => format!("a: {}", link_arm_t[0]),
+ 2 => format!("a: {}, b: {}", link_arm_t[0], link_arm_t[1]),
+ 3 => format!(
+ "a: {}, b: {}, c: {}",
+ link_arm_t[0], link_arm_t[1], link_arm_t[2]
+ ),
+ _ => unimplemented!("unknown para_num"),
+ },
+ format!(" -> {}", link_arm_t[3]),
+ )
+ } else if matches!(fn_type, Fntype::Store) {
+ let sub_type = type_to_sub_type(in_t[1]);
+ let inputs = match type_sub_len(in_t[1]) {
+ 1 => format!("a: {}", sub_type),
+ 2 => format!("a: {}, b: {}", sub_type, sub_type,),
+ 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,),
+ 4 => format!(
+ "a: {}, b: {}, c: {}, d: {}",
+ sub_type, sub_type, sub_type, sub_type,
+ ),
+ _ => panic!("unknown type: {}", in_t[1]),
+ };
+ let (ptr_type, size) = if is_vstx(&name) {
+ ("i8".to_string(), ", size: i32")
+ } else {
+ (type_to_native_type(in_t[1]), "")
+ };
+ (
+ format!("ptr: *mut {}, {}{}", ptr_type, inputs, size),
+ String::new(),
+ )
+ } else if is_vldx(&name) {
+ (
+ format!("ptr: *const i8, size: i32"),
+ format!(" -> {}", out_t),
+ )
+ } else {
+ (String::new(), String::new())
+ }
+ };
+ ext_c_arm.push_str(&format!(
+ r#"#[allow(improper_ctypes)]
+ extern "unadjusted" {{
+ #[cfg_attr(target_arch = "arm", link_name = "{}")]
+ fn {}({}){};
+ }}
+"#,
+ link_arm, current_fn, arm_ext_inputs, arm_ext_output,
+ ));
+ let (aarch64_ext_inputs, aarch64_ext_output) = {
+ if let Some(const_aarch64) = const_aarch64 {
+ if !matches!(fn_type, Fntype::Normal) {
+ let ptr_type = match fn_type {
+ Fntype::Load => "*const i8",
+ Fntype::Store => "*mut i8",
+ _ => panic!("unsupported fn type"),
+ };
+ let sub_type = type_to_sub_type(in_t[1]);
+ let mut inputs = match type_sub_len(in_t[1]) {
+ 1 => format!("a: {}", sub_type,),
+ 2 => format!("a: {}, b: {}", sub_type, sub_type,),
+ 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,),
+ 4 => format!(
+ "a: {}, b: {}, c: {}, d: {}",
+ sub_type, sub_type, sub_type, sub_type,
+ ),
+ _ => panic!("unknown type: {}", in_t[1]),
+ };
+ inputs.push_str(&format!(", n: i64, ptr: {}", ptr_type));
+ let out = if out_t == "void" {
+ String::new()
+ } else {
+ format!(" -> {}", out_t)
+ };
+ (inputs, out)
+ } else if const_aarch64.contains("dup-in_len-N as ttn") {
+ (
+ match para_num {
+ 1 => format!("a: {}, n: {}", in_t[0], in_t[0]),
+ 2 => format!("a: {}, b: {}, n: {}", in_t[0], in_t[1], in_t[1]),
+ 3 => format!(
+ "a: {}, b: {}, c: {}, n: {}",
+ in_t[0], in_t[1], in_t[2], in_t[1]
+ ),
+ _ => unimplemented!("unknown para_num"),
+ },
+ format!(" -> {}", out_t),
+ )
+ } else {
+ (
+ match para_num {
+ 1 => format!("a: {}, n: i32", in_t[0]),
+ 2 => format!("a: {}, b: {}, n: i32", in_t[0], in_t[1]),
+ 3 => format!("a: {}, b: {}, c: {}, n: i32", in_t[0], in_t[1], in_t[2]),
+ _ => unimplemented!("unknown para_num"),
+ },
+ format!(" -> {}", out_t),
+ )
+ }
+ } else if out_t != link_aarch64_t[3] {
+ (
+ match para_num {
+ 1 => format!("a: {}", link_aarch64_t[0]),
+ 2 => format!("a: {}, b: {}", link_aarch64_t[0], link_aarch64_t[1]),
+ 3 => format!(
+ "a: {}, b: {}, c: {}",
+ link_aarch64_t[0], link_aarch64_t[1], link_aarch64_t[2]
+ ),
+ _ => unimplemented!("unknown para_num"),
+ },
+ format!(" -> {}", link_aarch64_t[3]),
+ )
+ } else if matches!(fn_type, Fntype::Store) {
+ let sub_type = type_to_sub_type(in_t[1]);
+ let mut inputs = match type_sub_len(in_t[1]) {
+ 1 => format!("a: {}", sub_type,),
+ 2 => format!("a: {}, b: {}", sub_type, sub_type,),
+ 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,),
+ 4 => format!(
+ "a: {}, b: {}, c: {}, d: {}",
+ sub_type, sub_type, sub_type, sub_type,
+ ),
+ _ => panic!("unknown type: {}", in_t[1]),
+ };
+ let ptr_type = if is_vstx(&name) {
+ "i8".to_string()
+ } else {
+ type_to_native_type(in_t[1])
+ };
+ inputs.push_str(&format!(", ptr: *mut {}", ptr_type));
+ (inputs, String::new())
+ } else if is_vldx(&name) {
+ let ptr_type = if name.contains("dup") {
+ type_to_native_type(out_t)
+ } else {
+ type_to_sub_type(out_t)
+ };
+ (
+ format!("ptr: *const {}", ptr_type),
+ format!(" -> {}", out_t),
+ )
+ } else {
+ (String::new(), String::new())
+ }
+ };
+ ext_c_aarch64.push_str(&format!(
+ r#"#[allow(improper_ctypes)]
+ extern "unadjusted" {{
+ #[cfg_attr(target_arch = "aarch64", link_name = "{}")]
+ fn {}({}){};
+ }}
+"#,
+ link_aarch64, current_fn, aarch64_ext_inputs, aarch64_ext_output,
+ ));
+ };
+ let const_declare = if let Some(constn) = constn {
+ format!(r#"<const {}: i32>"#, constn)
+ } else {
+ String::new()
+ };
+ let multi_calls = if !multi_fn.is_empty() {
+ let mut calls = String::new();
+ for i in 0..multi_fn.len() {
+ if i > 0 {
+ calls.push_str("\n ");
+ }
+ calls.push_str(&get_call(
+ &multi_fn[i],
+ current_name,
+ &const_declare,
+ in_t,
+ out_t,
+ fixed,
+ None,
+ false,
+ ));
+ }
+ calls
+ } else {
+ String::new()
+ };
+ let const_assert = if let Some(constn) = constn {
+ format!(
+ r#", {} = {}"#,
+ constn,
+ map_val(in_t[1], current_tests[0].3.as_ref().unwrap())
+ )
+ } else {
+ String::new()
+ };
+ let const_legacy = if constn.is_some() {
+ format!("\n#[rustc_legacy_const_generics({})]", para_num)
+ } else {
+ String::new()
+ };
+ let fn_decl = {
+ let fn_output = if out_t == "void" {
+ String::new()
+ } else {
+ format!("-> {} ", out_t)
+ };
+ let fn_inputs = match para_num {
+ 1 => format!("(a: {})", in_t[0]),
+ 2 => format!("(a: {}, b: {})", in_t[0], in_t[1]),
+ 3 => format!("(a: {}, b: {}, c: {})", in_t[0], in_t[1], in_t[2]),
+ _ => panic!("unsupported parameter number"),
+ };
+ format!(
+ "pub unsafe fn {}{}{} {}",
+ name, const_declare, fn_inputs, fn_output
+ )
+ };
+ let function = if separate {
+ let call_arm = {
+ let arm_params = if let (Some(const_arm), Some(_)) = (const_arm, link_arm) {
+ if !matches!(fn_type, Fntype::Normal) {
+ let subs = match type_sub_len(in_t[1]) {
+ 1 => "b",
+ 2 => "b.0, b.1",
+ 3 => "b.0, b.1, b.2",
+ 4 => "b.0, b.1, b.2, b.3",
+ _ => "",
+ };
+ format!(
+ "{}(a as _, {}, {}, {})",
+ current_fn,
+ subs,
+ constn.as_deref().unwrap(),
+ type_bits(&type_to_sub_type(in_t[1])) / 8,
+ )
+ } else {
+ let cnt = if const_arm.contains(':') {
+ let consts: Vec<_> =
+ const_arm.split(':').map(|v| v.trim().to_string()).collect();
+ consts[0].clone()
+ } else {
+ let const_arm = const_arm.replace("ttn", &type_to_native_type(in_t[1]));
+ let mut cnt = String::from(in_t[1]);
+ cnt.push_str("(");
+ for i in 0..type_len(in_t[1]) {
+ if i != 0 {
+ cnt.push_str(", ");
+ }
+ cnt.push_str(&const_arm);
+ }
+ cnt.push_str(")");
+ cnt
+ };
+ match para_num {
+ 1 => format!("{}(a, {})", current_fn, cnt),
+ 2 => format!("{}(a, b, {})", current_fn, cnt),
+ _ => String::new(),
+ }
+ }
+ } else if out_t != link_arm_t[3] {
+ match para_num {
+ 1 => format!("transmute({}(a))", current_fn,),
+ 2 => format!("transmute({}(transmute(a), transmute(b)))", current_fn,),
+ _ => String::new(),
+ }
+ } else if matches!(fn_type, Fntype::Store) {
+ let (cast, size) = if is_vstx(&name) {
+ (
+ " as _",
+ format!(", {}", type_bits(&type_to_sub_type(in_t[1])) / 8),
+ )
+ } else {
+ ("", String::new())
+ };
+ match type_sub_len(in_t[1]) {
+ 1 => format!("{}(a{}, b{})", current_fn, cast, size),
+ 2 => format!("{}(a{}, b.0, b.1{})", current_fn, cast, size),
+ 3 => format!("{}(a{}, b.0, b.1, b.2{})", current_fn, cast, size),
+ 4 => format!("{}(a{}, b.0, b.1, b.2, b.3{})", current_fn, cast, size),
+ _ => String::new(),
+ }
+ } else if link_arm.is_some() && is_vldx(&name) {
+ format!(
+ "{}(a as *const i8, {})",
+ current_fn,
+ type_bits(&type_to_sub_type(out_t)) / 8
+ )
+ } else {
+ String::new()
+ };
+ format!(
+ r#"{}{{
+ {}{}{}
+}}"#,
+ fn_decl, multi_calls, ext_c_arm, arm_params
+ )
+ };
+ let call_aarch64 = {
+ let aarch64_params =
+ if let (Some(const_aarch64), Some(_)) = (const_aarch64, link_aarch64) {
+ if !matches!(fn_type, Fntype::Normal) {
+ let subs = match type_sub_len(in_t[1]) {
+ 1 => "b",
+ 2 => "b.0, b.1",
+ 3 => "b.0, b.1, b.2",
+ 4 => "b.0, b.1, b.2, b.3",
+ _ => "",
+ };
+ format!(
+ "{}({}, {} as i64, a as _)",
+ current_fn,
+ subs,
+ constn.as_deref().unwrap()
+ )
+ } else if const_aarch64.contains("dup-in_len-N as ttn") {
+ let const_aarch64 = format!("N as {}", type_to_native_type(in_t[1]));
+ let mut cnt = String::from(in_t[1]);
+ cnt.push_str("(");
+ for i in 0..type_len(in_t[1]) {
+ if i != 0 {
+ cnt.push_str(", ");
+ }
+ cnt.push_str(&const_aarch64);
+ }
+ cnt.push_str(")");
+ format!("{}(a, {})", current_fn, cnt)
+ } else {
+ match para_num {
+ 1 => format!("{}(a, {})", current_fn, const_aarch64),
+ 2 => format!("{}(a, b, {})", current_fn, const_aarch64),
+ _ => String::new(),
+ }
+ }
+ } else if out_t != link_aarch64_t[3] {
+ match para_num {
+ 1 => format!("transmute({}(a))", current_fn,),
+ 2 => format!("transmute({}(a, b))", current_fn,),
+ _ => String::new(),
+ }
+ } else if matches!(fn_type, Fntype::Store) {
+ let cast = if is_vstx(&name) { " as _" } else { "" };
+ match type_sub_len(in_t[1]) {
+ 1 => format!("{}(b, a{})", current_fn, cast),
+ 2 => format!("{}(b.0, b.1, a{})", current_fn, cast),
+ 3 => format!("{}(b.0, b.1, b.2, a{})", current_fn, cast),
+ 4 => format!("{}(b.0, b.1, b.2, b.3, a{})", current_fn, cast),
+ _ => String::new(),
+ }
+ } else if link_aarch64.is_some() && is_vldx(&name) {
+ format!("{}(a as _)", current_fn)
+ } else {
+ String::new()
+ };
+ format!(
+ r#"{}{{
+ {}{}{}
+}}"#,
+ fn_decl, multi_calls, ext_c_aarch64, aarch64_params
+ )
+ };
+ let stable_aarch64 = match target {
+ Default | ArmV7 | Vfp4 | FPArmV8 | AES => {
+ String::from("\n#[stable(feature = \"neon_intrinsics\", since = \"1.59.0\")]")
+ }
+ RDM => String::from("\n#[stable(feature = \"rdm_intrinsics\", since = \"1.62.0\")]"),
+ _ => String::new(),
+ };
+ format!(
+ r#"
+{}
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,{}")]
+#[cfg_attr(test, assert_instr({}{}))]{}
+{}
+
+{}
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "{}")]
+#[cfg_attr(test, assert_instr({}{}))]{}{}
+{}
+"#,
+ current_comment,
+ current_target_arm,
+ expand_intrinsic(&current_arm, in_t[1]),
+ const_assert,
+ const_legacy,
+ call_arm,
+ current_comment,
+ current_target_aarch64,
+ expand_intrinsic(&current_aarch64, in_t[1]),
+ const_assert,
+ const_legacy,
+ stable_aarch64,
+ call_aarch64,
+ )
+ } else {
+ let call = {
+ let stmts = match (multi_calls.len(), para_num, fixed.len()) {
+ (0, 1, 0) => format!(r#"{}{}(a)"#, ext_c, current_fn,),
+ (0, 1, _) => {
+ let fixed: Vec<String> =
+ fixed.iter().take(type_len(in_t[0])).cloned().collect();
+ format!(
+ r#"let b{};
+ {}{}(a, transmute(b))"#,
+ values(in_t[0], &fixed),
+ ext_c,
+ current_fn,
+ )
+ }
+ (0, 2, _) => format!(r#"{}{}(a, b)"#, ext_c, current_fn,),
+ (0, 3, _) => format!(r#"{}{}(a, b, c)"#, ext_c, current_fn,),
+ (_, 1, _) => format!(r#"{}{}"#, ext_c, multi_calls,),
+ (_, 2, _) => format!(r#"{}{}"#, ext_c, multi_calls,),
+ (_, 3, _) => format!(r#"{}{}"#, ext_c, multi_calls,),
+ (_, _, _) => String::new(),
+ };
+ if !stmts.is_empty() {
+ format!(
+ r#"{}{{
+ {}
+}}"#,
+ fn_decl, stmts
+ )
+ } else {
+ String::new()
+ }
+ };
+ let stable_aarch64 = match target {
+ Default | ArmV7 | Vfp4 | FPArmV8 | AES => String::from("\n#[cfg_attr(target_arch = \"aarch64\", stable(feature = \"neon_intrinsics\", since = \"1.59.0\"))]"),
+ RDM => String::from("\n#[cfg_attr(target_arch = \"aarch64\", stable(feature = \"rdm_intrinsics\", since = \"1.62.0\"))]"),
+ _ => String::new(),
+ };
+ format!(
+ r#"
+{}
+#[inline]
+#[target_feature(enable = "{}")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "{}"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr({}{}))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr({}{}))]{}{}
+{}
+"#,
+ current_comment,
+ current_target_aarch64,
+ current_target_arm,
+ expand_intrinsic(&current_arm, in_t[1]),
+ const_assert,
+ expand_intrinsic(&current_aarch64, in_t[1]),
+ const_assert,
+ const_legacy,
+ stable_aarch64,
+ call,
+ )
+ };
+ let test_target = match target {
+ I8MM => "neon,i8mm",
+ SM4 => "neon,sm4",
+ SHA3 => "neon,sha3",
+ FTTS => "neon,frintts",
+ _ => "neon",
+ };
+ let test = match fn_type {
+ Fntype::Normal => gen_test(
+ &name,
+ in_t,
+ &out_t,
+ current_tests,
+ [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
+ type_len(out_t),
+ para_num,
+ test_target,
+ ),
+ Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)),
+ Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])),
+ };
+ (function, test)
+}
+
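+/// Expands a trailing type placeholder in an assert_instr mnemonic:
+/// "xxx." takes the LLVM element type ("xxx.i8"), "xxx.s" the NEON
+/// element name ("xxx.u8"), and "xxx.l" the element width ("xxx.8").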
+fn expand_intrinsic(intr: &str, t: &str) -> String {
+ if intr.ends_with('.') {
+ let ext = match t {
+ "int8x8_t" => "i8",
+ "int8x16_t" => "i8",
+ "int16x4_t" => "i16",
+ "int16x8_t" => "i16",
+ "int32x2_t" => "i32",
+ "int32x4_t" => "i32",
+ "int64x1_t" => "i64",
+ "int64x2_t" => "i64",
+ "uint8x8_t" => "i8",
+ "uint8x16_t" => "i8",
+ "uint16x4_t" => "i16",
+ "uint16x8_t" => "i16",
+ "uint32x2_t" => "i32",
+ "uint32x4_t" => "i32",
+ "uint64x1_t" => "i64",
+ "uint64x2_t" => "i64",
+ "float16x4_t" => "f16",
+ "float16x8_t" => "f16",
+ "float32x2_t" => "f32",
+ "float32x4_t" => "f32",
+ "float64x1_t" => "f64",
+ "float64x2_t" => "f64",
+ "poly8x8_t" => "i8",
+ "poly8x16_t" => "i8",
+ "poly16x4_t" => "i16",
+ "poly16x8_t" => "i16",
+ /*
+ "poly64x1_t" => "i64x1",
+ "poly64x2_t" => "i64x2",
+ */
+ _ => panic!("unknown type for extension: {}", t),
+ };
+ format!(r#""{}{}""#, intr, ext)
+ } else if intr.ends_with(".s") {
+ let ext = match t {
+ "int8x8_t" => "s8",
+ "int8x16_t" => "s8",
+ "int16x4_t" => "s16",
+ "int16x8_t" => "s16",
+ "int32x2_t" => "s32",
+ "int32x4_t" => "s32",
+ "int64x1_t" => "s64",
+ "int64x2_t" => "s64",
+ "uint8x8_t" => "u8",
+ "uint8x16_t" => "u8",
+ "uint16x4_t" => "u16",
+ "uint16x8_t" => "u16",
+ "uint32x2_t" => "u32",
+ "uint32x4_t" => "u32",
+ "uint64x1_t" => "u64",
+ "uint64x2_t" => "u64",
+ "poly8x8_t" => "p8",
+ "poly8x16_t" => "p8",
+ "poly16x4_t" => "p16",
+ "poly16x8_t" => "p16",
+ "float16x4_t" => "f16",
+ "float16x8_t" => "f16",
+ "float32x2_t" => "f32",
+ "float32x4_t" => "f32",
+ "float64x1_t" => "f64",
+ "float64x2_t" => "f64",
+ /*
+ "poly64x1_t" => "i64x1",
+ "poly64x2_t" => "i64x2",
+ */
+ _ => panic!("unknown type for extension: {}", t),
+ };
+ format!(r#""{}{}""#, &intr[..intr.len() - 1], ext)
+ } else if intr.ends_with(".l") {
+ let ext = match t {
+ "int8x8_t" => "8",
+ "int8x16_t" => "8",
+ "int16x4_t" => "16",
+ "int16x8_t" => "16",
+ "int32x2_t" => "32",
+ "int32x4_t" => "32",
+ "int64x1_t" => "64",
+ "int64x2_t" => "64",
+ "uint8x8_t" => "8",
+ "uint8x16_t" => "8",
+ "uint16x4_t" => "16",
+ "uint16x8_t" => "16",
+ "uint32x2_t" => "32",
+ "uint32x4_t" => "32",
+ "uint64x1_t" => "64",
+ "uint64x2_t" => "64",
+ "poly8x8_t" => "8",
+ "poly8x16_t" => "8",
+ "poly16x4_t" => "16",
+ "poly16x8_t" => "16",
+ "float16x4_t" => "16",
+ "float16x8_t" => "16",
+ "float32x2_t" => "32",
+ "float32x4_t" => "32",
+ "float64x1_t" => "64",
+ "float64x2_t" => "64",
+ "poly64x1_t" => "64",
+ "poly64x2_t" => "64",
+ _ => panic!("unknown type for extension: {}", t),
+ };
+ format!(r#""{}{}""#, &intr[..intr.len() - 1], ext)
+ } else {
+ intr.to_string()
+ }
+}
+
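+/// Recursively expands one statement of the `multi_fn` mini-language from
+/// `neon.spec` into Rust source. The first comma-separated token either
+/// selects a helper (the `dup-`, `asc-`, `static_assert`, `matchn-`, and
+/// similar prefixes handled below) or names a function to call; `{ }` groups
+/// are expanded recursively, and a `name:type` token binds the call result to
+/// a local. For example (illustrative), the statement `simd_add, a, b`
+/// expands to `simd_add(a, b)`.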
+fn get_call(
+ in_str: &str,
+ current_name: &str,
+ const_declare: &str,
+ in_t: &[&str; 3],
+ out_t: &str,
+ fixed: &Vec<String>,
+ n: Option<i32>,
+ aarch64: bool,
+) -> String {
+ let params: Vec<_> = in_str.split(',').map(|v| v.trim().to_string()).collect();
+    assert!(!params.is_empty());
+ let mut fn_name = params[0].clone();
+ if fn_name == "a" {
+ return String::from("a");
+ }
+ if fn_name == "transpose-1-in_len" {
+ return transpose1(type_len(in_t[1])).to_string();
+ }
+ if fn_name == "transpose-2-in_len" {
+ return transpose2(type_len(in_t[1])).to_string();
+ }
+ if fn_name == "zip-1-in_len" {
+ return zip1(type_len(in_t[1])).to_string();
+ }
+ if fn_name == "zip-2-in_len" {
+ return zip2(type_len(in_t[1])).to_string();
+ }
+ if fn_name == "unzip-1-in_len" {
+ return unzip1(type_len(in_t[1])).to_string();
+ }
+ if fn_name == "unzip-2-in_len" {
+ return unzip2(type_len(in_t[1])).to_string();
+ }
+ if fn_name.starts_with("dup") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ let len = match &*fn_format[1] {
+ "out_len" => type_len(out_t),
+ "in_len" => type_len(in_t[1]),
+ "in0_len" => type_len(in_t[0]),
+ "halflen" => type_len(in_t[1]) / 2,
+ _ => 0,
+ };
+ let mut s = format!("{} [", const_declare);
+ for i in 0..len {
+ if i != 0 {
+ s.push_str(", ");
+ }
+ s.push_str(&fn_format[2]);
+ }
+ s.push_str("]");
+ return s;
+ }
+ if fn_name.starts_with("asc") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ let start = match &*fn_format[1] {
+ "0" => 0,
+ "n" => n.unwrap(),
+ "out_len" => type_len(out_t) as i32,
+ "halflen" => (type_len(in_t[1]) / 2) as i32,
+ s => s.parse::<i32>().unwrap(),
+ };
+ let len = match &*fn_format[2] {
+ "out_len" => type_len(out_t),
+ "in_len" => type_len(in_t[1]),
+ "in0_len" => type_len(in_t[0]),
+ "halflen" => type_len(in_t[1]) / 2,
+ _ => 0,
+ };
+ return asc(start, len);
+ }
+ if fn_name.starts_with("base") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ assert_eq!(fn_format.len(), 3);
+ let mut s = format!("<const {}: i32> [", &fn_format[2]);
+ let base_len = fn_format[1].parse::<usize>().unwrap();
+ for i in 0..type_len(in_t[1]) / base_len {
+ for j in 0..base_len {
+ if i != 0 || j != 0 {
+ s.push_str(", ");
+ }
+ s.push_str(&format!("{} * {} as u32", base_len, &fn_format[2]));
+ if j != 0 {
+ s.push_str(&format!(" + {}", j));
+ }
+ }
+ }
+ s.push_str("]");
+ return s;
+ }
+ if fn_name.starts_with("as") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ assert_eq!(fn_format.len(), 3);
+ let t = match &*fn_format[2] {
+ "in_ttn" => type_to_native_type(in_t[1]),
+ _ => String::new(),
+ };
+ return format!("{} as {}", &fn_format[1], t);
+ }
+ if fn_name.starts_with("ins") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ let n = n.unwrap();
+ let len = match &*fn_format[1] {
+ "out_len" => type_len(out_t),
+ "in_len" => type_len(in_t[1]),
+ "in0_len" => type_len(in_t[0]),
+ _ => 0,
+ };
+ let offset = match &*fn_format[2] {
+ "out_len" => type_len(out_t),
+ "in_len" => type_len(in_t[1]),
+ "in0_len" => type_len(in_t[0]),
+ _ => 0,
+ };
+ let mut s = format!("{} [", const_declare);
+ for i in 0..len {
+ if i != 0 {
+ s.push_str(", ");
+ }
+ if i == n as usize {
+                s.push_str(&format!("{} + {} as u32", offset, fn_format[3]));
+ } else {
+ s.push_str(&i.to_string());
+ }
+ }
+ s.push_str("]");
+ return s;
+ }
+ if fn_name.starts_with("static_assert_imm") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ let len = match &*fn_format[1] {
+ "out_exp_len" => type_exp_len(out_t, 1),
+ "out_bits_exp_len" => type_bits_exp_len(out_t),
+ "in_exp_len" => type_exp_len(in_t[1], 1),
+ "in_bits_exp_len" => type_bits_exp_len(in_t[1]),
+ "in0_exp_len" => type_exp_len(in_t[0], 1),
+ "in1_exp_len" => type_exp_len(in_t[1], 1),
+ "in2_exp_len" => type_exp_len(in_t[2], 1),
+ "in2_rot" => type_exp_len(in_t[2], 2),
+ "in2_dot" => type_exp_len(in_t[2], 4),
+ _ => 0,
+ };
+ if len == 0 {
+ return format!(
+ r#"static_assert!({} : i32 where {} == 0);"#,
+ fn_format[2], fn_format[2]
+ );
+ } else {
+ return format!(r#"static_assert_imm{}!({});"#, len, fn_format[2]);
+ }
+ }
+ if fn_name.starts_with("static_assert") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ let lim1 = if fn_format[2] == "bits" {
+ type_bits(in_t[1]).to_string()
+ } else if fn_format[2] == "halfbits" {
+ (type_bits(in_t[1]) / 2).to_string()
+ } else {
+ fn_format[2].clone()
+ };
+ let lim2 = if fn_format[3] == "bits" {
+ type_bits(in_t[1]).to_string()
+ } else if fn_format[3] == "halfbits" {
+ (type_bits(in_t[1]) / 2).to_string()
+ } else {
+ fn_format[3].clone()
+ };
+ if lim1 == lim2 {
+ return format!(
+ r#"static_assert!({} : i32 where {} == {});"#,
+ fn_format[1], fn_format[1], lim1
+ );
+ } else {
+ return format!(
+ r#"static_assert!({} : i32 where {} >= {} && {} <= {});"#,
+ fn_format[1], fn_format[1], lim1, fn_format[1], lim2
+ );
+ }
+ }
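+    // `fix_right_shift_imm-N-bits` (an illustrative spelling) special-cases a
+    // right shift by the full bit width; e.g. on `int16x4_t` it expands to
+    // `let n: i32 = if N == 16 { 15 } else { N };`.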
+ if fn_name.starts_with("fix_right_shift_imm") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ let lim = if fn_format[2] == "bits" {
+ type_bits(in_t[1]).to_string()
+ } else {
+ fn_format[2].clone()
+ };
+ let fixed = if in_t[1].starts_with('u') {
+ format!("return vdup{nself}(0);", nself = type_to_n_suffix(in_t[1]))
+ } else {
+ (lim.parse::<i32>().unwrap() - 1).to_string()
+ };
+
+ return format!(
+ r#"let {name}: i32 = if {const_name} == {upper} {{ {fixed} }} else {{ N }};"#,
+ name = fn_format[1].to_lowercase(),
+ const_name = fn_format[1],
+ upper = lim,
+ fixed = fixed,
+ );
+ }
+
+ if fn_name.starts_with("matchn") {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ let len = match &*fn_format[1] {
+ "out_exp_len" => type_exp_len(out_t, 1),
+ "in_exp_len" => type_exp_len(in_t[1], 1),
+ "in0_exp_len" => type_exp_len(in_t[0], 1),
+ _ => 0,
+ };
+ let mut call = format!("match {} & 0b{} {{\n", &fn_format[2], "1".repeat(len));
+ let mut sub_call = String::new();
+ for p in 1..params.len() {
+ if !sub_call.is_empty() {
+ sub_call.push_str(", ");
+ }
+ sub_call.push_str(&params[p]);
+ }
+ for i in 0..(2u32.pow(len as u32) as usize) {
+ let sub_match = format!(
+ " {} => {},\n",
+ i,
+ get_call(
+ &sub_call,
+ current_name,
+ const_declare,
+ in_t,
+ out_t,
+ fixed,
+ Some(i as i32),
+ aarch64
+ )
+ );
+ call.push_str(&sub_match);
+ }
+ call.push_str(" _ => unreachable_unchecked(),\n }");
+ return call;
+ }
+ let mut re: Option<(String, String)> = None;
+ let mut param_str = String::new();
+ let mut i = 1;
+ while i < params.len() {
+ let s = &params[i];
+ if s.starts_with('{') {
+ let mut sub_fn = String::new();
+ let mut parentheses = 0;
+ while i < params.len() {
+ if !sub_fn.is_empty() {
+ sub_fn.push_str(", ");
+ }
+ sub_fn.push_str(&params[i]);
+ let l = params[i].len();
+ for j in 0..l {
+ if &params[i][j..j + 1] == "{" {
+ parentheses += 1;
+ } else {
+ break;
+ }
+ }
+ for j in 0..l {
+ if &params[i][l - j - 1..l - j] == "}" {
+ parentheses -= 1;
+ } else {
+ break;
+ }
+ }
+ if parentheses == 0 {
+ break;
+ }
+ i += 1;
+ }
+ let sub_call = get_call(
+ &sub_fn[1..sub_fn.len() - 1],
+ current_name,
+ const_declare,
+ in_t,
+ out_t,
+ fixed,
+ n.clone(),
+ aarch64,
+ );
+ if !param_str.is_empty() {
+ param_str.push_str(", ");
+ }
+ param_str.push_str(&sub_call);
+ } else if s.contains(':') {
+ let re_params: Vec<_> = s.split(':').map(|v| v.to_string()).collect();
+ if re_params[1] == "" {
+ re = Some((re_params[0].clone(), in_t[1].to_string()));
+ } else if re_params[1] == "in_t" {
+ re = Some((re_params[0].clone(), in_t[1].to_string()));
+ } else if re_params[1] == "signed" {
+ re = Some((re_params[0].clone(), type_to_signed(in_t[1])));
+ } else if re_params[1] == "unsigned" {
+ re = Some((re_params[0].clone(), type_to_unsigned(in_t[1])));
+ } else if re_params[1] == "in_t0" {
+ re = Some((re_params[0].clone(), in_t[0].to_string()));
+ } else if re_params[1] == "in_t1" {
+ re = Some((re_params[0].clone(), in_t[1].to_string()));
+ } else if re_params[1] == "out_t" {
+ re = Some((re_params[0].clone(), out_t.to_string()));
+ } else if re_params[1] == "half" {
+ re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string()));
+ } else if re_params[1] == "in_ntt" {
+ re = Some((
+ re_params[0].clone(),
+ native_type_to_type(in_t[1]).to_string(),
+ ));
+ } else if re_params[1] == "in_long_ntt" {
+ re = Some((
+ re_params[0].clone(),
+ native_type_to_long_type(in_t[1]).to_string(),
+ ));
+ } else if re_params[1] == "out_ntt" {
+ re = Some((re_params[0].clone(), native_type_to_type(out_t).to_string()));
+ } else if re_params[1] == "out_long_ntt" {
+ re = Some((
+ re_params[0].clone(),
+ native_type_to_long_type(out_t).to_string(),
+ ));
+ } else {
+ re = Some((re_params[0].clone(), re_params[1].clone()));
+ }
+ } else {
+ if !param_str.is_empty() {
+ param_str.push_str(", ");
+ }
+ param_str.push_str(s);
+ }
+ i += 1;
+ }
+ if fn_name == "fixed" {
+ let (re_name, re_type) = re.unwrap();
+ let fixed: Vec<String> = fixed.iter().take(type_len(in_t[1])).cloned().collect();
+ return format!(r#"let {}{};"#, re_name, values(&re_type, &fixed));
+ }
+ if fn_name == "fixed-half-right" {
+ let fixed: Vec<String> = fixed.iter().take(type_len(in_t[1])).cloned().collect();
+ let half = fixed[type_len(in_t[1]) / 2..]
+ .iter()
+ .fold(String::new(), |mut s, fix| {
+ s.push_str(fix);
+ s.push_str(", ");
+ s
+ });
+ return format!(r#"[{}]"#, &half[..half.len() - 2]);
+ }
+ if fn_name == "a - b" {
+ return fn_name;
+ }
+ if fn_name == "-a" {
+ return fn_name;
+ }
+ if fn_name.contains('-') {
+ let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
+ assert_eq!(fn_format.len(), 3);
+ fn_name = if fn_format[0] == "self" {
+ current_name.to_string()
+ } else {
+ fn_format[0].clone()
+ };
+ if fn_format[1] == "self" {
+ fn_name.push_str(type_to_suffix(in_t[1]));
+ } else if fn_format[1] == "nself" {
+ fn_name.push_str(type_to_n_suffix(in_t[1]));
+ } else if fn_format[1] == "nselfvfp4" {
+ fn_name.push_str(type_to_n_suffix(in_t[1]));
+ if !aarch64 {
+ fn_name.push_str("_vfp4");
+ }
+ } else if fn_format[1] == "out" {
+ fn_name.push_str(type_to_suffix(out_t));
+ } else if fn_format[1] == "in0" {
+ fn_name.push_str(type_to_suffix(in_t[0]));
+ } else if fn_format[1] == "in2" {
+ fn_name.push_str(type_to_suffix(in_t[2]));
+ } else if fn_format[1] == "in2lane" {
+ fn_name.push_str(&type_to_lane_suffixes(out_t, in_t[2], false));
+ } else if fn_format[1] == "outlane" {
+ fn_name.push_str(&type_to_lane_suffixes(out_t, in_t[2], true));
+ } else if fn_format[1] == "signed" {
+ fn_name.push_str(type_to_suffix(&type_to_signed(&String::from(in_t[1]))));
+ } else if fn_format[1] == "outsigned" {
+ fn_name.push_str(type_to_suffix(&type_to_signed(&String::from(out_t))));
+ } else if fn_format[1] == "outsignednox" {
+ fn_name.push_str(&type_to_suffix(&type_to_sub_type(&type_to_signed(
+ &String::from(out_t),
+ ))));
+ } else if fn_format[1] == "in1signednox" {
+ fn_name.push_str(&type_to_suffix(&type_to_sub_type(&type_to_signed(
+ &String::from(in_t[1]),
+ ))));
+ } else if fn_format[1] == "outsigneddupnox" {
+ fn_name.push_str(&type_to_dup_suffix(&type_to_sub_type(&type_to_signed(
+ &String::from(out_t),
+ ))));
+ } else if fn_format[1] == "outsignedlanenox" {
+ fn_name.push_str(&type_to_lane_suffix(&type_to_sub_type(&type_to_signed(
+ &String::from(out_t),
+ ))));
+ } else if fn_format[1] == "in1signedlanenox" {
+ fn_name.push_str(&type_to_lane_suffix(&type_to_sub_type(&type_to_signed(
+ &String::from(in_t[1]),
+ ))));
+ } else if fn_format[1] == "unsigned" {
+ fn_name.push_str(type_to_suffix(&type_to_unsigned(in_t[1])));
+ } else if fn_format[1] == "doubleself" {
+ fn_name.push_str(&type_to_double_suffixes(out_t, in_t[1]));
+ } else if fn_format[1] == "noq_doubleself" {
+ fn_name.push_str(&type_to_noq_double_suffixes(out_t, in_t[1]));
+ } else if fn_format[1] == "noqself" {
+ fn_name.push_str(type_to_noq_suffix(in_t[1]));
+ } else if fn_format[1] == "noqsigned" {
+ fn_name.push_str(type_to_noq_suffix(&type_to_signed(&String::from(in_t[1]))));
+ } else if fn_format[1] == "nosuffix" {
+ } else if fn_format[1] == "in_len" {
+ fn_name.push_str(&type_len(in_t[1]).to_string());
+ } else if fn_format[1] == "in0_len" {
+ fn_name.push_str(&type_len(in_t[0]).to_string());
+ } else if fn_format[1] == "out_len" {
+ fn_name.push_str(&type_len(out_t).to_string());
+ } else if fn_format[1] == "halflen" {
+ fn_name.push_str(&(type_len(in_t[1]) / 2).to_string());
+ } else if fn_format[1] == "nout" {
+ fn_name.push_str(type_to_n_suffix(out_t));
+ } else if fn_format[1] == "nin0" {
+ fn_name.push_str(type_to_n_suffix(in_t[0]));
+ } else if fn_format[1] == "nsigned" {
+ fn_name.push_str(type_to_n_suffix(&type_to_signed(&String::from(in_t[1]))));
+ } else if fn_format[1] == "in_ntt" {
+ fn_name.push_str(type_to_suffix(native_type_to_type(in_t[1])));
+ } else if fn_format[1] == "out_ntt" {
+ fn_name.push_str(type_to_suffix(native_type_to_type(out_t)));
+ } else if fn_format[1] == "rot" {
+ fn_name = type_to_rot_suffix(&fn_name, type_to_suffix(out_t));
+ } else {
+ fn_name.push_str(&fn_format[1]);
+ };
+ if fn_format[2] == "ext" {
+ fn_name.push_str("_");
+ } else if fn_format[2] == "noext" {
+ } else if fn_format[2].starts_with("<") {
+ assert!(fn_format[2].ends_with(">"));
+ let types: Vec<_> = fn_format[2][1..fn_format[2].len() - 1]
+ .split(' ')
+ .map(|v| v.to_string())
+ .collect();
+ assert_eq!(types.len(), 2);
+ let type1 = if types[0] == "element_t" {
+ type_to_native_type(in_t[1])
+ } else {
+ String::from(&types[0])
+ };
+ let type2 = if types[1] == "element_t" {
+ type_to_native_type(in_t[1])
+ } else {
+ String::from(&types[1])
+ };
+ fn_name.push_str(&format!("::<{}, {}>", &type1, &type2));
+ } else {
+ fn_name.push_str(&fn_format[2]);
+ }
+ }
+ if param_str.is_empty() {
+ return fn_name.replace("out_t", out_t);
+ }
+ let fn_str = if let Some((re_name, re_type)) = re.clone() {
+ format!(
+ r#"let {}: {} = {}({});"#,
+ re_name, re_type, fn_name, param_str
+ )
+ } else if fn_name.starts_with("*") {
+ format!(r#"{} = {};"#, fn_name, param_str)
+ } else {
+ format!(r#"{}({})"#, fn_name, param_str)
+ };
+ return fn_str;
+}
+
+fn main() -> io::Result<()> {
+ let args: Vec<String> = env::args().collect();
+ let in_file = args.get(1).cloned().unwrap_or_else(|| IN.to_string());
+
+ let f = File::open(in_file).expect("Failed to open neon.spec");
+ let f = BufReader::new(f);
+
+ let mut current_comment = String::new();
+ let mut current_name: Option<String> = None;
+ let mut current_fn: Option<String> = None;
+ let mut current_arm: Option<String> = None;
+ let mut current_aarch64: Option<String> = None;
+ let mut link_arm: Option<String> = None;
+ let mut link_aarch64: Option<String> = None;
+ let mut const_arm: Option<String> = None;
+ let mut const_aarch64: Option<String> = None;
+ let mut constn: Option<String> = None;
+ let mut para_num = 2;
+ let mut suffix: Suffix = Normal;
+ let mut a: Vec<String> = Vec::new();
+ let mut b: Vec<String> = Vec::new();
+ let mut c: Vec<String> = Vec::new();
+ let mut n: Option<String> = None;
+ let mut fixed: Vec<String> = Vec::new();
+ let mut current_tests: Vec<(
+ Vec<String>,
+ Vec<String>,
+ Vec<String>,
+ Option<String>,
+ Vec<String>,
+ )> = Vec::new();
+ let mut multi_fn: Vec<String> = Vec::new();
+ let mut target: TargetFeature = Default;
+ let mut fn_type: Fntype = Fntype::Normal;
+ let mut separate = false;
+
+ //
+    // THIS FILE IS GENERATED FROM neon.spec. DO NOT CHANGE IT MANUALLY.
+ //
+ let mut out_arm = String::from(
+ r#"// This code is automatically generated. DO NOT MODIFY.
+//
+// Instead, modify `crates/stdarch-gen/neon.spec` and run the following command to re-generate this file:
+//
+// ```
+// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen -- crates/stdarch-gen/neon.spec
+// ```
+use super::*;
+#[cfg(test)]
+use stdarch_test::assert_instr;
+"#,
+ );
+ let mut tests_arm = String::from(
+ r#"
+#[cfg(test)]
+#[allow(overflowing_literals)]
+mod test {
+ use super::*;
+ use crate::core_arch::simd::*;
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+"#,
+ );
+ //
+    // THIS FILE IS GENERATED FROM neon.spec. DO NOT CHANGE IT MANUALLY.
+ //
+ let mut out_aarch64 = String::from(
+ r#"// This code is automatically generated. DO NOT MODIFY.
+//
+// Instead, modify `crates/stdarch-gen/neon.spec` and run the following command to re-generate this file:
+//
+// ```
+// OUT_DIR=`pwd`/crates/core_arch cargo run -p stdarch-gen -- crates/stdarch-gen/neon.spec
+// ```
+use super::*;
+#[cfg(test)]
+use stdarch_test::assert_instr;
+"#,
+ );
+ let mut tests_aarch64 = String::from(
+ r#"
+#[cfg(test)]
+mod test {
+ use super::*;
+ use crate::core_arch::simd::*;
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+"#,
+ );
+
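+    // Each spec entry is a block of `key = value` lines ending in one or more
+    // `generate` lines; an illustrative sketch (not a real entry) based on the
+    // keys parsed below:
+    //
+    //   /// Vector add
+    //   name = vadd
+    //   fn = simd_add
+    //   a = 1, 2, 3, 4
+    //   b = 4, 3, 2, 1
+    //   validate 5, 5, 5, 5
+    //   arm = vadd.
+    //   aarch64 = add
+    //   generate int*_t, uint*_t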
+ for line in f.lines() {
+ let line = line.unwrap();
+ if line.is_empty() {
+ continue;
+ }
+ if line.starts_with("/// ") {
+ current_comment = line;
+ current_name = None;
+ current_fn = None;
+ current_arm = None;
+ current_aarch64 = None;
+ link_aarch64 = None;
+ link_arm = None;
+ const_aarch64 = None;
+ const_arm = None;
+ current_tests = Vec::new();
+ constn = None;
+ para_num = 2;
+ suffix = Normal;
+ a = Vec::new();
+ b = Vec::new();
+ c = Vec::new();
+ fixed = Vec::new();
+ n = None;
+ multi_fn = Vec::new();
+ target = Default;
+ fn_type = Fntype::Normal;
+ separate = false;
+ } else if line.starts_with("//") {
+ } else if line.starts_with("name = ") {
+ current_name = Some(String::from(&line[7..]));
+ } else if line.starts_with("fn = ") {
+ current_fn = Some(String::from(&line[5..]));
+ } else if line.starts_with("multi_fn = ") {
+ multi_fn.push(String::from(&line[11..]));
+ } else if line.starts_with("constn = ") {
+ constn = Some(String::from(&line[9..]));
+ } else if line.starts_with("arm = ") {
+ current_arm = Some(String::from(&line[6..]));
+ } else if line.starts_with("aarch64 = ") {
+ current_aarch64 = Some(String::from(&line[10..]));
+ } else if line.starts_with("double-suffixes") {
+ suffix = Double;
+ } else if line.starts_with("no-q") {
+ suffix = NoQ;
+ } else if line.starts_with("noq-double-suffixes") {
+ suffix = NoQDouble;
+ } else if line.starts_with("n-suffix") {
+ suffix = NSuffix;
+ } else if line.starts_with("double-n-suffixes") {
+ suffix = DoubleN;
+ } else if line.starts_with("out-n-suffix") {
+ suffix = OutNSuffix;
+ } else if line.starts_with("noq-n-suffix") {
+ suffix = NoQNSuffix;
+ } else if line.starts_with("out-suffix") {
+ suffix = OutSuffix;
+ } else if line.starts_with("out-nox") {
+ suffix = OutNox;
+ } else if line.starts_with("in1-nox") {
+ suffix = In1Nox;
+ } else if line.starts_with("out-dup-nox") {
+ suffix = OutDupNox;
+ } else if line.starts_with("out-lane-nox") {
+ suffix = OutLaneNox;
+ } else if line.starts_with("in1-lane-nox") {
+ suffix = In1LaneNox;
+ } else if line.starts_with("lane-suffixes") {
+ suffix = Lane;
+ } else if line.starts_with("in2-suffix") {
+ suffix = In2;
+ } else if line.starts_with("in2-lane-suffixes") {
+ suffix = In2Lane;
+ } else if line.starts_with("out-lane-suffixes") {
+ suffix = OutLane;
+ } else if line.starts_with("rot-suffix") {
+ suffix = Rot;
+ } else if line.starts_with("rot-lane-suffixes") {
+ suffix = RotLane;
+ } else if line.starts_with("a = ") {
+ a = line[4..].split(',').map(|v| v.trim().to_string()).collect();
+ } else if line.starts_with("b = ") {
+ b = line[4..].split(',').map(|v| v.trim().to_string()).collect();
+ } else if line.starts_with("c = ") {
+ c = line[4..].split(',').map(|v| v.trim().to_string()).collect();
+ } else if line.starts_with("n = ") {
+ n = Some(String::from(&line[4..]));
+ } else if line.starts_with("fixed = ") {
+ fixed = line[8..].split(',').map(|v| v.trim().to_string()).collect();
+ } else if line.starts_with("validate ") {
+ let e = line[9..].split(',').map(|v| v.trim().to_string()).collect();
+ current_tests.push((a.clone(), b.clone(), c.clone(), n.clone(), e));
+ } else if line.starts_with("link-aarch64 = ") {
+ link_aarch64 = Some(String::from(&line[15..]));
+ } else if line.starts_with("const-aarch64 = ") {
+ const_aarch64 = Some(String::from(&line[16..]));
+ } else if line.starts_with("link-arm = ") {
+ link_arm = Some(String::from(&line[11..]));
+ } else if line.starts_with("const-arm = ") {
+ const_arm = Some(String::from(&line[12..]));
+ } else if line.starts_with("load_fn") {
+ fn_type = Fntype::Load;
+ } else if line.starts_with("store_fn") {
+ fn_type = Fntype::Store;
+ } else if line.starts_with("arm-aarch64-separate") {
+ separate = true;
+ } else if line.starts_with("target = ") {
+            target = match &line[9..] {
+                "v7" => ArmV7,
+                "vfp4" => Vfp4,
+                "fp-armv8" => FPArmV8,
+                "aes" => AES,
+                "fcma" => FCMA,
+                "dotprod" => Dotprod,
+                "i8mm" => I8MM,
+                "sha3" => SHA3,
+                "rdm" => RDM,
+                "sm4" => SM4,
+                "frintts" => FTTS,
+                _ => Default,
+            }
+ } else if line.starts_with("generate ") {
+ let line = &line[9..];
+ let types: Vec<String> = line
+ .split(',')
+ .map(|v| v.trim().to_string())
+ .flat_map(|v| match v.as_str() {
+ "uint*_t" => UINT_TYPES.iter().map(|v| v.to_string()).collect(),
+ "uint64x*_t" => UINT_TYPES_64.iter().map(|v| v.to_string()).collect(),
+ "int*_t" => INT_TYPES.iter().map(|v| v.to_string()).collect(),
+ "int64x*_t" => INT_TYPES_64.iter().map(|v| v.to_string()).collect(),
+ "float*_t" => FLOAT_TYPES.iter().map(|v| v.to_string()).collect(),
+ "float64x*_t" => FLOAT_TYPES_64.iter().map(|v| v.to_string()).collect(),
+ _ => vec![v],
+ })
+ .collect();
+
+ for line in types {
+ let spec: Vec<&str> = line.split(':').map(|e| e.trim()).collect();
+ let in_t: [&str; 3];
+ let out_t;
+ if spec.len() == 1 {
+ in_t = [spec[0], spec[0], spec[0]];
+ out_t = spec[0];
+ } else if spec.len() == 2 {
+ in_t = [spec[0], spec[0], spec[0]];
+ out_t = spec[1];
+ } else if spec.len() == 3 {
+ in_t = [spec[0], spec[1], spec[1]];
+ out_t = spec[2];
+ } else if spec.len() == 4 {
+ in_t = [spec[0], spec[1], spec[2]];
+ out_t = spec[3];
+ } else {
+ panic!("Bad spec: {}", line)
+ }
+ if b.len() == 0 {
+ if matches!(fn_type, Fntype::Store) {
+ para_num = 2;
+ } else {
+ para_num = 1;
+ }
+ } else if c.len() != 0 {
+ para_num = 3;
+ }
+ let current_name = current_name.clone().unwrap();
+ if let Some(current_arm) = current_arm.clone() {
+ let (function, test) = gen_arm(
+ &current_comment,
+ &current_fn,
+ &current_name,
+ &current_arm,
+ &link_arm,
+ &current_aarch64,
+ &link_aarch64,
+ &const_arm,
+ &const_aarch64,
+ &constn,
+ &in_t,
+ &out_t,
+ &current_tests,
+ suffix,
+ para_num,
+ target,
+ &fixed,
+ &multi_fn,
+ fn_type,
+ separate,
+ );
+ out_arm.push_str(&function);
+ tests_arm.push_str(&test);
+ } else {
+ let (function, test) = gen_aarch64(
+ &current_comment,
+ &current_fn,
+ &current_name,
+ &current_aarch64,
+ &link_aarch64,
+ &const_aarch64,
+ &constn,
+ &in_t,
+ &out_t,
+ &current_tests,
+ suffix,
+ para_num,
+ target,
+ &fixed,
+ &multi_fn,
+ fn_type,
+ );
+ out_aarch64.push_str(&function);
+ tests_aarch64.push_str(&test);
+ }
+ }
+ }
+ }
+ tests_arm.push('}');
+ tests_arm.push('\n');
+ tests_aarch64.push('}');
+ tests_aarch64.push('\n');
+
+ let arm_out_path: PathBuf =
+ PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string()))
+ .join("src")
+ .join("arm_shared")
+ .join("neon");
+ std::fs::create_dir_all(&arm_out_path)?;
+
+ let mut file_arm = File::create(arm_out_path.join(ARM_OUT))?;
+ file_arm.write_all(out_arm.as_bytes())?;
+ file_arm.write_all(tests_arm.as_bytes())?;
+
+ let aarch64_out_path: PathBuf =
+ PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string()))
+ .join("src")
+ .join("aarch64")
+ .join("neon");
+ std::fs::create_dir_all(&aarch64_out_path)?;
+
+ let mut file_aarch = File::create(aarch64_out_path.join(AARCH64_OUT))?;
+ file_aarch.write_all(out_aarch64.as_bytes())?;
+ file_aarch.write_all(tests_aarch64.as_bytes())?;
+ /*
+ if let Err(e) = Command::new("rustfmt")
+ .arg(&arm_out_path)
+ .arg(&aarch64_out_path)
+ .status() {
+ eprintln!("Could not format `{}`: {}", arm_out_path.to_str().unwrap(), e);
+ eprintln!("Could not format `{}`: {}", aarch64_out_path.to_str().unwrap(), e);
+ };
+ */
+ Ok(())
+}
diff --git a/library/stdarch/crates/stdarch-test/Cargo.toml b/library/stdarch/crates/stdarch-test/Cargo.toml
new file mode 100644
index 000000000..9ac1057be
--- /dev/null
+++ b/library/stdarch/crates/stdarch-test/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "stdarch-test"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+edition = "2018"
+
+[dependencies]
+assert-instr-macro = { path = "../assert-instr-macro" }
+simd-test-macro = { path = "../simd-test-macro" }
+cc = "1.0"
+lazy_static = "1.0"
+rustc-demangle = "0.1.8"
+cfg-if = "0.1"
+
+# We use a crates.io dependency to disassemble wasm binaries to look for
+# instructions for `#[assert_instr]`. Note that we use an `=` dependency here
+# instead of a floating dependency because the text format for wasm changes over
+# time, and we want to make updates to this explicit rather than automatically
+# picking up updates which might break CI with new instruction names.
+[target.'cfg(target_arch = "wasm32")'.dependencies]
+wasmprinter = "=0.2.24"
+
+[features]
+default = []
diff --git a/library/stdarch/crates/stdarch-test/src/disassembly.rs b/library/stdarch/crates/stdarch-test/src/disassembly.rs
new file mode 100644
index 000000000..3ace6b20e
--- /dev/null
+++ b/library/stdarch/crates/stdarch-test/src/disassembly.rs
@@ -0,0 +1,193 @@
+//! Disassembly support for most targets.
+
+use crate::Function;
+use std::{collections::HashSet, env, process::Command, str};
+
+// Extracts the "shim" name from the `symbol`.
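+//
+// For example (illustrative), an objdump header line such as
+// `<stdarch_test_shim_vaddq_u8::hdeadbeef>:` is trimmed, demangled, and
+// stripped of its path and hash down to `stdarch_test_shim_vaddq_u8`.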
+fn normalize(mut symbol: &str) -> String {
+ // Remove trailing colon:
+ if symbol.ends_with(':') {
+ symbol = &symbol[..symbol.len() - 1];
+ }
+ if symbol.ends_with('>') {
+ symbol = &symbol[..symbol.len() - 1];
+ }
+ if let Some(idx) = symbol.find('<') {
+ symbol = &symbol[idx + 1..];
+ }
+
+ let mut symbol = rustc_demangle::demangle(symbol).to_string();
+ symbol = match symbol.rfind("::h") {
+ Some(i) => symbol[..i].to_string(),
+ None => symbol.to_string(),
+ };
+
+ // Remove Rust paths
+ if let Some(last_colon) = symbol.rfind(':') {
+ symbol = (&symbol[last_colon + 1..]).to_string();
+ }
+
+ // Normalize to no leading underscore to handle platforms that may
+ // inject extra ones in symbol names.
+ while symbol.starts_with('_') {
+ symbol.remove(0);
+ }
+ // Windows/x86 has a suffix such as @@4.
+ if let Some(idx) = symbol.find("@@") {
+ symbol = (&symbol[..idx]).to_string();
+ }
+ symbol
+}
+
+pub(crate) fn disassemble_myself() -> HashSet<Function> {
+ let me = env::current_exe().expect("failed to get current exe");
+
+ let disassembly = if cfg!(target_os = "windows") && cfg!(target_env = "msvc") {
+ let target = if cfg!(target_arch = "x86_64") {
+ "x86_64-pc-windows-msvc"
+ } else if cfg!(target_arch = "x86") {
+ "i686-pc-windows-msvc"
+ } else {
+ panic!("disassembly unimplemented")
+ };
+ let mut cmd = cc::windows_registry::find(target, "dumpbin.exe")
+ .expect("failed to find `dumpbin` tool");
+ let output = cmd
+ .arg("/DISASM")
+ .arg(&me)
+ .output()
+ .expect("failed to execute dumpbin");
+ println!(
+ "{}\n{}",
+ output.status,
+ String::from_utf8_lossy(&output.stderr)
+ );
+ assert!(output.status.success());
+ // Windows does not return valid UTF-8 output:
+ String::from_utf8_lossy(Vec::leak(output.stdout))
+ } else if cfg!(target_os = "windows") {
+ panic!("disassembly unimplemented")
+ } else {
+ let objdump = env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string());
+ let add_args = if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") {
+            // Target features need to be enabled for LLVM objdump on macOS ARM64
+ vec!["--mattr=+v8.6a,+crypto,+tme"]
+ } else {
+ vec![]
+ };
+ let output = Command::new(objdump.clone())
+ .arg("--disassemble")
+ .arg("--no-show-raw-insn")
+ .args(add_args)
+ .arg(&me)
+ .output()
+ .unwrap_or_else(|_| panic!("failed to execute objdump. OBJDUMP={}", objdump));
+ println!(
+ "{}\n{}",
+ output.status,
+ String::from_utf8_lossy(&output.stderr)
+ );
+ assert!(output.status.success());
+
+ String::from_utf8_lossy(Vec::leak(output.stdout))
+ };
+
+ parse(&disassembly)
+}
+
+fn parse(output: &str) -> HashSet<Function> {
+ let mut lines = output.lines();
+
+ println!(
+ "First 100 lines of the disassembly input containing {} lines:",
+ lines.clone().count()
+ );
+ for line in output.lines().take(100) {
+ println!("{}", line);
+ }
+
+ let mut functions = HashSet::new();
+ let mut cached_header = None;
+ while let Some(header) = cached_header.take().or_else(|| lines.next()) {
+ if !header.ends_with(':') || !header.contains("stdarch_test_shim") {
+ continue;
+ }
+ eprintln!("header: {}", header);
+ let symbol = normalize(header);
+ eprintln!("normalized symbol: {}", symbol);
+ let mut instructions = Vec::new();
+ while let Some(instruction) = lines.next() {
+ if instruction.ends_with(':') {
+ cached_header = Some(instruction);
+ break;
+ }
+ if instruction.is_empty() {
+ cached_header = None;
+ break;
+ }
+ let mut parts = if cfg!(target_env = "msvc") {
+ // Each line looks like:
+ //
+ // > $addr: ab cd ef $instr..
+                // > 00 12 # this line is optional
+ if instruction.starts_with(" ") {
+ continue;
+ }
+ instruction
+ .split_whitespace()
+ .skip(1)
+ .skip_while(|s| s.len() == 2 && usize::from_str_radix(s, 16).is_ok())
+ .map(std::string::ToString::to_string)
+ .skip_while(|s| *s == "lock") // skip x86-specific prefix
+ .collect::<Vec<String>>()
+ } else {
+ // objdump with --no-show-raw-insn
+ // Each line of instructions should look like:
+ //
+ // $rel_offset: $instruction...
+ instruction
+ .split_whitespace()
+ .skip(1)
+ .skip_while(|s| *s == "lock") // skip x86-specific prefix
+ .map(std::string::ToString::to_string)
+ .collect::<Vec<String>>()
+ };
+
+ if cfg!(target_arch = "aarch64") {
+ // Normalize [us]shll.* ..., #0 instructions to the preferred form: [us]xtl.* ...
+ // as LLVM objdump does not do that.
+ // See https://developer.arm.com/documentation/ddi0602/latest/SIMD-FP-Instructions/UXTL--UXTL2--Unsigned-extend-Long--an-alias-of-USHLL--USHLL2-
+ // and https://developer.arm.com/documentation/ddi0602/latest/SIMD-FP-Instructions/SXTL--SXTL2--Signed-extend-Long--an-alias-of-SSHLL--SSHLL2-
+ // for details.
+ match (parts.first(), parts.last()) {
+ (Some(instr), Some(last_arg))
+ if (instr.starts_with("ushll.") || instr.starts_with("sshll."))
+ && last_arg == "#0" =>
+ {
+ assert_eq!(parts.len(), 4);
+ let mut new_parts = Vec::with_capacity(3);
+ let new_instr = format!("{}{}{}", &instr[..1], "xtl", &instr[5..]);
+ new_parts.push(new_instr);
+ new_parts.push(parts[1].clone());
+ new_parts.push(parts[2][0..parts[2].len() - 1].to_owned()); // strip trailing comma
+ parts = new_parts;
+ }
+ _ => {}
+ };
+ }
+ instructions.push(parts.join(" "));
+ }
+ let function = Function {
+ name: symbol,
+ instrs: instructions,
+ };
+ assert!(functions.insert(function));
+ }
+
+ eprintln!("all found functions dump:");
+ for k in &functions {
+ eprintln!(" f: {}", k.name);
+ }
+
+ functions
+}
diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs
new file mode 100644
index 000000000..078736c66
--- /dev/null
+++ b/library/stdarch/crates/stdarch-test/src/lib.rs
@@ -0,0 +1,202 @@
+//! Runtime support needed for testing the stdarch crate.
+//!
+//! This basically just disassembles the current executable, parses the
+//! output once globally, and then provides the `assert` function, which makes
+//! assertions about the disassembly of a function.
+#![feature(bench_black_box)] // For black_box
+#![deny(rust_2018_idioms)]
+#![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)]
+
+#[macro_use]
+extern crate lazy_static;
+#[macro_use]
+extern crate cfg_if;
+
+pub use assert_instr_macro::*;
+pub use simd_test_macro::*;
+use std::{cmp, collections::HashSet, env, hash, hint::black_box, str};
+
+cfg_if! {
+ if #[cfg(target_arch = "wasm32")] {
+ pub mod wasm;
+ use wasm::disassemble_myself;
+ } else {
+ mod disassembly;
+ use crate::disassembly::disassemble_myself;
+ }
+}
+
+lazy_static! {
+ static ref DISASSEMBLY: HashSet<Function> = disassemble_myself();
+}
+
+#[derive(Debug)]
+struct Function {
+ name: String,
+ instrs: Vec<String>,
+}
+impl Function {
+ fn new(n: &str) -> Self {
+ Self {
+ name: n.to_string(),
+ instrs: Vec::new(),
+ }
+ }
+}
+
+impl cmp::PartialEq for Function {
+ fn eq(&self, other: &Self) -> bool {
+ self.name == other.name
+ }
+}
+impl cmp::Eq for Function {}
+
+impl hash::Hash for Function {
+ fn hash<H: hash::Hasher>(&self, state: &mut H) {
+ self.name.hash(state)
+ }
+}
+
+/// Main entry point for this crate, called by the `#[assert_instr]` macro.
+///
+/// This asserts that the disassembly of the shim at `shim_addr`, whose symbol
+/// name is `fnname`, contains the provided `expected` instruction.
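+///
+/// An illustrative sketch of how the `#[assert_instr]` macro drives this (the
+/// shim name and expansion here are assumptions, not the exact generated
+/// code):
+///
+/// ```ignore
+/// // #[assert_instr(add)] on an intrinsic generates a shim whose symbol
+/// // contains `stdarch_test_shim`, plus a test that roughly calls:
+/// stdarch_test::assert(shim as usize, "stdarch_test_shim_vaddq_u8", "add");
+/// ```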
+pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
+ // Make sure that the shim is not removed
+ black_box(shim_addr);
+
+ //eprintln!("shim name: {}", fnname);
+ let function = &DISASSEMBLY
+ .get(&Function::new(fnname))
+ .unwrap_or_else(|| panic!("function \"{}\" not found in the disassembly", fnname));
+ //eprintln!(" function: {:?}", function);
+
+ let mut instrs = &function.instrs[..];
+ while instrs.last().map_or(false, |s| s == "nop") {
+ instrs = &instrs[..instrs.len() - 1];
+ }
+
+ // Look for `expected` as the first part of any instruction in this
+ // function, e.g., tzcntl in tzcntl %rax,%rax.
+ //
+    // There are two cases where the expected instruction is `nop`:
+    // 1. The expected intrinsic is compiled away, so we can't check for it,
+    //    i.e. the intrinsic does not generate any code.
+    // 2. `nop` is used as a marker, indicating that the instruction will be
+    //    compiled into other instructions, mainly because of LLVM
+    //    optimization.
+ let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected));
+
+ // Look for subroutine call instructions in the disassembly to detect whether
+ // inlining failed: all intrinsics are `#[inline(always)]`, so calling one
+ // intrinsic from another should not generate subroutine call instructions.
+ let inlining_failed = if cfg!(target_arch = "x86_64") || cfg!(target_arch = "wasm32") {
+ instrs.iter().any(|s| s.starts_with("call "))
+ } else if cfg!(target_arch = "x86") {
+ instrs.windows(2).any(|s| {
+ // On 32-bit x86 position independent code will call itself and be
+ // immediately followed by a `pop` to learn about the current address.
+ // Let's not take that into account when considering whether a function
+ // failed inlining something.
+ s[0].starts_with("call ") && s[1].starts_with("pop") // FIXME: original logic but does not match comment
+ })
+ } else if cfg!(target_arch = "aarch64") {
+ instrs.iter().any(|s| s.starts_with("bl "))
+ } else {
+ // FIXME: Add detection for other archs
+ false
+ };
+
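+    // The limit chosen below can be overridden when running the tests, e.g.
+    // with `STDARCH_ASSERT_INSTR_LIMIT=50 cargo test` (illustrative
+    // invocation).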
+ let instruction_limit = std::env::var("STDARCH_ASSERT_INSTR_LIMIT")
+ .ok()
+ .map_or_else(
+ || match expected {
+ // `cpuid` returns a pretty big aggregate structure, so exempt
+ // it from the slightly more restrictive 22 instructions below.
+ "cpuid" => 30,
+
+ // Apparently, on Windows, LLVM generates a bunch of
+                // saves/restores of xmm registers around these instructions,
+ // which exceeds the limit of 20 below. As it seems dictated by
+ // Windows's ABI (I believe?), we probably can't do much
+ // about it.
+ "vzeroall" | "vzeroupper" if cfg!(windows) => 30,
+
+ // Intrinsics using `cvtpi2ps` are typically "composites" and
+ // in some cases exceed the limit.
+ "cvtpi2ps" => 25,
+ // core_arch/src/arm_shared/simd32
+ // vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
+ "usad8" | "vfma" | "vfms" => 27,
+ "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
+ // core_arch/src/arm_shared/simd32
+ // vst1q_s64_x4_vst1 : #instructions = 22 >= 22 (limit)
+ "vld3" => 23,
+ // core_arch/src/arm_shared/simd32
+ // vld4q_lane_u32_vld4 : #instructions = 31 >= 22 (limit)
+ "vld4" => 32,
+ // core_arch/src/arm_shared/simd32
+ // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit)
+ "vst1" => 41,
+ // core_arch/src/arm_shared/simd32
+ // vst4q_u32_vst4 : #instructions = 26 >= 22 (limit)
+ "vst4" => 27,
+
+ // Temporary, currently the fptosi.sat and fptoui.sat LLVM
+ // intrinsics emit unnecessary code on arm. This can be
+ // removed once it has been addressed in LLVM.
+ "fcvtzu" | "fcvtzs" | "vcvt" => 64,
+
+ // core_arch/src/arm_shared/simd32
+ // vst1q_p64_x4_nop : #instructions = 33 >= 22 (limit)
+ "nop" if fnname.contains("vst1q_p64") => 34,
+
+ // Original limit was 20 instructions, but ARM DSP Intrinsics
+ // are exactly 20 instructions long. So, bump the limit to 22
+ // instead of adding here a long list of exceptions.
+ _ => 22,
+ },
+ |v| v.parse().unwrap(),
+ );
+ let probably_only_one_instruction = instrs.len() < instruction_limit;
+
+ if found && probably_only_one_instruction && !inlining_failed {
+ return;
+ }
+
+ // Help debug by printing out the found disassembly, and then panic as we
+ // didn't find the instruction.
+    println!("disassembly for {}: ", fnname);
+ for (i, instr) in instrs.iter().enumerate() {
+ println!("\t{:2}: {}", i, instr);
+ }
+
+ if !found {
+ panic!(
+ "failed to find instruction `{}` in the disassembly",
+ expected
+ );
+ } else if !probably_only_one_instruction {
+ panic!(
+ "instruction found, but the disassembly contains too many \
+ instructions: #instructions = {} >= {} (limit)",
+ instrs.len(),
+ instruction_limit
+ );
+ } else if inlining_failed {
+ panic!(
+ "instruction found, but the disassembly contains subroutine \
+ call instructions, which hint that inlining failed"
+ );
+ }
+}
+
+pub fn assert_skip_test_ok(name: &str) {
+ if env::var("STDARCH_TEST_EVERYTHING").is_err() {
+ return;
+ }
+ panic!("skipped test `{}` when it shouldn't be skipped", name);
+}
+
+// See comment in `assert-instr-macro` crate for why this exists
+pub static mut _DONT_DEDUP: *const u8 = std::ptr::null();
diff --git a/library/stdarch/crates/stdarch-test/src/wasm.rs b/library/stdarch/crates/stdarch-test/src/wasm.rs
new file mode 100644
index 000000000..bf411c121
--- /dev/null
+++ b/library/stdarch/crates/stdarch-test/src/wasm.rs
@@ -0,0 +1,55 @@
+//! Disassembly support for `wasm32` targets.
+
+use crate::Function;
+use std::collections::HashSet;
+
+pub(crate) fn disassemble_myself() -> HashSet<Function> {
+ // Use `std::env::args` to find the path to our executable. Assume the
+ // environment is configured such that we can read that file. Read it and
+ // use the `wasmprinter` crate to transform the binary to text, then search
+ // the text for appropriately named functions.
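+    //
+    // The printed text is the standard `.wat` format, in which a function
+    // looks roughly like (illustrative):
+    //
+    //   (func $stdarch_test_shim_foo (type 0) (param i32) (result i32)
+    //     local.get 0
+    //     i32.clz)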
+ let me = std::env::args()
+ .next()
+ .expect("failed to find current wasm file");
+ let output = wasmprinter::print_file(&me).unwrap();
+
+ let mut ret: HashSet<Function> = HashSet::new();
+ let mut lines = output.lines().map(|s| s.trim());
+ while let Some(line) = lines.next() {
+ // If this isn't a function, we don't care about it.
+ if !line.starts_with("(func ") {
+ continue;
+ }
+
+ let mut function = Function {
+ name: String::new(),
+ instrs: Vec::new(),
+ };
+
+ // Empty functions will end in `))` so there's nothing to do, otherwise
+ // we'll have a bunch of following lines which are instructions.
+ //
+ // Lines that have an imbalanced `)` mark the end of a function.
+ if !line.ends_with("))") {
+ while let Some(line) = lines.next() {
+ function.instrs.push(line.to_string());
+ if !line.starts_with("(") && line.ends_with(")") {
+ break;
+ }
+ }
+ }
+ // The second element here split on whitespace should be the name of
+ // the function, skipping the type/params/results
+ function.name = line.split_whitespace().nth(1).unwrap().to_string();
+ if function.name.starts_with("$") {
+ function.name = function.name[1..].to_string()
+ }
+
+ if !function.name.contains("stdarch_test_shim") {
+ continue;
+ }
+
+ assert!(ret.insert(function));
+ }
+ return ret;
+}
diff --git a/library/stdarch/crates/stdarch-verify/Cargo.toml b/library/stdarch/crates/stdarch-verify/Cargo.toml
new file mode 100644
index 000000000..6362e3d57
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "stdarch-verify"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+edition = "2018"
+
+[dependencies]
+proc-macro2 = "1.0"
+quote = "1.0"
+syn = { version = "1.0", features = ["full"] }
+
+[lib]
+proc-macro = true
+test = false
+
+[dev-dependencies]
+serde = { version = "1.0", features = ['derive'] }
+serde-xml-rs = "0.3"
+html5ever = "0.23.0"
diff --git a/library/stdarch/crates/stdarch-verify/arm-intrinsics.html b/library/stdarch/crates/stdarch-verify/arm-intrinsics.html
new file mode 100644
index 000000000..ac246c6ba
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/arm-intrinsics.html
@@ -0,0 +1,93399 @@
+
+
+<!DOCTYPE html>
+<html class="page no-js" lang="en">
+<head>
+ <script>
+ if ((window.location.href.indexOf('<') || window.location.href.indexOf('>')) > -1) {
+ window.location.href = window.location.href.replace(/<.*>/g, '');
+ }
+ </script>
+
+ <title>Technologies | NEON Intrinsics Reference – Arm Developer</title>
+
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta http-equiv="X-UA-Compatible" content="IE=Edge">
+<meta content="en" http-equiv="content-language">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<meta name="author" content="Arm Ltd.">
+<meta name="description" content="All the NEON intrinsics reference in an interactive page.">
+<meta name="keywords" content="">
+<meta content="Copyright &#169; 1995-2018 Arm Limited (or its affiliates). All rights reserved." name="copyright">
+<meta name="apple-mobile-web-app-capable" content="yes">
+<meta name="msapplication-config" content="https://developer.arm.com:443/shared/common/img/favicon/browserconfig.xml">
+<meta name="msapplication-TileColor" content="#2b5797">
+<meta name="msapplication-TileImage" content="https://developer.arm.com:443/shared/common/img/favicon/mstile-144x144.png">
+<meta name="theme-color" content="#ffffff">
+<meta name="server" content="ARMGPCD2" />
+
+<meta property="og:title" content="Technologies | NEON Intrinsics Reference – Arm Developer">
+<meta property="og:description" content="All the NEON intrinsics reference in an interactive page.">
+<meta property="og:image" content="https://developer.arm.com:443">
+<meta property="og:site_name" content="ARM Developer">
+<meta property="og:url" content="https://developer.arm.com/technologies/neon/intrinsics">
+<meta property="og:type" content="website">
+<meta property="og:locale" content="en">
+
+<meta property="article:author" content="Arm Ltd.">
+<meta property="article:publisher" content="Arm Ltd.">
+
+<meta name="twitter:card" content="summary">
+<meta name="twitter:site" content="ARM Developer">
+<meta name="twitter:title" content="Technologies | NEON Intrinsics Reference – Arm Developer">
+<meta name="twitter:description" content="All the NEON intrinsics reference in an interactive page.">
+<meta name="twitter:image" content="https://developer.arm.com:443">
+<meta name="twitter:url" content="https://developer.arm.com/technologies/neon/intrinsics">
+
+<meta itemprop="name" content="Technologies | NEON Intrinsics Reference – Arm Developer">
+<meta itemprop="description" content="All the NEON intrinsics reference in an interactive page.">
+<meta itemprop="image" content="https://developer.arm.com:443">
+
+
+
+
+ <link rel="stylesheet" type="text/css" href="/shared/developer.arm.com/css/app.css?v=D41D8CD98F00B204E9800998ECF8427E" />
+
+
+
+<link rel="apple-touch-icon" sizes="57x57" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="apple-touch-icon" sizes="60x60" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="apple-touch-icon" sizes="72x72" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="apple-touch-icon" sizes="76x76" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="apple-touch-icon" sizes="114x114" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="apple-touch-icon" sizes="120x120" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="apple-touch-icon" sizes="144x144" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="apple-touch-icon" sizes="152x152" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="apple-touch-icon" sizes="180x180" href="https://developer.arm.com/shared/common/img/favicon/apple-touch-icon.png?v=2.29.0.0" />
+<link rel="icon" type="image/png" href="https://developer.arm.com/shared/common/img/favicon/favicon-32x32.png?v=2.29.0.0" sizes="32x32" />
+<link rel="icon" type="image/png" href="https://developer.arm.com/shared/common/img/favicon/favicon-48x48.png?v=2.29.0.0" sizes="48x48" />
+<link rel="icon" type="image/png" href="https://developer.arm.com/shared/common/img/favicon/android-chrome-192x192.png?v=2.29.0.0" sizes="192x192" />
+<link rel="icon" type="image/png" href="https://developer.arm.com/shared/common/img/favicon/android-chrome-256x256.png?v=2.29.0.0" sizes="256x256" />
+<link rel="icon" type="image/png" href="https://developer.arm.com/shared/common/img/favicon/favicon-16x16.png?v=2.29.0.0" sizes="16x16" />
+<link rel="shortcut icon" type="image/ico" href="https://developer.arm.com/shared/common/img/favicon/favicon.ico?v=2.29.0.0" />
+<link rel="manifest" href="https://developer.arm.com/shared/common/img/favicon/manifest.json?v=2.29.0.0" />
+
+ <link rel="search" type="application/opensearchdescription+xml" title="ARM Developer" href="/opensearch.xml"/>
+
+
+
+
+
+<!-- Google Tag Manager -->
+<script>
+(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
+new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
+j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
+'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
+ })(window, document, 'script', 'dataLayer', 'GTM-K25LQR');
+</script>
+<!-- End Google Tag Manager -->
+
+
+</head>
+<body id="">
+
+
+
+<noscript>
+ <iframe src="https://www.googletagmanager.com/ns.html?id=GTM-K25LQR" height="0" width="0" style="display:none;visibility:hidden"></iframe>
+</noscript>
+
+
+
+ <div class="c-feedback-message-container u-no-print"><style>
+/* Docs top margin fix */
+#content.arm-docs .section-wrapper h1 {
+ padding-top: 0 !important;
+}
+#content.arm-docs .section-wrapper {
+ margin-top: 2em;
+}
+</style>
+<style>
+ .top-bar-section .has-dropdown ul.dropdown a.active {
+ box-shadow: none;
+ }
+</style>
+<div id="modal-welcome" class="reveal-modal" data-reveal>
+ <p class="lead">ARM’s developer website includes documentation, tutorials, support resources and more.</p>
+ <p>Over the next few months we will be adding more developer resources and documentation for all the products and technologies that ARM provides.</p>
+ <a class="close-reveal-modal button" style="position: relative; float: right; color: white; font-size: 1em;">Close</a>
+</div>
+
+<!-- Hot fix for accordion icons -->
+<style>
+.accordion .accordion-navigation > a .accordion-icon:before {
+ content: "\f196" !important;
+}
+.accordion .accordion-navigation.active > a .accordion-icon:before {
+ content: "\f147" !important;
+}
+.accordion .accordion-navigation > a .accordion-icon {
+ font-size: 1em !important;
+ vertical-align: 0 !important;
+}
+</style></div>
+
+
+
+
+
+
+
+<div class="c-skip-navigation u-no-print">
+ <a href="#content" aria-label="Clik here to skip to Main Content">Skip to Main Content</a>
+ <a href="#footer" aria-label="Clik here to skip to Footer Navigation">Skip to Footer Navigation</a>
+</div>
+
+<div class="c-unsupported-browser-message u-no-print text-center old-ie-version">
+ <p>
+ <strong>
+ Sorry, your browser is not supported. <a href="https://whatbrowser.org/" target="_blank"> We recommend upgrading your browser</a>.
+ We have done our best to make all the documentation and resources available on old versions of Internet Explorer, but vector image support and the layout may not be optimal. Technical documentation is available as a PDF Download.
+ </strong>
+ </p>
+</div>
+
+<noscript>
+ <div class="js-disabled-message u-no-print text-center">
+ <p>
+ <strong>JavaScript seems to be disabled in your browser.</strong><br />
+ You must have JavaScript enabled in your browser to utilize the functionality of this website.
+ </p>
+ </div>
+</noscript>
+
+
+<div class="c-notifications-wrapper">
+
+
+ <style>
+ .c-notifications-wrapper {
+ background-color: #e5eceb;
+ position: relative;
+ z-index: 99;
+ }
+ </style>
+
+
+
+
+
+
+ <script>
+ (function() {
+ function setHeight() {
+ var $notification = document.querySelector('.c-notification');
+ if (!$notification) return;
+ var computedStyles = getComputedStyle($notification);
+ var height = computedStyles.getPropertyValue('height');
+ var $parent = $notification.parentElement;
+ if ($parent) $parent.style.height = height;
+ }
+ function getCookie(cname) {
+ var name = cname + '=';
+ var decodedCookie = decodeURIComponent(document.cookie);
+ var ca = decodedCookie.split(';');
+ for(var i = 0; i <ca.length; i++) {
+ var c = ca[i];
+ while (c.charAt(0) == ' ') {
+ c = c.substring(1);
+ }
+ if (c.indexOf(name) == 0) {
+ return c.substring(name.length, c.length);
+ }
+ }
+ return '';
+ }
+ var $closeNotification = document.querySelector('.c-notification__close');
+ var $notificationWrapper = document.querySelector('.c-notifications-wrapper');
+ if (!$notificationWrapper) return;
+ if (!$closeNotification) return;
+ $closeNotification.onclick = function() {
+ $notificationWrapper.style.display = 'none';
+ var expiryDate = new Date();
+ expiryDate.setTime(expiryDate.getTime() + (3650 * 24 * 60 * 60 * 1000));
+ document.cookie ='acceptSesameCookie=true; expires=' + expiryDate + '; domain=.arm.com; path=/';
+ };
+ var acceptCookie = getCookie('acceptSesameCookie');
+ if (acceptCookie) {
+ $notificationWrapper.style.display = 'none';
+ }
+ window.addEventListener('resize', setHeight);
+ setHeight();
+ })();
+ </script>
+</div>
+
+
+<header class="c-header u-no-print" role="banner">
+
+ <div class="arm-global-menu-wrapper">
+ <div class="arm-global-menu">
+
+
+ <nav class="top-bar js-mobile-navigation" data-topbar="" role="navigation">
+
+
+ <div class="global-menu">
+
+
+ <ul class="title-area">
+
+
+ <li class="navigation-dropdown">
+ <span class="navigation-dropdown-label">
+ <a href="/">
+ <span>
+ <img src="/shared/developer.arm.com/img/arm-developer.svg" alt="ARM Developer" />
+ </span>
+ <i class="fa fa-caret-down"></i>
+ </a>
+ </span>
+ <ul class="navigation-dropdown-list">
+ <li class="navigation-dropdown-list-item">
+ <a href="/" title="Home"><span><i class="fa fa-home"></i> Home</span></a>
+ </li>
+ <li class="navigation-dropdown-list-divider"><span></span></li>
+ <li class="navigation-dropdown-list-item">
+ <a href="https://developer.arm.com/embedded" title="Embedded Software Development"><span>Embedded Software Development</span></a>
+ </li>
+ <li class="navigation-dropdown-list-item">
+ <a href="https://developer.arm.com/open-source" title="Linux &amp; Open Source"><span>Linux &amp; Open Source</span></a>
+ </li>
+ <li class="navigation-dropdown-list-item">
+ <a href="https://www.arm.com/resources/education" title="Education"><span>Education</span></a>
+ </li>
+ <li class="navigation-dropdown-list-item">
+ <a href="https://www.arm.com/resources/research" title="Research"><span>Research</span></a>
+ </li>
+ <li class="navigation-dropdown-list-item">
+ <a href="https://developer.arm.com/graphics" title="Graphics &amp; Multimedia Development"><span>Graphics &amp; Multimedia Development</span></a>
+ </li>
+ <li class="navigation-dropdown-list-item">
+ <a href="https://developer.arm.com/soc" title="SoC Design"><span>SoC Design</span></a>
+ </li>
+ <li class="navigation-dropdown-list-item">
+ <a href="https://developer.arm.com/hpc" title="High Performance Computing"><span>High Performance Computing</span></a>
+ </li>
+</ul>
+
+ </li>
+
+
+ <li class="menu">
+ <ul>
+ <li><a class="underline" href="https://developer.arm.com/products" title="Products">Products</a></li>
+ <li><a class="underline" href="https://developer.arm.com/solutions" title="Solutions">Solutions</a></li>
+ <li><a class="underline" href="https://developer.arm.com/technologies" title="Technologies">Technologies</a></li>
+ <li><a class="underline" href="https://developer.arm.com/support" title="Support">Support</a></li>
+ </ul>
+</li>
+
+
+
+ <li class="search js-search">
+ <div id="global-search-box">
+ <!-- When customizing this component, ensure to use "Coveo.$" instead of the regular jQuery "$" to
+ avoid any conflicts with Sitecore's Page Editor/Experience Editor. -->
+
+<div>
+ <link rel="stylesheet" href="/Coveo/css/CoveoFullSearchNewDesign.css" />
+ <link rel="stylesheet" href="/Coveo/css/CoveoComponent.css" />
+ <link href="/shared/developer.arm.com/css/search.css" rel="stylesheet" />
+
+ <script type="text/javascript" src="/Coveo/js/CoveoJsSearch.WithDependencies.min.js"></script>
+ <script type="text/javascript" src="/Coveo/js/CoveoForSitecorePolyfills.js"></script>
+ <script type="text/javascript" src="/Coveo/js/CoveoForSitecore.js"></script>
+
+</div>
+<script src="/Coveo/js/CoveoForSitecoreOmniboxResultListFixCursor.js"></script>
+
+
+
+
+ <div>
+ <script type="text/javascript" src="/Coveo/js/cultures/en.js"></script>
+ <script type="text/javascript">
+ Coveo.$(function () {
+
+ var searchbox = Coveo.$('#globalsearchbox');
+ if (typeof (CoveoForSitecore) !== 'undefined') {
+ CoveoForSitecore.componentsOptions = {"analyticsCustomMetadata" : {"sitename" : "website" , "siteName" : "website" , "pageFullPath" : "/sitecore/content/developer/technologies/neon/intrinsics"},"analyticsEndpointUri" : "/coveo/rest/v6/analytics" , "boostExpressions" : "" , "clientLanguageFieldName" : "@z95xlanguage" , "clientLanguageName" : "en" , "defaultSortType" : "" , "defaultSortField" : "" , "defaultSortCriteriaNoSpace" : "Relevancy" , "defaultSortCriteriaLowercase" : "relevancy" , "enableClientSideLogging" : false,"externalCollections" : [],"externalSources" : [],"filterResultsOnCurrentCulture" : true,"filterExpression" : "NOT @templateid==(\"adb6ca4f-03ef-4f47-b9ac-9ce2ba53ff97\",\"fe5dd826-48c6-436d-b87a-7c4210c7413b\") AND @haslayout == 1 AND @z95xpath == \"110d559fdea542ea9c1c8a5df7e70ef9\"" , "id" : "coveoa8e6c6d9" , "indexSourceName" : "Coveo_web_index - DEVELOPER" , "isEditingInPageEditor" : false,"isPreviewingInPageEditor" : false,"isPreviewingInPageEditorWithSimulatedDevice" : false,"latestVersionFieldName" : "@z95xlatestversion" , "pageFullPath" : "/sitecore/content/developer/technologies/neon/intrinsics" , "pageName" : "intrinsics" , "restEndpointUri" : "/coveo/rest" , "searchboxPlaceholderText" : "" , "sendToSitecoreAnalytics" : false,"sitecoreItemId" : "eae5ffe5-224d-49c2-b491-93cad803b595" , "sitecoreItemUri" : "sitecore://web/{EAE5FFE5-224D-49C2-B491-93CAD803B595}?lang=en\u0026ver=2" , "siteName" : "website" , "searchRedirectionItemName" : "search" , "searchRedirectionUrl" : "/search" , "keepOmniboxSuggestionsProvidersDefaultOrdering" : false};
+ searchbox.coveoForSitecore('initSearchbox',
+ CoveoForSitecore.componentsOptions);
+ } else {
+ Coveo.SearchEndpoint.endpoints["default"] = new Coveo.SearchEndpoint({"restUri" : "/coveo/rest" , "queryStringArguments" : {"sitecoreItemUri" : "sitecore://web/{EAE5FFE5-224D-49C2-B491-93CAD803B595}?lang=en\u0026ver=2" , "siteName" : "website"}});
+ searchbox.coveo('initSearchbox',
+ '/search');
+ }
+
+ Coveo.$('#globalsearchbox').on("afterInitialization", function() {
+ var queryBox = Coveo.$(this).find("input.CoveoQueryBox");
+ if (!queryBox) {
+ return;
+ }
+ queryBox.attr("placeholder", '');
+ queryBox.attr("aria-label", 'Search');
+ });
+
+ });
+ </script>
+
+ <div id="globalsearchbox"
+ >
+ <div class="CoveoAnalytics"
+ data-anonymous="True"
+ data-endpoint="/coveo/rest/coveoanalytics"
+ data-search-hub="search"
+ data-send-to-cloud="True">
+ </div>
+ <div class="CoveoSearchbox"
+ data-auto-focus="True"
+ data-enable-lowercase-operators="False"
+ data-enable-partial-match="False"
+ data-partial-match-keywords="5"
+ data-partial-match-threshold="50%"
+ data-enable-question-marks="False"
+ data-enable-wildcards="False"
+ data-enable-omnibox="true"
+ data-omnibox-timeout="500"
+ data-enable-field-addon="False"
+ data-enable-simple-field-addon="False"
+ data-enable-top-query-addon="False"
+ data-enable-reveal-query-suggest-addon="False"
+ data-enable-query-extension-addon="False"
+ ></div>
+
+
+
+
+
+
+
+ <script type="text/javascript">
+
+
+ </script>
+ <span class="CoveoForSitecoreOmniboxResultList"
+ data-header-title='Suggested Results'
+ data-query-expression=''
+ data-number-of-results='10'
+ data-result-template-id='globalsearchresults'>
+ </span>
+ <script id="globalsearchresults" class="result-template" type="text/x-underscore-template">
+ <div>
+ <a href="{{=clickUri}}" class="coveo-title">{{=title?highlight(title, titleHighlights):''}}</a>
+ </div>
+ </script>
+
+
+ </div>
+ </div>
+
+ </div>
+ </li>
+
+
+
+
+
+ <li class="menu-icon c-mobile-toggle c-mobile-search-toggle js-mobile-toggle" data-toggle="search">
+ <a href="#" tabindex="0" title="Search" aria-label="Search" aria-haspopup="true">
+ <i class="fa fa-search"></i><span class="sr-only">Search</span>
+ </a>
+ </li>
+
+
+
+
+
+ <li class="menu-icon c-mobile-toggle c-mobile-navigation-toggle js-mobile-toggle" data-toggle="navigation">
+ <a href="#" tabindex="0" title="Mobile Navigation" aria-label="Mobile Navigation" aria-haspopup="true">
+ <i class="fa fa-bars"></i><span class="sr-only">Mobile Navigation</span>
+ </a>
+ </li>
+
+
+ <li class="developer-user-menu">
+
+
+<div id="c-65110123-4209-4daa-a4a3-335c8f0caa10" class="c-user-menu" role="navigation" aria-label="User menu">
+ <ul class="c-user-menu__items c-navigation__items c-user-menu__root" role="menubar">
+ <li class="c-user-menu__item" aria-haspopup="true" role="menuitem">
+ <a class="c-user-menu__toggle" title="User Menu" tabindex="0" aria-expanded="false">
+ <i class="fa fa-user" aria-hidden="true"></i>
+ </a>
+ <ul class="c-user-menu__items c-navigation__items c-user-menu__section is-aligned-right" aria-hidden="true" role="menu" tabindex="-1">
+ <li class="c-user-menu__item" aria-haspopup="false" role="menuitem">
+ <a href="/login?returnUrl=/technologies/neon/intrinsics" title="Login" tabindex="0" aria-expanded="false">Login</a>
+ </li>
+ <li class="c-user-menu__item" aria-haspopup="false" role="menuitem">
+ <a href="/register?returnUrl=/technologies/neon/intrinsics" title="Register" tabindex="0" aria-expanded="false">Register</a>
+ </li>
+ </ul>
+ </li>
+</ul>
+
+</div>
+
+<link rel="stylesheet" href="https://developer.arm.com/shared/arm-account/css/modules/user-menu.css?v=2.29.0.0" />
+
+ </li>
+ </ul>
+
+
+ </div>
+
+
+
+ <section class="arm-mobile-navigation top-bar-section hide-for-large-up">
+ <ul class="right">
+ <li class="has-dropdown">
+ <a href="#">Main Menu</a>
+ <ul class="dropdown">
+ <li><a href="/products">Products</a></li>
+ <li><a href="/solutions">Solutions</a></li>
+ <li><a href="/technologies">Technologies</a></li>
+ <li><a href="/support">Support</a></li>
+ </ul>
+ </li>
+ <ul class="left"><li><a class="" href="/technologies">Overview</a></li><li><a class="" href="/technologies/big-little">big.LITTLE</a></li><li><a class="active" href="/technologies/neon">NEON</a></li><li><a class="" href="/embedded/cmsis">CMSIS</a></li><li class="has-dropdown"><a class="" href="/technologies/dsp">DSP</a><ul class="dropdown"><li><a class="" href="/technologies/dsp">DSP Overview</a></li><li><a class="" href="/technologies/dsp/arm-dsp-ecosystem-partners">Arm DSP ecosystem partners</a></li><li><a class="" href="/technologies/dsp/dsp-for-cortex-r">DSP for Cortex-R</a></li><li><a class="" href="/technologies/dsp/dsp-for-cortex-m">DSP for Cortex-M</a></li><li><a class="" href="/technologies/neon">NEON for Cortex-A and Cortex-R52</a></li></ul></li><li class="has-dropdown"><a class="" href="/technologies/machine-learning-on-arm">Machine Learning on Arm</a><ul class="dropdown"><li class="has-dropdown"><a class="" href="/technologies/machine-learning-on-arm/developer-material">Developer material</a><ul class="dropdown"><li class="has-dropdown"><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides">How-to guides</a><ul class="dropdown"><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/configuring-the-arm-nn-sdk-build-environment-for-onnx">Configuring the Arm NN SDK build environment for ONNX</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/configuring-the-arm-nn-sdk-build-environment-for-tensorflow">Configuring the Arm NN SDK build environment for TensorFlow</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/configuring-the-arm-nn-sdk-build-environment-for-tensorflow-lite">Configuring the Arm NN SDK build environment for TensorFlow Lite</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/deploying-a-caffe-model-on-openmv-using-cmsis-nn">Deploying a Caffe Model on OpenMV using CMSIS-NN</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/deploying-cloud-based-ml-for-speech-transcription">Deploying cloud-based ML for speech transcription</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/configuring-the-arm-nn-sdk-build-environment-for-caffe">Configuring the Arm NN SDK build environment for Caffe</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/deploying-a-tensorflow-mnist-model-on-arm-nn">Deploying a TensorFlow MNIST model on Arm NN</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/deploying-a-caffe-mnist-model-using-the-arm-nn-sdk">Deploying a Caffe MNIST model using the Arm NN SDK</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/profiling-alexnet-on-raspberry-pi-and-hikey-960-with-the-compute-library">Profiling AlexNet on Raspberry Pi and HiKey 960 with the Compute Library</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/optimizing-neural-networks-for-mobile-and-embedded-devices-with-tensorflow">Optimizing neural networks for mobile and embedded devices with TensorFlow</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/running-alexnet-on-raspberry-pi-with-compute-library">Running AlexNet on Raspberry Pi with 
Compute Library</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/improving-your-machine-learning-workflow-using-the-arm-nn-sdk">Improving your machine learning workflow using the Arm NN SDK</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/quantizing-neural-networks-to-8-bit-using-tensorflow">Quantizing neural networks to 8-bit using TensorFlow</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/cross-compiling-arm-nn-for-the-raspberry-pi-and-tensorflow">Cross-compiling Arm NN for the Raspberry Pi and TensorFlow</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/teach-your-raspberry-pi-yeah-world">Teach your Raspberry Pi - Yeah world</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/teach-your-raspberry-pi-multi-gesture">Teach your Raspberry Pi - Multi-gesture</a></li></ul></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/webinars">Webinars</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/software-for-machine-learning-on-arm">Software for Machine Learning on Arm</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/research-papers">Research papers</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/white-papers">White papers</a></li></ul></li><li><a class="" href="/technologies/machine-learning-on-arm/ecosystem-partners">Ecosystem partners</a></li><li><a class="" href="https://community.arm.com/p/ml-blog">Blog</a></li></ul></li><li class="has-dropdown"><a class="" href="/technologies/security-on-arm">Security on Arm</a><ul class="dropdown"><li><a class="" href="/technologies/security-on-arm/arm-technologies">Arm technologies</a></li><li><a class="" href="/technologies/security-on-arm/how-do-i-implement">How do I implement</a></li><li><a class="" href="/technologies/security-on-arm/types-of-attack-and-counter-measures">Types of attack and counter-measures</a></li><li class="has-dropdown"><a class="" href="/technologies/security-on-arm/arm-security-developer-community">Arm Security Developer Community</a><ul class="dropdown"><li><a class="" href="/technologies/security-on-arm/arm-security-developer-community/arm-security-partners">Arm Security Partners</a></li></ul></li></ul></li><li><a class="" href="/technologies/uefi-drivers">UEFI Drivers</a></li><li><a class="" href="/technologies/dynamiq">DynamIQ</a></li><li class="has-dropdown"><a class="" href="/technologies/graphics-technologies">Graphics Technologies</a><ul class="dropdown"><li><a class="" href="/technologies/graphics-technologies/adaptive-scalable-texture-compression">Adaptive Scalable Texture Compression</a></li><li><a class="" href="/technologies/graphics-technologies/arm-frame-buffer-compression">Arm Frame Buffer Compression</a></li><li><a class="" href="/technologies/graphics-technologies/transaction-elimination">Transaction Elimination</a></li></ul></li><li class="has-dropdown"><a class="" href="/technologies/trustzone">TrustZone</a><ul class="dropdown"><li><a class="" href="/technologies/trustzone/webinar-how-to-implement-a-secure-iot-system-on-armv8-m">Webinar - How to implement a secure IoT system on Armv8-M</a></li></ul></li><li><a class="" href="/technologies/compute-library">Compute Library</a></li><li><a class="" 
href="/technologies/floating-point">Floating Point</a></li></ul>
+</ul>
+
+ </section>
+
+
+ </nav>
+
+
+
+ </div>
+ </div>
+
+
+<div class="main-header">
+ <div class="row">
+ <div class="large-12 columns">
+ <ul class="breadcrumbs">
+ <li>
+ <a href="/" title="Home">Home</a>
+ </li>
+ <li>
+ <a href="/technologies" title="Technologies">Technologies</a>
+ </li>
+ <li>
+ <a href="/technologies/neon" title="NEON">NEON</a>
+ </li>
+ <li class="current">NEON Intrinsics Reference</li>
+</ul>
+
+
+<h1>NEON Intrinsics Reference</h1> </div>
+ </div>
+</div>
+
+
+<div class="c-contextual-navigation-wrapper show-for-large-up">
+ <style>
+ .c-contextual-navigation-wrapper,
+ .c-contextual-navigation {
+ min-height: 55px;
+ position: relative;
+ }
+ .c-contextual-navigation {
+ width: 100%;
+ }
+ .c-contextual-navigation.is-stuck {
+ position: fixed;
+ z-index: 999;
+ }
+ </style>
+ <div id="middle" class="c-contextual-navigation full-width-nav">
+ <div class="contain-to-grid">
+ <nav class="top-bar mid-navigation" data-topbar="" role="navigation">
+ <ul class="title-area">
+ <li class="name"></li>
+ <li class="toggle-topbar menu-icon"><a href="#"><span></span></a></li>
+ </ul>
+ <section class="top-bar-section mid-nav">
+<ul class="left"><li><a class="" href="/technologies">Overview</a></li><li><a class="" href="/technologies/big-little">big.LITTLE</a></li><li><a class="active" href="/technologies/neon">NEON</a></li><li><a class="" href="/embedded/cmsis">CMSIS</a></li><li class="has-dropdown"><a class="" href="/technologies/dsp">DSP</a><ul class="dropdown"><li><a class="" href="/technologies/dsp">DSP Overview</a></li><li><a class="" href="/technologies/dsp/arm-dsp-ecosystem-partners">Arm DSP ecosystem partners</a></li><li><a class="" href="/technologies/dsp/dsp-for-cortex-r">DSP for Cortex-R</a></li><li><a class="" href="/technologies/dsp/dsp-for-cortex-m">DSP for Cortex-M</a></li><li><a class="" href="/technologies/neon">NEON for Cortex-A and Cortex-R52</a></li></ul></li><li class="has-dropdown"><a class="" href="/technologies/machine-learning-on-arm">Machine Learning on Arm</a><ul class="dropdown"><li class="has-dropdown"><a class="" href="/technologies/machine-learning-on-arm/developer-material">Developer material</a><ul class="dropdown"><li class="has-dropdown"><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides">How-to guides</a><ul class="dropdown"><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/configuring-the-arm-nn-sdk-build-environment-for-onnx">Configuring the Arm NN SDK build environment for ONNX</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/configuring-the-arm-nn-sdk-build-environment-for-tensorflow">Configuring the Arm NN SDK build environment for TensorFlow</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/configuring-the-arm-nn-sdk-build-environment-for-tensorflow-lite">Configuring the Arm NN SDK build environment for TensorFlow Lite</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/deploying-a-caffe-model-on-openmv-using-cmsis-nn">Deploying a Caffe Model on OpenMV using CMSIS-NN</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/deploying-cloud-based-ml-for-speech-transcription">Deploying cloud-based ML for speech transcription</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/configuring-the-arm-nn-sdk-build-environment-for-caffe">Configuring the Arm NN SDK build environment for Caffe</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/deploying-a-tensorflow-mnist-model-on-arm-nn">Deploying a TensorFlow MNIST model on Arm NN</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/deploying-a-caffe-mnist-model-using-the-arm-nn-sdk">Deploying a Caffe MNIST model using the Arm NN SDK</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/profiling-alexnet-on-raspberry-pi-and-hikey-960-with-the-compute-library">Profiling AlexNet on Raspberry Pi and HiKey 960 with the Compute Library</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/optimizing-neural-networks-for-mobile-and-embedded-devices-with-tensorflow">Optimizing neural networks for mobile and embedded devices with TensorFlow</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/running-alexnet-on-raspberry-pi-with-compute-library">Running AlexNet on Raspberry Pi with 
Compute Library</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/improving-your-machine-learning-workflow-using-the-arm-nn-sdk">Improving your machine learning workflow using the Arm NN SDK</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/quantizing-neural-networks-to-8-bit-using-tensorflow">Quantizing neural networks to 8-bit using TensorFlow</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/cross-compiling-arm-nn-for-the-raspberry-pi-and-tensorflow">Cross-compiling Arm NN for the Raspberry Pi and TensorFlow</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/teach-your-raspberry-pi-yeah-world">Teach your Raspberry Pi - Yeah world</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/how-to-guides/teach-your-raspberry-pi-multi-gesture">Teach your Raspberry Pi - Multi-gesture</a></li></ul></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/webinars">Webinars</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/software-for-machine-learning-on-arm">Software for Machine Learning on Arm</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/research-papers">Research papers</a></li><li><a class="" href="/technologies/machine-learning-on-arm/developer-material/white-papers">White papers</a></li></ul></li><li><a class="" href="/technologies/machine-learning-on-arm/ecosystem-partners">Ecosystem partners</a></li><li><a class="" href="https://community.arm.com/p/ml-blog">Blog</a></li></ul></li><li class="has-dropdown"><a class="" href="/technologies/security-on-arm">Security on Arm</a><ul class="dropdown"><li><a class="" href="/technologies/security-on-arm/arm-technologies">Arm technologies</a></li><li><a class="" href="/technologies/security-on-arm/how-do-i-implement">How do I implement</a></li><li><a class="" href="/technologies/security-on-arm/types-of-attack-and-counter-measures">Types of attack and counter-measures</a></li><li class="has-dropdown"><a class="" href="/technologies/security-on-arm/arm-security-developer-community">Arm Security Developer Community</a><ul class="dropdown"><li><a class="" href="/technologies/security-on-arm/arm-security-developer-community/arm-security-partners">Arm Security Partners</a></li></ul></li></ul></li><li><a class="" href="/technologies/uefi-drivers">UEFI Drivers</a></li><li><a class="" href="/technologies/dynamiq">DynamIQ</a></li><li class="has-dropdown"><a class="" href="/technologies/graphics-technologies">Graphics Technologies</a><ul class="dropdown"><li><a class="" href="/technologies/graphics-technologies/adaptive-scalable-texture-compression">Adaptive Scalable Texture Compression</a></li><li><a class="" href="/technologies/graphics-technologies/arm-frame-buffer-compression">Arm Frame Buffer Compression</a></li><li><a class="" href="/technologies/graphics-technologies/transaction-elimination">Transaction Elimination</a></li></ul></li><li class="has-dropdown"><a class="" href="/technologies/trustzone">TrustZone</a><ul class="dropdown"><li><a class="" href="/technologies/trustzone/webinar-how-to-implement-a-secure-iot-system-on-armv8-m">Webinar - How to implement a secure IoT system on Armv8-M</a></li></ul></li><li><a class="" href="/technologies/compute-library">Compute Library</a></li><li><a class="" 
href="/technologies/floating-point">Floating Point</a></li></ul> </section>
+ </nav>
+ </div>
+ </div>
+ <script>
+ (function() {
+ var $contextualNavigation;
+ var $contextualNavigationWrapper;
+
+ function setHeight() {
+ var computedStyles = getComputedStyle($contextualNavigation);
+ var height = computedStyles.getPropertyValue('height');
+ $contextualNavigationWrapper.style.height = height;
+ };
+
+ function setPosition(evt) {
+ var $globalNavigationWrapper = document.querySelector('.arm-global-menu-wrapper');
+ var $notificationsWrapper = document.querySelector('.c-notifications-wrapper');
+
+ var pageOffset = window.pageYOffset;
+
+ var globalNavigationHeight = (!!$globalNavigationWrapper) ? $globalNavigationWrapper.clientHeight : 0;
+ var notificationsHeight = (!!$notificationsWrapper) ? $notificationsWrapper.clientHeight : 0;
+
+ var sum = globalNavigationHeight + notificationsHeight;
+ if (pageOffset >= sum) {
+ $contextualNavigation.classList.add('is-stuck');
+ $contextualNavigation.style.top = sum + 'px';
+ } else {
+ $contextualNavigation.classList.remove('is-stuck');
+ $contextualNavigation.style.top = 0;
+ }
+ };
+
+ function repaint(evt) {
+ $contextualNavigation = document.querySelector('.c-contextual-navigation');
+ $contextualNavigationWrapper = $contextualNavigation.parentElement;
+ setHeight();
+ setPosition();
+ };
+
+ window.addEventListener('scroll', repaint);
+ window.addEventListener('resize', repaint);
+
+ repaint();
+ })();
+ </script>
+</div>
+</header>
+
+
+<main class="c-component c-content" id="content" role="main">
+ <!-- START ProductItemContent -->
+<div>
+
+
+
+<div id="c-d6ca4787-3a02-4c15-91dd-387f84915495" class="o-widget c-generic-content small-text-center large-text-left" data-widget="generic-content-variation-1">
+ <article class="row">
+ <section class="columns">
+ <h2 class="c-panel__subtitle">NEON Intrinsics</h2>
+ <p>Click an intrinsic name to display more information about it. To search, start typing the name in the search box; matching intrinsics are shown as you type.</p>
+ </section>
+ </article>
+</div>
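+
+<div class="row">
+ <div class="columns">
+ <p>Before the reference entries, a minimal usage sketch (not part of the Arm reference itself): it calls <code>vadd_s8</code> and <code>vaddq_s32</code> from <code>&lt;arm_neon.h&gt;</code> and assumes a NEON-capable C toolchain (NEON is always available on AArch64; on 32-bit Arm a flag such as <code>-mfpu=neon</code> may be needed). Array and variable names are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 64-bit form: eight signed 8-bit lanes. */
+    int8_t a8[8] = {1, 2, 3, 4, 5, 6, 7, 127};
+    int8_t b8[8] = {1, 1, 1, 1, 1, 1, 1, 1};
+    int8x8_t sum8 = vadd_s8(vld1_s8(a8), vld1_s8(b8));
+
+    int8_t out8[8];
+    vst1_s8(out8, sum8);
+    /* The last lane wraps: 127 + 1 becomes -128 in a signed 8-bit lane. */
+    for (int i = 0; i &lt; 8; i++) printf("%d ", out8[i]);
+    printf("\n");
+
+    /* 128-bit "q" form: four signed 32-bit lanes. */
+    int32_t a32[4] = {10, 20, 30, 40};
+    int32_t b32[4] = {1, 2, 3, 4};
+    int32x4_t sum32 = vaddq_s32(vld1q_s32(a32), vld1q_s32(b32));
+
+    int32_t out32[4];
+    vst1q_s32(out32, sum32);
+    for (int i = 0; i &lt; 4; i++) printf("%d ", out32[i]);
+    printf("\n");
+    return 0;
+}</pre>
+ </div>
+</div>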
+
+
+<div id="c-5e6d165c-46c9-4b6a-aae4-2aeb4639d070" class="c-widget c-html-snippet" data-deprecated>
+ <div class="row">
+ <div class="columns">
+
+<style>
+.intrinsics-search input {
+ height: 2.4rem;
+ width: 80%;
+ display: inline;
+}
+
+.intrinsics-search button {
+ width: 20%;
+ height: 2.4rem;
+ display: inline;
+}
+
+.intrinsic-accordion label,
+.intrinsic-accordion input[type="checkbox"] + label
+{
+ font-family: Consolas, monospace;
+ padding: 0.25em 0.5em;
+ font-size: 1em;
+ position: relative;
+ display: block;
+ cursor: pointer;
+ background: #EFEFEF;
+ border: none;
+ margin: 0;
+ color: #565b5b;
+}
+
+.intrinsic-accordion label b {
+ color: #009fc1;
+}
+
+.intrinsic-accordion .right {
+ float: right;
+ font-family: sans-serif;
+ font-size: 0.8em;
+}
+
+.intrinsic-accordion .intrinsic_name {
+ color: blue;
+}
+
+.intrinsic-accordion .intrinsic a {
+ color: #009fc1;
+ text-decoration: underline;
+ text-decoration-style: solid;
+ font-weight: bold;
+}
+
+.intrinsic-accordion label:hover {
+ background: #F3F3F3;
+}
+
+.intrinsic-accordion input:checked + label,
+.intrinsic-accordion input:checked + label:hover {
+ background: #CDECC5;
+}
+.intrinsic-accordion input {
+ display: none;
+}
+.intrinsic-accordion article {
+ background: rgb(255, 255, 255);
+ display: none;
+ -webkit-transition: all 0.3s ease-in-out;
+ -moz-transition: all 0.3s ease-in-out;
+ -o-transition: all 0.3s ease-in-out;
+ transition: all 0.3s ease-in-out;
+}
+
+.intrinsic-accordion input:checked ~ article {
+ -webkit-transition: all 0.5s ease-in-out;
+ -moz-transition: all 0.5s ease-in-out;
+ -o-transition: all 0.5s ease-in-out;
+ transition: all 0.5s ease-in-out;
+ display: block;
+}
+
+.intrinsic-accordion .intrinsic {
+ border: 1px solid #ededed;
+}
+
+.intrinsic-accordion .intrinsic article {
+ margin: 1rem;
+}
+
+</style>
+
+<script>
+$(document).ready(function(){
+  // Incremental search: on each keystroke, show only the intrinsic
+  // entries whose text contains the query, case-insensitively.
+  $("#js-intrinsics-query").keyup(function(){
+    var query = $(this).val().toLowerCase();
+    $(".intrinsic").each(function(){
+      var text = $(this).text().toLowerCase();
+      if (text.indexOf(query) > -1) {
+        $(this).show();
+      } else {
+        $(this).hide();
+      }
+    });
+  });
+});
+</script>
+<div class="row">
+ <div class="large-6 columns spacing-2 intrinsics-search">
+ <input id="js-intrinsics-query"><button class="tiny"><i class="fa fa-search"></i></button>
+ </div>
+</div>
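+
+<div class="row">
+ <div class="columns">
+ <p>The Operation pseudocode repeated in each entry below reduces, for the add forms, to a plain element-wise loop. The following scalar C sketch (names are illustrative, not part of the reference) shows the same semantics for signed 8-bit lanes:</p>
+<pre class="codeblock">#include &lt;stdint.h&gt;
+
+/* Lane-wise add, as in ADD Vd.8B,Vn.8B,Vm.8B. Doing the arithmetic on
+   the raw lane bits via unsigned casts reproduces the wrap-around
+   (modulo 2^8) behaviour of the SIMD instruction without signed-overflow
+   undefined behaviour in C. */
+static void add_lanes_s8(int8_t d[], const int8_t n[], const int8_t m[],
+                         int lanes) {
+    for (int e = 0; e &lt; lanes; e++) {
+        d[e] = (int8_t)((uint8_t)n[e] + (uint8_t)m[e]);
+    }
+}</pre>
+ </div>
+</div>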
+
+<section class="intrinsic-accordion">
+<div class="intrinsic"><input id="vadd_s8" type="checkbox"><label for="vadd_s8"><div>int8x8_t <b><b>vadd_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_s8" type="checkbox"><label for="vaddq_s8"><div>int8x16_t <b>vaddq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_s16" type="checkbox"><label for="vadd_s16"><div>int16x4_t <b>vadd_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_s16" type="checkbox"><label for="vaddq_s16"><div>int16x8_t <b>vaddq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_s32" type="checkbox"><label for="vadd_s32"><div>int32x2_t <b>vadd_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_s32" type="checkbox"><label for="vaddq_s32"><div>int32x4_t <b>vaddq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_s64" type="checkbox"><label for="vadd_s64"><div>int64x1_t <b>vadd_s64</b> (int64x1_t a, int64x1_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_s64" type="checkbox"><label for="vaddq_s64"><div>int64x2_t <b>vaddq_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_u8" type="checkbox"><label for="vadd_u8"><div>uint8x8_t <b>vadd_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_u8" type="checkbox"><label for="vaddq_u8"><div>uint8x16_t <b>vaddq_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_u16" type="checkbox"><label for="vadd_u16"><div>uint16x4_t <b><b>vadd_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_u16" type="checkbox"><label for="vaddq_u16"><div>uint16x8_t <b><b>vaddq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_u32" type="checkbox"><label for="vadd_u32"><div>uint32x2_t <b><b>vadd_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_u32" type="checkbox"><label for="vaddq_u32"><div>uint32x4_t <b><b>vaddq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_u64" type="checkbox"><label for="vadd_u64"><div>uint64x1_t <b><b>vadd_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_u64" type="checkbox"><label for="vaddq_u64"><div>uint64x2_t <b><b>vaddq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_f32" type="checkbox"><label for="vadd_f32"><div>float32x2_t <b><b>vadd_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point add</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add (vector). This instruction adds corresponding vector elements in the two source SIMD&amp;FP registers, writes the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fadd-vector-floating-point-add-vector">FADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddq_f32" type="checkbox"><label for="vaddq_f32"><div>float32x4_t <b><b>vaddq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point add</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add (vector). This instruction adds corresponding vector elements in the two source SIMD&amp;FP registers, writes the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fadd-vector-floating-point-add-vector">FADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vadd_f64" type="checkbox"><label for="vadd_f64"><div>float64x1_t <b><b>vadd_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point add</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add (vector). This instruction adds corresponding vector elements in the two source SIMD&amp;FP registers, writes the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fadd-vector-floating-point-add-vector">FADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddq_f64" type="checkbox"><label for="vaddq_f64"><div>float64x2_t <b><b>vaddq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point add</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add (vector). This instruction adds corresponding vector elements in the two source SIMD&amp;FP registers, writes the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fadd-vector-floating-point-add-vector">FADD</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddd_s64" type="checkbox"><label for="vaddd_s64"><div>int64_t <b><b>vaddd_s64</b></b> (int64_t a, int64_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddd_u64" type="checkbox"><label for="vaddd_u64"><div>uint64_t <b><b>vaddd_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Add</span></div></label><article> <h4>Description</h4><p><p class="aml">Add (vector). This instruction adds corresponding elements in the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/add-vector-add-vector">ADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddl_s8" type="checkbox"><label for="vaddl_s8"><div>int16x8_t <b><b>vaddl_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddl-saddl2-signed-add-long-vector">SADDL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddl_s16" type="checkbox"><label for="vaddl_s16"><div>int32x4_t <b><b>vaddl_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddl-saddl2-signed-add-long-vector">SADDL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddl_s32" type="checkbox"><label for="vaddl_s32"><div>int64x2_t <b><b>vaddl_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddl-saddl2-signed-add-long-vector">SADDL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
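+<h4>Usage example (editorial)</h4><p>The pseudocode above is the architectural definition. The following stand-alone C sketch is an editorial addition, not part of the ARM documentation: it shows the practical effect of the widening add, where each lane is sign-extended to 64 bits before the addition, so a sum that would overflow a 32-bit lane is represented exactly. The constants and printf formatting are illustrative assumptions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Two lanes whose plain 32-bit sum would overflow. */
+    int32x2_t a = vdup_n_s32(0x7FFFFFFF);   /* INT32_MAX in both lanes */
+    int32x2_t b = vdup_n_s32(1);
+    /* SADDL sign-extends each lane to 64 bits before adding, so the
+       result 2147483648 does not wrap. */
+    int64x2_t wide = vaddl_s32(a, b);
+    printf("%lld %lld\n", (long long)vgetq_lane_s64(wide, 0),
+                          (long long)vgetq_lane_s64(wide, 1));
+    return 0;
+}
+</pre>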
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddl_u8" type="checkbox"><label for="vaddl_u8"><div>uint16x8_t <b><b>vaddl_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddl-uaddl2-unsigned-add-long-vector">UADDL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
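+<h4>Usage example (editorial)</h4><p>An editorial C sketch (not part of the ARM documentation) of the unsigned form: UADDL zero-extends rather than sign-extends, so byte sums above 255 are preserved instead of wrapping. The constants are arbitrary illustrative values.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint8x8_t a = vdup_n_u8(200);
+    uint8x8_t b = vdup_n_u8(100);
+    /* A plain 8-bit add would give 44 (300 mod 256); UADDL widens
+       each lane to 16 bits first, so every lane holds 300. */
+    uint16x8_t wide = vaddl_u8(a, b);
+    printf("%u\n", (unsigned)vgetq_lane_u16(wide, 0));   /* prints 300 */
+    return 0;
+}
+</pre>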
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddl_u16" type="checkbox"><label for="vaddl_u16"><div>uint32x4_t <b><b>vaddl_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddl-uaddl2-unsigned-add-long-vector">UADDL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddl_u32" type="checkbox"><label for="vaddl_u32"><div>uint64x2_t <b><b>vaddl_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddl-uaddl2-unsigned-add-long-vector">UADDL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddl_high_s8" type="checkbox"><label for="vaddl_high_s8"><div>int16x8_t <b><b>vaddl_high_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddl-saddl2-signed-add-long-vector">SADDL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
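+<h4>Usage example (editorial)</h4><p>An editorial C sketch (not part of the ARM documentation) showing how the _high variant pairs with the low-half form: SADDL2 reads only lanes 8..15 of its 128-bit sources, so the two intrinsics together widen and add all 16 byte lanes without an explicit extract step. This variant is A64-only, so the sketch assumes an AArch64 target.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t lanes[16];
+    for (int i = 0; i &lt; 16; i++) lanes[i] = (int8_t)(i * 8);  /* 0, 8, ..., 120 */
+    int8x16_t a = vld1q_s8(lanes);
+    int8x16_t b = vdupq_n_s8(1);
+    /* Low half via vaddl_s8, high half via vaddl_high_s8 (SADDL2). */
+    int16x8_t lo = vaddl_s8(vget_low_s8(a), vget_low_s8(b));
+    int16x8_t hi = vaddl_high_s8(a, b);
+    printf("lo[0]=%d hi[0]=%d\n", vgetq_lane_s16(lo, 0),
+                                  vgetq_lane_s16(hi, 0));    /* 1 and 65 */
+    return 0;
+}
+</pre>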
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddl_high_s16" type="checkbox"><label for="vaddl_high_s16"><div>int32x4_t <b><b>vaddl_high_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddl-saddl2-signed-add-long-vector">SADDL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddl_high_s32" type="checkbox"><label for="vaddl_high_s32"><div>int64x2_t <b><b>vaddl_high_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddl-saddl2-signed-add-long-vector">SADDL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddl_high_u8" type="checkbox"><label for="vaddl_high_u8"><div>uint16x8_t <b><b>vaddl_high_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddl-uaddl2-unsigned-add-long-vector">UADDL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddl_high_u16" type="checkbox"><label for="vaddl_high_u16"><div>uint32x4_t <b><b>vaddl_high_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddl-uaddl2-unsigned-add-long-vector">UADDL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddl_high_u32" type="checkbox"><label for="vaddl_high_u32"><div>uint64x2_t <b><b>vaddl_high_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long (vector). This instruction adds each vector element in the lower or upper half of the first source SIMD&amp;FP register to the corresponding vector element of the second source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddl-uaddl2-unsigned-add-long-vector">UADDL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddw_s8" type="checkbox"><label for="vaddw_s8"><div>int16x8_t <b><b>vaddw_s8</b></b> (int16x8_t a, int8x8_t b)<span class="right">Signed add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Wide. This instruction adds vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddw-saddw2-signed-add-wide">SADDW</a> Vd.8H,Vn.8H,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
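+<h4>Usage example (editorial)</h4><p>An editorial C sketch (not part of the ARM documentation): because SADDW takes an already-wide accumulator as its first operand, it suits loops that fold narrow elements into wider per-lane sums without a separate widening step. The helper below is a hypothetical illustration; it assumes len is a multiple of 8 and small enough that no 16-bit lane overflows.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+/* Fold len signed bytes into eight 16-bit lane accumulators. */
+static int16x8_t accumulate_s8(const int8_t *data, int len) {
+    int16x8_t acc = vdupq_n_s16(0);
+    for (int i = 0; i &lt; len; i += 8)
+        acc = vaddw_s8(acc, vld1_s8(data + i));  /* widen b, add to acc */
+    return acc;
+}
+
+int main(void) {
+    int8_t data[16] = {1,2,3,4,5,6,7,8, 1,2,3,4,5,6,7,8};
+    int16x8_t acc = accumulate_s8(data, 16);
+    printf("%d\n", vgetq_lane_s16(acc, 0));      /* lane 0: 1 + 1 = 2 */
+    return 0;
+}
+</pre>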
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddw_s16" type="checkbox"><label for="vaddw_s16"><div>int32x4_t <b><b>vaddw_s16</b></b> (int32x4_t a, int16x4_t b)<span class="right">Signed add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Wide. This instruction adds vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddw-saddw2-signed-add-wide">SADDW</a> Vd.4S,Vn.4S,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddw_s32" type="checkbox"><label for="vaddw_s32"><div>int64x2_t <b><b>vaddw_s32</b></b> (int64x2_t a, int32x2_t b)<span class="right">Signed add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Wide. This instruction adds vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddw-saddw2-signed-add-wide">SADDW</a> Vd.2D,Vn.2D,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddw_u8" type="checkbox"><label for="vaddw_u8"><div>uint16x8_t <b><b>vaddw_u8</b></b> (uint16x8_t a, uint8x8_t b)<span class="right">Unsigned add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Wide. This instruction adds the vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. The vector elements of the destination register and the first source register are twice as long as the vector elements of the second source register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddw-uaddw2-unsigned-add-wide">UADDW</a> Vd.8H,Vn.8H,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddw_u16" type="checkbox"><label for="vaddw_u16"><div>uint32x4_t <b><b>vaddw_u16</b></b> (uint32x4_t a, uint16x4_t b)<span class="right">Unsigned add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Wide. This instruction adds the vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. The vector elements of the destination register and the first source register are twice as long as the vector elements of the second source register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddw-uaddw2-unsigned-add-wide">UADDW</a> Vd.4S,Vn.4S,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddw_u32" type="checkbox"><label for="vaddw_u32"><div>uint64x2_t <b><b>vaddw_u32</b></b> (uint64x2_t a, uint32x2_t b)<span class="right">Unsigned add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Wide. This instruction adds the vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. The vector elements of the destination register and the first source register are twice as long as the vector elements of the second source register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddw-uaddw2-unsigned-add-wide">UADDW</a> Vd.2D,Vn.2D,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
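+ <h4>Usage sketch (Rust, illustrative)</h4> <p>A minimal sketch, not part of the ARM reference text: it assumes the same-named binding in Rust's std::arch::aarch64 (as provided by this crate) plus the helper intrinsics vdupq_n_u64, vdup_n_u32 and vst1q_u64; the function name demo_vaddw_u32 is hypothetical.</p>
+<pre class="codeblock">// Hedged sketch: widen each u32 lane of b to u64, then add to a (UADDW).
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vaddw_u32() {
+    use std::arch::aarch64::*;
+    let a = vdupq_n_u64(1);       // two u64 lanes: [1, 1]
+    let b = vdup_n_u32(u32::MAX); // two u32 lanes, both 0xFFFF_FFFF
+    let r = vaddw_u32(a, b);      // widen, then add: no 32-bit wrap-around
+    let mut out = [0u64; 2];
+    vst1q_u64(out.as_mut_ptr(), r);
+    assert_eq!(out, [1 + u32::MAX as u64; 2]); // 0x1_0000_0000 in each lane
+}</pre>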
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddw_high_s8" type="checkbox"><label for="vaddw_high_s8"><div>int16x8_t <b><b>vaddw_high_s8</b></b> (int16x8_t a, int8x16_t b)<span class="right">Signed add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Wide. This instruction adds vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddw-saddw2-signed-add-wide">SADDW2</a> Vd.8H,Vn.8H,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
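+ <h4>Usage sketch (Rust, illustrative)</h4> <p>A minimal sketch of the usual low/high pairing that consumes all sixteen i8 lanes, not part of the ARM reference text: it assumes the same-named bindings in Rust's std::arch::aarch64 (vaddw_s8, vget_low_s8, vdupq_n_s16, vdupq_n_s8, vst1q_s16); the function name is hypothetical.</p>
+<pre class="codeblock">// Hedged sketch: SADDW takes the low eight i8 lanes, SADDW2 the high eight.
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vaddw_high_s8() {
+    use std::arch::aarch64::*;
+    let acc = vdupq_n_s16(100);             // eight i16 accumulators
+    let b = vdupq_n_s8(-3);                 // sixteen i8 lanes
+    let lo = vaddw_s8(acc, vget_low_s8(b)); // lanes 0..8  (SADDW)
+    let hi = vaddw_high_s8(acc, b);         // lanes 8..16 (SADDW2)
+    let mut out = [0i16; 8];
+    vst1q_s16(out.as_mut_ptr(), hi);
+    assert_eq!(out, [97i16; 8]); // 100 + (-3), sign-extended before the add
+    let _ = lo;                  // the low half is processed the same way
+}</pre>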
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddw_high_s16" type="checkbox"><label for="vaddw_high_s16"><div>int32x4_t <b><b>vaddw_high_s16</b></b> (int32x4_t a, int16x8_t b)<span class="right">Signed add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Wide. This instruction adds vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddw-saddw2-signed-add-wide">SADDW2</a> Vd.4S,Vn.4S,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddw_high_s32" type="checkbox"><label for="vaddw_high_s32"><div>int64x2_t <b><b>vaddw_high_s32</b></b> (int64x2_t a, int32x4_t b)<span class="right">Signed add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Wide. This instruction adds vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddw-saddw2-signed-add-wide">SADDW2</a> Vd.2D,Vn.2D,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddw_high_u8" type="checkbox"><label for="vaddw_high_u8"><div>uint16x8_t <b><b>vaddw_high_u8</b></b> (uint16x8_t a, uint8x16_t b)<span class="right">Unsigned add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Wide. This instruction adds the vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. The vector elements of the destination register and the first source register are twice as long as the vector elements of the second source register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddw-uaddw2-unsigned-add-wide">UADDW2</a> Vd.8H,Vn.8H,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
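+ <h4>Usage sketch (Rust, illustrative)</h4> <p>A minimal sketch, not part of the ARM reference text: accumulating u8 lanes into u16 lanes without 8-bit overflow, assuming the same-named bindings in Rust's std::arch::aarch64; the function name is hypothetical.</p>
+<pre class="codeblock">// Hedged sketch: zero-extend the high eight u8 lanes, then add (UADDW2).
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vaddw_high_u8() {
+    use std::arch::aarch64::*;
+    let acc = vdupq_n_u16(200);    // eight u16 accumulators
+    let b = vdupq_n_u8(250);       // sixteen u8 lanes
+    let r = vaddw_high_u8(acc, b); // upper eight lanes, widened to u16
+    let mut out = [0u16; 8];
+    vst1q_u16(out.as_mut_ptr(), r);
+    assert_eq!(out, [450u16; 8]);  // 200 + 250 would wrap in u8, fits in u16
+}</pre>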
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddw_high_u16" type="checkbox"><label for="vaddw_high_u16"><div>uint32x4_t <b><b>vaddw_high_u16</b></b> (uint32x4_t a, uint16x8_t b)<span class="right">Unsigned add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Wide. This instruction adds the vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. The vector elements of the destination register and the first source register are twice as long as the vector elements of the second source register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddw-uaddw2-unsigned-add-wide">UADDW2</a> Vd.4S,Vn.4S,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddw_high_u32" type="checkbox"><label for="vaddw_high_u32"><div>uint64x2_t <b><b>vaddw_high_u32</b></b> (uint64x2_t a, uint32x4_t b)<span class="right">Unsigned add wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Wide. This instruction adds the vector elements of the first source SIMD&amp;FP register to the corresponding vector elements in the lower or upper half of the second source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. The vector elements of the destination register and the first source register are twice as long as the vector elements of the second source register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddw-uaddw2-unsigned-add-wide">UADDW2</a> Vd.2D,Vn.2D,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vhadd_s8" type="checkbox"><label for="vhadd_s8"><div>int8x8_t <b><b>vhadd_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shadd-signed-halving-add">SHADD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
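+ <h4>Usage sketch (Rust, illustrative)</h4> <p>A minimal sketch, not part of the ARM reference text: the halving add keeps the intermediate sum in wider precision, so it never wraps; it assumes the same-named bindings in Rust's std::arch::aarch64, and the function name is hypothetical.</p>
+<pre class="codeblock">// Hedged sketch: (a + b) >> 1 with a 9-bit intermediate sum (SHADD).
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vhadd_s8() {
+    use std::arch::aarch64::*;
+    let a = vdup_n_s8(127);   // i8::MAX in every lane
+    let b = vdup_n_s8(3);
+    let r = vhadd_s8(a, b);   // 127 + 3 = 130 internally, halved to 65
+    let mut out = [0i8; 8];
+    vst1_s8(out.as_mut_ptr(), r);
+    assert_eq!(out, [65i8; 8]); // a plain i8 add would have wrapped to -126
+}</pre>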
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhaddq_s8" type="checkbox"><label for="vhaddq_s8"><div>int8x16_t <b><b>vhaddq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shadd-signed-halving-add">SHADD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhadd_s16" type="checkbox"><label for="vhadd_s16"><div>int16x4_t <b><b>vhadd_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shadd-signed-halving-add">SHADD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhaddq_s16" type="checkbox"><label for="vhaddq_s16"><div>int16x8_t <b><b>vhaddq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shadd-signed-halving-add">SHADD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhadd_s32" type="checkbox"><label for="vhadd_s32"><div>int32x2_t <b><b>vhadd_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shadd-signed-halving-add">SHADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhaddq_s32" type="checkbox"><label for="vhaddq_s32"><div>int32x4_t <b><b>vhaddq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shadd-signed-halving-add">SHADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhadd_u8" type="checkbox"><label for="vhadd_u8"><div>uint8x8_t <b><b>vhadd_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhadd-unsigned-halving-add">UHADD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
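+ <h4>Usage sketch (Rust, illustrative)</h4> <p>A minimal sketch, not part of the ARM reference text: a truncating average of u8 data, a common pixel-blending idiom (for round-to-nearest there is the related vrhadd_u8); it assumes the same-named bindings in Rust's std::arch::aarch64, and the function name is hypothetical.</p>
+<pre class="codeblock">// Hedged sketch: overflow-safe truncating average of two u8 vectors (UHADD).
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vhadd_u8() {
+    use std::arch::aarch64::*;
+    let a = vdup_n_u8(255);
+    let b = vdup_n_u8(1);
+    let r = vhadd_u8(a, b);   // (255 + 1) >> 1 = 128; the 9-bit sum never wraps
+    let mut out = [0u8; 8];
+    vst1_u8(out.as_mut_ptr(), r);
+    assert_eq!(out, [128u8; 8]);
+}</pre>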
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhaddq_u8" type="checkbox"><label for="vhaddq_u8"><div>uint8x16_t <b><b>vhaddq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhadd-unsigned-halving-add">UHADD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhadd_u16" type="checkbox"><label for="vhadd_u16"><div>uint16x4_t <b><b>vhadd_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhadd-unsigned-halving-add">UHADD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhaddq_u16" type="checkbox"><label for="vhaddq_u16"><div>uint16x8_t <b><b>vhaddq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhadd-unsigned-halving-add">UHADD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhadd_u32" type="checkbox"><label for="vhadd_u32"><div>uint32x2_t <b><b>vhadd_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhadd-unsigned-halving-add">UHADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhaddq_u32" type="checkbox"><label for="vhaddq_u32"><div>uint32x4_t <b><b>vhaddq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhadd-unsigned-halving-add">UHADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhadd_s8" type="checkbox"><label for="vrhadd_s8"><div>int8x8_t <b><b>vrhadd_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srhadd-signed-rounding-halving-add">SRHADD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
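+<p class="aml">The rounding variants differ from SHADD/UHADD only in the +1 added before the one-bit shift, so odd sums round up instead of truncating. A non-normative sketch of one signed 8-bit lane follows; the helper name is illustrative, and the code assumes the compiler's right shift of a negative signed value is arithmetic, as on mainstream targets:</p>
+<pre class="codeblock">#include &lt;stdint.h&gt;
+
+/* One 8-bit lane of SRHADD: (a + b + 1) at full precision, then bit 0 dropped. */
+static inline int8_t srhadd_s8_lane(int8_t a, int8_t b)
+{
+    int16_t sum = (int16_t)a + (int16_t)b + 1; /* fits easily in 16 bits */
+    return (int8_t)(sum &gt;&gt; 1);            /* arithmetic shift: odd sums round up */
+}
+
+/* srhadd_s8_lane(3, 4)  == 4  (a halving add without rounding gives 3);
+   srhadd_s8_lane(-3, 4) == 1  (a halving add without rounding gives 0). */</pre>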
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhaddq_s8" type="checkbox"><label for="vrhaddq_s8"><div>int8x16_t <b><b>vrhaddq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srhadd-signed-rounding-halving-add">SRHADD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhadd_s16" type="checkbox"><label for="vrhadd_s16"><div>int16x4_t <b><b>vrhadd_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srhadd-signed-rounding-halving-add">SRHADD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhaddq_s16" type="checkbox"><label for="vrhaddq_s16"><div>int16x8_t <b><b>vrhaddq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srhadd-signed-rounding-halving-add">SRHADD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhadd_s32" type="checkbox"><label for="vrhadd_s32"><div>int32x2_t <b><b>vrhadd_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srhadd-signed-rounding-halving-add">SRHADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhaddq_s32" type="checkbox"><label for="vrhaddq_s32"><div>int32x4_t <b><b>vrhaddq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Halving Add. This instruction adds corresponding signed integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srhadd-signed-rounding-halving-add">SRHADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhadd_u8" type="checkbox"><label for="vrhadd_u8"><div>uint8x8_t <b><b>vrhadd_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urhadd-unsigned-rounding-halving-add">URHADD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhaddq_u8" type="checkbox"><label for="vrhaddq_u8"><div>uint8x16_t <b><b>vrhaddq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urhadd-unsigned-rounding-halving-add">URHADD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
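+<p class="aml">As a usage sketch (assuming an AArch64 or NEON-enabled toolchain and &lt;arm_neon.h&gt;), the unsigned rounding halving add gives a per-lane average in a single instruction, e.g. for blending two rows of 8-bit pixels without widening:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Average two pixel rows of length n; n is assumed to be a multiple of 16. */
+void average_rows(const uint8_t *a, const uint8_t *b, uint8_t *out, int n)
+{
+    for (int i = 0; i &lt; n; i += 16) {
+        uint8x16_t va = vld1q_u8(a + i);       /* load 16 lanes from each row */
+        uint8x16_t vb = vld1q_u8(b + i);
+        vst1q_u8(out + i, vrhaddq_u8(va, vb)); /* per-lane (a+b+1)&gt;&gt;1 */
+    }
+}</pre>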
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhadd_u16" type="checkbox"><label for="vrhadd_u16"><div>uint16x4_t <b><b>vrhadd_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urhadd-unsigned-rounding-halving-add">URHADD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhaddq_u16" type="checkbox"><label for="vrhaddq_u16"><div>uint16x8_t <b><b>vrhaddq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urhadd-unsigned-rounding-halving-add">URHADD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhadd_u32" type="checkbox"><label for="vrhadd_u32"><div>uint32x2_t <b><b>vrhadd_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urhadd-unsigned-rounding-halving-add">URHADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrhaddq_u32" type="checkbox"><label for="vrhaddq_u32"><div>uint32x4_t <b><b>vrhaddq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned rounding halving add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Halving Add. This instruction adds corresponding unsigned integer values from the two source SIMD&amp;FP registers, shifts each result right one bit, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urhadd-unsigned-rounding-halving-add">URHADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (element1+element2+1)&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqadd_s8" type="checkbox"><label for="vqadd_s8"><div>int8x8_t <b><b>vqadd_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddq_s8" type="checkbox"><label for="vqaddq_s8"><div>int8x16_t <b><b>vqaddq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqadd_s16" type="checkbox"><label for="vqadd_s16"><div>int16x4_t <b><b>vqadd_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddq_s16" type="checkbox"><label for="vqaddq_s16"><div>int16x8_t <b><b>vqaddq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqadd_s32" type="checkbox"><label for="vqadd_s32"><div>int32x2_t <b><b>vqadd_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddq_s32" type="checkbox"><label for="vqaddq_s32"><div>int32x4_t <b><b>vqaddq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqadd_s64" type="checkbox"><label for="vqadd_s64"><div>int64x1_t <b><b>vqadd_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddq_s64" type="checkbox"><label for="vqaddq_s64"><div>int64x2_t <b><b>vqaddq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqadd_u8" type="checkbox"><label for="vqadd_u8"><div>uint8x8_t <b><b>vqadd_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddq_u8" type="checkbox"><label for="vqaddq_u8"><div>uint8x16_t <b><b>vqaddq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqadd_u16" type="checkbox"><label for="vqadd_u16"><div>uint16x4_t <b><b>vqadd_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddq_u16" type="checkbox"><label for="vqaddq_u16"><div>uint16x8_t <b><b>vqaddq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqadd_u32" type="checkbox"><label for="vqadd_u32"><div>uint32x2_t <b><b>vqadd_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddq_u32" type="checkbox"><label for="vqaddq_u32"><div>uint32x4_t <b><b>vqaddq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
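+ <h4>Example</h4> <p class="aml">A minimal C sketch, assuming an AArch64 toolchain and the ACLE arm_neon.h header; the inputs are chosen so every 32-bit lane overflows and is clamped to UINT32_MAX instead of wrapping.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 0xFFFFFFF0 + 0x20 exceeds 32 bits; UQADD clamps each lane. */
+    uint32x4_t a = vdupq_n_u32(0xFFFFFFF0u);
+    uint32x4_t b = vdupq_n_u32(0x20u);
+    uint32x4_t r = vqaddq_u32(a, b);
+    printf("%u\n", vgetq_lane_u32(r, 0)); /* 4294967295 */
+    return 0;
+}</pre>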
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqadd_u64" type="checkbox"><label for="vqadd_u64"><div>uint64x1_t <b>vqadd_u64</b> (uint64x1_t a, uint64x1_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddq_u64" type="checkbox"><label for="vqaddq_u64"><div>uint64x2_t <b>vqaddq_u64</b> (uint64x2_t a, uint64x2_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqaddb_s8" type="checkbox"><label for="vqaddb_s8"><div>int8_t <b>vqaddb_s8</b> (int8_t a, int8_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Bd,Bn,Bm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+b &rarr; Bm </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
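+ <h4>Example</h4> <p class="aml">A minimal C sketch of the scalar form, assuming an AArch64 toolchain and the ACLE arm_neon.h header; the two calls overflow the signed 8-bit range in opposite directions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t hi = vqaddb_s8(100, 100);   /* 200 clamps to INT8_MAX = 127 */
+    int8_t lo = vqaddb_s8(-100, -100); /* -200 clamps to INT8_MIN = -128 */
+    printf("%d %d\n", hi, lo);         /* 127 -128 */
+    return 0;
+}</pre>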
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqaddh_s16" type="checkbox"><label for="vqaddh_s16"><div>int16_t <b>vqaddh_s16</b> (int16_t a, int16_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqadds_s32" type="checkbox"><label for="vqadds_s32"><div>int32_t <b>vqadds_s32</b> (int32_t a, int32_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqaddd_s64" type="checkbox"><label for="vqaddd_s64"><div>int64_t <b>vqaddd_s64</b> (int64_t a, int64_t b)<span class="right">Signed saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqadd-signed-saturating-add">SQADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqaddb_u8" type="checkbox"><label for="vqaddb_u8"><div>uint8_t <b>vqaddb_u8</b> (uint8_t a, uint8_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Bd,Bn,Bm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+b &rarr; Bm </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqaddh_u16" type="checkbox"><label for="vqaddh_u16"><div>uint16_t <b>vqaddh_u16</b> (uint16_t a, uint16_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqadds_u32" type="checkbox"><label for="vqadds_u32"><div>uint32_t <b>vqadds_u32</b> (uint32_t a, uint32_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqaddd_u64" type="checkbox"><label for="vqaddd_u64"><div>uint64_t <b>vqaddd_u64</b> (uint64_t a, uint64_t b)<span class="right">Unsigned saturating add</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Add. This instruction adds the values of corresponding elements of the two source SIMD&amp;FP registers, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqadd-unsigned-saturating-add">UQADD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer sum;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ sum = element1 + element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(sum, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
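+ <h4>Example</h4> <p class="aml">A minimal C sketch, assuming an AArch64 toolchain with arm_neon.h and inttypes.h; a plain addition of 1 to UINT64_MAX would wrap to 0, but the saturating form holds the ceiling.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;inttypes.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t r = vqaddd_u64(UINT64_MAX, 1u); /* clamps to UINT64_MAX */
+    printf("%" PRIu64 "\n", r);              /* 18446744073709551615 */
+    return 0;
+}</pre>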
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqadd_s8" type="checkbox"><label for="vuqadd_s8"><div>int8x8_t <b>vuqadd_s8</b> (int8x8_t a, uint8x8_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
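+ <h4>Example</h4> <p class="aml">A minimal C sketch, assuming an AArch64 toolchain and the ACLE arm_neon.h header; adding an unsigned 200 to a signed 100 exceeds the signed 8-bit range, so each lane saturates to 127.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8x8_t  a = vdup_n_s8(100);  /* signed accumulator */
+    uint8x8_t b = vdup_n_u8(200);  /* unsigned addend */
+    int8x8_t  r = vuqadd_s8(a, b); /* 100 + 200 clamps to 127 per lane */
+    printf("%d\n", vget_lane_s8(r, 0)); /* 127 */
+    return 0;
+}</pre>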
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqaddq_s8" type="checkbox"><label for="vuqaddq_s8"><div>int8x16_t <b>vuqaddq_s8</b> (int8x16_t a, uint8x16_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqadd_s16" type="checkbox"><label for="vuqadd_s16"><div>int16x4_t <b>vuqadd_s16</b> (int16x4_t a, uint16x4_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqaddq_s16" type="checkbox"><label for="vuqaddq_s16"><div>int16x8_t <b>vuqaddq_s16</b> (int16x8_t a, uint16x8_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqadd_s32" type="checkbox"><label for="vuqadd_s32"><div>int32x2_t <b>vuqadd_s32</b> (int32x2_t a, uint32x2_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqaddq_s32" type="checkbox"><label for="vuqaddq_s32"><div>int32x4_t <b>vuqaddq_s32</b> (int32x4_t a, uint32x4_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqadd_s64" type="checkbox"><label for="vuqadd_s64"><div>int64x1_t <b>vuqadd_s64</b> (int64x1_t a, uint64x1_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqaddq_s64" type="checkbox"><label for="vuqaddq_s64"><div>int64x2_t <b>vuqaddq_s64</b> (int64x2_t a, uint64x2_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqaddb_s8" type="checkbox"><label for="vuqaddb_s8"><div>int8_t <b>vuqaddb_s8</b> (int8_t a, uint8_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Bd,Bn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bd <br />
+b &rarr; Bn </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqaddh_s16" type="checkbox"><label for="vuqaddh_s16"><div>int16_t <b>vuqaddh_s16</b> (int16_t a, uint16_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Hd,Hn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hd <br />
+b &rarr; Hn </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqadds_s32" type="checkbox"><label for="vuqadds_s32"><div>int32_t <b>vuqadds_s32</b> (int32_t a, uint32_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuqaddd_s64" type="checkbox"><label for="vuqaddd_s64"><div>int64_t <b>vuqaddd_s64</b> (int64_t a, uint64_t b)<span class="right">Signed saturating accumulate of unsigned value</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Accumulate of Unsigned value. This instruction adds the unsigned integer values of the vector elements in the source SIMD&amp;FP register to corresponding signed integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting signed integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/suqadd-signed-saturating-accumulate-of-unsigned-value">SUQADD</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqadd_u8" type="checkbox"><label for="vsqadd_u8"><div>uint8x8_t <b>vsqadd_u8</b> (uint8x8_t a, int8x8_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting unsigned integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqaddq_u8" type="checkbox"><label for="vsqaddq_u8"><div>uint8x16_t <b>vsqaddq_u8</b> (uint8x16_t a, int8x16_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting unsigned integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqadd_u16" type="checkbox"><label for="vsqadd_u16"><div>uint16x4_t <b>vsqadd_u16</b> (uint16x4_t a, int16x4_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting unsigned integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqaddq_u16" type="checkbox"><label for="vsqaddq_u16"><div>uint16x8_t <b>vsqaddq_u16</b> (uint16x8_t a, int16x8_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting unsigned integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqadd_u32" type="checkbox"><label for="vsqadd_u32"><div>uint32x2_t <b>vsqadd_u32</b> (uint32x2_t a, int32x2_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting unsigned integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqaddq_u32" type="checkbox"><label for="vsqaddq_u32"><div>uint32x4_t <b>vsqaddq_u32</b> (uint32x4_t a, int32x4_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and writes the resulting unsigned integer values to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqadd_u64" type="checkbox"><label for="vsqadd_u64"><div>uint64x1_t <b>vsqadd_u64</b> (uint64x1_t a, int64x1_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and accumulates the resulting unsigned integer values with the vector elements of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqaddq_u64" type="checkbox"><label for="vsqaddq_u64"><div>uint64x2_t <b>vsqaddq_u64</b> (uint64x2_t a, int64x2_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and accumulates the resulting unsigned integer values with the vector elements of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqaddb_u8" type="checkbox"><label for="vsqaddb_u8"><div>uint8_t <b>vsqaddb_u8</b> (uint8_t a, int8_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and accumulates the resulting unsigned integer values with the vector elements of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Bd,Bn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bd <br />
+b &rarr; Bn </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqaddh_u16" type="checkbox"><label for="vsqaddh_u16"><div>uint16_t <b>vsqaddh_u16</b> (uint16_t a, int16_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and accumulates the resulting unsigned integer values with the vector elements of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Hd,Hn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hd <br />
+b &rarr; Hn </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqadds_u32" type="checkbox"><label for="vsqadds_u32"><div>uint32_t <b>vsqadds_u32</b> (uint32_t a, int32_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and accumulates the resulting unsigned integer values with the vector elements of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqaddd_u64" type="checkbox"><label for="vsqaddd_u64"><div>uint64_t <b>vsqaddd_u64</b> (uint64_t a, int64_t b)<span class="right">Unsigned saturating accumulate of signed value</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Accumulate of Signed value. This instruction adds the signed integer values of the vector elements in the source SIMD&amp;FP register to corresponding unsigned integer values of the vector elements in the destination SIMD&amp;FP register, and accumulates the resulting unsigned integer values with the vector elements of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usqadd-unsigned-saturating-accumulate-of-signed-value">USQADD</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+integer op1;
+integer op2;
+boolean sat;
+
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], !unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(op1 + op2, esize, unsigned);
+ if sat then FPSR.QC = '1';
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_s16" type="checkbox"><label for="vaddhn_s16"><div>int8x8_t <b>vaddhn_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN</a> Vd.8B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_s32" type="checkbox"><label for="vaddhn_s32"><div>int16x4_t <b>vaddhn_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN</a> Vd.4H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_s64" type="checkbox"><label for="vaddhn_s64"><div>int32x2_t <b>vaddhn_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN</a> Vd.2S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_u16" type="checkbox"><label for="vaddhn_u16"><div>uint8x8_t <b>vaddhn_u16</b> (uint16x8_t a, uint16x8_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN</a> Vd.8B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_u32" type="checkbox"><label for="vaddhn_u32"><div>uint16x4_t <b>vaddhn_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN</a> Vd.4H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_u64" type="checkbox"><label for="vaddhn_u64"><div>uint32x2_t <b>vaddhn_u64</b> (uint64x2_t a, uint64x2_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN</a> Vd.2S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_high_s16" type="checkbox"><label for="vaddhn_high_s16"><div>int8x16_t <b>vaddhn_high_s16</b> (int8x8_t r, int16x8_t a, int16x8_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN2</a> Vd.16B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_high_s32" type="checkbox"><label for="vaddhn_high_s32"><div>int16x8_t <b>vaddhn_high_s32</b> (int16x4_t r, int32x4_t a, int32x4_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN2</a> Vd.8H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_high_s64" type="checkbox"><label for="vaddhn_high_s64"><div>int32x4_t <b>vaddhn_high_s64</b> (int32x2_t r, int64x2_t a, int64x2_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN2</a> Vd.4S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_high_u16" type="checkbox"><label for="vaddhn_high_u16"><div>uint8x16_t <b><b>vaddhn_high_u16</b></b> (uint8x8_t r, uint16x8_t a, uint16x8_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN2</a> Vd.16B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_high_u32" type="checkbox"><label for="vaddhn_high_u32"><div>uint16x8_t <b><b>vaddhn_high_u32</b></b> (uint16x4_t r, uint32x4_t a, uint32x4_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN2</a> Vd.8H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddhn_high_u64" type="checkbox"><label for="vaddhn_high_u64"><div>uint32x4_t <b><b>vaddhn_high_u64</b></b> (uint32x2_t r, uint64x2_t a, uint64x2_t b)<span class="right">Add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addhn-addhn2-add-returning-high-narrow">ADDHN2</a> Vd.4S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_s16" type="checkbox"><label for="vraddhn_s16"><div>int8x8_t <b><b>vraddhn_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN</a> Vd.8B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_s32" type="checkbox"><label for="vraddhn_s32"><div>int16x4_t <b><b>vraddhn_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN</a> Vd.4H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_s64" type="checkbox"><label for="vraddhn_s64"><div>int32x2_t <b><b>vraddhn_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN</a> Vd.2S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_u16" type="checkbox"><label for="vraddhn_u16"><div>uint8x8_t <b><b>vraddhn_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN</a> Vd.8B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_u32" type="checkbox"><label for="vraddhn_u32"><div>uint16x4_t <b><b>vraddhn_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN</a> Vd.4H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_u64" type="checkbox"><label for="vraddhn_u64"><div>uint32x2_t <b><b>vraddhn_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN</a> Vd.2S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_high_s16" type="checkbox"><label for="vraddhn_high_s16"><div>int8x16_t <b><b>vraddhn_high_s16</b></b> (int8x8_t r, int16x8_t a, int16x8_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN2</a> Vd.16B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_high_s32" type="checkbox"><label for="vraddhn_high_s32"><div>int16x8_t <b><b>vraddhn_high_s32</b></b> (int16x4_t r, int32x4_t a, int32x4_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN2</a> Vd.8H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_high_s64" type="checkbox"><label for="vraddhn_high_s64"><div>int32x4_t <b><b>vraddhn_high_s64</b></b> (int32x2_t r, int64x2_t a, int64x2_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN2</a> Vd.4S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_high_u16" type="checkbox"><label for="vraddhn_high_u16"><div>uint8x16_t <b><b>vraddhn_high_u16</b></b> (uint8x8_t r, uint16x8_t a, uint16x8_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN2</a> Vd.16B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_high_u32" type="checkbox"><label for="vraddhn_high_u32"><div>uint16x8_t <b><b>vraddhn_high_u32</b></b> (uint16x4_t r, uint32x4_t a, uint32x4_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN2</a> Vd.8H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vraddhn_high_u64" type="checkbox"><label for="vraddhn_high_u64"><div>uint32x4_t <b><b>vraddhn_high_u64</b></b> (uint32x2_t r, uint64x2_t a, uint64x2_t b)<span class="right">Rounding add returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Add returning High Narrow. This instruction adds each vector element in the first source SIMD&amp;FP register to the corresponding vector element in the second source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/raddhn-raddhn2-rounding-add-returning-high-narrow">RADDHN2</a> Vd.4S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_s8" type="checkbox"><label for="vmul_s8"><div>int8x8_t <b><b>vmul_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
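+ <h4>Usage Example</h4> <p>A minimal sketch of calling this intrinsic (values are illustrative only). Each product is truncated to its low 8 bits, so results can wrap.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8x8_t a = vdup_n_s8(100);         /* all eight lanes = 100 */
+    int8x8_t b = vdup_n_s8(2);           /* all eight lanes = 2 */
+    int8x8_t r = vmul_s8(a, b);          /* 200 wraps to -56 in int8 */
+    printf("%d\n", vget_lane_s8(r, 0));  /* prints -56 */
+    return 0;
+}</pre>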
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_s8" type="checkbox"><label for="vmulq_s8"><div>int8x16_t <b><b>vmulq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
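+ <h4>Usage Example</h4> <p>An illustrative sketch only: the Q form works on 16 lanes at once, so byte arrays can be multiplied in 16-element chunks.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[i] = a[i] * b[i] for 16 signed bytes (low 8 bits of each product). */
+void mul16_s8(const int8_t *a, const int8_t *b, int8_t *dst) {
+    int8x16_t va = vld1q_s8(a);
+    int8x16_t vb = vld1q_s8(b);
+    vst1q_s8(dst, vmulq_s8(va, vb));
+}</pre>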
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_s16" type="checkbox"><label for="vmul_s16"><div>int16x4_t <b><b>vmul_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
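+ <h4>Usage Example</h4> <p>A small sketch with values chosen to show the truncation in the pseudocode above: 300 * 300 = 90000 does not fit in 16 bits, so only the low 16 bits (24464) are kept.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16x4_t a = vdup_n_s16(300);
+    int16x4_t r = vmul_s16(a, a);         /* 90000 mod 65536 = 24464 per lane */
+    printf("%d\n", vget_lane_s16(r, 0));  /* prints 24464 */
+    return 0;
+}</pre>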
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_s16" type="checkbox"><label for="vmulq_s16"><div>int16x8_t <b><b>vmulq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
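+ <h4>Usage Example</h4> <p>One plausible use, sketched for illustration: applying a constant gain to a block of eight 16-bit samples.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[i] = src[i] * gain for 8 samples (products truncated to 16 bits). */
+void scale8_s16(const int16_t *src, int16_t gain, int16_t *dst) {
+    int16x8_t v = vld1q_s16(src);
+    vst1q_s16(dst, vmulq_s16(v, vdupq_n_s16(gain)));
+}</pre>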
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_s32" type="checkbox"><label for="vmul_s32"><div>int32x2_t <b><b>vmul_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
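+ <h4>Usage Example</h4> <p>A minimal sketch. Only the low 32 bits of each product survive; when the full 64-bit product is needed, the widening <b>vmull_s32</b> intrinsic is the usual alternative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32x2_t a = vdup_n_s32(65536);
+    int32x2_t r = vmul_s32(a, a);         /* 2^32 truncated to 32 bits = 0 */
+    printf("%d\n", vget_lane_s32(r, 0));  /* prints 0 */
+    return 0;
+}</pre>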
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_s32" type="checkbox"><label for="vmulq_s32"><div>int32x4_t <b><b>vmulq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
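+ <h4>Usage Example</h4> <p>An illustrative sketch of a typical loop shape; <b>n</b> is assumed to be a multiple of 4 to keep the example short.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[i] = a[i] * b[i]; n is assumed to be a multiple of 4 here. */
+void mul_s32_array(const int32_t *a, const int32_t *b, int32_t *dst, int n) {
+    for (int i = 0; i &lt; n; i += 4) {
+        int32x4_t va = vld1q_s32(a + i);
+        int32x4_t vb = vld1q_s32(b + i);
+        vst1q_s32(dst + i, vmulq_s32(va, vb));
+    }
+}</pre>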
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_u8" type="checkbox"><label for="vmul_u8"><div>uint8x8_t <b><b>vmul_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
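+ <h4>Usage Example</h4> <p>A minimal sketch; as the pseudocode above shows, only the low 8 bits are kept, so 200 * 2 = 400 wraps to 144.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint8x8_t a = vdup_n_u8(200);
+    uint8x8_t r = vmul_u8(a, vdup_n_u8(2));  /* 400 mod 256 = 144 per lane */
+    printf("%d\n", vget_lane_u8(r, 0));      /* prints 144 */
+    return 0;
+}</pre>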
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_u8" type="checkbox"><label for="vmulq_u8"><div>uint8x16_t <b><b>vmulq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
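+ <h4>Usage Example</h4> <p>A sketch of one common shape, shown for illustration: element-wise products of 16 unsigned bytes, e.g. combining two per-pixel weight masks.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[i] = a[i] * b[i] for 16 bytes; products are truncated to 8 bits. */
+void mul16_u8(const uint8_t *a, const uint8_t *b, uint8_t *dst) {
+    vst1q_u8(dst, vmulq_u8(vld1q_u8(a), vld1q_u8(b)));
+}</pre>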
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_u16" type="checkbox"><label for="vmul_u16"><div>uint16x4_t <b><b>vmul_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
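+ <h4>Usage Example</h4> <p>A minimal illustrative sketch: scaling four unsigned 16-bit lanes by a broadcast constant.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Returns {a0*k, a1*k, a2*k, a3*k}, each truncated to 16 bits. */
+uint16x4_t scale4_u16(uint16x4_t a, uint16_t k) {
+    return vmul_u16(a, vdup_n_u16(k));
+}</pre>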
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_u16" type="checkbox"><label for="vmulq_u16"><div>uint16x8_t <b><b>vmulq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
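+ <h4>Usage Example</h4> <p>An illustrative sketch of the 8-lane form; a fixed-point gain is often applied this way, with any scaling shift done separately.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[i] = src[i] * gain for 8 unsigned 16-bit values (low 16 bits kept). */
+void scale8_u16(const uint16_t *src, uint16_t gain, uint16_t *dst) {
+    vst1q_u16(dst, vmulq_u16(vld1q_u16(src), vdupq_n_u16(gain)));
+}</pre>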
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_u32" type="checkbox"><label for="vmul_u32"><div>uint32x2_t <b><b>vmul_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
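+ <h4>Usage Example</h4> <p>A minimal sketch showing the modulo-2^32 behaviour in the two 32-bit lanes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32x2_t a = vdup_n_u32(0x80000000u);
+    uint32x2_t r = vmul_u32(a, vdup_n_u32(2));  /* 2^32 mod 2^32 = 0 */
+    printf("%u\n", vget_lane_u32(r, 0));        /* prints 0 */
+    return 0;
+}</pre>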
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_u32" type="checkbox"><label for="vmulq_u32"><div>uint32x4_t <b><b>vmulq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
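+ <h4>Usage Example</h4> <p>A sketch of four independent modulo-2^32 products, e.g. one step of a lane-parallel multiplicative hash; the constant is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Advance four hash states at once by the same odd multiplier. */
+uint32x4_t hash_step(uint32x4_t state) {
+    return vmulq_u32(state, vdupq_n_u32(2654435761u));  /* 2^32 / golden ratio */
+}</pre>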
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_f32" type="checkbox"><label for="vmul_f32"><div>float32x2_t <b><b>vmul_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
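+ <h4>Usage Example</h4> <p>A minimal sketch; unlike the integer forms there is no truncation, and results follow the floating-point behaviour selected by FPCR.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float av[2] = {1.5f, -2.0f}, bv[2] = {4.0f, 0.5f};
+    float32x2_t r = vmul_f32(vld1_f32(av), vld1_f32(bv));  /* {6.0, -1.0} */
+    printf("%f %f\n", vget_lane_f32(r, 0), vget_lane_f32(r, 1));
+    return 0;
+}</pre>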
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_f32" type="checkbox"><label for="vmulq_f32"><div>float32x4_t <b><b>vmulq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
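+ <h4>Usage Example</h4> <p>An illustrative sketch: scaling an array of floats four at a time (<b>n</b> assumed to be a multiple of 4 for brevity).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void scale_f32(float *data, float factor, int n) {
+    float32x4_t vf = vdupq_n_f32(factor);
+    for (int i = 0; i &lt; n; i += 4)
+        vst1q_f32(data + i, vmulq_f32(vld1q_f32(data + i), vf));
+}</pre>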
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_p8" type="checkbox"><label for="vmul_p8"><div>poly8x8_t <b><b>vmul_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Polynomial multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Polynomial Multiply. This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/pmul-polynomial-multiply">PMUL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
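+ <h4>Usage Example</h4> <p>A minimal sketch of the carry-less (GF(2)) semantics: 0x03 &otimes; 0x03 = (x+1)(x+1) = x&sup2;+1 = 0x05, whereas an integer multiply would give 0x09.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly8x8_t a = vdup_n_p8(0x03);
+    poly8x8_t r = vmul_p8(a, a);             /* carry-less: 0x05, not 0x09 */
+    printf("0x%02x\n", vget_lane_p8(r, 0));  /* prints 0x05 */
+    return 0;
+}</pre>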
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_p8" type="checkbox"><label for="vmulq_p8"><div>poly8x16_t <b><b>vmulq_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Polynomial multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Polynomial Multiply. This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/pmul-polynomial-multiply">PMUL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
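+<h4>Usage example</h4> <p>A minimal C sketch of one possible call, assuming an AArch64 target with &lt;arm_neon.h&gt;; the wrapper name is illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Carry-less (polynomial, GF(2)) multiply of corresponding lanes;
+   each product is truncated to its low 8 bits, as in the pseudocode. */
+poly8x16_t mul_poly_lanes(poly8x16_t a, poly8x16_t b) {
+    return vmulq_p8(a, b);
+}</pre>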
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_f64" type="checkbox"><label for="vmul_f64"><div>float64x1_t <b><b>vmul_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
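+<h4>Usage example</h4> <p>A minimal C sketch (the function name and constants are illustrative), assuming an AArch64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Single-lane double-precision multiply: 3.0 * 2.0 == 6.0. */
+float64x1_t example_vmul_f64(void) {
+    float64x1_t a = vdup_n_f64(3.0);
+    float64x1_t b = vdup_n_f64(2.0);
+    return vmul_f64(a, b); /* lane 0 holds 6.0 */
+}</pre>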
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_f64" type="checkbox"><label for="vmulq_f64"><div>float64x2_t <b><b>vmulq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
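+<h4>Usage example</h4> <p>A minimal C sketch with illustrative values, assuming an AArch64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Lane-wise multiply of two 2 x f64 vectors: {1.5, -2.0} * {4.0, 0.5}. */
+float64x2_t example_vmulq_f64(void) {
+    const double xs[2] = {1.5, -2.0};
+    const double ys[2] = {4.0, 0.5};
+    return vmulq_f64(vld1q_f64(xs), vld1q_f64(ys)); /* {6.0, -1.0} */
+}</pre>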
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulx_f32" type="checkbox"><label for="vmulx_f32"><div>float32x2_t <b><b>vmulx_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point multiply extended</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If one value is zero and the other value is infinite, the result is 2.0. In this case, the result is negative if only one of the values is negative, otherwise the result is positive.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
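+<h4>Usage example</h4> <p>A minimal C sketch of the special case that distinguishes FMULX from FMUL; the wrapper name is illustrative, and &lt;math.h&gt; is assumed for INFINITY.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+/* FMULX behaves like FMUL except that 0 * infinity yields +/-2.0
+   instead of NaN, which suits reciprocal step-style algorithms. */
+float32x2_t example_vmulx_f32(void) {
+    float32x2_t a = vdup_n_f32(0.0f);
+    float32x2_t b = vdup_n_f32(INFINITY);
+    return vmulx_f32(a, b); /* both lanes are 2.0f */
+}</pre>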
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxq_f32" type="checkbox"><label for="vmulxq_f32"><div>float32x4_t <b><b>vmulxq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point multiply extended</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If one value is zero and the other value is infinite, the result is 2.0. In this case, the result is negative if only one of the values is negative, otherwise the result is positive.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
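+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper), assuming an AArch64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 4 x f32 lane-wise multiply-extended; for finite, non-exceptional
+   inputs the results match an ordinary FMUL. */
+float32x4_t halve4(float32x4_t v) {
+    return vmulxq_f32(v, vdupq_n_f32(0.5f));
+}</pre>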
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulx_f64" type="checkbox"><label for="vmulx_f64"><div>float64x1_t <b><b>vmulx_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point multiply extended</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If one value is zero and the other value is infinite, the result is 2.0. In this case, the result is negative if only one of the values is negative, otherwise the result is positive.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
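+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper), assuming an AArch64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 1 x f64 multiply-extended; with ordinary finite inputs this is
+   simply a double-precision multiply. */
+float64x1_t halve1(float64x1_t v) {
+    return vmulx_f64(v, vdup_n_f64(0.5));
+}</pre>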
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxq_f64" type="checkbox"><label for="vmulxq_f64"><div>float64x2_t <b><b>vmulxq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point multiply extended</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If one value is zero and the other value is infinite, the result is 2.0. In this case, the result is negative if only one of the values is negative, otherwise the result is positive.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
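+<h4>Usage example</h4> <p>A minimal C sketch with illustrative values; &lt;math.h&gt; is assumed for INFINITY.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+/* Mixed lanes: {0.0, 3.0} FMULX {inf, 2.0} gives {2.0, 6.0};
+   the zero-times-infinity lane produces 2.0 rather than NaN. */
+float64x2_t example_vmulxq_f64(void) {
+    const double xs[2] = {0.0, 3.0};
+    const double ys[2] = {INFINITY, 2.0};
+    return vmulxq_f64(vld1q_f64(xs), vld1q_f64(ys));
+}</pre>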
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxs_f32" type="checkbox"><label for="vmulxs_f32"><div>float32_t <b><b>vmulxs_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point multiply extended</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended. This instruction multiplies the floating-point values of the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register. If one value is zero and the other value is infinite, the result is 2.0. In this case, the result is negative if only one of the values is negative, otherwise the result is positive.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
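+<h4>Usage example</h4> <p>A minimal C sketch of the scalar form; the wrapper name is illustrative, and &lt;math.h&gt; is assumed for INFINITY.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+/* Scalar multiply-extended on plain float32_t values. */
+float32_t example_vmulxs_f32(void) {
+    float32_t r = vmulxs_f32(0.0f, INFINITY); /* 2.0f, not NaN */
+    return vmulxs_f32(r, 1.5f);               /* 3.0f */
+}</pre>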
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxd_f64" type="checkbox"><label for="vmulxd_f64"><div>float64_t <b><b>vmulxd_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point multiply extended</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended. This instruction multiplies the floating-point values of the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register. If one value is zero and the other value is infinite, the result is 2.0. In this case, the result is negative if only one of the values is negative, otherwise the result is positive.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
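+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper), assuming an AArch64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Scalar double-precision multiply-extended. */
+float64_t example_vmulxd_f64(float64_t a, float64_t b) {
+    return vmulxd_f64(a, b);
+}</pre>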
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulx_lane_f32" type="checkbox"><label for="vmulx_lane_f32"><div>float32x2_t <b><b>vmulx_lane_f32</b></b> (float32x2_t a, float32x2_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
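+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper); note that the lane index must be a compile-time constant.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Multiplies every lane of a by lane 1 of v; for a 2-lane source
+   vector the constant lane index must be 0 or 1. */
+float32x2_t example_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
+    return vmulx_lane_f32(a, v, 1);
+}</pre>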
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxq_lane_f32" type="checkbox"><label for="vmulxq_lane_f32"><div>float32x4_t <b><b>vmulxq_lane_f32</b></b> (float32x4_t a, float32x2_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
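+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper), assuming an AArch64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Broadcast-multiply: all four lanes of a are scaled by lane 0
+   of the 2-lane vector v. */
+float32x4_t example_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
+    return vmulxq_lane_f32(a, v, 0);
+}</pre>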
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulx_lane_f64" type="checkbox"><label for="vmulx_lane_f64"><div>float64x1_t <b><b>vmulx_lane_f64</b></b> (float64x1_t a, float64x1_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+v &rarr; Vm.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
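+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper); the source vector has a single lane, so 0 is the only valid index.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 1 x f64 multiply by the only lane of v. */
+float64x1_t example_vmulx_lane_f64(float64x1_t a, float64x1_t v) {
+    return vmulx_lane_f64(a, v, 0);
+}</pre>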
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxq_lane_f64" type="checkbox"><label for="vmulxq_lane_f64"><div>float64x2_t <b><b>vmulxq_lane_f64</b></b> (float64x2_t a, float64x1_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.2D,Vn.2D,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+v &rarr; Vm.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
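+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper), assuming an AArch64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Both lanes of a are multiplied by the single lane of v. */
+float64x2_t example_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
+    return vmulxq_lane_f64(a, v, 0);
+}</pre>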
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxs_lane_f32" type="checkbox"><label for="vmulxs_lane_f32"><div>float32_t <b><b>vmulxs_lane_f32</b></b> (float32_t a, float32x2_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
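+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper), assuming an AArch64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Scalar result: multiplies a by lane 1 of the 2-lane vector v. */
+float32_t example_vmulxs_lane_f32(float32_t a, float32x2_t v) {
+    return vmulxs_lane_f32(a, v, 1);
+}</pre>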
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxd_lane_f64" type="checkbox"><label for="vmulxd_lane_f64"><div>float64_t <b><b>vmulxd_lane_f64</b></b> (float64_t a, float64x1_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+v &rarr; Vm.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
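+ <h4>Example</h4> <p>A minimal C usage sketch, added for illustration and not part of the Arm reference; it assumes GCC or Clang targeting AArch64 with &lt;arm_neon.h&gt;, and the demo wrapper and values are hypothetical.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+double demo(void) {
+    float64x1_t v = vdup_n_f64(3.0);      /* one-element vector {3.0} */
+    return vmulxd_lane_f64(2.0, v, 0);    /* lane must be 0 here; yields 2.0 * 3.0 == 6.0 */
+}</pre>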
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulx_laneq_f32" type="checkbox"><label for="vmulx_laneq_f32"><div>float32x2_t <b><b>vmulx_laneq_f32</b></b> (float32x2_t a, float32x4_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
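+ <h4>Example</h4> <p>A brief sketch of selecting one lane of the 128-bit second source (illustrative values; assumes an AArch64 C toolchain with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float32x2_t demo(void) {
+    const float vals[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    float32x2_t a = vdup_n_f32(2.0f);
+    float32x4_t v = vld1q_f32(vals);
+    return vmulx_laneq_f32(a, v, 2);   /* both lanes scaled by v[2]: {6.0, 6.0} */
+}</pre>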
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxq_laneq_f32" type="checkbox"><label for="vmulxq_laneq_f32"><div>float32x4_t <b><b>vmulxq_laneq_f32</b></b> (float32x4_t a, float32x4_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
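+ <h4>Example</h4> <p>A sketch of the property that distinguishes FMULX from FMUL: zero times infinity returns &plusmn;2.0 instead of NaN (illustrative; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+float32x4_t demo(void) {
+    const float vals[4] = {INFINITY, 1.0f, 2.0f, 3.0f};
+    float32x4_t a = vdupq_n_f32(0.0f);
+    float32x4_t v = vld1q_f32(vals);
+    return vmulxq_laneq_f32(a, v, 0);  /* 0.0 * +inf in each lane gives +2.0, not NaN */
+}</pre>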
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulx_laneq_f64" type="checkbox"><label for="vmulx_laneq_f64"><div>float64x1_t <b><b>vmulx_laneq_f64</b></b> (float64x1_t a, float64x2_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+v &rarr; Vm.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
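+ <h4>Example</h4> <p>A minimal sketch (illustrative values; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float64x1_t demo(void) {
+    const double vals[2] = {10.0, 20.0};
+    float64x1_t a = vdup_n_f64(1.5);
+    float64x2_t v = vld1q_f64(vals);
+    return vmulx_laneq_f64(a, v, 1);   /* {1.5 * 20.0} == {30.0} */
+}</pre>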
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxq_laneq_f64" type="checkbox"><label for="vmulxq_laneq_f64"><div>float64x2_t <b><b>vmulxq_laneq_f64</b></b> (float64x2_t a, float64x2_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Vd.2D,Vn.2D,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+v &rarr; Vm.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
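+ <h4>Example</h4> <p>A short sketch showing that the one selected lane multiplies every lane of the first source (illustrative; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float64x2_t demo(void) {
+    const double va[2] = {1.0, 2.0};
+    const double vv[2] = {4.0, 8.0};
+    return vmulxq_laneq_f64(vld1q_f64(va), vld1q_f64(vv), 0);  /* {1.0*4.0, 2.0*4.0} == {4.0, 8.0} */
+}</pre>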
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxs_laneq_f32" type="checkbox"><label for="vmulxs_laneq_f32"><div>float32_t <b><b>vmulxs_laneq_f32</b></b> (float32_t a, float32x4_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
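+ <h4>Example</h4> <p>A scalar-by-element sketch (illustrative; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float demo(void) {
+    const float vals[4] = {0.5f, 1.5f, 2.5f, 3.5f};
+    float32x4_t v = vld1q_f32(vals);
+    return vmulxs_laneq_f32(8.0f, v, 3);   /* 8.0 * v[3] == 28.0 */
+}</pre>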
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulxd_laneq_f64" type="checkbox"><label for="vmulxd_laneq_f64"><div>float64_t <b><b>vmulxd_laneq_f64</b></b> (float64_t a, float64x2_t v, const int lane)<span class="right">Floating-point multiply extended (by element)</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply extended (by element). This instruction multiplies the floating-point values in the vector elements in the first source SIMD&amp;FP register by the specified floating-point value in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmulx-by-element-floating-point-multiply-extended-by-element">FMULX</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+v &rarr; Vm.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(idxdsize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, index, esize];
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ if mulx_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulX.3" title="function: bits(N) FPMulX(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulX</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
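+ <h4>Example</h4> <p>A minimal sketch (illustrative; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+double demo(void) {
+    const double vals[2] = {7.0, 9.0};
+    float64x2_t v = vld1q_f64(vals);
+    return vmulxd_laneq_f64(3.0, v, 1);   /* 3.0 * 9.0 == 27.0 */
+}</pre>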
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdiv_f32" type="checkbox"><label for="vdiv_f32"><div>float32x2_t <b><b>vdiv_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point divide</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Divide (vector). This instruction divides the floating-point values in the elements in the first source SIMD&amp;FP register, by the floating-point values in the corresponding elements in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fdiv-vector-floating-point-divide-vector">FDIV</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPDiv.3" title="function: bits(N) FPDiv(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPDiv</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
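+ <h4>Example</h4> <p>A lane-wise division sketch (illustrative; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float32x2_t demo(void) {
+    const float na[2] = {1.0f, 9.0f};
+    const float nb[2] = {2.0f, 3.0f};
+    return vdiv_f32(vld1_f32(na), vld1_f32(nb));   /* {1.0/2.0, 9.0/3.0} == {0.5, 3.0} */
+}</pre>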
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdivq_f32" type="checkbox"><label for="vdivq_f32"><div>float32x4_t <b><b>vdivq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point divide</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Divide (vector). This instruction divides the floating-point values in the elements in the first source SIMD&amp;FP register, by the floating-point values in the corresponding elements in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fdiv-vector-floating-point-divide-vector">FDIV</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPDiv.3" title="function: bits(N) FPDiv(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPDiv</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
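+ <h4>Example</h4> <p>A sketch noting that the division follows IEEE 754, so a zero divisor yields an infinity under the default untrapped behaviour (illustrative; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float32x4_t demo(void) {
+    const float na[4] = {1.0f, -4.0f, 9.0f, 1.0f};
+    const float nb[4] = {2.0f,  2.0f, 3.0f, 0.0f};
+    return vdivq_f32(vld1q_f32(na), vld1q_f32(nb));  /* {0.5, -2.0, 3.0, +inf} */
+}</pre>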
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdiv_f64" type="checkbox"><label for="vdiv_f64"><div>float64x1_t <b><b>vdiv_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point divide</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Divide (vector). This instruction divides the floating-point values in the elements in the first source SIMD&amp;FP register, by the floating-point values in the corresponding elements in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fdiv-vector-floating-point-divide-vector">FDIV</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPDiv.3" title="function: bits(N) FPDiv(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPDiv</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
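+ <h4>Example</h4> <p>A one-lane sketch; the inexact quotient is rounded according to the mode selected in FPCR (illustrative; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float64x1_t demo(void) {
+    float64x1_t a = vdup_n_f64(1.0);
+    float64x1_t b = vdup_n_f64(3.0);
+    return vdiv_f64(a, b);   /* {0.3333...}, rounded per the current FPCR mode */
+}</pre>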
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdivq_f64" type="checkbox"><label for="vdivq_f64"><div>float64x2_t <b><b>vdivq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point divide</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Divide (vector). This instruction divides the floating-point values in the elements in the first source SIMD&amp;FP register, by the floating-point values in the corresponding elements in the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fdiv-vector-floating-point-divide-vector">FDIV</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPDiv.3" title="function: bits(N) FPDiv(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPDiv</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
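+ <h4>Example</h4> <p>A minimal sketch (illustrative; assumes AArch64 with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float64x2_t demo(void) {
+    const double na[2] = {10.0, 7.0};
+    const double nb[2] = { 4.0, 2.0};
+    return vdivq_f64(vld1q_f64(na), vld1q_f64(nb));  /* {2.5, 3.5} */
+}</pre>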
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmla_s8" type="checkbox"><label for="vmla_s8"><div>int8x8_t <b><b>vmla_s8</b></b> (int8x8_t a, int8x8_t b, int8x8_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
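+ <h4>Example</h4> <p>A multiply-accumulate sketch: each lane becomes a + b*c, using modular (non-saturating) arithmetic (illustrative; assumes an Armv7 or AArch64 NEON toolchain with &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int8x8_t demo(void) {
+    int8x8_t acc = vdup_n_s8(10);
+    int8x8_t b   = vdup_n_s8(3);
+    int8x8_t c   = vdup_n_s8(4);
+    return vmla_s8(acc, b, c);   /* every lane: 10 + 3*4 == 22 */
+}</pre>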
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_s8" type="checkbox"><label for="vmlaq_s8"><div>int8x16_t <b><b>vmlaq_s8</b></b> (int8x16_t a, int8x16_t b, int8x16_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
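+ <h4>Example</h4> <p>A sketch of the wrap-around behaviour: the product is truncated to the element size and the accumulation wraps modulo 2^8, with no saturation (illustrative; assumes &lt;arm_neon.h&gt;).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int8x16_t demo(void) {
+    int8x16_t acc = vdupq_n_s8(0);
+    int8x16_t b   = vdupq_n_s8(100);
+    int8x16_t c   = vdupq_n_s8(2);
+    return vmlaq_s8(acc, b, c);  /* 100*2 == 200 wraps to -56 in each 8-bit lane */
+}</pre>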
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_s16" type="checkbox"><label for="vmla_s16"><div>int16x4_t <b><b>vmla_s16</b></b> (int16x4_t a, int16x4_t b, int16x4_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
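+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference: a minimal C use of this intrinsic via &lt;arm_neon.h&gt;. The helper name is the editor's own, and the same load/accumulate/store pattern applies to the other integer vmla/vmlaq variants below.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* d[i] = a[i] + b[i] * c[i] over four signed 16-bit lanes;
+   lane arithmetic wraps modulo 2^16, matching the pseudocode above. */
+void mla_i16x4(int16_t d[4], const int16_t a[4],
+               const int16_t b[4], const int16_t c[4])
+{
+    int16x4_t va = vld1_s16(a);   /* accumulator, maps to Vd */
+    int16x4_t vb = vld1_s16(b);   /* first factor, maps to Vn */
+    int16x4_t vc = vld1_s16(c);   /* second factor, maps to Vm */
+    vst1_s16(d, vmla_s16(va, vb, vc));
+}</pre>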
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_s16" type="checkbox"><label for="vmlaq_s16"><div>int16x8_t <b><b>vmlaq_s16</b></b> (int16x8_t a, int16x8_t b, int16x8_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_s32" type="checkbox"><label for="vmla_s32"><div>int32x2_t <b><b>vmla_s32</b></b> (int32x2_t a, int32x2_t b, int32x2_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_s32" type="checkbox"><label for="vmlaq_s32"><div>int32x4_t <b><b>vmlaq_s32</b></b> (int32x4_t a, int32x4_t b, int32x4_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_u8" type="checkbox"><label for="vmla_u8"><div>uint8x8_t <b><b>vmla_u8</b></b> (uint8x8_t a, uint8x8_t b, uint8x8_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_u8" type="checkbox"><label for="vmlaq_u8"><div>uint8x16_t <b><b>vmlaq_u8</b></b> (uint8x16_t a, uint8x16_t b, uint8x16_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_u16" type="checkbox"><label for="vmla_u16"><div>uint16x4_t <b><b>vmla_u16</b></b> (uint16x4_t a, uint16x4_t b, uint16x4_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_u16" type="checkbox"><label for="vmlaq_u16"><div>uint16x8_t <b><b>vmlaq_u16</b></b> (uint16x8_t a, uint16x8_t b, uint16x8_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_u32" type="checkbox"><label for="vmla_u32"><div>uint32x2_t <b><b>vmla_u32</b></b> (uint32x2_t a, uint32x2_t b, uint32x2_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_u32" type="checkbox"><label for="vmlaq_u32"><div>uint32x4_t <b><b>vmlaq_u32</b></b> (uint32x4_t a, uint32x4_t b, uint32x4_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_f32" type="checkbox"><label for="vmla_f32"><div>float32x2_t <b><b>vmla_f32</b></b> (float32x2_t a, float32x2_t b, float32x2_t c)<span class="right">Floating-point multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply-Add to accumulator (vector). This intrinsic multiplies corresponding floating-point elements in the vectors of the two source operands and adds each product to the corresponding vector element of the destination operand.</p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] + (b[i] * c[i]) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_f32" type="checkbox"><label for="vmlaq_f32"><div>float32x4_t <b><b>vmlaq_f32</b></b> (float32x4_t a, float32x4_t b, float32x4_t c)<span class="right">Floating-point multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply-Add to accumulator (vector). This intrinsic multiplies corresponding floating-point elements in the vectors of the two source operands and adds each product to the corresponding vector element of the destination operand.</p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] + (b[i] * c[i]) for i = 0 to 3
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_f64" type="checkbox"><label for="vmla_f64"><div>float64x1_t <b><b>vmla_f64</b></b> (float64x1_t a, float64x1_t b, float64x1_t c)<span class="right">Floating-point multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply-Add to accumulator (vector). This intrinsic multiplies corresponding floating-point elements in the vectors of the two source operands and adds each product to the corresponding vector element of the destination operand.</p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] + (b[i] * c[i]) for i = 0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_f64" type="checkbox"><label for="vmlaq_f64"><div>float64x2_t <b><b>vmlaq_f64</b></b> (float64x2_t a, float64x2_t b, float64x2_t c)<span class="right">Floating-point multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply-Add to accumulator (vector). This intrinsic multiplies corresponding floating-point elements in the vectors of the two source operands and adds each product to the corresponding vector element of the destination operand.</p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] + (b[i] * c[i]) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_s8" type="checkbox"><label for="vmlal_s8"><div>int16x8_t <b><b>vmlal_s8</b></b> (int16x8_t a, int8x8_t b, int8x8_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
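+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference (the helper name is the editor's own): the widening form computes each 8-bit product exactly in a 16-bit lane, so individual products cannot overflow before they are accumulated.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* acc[i] += b[i] * c[i], widening the 8-bit factors to 16 bits. */
+int16x8_t mlal_i8x8(int16x8_t acc, const int8_t b[8], const int8_t c[8])
+{
+    return vmlal_s8(acc, vld1_s8(b), vld1_s8(c));
+}</pre>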
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_s16" type="checkbox"><label for="vmlal_s16"><div>int32x4_t <b><b>vmlal_s16</b></b> (int32x4_t a, int16x4_t b, int16x4_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
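+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference: a typical use of the widening accumulate is an integer dot product. Assumptions: the helper name is the editor's own, n is a multiple of 4, vaddvq_s32 is available (A64 only), and the 32-bit lanes are accepted to wrap for very long inputs.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+int32_t dot_i16(const int16_t *x, const int16_t *y, int n)
+{
+    int32x4_t acc = vdupq_n_s32(0);
+    for (int i = 0; i &lt; n; i += 4)   /* 4 widened products per iteration */
+        acc = vmlal_s16(acc, vld1_s16(x + i), vld1_s16(y + i));
+    return vaddvq_s32(acc);           /* horizontal add of the 4 lanes (A64) */
+}</pre>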
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_s32" type="checkbox"><label for="vmlal_s32"><div>int64x2_t <b>vmlal_s32</b> (int64x2_t a, int32x2_t b, int32x2_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
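+ <h4>Usage Example</h4> <p>The C program below is an editorial sketch and not part of the ARM reference: assuming an AArch64 toolchain that provides arm_neon.h, it runs vmlal_s32 on sample values and prints the two widened 64-bit accumulators. All input values and expected outputs are invented for illustration.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64_t acc[2] = {100, 200};   /* a -&gt; Vd.2D (accumulator) */
+    int32_t bv[2]  = {3, -4};      /* b -&gt; Vn.2S */
+    int32_t cv[2]  = {5, 6};       /* c -&gt; Vm.2S */
+
+    /* r[i] = acc[i] + (int64_t)bv[i] * cv[i]  =&gt;  {115, 176} */
+    int64x2_t r = vmlal_s32(vld1q_s64(acc), vld1_s32(bv), vld1_s32(cv));
+
+    printf("%lld %lld\n",
+           (long long)vgetq_lane_s64(r, 0),
+           (long long)vgetq_lane_s64(r, 1));
+    return 0;
+}</pre>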
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_u8" type="checkbox"><label for="vmlal_u8"><div>uint16x8_t <b>vmlal_u8</b> (uint16x8_t a, uint8x8_t b, uint8x8_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_u16" type="checkbox"><label for="vmlal_u16"><div>uint32x4_t <b>vmlal_u16</b> (uint32x4_t a, uint16x4_t b, uint16x4_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
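+ <h4>Usage Example</h4> <p>A minimal editorial C sketch, assuming an AArch64 toolchain with arm_neon.h: each 16-bit product is widened to 32 bits before being added to the accumulator, so the multiplication itself cannot overflow at the source element width. Sample values are invented.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t acc[4] = {1000, 2000, 3000, 4000};  /* a -&gt; Vd.4S */
+    uint16_t bv[4]  = {10, 20, 30, 40};          /* b -&gt; Vn.4H */
+    uint16_t cv[4]  = {100, 100, 100, 100};      /* c -&gt; Vm.4H */
+    uint32_t out[4];
+
+    /* out[i] = acc[i] + (uint32_t)bv[i] * cv[i]  =&gt;  {2000, 4000, 6000, 8000} */
+    vst1q_u32(out, vmlal_u16(vld1q_u32(acc), vld1_u16(bv), vld1_u16(cv)));
+
+    for (int i = 0; i &lt; 4; i++)
+        printf("%u\n", out[i]);
+    return 0;
+}</pre>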
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_u32" type="checkbox"><label for="vmlal_u32"><div>uint64x2_t <b>vmlal_u32</b> (uint64x2_t a, uint32x2_t b, uint32x2_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_s8" type="checkbox"><label for="vmlal_high_s8"><div>int16x8_t <b>vmlal_high_s8</b> (int16x8_t a, int8x16_t b, int8x16_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_s16" type="checkbox"><label for="vmlal_high_s16"><div>int32x4_t <b>vmlal_high_s16</b> (int32x4_t a, int16x8_t b, int16x8_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
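+ <h4>Usage Example</h4> <p>An editorial C sketch, assuming an AArch64 toolchain with arm_neon.h: the high variant consumes lanes 4..7 of the 128-bit sources, so the call below behaves like vmlal_s16 applied to vget_high_s16(b) and vget_high_s16(c). Sample values are invented.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32_t acc[4] = {0, 0, 0, 0};
+    int16_t bv[8]  = {1, 2, 3, 4, 5, 6, 7, 8};   /* only lanes 4..7 are used */
+    int16_t cv[8]  = {9, 9, 9, 9, 2, 2, 2, 2};
+    int32_t out[4];
+
+    /* out[i] = acc[i] + bv[4+i] * cv[4+i]  =&gt;  {10, 12, 14, 16} */
+    vst1q_s32(out, vmlal_high_s16(vld1q_s32(acc), vld1q_s16(bv), vld1q_s16(cv)));
+
+    for (int i = 0; i &lt; 4; i++)
+        printf("%d\n", out[i]);
+    return 0;
+}</pre>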
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_s32" type="checkbox"><label for="vmlal_high_s32"><div>int64x2_t <b>vmlal_high_s32</b> (int64x2_t a, int32x4_t b, int32x4_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_u8" type="checkbox"><label for="vmlal_high_u8"><div>uint16x8_t <b>vmlal_high_u8</b> (uint16x8_t a, uint8x16_t b, uint8x16_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_u16" type="checkbox"><label for="vmlal_high_u16"><div>uint32x4_t <b>vmlal_high_u16</b> (uint32x4_t a, uint16x8_t b, uint16x8_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_u32" type="checkbox"><label for="vmlal_high_u32"><div>uint64x2_t <b>vmlal_high_u32</b> (uint64x2_t a, uint32x4_t b, uint32x4_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmls_s8" type="checkbox"><label for="vmls_s8"><div>int8x8_t <b>vmls_s8</b> (int8x8_t a, int8x8_t b, int8x8_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
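+ <h4>Usage Example</h4> <p>An editorial C sketch, assuming an AArch64 toolchain with arm_neon.h: the product is truncated to the element size before the subtraction, so 8-bit lanes wrap modulo 256, as the second lane below shows. Sample values are invented.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t acc[8] = {10, -100, 0, 0, 0, 0, 0, 0};
+    int8_t bv[8]  = {3, 10, 0, 0, 0, 0, 0, 0};
+    int8_t cv[8]  = {4, 10, 0, 0, 0, 0, 0, 0};
+    int8_t out[8];
+
+    /* out[i] = acc[i] - bv[i]*cv[i] at 8 bits: 10 - 12 = -2, while
+       -100 - 100 = -200 wraps to 56 (i.e. -200 + 256) */
+    vst1_s8(out, vmls_s8(vld1_s8(acc), vld1_s8(bv), vld1_s8(cv)));
+
+    printf("%d %d\n", out[0], out[1]);
+    return 0;
+}</pre>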
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_s8" type="checkbox"><label for="vmlsq_s8"><div>int8x16_t <b>vmlsq_s8</b> (int8x16_t a, int8x16_t b, int8x16_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_s16" type="checkbox"><label for="vmls_s16"><div>int16x4_t <b><b>vmls_s16</b></b> (int16x4_t a, int16x4_t b, int16x4_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_s16" type="checkbox"><label for="vmlsq_s16"><div>int16x8_t <b><b>vmlsq_s16</b></b> (int16x8_t a, int16x8_t b, int16x8_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_s32" type="checkbox"><label for="vmls_s32"><div>int32x2_t <b><b>vmls_s32</b></b> (int32x2_t a, int32x2_t b, int32x2_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_s32" type="checkbox"><label for="vmlsq_s32"><div>int32x4_t <b><b>vmlsq_s32</b></b> (int32x4_t a, int32x4_t b, int32x4_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_u8" type="checkbox"><label for="vmls_u8"><div>uint8x8_t <b><b>vmls_u8</b></b> (uint8x8_t a, uint8x8_t b, uint8x8_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_u8" type="checkbox"><label for="vmlsq_u8"><div>uint8x16_t <b><b>vmlsq_u8</b></b> (uint8x16_t a, uint8x16_t b, uint8x16_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_u16" type="checkbox"><label for="vmls_u16"><div>uint16x4_t <b><b>vmls_u16</b></b> (uint16x4_t a, uint16x4_t b, uint16x4_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_u16" type="checkbox"><label for="vmlsq_u16"><div>uint16x8_t <b><b>vmlsq_u16</b></b> (uint16x8_t a, uint16x8_t b, uint16x8_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_u32" type="checkbox"><label for="vmls_u32"><div>uint32x2_t <b><b>vmls_u32</b></b> (uint32x2_t a, uint32x2_t b, uint32x2_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_u32" type="checkbox"><label for="vmlsq_u32"><div>uint32x4_t <b><b>vmlsq_u32</b></b> (uint32x4_t a, uint32x4_t b, uint32x4_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_f32" type="checkbox"><label for="vmls_f32"><div>float32x2_t <b><b>vmls_f32</b></b> (float32x2_t a, float32x2_t b, float32x2_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p>For each lane i, the result is a[i] - (b[i] * c[i]).</p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] - (b[i] * c[i]) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_f32" type="checkbox"><label for="vmlsq_f32"><div>float32x4_t <b><b>vmlsq_f32</b></b> (float32x4_t a, float32x4_t b, float32x4_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p>For each lane i, the result is a[i] - (b[i] * c[i]).</p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] - (b[i] * c[i]) for i = 0 to 3
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_f64" type="checkbox"><label for="vmls_f64"><div>float64x1_t <b><b>vmls_f64</b></b> (float64x1_t a, float64x1_t b, float64x1_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p>For each lane i, the result is a[i] - (b[i] * c[i]).</p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] - (b[i] * c[i]) for i = 0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_f64" type="checkbox"><label for="vmlsq_f64"><div>float64x2_t <b><b>vmlsq_f64</b></b> (float64x2_t a, float64x2_t b, float64x2_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p>For each lane i, the result is a[i] - (b[i] * c[i]).</p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] - (b[i] * c[i]) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_s8" type="checkbox"><label for="vmlsl_s8"><div>int16x8_t <b><b>vmlsl_s8</b></b> (int16x8_t a, int8x8_t b, int8x8_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
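+ <h4>Example</h4> <p>A minimal C sketch of how this intrinsic might be called; this is an editorial illustration, not part of the ARM reference, and the helper name is made up.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Lane-wise equivalent of:
+ *   for (int i = 0; i &lt; 8; i++)
+ *       acc[i] -= (int16_t)b[i] * (int16_t)c[i];
+ * Each 8-bit product is computed at 16-bit width, so it cannot wrap
+ * before the subtraction. */
+int16x8_t mlsl_s8_sketch(int16x8_t acc, int8x8_t b, int8x8_t c)
+{
+    return vmlsl_s8(acc, b, c);
+}</pre>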
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_s16" type="checkbox"><label for="vmlsl_s16"><div>int32x4_t <b><b>vmlsl_s16</b></b> (int32x4_t a, int16x4_t b, int16x4_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
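+ <h4>Example</h4> <p>A brief C sketch (editorial, not from the ARM reference): starting from a zero accumulator, this yields the negated lane-wise products of two i16x4 vectors at 32-bit width. The function name is invented for the example.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* result[i] = 0 - (int32_t)b[i] * (int32_t)c[i] for i = 0..3 */
+int32x4_t negated_products_s16(int16x4_t b, int16x4_t c)
+{
+    return vmlsl_s16(vdupq_n_s32(0), b, c);
+}</pre>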
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_s32" type="checkbox"><label for="vmlsl_s32"><div>int64x2_t <b><b>vmlsl_s32</b></b> (int64x2_t a, int32x2_t b, int32x2_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
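+ <h4>Example</h4> <p>An illustrative C sketch (editorial addition, not part of the ARM reference): a running 64-bit accumulator has 32-bit products subtracted from it two lanes at a time. The function name is invented.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* acc[i] -= (int64_t)b[i] * (int64_t)c[i] for i = 0, 1.
+ * The full 64-bit products are formed, so there is no
+ * intermediate overflow before the subtraction. */
+int64x2_t subtract_products_s32(int64x2_t acc, int32x2_t b, int32x2_t c)
+{
+    return vmlsl_s32(acc, b, c);
+}</pre>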
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_u8" type="checkbox"><label for="vmlsl_u8"><div>uint16x8_t <b><b>vmlsl_u8</b></b> (uint16x8_t a, uint8x8_t b, uint8x8_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
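+ <h4>Example</h4> <p>A short C sketch (added editorially, not part of the ARM reference) showing the unsigned form; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* For eight u8 lanes: acc[i] -= (uint16_t)(b[i] * c[i]).
+ * The widest u8 product (255 * 255 = 65025) still fits in 16 bits;
+ * the subtraction itself wraps modulo 2^16, as the instruction does. */
+uint16x8_t mlsl_u8_sketch(uint16x8_t acc, uint8x8_t b, uint8x8_t c)
+{
+    return vmlsl_u8(acc, b, c);
+}</pre>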
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_u16" type="checkbox"><label for="vmlsl_u16"><div>uint32x4_t <b><b>vmlsl_u16</b></b> (uint32x4_t a, uint16x4_t b, uint16x4_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_u32" type="checkbox"><label for="vmlsl_u32"><div>uint64x2_t <b><b>vmlsl_u32</b></b> (uint64x2_t a, uint32x2_t b, uint32x2_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
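+ <h4>Example</h4> <p>A hedged C sketch (editorial, not from the ARM reference) that folds two u32 products loaded from memory into a 64-bit accumulator; the pointer names are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* acc[i] -= (uint64_t)p[i] * (uint64_t)q[i] for i = 0, 1. */
+uint64x2_t mlsl_u32_from_memory(uint64x2_t acc,
+                                const uint32_t *p, const uint32_t *q)
+{
+    return vmlsl_u32(acc, vld1_u32(p), vld1_u32(q));
+}</pre>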
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_s8" type="checkbox"><label for="vmlsl_high_s8"><div>int16x8_t <b><b>vmlsl_high_s8</b></b> (int16x8_t a, int8x16_t b, int8x16_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
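+ <h4>Example</h4> <p>An editorial C sketch (not part of the ARM reference) making the high-half relationship explicit; the helper name is made up.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Uses only the upper eight lanes of b and c; equivalent to
+ *   vmlsl_s8(acc, vget_high_s8(b), vget_high_s8(c))
+ * but without a separate lane-extraction step. */
+int16x8_t mlsl_high_s8_sketch(int16x8_t acc, int8x16_t b, int8x16_t c)
+{
+    return vmlsl_high_s8(acc, b, c);
+}</pre>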
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_s16" type="checkbox"><label for="vmlsl_high_s16"><div>int32x4_t <b><b>vmlsl_high_s16</b></b> (int32x4_t a, int16x8_t b, int16x8_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_s32" type="checkbox"><label for="vmlsl_high_s32"><div>int64x2_t <b><b>vmlsl_high_s32</b></b> (int64x2_t a, int32x4_t b, int32x4_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_u8" type="checkbox"><label for="vmlsl_high_u8"><div>uint16x8_t <b><b>vmlsl_high_u8</b></b> (uint16x8_t a, uint8x16_t b, uint8x16_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
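+ <h4>Example</h4> <p>A hedged C sketch (editorial addition): pairing the low-half and high-half forms to consume all sixteen u8 lanes of b and c with two separate accumulators. Names are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Subtracts all 16 widened products from two u16 accumulators:
+ * lanes 0-7 update *lo, lanes 8-15 update *hi. */
+void mlsl_u8_all_lanes(uint16x8_t *lo, uint16x8_t *hi,
+                       uint8x16_t b, uint8x16_t c)
+{
+    *lo = vmlsl_u8(*lo, vget_low_u8(b), vget_low_u8(c));
+    *hi = vmlsl_high_u8(*hi, b, c);
+}</pre>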
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_u16" type="checkbox"><label for="vmlsl_high_u16"><div>uint32x4_t <b><b>vmlsl_high_u16</b></b> (uint32x4_t a, uint16x8_t b, uint16x8_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_u32" type="checkbox"><label for="vmlsl_high_u32"><div>uint64x2_t <b><b>vmlsl_high_u32</b></b> (uint64x2_t a, uint32x4_t b, uint32x4_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfma_f32" type="checkbox"><label for="vfma_f32"><div>float32x2_t <b><b>vfma_f32</b></b> (float32x2_t a, float32x2_t b, float32x2_t c)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vfmaq_f32" type="checkbox"><label for="vfmaq_f32"><div>float32x4_t <b><b>vfmaq_f32</b></b> (float32x4_t a, float32x4_t b, float32x4_t c)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vfma_f64" type="checkbox"><label for="vfma_f64"><div>float64x1_t <b><b>vfma_f64</b></b> (float64x1_t a, float64x1_t b, float64x1_t c)<span class="right">Floating-point fused multiply-add</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add (scalar). This instruction multiplies the values of the first two SIMD&amp;FP source registers, adds the product to the value of the third SIMD&amp;FP source register, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmadd-floating-point-fused-multiply-add-scalar">FMADD</a> Dd,Dn,Dm,Da
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Da <br />
+b &rarr; Dn <br />
+c &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) result;
+bits(datasize) operanda = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[a];
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(operanda, operand1, operand2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmaq_f64" type="checkbox"><label for="vfmaq_f64"><div>float64x2_t <b><b>vfmaq_f64</b></b> (float64x2_t a, float64x2_t b, float64x2_t c)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+c &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfma_lane_f32" type="checkbox"><label for="vfma_lane_f32"><div>float32x2_t <b><b>vfma_lane_f32</b></b> (float32x2_t a, float32x2_t b, float32x2_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmaq_lane_f32" type="checkbox"><label for="vfmaq_lane_f32"><div>float32x4_t <b><b>vfmaq_lane_f32</b></b> (float32x4_t a, float32x4_t b, float32x2_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfma_lane_f64" type="checkbox"><label for="vfma_lane_f64"><div>float64x1_t <b><b>vfma_lane_f64</b></b> (float64x1_t a, float64x1_t b, float64x1_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+v &rarr; Vm.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmaq_lane_f64" type="checkbox"><label for="vfmaq_lane_f64"><div>float64x2_t <b><b>vfmaq_lane_f64</b></b> (float64x2_t a, float64x2_t b, float64x1_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.2D,Vn.2D,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+v &rarr; Vm.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmas_lane_f32" type="checkbox"><label for="vfmas_lane_f32"><div>float32_t <b><b>vfmas_lane_f32</b></b> (float32_t a, float32_t b, float32x2_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmad_lane_f64" type="checkbox"><label for="vfmad_lane_f64"><div>float64_t <b><b>vfmad_lane_f64</b></b> (float64_t a, float64_t b, float64x1_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+v &rarr; Vm.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfma_laneq_f32" type="checkbox"><label for="vfma_laneq_f32"><div>float32x2_t <b><b>vfma_laneq_f32</b></b> (float32x2_t a, float32x2_t b, float32x4_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmaq_laneq_f32" type="checkbox"><label for="vfmaq_laneq_f32"><div>float32x4_t <b><b>vfmaq_laneq_f32</b></b> (float32x4_t a, float32x4_t b, float32x4_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfma_laneq_f64" type="checkbox"><label for="vfma_laneq_f64"><div>float64x1_t <b><b>vfma_laneq_f64</b></b> (float64x1_t a, float64x1_t b, float64x2_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+v &rarr; Vm.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
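+ <h4>Usage example</h4> <p>A minimal C sketch, added for illustration (not part of the ARM data; the wrapper name and values are invented, and an A64 target with &lt;arm_neon.h&gt; is assumed). Per the preparation table above, the result is a + b * v[lane].</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+double demo_vfma_laneq_f64(void) {
+    const double vv[2] = {10.0, 20.0};
+    float64x2_t v = vld1q_f64(vv);
+    /* Dd = a + b * v[1], computed with a single rounding step */
+    float64x1_t r = vfma_laneq_f64(vdup_n_f64(1.0), vdup_n_f64(2.0), v, 1);
+    return vget_lane_f64(r, 0);   /* 1.0 + 2.0 * 20.0 = 41.0 */
+}</pre>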
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmaq_laneq_f64" type="checkbox"><label for="vfmaq_laneq_f64"><div>float64x2_t <b><b>vfmaq_laneq_f64</b></b> (float64x2_t a, float64x2_t b, float64x2_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.2D,Vn.2D,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+v &rarr; Vm.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
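+ <h4>Usage example</h4> <p>An illustrative sketch (not from the ARM data; names and values are invented, A64 and &lt;arm_neon.h&gt; assumed). Each lane of the result is a[i] + b[i] * v[lane].</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vfmaq_laneq_f64(double out[2]) {
+    const double av[2] = {1.0, 2.0}, bv[2] = {3.0, 4.0}, vv[2] = {10.0, 20.0};
+    /* lane 0 of v is broadcast: out[i] = a[i] + b[i] * 10.0 */
+    float64x2_t r = vfmaq_laneq_f64(vld1q_f64(av), vld1q_f64(bv), vld1q_f64(vv), 0);
+    vst1q_f64(out, r);   /* {31.0, 42.0} */
+}</pre>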
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmas_laneq_f32" type="checkbox"><label for="vfmas_laneq_f32"><div>float32_t <b><b>vfmas_laneq_f32</b></b> (float32_t a, float32_t b, float32x4_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
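+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed). The scalar result is a + b * v[lane].</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float demo_vfmas_laneq_f32(void) {
+    const float vv[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    /* Sd = a + b * v[3] */
+    return vfmas_laneq_f32(1.0f, 2.0f, vld1q_f32(vv), 3);   /* 1 + 2*4 = 9.0f */
+}</pre>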
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmad_laneq_f64" type="checkbox"><label for="vfmad_laneq_f64"><div>float64_t <b><b>vfmad_laneq_f64</b></b> (float64_t a, float64_t b, float64x2_t v, const int lane)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+v &rarr; Vm.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
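+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed): the double-precision counterpart of the example above.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+double demo_vfmad_laneq_f64(void) {
+    const double vv[2] = {10.0, 20.0};
+    /* Dd = a + b * v[0] */
+    return vfmad_laneq_f64(0.5, 4.0, vld1q_f64(vv), 0);   /* 0.5 + 4*10 = 40.5 */
+}</pre>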
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfms_f32" type="checkbox"><label for="vfms_f32"><div>float32x2_t <b><b>vfms_f32</b></b> (float32x2_t a, float32x2_t b, float32x2_t c)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
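+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; &lt;arm_neon.h&gt; assumed; this form is also available on v7/A32). Each lane is a[i] - b[i] * c[i], fused with a single rounding.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vfms_f32(float out[2]) {
+    const float av[2] = {10.0f, 20.0f}, bv[2] = {2.0f, 3.0f}, cv[2] = {4.0f, 5.0f};
+    float32x2_t r = vfms_f32(vld1_f32(av), vld1_f32(bv), vld1_f32(cv));
+    vst1_f32(out, r);   /* {10 - 2*4, 20 - 3*5} = {2.0f, 5.0f} */
+}</pre>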
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vfmsq_f32" type="checkbox"><label for="vfmsq_f32"><div>float32x4_t <b><b>vfmsq_f32</b></b> (float32x4_t a, float32x4_t b, float32x4_t c)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
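+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; &lt;arm_neon.h&gt; assumed): the 128-bit form, subtracting b[i] * c[i] from each of the four lanes of a.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vfmsq_f32(float out[4]) {
+    const float av[4] = {10.0f, 20.0f, 30.0f, 40.0f};
+    const float cv[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    float32x4_t r = vfmsq_f32(vld1q_f32(av), vdupq_n_f32(2.0f), vld1q_f32(cv));
+    vst1q_f32(out, r);   /* {10-2*1, 20-2*2, 30-2*3, 40-2*4} = {8, 16, 24, 32} */
+}</pre>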
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vfms_f64" type="checkbox"><label for="vfms_f64"><div>float64x1_t <b><b>vfms_f64</b></b> (float64x1_t a, float64x1_t b, float64x1_t c)<span class="right">Floating-point fused multiply-subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Fused Multiply-Subtract (scalar). This instruction multiplies the values of the first two SIMD&amp;FP source registers, negates the product, adds that to the value of the third SIMD&amp;FP source register, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmsub-floating-point-fused-multiply-subtract-scalar">FMSUB</a> Dd,Dn,Dm,Da
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Da <br />
+b &rarr; Dn <br />
+c &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) result;
+bits(datasize) operanda = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[a];
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(operand1);
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(operanda, operand1, operand2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
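+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed). This scalar form maps to FMSUB: a - b * c with one rounding step.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+double demo_vfms_f64(void) {
+    float64x1_t r = vfms_f64(vdup_n_f64(10.0), vdup_n_f64(3.0), vdup_n_f64(2.0));
+    return vget_lane_f64(r, 0);   /* 10 - 3*2 = 4.0 */
+}</pre>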
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsq_f64" type="checkbox"><label for="vfmsq_f64"><div>float64x2_t <b><b>vfmsq_f64</b></b> (float64x2_t a, float64x2_t b, float64x2_t c)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+c &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
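+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed): the two-lane double-precision form of the same a[i] - b[i] * c[i] operation.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vfmsq_f64(double out[2]) {
+    const double av[2] = {10.0, 20.0}, bv[2] = {2.0, 3.0}, cv[2] = {4.0, 5.0};
+    float64x2_t r = vfmsq_f64(vld1q_f64(av), vld1q_f64(bv), vld1q_f64(cv));
+    vst1q_f64(out, r);   /* {2.0, 5.0} */
+}</pre>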
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfms_lane_f32" type="checkbox"><label for="vfms_lane_f32"><div>float32x2_t <b><b>vfms_lane_f32</b></b> (float32x2_t a, float32x2_t b, float32x2_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
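+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed). One lane of v is broadcast, so each result lane is a[i] - b[i] * v[lane].</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vfms_lane_f32(float out[2]) {
+    const float av[2] = {10.0f, 20.0f}, bv[2] = {2.0f, 3.0f}, vv[2] = {4.0f, 5.0f};
+    float32x2_t r = vfms_lane_f32(vld1_f32(av), vld1_f32(bv), vld1_f32(vv), 1);
+    vst1_f32(out, r);   /* {10 - 2*5, 20 - 3*5} = {0.0f, 5.0f} */
+}</pre>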
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsq_lane_f32" type="checkbox"><label for="vfmsq_lane_f32"><div>float32x4_t <b><b>vfmsq_lane_f32</b></b> (float32x4_t a, float32x4_t b, float32x2_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
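+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed): the 128-bit by-lane form; v stays a 64-bit vector, so lane is 0 or 1.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vfmsq_lane_f32(float out[4]) {
+    const float av[4] = {10.0f, 20.0f, 30.0f, 40.0f};
+    const float vv[2] = {3.0f, 4.0f};
+    float32x4_t r = vfmsq_lane_f32(vld1q_f32(av), vdupq_n_f32(2.0f), vld1_f32(vv), 0);
+    vst1q_f32(out, r);   /* out[i] = a[i] - 2*3 = {4, 14, 24, 34} */
+}</pre>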
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfms_lane_f64" type="checkbox"><label for="vfms_lane_f64"><div>float64x1_t <b><b>vfms_lane_f64</b></b> (float64x1_t a, float64x1_t b, float64x1_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+v &rarr; Vm.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
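+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed). A float64x1_t has a single element, so lane must be 0.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+double demo_vfms_lane_f64(void) {
+    float64x1_t r = vfms_lane_f64(vdup_n_f64(10.0), vdup_n_f64(3.0), vdup_n_f64(2.0), 0);
+    return vget_lane_f64(r, 0);   /* 10 - 3*2 = 4.0 */
+}</pre>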
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsq_lane_f64" type="checkbox"><label for="vfmsq_lane_f64"><div>float64x2_t <b><b>vfmsq_lane_f64</b></b> (float64x2_t a, float64x2_t b, float64x1_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.2D,Vn.2D,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+v &rarr; Vm.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
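+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed): the single element of v is broadcast to both result lanes.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vfmsq_lane_f64(double out[2]) {
+    const double av[2] = {10.0, 20.0}, bv[2] = {2.0, 3.0};
+    float64x2_t r = vfmsq_lane_f64(vld1q_f64(av), vld1q_f64(bv), vdup_n_f64(4.0), 0);
+    vst1q_f64(out, r);   /* {10 - 2*4, 20 - 3*4} = {2.0, 8.0} */
+}</pre>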
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmss_lane_f32" type="checkbox"><label for="vfmss_lane_f32"><div>float32_t <b><b>vfmss_lane_f32</b></b> (float32_t a, float32_t b, float32x2_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
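+ <h4>Usage example</h4> <p>An illustrative sketch (invented wrapper and values; A64 and &lt;arm_neon.h&gt; assumed). The scalar result is a - b * v[lane].</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float demo_vfmss_lane_f32(void) {
+    const float vv[2] = {3.0f, 4.0f};
+    return vfmss_lane_f32(10.0f, 2.0f, vld1_f32(vv), 1);   /* 10 - 2*4 = 2.0f */
+}</pre>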
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsd_lane_f64" type="checkbox"><label for="vfmsd_lane_f64"><div>float64_t <b><b>vfmsd_lane_f64</b></b> (float64_t a, float64_t b, float64x1_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+v &rarr; Vm.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
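+ <h4>Example</h4> <p>An illustrative C sketch (values chosen arbitrarily; assumes an AArch64 toolchain with &lt;arm_neon.h&gt;) showing that the intrinsic computes a - b * v[lane]:</p>
+<pre class="codeblock">/* Compile for AArch64, e.g. gcc -O2 fms.c */
+#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float64_t a = 10.0, b = 3.0;
+    float64x1_t v = vdup_n_f64(2.0);             /* one-element vector {2.0} */
+    float64_t r = vfmsd_lane_f64(a, b, v, 0);    /* 10.0 - 3.0 * 2.0 */
+    printf("%f\n", r);                           /* prints 4.000000 */
+    return 0;
+}</pre>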
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfms_laneq_f32" type="checkbox"><label for="vfms_laneq_f32"><div>float32x2_t <b><b>vfms_laneq_f32</b></b> (float32x2_t a, float32x2_t b, float32x4_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsq_laneq_f32" type="checkbox"><label for="vfmsq_laneq_f32"><div>float32x4_t <b><b>vfmsq_laneq_f32</b></b> (float32x4_t a, float32x4_t b, float32x4_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
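+ <h4>Example</h4> <p>An illustrative C sketch (arbitrary values, AArch64 toolchain assumed): every element of b is multiplied by the single element v[lane], and the products are subtracted from the corresponding elements of a:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const float32_t bs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    const float32_t vs[4] = {9.0f, 9.0f, 9.0f, 2.0f};
+    float32x4_t a = vdupq_n_f32(10.0f);
+    float32x4_t b = vld1q_f32(bs);
+    float32x4_t v = vld1q_f32(vs);
+    /* lane = 3 selects v[3] = 2.0f: r[i] = a[i] - b[i] * 2.0f */
+    float32x4_t r = vfmsq_laneq_f32(a, b, v, 3);
+    float32_t out[4];
+    vst1q_f32(out, r);
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 8 6 4 2 */
+    return 0;
+}</pre>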
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfms_laneq_f64" type="checkbox"><label for="vfms_laneq_f64"><div>float64x1_t <b><b>vfms_laneq_f64</b></b> (float64x1_t a, float64x1_t b, float64x2_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+v &rarr; Vm.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsq_laneq_f64" type="checkbox"><label for="vfmsq_laneq_f64"><div>float64x2_t <b><b>vfmsq_laneq_f64</b></b> (float64x2_t a, float64x2_t b, float64x2_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.2D,Vn.2D,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+v &rarr; Vm.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmss_laneq_f32" type="checkbox"><label for="vfmss_laneq_f32"><div>float32_t <b><b>vfmss_laneq_f32</b></b> (float32_t a, float32_t b, float32x4_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsd_laneq_f64" type="checkbox"><label for="vfmsd_laneq_f64"><div>float64_t <b><b>vfmsd_laneq_f64</b></b> (float64_t a, float64_t b, float64x2_t v, const int lane)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+v &rarr; Vm.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulh_s16" type="checkbox"><label for="vqdmulh_s16"><div>int16x4_t <b><b>vqdmulh_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
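+ <h4>Example</h4> <p>An illustrative C sketch (AArch64 toolchain assumed): this operation is the usual Q15 fixed-point multiply, sat((2 * a * b) &gt;&gt; 16); the only input pair that saturates is INT16_MIN * INT16_MIN:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Q15 fixed point: 16384 = 0.5, 8192 = 0.25 */
+    int16x4_t a = vdup_n_s16(16384);
+    int16x4_t b = vdup_n_s16(8192);
+    int16x4_t r = vqdmulh_s16(a, b);      /* (2*16384*8192) &gt;&gt; 16 = 4096 = 0.125 */
+    printf("%d\n", vget_lane_s16(r, 0));  /* prints 4096 */
+
+    /* The one saturating case: -1.0 * -1.0 would be +1.0, which Q15 cannot hold */
+    int16x4_t s = vqdmulh_s16(vdup_n_s16(INT16_MIN), vdup_n_s16(INT16_MIN));
+    printf("%d\n", vget_lane_s16(s, 0));  /* prints 32767; FPSR.QC is set */
+    return 0;
+}</pre>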
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhq_s16" type="checkbox"><label for="vqdmulhq_s16"><div>int16x8_t <b><b>vqdmulhq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulh_s32" type="checkbox"><label for="vqdmulh_s32"><div>int32x2_t <b><b>vqdmulh_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhq_s32" type="checkbox"><label for="vqdmulhq_s32"><div>int32x4_t <b><b>vqdmulhq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhh_s16" type="checkbox"><label for="vqdmulhh_s16"><div>int16_t <b><b>vqdmulhh_s16</b></b> (int16_t a, int16_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
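+ <h4>Example</h4> <p>An illustrative C sketch of the scalar form (AArch64 toolchain assumed), computing a single Q15 product:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Q15: 0.5 * 0.5 = 0.25 */
+    int16_t r = vqdmulhh_s16(16384, 16384);  /* (2*16384*16384) &gt;&gt; 16 = 8192 */
+    printf("%d\n", r);                       /* prints 8192 */
+    return 0;
+}</pre>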
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhs_s32" type="checkbox"><label for="vqdmulhs_s32"><div>int32_t <b><b>vqdmulhs_s32</b></b> (int32_t a, int32_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulh_s16" type="checkbox"><label for="vqrdmulh_s16"><div>int16x4_t <b><b>vqrdmulh_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhq_s16" type="checkbox"><label for="vqrdmulhq_s16"><div>int16x8_t <b>vqrdmulhq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulh_s32" type="checkbox"><label for="vqrdmulh_s32"><div>int32x2_t <b>vqrdmulh_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhq_s32" type="checkbox"><label for="vqrdmulhq_s32"><div>int32x4_t <b>vqrdmulhq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhh_s16" type="checkbox"><label for="vqrdmulhh_s16"><div>int16_t <b>vqrdmulhh_s16</b> (int16_t a, int16_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhs_s32" type="checkbox"><label for="vqrdmulhs_s32"><div>int32_t <b>vqrdmulhs_s32</b> (int32_t a, int32_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_s16" type="checkbox"><label for="vqdmlal_s16"><div>int32x4_t <b>vqdmlal_s16</b> (int32x4_t a, int16x4_t b, int16x4_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_s32" type="checkbox"><label for="vqdmlal_s32"><div>int64x2_t <b>vqdmlal_s32</b> (int64x2_t a, int32x2_t b, int32x2_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlalh_s16" type="checkbox"><label for="vqdmlalh_s16"><div>int32_t <b>vqdmlalh_s16</b> (int32_t a, int16_t b, int16_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Sd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Hn <br />
+c &rarr; Hm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlals_s32" type="checkbox"><label for="vqdmlals_s32"><div>int64_t <b>vqdmlals_s32</b> (int64_t a, int32_t b, int32_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Dd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Sn <br />
+c &rarr; Sm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_high_s16" type="checkbox"><label for="vqdmlal_high_s16"><div>int32x4_t <b>vqdmlal_high_s16</b> (int32x4_t a, int16x8_t b, int16x8_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_high_s32" type="checkbox"><label for="vqdmlal_high_s32"><div>int64x2_t <b>vqdmlal_high_s32</b> (int64x2_t a, int32x4_t b, int32x4_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
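+ <h4>Usage Example</h4> <p>A minimal C sketch added editorially, not taken from the Arm reference; it assumes an AArch64 target with arm_neon.h, and the function name and constant values are purely illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int64x2_t demo_vqdmlal_high_s32(void) {
+    int64x2_t acc = vdupq_n_s64(100); /* 64-bit accumulator lanes */
+    int32x4_t b   = vdupq_n_s32(3);   /* only the upper lanes (2,3) feed the multiply */
+    int32x4_t c   = vdupq_n_s32(4);
+    /* each result lane: 100 + sat(2 * 3 * 4) = 124 */
+    return vqdmlal_high_s32(acc, b, c);
+}</pre>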
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_s16" type="checkbox"><label for="vqdmlsl_s16"><div>int32x4_t <b>vqdmlsl_s16</b> (int32x4_t a, int16x4_t b, int16x4_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
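+ <h4>Usage Example</h4> <p>An illustrative C sketch, added by the editor rather than drawn from the Arm reference; it assumes AArch64 and arm_neon.h, with made-up values.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x4_t demo_vqdmlsl_s16(void) {
+    int32x4_t acc = vdupq_n_s32(1000);
+    int16x4_t b   = vdup_n_s16(20);
+    int16x4_t c   = vdup_n_s16(30);
+    /* each result lane: 1000 - sat(2 * 20 * 30) = -200 */
+    return vqdmlsl_s16(acc, b, c);
+}</pre>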
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_s32" type="checkbox"><label for="vqdmlsl_s32"><div>int64x2_t <b>vqdmlsl_s32</b> (int64x2_t a, int32x2_t b, int32x2_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlslh_s16" type="checkbox"><label for="vqdmlslh_s16"><div>int32_t <b>vqdmlslh_s16</b> (int32_t a, int16_t b, int16_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Sd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Hn <br />
+c &rarr; Hm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
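+ <h4>Usage Example</h4> <p>An editorial sketch of the scalar form, not part of the Arm reference; it assumes AArch64 with arm_neon.h and stdint.h, and deliberately picks inputs whose doubling product saturates (which also sets FPSR.QC).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+int32_t demo_vqdmlslh_s16(void) {
+    /* 2 * (-32768) * (-32768) = 2^31 saturates to INT32_MAX */
+    return vqdmlslh_s16(0, INT16_MIN, INT16_MIN); /* 0 - INT32_MAX */
+}</pre>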
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsls_s32" type="checkbox"><label for="vqdmlsls_s32"><div>int64_t <b>vqdmlsls_s32</b> (int64_t a, int32_t b, int32_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Dd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Sn <br />
+c &rarr; Sm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
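+ <h4>Usage Example</h4> <p>A brief editorial sketch, not from the Arm reference, assuming AArch64 with arm_neon.h and stdint.h; here the 64-bit doubling product saturates before the subtraction.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+int64_t demo_vqdmlsls_s32(void) {
+    /* 2 * INT32_MIN * INT32_MIN = 2^63 saturates to INT64_MAX */
+    return vqdmlsls_s32(0, INT32_MIN, INT32_MIN); /* 0 - INT64_MAX */
+}</pre>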
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_high_s16" type="checkbox"><label for="vqdmlsl_high_s16"><div>int32x4_t <b>vqdmlsl_high_s16</b> (int32x4_t a, int16x8_t b, int16x8_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
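+ <h4>Usage Example</h4> <p>An illustrative C sketch added editorially (not from the Arm reference); it assumes AArch64 and arm_neon.h, and shows that only the upper four lanes of the 8-lane sources feed the multiply.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x4_t demo_vqdmlsl_high_s16(void) {
+    int32x4_t acc = vdupq_n_s32(0);
+    int16x8_t b   = vdupq_n_s16(5); /* upper four lanes are multiplied */
+    int16x8_t c   = vdupq_n_s16(7);
+    /* each result lane: 0 - sat(2 * 5 * 7) = -70 */
+    return vqdmlsl_high_s16(acc, b, c);
+}</pre>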
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_high_s32" type="checkbox"><label for="vqdmlsl_high_s32"><div>int64x2_t <b>vqdmlsl_high_s32</b> (int64x2_t a, int32x4_t b, int32x4_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_s8" type="checkbox"><label for="vmull_s8"><div>int16x8_t <b>vmull_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
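+ <h4>Usage Example</h4> <p>A minimal C sketch, an editorial addition rather than Arm reference material; it assumes AArch64 and arm_neon.h, with illustrative values chosen to overflow the narrow element type.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int16x8_t demo_vmull_s8(void) {
+    int8x8_t a = vdup_n_s8(-100);
+    int8x8_t b = vdup_n_s8(100);
+    /* -100 * 100 = -10000 overflows int8_t but is exact in the int16_t lanes */
+    return vmull_s8(a, b);
+}</pre>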
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_s16" type="checkbox"><label for="vmull_s16"><div>int32x4_t <b>vmull_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_s32" type="checkbox"><label for="vmull_s32"><div>int64x2_t <b>vmull_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
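+ <h4>Usage Example</h4> <p>An editorial C sketch, not from the Arm reference; it assumes AArch64 and arm_neon.h, and the shift amounts are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int64x2_t demo_vmull_s32(void) {
+    int32x2_t a = vdup_n_s32(1 &lt;&lt; 20);
+    int32x2_t b = vdup_n_s32(1 &lt;&lt; 20);
+    /* 2^40 would overflow int32_t; the int64_t lanes hold it exactly */
+    return vmull_s32(a, b);
+}</pre>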
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_u8" type="checkbox"><label for="vmull_u8"><div>uint16x8_t <b>vmull_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
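+<h4>Example (editorial note)</h4><p>An illustrative C sketch, not part of the upstream file: assuming arm_neon.h and a NEON-capable target, the hypothetical helper below widens eight pairwise 8-bit products into 16-bit lanes, so no product can overflow.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Multiply eight 8-bit samples by eight 8-bit weights; each product
+   is kept in a full 16-bit lane (UMULL Vd.8H,Vn.8B,Vm.8B). */
+uint16x8_t scale_bytes(uint8x8_t samples, uint8x8_t weights)
+{
+    return vmull_u8(samples, weights);
+}</pre>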
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_u16" type="checkbox"><label for="vmull_u16"><div>uint32x4_t <b><b>vmull_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_u32" type="checkbox"><label for="vmull_u32"><div>uint64x2_t <b><b>vmull_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
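+<h4>Example (editorial note)</h4><p>An illustrative C sketch, not part of the upstream file: the 32&times;32&rarr;64 form is useful for big-integer limb products, since the full double-width result is retained. The helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Two 32x32 -&gt; 64-bit limb products in one UMULL Vd.2D,Vn.2S,Vm.2S;
+   nothing is truncated, unlike a plain 32-bit multiply. */
+uint64x2_t limb_products(uint32x2_t a, uint32x2_t b)
+{
+    return vmull_u32(a, b);
+}</pre>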
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_p8" type="checkbox"><label for="vmull_p8"><div>poly16x8_t <b><b>vmull_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Polynomial multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Polynomial Multiply Long. This instruction multiplies corresponding elements in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/pmull-pmull2-polynomial-multiply-long">PMULL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
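+<h4>Example (editorial note)</h4><p>An illustrative C sketch, not part of the upstream file: polynomial multiplication is carry-less (addition is XOR), which is what CRC and GHASH-style code over GF(2) needs. The helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Eight carry-less 8x8 -&gt; 16-bit products (PMULL Vd.8H,Vn.8B,Vm.8B);
+   partial products combine with XOR, so no carries propagate. */
+poly16x8_t gf2_products(poly8x8_t a, poly8x8_t b)
+{
+    return vmull_p8(a, b);
+}</pre>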
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_s8" type="checkbox"><label for="vmull_high_s8"><div>int16x8_t <b><b>vmull_high_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
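+<h4>Example (editorial note)</h4><p>An illustrative C sketch, not part of the upstream file: the _high form pairs with the plain form to widen all sixteen lanes of two 128-bit vectors; SMULL covers the low eight bytes, SMULL2 the upper eight. The helper name is hypothetical, and the _high form is A64-only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Widen all 16 signed-byte products of a and b into two 16-bit vectors. */
+void mul_widen_s8(int8x16_t a, int8x16_t b, int16x8_t *lo, int16x8_t *hi)
+{
+    *lo = vmull_s8(vget_low_s8(a), vget_low_s8(b)); /* SMULL  */
+    *hi = vmull_high_s8(a, b);                      /* SMULL2 */
+}</pre>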
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_s16" type="checkbox"><label for="vmull_high_s16"><div>int32x4_t <b><b>vmull_high_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_s32" type="checkbox"><label for="vmull_high_s32"><div>int64x2_t <b><b>vmull_high_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_u8" type="checkbox"><label for="vmull_high_u8"><div>uint16x8_t <b><b>vmull_high_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_u16" type="checkbox"><label for="vmull_high_u16"><div>uint32x4_t <b><b>vmull_high_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_u32" type="checkbox"><label for="vmull_high_u32"><div>uint64x2_t <b><b>vmull_high_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_p8" type="checkbox"><label for="vmull_high_p8"><div>poly16x8_t <b><b>vmull_high_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Polynomial multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Polynomial Multiply Long. This instruction multiplies corresponding elements in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/pmull-pmull2-polynomial-multiply-long">PMULL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_s16" type="checkbox"><label for="vqdmull_s16"><div>int32x4_t <b><b>vqdmull_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
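+<h4>Example (editorial note)</h4><p>An illustrative C sketch, not part of the upstream file: in Q15 fixed-point arithmetic the doubling shifts each product into Q31, and the one overflowing input pair (-32768 &times; -32768) saturates to INT32_MAX and sets FPSR.QC instead of wrapping. The helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Four Q15 x Q15 -&gt; Q31 products (SQDMULL Vd.4S,Vn.4H,Vm.4H).
+   Only -32768 * -32768 can overflow; it saturates to 0x7FFFFFFF. */
+int32x4_t q15_mul_q31(int16x4_t a, int16x4_t b)
+{
+    return vqdmull_s16(a, b);
+}</pre>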
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_s32" type="checkbox"><label for="vqdmull_s32"><div>int64x2_t <b><b>vqdmull_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmullh_s16" type="checkbox"><label for="vqdmullh_s16"><div>int32_t <b><b>vqdmullh_s16</b></b> (int16_t a, int16_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Sd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulls_s32" type="checkbox"><label for="vqdmulls_s32"><div>int64_t <b><b>vqdmulls_s32</b></b> (int32_t a, int32_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Dd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_high_s16" type="checkbox"><label for="vqdmull_high_s16"><div>int32x4_t <b><b>vqdmull_high_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_high_s32" type="checkbox"><label for="vqdmull_high_s32"><div>int64x2_t <b><b>vqdmull_high_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsub_s8" type="checkbox"><label for="vsub_s8"><div>int8x8_t <b><b>vsub_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_s8" type="checkbox"><label for="vsubq_s8"><div>int8x16_t <b><b>vsubq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_s16" type="checkbox"><label for="vsub_s16"><div>int16x4_t <b><b>vsub_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_s16" type="checkbox"><label for="vsubq_s16"><div>int16x8_t <b><b>vsubq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_s32" type="checkbox"><label for="vsub_s32"><div>int32x2_t <b><b>vsub_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_s32" type="checkbox"><label for="vsubq_s32"><div>int32x4_t <b><b>vsubq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_s64" type="checkbox"><label for="vsub_s64"><div>int64x1_t <b><b>vsub_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_s64" type="checkbox"><label for="vsubq_s64"><div>int64x2_t <b><b>vsubq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_u8" type="checkbox"><label for="vsub_u8"><div>uint8x8_t <b><b>vsub_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_u8" type="checkbox"><label for="vsubq_u8"><div>uint8x16_t <b><b>vsubq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_u16" type="checkbox"><label for="vsub_u16"><div>uint16x4_t <b><b>vsub_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_u16" type="checkbox"><label for="vsubq_u16"><div>uint16x8_t <b><b>vsubq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_u32" type="checkbox"><label for="vsub_u32"><div>uint32x2_t <b><b>vsub_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_u32" type="checkbox"><label for="vsubq_u32"><div>uint32x4_t <b><b>vsubq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
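A minimal usage sketch, assuming Rust's std::arch::aarch64 module on an AArch64 target (input values are illustrative): the subtraction is modular, and the other integer vsub/vsubq widths behave the same way.
<pre class="codeblock">// vsubq_u32 lowers to SUB Vd.4S,Vn.4S,Vm.4S: lane-wise modular subtraction.
#[cfg(target_arch = "aarch64")]
fn vsubq_u32_sketch() {
    use std::arch::aarch64::*;
    unsafe {
        let a = vld1q_u32([10u32, 20, 0, 5].as_ptr()); // a -> Vn.4S
        let b = vld1q_u32([1u32, 2, 1, 5].as_ptr());   // b -> Vm.4S
        let d = vsubq_u32(a, b);                       // Vd.4S -> result
        let mut out = [0u32; 4];
        vst1q_u32(out.as_mut_ptr(), d);
        assert_eq!(out, [9, 18, u32::MAX, 0]);         // lane 2: 0 - 1 wraps
    }
}</pre>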
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_u64" type="checkbox"><label for="vsub_u64"><div>uint64x1_t <b><b>vsub_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_u64" type="checkbox"><label for="vsubq_u64"><div>uint64x2_t <b><b>vsubq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_f32" type="checkbox"><label for="vsub_f32"><div>float32x2_t <b><b>vsub_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Subtract (vector). This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fsub-vector-floating-point-subtract-vector">FSUB</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubq_f32" type="checkbox"><label for="vsubq_f32"><div>float32x4_t <b><b>vsubq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Subtract (vector). This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fsub-vector-floating-point-subtract-vector">FSUB</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
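A usage sketch, assuming Rust's std::arch::aarch64 on an AArch64 target: vsubq_f32 performs IEEE 754 subtraction per lane, rounded according to FPCR (vsub_f32 is the 64-bit-vector form).
<pre class="codeblock">// vsubq_f32 lowers to FSUB Vd.4S,Vn.4S,Vm.4S: lane-wise floating-point subtract.
#[cfg(target_arch = "aarch64")]
fn vsubq_f32_sketch() {
    use std::arch::aarch64::*;
    unsafe {
        let a = vld1q_f32([1.5f32, 2.0, 0.0, -1.0].as_ptr());
        let b = vld1q_f32([0.5f32, 2.0, 1.0, -1.0].as_ptr());
        let d = vsubq_f32(a, b);
        let mut out = [0.0f32; 4];
        vst1q_f32(out.as_mut_ptr(), d);
        assert_eq!(out, [1.0, 0.0, -1.0, 0.0]); // exact results, no rounding here
    }
}</pre>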
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsub_f64" type="checkbox"><label for="vsub_f64"><div>float64x1_t <b><b>vsub_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Subtract (vector). This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fsub-vector-floating-point-subtract-vector">FSUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubq_f64" type="checkbox"><label for="vsubq_f64"><div>float64x2_t <b><b>vsubq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Subtract (vector). This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fsub-vector-floating-point-subtract-vector">FSUB</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubd_s64" type="checkbox"><label for="vsubd_s64"><div>int64_t <b><b>vsubd_s64</b></b> (int64_t a, int64_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
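A usage sketch, assuming Rust's std::arch::aarch64 on an AArch64 target: vsubd_s64 (and vsubd_u64 below) operate on a single doubleword, so elements is 1 in the pseudocode above, and the subtraction wraps rather than saturates.
<pre class="codeblock">// vsubd_s64 lowers to the scalar SUB Dd,Dn,Dm and wraps modulo 2^64.
#[cfg(target_arch = "aarch64")]
fn vsubd_s64_sketch() {
    use std::arch::aarch64::*;
    unsafe {
        assert_eq!(vsubd_s64(5, 7), -2);
        // Same result as i64::wrapping_sub; vqsubd_s64 is the saturating form.
        assert_eq!(vsubd_s64(i64::MIN, 1), i64::MAX);
    }
}</pre>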
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubd_u64" type="checkbox"><label for="vsubd_u64"><div>uint64_t <b><b>vsubd_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract (vector). This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sub-vector-subtract-vector">SUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 - element2;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubl_s8" type="checkbox"><label for="vsubl_s8"><div>int16x8_t <b><b>vsubl_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubl-ssubl2-signed-subtract-long">SSUBL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
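A usage sketch, assuming Rust's std::arch::aarch64 on an AArch64 target: because the destination lanes are twice as wide, the difference is always representable and vsubl never wraps; the other vsubl_s*/vsubl_u* widths follow the same pattern.
<pre class="codeblock">// vsubl_s8 lowers to SSUBL Vd.8H,Vn.8B,Vm.8B: widen to i16, then subtract.
#[cfg(target_arch = "aarch64")]
fn vsubl_s8_sketch() {
    use std::arch::aarch64::*;
    unsafe {
        let a = vdup_n_s8(-128);       // a -> Vn.8B, every lane -128
        let b = vdup_n_s8(127);        // b -> Vm.8B, every lane 127
        let d = vsubl_s8(a, b);        // Vd.8H -> result
        let mut out = [0i16; 8];
        vst1q_s16(out.as_mut_ptr(), d);
        assert_eq!(out, [-255i16; 8]); // would wrap in i8, exact in i16
    }
}</pre>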
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubl_s16" type="checkbox"><label for="vsubl_s16"><div>int32x4_t <b><b>vsubl_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubl-ssubl2-signed-subtract-long">SSUBL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubl_s32" type="checkbox"><label for="vsubl_s32"><div>int64x2_t <b><b>vsubl_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubl-ssubl2-signed-subtract-long">SSUBL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubl_u8" type="checkbox"><label for="vsubl_u8"><div>uint16x8_t <b><b>vsubl_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubl-usubl2-unsigned-subtract-long">USUBL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubl_u16" type="checkbox"><label for="vsubl_u16"><div>uint32x4_t <b><b>vsubl_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubl-usubl2-unsigned-subtract-long">USUBL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubl_u32" type="checkbox"><label for="vsubl_u32"><div>uint64x2_t <b><b>vsubl_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubl-usubl2-unsigned-subtract-long">USUBL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubl_high_s8" type="checkbox"><label for="vsubl_high_s8"><div>int16x8_t <b><b>vsubl_high_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubl-ssubl2-signed-subtract-long">SSUBL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubl_high_s16" type="checkbox"><label for="vsubl_high_s16"><div>int32x4_t <b><b>vsubl_high_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubl-ssubl2-signed-subtract-long">SSUBL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubl_high_s32" type="checkbox"><label for="vsubl_high_s32"><div>int64x2_t <b><b>vsubl_high_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubl-ssubl2-signed-subtract-long">SSUBL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubl_high_u8" type="checkbox"><label for="vsubl_high_u8"><div>uint16x8_t <b><b>vsubl_high_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubl-usubl2-unsigned-subtract-long">USUBL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubl_high_u16" type="checkbox"><label for="vsubl_high_u16"><div>uint32x4_t <b><b>vsubl_high_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubl-usubl2-unsigned-subtract-long">USUBL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubl_high_u32" type="checkbox"><label for="vsubl_high_u32"><div>uint64x2_t <b><b>vsubl_high_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Long. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubl-usubl2-unsigned-subtract-long">USUBL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubw_s8" type="checkbox"><label for="vsubw_s8"><div>int16x8_t <b><b>vsubw_s8</b></b> (int16x8_t a, int8x8_t b)<span class="right">Signed subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubw-ssubw2-signed-subtract-wide">SSUBW</a> Vd.8H,Vn.8H,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubw_s16" type="checkbox"><label for="vsubw_s16"><div>int32x4_t <b><b>vsubw_s16</b></b> (int32x4_t a, int16x4_t b)<span class="right">Signed subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubw-ssubw2-signed-subtract-wide">SSUBW</a> Vd.4S,Vn.4S,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubw_s32" type="checkbox"><label for="vsubw_s32"><div>int64x2_t <b><b>vsubw_s32</b></b> (int64x2_t a, int32x2_t b)<span class="right">Signed subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubw-ssubw2-signed-subtract-wide">SSUBW</a> Vd.2D,Vn.2D,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubw_u8" type="checkbox"><label for="vsubw_u8"><div>uint16x8_t <b><b>vsubw_u8</b></b> (uint16x8_t a, uint8x8_t b)<span class="right">Unsigned subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubw-usubw2-unsigned-subtract-wide">USUBW</a> Vd.8H,Vn.8H,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubw_u16" type="checkbox"><label for="vsubw_u16"><div>uint32x4_t <b><b>vsubw_u16</b></b> (uint32x4_t a, uint16x4_t b)<span class="right">Unsigned subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubw-usubw2-unsigned-subtract-wide">USUBW</a> Vd.4S,Vn.4S,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubw_u32" type="checkbox"><label for="vsubw_u32"><div>uint64x2_t <b><b>vsubw_u32</b></b> (uint64x2_t a, uint32x2_t b)<span class="right">Unsigned subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubw-usubw2-unsigned-subtract-wide">USUBW</a> Vd.2D,Vn.2D,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubw_high_s8" type="checkbox"><label for="vsubw_high_s8"><div>int16x8_t <b><b>vsubw_high_s8</b></b> (int16x8_t a, int8x16_t b)<span class="right">Signed subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubw-ssubw2-signed-subtract-wide">SSUBW2</a> Vd.8H,Vn.8H,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubw_high_s16" type="checkbox"><label for="vsubw_high_s16"><div>int32x4_t <b><b>vsubw_high_s16</b></b> (int32x4_t a, int16x8_t b)<span class="right">Signed subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubw-ssubw2-signed-subtract-wide">SSUBW2</a> Vd.4S,Vn.4S,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubw_high_s32" type="checkbox"><label for="vsubw_high_s32"><div>int64x2_t <b><b>vsubw_high_s32</b></b> (int64x2_t a, int32x4_t b)<span class="right">Signed subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssubw-ssubw2-signed-subtract-wide">SSUBW2</a> Vd.2D,Vn.2D,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
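+ <h4>Example</h4> <p class="aml">As a usage sketch (illustrative only, not part of the ARM reference): the high-half form pairs with the low-half intrinsic vsubw_s32 to apply the widening subtraction to all four lanes of a 128-bit source vector. A minimal C fragment, assuming the standard arm_neon.h header; the function name is hypothetical:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Subtract all four 32-bit lanes of b, widened to 64 bits, from two
+   64-bit accumulators: lanes 0..1 via VSUBW, lanes 2..3 via VSUBW2. */
+void subw_all_lanes(int64x2_t acc_lo, int64x2_t acc_hi,
+                    int32x4_t b, int64x2_t out[2]) {
+    out[0] = vsubw_s32(acc_lo, vget_low_s32(b)); /* low half of b  */
+    out[1] = vsubw_high_s32(acc_hi, b);          /* high half of b */
+}
+</pre>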
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubw_high_u8" type="checkbox"><label for="vsubw_high_u8"><div>uint16x8_t <b><b>vsubw_high_u8</b></b> (uint16x8_t a, uint8x16_t b)<span class="right">Unsigned subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubw-usubw2-unsigned-subtract-wide">USUBW2</a> Vd.8H,Vn.8H,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubw_high_u16" type="checkbox"><label for="vsubw_high_u16"><div>uint32x4_t <b><b>vsubw_high_u16</b></b> (uint32x4_t a, uint16x8_t b)<span class="right">Unsigned subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubw-usubw2-unsigned-subtract-wide">USUBW2</a> Vd.4S,Vn.4S,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubw_high_u32" type="checkbox"><label for="vsubw_high_u32"><div>uint64x2_t <b><b>vsubw_high_u32</b></b> (uint64x2_t a, uint32x4_t b)<span class="right">Unsigned subtract wide</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Subtract Wide. This instruction subtracts each vector element in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the result in a vector, and writes the vector to the SIMD&amp;FP destination register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usubw-usubw2-unsigned-subtract-wide">USUBW2</a> Vd.2D,Vn.2D,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+integer sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = sum&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
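+ <h4>Example</h4> <p class="aml">A scalar reading of the Operation pseudocode above (an illustrative model, not the intrinsic itself): each upper-half lane of b is zero-extended to 64 bits, subtracted from the corresponding lane of a, and the difference wraps modulo 2^64.</p>
+<pre class="codeblock">#include &lt;stdint.h&gt;
+
+/* Per-lane model of vsubw_high_u32: b's upper half is lanes 2..3. */
+static void vsubw_high_u32_model(const uint64_t a[2],
+                                 const uint32_t b[4],
+                                 uint64_t r[2]) {
+    for (int e = 0; e &lt; 2; e++)
+        r[e] = a[e] - (uint64_t)b[e + 2];
+}
+</pre>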
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vhsub_s8" type="checkbox"><label for="vhsub_s8"><div>int8x8_t <b><b>vhsub_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Subtract. This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shsub-signed-halving-subtract">SHSUB</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
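+ <h4>Example</h4> <p class="aml">A per-lane scalar model of the Operation pseudocode above (illustrative only): the difference is formed in a wider type and then shifted right one bit, so the halving subtract never overflows.</p>
+<pre class="codeblock">#include &lt;stdint.h&gt;
+
+/* One lane of vhsub_s8, e.g. a = 127, b = -128 yields 127. */
+static int8_t vhsub_s8_lane(int8_t a, int8_t b) {
+    return (int8_t)(((int16_t)a - (int16_t)b) &gt;&gt; 1);
+}
+</pre>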
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsubq_s8" type="checkbox"><label for="vhsubq_s8"><div>int8x16_t <b><b>vhsubq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Subtract. This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shsub-signed-halving-subtract">SHSUB</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsub_s16" type="checkbox"><label for="vhsub_s16"><div>int16x4_t <b><b>vhsub_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Subtract. This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shsub-signed-halving-subtract">SHSUB</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsubq_s16" type="checkbox"><label for="vhsubq_s16"><div>int16x8_t <b><b>vhsubq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Subtract. This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shsub-signed-halving-subtract">SHSUB</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsub_s32" type="checkbox"><label for="vhsub_s32"><div>int32x2_t <b><b>vhsub_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Subtract. This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shsub-signed-halving-subtract">SHSUB</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsubq_s32" type="checkbox"><label for="vhsubq_s32"><div>int32x4_t <b><b>vhsubq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Halving Subtract. This instruction subtracts the elements in the vector in the second source SIMD&amp;FP register from the corresponding elements in the vector in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into elements of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shsub-signed-halving-subtract">SHSUB</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsub_u8" type="checkbox"><label for="vhsub_u8"><div>uint8x8_t <b><b>vhsub_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Subtract. This instruction subtracts the vector elements in the second source SIMD&amp;FP register from the corresponding vector elements in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhsub-unsigned-halving-subtract">UHSUB</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
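+ <h4>Example</h4> <p class="aml">A per-lane scalar model of the Operation pseudocode above (illustrative only): although the operands are unsigned, the difference may be negative, so it is halved with an arithmetic shift before truncation; for example a lane with a = 0, b = 1 yields 0xFF.</p>
+<pre class="codeblock">#include &lt;stdint.h&gt;
+
+/* One lane of vhsub_u8. */
+static uint8_t vhsub_u8_lane(uint8_t a, uint8_t b) {
+    return (uint8_t)(((int16_t)a - (int16_t)b) &gt;&gt; 1);
+}
+</pre>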
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsubq_u8" type="checkbox"><label for="vhsubq_u8"><div>uint8x16_t <b><b>vhsubq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Subtract. This instruction subtracts the vector elements in the second source SIMD&amp;FP register from the corresponding vector elements in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhsub-unsigned-halving-subtract">UHSUB</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsub_u16" type="checkbox"><label for="vhsub_u16"><div>uint16x4_t <b><b>vhsub_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Subtract. This instruction subtracts the vector elements in the second source SIMD&amp;FP register from the corresponding vector elements in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhsub-unsigned-halving-subtract">UHSUB</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsubq_u16" type="checkbox"><label for="vhsubq_u16"><div>uint16x8_t <b><b>vhsubq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Subtract. This instruction subtracts the vector elements in the second source SIMD&amp;FP register from the corresponding vector elements in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhsub-unsigned-halving-subtract">UHSUB</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsub_u32" type="checkbox"><label for="vhsub_u32"><div>uint32x2_t <b><b>vhsub_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Subtract. This instruction subtracts the vector elements in the second source SIMD&amp;FP register from the corresponding vector elements in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhsub-unsigned-halving-subtract">UHSUB</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vhsubq_u32" type="checkbox"><label for="vhsubq_u32"><div>uint32x4_t <b><b>vhsubq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned halving subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Halving Subtract. This instruction subtracts the vector elements in the second source SIMD&amp;FP register from the corresponding vector elements in the first source SIMD&amp;FP register, shifts each result right one bit, places each result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uhsub-unsigned-halving-subtract">UHSUB</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = diff&lt;esize:1&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsub_s8" type="checkbox"><label for="vqsub_s8"><div>int8x8_t <b>vqsub_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubq_s8" type="checkbox"><label for="vqsubq_s8"><div>int8x16_t <b>vqsubq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsub_s16" type="checkbox"><label for="vqsub_s16"><div>int16x4_t <b>vqsub_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubq_s16" type="checkbox"><label for="vqsubq_s16"><div>int16x8_t <b>vqsubq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsub_s32" type="checkbox"><label for="vqsub_s32"><div>int32x2_t <b>vqsub_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubq_s32" type="checkbox"><label for="vqsubq_s32"><div>int32x4_t <b>vqsubq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsub_s64" type="checkbox"><label for="vqsub_s64"><div>int64x1_t <b>vqsub_s64</b> (int64x1_t a, int64x1_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubq_s64" type="checkbox"><label for="vqsubq_s64"><div>int64x2_t <b>vqsubq_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsub_u8" type="checkbox"><label for="vqsub_u8"><div>uint8x8_t <b>vqsub_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubq_u8" type="checkbox"><label for="vqsubq_u8"><div>uint8x16_t <b>vqsubq_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsub_u16" type="checkbox"><label for="vqsub_u16"><div>uint16x4_t <b>vqsub_u16</b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubq_u16" type="checkbox"><label for="vqsubq_u16"><div>uint16x8_t <b>vqsubq_u16</b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsub_u32" type="checkbox"><label for="vqsub_u32"><div>uint32x2_t <b>vqsub_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubq_u32" type="checkbox"><label for="vqsubq_u32"><div>uint32x4_t <b>vqsubq_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsub_u64" type="checkbox"><label for="vqsub_u64"><div>uint64x1_t <b><b>vqsub_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubq_u64" type="checkbox"><label for="vqsubq_u64"><div>uint64x2_t <b><b>vqsubq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqsubb_s8" type="checkbox"><label for="vqsubb_s8"><div>int8_t <b><b>vqsubb_s8</b></b> (int8_t a, int8_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Bd,Bn,Bm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+b &rarr; Bm </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqsubh_s16" type="checkbox"><label for="vqsubh_s16"><div>int16_t <b><b>vqsubh_s16</b></b> (int16_t a, int16_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqsubs_s32" type="checkbox"><label for="vqsubs_s32"><div>int32_t <b><b>vqsubs_s32</b></b> (int32_t a, int32_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqsubd_s64" type="checkbox"><label for="vqsubd_s64"><div>int64_t <b><b>vqsubd_s64</b></b> (int64_t a, int64_t b)<span class="right">Signed saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqsub-signed-saturating-subtract">SQSUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqsubb_u8" type="checkbox"><label for="vqsubb_u8"><div>uint8_t <b><b>vqsubb_u8</b></b> (uint8_t a, uint8_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Bd,Bn,Bm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+b &rarr; Bm </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqsubh_u16" type="checkbox"><label for="vqsubh_u16"><div>uint16_t <b><b>vqsubh_u16</b></b> (uint16_t a, uint16_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqsubs_u32" type="checkbox"><label for="vqsubs_u32"><div>uint32_t <b><b>vqsubs_u32</b></b> (uint32_t a, uint32_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqsubd_u64" type="checkbox"><label for="vqsubd_u64"><div>uint64_t <b><b>vqsubd_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Unsigned saturating subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Subtract. This instruction subtracts the element values of the second source SIMD&amp;FP register from the corresponding element values of the first source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqsub-unsigned-saturating-subtract">UQSUB</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer diff;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ diff = element1 - element2;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(diff, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_s16" type="checkbox"><label for="vsubhn_s16"><div>int8x8_t <b><b>vsubhn_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN</a> Vd.8B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_s32" type="checkbox"><label for="vsubhn_s32"><div>int16x4_t <b><b>vsubhn_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN</a> Vd.4H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_s64" type="checkbox"><label for="vsubhn_s64"><div>int32x2_t <b><b>vsubhn_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN</a> Vd.2S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_u16" type="checkbox"><label for="vsubhn_u16"><div>uint8x8_t <b><b>vsubhn_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN</a> Vd.8B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_u32" type="checkbox"><label for="vsubhn_u32"><div>uint16x4_t <b><b>vsubhn_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN</a> Vd.4H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_u64" type="checkbox"><label for="vsubhn_u64"><div>uint32x2_t <b><b>vsubhn_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN</a> Vd.2S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_high_s16" type="checkbox"><label for="vsubhn_high_s16"><div>int8x16_t <b><b>vsubhn_high_s16</b></b> (int8x8_t r, int16x8_t a, int16x8_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN2</a> Vd.16B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_high_s32" type="checkbox"><label for="vsubhn_high_s32"><div>int16x8_t <b><b>vsubhn_high_s32</b></b> (int16x4_t r, int32x4_t a, int32x4_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN2</a> Vd.8H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_high_s64" type="checkbox"><label for="vsubhn_high_s64"><div>int32x4_t <b><b>vsubhn_high_s64</b></b> (int32x2_t r, int64x2_t a, int64x2_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN2</a> Vd.4S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_high_u16" type="checkbox"><label for="vsubhn_high_u16"><div>uint8x16_t <b><b>vsubhn_high_u16</b></b> (uint8x8_t r, uint16x8_t a, uint16x8_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN2</a> Vd.16B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_high_u32" type="checkbox"><label for="vsubhn_high_u32"><div>uint16x8_t <b><b>vsubhn_high_u32</b></b> (uint16x4_t r, uint32x4_t a, uint32x4_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN2</a> Vd.8H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsubhn_high_u64" type="checkbox"><label for="vsubhn_high_u64"><div>uint32x4_t <b><b>vsubhn_high_u64</b></b> (uint32x2_t r, uint64x2_t a, uint64x2_t b)<span class="right">Subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Subtract returning High Narrow. This instruction subtracts each vector element in the second source SIMD&amp;FP register from the corresponding vector element in the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/subhn-subhn2-subtract-returning-high-narrow">SUBHN2</a> Vd.4S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_s16" type="checkbox"><label for="vrsubhn_s16"><div>int8x8_t <b><b>vrsubhn_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN</a> Vd.8B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_s32" type="checkbox"><label for="vrsubhn_s32"><div>int16x4_t <b><b>vrsubhn_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN</a> Vd.4H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_s64" type="checkbox"><label for="vrsubhn_s64"><div>int32x2_t <b><b>vrsubhn_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN</a> Vd.2S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_u16" type="checkbox"><label for="vrsubhn_u16"><div>uint8x8_t <b><b>vrsubhn_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN</a> Vd.8B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_u32" type="checkbox"><label for="vrsubhn_u32"><div>uint16x4_t <b><b>vrsubhn_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN</a> Vd.4H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_u64" type="checkbox"><label for="vrsubhn_u64"><div>uint32x2_t <b><b>vrsubhn_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN</a> Vd.2S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_high_s16" type="checkbox"><label for="vrsubhn_high_s16"><div>int8x16_t <b><b>vrsubhn_high_s16</b></b> (int8x8_t r, int16x8_t a, int16x8_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN2</a> Vd.16B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_high_s32" type="checkbox"><label for="vrsubhn_high_s32"><div>int16x8_t <b><b>vrsubhn_high_s32</b></b> (int16x4_t r, int32x4_t a, int32x4_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN2</a> Vd.8H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_high_s64" type="checkbox"><label for="vrsubhn_high_s64"><div>int32x4_t <b>vrsubhn_high_s64</b> (int32x2_t r, int64x2_t a, int64x2_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the upper half of the destination SIMD&amp;FP register, without affecting the other bits of the register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN2</a> Vd.4S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_high_u16" type="checkbox"><label for="vrsubhn_high_u16"><div>uint8x16_t <b>vrsubhn_high_u16</b> (uint8x8_t r, uint16x8_t a, uint16x8_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the upper half of the destination SIMD&amp;FP register, without affecting the other bits of the register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN2</a> Vd.16B,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_high_u32" type="checkbox"><label for="vrsubhn_high_u32"><div>uint16x8_t <b>vrsubhn_high_u32</b> (uint16x4_t r, uint32x4_t a, uint32x4_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the upper half of the destination SIMD&amp;FP register, without affecting the other bits of the register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN2</a> Vd.8H,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsubhn_high_u64" type="checkbox"><label for="vrsubhn_high_u64"><div>uint32x4_t <b>vrsubhn_high_u64</b> (uint32x2_t r, uint64x2_t a, uint64x2_t b)<span class="right">Rounding subtract returning high narrow</span></div></label><article> <h4>Description</h4><p class="aml">Rounding Subtract returning High Narrow. This instruction subtracts each vector element of the second source SIMD&amp;FP register from the corresponding vector element of the first source SIMD&amp;FP register, places the most significant half of the result into a vector, and writes the vector to the upper half of the destination SIMD&amp;FP register, without affecting the other bits of the register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rsubhn-rsubhn2-rounding-subtract-returning-high-narrow">RSUBHN2</a> Vd.4S,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(2*datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if round then 1 &lt;&lt; (esize - 1) else 0;
+bits(2*esize) element1;
+bits(2*esize) element2;
+bits(2*esize) sum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 2*esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, 2*esize];
+ if sub_op then
+ sum = element1 - element2;
+ else
+ sum = element1 + element2;
+ sum = sum + round_const;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = sum&lt;2*esize-1:esize&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceq_s8" type="checkbox"><label for="vceq_s8"><div>uint8x8_t <b>vceq_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_s8" type="checkbox"><label for="vceqq_s8"><div>uint8x16_t <b>vceqq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_s16" type="checkbox"><label for="vceq_s16"><div>uint16x4_t <b>vceq_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_s16" type="checkbox"><label for="vceqq_s16"><div>uint16x8_t <b>vceqq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_s32" type="checkbox"><label for="vceq_s32"><div>uint32x2_t <b>vceq_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_s32" type="checkbox"><label for="vceqq_s32"><div>uint32x4_t <b>vceqq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_u8" type="checkbox"><label for="vceq_u8"><div>uint8x8_t <b>vceq_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_u8" type="checkbox"><label for="vceqq_u8"><div>uint8x16_t <b>vceqq_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_u16" type="checkbox"><label for="vceq_u16"><div>uint16x4_t <b>vceq_u16</b> (uint16x4_t a, uint16x4_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the values are equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_u16" type="checkbox"><label for="vceqq_u16"><div>uint16x8_t <b><b>vceqq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares each vector element from the first source SIMD&amp;FP register with the corresponding vector element from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_u32" type="checkbox"><label for="vceq_u32"><div>uint32x2_t <b><b>vceq_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares each vector element from the first source SIMD&amp;FP register with the corresponding vector element from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_u32" type="checkbox"><label for="vceqq_u32"><div>uint32x4_t <b><b>vceqq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares each vector element from the first source SIMD&amp;FP register with the corresponding vector element from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_f32" type="checkbox"><label for="vceq_f32"><div>uint32x2_t <b><b>vceq_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point compare equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal (vector). This instruction compares each floating-point value from the first source SIMD&amp;FP register with the corresponding floating-point value from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-zero-floating-point-compare-equal-to-zero-vector">FCMEQ</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_f32" type="checkbox"><label for="vceqq_f32"><div>uint32x4_t <b><b>vceqq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point compare equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal (vector). This instruction compares each floating-point value from the first source SIMD&amp;FP register with the corresponding floating-point value from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-zero-floating-point-compare-equal-to-zero-vector">FCMEQ</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_p8" type="checkbox"><label for="vceq_p8"><div>uint8x8_t <b><b>vceq_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares each vector element from the first source SIMD&amp;FP register with the corresponding vector element from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_p8" type="checkbox"><label for="vceqq_p8"><div>uint8x16_t <b><b>vceqq_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares each vector element from the first source SIMD&amp;FP register with the corresponding vector element from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_s64" type="checkbox"><label for="vceq_s64"><div>uint64x1_t <b><b>vceq_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares each vector element from the first source SIMD&amp;FP register with the corresponding vector element from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqq_s64" type="checkbox"><label for="vceqq_s64"><div>uint64x2_t <b><b>vceqq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares each vector element from the first source SIMD&amp;FP register with the corresponding vector element from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceq_u64" type="checkbox"><label for="vceq_u64"><div>uint64x1_t <b><b>vceq_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares each vector element from the first source SIMD&amp;FP register with the corresponding vector element from the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqq_u64" type="checkbox"><label for="vceqq_u64"><div>uint64x2_t <b><b>vceqq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-register-compare-bitwise-equal-vector">CMEQ</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    if and_test then
+        test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+    else
+        test_passed = (element1 == element2);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
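+ <h4>Example</h4> <p>A minimal C usage sketch. This example is an illustrative addition, not part of the ARM reference; the helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: lanes where a and b are equal become
+   0xFFFFFFFFFFFFFFFF; all other lanes become 0. */
+uint64x2_t eq_mask_u64(uint64x2_t a, uint64x2_t b) {
+    return vceqq_u64(a, b);
+}</pre>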
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceq_p64" type="checkbox"><label for="vceq_p64"><div>uint64x1_t <b><b>vceq_p64</b></b> (poly64x1_t a, poly64x1_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-register-compare-bitwise-equal-vector">CMEQ</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    if and_test then
+        test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+    else
+        test_passed = (element1 == element2);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
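+ <h4>Example</h4> <p>A minimal C sketch (illustrative, not from the ARM reference; a poly64-capable target, e.g. one built with -march=armv8-a+crypto, is assumed, and the helper name is hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: returns all-ones in the single 64-bit lane when
+   the poly64 values are bit-identical, otherwise zero. */
+uint64x1_t poly64_eq(poly64x1_t a, poly64x1_t b) {
+    return vceq_p64(a, b);
+}</pre>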
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqq_p64" type="checkbox"><label for="vceqq_p64"><div>uint64x2_t <b><b>vceqq_p64</b></b> (poly64x2_t a, poly64x2_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-register-compare-bitwise-equal-vector">CMEQ</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    if and_test then
+        test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+    else
+        test_passed = (element1 == element2);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
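+ <h4>Example</h4> <p>A minimal C sketch (illustrative, not from the ARM reference; a poly64-capable target is assumed and the helper name is hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: per-lane all-ones where the two poly64 lanes
+   are bit-identical, zero elsewhere. */
+uint64x2_t poly64x2_eq(poly64x2_t a, poly64x2_t b) {
+    return vceqq_p64(a, b);
+}</pre>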
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vceq_f64" type="checkbox"><label for="vceq_f64"><div>uint64x1_t <b><b>vceq_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point compare equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal (vector). This instruction compares each floating-point value from the first source SIMD&amp;FP register with the corresponding floating-point value in the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-register-floating-point-compare-equal-vector">FCMEQ</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    case comparison of
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
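+ <h4>Example</h4> <p>A minimal C sketch (illustrative, not from the ARM reference; the helper name is hypothetical). The comparison is an IEEE 754 quiet compare, so a NaN input makes the lane compare unequal.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: all-ones when a == b, zero otherwise
+   (zero as well when either input is NaN). */
+uint64x1_t f64_eq(float64x1_t a, float64x1_t b) {
+    return vceq_f64(a, b);
+}</pre>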
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqq_f64" type="checkbox"><label for="vceqq_f64"><div>uint64x2_t <b><b>vceqq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point compare equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal (vector). This instruction compares each floating-point value from the first source SIMD&amp;FP register with the corresponding floating-point value in the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-register-floating-point-compare-equal-vector">FCMEQ</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    case comparison of
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
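+ <h4>Example</h4> <p>A minimal C sketch showing the mask used for branch-free selection (illustrative, not from the ARM reference; the helper name is hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: where a == b take the lane from x, else from y. */
+float64x2_t select_where_eq(float64x2_t a, float64x2_t b,
+                            float64x2_t x, float64x2_t y) {
+    uint64x2_t mask = vceqq_f64(a, b);
+    return vbslq_f64(mask, x, y);
+}</pre>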
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqd_s64" type="checkbox"><label for="vceqd_s64"><div>uint64_t <b><b>vceqd_s64</b></b> (int64_t a, int64_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-register-compare-bitwise-equal-vector">CMEQ</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    if and_test then
+        test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+    else
+        test_passed = (element1 == element2);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
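+ <h4>Example</h4> <p>A minimal C sketch of the scalar form (illustrative, not from the ARM reference; the helper name is hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: UINT64_MAX when a == b, 0 otherwise, usable
+   directly as a branch-free mask. */
+uint64_t s64_eq_mask(int64_t a, int64_t b) {
+    return vceqd_s64(a, b);
+}</pre>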
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqd_u64" type="checkbox"><label for="vceqd_u64"><div>uint64_t <b><b>vceqd_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Compare bitwise equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-register-compare-bitwise-equal-vector">CMEQ</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    if and_test then
+        test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+    else
+        test_passed = (element1 == element2);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
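+ <h4>Example</h4> <p>A minimal C sketch using the all-ones/all-zeros result as a mask (illustrative, not from the ARM reference; the helper name is hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: keep v only when key == want, without a branch. */
+uint64_t keep_if_eq(uint64_t key, uint64_t want, uint64_t v) {
+    return vceqd_u64(key, want) &amp; v;
+}</pre>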
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqs_f32" type="checkbox"><label for="vceqs_f32"><div>uint32_t <b><b>vceqs_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point compare equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal (vector). This instruction compares each floating-point value from the first source SIMD&amp;FP register with the corresponding floating-point value in the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-register-floating-point-compare-equal-vector">FCMEQ</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    case comparison of
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
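+ <h4>Example</h4> <p>A minimal C sketch of the scalar single-precision form (illustrative, not from the ARM reference; the helper name is hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: 0xFFFFFFFF when a == b (false for NaN inputs,
+   as with any IEEE quiet equality compare), 0 otherwise. */
+uint32_t f32_eq_mask(float32_t a, float32_t b) {
+    return vceqs_f32(a, b);
+}</pre>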
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqd_f64" type="checkbox"><label for="vceqd_f64"><div>uint64_t <b><b>vceqd_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point compare equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal (vector). This instruction compares each floating-point value from the first source SIMD&amp;FP register with the corresponding floating-point value in the second source SIMD&amp;FP register, and if the comparison is equal sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-register-floating-point-compare-equal-vector">FCMEQ</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+    element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+    case comparison of
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+        when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+    <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
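+ <h4>Example</h4> <p>A minimal C sketch of the scalar double-precision form (illustrative, not from the ARM reference; the helper name is hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: UINT64_MAX when a == b, 0 otherwise (and 0
+   when either input is NaN). */
+uint64_t f64_eq_mask(float64_t a, float64_t b) {
+    return vceqd_f64(a, b);
+}</pre>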
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_s8" type="checkbox"><label for="vceqz_s8"><div>uint8x8_t <b><b>vceqz_s8</b></b> (int8x8_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8B,Vn.8B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_s8" type="checkbox"><label for="vceqzq_s8"><div>uint8x16_t <b><b>vceqzq_s8</b></b> (int8x16_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.16B,Vn.16B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
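+<p>Editor's sketch, not part of the upstream ARM reference: a minimal C usage example for vceqzq_s8, assuming an AArch64 toolchain that provides &lt;arm_neon.h&gt;. Array contents and variable names are illustrative only.</p>
+<pre>#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t in[16] = {0, 1, -1, 0, 5, 0, -128, 127,
+                     0, 2, 3, 0, 0, 9, -7, 0};
+    int8x16_t a = vld1q_s8(in);
+    uint8x16_t mask = vceqzq_s8(a);  /* lane = 0xFF where in[i] == 0, else 0x00 */
+    uint8_t out[16];
+    vst1q_u8(out, mask);
+    for (int i = 0; i &lt; 16; i++)
+        printf("%02x ", out[i]);     /* prints ff 00 00 ff 00 ff 00 00 ... */
+    printf("\n");
+    return 0;
+}
+</pre>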
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_s16" type="checkbox"><label for="vceqz_s16"><div>uint16x4_t <b><b>vceqz_s16</b></b> (int16x4_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.4H,Vn.4H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_s16" type="checkbox"><label for="vceqzq_s16"><div>uint16x8_t <b><b>vceqzq_s16</b></b> (int16x8_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8H,Vn.8H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
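+<p>Editor's sketch, not from the ARM reference: for CompareOp_EQ the Operation pseudocode above reduces to a simple per-lane test, which the following scalar C model of vceqzq_s16 makes explicit. The helper name is hypothetical.</p>
+<pre>#include &lt;stdint.h&gt;
+
+/* Scalar model of vceqzq_s16: each 16-bit lane of the result is
+   all-ones when the corresponding input lane equals zero. */
+static void vceqzq_s16_model(const int16_t a[8], uint16_t r[8]) {
+    for (int e = 0; e &lt; 8; e++)
+        r[e] = (a[e] == 0) ? 0xFFFFu : 0x0000u;
+}
+</pre>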
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_s32" type="checkbox"><label for="vceqz_s32"><div>uint32x2_t <b><b>vceqz_s32</b></b> (int32x2_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_s32" type="checkbox"><label for="vceqzq_s32"><div>uint32x4_t <b><b>vceqzq_s32</b></b> (int32x4_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_u8" type="checkbox"><label for="vceqz_u8"><div>uint8x8_t <b><b>vceqz_u8</b></b> (uint8x8_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8B,Vn.8B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
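+<p>Editor's note: the shared pseudocode applies SInt even for the unsigned variants; for an equality-with-zero test the signed interpretation of the bits does not change the outcome, so vceqz_u8 produces the same mask as its signed counterpart would on the same input. A small sketch, assuming &lt;arm_neon.h&gt; on AArch64 (the helper name is hypothetical), that uses the mask to count zero bytes:</p>
+<pre>#include &lt;arm_neon.h&gt;
+
+/* Count how many of the 8 input bytes are zero. */
+static unsigned count_zero_bytes(uint8x8_t a) {
+    uint8x8_t mask = vceqz_u8(a);                 /* 0xFF where a[i] == 0 */
+    uint8x8_t ones = vand_u8(mask, vdup_n_u8(1)); /* 0xFF -&gt; 1, 0x00 -&gt; 0 */
+    return vaddv_u8(ones);                        /* horizontal add (A64) */
+}
+</pre>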
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_u8" type="checkbox"><label for="vceqzq_u8"><div>uint8x16_t <b><b>vceqzq_u8</b></b> (uint8x16_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.16B,Vn.16B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_u16" type="checkbox"><label for="vceqz_u16"><div>uint16x4_t <b><b>vceqz_u16</b></b> (uint16x4_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.4H,Vn.4H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_u16" type="checkbox"><label for="vceqzq_u16"><div>uint16x8_t <b><b>vceqzq_u16</b></b> (uint16x8_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8H,Vn.8H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_u32" type="checkbox"><label for="vceqz_u32"><div>uint32x2_t <b><b>vceqz_u32</b></b> (uint32x2_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_u32" type="checkbox"><label for="vceqzq_u32"><div>uint32x4_t <b><b>vceqzq_u32</b></b> (uint32x4_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
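+<p>Editor's sketch, not from the ARM reference: the all-ones/all-zeros mask that vceqzq_u32 produces is directly usable with bitwise select. Assuming &lt;arm_neon.h&gt; on AArch64, with a hypothetical helper name, this replaces zero lanes with a fallback value:</p>
+<pre>#include &lt;arm_neon.h&gt;
+
+/* Where a lane of `a` is zero, take `fallback`; otherwise keep the lane. */
+static uint32x4_t replace_zeros(uint32x4_t a, uint32_t fallback) {
+    uint32x4_t mask = vceqzq_u32(a);  /* all-ones where a[i] == 0 */
+    return vbslq_u32(mask, vdupq_n_u32(fallback), a);
+}
+</pre>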
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_f32" type="checkbox"><label for="vceqz_f32"><div>uint32x2_t <b><b>vceqz_f32</b></b> (float32x2_t a)<span class="right">Floating-point compare equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-zero-floating-point-compare-equal-to-zero-vector">FCMEQ</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_f32" type="checkbox"><label for="vceqzq_f32"><div>uint32x4_t <b><b>vceqzq_f32</b></b> (float32x4_t a)<span class="right">Floating-point compare equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-zero-floating-point-compare-equal-to-zero-vector">FCMEQ</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_p8" type="checkbox"><label for="vceqz_p8"><div>uint8x8_t <b><b>vceqz_p8</b></b> (poly8x8_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.8B,Vn.8B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_p8" type="checkbox"><label for="vceqzq_p8"><div>uint8x16_t <b><b>vceqzq_p8</b></b> (poly8x16_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.16B,Vn.16B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_s64" type="checkbox"><label for="vceqz_s64"><div>uint64x1_t <b><b>vceqz_s64</b></b> (int64x1_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_s64" type="checkbox"><label for="vceqzq_s64"><div>uint64x2_t <b><b>vceqzq_s64</b></b> (int64x2_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_u64" type="checkbox"><label for="vceqz_u64"><div>uint64x1_t <b><b>vceqz_u64</b></b> (uint64x1_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_u64" type="checkbox"><label for="vceqzq_u64"><div>uint64x2_t <b><b>vceqzq_u64</b></b> (uint64x2_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqz_p64" type="checkbox"><label for="vceqz_p64"><div>uint64x1_t <b><b>vceqz_p64</b></b> (poly64x1_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_p64" type="checkbox"><label for="vceqzq_p64"><div>uint64x2_t <b><b>vceqzq_p64</b></b> (poly64x2_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vceqz_f64" type="checkbox"><label for="vceqz_f64"><div>uint64x1_t <b><b>vceqz_f64</b></b> (float64x1_t a)<span class="right">Floating-point compare equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-zero-floating-point-compare-equal-to-zero-vector">FCMEQ</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzq_f64" type="checkbox"><label for="vceqzq_f64"><div>uint64x2_t <b><b>vceqzq_f64</b></b> (float64x2_t a)<span class="right">Floating-point compare equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-zero-floating-point-compare-equal-to-zero-vector">FCMEQ</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzd_s64" type="checkbox"><label for="vceqzd_s64"><div>uint64_t <b><b>vceqzd_s64</b></b> (int64_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzd_u64" type="checkbox"><label for="vceqzd_u64"><div>uint64_t <b><b>vceqzd_u64</b></b> (uint64_t a)<span class="right">Compare bitwise equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmeq-zero-compare-bitwise-equal-to-zero-vector">CMEQ</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzs_f32" type="checkbox"><label for="vceqzs_f32"><div>uint32_t <b><b>vceqzs_f32</b></b> (float32_t a)<span class="right">Floating-point compare equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-zero-floating-point-compare-equal-to-zero-vector">FCMEQ</a> Sd,Sn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vceqzd_f64" type="checkbox"><label for="vceqzd_f64"><div>uint64_t <b><b>vceqzd_f64</b></b> (float64_t a)<span class="right">Floating-point compare equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmeq-zero-floating-point-compare-equal-to-zero-vector">FCMEQ</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcge_s8" type="checkbox"><label for="vcge_s8"><div>uint8x8_t <b><b>vcge_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element1 &gt; element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element1 &gt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element1 == element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element1 &lt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element1 &lt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_s8" type="checkbox"><label for="vcgeq_s8"><div>uint8x16_t <b><b>vcgeq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element1 &gt; element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element1 &gt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element1 == element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element1 &lt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element1 &lt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcge_s16" type="checkbox"><label for="vcge_s16"><div>uint16x4_t <b><b>vcge_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element1 &gt; element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element1 &gt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element1 == element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element1 &lt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element1 &lt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_s16" type="checkbox"><label for="vcgeq_s16"><div>uint16x8_t <b><b>vcgeq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element1 &gt; element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element1 &gt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element1 == element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element1 &lt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element1 &lt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcge_s32" type="checkbox"><label for="vcge_s32"><div>uint32x2_t <b><b>vcge_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element1 &gt; element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element1 &gt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element1 == element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element1 &lt;= element2;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element1 &lt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_s32" type="checkbox"><label for="vcgeq_s32"><div>uint32x4_t <b><b>vcgeq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcge_u8" type="checkbox"><label for="vcge_u8"><div>uint8x8_t <b><b>vcge_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_u8" type="checkbox"><label for="vcgeq_u8"><div>uint8x16_t <b><b>vcgeq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcge_u16" type="checkbox"><label for="vcge_u16"><div>uint16x4_t <b><b>vcge_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_u16" type="checkbox"><label for="vcgeq_u16"><div>uint16x8_t <b><b>vcgeq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcge_u32" type="checkbox"><label for="vcge_u32"><div>uint32x2_t <b><b>vcge_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_u32" type="checkbox"><label for="vcgeq_u32"><div>uint32x4_t <b><b>vcgeq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcge_f32" type="checkbox"><label for="vcge_f32"><div>uint32x2_t <b><b>vcge_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_f32" type="checkbox"><label for="vcgeq_f32"><div>uint32x4_t <b><b>vcgeq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcge_s64" type="checkbox"><label for="vcge_s64"><div>uint64x1_t <b><b>vcge_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_s64" type="checkbox"><label for="vcgeq_s64"><div>uint64x2_t <b><b>vcgeq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcge_u64" type="checkbox"><label for="vcge_u64"><div>uint64x1_t <b><b>vcge_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_u64" type="checkbox"><label for="vcgeq_u64"><div>uint64x2_t <b><b>vcgeq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcge_f64" type="checkbox"><label for="vcge_f64"><div>uint64x1_t <b><b>vcge_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgeq_f64" type="checkbox"><label for="vcgeq_f64"><div>uint64x2_t <b><b>vcgeq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcged_s64" type="checkbox"><label for="vcged_s64"><div>uint64_t <b><b>vcged_s64</b></b> (int64_t a, int64_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcged_u64" type="checkbox"><label for="vcged_u64"><div>uint64_t <b><b>vcged_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcges_f32" type="checkbox"><label for="vcges_f32"><div>uint32_t <b><b>vcges_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcged_f64" type="checkbox"><label for="vcged_f64"><div>uint64_t <b><b>vcged_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgez_s8" type="checkbox"><label for="vcgez_s8"><div>uint8x8_t <b><b>vcgez_s8</b></b> (int8x8_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Vd.8B,Vn.8B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezq_s8" type="checkbox"><label for="vcgezq_s8"><div>uint8x16_t <b><b>vcgezq_s8</b></b> (int8x16_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Vd.16B,Vn.16B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgez_s16" type="checkbox"><label for="vcgez_s16"><div>uint16x4_t <b><b>vcgez_s16</b></b> (int16x4_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Vd.4H,Vn.4H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezq_s16" type="checkbox"><label for="vcgezq_s16"><div>uint16x8_t <b><b>vcgezq_s16</b></b> (int16x8_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Vd.8H,Vn.8H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgez_s32" type="checkbox"><label for="vcgez_s32"><div>uint32x2_t <b><b>vcgez_s32</b></b> (int32x2_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezq_s32" type="checkbox"><label for="vcgezq_s32"><div>uint32x4_t <b><b>vcgezq_s32</b></b> (int32x4_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgez_s64" type="checkbox"><label for="vcgez_s64"><div>uint64x1_t <b><b>vcgez_s64</b></b> (int64x1_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezq_s64" type="checkbox"><label for="vcgezq_s64"><div>uint64x2_t <b><b>vcgezq_s64</b></b> (int64x2_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgez_f32" type="checkbox"><label for="vcgez_f32"><div>uint32x2_t <b><b>vcgez_f32</b></b> (float32x2_t a)<span class="right">Floating-point compare greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-zero-floating-point-compare-greater-than-or-equal-to-zero-vector">FCMGE</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezq_f32" type="checkbox"><label for="vcgezq_f32"><div>uint32x4_t <b><b>vcgezq_f32</b></b> (float32x4_t a)<span class="right">Floating-point compare greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-zero-floating-point-compare-greater-than-or-equal-to-zero-vector">FCMGE</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgez_f64" type="checkbox"><label for="vcgez_f64"><div>uint64x1_t <b><b>vcgez_f64</b></b> (float64x1_t a)<span class="right">Floating-point compare greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-zero-floating-point-compare-greater-than-or-equal-to-zero-vector">FCMGE</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezq_f64" type="checkbox"><label for="vcgezq_f64"><div>uint64x2_t <b><b>vcgezq_f64</b></b> (float64x2_t a)<span class="right">Floating-point compare greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-zero-floating-point-compare-greater-than-or-equal-to-zero-vector">FCMGE</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezd_s64" type="checkbox"><label for="vcgezd_s64"><div>uint64_t <b><b>vcgezd_s64</b></b> (int64_t a)<span class="right">Compare signed greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-zero-compare-signed-greater-than-or-equal-to-zero-vector">CMGE</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
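+<p class="aml">A minimal C usage sketch of this intrinsic, assuming an AArch64 toolchain that provides &lt;arm_neon.h&gt; (the snippet is illustrative and not part of the architecture pseudocode):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* vcgezd_s64 yields all ones when its argument is non-negative, all zeros otherwise. */
+    printf("%016llx\n", (unsigned long long)vcgezd_s64(42));   /* ffffffffffffffff */
+    printf("%016llx\n", (unsigned long long)vcgezd_s64(-42));  /* 0000000000000000 */
+    return 0;
+}</pre>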
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezs_f32" type="checkbox"><label for="vcgezs_f32"><div>uint32_t <b><b>vcgezs_f32</b></b> (float32_t a)<span class="right">Floating-point compare greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-zero-floating-point-compare-greater-than-or-equal-to-zero-vector">FCMGE</a> Sd,Sn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
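+<p class="aml">Because the test goes through FPCompareGE, a NaN input compares false and yields an all-zeros result, while negative zero compares equal to zero and yields all ones. A short C sketch, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    printf("%08x\n", vcgezs_f32(1.5f));      /* ffffffff */
+    printf("%08x\n", vcgezs_f32(-0.0f));     /* ffffffff: -0.0 equals +0.0 under IEEE 754 */
+    printf("%08x\n", vcgezs_f32(nanf("")));  /* 00000000: NaN is never greater or equal */
+    return 0;
+}</pre>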
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgezd_f64" type="checkbox"><label for="vcgezd_f64"><div>uint64_t <b><b>vcgezd_f64</b></b> (float64_t a)<span class="right">Floating-point compare greater than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-zero-floating-point-compare-greater-than-or-equal-to-zero-vector">FCMGE</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcle_s8" type="checkbox"><label for="vcle_s8"><div>uint8x8_t <b><b>vcle_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.8B,Vm.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
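+<p class="aml">Note how the Argument Preparation above maps a to Vn and b to Vm while the instruction is written CMGE Vd.8B,Vm.8B,Vn.8B: the less-than-or-equal intrinsic is obtained from greater-than-or-equal by swapping the operands. A minimal C sketch, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const int8_t av[8] = {-3, -1, 0, 1, 3, 5, 7, 9};
+    const int8_t bv[8] = { 0,  0, 0, 0, 0, 0, 0, 0};
+    /* Lane-wise test a &lt;= b: true lanes become 0xff, false lanes 0x00. */
+    uint8x8_t le = vcle_s8(vld1_s8(av), vld1_s8(bv));
+    uint8_t out[8];
+    vst1_u8(out, le);
+    for (int i = 0; i &lt; 8; i++)
+        printf("%02x ", out[i]);               /* ff ff ff 00 00 00 00 00 */
+    printf("\n");
+    return 0;
+}</pre>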
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcleq_s8" type="checkbox"><label for="vcleq_s8"><div>uint8x16_t <b><b>vcleq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.16B,Vm.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcle_s16" type="checkbox"><label for="vcle_s16"><div>uint16x4_t <b><b>vcle_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.4H,Vm.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcleq_s16" type="checkbox"><label for="vcleq_s16"><div>uint16x8_t <b><b>vcleq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.8H,Vm.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcle_s32" type="checkbox"><label for="vcle_s32"><div>uint32x2_t <b><b>vcle_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.2S,Vm.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcleq_s32" type="checkbox"><label for="vcleq_s32"><div>uint32x4_t <b><b>vcleq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.4S,Vm.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
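+<p class="aml">The all-ones/all-zeros lanes are intended to be consumed as bit masks. As one illustration (a sketch assuming an AArch64 toolchain with &lt;arm_neon.h&gt;), feeding the comparison result into the bitwise-select intrinsic vbslq_s32 picks the smaller lane from each pair, an elementwise minimum; in practice vminq_s32 does this directly, but the same masking pattern applies to selections with no dedicated instruction:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const int32_t av[4] = {1, 7, -4, 9};
+    const int32_t bv[4] = {2, 3, -8, 9};
+    int32x4_t a = vld1q_s32(av);
+    int32x4_t b = vld1q_s32(bv);
+    uint32x4_t a_le_b = vcleq_s32(a, b);          /* lane mask: a &lt;= b */
+    int32x4_t  lo     = vbslq_s32(a_le_b, a, b);  /* select a where mask set, else b */
+    int32_t out[4];
+    vst1q_s32(out, lo);
+    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 1 3 -8 9 */
+    return 0;
+}</pre>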
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcle_u8" type="checkbox"><label for="vcle_u8"><div>uint8x8_t <b><b>vcle_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.8B,Vm.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
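+<p class="aml">Here the lanes are compared as unsigned integers, so bit patterns that would be negative as int8_t rank above the positive ones. A short C sketch, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const uint8_t av[8] = {0, 1, 127, 128, 200, 255, 5, 5};
+    const uint8_t bv[8] = {0, 2, 128, 127, 255, 200, 5, 4};
+    uint8x8_t le = vcle_u8(vld1_u8(av), vld1_u8(bv));
+    uint8_t out[8];
+    vst1_u8(out, le);
+    /* 255 &lt;= 200 is false as unsigned, even though 0xff is -1 as int8_t. */
+    for (int i = 0; i &lt; 8; i++)
+        printf("%02x ", out[i]);               /* ff ff ff 00 ff 00 ff 00 */
+    printf("\n");
+    return 0;
+}</pre>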
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcleq_u8" type="checkbox"><label for="vcleq_u8"><div>uint8x16_t <b><b>vcleq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.16B,Vm.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
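+ <h4>Usage sketch</h4> <p>Illustrative only, not part of the ARM reference: a minimal C sketch assuming a toolchain that provides &lt;arm_neon.h&gt; (the helper name min_u8 is ours). It shows the all-ones/all-zeros lane mask feeding a bitwise select.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Per-lane unsigned minimum built from the vcleq_u8 mask: where
+   a[i] &lt;= b[i] the mask lane is 0xFF and vbslq_u8 selects a[i],
+   otherwise b[i]. Equivalent to vminq_u8(a, b). */
+uint8x16_t min_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t le = vcleq_u8(a, b);
+    return vbslq_u8(le, a, b);
+}</pre>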
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcle_u16" type="checkbox"><label for="vcle_u16"><div>uint16x4_t <b><b>vcle_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.4H,Vm.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcleq_u16" type="checkbox"><label for="vcleq_u16"><div>uint16x8_t <b><b>vcleq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.8H,Vm.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcle_u32" type="checkbox"><label for="vcle_u32"><div>uint32x2_t <b><b>vcle_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.2S,Vm.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
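+ <h4>Usage sketch</h4> <p>Illustrative only, not part of the ARM reference: a minimal C sketch assuming &lt;arm_neon.h&gt; (the helper name is ours). The 64-bit form yields two 32-bit mask lanes, each 0xFFFFFFFF or 0, which can be read back with vget_lane_u32.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdbool.h&gt;
+
+/* True when both 32-bit lanes of a are &lt;= the matching lanes of b. */
+bool both_lanes_le(uint32x2_t a, uint32x2_t b) {
+    uint32x2_t le = vcle_u32(a, b);
+    return vget_lane_u32(le, 0) &amp;&amp; vget_lane_u32(le, 1);
+}</pre>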
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcleq_u32" type="checkbox"><label for="vcleq_u32"><div>uint32x4_t <b><b>vcleq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.4S,Vm.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcle_f32" type="checkbox"><label for="vcle_f32"><div>uint32x2_t <b><b>vcle_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Vd.2S,Vm.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
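+ <h4>Usage sketch</h4> <p>Illustrative only, not part of the ARM reference: a minimal C sketch assuming &lt;arm_neon.h&gt; (the helper name is ours). The comparison follows IEEE 754 semantics, so a lane containing NaN compares false and yields an all-zero mask lane.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* le[i] is 0xFFFFFFFF where a[i] &lt;= b[i]; any lane involving NaN is 0. */
+uint32x2_t le_mask_f32(float32x2_t a, float32x2_t b) {
+    return vcle_f32(a, b);
+}</pre>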
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcleq_f32" type="checkbox"><label for="vcleq_f32"><div>uint32x4_t <b><b>vcleq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Vd.4S,Vm.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
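+ <h4>Usage sketch</h4> <p>Illustrative only, not part of the ARM reference: a minimal C sketch assuming an AArch64 target (for the vaddvq_u32 horizontal add) and &lt;arm_neon.h&gt;; the helper name is ours. Shifting each 32-bit mask lane right by 31 maps 0xFFFFFFFF to 1, so the horizontal add counts passing lanes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Number of lanes (0..4) where a[i] &lt;= b[i]. */
+uint32_t count_le_f32(float32x4_t a, float32x4_t b) {
+    uint32x4_t le  = vcleq_f32(a, b);      /* 0xFFFFFFFF or 0 per lane */
+    uint32x4_t one = vshrq_n_u32(le, 31);  /* 1 or 0 per lane */
+    return vaddvq_u32(one);                /* horizontal sum (A64 only) */
+}</pre>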
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcle_s64" type="checkbox"><label for="vcle_s64"><div>uint64x1_t <b><b>vcle_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
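+ <h4>Usage sketch</h4> <p>Illustrative only, not part of the ARM reference: a minimal C sketch assuming an AArch64 target (this intrinsic is A64-only) and &lt;arm_neon.h&gt;; the helper name is ours. The single 64-bit mask lane drives a branchless select.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Branchless signed 64-bit minimum via the vcle_s64 mask. */
+int64x1_t min_s64(int64x1_t a, int64x1_t b) {
+    uint64x1_t le = vcle_s64(a, b);  /* all-ones when a &lt;= b */
+    return vbsl_s64(le, a, b);
+}</pre>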
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcleq_s64" type="checkbox"><label for="vcleq_s64"><div>uint64x2_t <b><b>vcleq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Vd.2D,Vm.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcle_u64" type="checkbox"><label for="vcle_u64"><div>uint64x1_t <b><b>vcle_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcleq_u64" type="checkbox"><label for="vcleq_u64"><div>uint64x2_t <b><b>vcleq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vd.2D,Vm.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
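+ <h4>Usage sketch</h4> <p>Illustrative only, not part of the ARM reference: a minimal C sketch assuming an AArch64 target (this intrinsic is A64-only) and &lt;arm_neon.h&gt;; the helper name is ours. Each mask lane is UINT64_MAX or 0, so ANDing the lanes tests whether every lane passed.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdbool.h&gt;
+
+/* True when a[i] &lt;= b[i] holds in both 64-bit lanes. */
+bool all_le_u64(uint64x2_t a, uint64x2_t b) {
+    uint64x2_t le = vcleq_u64(a, b);
+    return (vgetq_lane_u64(le, 0) &amp; vgetq_lane_u64(le, 1)) != 0;
+}</pre>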
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcle_f64" type="checkbox"><label for="vcle_f64"><div>uint64x1_t <b><b>vcle_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if it is greater than or equal to the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcleq_f64" type="checkbox"><label for="vcleq_f64"><div>uint64x2_t <b><b>vcleq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction compares each floating-point value in the first source SIMD&amp;FP register with the corresponding floating-point value in the second source SIMD&amp;FP register and if the first value is greater than or equal to the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Vd.2D,Vm.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ test_passed = if cmp_eq then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
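+ <h4>Example</h4> <p>The following is an editorial sketch, not part of the ARM reference. It assumes Rust's <code>core::arch::aarch64</code> module, which exposes a same-named intrinsic on AArch64 targets with NEON enabled.</p>
+<pre class="codeblock">// Sketch: lane-wise a &lt;= b. Per the argument preparation above, the
+// intrinsic maps to FCMGE with the operands swapped (b &gt;= a).
+#[cfg(target_arch = "aarch64")]
+fn demo_vcleq_f64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vdupq_n_f64(1.0); // [1.0, 1.0]
+        let b = vdupq_n_f64(2.0); // [2.0, 2.0]
+        let mask = vcleq_f64(a, b); // all-ones lanes where a &lt;= b
+        assert_eq!(vgetq_lane_u64::&lt;0&gt;(mask), u64::MAX);
+        assert_eq!(vgetq_lane_u64::&lt;1&gt;(mask), u64::MAX);
+    }
+}</pre>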
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcled_s64" type="checkbox"><label for="vcled_s64"><div>uint64_t <b><b>vcled_s64</b></b> (int64_t a, int64_t b)<span class="right">Compare signed greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than or Equal (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than or equal to the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmge-register-compare-signed-greater-than-or-equal-vector">CMGE</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
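+ <h4>Example</h4> <p>An illustrative sketch, editorial rather than part of the ARM reference, assuming the same-named Rust intrinsic in <code>core::arch::aarch64</code>; the scalar form returns an all-ones or all-zeros 64-bit mask rather than a boolean.</p>
+<pre class="codeblock">// Sketch: scalar signed a &lt;= b, lowered to CMGE with swapped operands.
+#[cfg(target_arch = "aarch64")]
+fn demo_vcled_s64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        assert_eq!(vcled_s64(-5, -5), u64::MAX); // -5 &lt;= -5 holds
+        assert_eq!(vcled_s64(7, -1), 0); // 7 &lt;= -1 fails for signed values
+    }
+}</pre>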
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcled_u64" type="checkbox"><label for="vcled_u64"><div>uint64_t <b><b>vcled_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Compare unsigned higher or same</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher or Same (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than or equal to the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
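+ <h4>Example</h4> <p>An illustrative sketch, editorial rather than part of the ARM reference, assuming the same-named Rust intrinsic in <code>core::arch::aarch64</code>; the comparison is unsigned, so an all-ones bit pattern is the largest value.</p>
+<pre class="codeblock">// Sketch: scalar unsigned a &lt;= b via CMHS with swapped operands.
+#[cfg(target_arch = "aarch64")]
+fn demo_vcled_u64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        assert_eq!(vcled_u64(1, u64::MAX), u64::MAX); // 1 &lt;= max holds
+        assert_eq!(vcled_u64(u64::MAX, 1), 0);
+    }
+}</pre>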
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcles_f32" type="checkbox"><label for="vcles_f32"><div>uint32_t <b><b>vcles_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction compares each floating-point value in the first source SIMD&amp;FP register with the corresponding floating-point value in the second source SIMD&amp;FP register and if the first value is greater than or equal to the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Sd,Sm,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ test_passed = if cmp_eq then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
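+ <h4>Example</h4> <p>An illustrative sketch, editorial rather than part of the ARM reference, assuming the same-named Rust intrinsic in <code>core::arch::aarch64</code>; as with the underlying FCMGE, an unordered comparison (either operand NaN) yields a zero mask.</p>
+<pre class="codeblock">// Sketch: scalar f32 a &lt;= b returning a 32-bit mask.
+#[cfg(target_arch = "aarch64")]
+fn demo_vcles_f32() {
+    use core::arch::aarch64::*;
+    unsafe {
+        assert_eq!(vcles_f32(1.0, 1.0), u32::MAX); // 1.0 &lt;= 1.0 holds
+        assert_eq!(vcles_f32(f32::NAN, 1.0), 0); // unordered, mask is zero
+    }
+}</pre>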
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcled_f64" type="checkbox"><label for="vcled_f64"><div>uint64_t <b><b>vcled_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point compare greater than or equal</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than or Equal (vector). This instruction compares each floating-point value in the first source SIMD&amp;FP register with the corresponding floating-point value in the second source SIMD&amp;FP register and if the first value is greater than or equal to the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmge-register-floating-point-compare-greater-than-or-equal-vector">FCMGE</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ test_passed = if cmp_eq then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
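+ <h4>Example</h4> <p>An illustrative sketch, editorial rather than part of the ARM reference, assuming the same-named Rust intrinsic in <code>core::arch::aarch64</code>; IEEE 754 treats -0.0 and 0.0 as equal, which the mask reflects.</p>
+<pre class="codeblock">// Sketch: scalar f64 a &lt;= b returning a 64-bit mask.
+#[cfg(target_arch = "aarch64")]
+fn demo_vcled_f64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        assert_eq!(vcled_f64(-0.0, 0.0), u64::MAX); // -0.0 == 0.0 in IEEE 754
+        assert_eq!(vcled_f64(2.5, 1.5), 0);
+    }
+}</pre>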
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclez_s8" type="checkbox"><label for="vclez_s8"><div>uint8x8_t <b><b>vclez_s8</b></b> (int8x8_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Vd.8B,Vn.8B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
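+ <h4>Example</h4> <p>An illustrative sketch, editorial rather than part of the ARM reference, assuming the same-named Rust intrinsic in <code>core::arch::aarch64</code>:</p>
+<pre class="codeblock">// Sketch: per-lane a &lt;= 0 on an int8x8_t vector (CMLE against #0).
+#[cfg(target_arch = "aarch64")]
+fn demo_vclez_s8() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vdup_n_s8(-3); // all eight lanes hold -3
+        let mask = vclez_s8(a);
+        assert_eq!(vget_lane_u8::&lt;0&gt;(mask), u8::MAX); // -3 &lt;= 0
+    }
+}</pre>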
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezq_s8" type="checkbox"><label for="vclezq_s8"><div>uint8x16_t <b><b>vclezq_s8</b></b> (int8x16_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Vd.16B,Vn.16B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
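+ <h4>Example</h4> <p>An illustrative sketch, editorial rather than part of the ARM reference, assuming the same-named Rust intrinsic in <code>core::arch::aarch64</code>:</p>
+<pre class="codeblock">// Sketch: per-lane a &lt;= 0 on the 128-bit int8x16_t form.
+#[cfg(target_arch = "aarch64")]
+fn demo_vclezq_s8() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vdupq_n_s8(1); // all sixteen lanes hold 1
+        let mask = vclezq_s8(a);
+        assert_eq!(vgetq_lane_u8::&lt;15&gt;(mask), 0); // 1 &gt; 0, so lanes are clear
+    }
+}</pre>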
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclez_s16" type="checkbox"><label for="vclez_s16"><div>uint16x4_t <b><b>vclez_s16</b></b> (int16x4_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Vd.4H,Vn.4H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
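+ <h4>Example</h4> <p>An illustrative sketch, editorial rather than part of the ARM reference, assuming the same-named Rust intrinsic in <code>core::arch::aarch64</code>; zero itself satisfies the comparison.</p>
+<pre class="codeblock">// Sketch: per-lane a &lt;= 0 on int16x4_t; the boundary value 0 passes.
+#[cfg(target_arch = "aarch64")]
+fn demo_vclez_s16() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vdup_n_s16(0);
+        let mask = vclez_s16(a);
+        assert_eq!(vget_lane_u16::&lt;3&gt;(mask), u16::MAX); // 0 &lt;= 0
+    }
+}</pre>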
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezq_s16" type="checkbox"><label for="vclezq_s16"><div>uint16x8_t <b><b>vclezq_s16</b></b> (int16x8_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Vd.8H,Vn.8H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
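+ <h4>Example</h4> <p>An illustrative sketch, editorial rather than part of the ARM reference, assuming the same-named Rust intrinsic in <code>core::arch::aarch64</code>:</p>
+<pre class="codeblock">// Sketch: per-lane a &lt;= 0 on int16x8_t, here with the most negative i16.
+#[cfg(target_arch = "aarch64")]
+fn demo_vclezq_s16() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vdupq_n_s16(i16::MIN);
+        let mask = vclezq_s16(a);
+        assert_eq!(vgetq_lane_u16::&lt;7&gt;(mask), u16::MAX); // i16::MIN &lt;= 0
+    }
+}</pre>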
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclez_s32" type="checkbox"><label for="vclez_s32"><div>uint32x2_t <b><b>vclez_s32</b></b> (int32x2_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezq_s32" type="checkbox"><label for="vclezq_s32"><div>uint32x4_t <b><b>vclezq_s32</b></b> (int32x4_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclez_s64" type="checkbox"><label for="vclez_s64"><div>uint64x1_t <b><b>vclez_s64</b></b> (int64x1_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezq_s64" type="checkbox"><label for="vclezq_s64"><div>uint64x2_t <b><b>vclezq_s64</b></b> (int64x2_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclez_f32" type="checkbox"><label for="vclez_f32"><div>uint32x2_t <b><b>vclez_f32</b></b> (float32x2_t a)<span class="right">Floating-point compare less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Less than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmle-zero-floating-point-compare-less-than-or-equal-to-zero-vector">FCMLE</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezq_f32" type="checkbox"><label for="vclezq_f32"><div>uint32x4_t <b><b>vclezq_f32</b></b> (float32x4_t a)<span class="right">Floating-point compare less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Less than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmle-zero-floating-point-compare-less-than-or-equal-to-zero-vector">FCMLE</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclez_f64" type="checkbox"><label for="vclez_f64"><div>uint64x1_t <b><b>vclez_f64</b></b> (float64x1_t a)<span class="right">Floating-point compare less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Less than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmle-zero-floating-point-compare-less-than-or-equal-to-zero-vector">FCMLE</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezq_f64" type="checkbox"><label for="vclezq_f64"><div>uint64x2_t <b><b>vclezq_f64</b></b> (float64x2_t a)<span class="right">Floating-point compare less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Less than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmle-zero-floating-point-compare-less-than-or-equal-to-zero-vector">FCMLE</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezd_s64" type="checkbox"><label for="vclezd_s64"><div>uint64_t <b><b>vclezd_s64</b></b> (int64_t a)<span class="right">Compare signed less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Less than or Equal to zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmle-zero-compare-signed-less-than-or-equal-to-zero-vector">CMLE</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezs_f32" type="checkbox"><label for="vclezs_f32"><div>uint32_t <b><b>vclezs_f32</b></b> (float32_t a)<span class="right">Floating-point compare less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Less than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmle-zero-floating-point-compare-less-than-or-equal-to-zero-vector">FCMLE</a> Sd,Sn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclezd_f64" type="checkbox"><label for="vclezd_f64"><div>uint64_t <b><b>vclezd_f64</b></b> (float64_t a)<span class="right">Floating-point compare less than or equal to zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Less than or Equal to zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than or equal to zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmle-zero-floating-point-compare-less-than-or-equal-to-zero-vector">FCMLE</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgt_s8" type="checkbox"><label for="vcgt_s8"><div>uint8x8_t <b><b>vcgt_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgtq_s8" type="checkbox"><label for="vcgtq_s8"><div>uint8x16_t <b><b>vcgtq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgt_s16" type="checkbox"><label for="vcgt_s16"><div>uint16x4_t <b><b>vcgt_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgtq_s16" type="checkbox"><label for="vcgtq_s16"><div>uint16x8_t <b><b>vcgtq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgt_s32" type="checkbox"><label for="vcgt_s32"><div>uint32x2_t <b><b>vcgt_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgtq_s32" type="checkbox"><label for="vcgtq_s32"><div>uint32x4_t <b><b>vcgtq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgt_u8" type="checkbox"><label for="vcgt_u8"><div>uint8x8_t <b><b>vcgt_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgtq_u8" type="checkbox"><label for="vcgtq_u8"><div>uint8x16_t <b><b>vcgtq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgt_u16" type="checkbox"><label for="vcgt_u16"><div>uint16x4_t <b><b>vcgt_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgtq_u16" type="checkbox"><label for="vcgtq_u16"><div>uint16x8_t <b><b>vcgtq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcgt_u32" type="checkbox"><label for="vcgt_u32"><div>uint32x2_t <b><b>vcgt_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic" id="vcgtq_u32"><div>uint32x4_t <b>vcgtq_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Compare unsigned higher</span></div><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Int(Elem[operand1, e, esize], unsigned);
+ element2 = Int(Elem[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
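+ <h4>Usage Example (informative)</h4> <p>A minimal C sketch, not part of the Arm reference, showing how the lane-wide all-ones/all-zeros result of <b>vcgtq_u32</b> is typically inspected. It assumes an AArch64 toolchain providing &lt;arm_neon.h&gt;; the input values are illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t xs[4] = {1, 5, 7, 0};
+    uint32_t ys[4] = {2, 5, 3, 0};
+    /* Each result lane is all ones (0xFFFFFFFF) where xs[i] &gt; ys[i], else all zeros. */
+    uint32x4_t mask = vcgtq_u32(vld1q_u32(xs), vld1q_u32(ys));
+    uint32_t out[4];
+    vst1q_u32(out, mask);
+    for (int i = 0; i &lt; 4; i++)
+        printf("lane %d: 0x%08x\n", i, out[i]);
+    return 0;   /* prints 0x00000000, 0x00000000, 0xffffffff, 0x00000000 */
+}</pre>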
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic" id="vcgt_f32"><div>uint32x2_t <b>vcgt_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point compare greater than</span></div><article> <h4>Description</h4><p class="aml">Floating-point Compare Greater than (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if the value is greater than the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Elem[operand1, e, esize];
+ element2 = Elem[operand2, e, esize];
+ case comparison of
+  when CompareOp_EQ test_passed = FPCompareEQ(element1, element2, FPCR);
+  when CompareOp_GE test_passed = FPCompareGE(element1, element2, FPCR);
+  when CompareOp_GT test_passed = FPCompareGT(element1, element2, FPCR);
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
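+ <h4>Usage Example (informative)</h4> <p>A minimal C sketch, not part of the Arm reference, illustrating that a comparison involving NaN reports "not greater", so the corresponding result lane is zero. Assumes an AArch64 toolchain with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float xs[2] = {2.0f, NAN};
+    float ys[2] = {1.0f, 1.0f};
+    uint32x2_t mask = vcgt_f32(vld1_f32(xs), vld1_f32(ys));
+    uint32_t out[2];
+    vst1_u32(out, mask);
+    /* Lane 0: 2.0 &gt; 1.0 is true; lane 1: NaN &gt; 1.0 is false. */
+    printf("0x%08x 0x%08x\n", out[0], out[1]);   /* 0xffffffff 0x00000000 */
+    return 0;
+}</pre>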
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic" id="vcgtq_f32"><div>uint32x4_t <b>vcgtq_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point compare greater than</span></div><article> <h4>Description</h4><p class="aml">Floating-point Compare Greater than (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if the value is greater than the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Elem[operand1, e, esize];
+ element2 = Elem[operand2, e, esize];
+ case comparison of
+  when CompareOp_EQ test_passed = FPCompareEQ(element1, element2, FPCR);
+  when CompareOp_GE test_passed = FPCompareGE(element1, element2, FPCR);
+  when CompareOp_GT test_passed = FPCompareGT(element1, element2, FPCR);
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
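+ <h4>Usage Example (informative)</h4> <p>A hedged C sketch, not part of the Arm reference: the compare mask is combined with the bitwise-select intrinsic <b>vbslq_f32</b> to build an elementwise maximum. The helper name <b>max4</b> and the inputs are illustrative. Assumes an AArch64 toolchain with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+/* Elementwise maximum built from a greater-than mask and a bitwise select. */
+static float32x4_t max4(float32x4_t a, float32x4_t b) {
+    uint32x4_t a_gt_b = vcgtq_f32(a, b);   /* all-ones lanes where a &gt; b */
+    return vbslq_f32(a_gt_b, a, b);        /* take a where the mask is set, else b */
+}
+
+int main(void) {
+    float xs[4] = {1.0f, 4.0f, -2.0f, 8.0f};
+    float ys[4] = {3.0f, 2.0f, -5.0f, 9.0f};
+    float out[4];
+    vst1q_f32(out, max4(vld1q_f32(xs), vld1q_f32(ys)));
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 3 4 -2 9 */
+    return 0;
+}</pre>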
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic" id="vcgt_s64"><div>uint64x1_t <b>vcgt_s64</b> (int64x1_t a, int64x1_t b)<span class="right">Compare signed greater than</span></div><article> <h4>Description</h4><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Int(Elem[operand1, e, esize], unsigned);
+ element2 = Int(Elem[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
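+ <h4>Usage Example (informative)</h4> <p>A minimal C sketch, not part of the Arm reference, showing the signed interpretation on the single 64-bit lane. Assumes an A64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Signed compare: -1 &gt; 1 is false, so the single 64-bit lane is all zeros. */
+    uint64x1_t mask = vcgt_s64(vdup_n_s64(-1), vdup_n_s64(1));
+    printf("0x%016llx\n", (unsigned long long)vget_lane_u64(mask, 0));
+    return 0;
+}</pre>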
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic" id="vcgtq_s64"><div>uint64x2_t <b>vcgtq_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Compare signed greater than</span></div><article> <h4>Description</h4><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Int(Elem[operand1, e, esize], unsigned);
+ element2 = Int(Elem[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
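+ <h4>Usage Example (informative)</h4> <p>Because the 64-bit element compares are A64-only (see below), portable code often guards them at compile time. A sketch under the assumption that the compiler defines <b>__aarch64__</b> on A64 targets, as GCC and Clang do:</p>
+<pre class="codeblock">#include &lt;stdio.h&gt;
+#if defined(__aarch64__)
+#include &lt;arm_neon.h&gt;
+#endif
+
+int main(void) {
+#if defined(__aarch64__)
+    int64x2_t a = vcombine_s64(vdup_n_s64(5), vdup_n_s64(-5));
+    uint64x2_t mask = vcgtq_s64(a, vdupq_n_s64(0));
+    printf("0x%016llx 0x%016llx\n",
+           (unsigned long long)vgetq_lane_u64(mask, 0),    /* 5 &gt; 0: all ones */
+           (unsigned long long)vgetq_lane_u64(mask, 1));   /* -5 &gt; 0: zero */
+#else
+    puts("vcgtq_s64 requires an A64 target");
+#endif
+    return 0;
+}</pre>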
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic" id="vcgt_u64"><div>uint64x1_t <b>vcgt_u64</b> (uint64x1_t a, uint64x1_t b)<span class="right">Compare unsigned higher</span></div><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Int(Elem[operand1, e, esize], unsigned);
+ element2 = Int(Elem[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic" id="vcgtq_u64"><div>uint64x2_t <b>vcgtq_u64</b> (uint64x2_t a, uint64x2_t b)<span class="right">Compare unsigned higher</span></div><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Int(Elem[operand1, e, esize], unsigned);
+ element2 = Int(Elem[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
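+ <h4>Usage Example (informative)</h4> <p>An informative C sketch, not part of the Arm reference, contrasting the unsigned compare with its signed counterpart on the same bit pattern. Assumes an A64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* The same bit pattern is UINT64_MAX when unsigned but -1 when signed. */
+    uint64x2_t a = vdupq_n_u64(0xFFFFFFFFFFFFFFFFull);
+    uint64x2_t b = vdupq_n_u64(1);
+    uint64x2_t hi = vcgtq_u64(a, b);   /* all ones: UINT64_MAX &gt; 1 */
+    uint64x2_t gt = vcgtq_s64(vreinterpretq_s64_u64(a),
+                              vreinterpretq_s64_u64(b));   /* zero: -1 &gt; 1 is false */
+    printf("unsigned 0x%016llx signed 0x%016llx\n",
+           (unsigned long long)vgetq_lane_u64(hi, 0),
+           (unsigned long long)vgetq_lane_u64(gt, 0));
+    return 0;
+}</pre>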
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic" id="vcgt_f64"><div>uint64x1_t <b>vcgt_f64</b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point compare greater than</span></div><article> <h4>Description</h4><p class="aml">Floating-point Compare Greater than (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if the value is greater than the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Elem[operand1, e, esize];
+ element2 = Elem[operand2, e, esize];
+ case comparison of
+  when CompareOp_EQ test_passed = FPCompareEQ(element1, element2, FPCR);
+  when CompareOp_GE test_passed = FPCompareGE(element1, element2, FPCR);
+  when CompareOp_GT test_passed = FPCompareGT(element1, element2, FPCR);
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic" id="vcgtq_f64"><div>uint64x2_t <b>vcgtq_f64</b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point compare greater than</span></div><article> <h4>Description</h4><p class="aml">Floating-point Compare Greater than (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if the value is greater than the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Elem[operand1, e, esize];
+ element2 = Elem[operand2, e, esize];
+ case comparison of
+  when CompareOp_EQ test_passed = FPCompareEQ(element1, element2, FPCR);
+  when CompareOp_GE test_passed = FPCompareGE(element1, element2, FPCR);
+  when CompareOp_GT test_passed = FPCompareGT(element1, element2, FPCR);
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic" id="vcgtd_s64"><div>uint64_t <b>vcgtd_s64</b> (int64_t a, int64_t b)<span class="right">Compare signed greater than</span></div><article> <h4>Description</h4><p class="aml">Compare signed Greater than (scalar form). This instruction compares the 64-bit element in the first source SIMD&amp;FP register with the corresponding element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the destination SIMD&amp;FP register to one, otherwise sets every bit of the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = Int(Elem[operand1, e, esize], unsigned);
+ element2 = Int(Elem[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;</pre>
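+ <h4>Usage Example (informative)</h4> <p>A minimal C sketch, not part of the Arm reference: the scalar form returns a plain <b>uint64_t</b> that is all ones when the comparison holds and zero otherwise, which is convenient as a branch-free select mask. Assumes an A64 target with &lt;arm_neon.h&gt;.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t t = vcgtd_s64(7, -7);   /* 0xFFFFFFFFFFFFFFFF */
+    uint64_t f = vcgtd_s64(-7, 7);   /* 0x0000000000000000 */
+    printf("0x%016llx 0x%016llx\n",
+           (unsigned long long)t, (unsigned long long)f);
+    return 0;
+}</pre>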
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtd_u64" type="checkbox"><label for="vcgtd_u64"><div>uint64_t <b><b>vcgtd_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgts_f32" type="checkbox"><label for="vcgts_f32"><div>uint32_t <b><b>vcgts_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point compare greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtd_f64" type="checkbox"><label for="vcgtd_f64"><div>uint64_t <b><b>vcgtd_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point compare greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ, CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtz_s8" type="checkbox"><label for="vcgtz_s8"><div>uint8x8_t <b><b>vcgtz_s8</b></b> (int8x8_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.8B,Vn.8B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzq_s8" type="checkbox"><label for="vcgtzq_s8"><div>uint8x16_t <b><b>vcgtzq_s8</b></b> (int8x16_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.16B,Vn.16B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtz_s16" type="checkbox"><label for="vcgtz_s16"><div>uint16x4_t <b><b>vcgtz_s16</b></b> (int16x4_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.4H,Vn.4H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzq_s16" type="checkbox"><label for="vcgtzq_s16"><div>uint16x8_t <b><b>vcgtzq_s16</b></b> (int16x8_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.8H,Vn.8H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtz_s32" type="checkbox"><label for="vcgtz_s32"><div>uint32x2_t <b><b>vcgtz_s32</b></b> (int32x2_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzq_s32" type="checkbox"><label for="vcgtzq_s32"><div>uint32x4_t <b><b>vcgtzq_s32</b></b> (int32x4_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtz_s64" type="checkbox"><label for="vcgtz_s64"><div>uint64x1_t <b><b>vcgtz_s64</b></b> (int64x1_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzq_s64" type="checkbox"><label for="vcgtzq_s64"><div>uint64x2_t <b><b>vcgtzq_s64</b></b> (int64x2_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtz_f32" type="checkbox"><label for="vcgtz_f32"><div>uint32x2_t <b><b>vcgtz_f32</b></b> (float32x2_t a)<span class="right">Floating-point compare greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Vd.2S,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
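+<h4>Example (Rust)</h4>
+<p>Since this reference is checked into the Rust stdarch tree, a minimal sketch of the Ones()/Zeros() mask behaviour through the corresponding Rust binding in <code>std::arch::aarch64</code> may help (AArch64 only; the lane values are illustrative assumptions, not part of this reference):</p>
+<pre class="codeblock">// Each lane of the result is u32::MAX or 0, mirroring Ones()/Zeros().
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo() {
+    use std::arch::aarch64::*;
+    let data = [1.5f32, -0.5];           // illustrative inputs
+    let a = vld1_f32(data.as_ptr());     // a -&gt; Vn.2S
+    let mask = vcgtz_f32(a);             // FCMGT Vd.2S, Vn.2S, #0
+    assert_eq!(vget_lane_u32::&lt;0&gt;(mask), u32::MAX); // 1.5 &gt; 0.0
+    assert_eq!(vget_lane_u32::&lt;1&gt;(mask), 0);        // -0.5 is not &gt; 0.0
+}</pre>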
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzq_f32" type="checkbox"><label for="vcgtzq_f32"><div>uint32x4_t <b><b>vcgtzq_f32</b></b> (float32x4_t a)<span class="right">Floating-point compare greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Vd.4S,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtz_f64" type="checkbox"><label for="vcgtz_f64"><div>uint64x1_t <b><b>vcgtz_f64</b></b> (float64x1_t a)<span class="right">Floating-point compare greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzq_f64" type="checkbox"><label for="vcgtzq_f64"><div>uint64x2_t <b><b>vcgtzq_f64</b></b> (float64x2_t a)<span class="right">Floating-point compare greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzd_s64" type="checkbox"><label for="vcgtzd_s64"><div>uint64_t <b><b>vcgtzd_s64</b></b> (int64_t a)<span class="right">Compare signed greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
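+<h4>Example (Rust)</h4>
+<p>The scalar form follows the same rule: the result is a 64-bit Ones()/Zeros() mask, not a boolean. A sketch via the Rust binding (AArch64 only; input values are illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo() {
+    use std::arch::aarch64::*;
+    assert_eq!(vcgtzd_s64(7), u64::MAX); // 7 &gt; 0: all ones
+    assert_eq!(vcgtzd_s64(0), 0);        // 0 &gt; 0 is false
+    assert_eq!(vcgtzd_s64(-7), 0);       // negative: all zeros
+}</pre>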
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzs_f32" type="checkbox"><label for="vcgtzs_f32"><div>uint32_t <b><b>vcgtzs_f32</b></b> (float32_t a)<span class="right">Floating-point compare greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Sd,Sn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
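+<h4>Example (Rust)</h4>
+<p>FPCompareGT returns false for an unordered comparison, so a NaN input yields a zero mask. A sketch through the Rust binding (AArch64 only; inputs are illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo() {
+    use std::arch::aarch64::*;
+    assert_eq!(vcgtzs_f32(2.0), u32::MAX); // 2.0 &gt; 0.0
+    assert_eq!(vcgtzs_f32(-2.0), 0);
+    assert_eq!(vcgtzs_f32(f32::NAN), 0);   // unordered: not greater than zero
+}</pre>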
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcgtzd_f64" type="checkbox"><label for="vcgtzd_f64"><div>uint64_t <b><b>vcgtzd_f64</b></b> (float64_t a)<span class="right">Floating-point compare greater than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is greater than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-zero-floating-point-compare-greater-than-zero-vector">FCMGT</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclt_s8" type="checkbox"><label for="vclt_s8"><div>uint8x8_t <b><b>vclt_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Compare signed less than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.8B,Vm.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
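+<h4>Example (Rust)</h4>
+<p>As the instruction line above shows (CMGT Vd.8B,Vm.8B,Vn.8B), there is no dedicated less-than encoding: <code>vclt_s8(a, b)</code> is CMGT with the source registers swapped, since a &lt; b exactly when b &gt; a. A sketch of that equivalence through the Rust bindings (AArch64 only; values are illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo() {
+    use std::arch::aarch64::*;
+    let a = vdup_n_s8(-3);
+    let b = vdup_n_s8(5);
+    let lt = vclt_s8(a, b); // encoded as CMGT with swapped operands
+    let gt = vcgt_s8(b, a); // the same comparison, written directly
+    assert_eq!(vget_lane_u8::&lt;0&gt;(lt), u8::MAX);
+    assert_eq!(vget_lane_u8::&lt;0&gt;(gt), u8::MAX);
+}</pre>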
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcltq_s8" type="checkbox"><label for="vcltq_s8"><div>uint8x16_t <b><b>vcltq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Compare signed less than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.16B,Vm.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclt_s16" type="checkbox"><label for="vclt_s16"><div>uint16x4_t <b><b>vclt_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Compare signed less than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-zero-compare-signed-greater-than-zero-vector">CMGT</a> Vd.4H,Vm.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
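For vclt_s16 specifically, this crate exposes the intrinsic through core::arch::aarch64. A hedged sketch of calling it on an AArch64 target (where NEON is enabled by default); `demo_vclt_s16` and the input values are illustrative only:

```rust
// Sketch only: assumes an aarch64 target. The intrinsic returns all-ones
// lanes where a < b, which the compiler lowers to CMGT with the operands
// swapped, as the instruction line above shows.
#[cfg(target_arch = "aarch64")]
fn demo_vclt_s16() {
    use core::arch::aarch64::*;
    unsafe {
        let a: [i16; 4] = [1, -2, 3, 0];
        let b: [i16; 4] = [2, -3, 3, 1];
        let lt: uint16x4_t = vclt_s16(vld1_s16(a.as_ptr()), vld1_s16(b.as_ptr()));
        let mut out = [0u16; 4];
        vst1_u16(out.as_mut_ptr(), lt);
        assert_eq!(out, [u16::MAX, 0, 0, u16::MAX]);
    }
}
```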
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcltq_s16" type="checkbox"><label for="vcltq_s16"><div>uint16x8_t <b>vcltq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.8H,Vm.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclt_s32" type="checkbox"><label for="vclt_s32"><div>uint32x2_t <b>vclt_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.2S,Vm.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcltq_s32" type="checkbox"><label for="vcltq_s32"><div>uint32x4_t <b>vcltq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.4S,Vm.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclt_u8" type="checkbox"><label for="vclt_u8"><div>uint8x8_t <b>vclt_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.8B,Vm.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
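The unsigned forms differ only in the comparison being unsigned (CMHI rather than CMGT). A sketch under the same aarch64 assumptions as above, with `demo_vclt_u8` and the values illustrative; lanes 0 and 1 are chosen so that a signed compare would give the opposite answer:

```rust
// Sketch only: assumes an aarch64 target. 1 < 255 holds as unsigned
// bytes but not as signed bytes (1 < -1), which is exactly the
// CMHI-vs-CMGT distinction this entry documents.
#[cfg(target_arch = "aarch64")]
fn demo_vclt_u8() {
    use core::arch::aarch64::*;
    unsafe {
        let a: [u8; 8] = [1, 128, 2, 3, 200, 255, 7, 7];
        let b: [u8; 8] = [255, 127, 3, 2, 255, 200, 8, 6];
        let lt: uint8x8_t = vclt_u8(vld1_u8(a.as_ptr()), vld1_u8(b.as_ptr()));
        let mut out = [0u8; 8];
        vst1_u8(out.as_mut_ptr(), lt);
        assert_eq!(out, [255, 0, 255, 0, 255, 0, 255, 0]);
    }
}
```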
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcltq_u8" type="checkbox"><label for="vcltq_u8"><div>uint8x16_t <b>vcltq_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.16B,Vm.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclt_u16" type="checkbox"><label for="vclt_u16"><div>uint16x4_t <b>vclt_u16</b> (uint16x4_t a, uint16x4_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.4H,Vm.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcltq_u16" type="checkbox"><label for="vcltq_u16"><div>uint16x8_t <b>vcltq_u16</b> (uint16x8_t a, uint16x8_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.8H,Vm.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclt_u32" type="checkbox"><label for="vclt_u32"><div>uint32x2_t <b>vclt_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.2S,Vm.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcltq_u32" type="checkbox"><label for="vcltq_u32"><div>uint32x4_t <b>vcltq_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.4S,Vm.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclt_f32" type="checkbox"><label for="vclt_f32"><div>uint32x2_t <b>vclt_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point compare greater than</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Compare Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-register-floating-point-compare-greater-than-vector">FCMGT</a> Vd.2S,Vm.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
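The floating-point form goes through FPCompareGT, so IEEE comparison rules apply: any lane involving NaN fails the test and comes back all-zeros. A sketch under the same aarch64 assumptions, with `demo_vclt_f32` and the values illustrative:

```rust
// Sketch only: assumes an aarch64 target. NaN compares false against
// everything, so that lane of the mask is all-zeros rather than all-ones.
#[cfg(target_arch = "aarch64")]
fn demo_vclt_f32() {
    use core::arch::aarch64::*;
    unsafe {
        let a: [f32; 2] = [1.0, f32::NAN];
        let b: [f32; 2] = [2.0, 2.0];
        let lt: uint32x2_t = vclt_f32(vld1_f32(a.as_ptr()), vld1_f32(b.as_ptr()));
        let mut out = [0u32; 2];
        vst1_u32(out.as_mut_ptr(), lt);
        assert_eq!(out, [u32::MAX, 0]);
    }
}
```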
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcltq_f32" type="checkbox"><label for="vcltq_f32"><div>uint32x4_t <b>vcltq_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point compare greater than</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Compare Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-register-floating-point-compare-greater-than-vector">FCMGT</a> Vd.4S,Vm.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclt_s64" type="checkbox"><label for="vclt_s64"><div>uint64x1_t <b><b>vclt_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
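+<h4>Usage example</h4><p>A short C sketch (editor's addition, not part of the ARM reference), assuming an AArch64 toolchain that provides &lt;arm_neon.h&gt;; the lane values are illustrative.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64x1_t a = vdup_n_s64(-5);
+    int64x1_t b = vdup_n_s64(7);
+    uint64x1_t m = vclt_s64(a, b);  /* per-lane a &lt; b */
+    /* prints ffffffffffffffff: -5 &lt; 7, so the single lane is all ones */
+    printf("%llx\n", (unsigned long long)vget_lane_u64(m, 0));
+    return 0;
+}</pre>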
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltq_s64" type="checkbox"><label for="vcltq_s64"><div>uint64x2_t <b><b>vcltq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Vd.2D,Vm.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
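+<h4>Usage example</h4><p>Another editor-added C sketch, under the same AArch64/&lt;arm_neon.h&gt; assumption, showing the independent per-lane masks of the 128-bit form.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const int64_t av[2] = { -1, 9 };
+    const int64_t bv[2] = { 0, 3 };
+    uint64x2_t m = vcltq_s64(vld1q_s64(av), vld1q_s64(bv));
+    /* lane 0: -1 &lt; 0 -&gt; all ones; lane 1: 9 &lt; 3 -&gt; all zeros */
+    printf("%llx %llx\n",
+           (unsigned long long)vgetq_lane_u64(m, 0),
+           (unsigned long long)vgetq_lane_u64(m, 1));
+    return 0;
+}</pre>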
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclt_u64" type="checkbox"><label for="vclt_u64"><div>uint64x1_t <b><b>vclt_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
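+<h4>Usage example</h4><p>Editor-added C sketch (same AArch64/&lt;arm_neon.h&gt; assumption). Note from the Argument Preparation above that a &rarr; Dn and b &rarr; Dm: the less-than intrinsic is the "higher" instruction with its operands swapped.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64x1_t a = vdup_n_u64(3);
+    uint64x1_t b = vdup_n_u64(10);
+    uint64x1_t m = vclt_u64(a, b);  /* unsigned a &lt; b */
+    printf("%llx\n", (unsigned long long)vget_lane_u64(m, 0));  /* ffffffffffffffff */
+    return 0;
+}</pre>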
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltq_u64" type="checkbox"><label for="vcltq_u64"><div>uint64x2_t <b><b>vcltq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Vd.2D,Vm.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
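+<h4>Usage example</h4><p>Editor-added C sketch (same assumptions) showing a typical use of the all-ones/all-zeros mask: a bitwise select (vbslq_u64) that yields the per-lane unsigned minimum.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const uint64_t av[2] = { 1, 18446744073709551615ULL };
+    const uint64_t bv[2] = { 2, 0 };
+    uint64x2_t a = vld1q_u64(av);
+    uint64x2_t b = vld1q_u64(bv);
+    uint64x2_t m = vcltq_u64(a, b);      /* all ones where a &lt; b */
+    uint64x2_t lo = vbslq_u64(m, a, b);  /* select a where the mask is set: per-lane minimum */
+    printf("%llu %llu\n",
+           (unsigned long long)vgetq_lane_u64(lo, 0),
+           (unsigned long long)vgetq_lane_u64(lo, 1));  /* 1 0 */
+    return 0;
+}</pre>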
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclt_f64" type="checkbox"><label for="vclt_f64"><div>uint64x1_t <b><b>vclt_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if the value is greater than the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-register-floating-point-compare-greater-than-vector">FCMGT</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
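+<h4>Usage example</h4><p>Editor-added C sketch (same assumptions). Because this is an ordered floating-point compare, a NaN operand compares false and yields a zero mask.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64x1_t m1 = vclt_f64(vdup_n_f64(1.0), vdup_n_f64(2.0));
+    uint64x1_t m2 = vclt_f64(vdup_n_f64(NAN), vdup_n_f64(2.0));
+    /* 1.0 &lt; 2.0 -&gt; all ones; NaN operand -&gt; all zeros */
+    printf("%llx %llx\n",
+           (unsigned long long)vget_lane_u64(m1, 0),
+           (unsigned long long)vget_lane_u64(m2, 0));
+    return 0;
+}</pre>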
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltq_f64" type="checkbox"><label for="vcltq_f64"><div>uint64x2_t <b><b>vcltq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if the value is greater than the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-register-floating-point-compare-greater-than-vector">FCMGT</a> Vd.2D,Vm.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
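+<h4>Usage example</h4><p>Editor-added C sketch (same assumptions) for the two-lane double-precision form.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const double av[2] = { 0.5, 4.0 };
+    const double bv[2] = { 1.0, 3.0 };
+    uint64x2_t m = vcltq_f64(vld1q_f64(av), vld1q_f64(bv));
+    /* lane 0: 0.5 &lt; 1.0 -&gt; all ones; lane 1: 4.0 &lt; 3.0 -&gt; all zeros */
+    printf("%llx %llx\n",
+           (unsigned long long)vgetq_lane_u64(m, 0),
+           (unsigned long long)vgetq_lane_u64(m, 1));
+    return 0;
+}</pre>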
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltd_s64" type="checkbox"><label for="vcltd_s64"><div>uint64_t <b><b>vcltd_s64</b></b> (int64_t a, int64_t b)<span class="right">Compare signed greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare signed Greater than (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first signed integer value is greater than the second signed integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmgt-register-compare-signed-greater-than-vector">CMGT</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
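+<h4>Usage example</h4><p>Editor-added C sketch (same assumptions); the scalar form returns the mask directly as a uint64_t.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t t = vcltd_s64(-1, 0);  /* true  -&gt; 0xffffffffffffffff */
+    uint64_t f = vcltd_s64(0, 0);   /* false -&gt; 0 */
+    printf("%llx %llx\n", (unsigned long long)t, (unsigned long long)f);
+    return 0;
+}</pre>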
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltd_u64" type="checkbox"><label for="vcltd_u64"><div>uint64_t <b><b>vcltd_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Compare unsigned higher</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare unsigned Higher (vector). This instruction compares each vector element in the first source SIMD&amp;FP register with the corresponding vector element in the second source SIMD&amp;FP register and if the first unsigned integer value is greater than the second unsigned integer value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhi-register-compare-unsigned-higher-vector">CMHI</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ test_passed = if cmp_eq then element1 &gt;= element2 else element1 &gt; element2;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
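+<h4>Usage example</h4><p>Editor-added C sketch (same assumptions); note the operands are compared as unsigned 64-bit values.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* compared as unsigned: 2^64-1 is the largest value, so it is not below 1 */
+    uint64_t m = vcltd_u64(18446744073709551615ULL, 1);
+    printf("%llx\n", (unsigned long long)m);  /* 0 */
+    return 0;
+}</pre>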
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vclts_f32" type="checkbox"><label for="vclts_f32"><div>uint32_t <b><b>vclts_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if the value is greater than the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-register-floating-point-compare-greater-than-vector">FCMGT</a> Sd,Sm,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
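+<h4>Usage example</h4><p>Editor-added C sketch (same assumptions) for the single-precision scalar form, which returns a 32-bit mask.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t m = vclts_f32(1.0f, 2.0f);  /* true -&gt; 0xffffffff */
+    printf("%x\n", m);
+    return 0;
+}</pre>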
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltd_f64" type="checkbox"><label for="vcltd_f64"><div>uint64_t <b><b>vcltd_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Greater than (vector). This instruction reads each floating-point value in the first source SIMD&amp;FP register and if the value is greater than the corresponding floating-point value in the second source SIMD&amp;FP register sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmgt-register-floating-point-compare-greater-than-vector">FCMGT</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
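+<h4>Usage example</h4><p>Editor-added C sketch (same assumptions) for the double-precision scalar form.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t m = vcltd_f64(2.0, 1.0);  /* false -&gt; 0 */
+    printf("%llx\n", (unsigned long long)m);
+    return 0;
+}</pre>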
+Supported architectures: A64
+
+uint8x8_t vcltz_s8 (int8x8_t a)
+Compare signed less than zero
+
+Description: Compare signed Less than zero (vector). This instruction reads each vector element in the source SIMD&FP register; if the signed integer value is less than zero, it sets every bit of the corresponding vector element in the destination SIMD&FP register to one, otherwise it sets every bit of that element to zero.
+
+A64 Instruction: CMLT Vd.8B,Vn.8B,#0
+Argument Preparation: a → Vn.8B
+Results: Vd.8B → result
+
+Operation:
+CheckFPAdvSIMDEnabled64();
+bits(datasize) operand = V[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element = SInt(Elem[operand, e, esize]);
+    case comparison of
+        when CompareOp_GT test_passed = element > 0;
+        when CompareOp_GE test_passed = element >= 0;
+        when CompareOp_EQ test_passed = element == 0;
+        when CompareOp_LE test_passed = element <= 0;
+        when CompareOp_LT test_passed = element < 0;
+    Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;
+
+Supported architectures: A64
+
+uint8x16_t vcltzq_s8 (int8x16_t a)
+Compare signed less than zero
+Description: as for vcltz_s8.
+A64 Instruction: CMLT Vd.16B,Vn.16B,#0
+Argument Preparation: a → Vn.16B
+Results: Vd.16B → result
+Operation: same pseudocode as vcltz_s8 above.
+Supported architectures: A64
+
+uint16x4_t vcltz_s16 (int16x4_t a)
+Compare signed less than zero
+Description: as for vcltz_s8.
+A64 Instruction: CMLT Vd.4H,Vn.4H,#0
+Argument Preparation: a → Vn.4H
+Results: Vd.4H → result
+Operation: same pseudocode as vcltz_s8 above.
+Supported architectures: A64
+
+uint16x8_t vcltzq_s16 (int16x8_t a)
+Compare signed less than zero
+Description: as for vcltz_s8.
+A64 Instruction: CMLT Vd.8H,Vn.8H,#0
+Argument Preparation: a → Vn.8H
+Results: Vd.8H → result
+Operation: same pseudocode as vcltz_s8 above.
+Supported architectures: A64
+
+uint32x2_t vcltz_s32 (int32x2_t a)
+Compare signed less than zero
+Description: as for vcltz_s8.
+A64 Instruction: CMLT Vd.2S,Vn.2S,#0
+Argument Preparation: a → Vn.2S
+Results: Vd.2S → result
+Operation: same pseudocode as vcltz_s8 above.
+Supported architectures: A64
+
+uint32x4_t vcltzq_s32 (int32x4_t a)
+Compare signed less than zero
+Description: as for vcltz_s8.
+A64 Instruction: CMLT Vd.4S,Vn.4S,#0
+Argument Preparation: a → Vn.4S
+Results: Vd.4S → result
+Operation: same pseudocode as vcltz_s8 above.
+Supported architectures: A64
+
+uint64x1_t vcltz_s64 (int64x1_t a)
+Compare signed less than zero
+Description: as for vcltz_s8.
+A64 Instruction: CMLT Dd,Dn,#0
+Argument Preparation: a → Dn
+Results: Dd → result
+Operation: same pseudocode as vcltz_s8 above.
+Supported architectures: A64
+
+uint64x2_t vcltzq_s64 (int64x2_t a)
+Compare signed less than zero
+Description: as for vcltz_s8.
+A64 Instruction: CMLT Vd.2D,Vn.2D,#0
+Argument Preparation: a → Vn.2D
+Results: Vd.2D → result
+Operation: same pseudocode as vcltz_s8 above.
+Supported architectures: A64
+
+uint32x2_t vcltz_f32 (float32x2_t a)
+Floating-point compare less than zero
+
+Description: Floating-point Compare Less than zero (vector). This instruction reads each floating-point value in the source SIMD&FP register; if the value is less than zero, it sets every bit of the corresponding vector element in the destination SIMD&FP register to one, otherwise it sets every bit of that element to zero.
+
+A64 Instruction: FCMLT Vd.2S,Vn.2S,#0
+Argument Preparation: a → Vn.2S
+Results: Vd.2S → result
+
+Operation:
+CheckFPAdvSIMDEnabled64();
+bits(datasize) operand = V[n];
+bits(datasize) result;
+bits(esize) zero = FPZero('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+    element = Elem[operand, e, esize];
+    case comparison of
+        when CompareOp_GT test_passed = FPCompareGT(element, zero, FPCR);
+        when CompareOp_GE test_passed = FPCompareGE(element, zero, FPCR);
+        when CompareOp_EQ test_passed = FPCompareEQ(element, zero, FPCR);
+        when CompareOp_LE test_passed = FPCompareGE(zero, element, FPCR);
+        when CompareOp_LT test_passed = FPCompareGT(zero, element, FPCR);
+    Elem[result, e, esize] = if test_passed then Ones() else Zeros();
+
+V[d] = result;
+
+Supported architectures: A64
+
+uint32x4_t vcltzq_f32 (float32x4_t a)
+Floating-point compare less than zero
+Description: as for vcltz_f32.
+A64 Instruction: FCMLT Vd.4S,Vn.4S,#0
+Argument Preparation: a → Vn.4S
+Results: Vd.4S → result
+Operation: same pseudocode as vcltz_f32 above.
+Supported architectures: A64
+
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltz_f64" type="checkbox"><label for="vcltz_f64"><div>uint64x1_t <b><b>vcltz_f64</b></b> (float64x1_t a)<span class="right">Floating-point compare less than zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Compare Less than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmlt-zero-floating-point-compare-less-than-zero-vector">FCMLT</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltzq_f64" type="checkbox"><label for="vcltzq_f64"><div>uint64x2_t <b>vcltzq_f64</b> (float64x2_t a)<span class="right">Floating-point compare less than zero</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Compare Less than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmlt-zero-floating-point-compare-less-than-zero-vector">FCMLT</a> Vd.2D,Vn.2D,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltzd_s64" type="checkbox"><label for="vcltzd_s64"><div>uint64_t <b>vcltzd_s64</b> (int64_t a)<span class="right">Compare signed less than zero</span></div></label><article> <h4>Description</h4><p class="aml">Compare signed Less than zero (vector). This instruction reads each vector element in the source SIMD&amp;FP register and if the signed integer value is less than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmlt-zero-compare-signed-less-than-zero-vector">CMLT</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = element &gt; 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = element &gt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = element == 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = element &lt;= 0;
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = element &lt; 0;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltzs_f32" type="checkbox"><label for="vcltzs_f32"><div>uint32_t <b>vcltzs_f32</b> (float32_t a)<span class="right">Floating-point compare less than zero</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Compare Less than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmlt-zero-floating-point-compare-less-than-zero-vector">FCMLT</a> Sd,Sn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcltzd_f64" type="checkbox"><label for="vcltzd_f64"><div>uint64_t <b>vcltzd_f64</b> (float64_t a)<span class="right">Floating-point compare less than zero</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Compare Less than zero (vector). This instruction reads each floating-point value in the source SIMD&amp;FP register and if the value is less than zero sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcmlt-zero-floating-point-compare-less-than-zero-vector">FCMLT</a> Dd,Dn,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) zero = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPZero.1" title="function: bits(N) FPZero(bit sign)">FPZero</a>('0');
+bits(esize) element;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ case comparison of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element, zero, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(zero, element, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_LT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_LT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(zero, element, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcage_f32" type="checkbox"><label for="vcage_f32"><div>uint32x2_t <b>vcage_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and if the first value is greater than or equal to the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcageq_f32" type="checkbox"><label for="vcageq_f32"><div>uint32x4_t <b>vcageq_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and if the first value is greater than or equal to the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcage_f64" type="checkbox"><label for="vcage_f64"><div>uint64x1_t <b>vcage_f64</b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and if the first value is greater than or equal to the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcageq_f64" type="checkbox"><label for="vcageq_f64"><div>uint64x2_t <b>vcageq_f64</b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and if the first value is greater than or equal to the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcages_f32" type="checkbox"><label for="vcages_f32"><div>uint32_t <b>vcages_f32</b> (float32_t a, float32_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and if the first value is greater than or equal to the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
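+ <h4>Example</h4> <p>A minimal C sketch (an editorial illustration, not part of the upstream ARM reference; assumes an AArch64 target and arm_neon.h). The scalar form returns an all-ones or all-zeros 32-bit mask:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo(void) {
+    uint32_t t = vcages_f32(-3.0f, 2.0f); /* |-3.0| &gt;= |2.0| : 0xFFFFFFFF */
+    uint32_t f = vcages_f32(1.0f, -2.0f); /* |1.0| &gt;= |-2.0| : 0x00000000 */
+    (void)t; (void)f;
+}</pre>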
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcaged_f64" type="checkbox"><label for="vcaged_f64"><div>uint64_t <b>vcaged_f64</b> (float64_t a, float64_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and, if the first value is greater than or equal to the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
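+ <h4>Example</h4> <p>A short C sketch (editorial illustration, not from the upstream ARM reference): a comparison involving NaN fails, so the mask comes back zero:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+void demo(void) {
+    uint64_t m = vcaged_f64(NAN, 1.0);  /* NaN compares false : 0 */
+    uint64_t n = vcaged_f64(-2.0, 2.0); /* |-2.0| &gt;= |2.0| : all ones */
+    (void)m; (void)n;
+}</pre>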
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcale_f32" type="checkbox"><label for="vcale_f32"><div>uint32x2_t <b>vcale_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and, if the first value is greater than or equal to the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Vd.2S,Vm.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
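+ <h4>Example</h4> <p>A C sketch (editorial illustration, not from the upstream ARM reference). Note the swapped Vm/Vn operands in the instruction above: the "less than or equal" intrinsic is the same FACGE comparison with its operands exchanged:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* |a| &lt;= |b| per lane; equivalent to vcage_f32(b, a). */
+uint32x2_t abs_le(float32x2_t a, float32x2_t b) {
+    return vcale_f32(a, b);
+}</pre>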
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcaleq_f32" type="checkbox"><label for="vcaleq_f32"><div>uint32x4_t <b>vcaleq_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and, if the first value is greater than or equal to the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Vd.4S,Vm.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
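+ <h4>Example</h4> <p>A C sketch (editorial illustration, not from the upstream ARM reference): the all-ones/all-zeros lane masks are designed to drive a bitwise select such as vbslq_f32:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Keep x where |x| &lt;= |limit|, otherwise substitute limit. */
+float32x4_t clamp_to_envelope(float32x4_t x, float32x4_t limit) {
+    uint32x4_t keep = vcaleq_f32(x, limit); /* per-lane |x| &lt;= |limit| */
+    return vbslq_f32(keep, x, limit);       /* select x where mask set */
+}</pre>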
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcale_f64" type="checkbox"><label for="vcale_f64"><div>uint64x1_t <b>vcale_f64</b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and, if the first value is greater than or equal to the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
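+ <h4>Example</h4> <p>A C sketch (editorial illustration, not from the upstream ARM reference): the result is a one-lane vector, so the mask is read back with vget_lane_u64:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int abs_le_d(float64x1_t a, float64x1_t b) {
+    uint64x1_t m = vcale_f64(a, b);   /* |a| &lt;= |b| */
+    return vget_lane_u64(m, 0) != 0;
+}</pre>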
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcaleq_f64" type="checkbox"><label for="vcaleq_f64"><div>uint64x2_t <b>vcaleq_f64</b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and, if the first value is greater than or equal to the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Vd.2D,Vm.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
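+ <h4>Example</h4> <p>A C sketch (editorial illustration, not from the upstream ARM reference) comparing both lanes against a broadcast threshold:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint64x2_t within_threshold(float64x2_t x, double t) {
+    return vcaleq_f64(x, vdupq_n_f64(t)); /* per-lane |x| &lt;= |t| */
+}</pre>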
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcales_f32" type="checkbox"><label for="vcales_f32"><div>uint32_t <b>vcales_f32</b> (float32_t a, float32_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and, if the first value is greater than or equal to the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Sd,Sm,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
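+ <h4>Example</h4> <p>A C sketch (editorial illustration, not from the upstream ARM reference): the scalar result is 0 or 0xFFFFFFFF, so it can feed a branch directly:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int within_magnitude(float32_t x, float32_t bound) {
+    return vcales_f32(x, bound) != 0; /* |x| &lt;= |bound| */
+}</pre>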
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcaled_f64" type="checkbox"><label for="vcaled_f64"><div>uint64_t <b>vcaled_f64</b> (float64_t a, float64_t b)<span class="right">Floating-point absolute compare greater than or equal</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than or Equal (vector). This instruction compares the absolute value of each floating-point value in the first source SIMD&amp;FP register with the absolute value of the corresponding floating-point value in the second source SIMD&amp;FP register and, if the first value is greater than or equal to the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facge-floating-point-absolute-compare-greater-than-or-equal-vector">FACGE</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcagt_f32" type="checkbox"><label for="vcagt_f32"><div>uint32x2_t <b>vcagt_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and, if the first value is greater than the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
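+ <h4>Example</h4> <p>A C sketch (editorial illustration, not from the upstream ARM reference): FACGT is strict, so lanes with equal magnitudes produce a zero mask, unlike FACGE:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* a = {-1, 3}, b = {1, -2}:
+   vcagt_f32(a, b) -&gt; {0x00000000, 0xFFFFFFFF}  (|-1| &gt; |1| is false)
+   vcage_f32(a, b) -&gt; {0xFFFFFFFF, 0xFFFFFFFF} */
+uint32x2_t abs_gt(float32x2_t a, float32x2_t b) {
+    return vcagt_f32(a, b);
+}</pre>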
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcagtq_f32" type="checkbox"><label for="vcagtq_f32"><div>uint32x4_t <b>vcagtq_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and, if the first value is greater than the second value, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one; otherwise it sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
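+ <h4>Example</h4> <p>The operation above is a lane-wise magnitude test that writes an all-ones or all-zeros mask per lane. A minimal Rust sketch, assuming the intrinsic as exposed through core::arch::aarch64 (the demo function name is illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcagtq_f32() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1q_f32([1.0f32, -3.0, 0.5, f32::NAN].as_ptr());
+        let b = vld1q_f32([-2.0f32, 2.0, 0.5, 1.0].as_ptr());
+        let mask = vcagtq_f32(a, b); // all-ones where abs(a) is greater than abs(b)
+        let mut out = [0u32; 4];
+        vst1q_u32(out.as_mut_ptr(), mask);
+        assert_eq!(out, [0, u32::MAX, 0, 0]); // the NaN lane compares false
+    }
+}</pre>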
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcagt_f64" type="checkbox"><label for="vcagt_f64"><div>uint64x1_t <b><b>vcagt_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
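+ <h4>Example</h4> <p>A sketch of the single-lane form, which yields a one-element uint64x1_t mask (Rust via core::arch::aarch64; names are illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcagt_f64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1_f64([-4.0f64].as_ptr());
+        let b = vld1_f64([3.0f64].as_ptr());
+        let mut out = [0u64; 1];
+        vst1_u64(out.as_mut_ptr(), vcagt_f64(a, b));
+        assert_eq!(out[0], u64::MAX); // abs(-4.0) is greater than abs(3.0)
+    }
+}</pre>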
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcagtq_f64" type="checkbox"><label for="vcagtq_f64"><div>uint64x2_t <b><b>vcagtq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
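+ <h4>Example</h4> <p>The two-lane double-precision form follows the same pattern; a brief Rust sketch (illustrative names, core::arch::aarch64 assumed):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcagtq_f64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1q_f64([-5.0f64, 1.0].as_ptr());
+        let b = vld1q_f64([4.0f64, -2.0].as_ptr());
+        let mut out = [0u64; 2];
+        vst1q_u64(out.as_mut_ptr(), vcagtq_f64(a, b));
+        assert_eq!(out, [u64::MAX, 0]); // 5.0 beats 4.0; 1.0 does not beat 2.0
+    }
+}</pre>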
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcagts_f32" type="checkbox"><label for="vcagts_f32"><div>uint32_t <b><b>vcagts_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
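+ <h4>Example</h4> <p>For this scalar form the element loop above degenerates to a single iteration and the intrinsic returns a plain u32 mask. A Rust sketch (illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcagts_f32() {
+    use core::arch::aarch64::*;
+    unsafe {
+        assert_eq!(vcagts_f32(-2.5, 2.0), u32::MAX); // abs(-2.5) is greater than abs(2.0)
+        assert_eq!(vcagts_f32(1.0, -1.0), 0);        // equal magnitudes fail the test
+        assert_eq!(vcagts_f32(f32::NAN, 0.0), 0);    // NaN operands compare false
+    }
+}</pre>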
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcagtd_f64" type="checkbox"><label for="vcagtd_f64"><div>uint64_t <b><b>vcagtd_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
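+ <h4>Example</h4> <p>The double-precision scalar form behaves the same way, returning a u64 mask (Rust sketch, illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcagtd_f64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        assert_eq!(vcagtd_f64(-3.0, 2.0), u64::MAX); // abs(-3.0) is greater than abs(2.0)
+        assert_eq!(vcagtd_f64(2.0, -3.0), 0);
+    }
+}</pre>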
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcalt_f32" type="checkbox"><label for="vcalt_f32"><div>uint32x2_t <b><b>vcalt_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Vd.2S,Vm.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
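+ <h4>Example</h4> <p>Note the swapped Vm/Vn operands in the instruction above: vcalt tests whether abs(a) is less than abs(b) by reusing FACGT with its inputs reversed. A Rust sketch of that equivalence (illustrative names, core::arch::aarch64 assumed):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcalt_f32() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1_f32([1.0f32, -4.0].as_ptr());
+        let b = vld1_f32([-2.0f32, 3.0].as_ptr());
+        let mut lt = [0u32; 2];
+        let mut gt = [0u32; 2];
+        vst1_u32(lt.as_mut_ptr(), vcalt_f32(a, b));
+        vst1_u32(gt.as_mut_ptr(), vcagt_f32(b, a)); // same test, operands swapped
+        assert_eq!(lt, gt);
+        assert_eq!(lt, [u32::MAX, 0]); // only abs(1.0) is less than abs(-2.0)
+    }
+}</pre>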
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcaltq_f32" type="checkbox"><label for="vcaltq_f32"><div>uint32x4_t <b><b>vcaltq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Vd.4S,Vm.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
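+ <h4>Example</h4> <p>A four-lane sketch of the less-than magnitude mask (Rust, illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcaltq_f32() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1q_f32([0.5f32, -1.0, 2.0, f32::NAN].as_ptr());
+        let b = vld1q_f32([1.0f32, 1.0, -3.0, 4.0].as_ptr());
+        let mut out = [0u32; 4];
+        vst1q_u32(out.as_mut_ptr(), vcaltq_f32(a, b));
+        assert_eq!(out, [u32::MAX, 0, u32::MAX, 0]); // NaN lane compares false
+    }
+}</pre>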
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcalt_f64" type="checkbox"><label for="vcalt_f64"><div>uint64x1_t <b><b>vcalt_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
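+ <h4>Example</h4> <p>Single-lane double-precision sketch (Rust, illustrative):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcalt_f64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1_f64([2.0f64].as_ptr());
+        let b = vld1_f64([-6.0f64].as_ptr());
+        let mut out = [0u64; 1];
+        vst1_u64(out.as_mut_ptr(), vcalt_f64(a, b));
+        assert_eq!(out[0], u64::MAX); // abs(2.0) is less than abs(-6.0)
+    }
+}</pre>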
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcaltq_f64" type="checkbox"><label for="vcaltq_f64"><div>uint64x2_t <b><b>vcaltq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
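+ <h4>Example</h4> <p>The resulting mask is typically fed into a bitwise select; this Rust sketch picks the smaller-magnitude lane (the use of vbslq_f64 and the demo name are this note's choices, not part of the pseudocode above):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vcaltq_f64() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1q_f64([-1.0f64, 8.0].as_ptr());
+        let b = vld1q_f64([2.0f64, -7.0].as_ptr());
+        let mask = vcaltq_f64(a, b);         // all-ones where abs(a) is less than abs(b)
+        let smaller = vbslq_f64(mask, a, b); // select a where the mask is set, else b
+        let mut out = [0.0f64; 2];
+        vst1q_f64(out.as_mut_ptr(), smaller);
+        assert_eq!(out, [-1.0, -7.0]);
+    }
+}</pre>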
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcalts_f32" type="checkbox"><label for="vcalts_f32"><div>uint32_t <b><b>vcalts_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Sd,Sm,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcaltd_f64" type="checkbox"><label for="vcaltd_f64"><div>uint64_t <b><b>vcaltd_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point absolute compare greater than</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Compare Greater than (vector). This instruction compares the absolute value of each vector element in the first source SIMD&amp;FP register with the absolute value of the corresponding vector element in the second source SIMD&amp;FP register and if the first value is greater than the second value sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/facgt-floating-point-absolute-compare-greater-than-vector">FACGT</a> Dd,Dm,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if abs then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element1);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element2);
+ case cmp of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_EQ" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_EQ</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareEQ.3" title="function: boolean FPCompareEQ(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareEQ</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GE" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GE</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGE.3" title="function: boolean FPCompareGE(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGE</a>(element1, element2, FPCR);
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CompareOp_GT" title="enumeration CompareOp {CompareOp_GT, CompareOp_GE, CompareOp_EQ,
+ CompareOp_LE, CompareOp_LT}">CompareOp_GT</a> test_passed = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPCompareGT.3" title="function: boolean FPCompareGT(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPCompareGT</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtst_s8" type="checkbox"><label for="vtst_s8"><div>uint8x8_t <b><b>vtst_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstq_s8" type="checkbox"><label for="vtstq_s8"><div>uint8x16_t <b><b>vtstq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtst_s16" type="checkbox"><label for="vtst_s16"><div>uint16x4_t <b><b>vtst_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstq_s16" type="checkbox"><label for="vtstq_s16"><div>uint16x8_t <b><b>vtstq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtst_s32" type="checkbox"><label for="vtst_s32"><div>uint32x2_t <b><b>vtst_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstq_s32" type="checkbox"><label for="vtstq_s32"><div>uint32x4_t <b><b>vtstq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtst_u8" type="checkbox"><label for="vtst_u8"><div>uint8x8_t <b><b>vtst_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstq_u8" type="checkbox"><label for="vtstq_u8"><div>uint8x16_t <b><b>vtstq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtst_u16" type="checkbox"><label for="vtst_u16"><div>uint16x4_t <b><b>vtst_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstq_u16" type="checkbox"><label for="vtstq_u16"><div>uint16x8_t <b><b>vtstq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtst_u32" type="checkbox"><label for="vtst_u32"><div>uint32x2_t <b><b>vtst_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
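+ <h4>Usage sketch (C)</h4> <p>A minimal sketch of the semantics above, not taken from the ARM reference: it assumes a GCC/Clang toolchain with <code>arm_neon.h</code> (NEON is mandatory on AArch64; on AArch32 build with a NEON-enabled <code>-mfpu</code>). The example values are illustrative only. Each lane becomes all-ones when the lane-wise AND is nonzero, otherwise all-zeros.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* vcreate_u32 packs two lanes from a 64-bit literal; the low 32 bits are lane 0. */
+    uint32x2_t a = vcreate_u32(0x0000000000000001ULL); /* lanes: {1, 0} */
+    uint32x2_t b = vcreate_u32(0xFFFFFFFF00000003ULL); /* lanes: {3, 0xFFFFFFFF} */
+    uint32x2_t r = vtst_u32(a, b);
+    /* lane 0: 1 AND 3 != 0, so all ones; lane 1: 0 AND 0xFFFFFFFF == 0, so all zeros */
+    printf("%08x %08x\n", (unsigned)vget_lane_u32(r, 0), (unsigned)vget_lane_u32(r, 1));
+    return 0; /* prints: ffffffff 00000000 */
+}</pre>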
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstq_u32" type="checkbox"><label for="vtstq_u32"><div>uint32x4_t <b><b>vtstq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtst_p8" type="checkbox"><label for="vtst_p8"><div>uint8x8_t <b><b>vtst_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstq_p8" type="checkbox"><label for="vtstq_p8"><div>uint8x16_t <b><b>vtstq_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtst_s64" type="checkbox"><label for="vtst_s64"><div>uint64x1_t <b><b>vtst_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtstq_s64" type="checkbox"><label for="vtstq_s64"><div>uint64x2_t <b><b>vtstq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtst_u64" type="checkbox"><label for="vtst_u64"><div>uint64x1_t <b><b>vtst_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtstq_u64" type="checkbox"><label for="vtstq_u64"><div>uint64x2_t <b><b>vtstq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtst_p64" type="checkbox"><label for="vtst_p64"><div>uint64x1_t <b><b>vtst_p64</b></b> (poly64x1_t a, poly64x1_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstq_p64" type="checkbox"><label for="vtstq_p64"><div>uint64x2_t <b><b>vtstq_p64</b></b> (poly64x2_t a, poly64x2_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vtstd_s64" type="checkbox"><label for="vtstd_s64"><div>uint64_t <b><b>vtstd_s64</b></b> (int64_t a, int64_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
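+ <h4>Usage sketch (C)</h4> <p>A small illustrative example, assuming an AArch64 toolchain (this scalar form is A64-only); the operand values are made up for the sketch. The whole 64-bit result is all-ones exactly when <code>a AND b</code> is nonzero.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;inttypes.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t r1 = vtstd_s64(0x10, 0x30); /* shared bit 4, so all ones */
+    uint64_t r2 = vtstd_s64(0x10, 0x0F); /* disjoint bits, so zero */
+    printf("%016" PRIx64 " %016" PRIx64 "\n", r1, r2);
+    return 0; /* prints: ffffffffffffffff 0000000000000000 */
+}</pre>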
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtstd_u64" type="checkbox"><label for="vtstd_u64"><div>uint64_t <b><b>vtstd_u64</b></b> (uint64_t a, uint64_t b)<span class="right">Compare bitwise test bits nonzero</span></div></label><article> <h4>Description</h4><p><p class="aml">Compare bitwise Test bits nonzero (vector). This instruction reads each vector element in the first source SIMD&amp;FP register, performs an AND with the corresponding vector element in the second source SIMD&amp;FP register, and if the result is not zero, sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to one, otherwise sets every bit of the corresponding vector element in the destination SIMD&amp;FP register to zero.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmtst-compare-bitwise-test-bits-nonzero-vector">CMTST</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+boolean test_passed;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if and_test then
+ test_passed = !<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.IsZero.1" title="function: boolean IsZero(bits(N) x)">IsZero</a>(element1 AND element2);
+ else
+ test_passed = (element1 == element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if test_passed then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabd_s8" type="checkbox"><label for="vabd_s8"><div>int8x8_t <b><b>vabd_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute Difference. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabd-signed-absolute-difference">SABD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
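+ <h4>Usage sketch (C)</h4> <p>A hedged example of the per-lane <code>|a - b|</code> behaviour described above (for plain <code>vabd</code> the <code>accumulate</code> path in the shared pseudocode is not taken, so the result starts from zeros). Assumes GCC/Clang with <code>arm_neon.h</code>; the lane values are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* vcreate_s8 packs eight lanes from a 64-bit literal, lane 0 in the low byte. */
+    int8x8_t a = vcreate_s8(0x0000000000000A05ULL); /* lanes 0,1 = 5, 10 */
+    int8x8_t b = vcreate_s8(0x000000000000030AULL); /* lanes 0,1 = 10, 3 */
+    int8x8_t r = vabd_s8(a, b);                     /* lanes 0,1 = 5, 7  */
+    int8_t out[8];
+    vst1_s8(out, r); /* lane access in a loop needs a store; vget_lane_* wants a constant lane */
+    for (int i = 0; i &lt; 8; i++) printf("%d ", out[i]);
+    printf("\n"); /* prints: 5 7 0 0 0 0 0 0 */
+    return 0;
+}</pre>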
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdq_s8" type="checkbox"><label for="vabdq_s8"><div>int8x16_t <b><b>vabdq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute Difference. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabd-signed-absolute-difference">SABD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
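+ <h4>Usage example</h4> <p>The following is an illustrative sketch, not part of the ARM reference text: it shows one way to call this intrinsic from C through arm_neon.h and checks the lane-wise absolute differences. It assumes a C compiler targeting AArch64 with NEON available.</p>
+<pre class="codeblock">/* Illustrative only: assumes an AArch64 target with NEON. */
+#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t a[16] = { 10, -20, 30 };   /* remaining lanes default to 0 */
+    int8_t b[16] = { -5,  15,  7 };
+    int8x16_t vd = vabdq_s8(vld1q_s8(a), vld1q_s8(b)); /* compiles to SABD */
+    int8_t out[16];
+    vst1q_s8(out, vd);
+    printf("%d %d %d\n", out[0], out[1], out[2]);      /* prints: 15 35 23 */
+    return 0;
+}</pre>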
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabd_s16" type="checkbox"><label for="vabd_s16"><div>int16x4_t <b><b>vabd_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute Difference. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabd-signed-absolute-difference">SABD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdq_s16" type="checkbox"><label for="vabdq_s16"><div>int16x8_t <b><b>vabdq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute Difference. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabd-signed-absolute-difference">SABD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabd_s32" type="checkbox"><label for="vabd_s32"><div>int32x2_t <b><b>vabd_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute Difference. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabd-signed-absolute-difference">SABD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdq_s32" type="checkbox"><label for="vabdq_s32"><div>int32x4_t <b><b>vabdq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute Difference. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabd-signed-absolute-difference">SABD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabd_u8" type="checkbox"><label for="vabd_u8"><div>uint8x8_t <b><b>vabd_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference (vector). This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabd-unsigned-absolute-difference-vector">UABD</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
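+ <h4>Usage example</h4> <p>An illustrative sketch, not part of the ARM reference text: because the operands are interpreted as unsigned, the difference is taken on the unsigned integer values before the absolute value, so no lane can wrap. It assumes a C compiler targeting AArch64 with NEON available.</p>
+<pre class="codeblock">/* Illustrative only: assumes an AArch64 target with NEON. */
+#include &lt;arm_neon.h&gt;
+
+uint8x8_t demo_vabd_u8(void) {
+    uint8x8_t a = vdup_n_u8(3);      /* all eight lanes = 3   */
+    uint8x8_t b = vdup_n_u8(250);    /* all eight lanes = 250 */
+    return vabd_u8(a, b);            /* UABD: every lane = |3 - 250| = 247 */
+}</pre>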
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdq_u8" type="checkbox"><label for="vabdq_u8"><div>uint8x16_t <b><b>vabdq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference (vector). This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabd-unsigned-absolute-difference-vector">UABD</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabd_u16" type="checkbox"><label for="vabd_u16"><div>uint16x4_t <b><b>vabd_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference (vector). This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabd-unsigned-absolute-difference-vector">UABD</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdq_u16" type="checkbox"><label for="vabdq_u16"><div>uint16x8_t <b><b>vabdq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference (vector). This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabd-unsigned-absolute-difference-vector">UABD</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabd_u32" type="checkbox"><label for="vabd_u32"><div>uint32x2_t <b><b>vabd_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference (vector). This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabd-unsigned-absolute-difference-vector">UABD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdq_u32" type="checkbox"><label for="vabdq_u32"><div>uint32x4_t <b><b>vabdq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference (vector). This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, places the absolute values of the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabd-unsigned-absolute-difference-vector">UABD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabd_f32" type="checkbox"><label for="vabd_f32"><div>float32x2_t <b><b>vabd_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Difference (vector). This instruction subtracts the floating-point values in the elements of the second source SIMD&amp;FP register, from the corresponding floating-point values in the elements of the first source SIMD&amp;FP register, places the absolute value of each result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabd-floating-point-absolute-difference-vector">FABD</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdq_f32" type="checkbox"><label for="vabdq_f32"><div>float32x4_t <b><b>vabdq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point absolute difference</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute Difference (vector). This instruction subtracts the floating-point values in the elements of the second source SIMD&amp;FP register, from the corresponding floating-point values in the elements of the first source SIMD&amp;FP register, places the absolute value of each result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabd-floating-point-absolute-difference-vector">FABD</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
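+ <h4>Usage sketch (non-normative)</h4><p class="aml">Editor's addition, not part of the Arm reference: a minimal sketch of this intrinsic through the binding of the same name that this crate exposes in Rust's core::arch::aarch64, assuming an AArch64 target with NEON. The helper name f32x4_abs_diff is invented for the example; vabdq_f32, vld1q_f32 and vst1q_f32 are the crate's own bindings.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn f32x4_abs_diff(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    use core::arch::aarch64::{vabdq_f32, vld1q_f32, vst1q_f32};
+    // FABD Vd.4S,Vn.4S,Vm.4S: per lane, FPAbs(FPSub(a, b)).
+    let d = vabdq_f32(vld1q_f32(a.as_ptr()), vld1q_f32(b.as_ptr()));
+    let mut out = [0.0f32; 4];
+    vst1q_f32(out.as_mut_ptr(), d);
+    out
+}
+</pre>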
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabd_f64" type="checkbox"><label for="vabd_f64"><div>float64x1_t <b>vabd_f64</b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point absolute difference</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Difference (vector). This instruction subtracts the floating-point values in the elements of the second source SIMD&amp;FP register from the corresponding floating-point values in the elements of the first source SIMD&amp;FP register, places the absolute value of each result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabd-floating-point-absolute-difference-vector">FABD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabdq_f64" type="checkbox"><label for="vabdq_f64"><div>float64x2_t <b>vabdq_f64</b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point absolute difference</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Difference (vector). This instruction subtracts the floating-point values in the elements of the second source SIMD&amp;FP register from the corresponding floating-point values in the elements of the first source SIMD&amp;FP register, places the absolute value of each result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabd-floating-point-absolute-difference-vector">FABD</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabds_f32" type="checkbox"><label for="vabds_f32"><div>float32_t <b>vabds_f32</b> (float32_t a, float32_t b)<span class="right">Floating-point absolute difference</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Difference (vector). This instruction subtracts the floating-point values in the elements of the second source SIMD&amp;FP register from the corresponding floating-point values in the elements of the first source SIMD&amp;FP register, places the absolute value of each result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabd-floating-point-absolute-difference-vector">FABD</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
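+ <h4>Usage sketch (non-normative)</h4><p class="aml">Editor's addition, not part of the Arm reference: the scalar form operates on single S-register values, so the binding that this crate exposes in Rust's core::arch::aarch64 takes and returns plain f32. A minimal sketch, assuming an AArch64 target with NEON; the wrapper name abs_diff_f32 is invented for the example.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn abs_diff_f32(a: f32, b: f32) -> f32 {
+    // FABD Sd,Sn,Sm: one FPAbs(FPSub(a, b)), no vector lanes involved.
+    core::arch::aarch64::vabds_f32(a, b)
+}
+</pre>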
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabdd_f64" type="checkbox"><label for="vabdd_f64"><div>float64_t <b>vabdd_f64</b> (float64_t a, float64_t b)<span class="right">Floating-point absolute difference</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Absolute Difference (vector). This instruction subtracts the floating-point values in the elements of the second source SIMD&amp;FP register from the corresponding floating-point values in the elements of the first source SIMD&amp;FP register, places the absolute value of each result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabd-floating-point-absolute-difference-vector">FABD</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) diff;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ diff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSub.3" title="function: bits(N) FPSub(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPSub</a>(element1, element2, FPCR);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = if abs then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(diff) else diff;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabdl_s8" type="checkbox"><label for="vabdl_s8"><div>int16x8_t <b>vabdl_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Signed absolute difference long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Absolute Difference Long. This instruction subtracts the vector elements of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the results into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabdl-sabdl2-signed-absolute-difference-long">SABDL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
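+ <h4>Usage sketch (non-normative)</h4><p class="aml">Editor's addition, not part of the Arm reference: because the difference is formed at full integer precision before the absolute value is taken, SABDL never wraps; the distance between i8::MIN and i8::MAX is 255 and fits the doubled i16 lanes. A minimal sketch via the crate's core::arch::aarch64 bindings, assuming an AArch64 target with NEON; widening_abs_diff_s8 is an invented helper name.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn widening_abs_diff_s8(a: [i8; 8], b: [i8; 8]) -> [i16; 8] {
+    use core::arch::aarch64::{vabdl_s8, vld1_s8, vst1q_s16};
+    // SABDL Vd.8H,Vn.8B,Vm.8B: Abs(element1 - element2) per lane, written
+    // to 16-bit destination lanes, so e.g. -128 vs 127 gives 255.
+    let d = vabdl_s8(vld1_s8(a.as_ptr()), vld1_s8(b.as_ptr()));
+    let mut out = [0i16; 8];
+    vst1q_s16(out.as_mut_ptr(), d);
+    out
+}
+</pre>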
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdl_s16" type="checkbox"><label for="vabdl_s16"><div>int32x4_t <b>vabdl_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Signed absolute difference long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Absolute Difference Long. This instruction subtracts the vector elements of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the results into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabdl-sabdl2-signed-absolute-difference-long">SABDL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdl_s32" type="checkbox"><label for="vabdl_s32"><div>int64x2_t <b>vabdl_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Signed absolute difference long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Absolute Difference Long. This instruction subtracts the vector elements of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the results into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabdl-sabdl2-signed-absolute-difference-long">SABDL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdl_u8" type="checkbox"><label for="vabdl_u8"><div>uint16x8_t <b>vabdl_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned absolute difference long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Absolute Difference Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabdl-uabdl2-unsigned-absolute-difference-long">UABDL</a> Vd.8H,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
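+ <h4>Usage sketch (non-normative)</h4><p class="aml">Editor's addition, not part of the Arm reference: same shape as the signed form, but every quantity is treated as unsigned, so the distance between 0 and 255 is 255 and the doubled u16 lanes always hold the exact result. A minimal sketch via the crate's core::arch::aarch64 bindings, assuming an AArch64 target with NEON; widening_abs_diff_u8 is an invented helper name.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn widening_abs_diff_u8(a: [u8; 8], b: [u8; 8]) -> [u16; 8] {
+    use core::arch::aarch64::{vabdl_u8, vld1_u8, vst1q_u16};
+    // UABDL Vd.8H,Vn.8B,Vm.8B: unsigned Abs(element1 - element2) per lane.
+    let d = vabdl_u8(vld1_u8(a.as_ptr()), vld1_u8(b.as_ptr()));
+    let mut out = [0u16; 8];
+    vst1q_u16(out.as_mut_ptr(), d);
+    out
+}
+</pre>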
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdl_u16" type="checkbox"><label for="vabdl_u16"><div>uint32x4_t <b>vabdl_u16</b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned absolute difference long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Absolute Difference Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabdl-uabdl2-unsigned-absolute-difference-long">UABDL</a> Vd.4S,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdl_u32" type="checkbox"><label for="vabdl_u32"><div>uint64x2_t <b>vabdl_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned absolute difference long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Absolute Difference Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabdl-uabdl2-unsigned-absolute-difference-long">UABDL</a> Vd.2D,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabdl_high_s8" type="checkbox"><label for="vabdl_high_s8"><div>int16x8_t <b>vabdl_high_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Signed absolute difference long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Absolute Difference Long. This instruction subtracts the vector elements of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the results into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabdl-sabdl2-signed-absolute-difference-long">SABDL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
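+ <h4>Usage sketch (non-normative)</h4><p class="aml">Editor's addition, not part of the Arm reference: the _high variants (SABDL2) consume the upper half of each 128-bit source, which pairs naturally with vabdl_s8 to widen all 16 lanes in two steps. A minimal sketch via the crate's core::arch::aarch64 bindings, assuming an AArch64 target with NEON; widening_abs_diff_high_s8 is an invented helper name.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn widening_abs_diff_high_s8(a: [i8; 16], b: [i8; 16]) -> [i16; 8] {
+    use core::arch::aarch64::{vabdl_high_s8, vld1q_s8, vst1q_s16};
+    // SABDL2 Vd.8H,Vn.16B,Vm.16B: only lanes 8..=15 of a and b are read.
+    let d = vabdl_high_s8(vld1q_s8(a.as_ptr()), vld1q_s8(b.as_ptr()));
+    let mut out = [0i16; 8];
+    vst1q_s16(out.as_mut_ptr(), d);
+    out
+}
+</pre>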
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabdl_high_s16" type="checkbox"><label for="vabdl_high_s16"><div>int32x4_t <b>vabdl_high_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Signed absolute difference long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Absolute Difference Long. This instruction subtracts the vector elements of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the results into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabdl-sabdl2-signed-absolute-difference-long">SABDL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabdl_high_s32" type="checkbox"><label for="vabdl_high_s32"><div>int64x2_t <b><b>vabdl_high_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed absolute difference long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute Difference Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabdl-sabdl2-signed-absolute-difference-long">SABDL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
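+<h4>Usage sketch (Rust)</h4> <p>A minimal, illustrative sketch of exercising this intrinsic from Rust's core::arch::aarch64 module, which exposes it under the name shown above. The wrapper function and input values are assumptions for demonstration only; an AArch64 target (where NEON is enabled by default) is assumed.</p>
+<pre class="codeblock">// Illustrative only: vabdl_high_s32 takes the upper two 32-bit lanes of each
+// source, forms the absolute differences, and widens them to 64-bit lanes.
+#[cfg(target_arch = "aarch64")]
+fn demo_vabdl_high_s32() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1q_s32([0i32, 1, -100, 2_000_000_000].as_ptr());
+        let b = vld1q_s32([0i32, 1, 100, -2_000_000_000].as_ptr());
+        // abs(-100 - 100) = 200; abs(2e9 - (-2e9)) = 4e9, which needs 64 bits.
+        let d: int64x2_t = vabdl_high_s32(a, b);
+        let mut out = [0i64; 2];
+        vst1q_s64(out.as_mut_ptr(), d);
+        assert_eq!(out, [200, 4_000_000_000]);
+    }
+}</pre>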
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabdl_high_u8" type="checkbox"><label for="vabdl_high_u8"><div>uint16x8_t <b><b>vabdl_high_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned absolute difference long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabdl-uabdl2-unsigned-absolute-difference-long">UABDL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
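+<h4>Usage sketch (Rust)</h4> <p>An illustrative sketch of the unsigned variant via core::arch::aarch64 (same intrinsic name as this entry); the harness and input values are assumptions for demonstration.</p>
+<pre class="codeblock">// Illustrative only: vabdl_high_u8 widens the upper eight 8-bit lanes, so the
+// full unsigned difference is representable in the 16-bit result lanes.
+#[cfg(target_arch = "aarch64")]
+fn demo_vabdl_high_u8() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let a = vld1q_u8([0u8; 16].as_ptr());
+        let b = vld1q_u8([255u8; 16].as_ptr());
+        let d: uint16x8_t = vabdl_high_u8(a, b);
+        let mut out = [0u16; 8];
+        vst1q_u16(out.as_mut_ptr(), d);
+        assert_eq!(out, [255u16; 8]);
+    }
+}</pre>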
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabdl_high_u16" type="checkbox"><label for="vabdl_high_u16"><div>uint32x4_t <b><b>vabdl_high_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned absolute difference long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabdl-uabdl2-unsigned-absolute-difference-long">UABDL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabdl_high_u32" type="checkbox"><label for="vabdl_high_u32"><div>uint64x2_t <b><b>vabdl_high_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned absolute difference long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute Difference Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, places the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabdl-uabdl2-unsigned-absolute-difference-long">UABDL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaba_s8" type="checkbox"><label for="vaba_s8"><div>int8x8_t <b><b>vaba_s8</b></b> (int8x8_t a, int8x8_t b, int8x8_t c)<span class="right">Signed absolute difference and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saba-signed-absolute-difference-and-accumulate">SABA</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
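+<h4>Usage sketch (Rust)</h4> <p>An illustrative sketch of the accumulating form: per the argument preparation above, the first argument is the accumulator (Vd). The harness and input values are assumptions for demonstration.</p>
+<pre class="codeblock">// Illustrative only: each result lane is a + abs(b - c).
+#[cfg(target_arch = "aarch64")]
+fn demo_vaba_s8() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let acc = vld1_s8([10i8; 8].as_ptr());
+        let b = vld1_s8([3i8; 8].as_ptr());
+        let c = vld1_s8([7i8; 8].as_ptr());
+        // 10 + abs(3 - 7) = 14 in every lane.
+        let d: int8x8_t = vaba_s8(acc, b, c);
+        let mut out = [0i8; 8];
+        vst1_s8(out.as_mut_ptr(), d);
+        assert_eq!(out, [14i8; 8]);
+    }
+}</pre>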
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabaq_s8" type="checkbox"><label for="vabaq_s8"><div>int8x16_t <b><b>vabaq_s8</b></b> (int8x16_t a, int8x16_t b, int8x16_t c)<span class="right">Signed absolute difference and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saba-signed-absolute-difference-and-accumulate">SABA</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
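+<h4>Usage sketch (Rust)</h4> <p>An illustrative sketch of the 128-bit form, also showing that the accumulation is modular: the pseudocode above truncates the sum to esize bits, so lanes wrap rather than saturate. Harness and input values are assumptions for demonstration.</p>
+<pre class="codeblock">// Illustrative only: 127 + abs(2 - 0) = 129 wraps modulo 256 to -127 per i8 lane.
+#[cfg(target_arch = "aarch64")]
+fn demo_vabaq_s8() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let acc = vld1q_s8([127i8; 16].as_ptr());
+        let b = vld1q_s8([2i8; 16].as_ptr());
+        let c = vld1q_s8([0i8; 16].as_ptr());
+        let d: int8x16_t = vabaq_s8(acc, b, c);
+        let mut out = [0i8; 16];
+        vst1q_s8(out.as_mut_ptr(), d);
+        assert_eq!(out, [-127i8; 16]);
+    }
+}</pre>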
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaba_s16" type="checkbox"><label for="vaba_s16"><div>int16x4_t <b><b>vaba_s16</b></b> (int16x4_t a, int16x4_t b, int16x4_t c)<span class="right">Signed absolute difference and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saba-signed-absolute-difference-and-accumulate">SABA</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabaq_s16" type="checkbox"><label for="vabaq_s16"><div>int16x8_t <b><b>vabaq_s16</b></b> (int16x8_t a, int16x8_t b, int16x8_t c)<span class="right">Signed absolute difference and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saba-signed-absolute-difference-and-accumulate">SABA</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaba_s32" type="checkbox"><label for="vaba_s32"><div>int32x2_t <b><b>vaba_s32</b></b> (int32x2_t a, int32x2_t b, int32x2_t c)<span class="right">Signed absolute difference and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saba-signed-absolute-difference-and-accumulate">SABA</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabaq_s32" type="checkbox"><label for="vabaq_s32"><div>int32x4_t <b><b>vabaq_s32</b></b> (int32x4_t a, int32x4_t b, int32x4_t c)<span class="right">Signed absolute difference and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saba-signed-absolute-difference-and-accumulate">SABA</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vaba_u8" type="checkbox"><label for="vaba_u8"><div>uint8x8_t <b><b>vaba_u8</b></b> (uint8x8_t a, uint8x8_t b, uint8x8_t c)<span class="right">Unsigned absolute difference and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaba-unsigned-absolute-difference-and-accumulate">UABA</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
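+<h4>Usage sketch (Rust)</h4> <p>An illustrative sketch of the unsigned accumulating form; as with the signed forms, the truncation in the pseudocode makes the accumulation wrap modulo 2^esize. Harness and input values are assumptions for demonstration.</p>
+<pre class="codeblock">// Illustrative only: 250 + abs(20 - 10) = 260 wraps modulo 256 to 4 per u8 lane.
+#[cfg(target_arch = "aarch64")]
+fn demo_vaba_u8() {
+    use core::arch::aarch64::*;
+    unsafe {
+        let acc = vld1_u8([250u8; 8].as_ptr());
+        let b = vld1_u8([20u8; 8].as_ptr());
+        let c = vld1_u8([10u8; 8].as_ptr());
+        let d: uint8x8_t = vaba_u8(acc, b, c);
+        let mut out = [0u8; 8];
+        vst1_u8(out.as_mut_ptr(), d);
+        assert_eq!(out, [4u8; 8]);
+    }
+}</pre>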
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabaq_u8" type="checkbox"><label for="vabaq_u8"><div>uint8x16_t <b><b>vabaq_u8</b></b> (uint8x16_t a, uint8x16_t b, uint8x16_t c)<span class="right">Unsigned absolute difference and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&amp;FP register from the corresponding elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&amp;FP register.</p>
+
+uint8x16_t vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)    Unsigned absolute difference and accumulate
+Description: Unsigned Absolute difference and Accumulate. This instruction subtracts the elements of the vector of the second source SIMD&FP register from the corresponding elements of the first source SIMD&FP register, and accumulates the absolute values of the results into the elements of the vector of the destination SIMD&FP register.
+A64 Instruction: UABA Vd.16B,Vn.16B,Vm.16B
+Argument Preparation: a → Vd.16B; b → Vn.16B; c → Vm.16B
+Results: Vd.16B → result
+Operation (the unsigned and accumulate flags are fixed by instruction decode; for UABA both are TRUE):
+CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+bits(esize) absdiff;
+
+result = if accumulate then V[d] else Zeros();
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    absdiff = Abs(element1-element2)<esize-1:0>;
+    Elem[result, e, esize] = Elem[result, e, esize] + absdiff;
+V[d] = result;
+Supported architectures: v7/A32/A64
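+One behavioural detail worth calling out: the pseudocode truncates absdiff to esize bits and performs an esize-bit add, so the accumulation wraps modulo 2^esize (256 for byte lanes). A small hand-worked C check of that wrap, not taken from the ARM docs:
+
+#include <arm_neon.h>
+
+uint8x16_t uaba_wrap_demo(void) {
+    /* acc lane = 250 and |b - c| = 10: each result lane is
+       (250 + 10) mod 256 = 4, not 255; UABA does not saturate. */
+    uint8x16_t acc = vdupq_n_u8(250);
+    uint8x16_t b   = vdupq_n_u8(20);
+    uint8x16_t c   = vdupq_n_u8(10);
+    return vabaq_u8(acc, b, c);   /* every lane == 4 */
+}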
+
+uint16x4_t vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)    Unsigned absolute difference and accumulate
+Description and Operation: identical to vabaq_u8 above, with 16-bit lanes (esize = 16).
+A64 Instruction: UABA Vd.4H,Vn.4H,Vm.4H
+Argument Preparation: a → Vd.4H; b → Vn.4H; c → Vm.4H
+Results: Vd.4H → result
+Supported architectures: v7/A32/A64
+
+uint16x8_t vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)    Unsigned absolute difference and accumulate
+Description and Operation: identical to vabaq_u8 above, with 16-bit lanes (esize = 16).
+A64 Instruction: UABA Vd.8H,Vn.8H,Vm.8H
+Argument Preparation: a → Vd.8H; b → Vn.8H; c → Vm.8H
+Results: Vd.8H → result
+Supported architectures: v7/A32/A64
+
+uint32x2_t vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)    Unsigned absolute difference and accumulate
+Description and Operation: identical to vabaq_u8 above, with 32-bit lanes (esize = 32).
+A64 Instruction: UABA Vd.2S,Vn.2S,Vm.2S
+Argument Preparation: a → Vd.2S; b → Vn.2S; c → Vm.2S
+Results: Vd.2S → result
+Supported architectures: v7/A32/A64
+
+uint32x4_t vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)    Unsigned absolute difference and accumulate
+Description and Operation: identical to vabaq_u8 above, with 32-bit lanes (esize = 32).
+A64 Instruction: UABA Vd.4S,Vn.4S,Vm.4S
+Argument Preparation: a → Vd.4S; b → Vn.4S; c → Vm.4S
+Results: Vd.4S → result
+Supported architectures: v7/A32/A64
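+The Operation block is shared verbatim by all six UABA forms above; only esize and the lane count change. A plain scalar C model of that loop for byte lanes, offered as a sketch of the pseudocode's semantics rather than the actual library implementation:
+
+#include <stdint.h>
+#include <stddef.h>
+
+/* Scalar model of UABA on 8-bit lanes: d[i] += |n[i] - m[i]| (mod 256). */
+static void uaba_u8_model(uint8_t *d, const uint8_t *n, const uint8_t *m,
+                          size_t lanes) {
+    for (size_t e = 0; e < lanes; e++) {
+        uint8_t diff = n[e] > m[e] ? (uint8_t)(n[e] - m[e])
+                                   : (uint8_t)(m[e] - n[e]);
+        d[e] = (uint8_t)(d[e] + diff);   /* esize-bit wrap-around add */
+    }
+}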
+
+int16x8_t vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)    Signed absolute difference and accumulate long
+Description: Signed Absolute difference and Accumulate Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&FP register from the corresponding vector elements of the first source SIMD&FP register, and accumulates the absolute values of the results into the vector elements of the destination SIMD&FP register. The destination vector elements are twice as long as the source vector elements.
+A64 Instruction: SABAL Vd.8H,Vn.8B,Vm.8B
+Argument Preparation: a → Vd.8H; b → Vn.8B; c → Vm.8B
+Results: Vd.8H → result
+Operation (unsigned, accumulate, and part are fixed by decode; for SABAL, unsigned is FALSE, accumulate is TRUE, and part = 0 selects the lower source half):
+CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then V[d] else Zeros();
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    absdiff = Abs(element1-element2)<2*esize-1:0>;
+    Elem[result, e, 2*esize] = Elem[result, e, 2*esize] + absdiff;
+V[d] = result;
+Supported architectures: v7/A32/A64
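+A short C sketch of the widening form just described; the comment works through one representative lane by hand:
+
+#include <arm_neon.h>
+
+int16x8_t abs_diff_accumulate_wide(int16x8_t acc, int8x8_t b, int8x8_t c) {
+    /* Each 16-bit lane of acc grows by |b[i] - c[i]|. Because the
+       destination lanes are twice as wide, even the extreme case
+       |(-128) - 127| = 255 fits without truncation in a single step. */
+    return vabal_s8(acc, b, c);
+}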
+
+int32x4_t vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)    Signed absolute difference and accumulate long
+Description and Operation: identical to vabal_s8 above, with 16-bit source lanes widened to 32 bits.
+A64 Instruction: SABAL Vd.4S,Vn.4H,Vm.4H
+Argument Preparation: a → Vd.4S; b → Vn.4H; c → Vm.4H
+Results: Vd.4S → result
+Supported architectures: v7/A32/A64
+
+int64x2_t vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)    Signed absolute difference and accumulate long
+Description and Operation: identical to vabal_s8 above, with 32-bit source lanes widened to 64 bits.
+A64 Instruction: SABAL Vd.2D,Vn.2S,Vm.2S
+Argument Preparation: a → Vd.2D; b → Vn.2S; c → Vm.2S
+Results: Vd.2D → result
+Supported architectures: v7/A32/A64
+
+uint16x8_t vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)    Unsigned absolute difference and accumulate long
+Description: Unsigned Absolute difference and Accumulate Long; as vabal_s8 above, except that all the values in this instruction are unsigned integer values.
+A64 Instruction: UABAL Vd.8H,Vn.8B,Vm.8B
+Argument Preparation: a → Vd.8H; b → Vn.8B; c → Vm.8B
+Results: Vd.8H → result
+Operation: identical to vabal_s8 above, with unsigned set to TRUE by decode.
+Supported architectures: v7/A32/A64
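+If you model UABAL in portable C, note that abs/labs do not apply to unsigned operands; the usual idiom is max minus min. A hand-written scalar equivalent of vabal_u8's per-lane step (an illustrative sketch using only standard C; the helper name is made up for this note):
+
+#include <stdint.h>
+
+/* One lane of UABAL: take |n - m| in 8 bits, then accumulate into a
+   16-bit lane with ordinary wrap-around addition. */
+static inline uint16_t uabal_u8_lane(uint16_t acc, uint8_t n, uint8_t m) {
+    uint8_t diff = n > m ? (uint8_t)(n - m) : (uint8_t)(m - n);
+    return (uint16_t)(acc + diff);
+}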
+
+uint32x4_t vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)    Unsigned absolute difference and accumulate long
+Description and Operation: identical to vabal_u8 above, with 16-bit source lanes widened to 32 bits.
+A64 Instruction: UABAL Vd.4S,Vn.4H,Vm.4H
+Argument Preparation: a → Vd.4S; b → Vn.4H; c → Vm.4H
+Results: Vd.4S → result
+Supported architectures: v7/A32/A64
+
+uint64x2_t vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)    Unsigned absolute difference and accumulate long
+Description and Operation: identical to vabal_u8 above, with 32-bit source lanes widened to 64 bits.
+A64 Instruction: UABAL Vd.2D,Vn.2S,Vm.2S
+Argument Preparation: a → Vd.2D; b → Vn.2S; c → Vm.2S
+Results: Vd.2D → result
+Supported architectures: v7/A32/A64
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabal-sabal2-signed-absolute-difference-and-accumulate-long">SABAL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
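+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the ARM reference: it assumes an AArch64 toolchain providing &lt;arm_neon.h&gt;, and the lane values and printed result are illustrative only.</p>
+<pre class="codeblock">/* Illustrative sketch; values are assumptions, not ARM reference material. */
+#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16x8_t acc = vdupq_n_s16(0);  /* accumulator starts at zero */
+    int8x16_t b = vdupq_n_s8(10);
+    int8x16_t c = vdupq_n_s8(3);
+    /* Only the upper eight lanes of b and c take part: |10 - 3| = 7 per lane. */
+    int16x8_t r = vabal_high_s8(acc, b, c);
+    printf("%d\n", vgetq_lane_s16(r, 0));  /* prints 7 */
+    return 0;
+}</pre>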
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabal_high_s16" type="checkbox"><label for="vabal_high_s16"><div>int32x4_t <b>vabal_high_s16</b> (int32x4_t a, int16x8_t b, int16x8_t c)<span class="right">Signed absolute difference and accumulate long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Absolute difference and Accumulate Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabal-sabal2-signed-absolute-difference-and-accumulate-long">SABAL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabal_high_s32" type="checkbox"><label for="vabal_high_s32"><div>int64x2_t <b>vabal_high_s32</b> (int64x2_t a, int32x4_t b, int32x4_t c)<span class="right">Signed absolute difference and accumulate long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Absolute difference and Accumulate Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sabal-sabal2-signed-absolute-difference-and-accumulate-long">SABAL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabal_high_u8" type="checkbox"><label for="vabal_high_u8"><div>uint16x8_t <b>vabal_high_u8</b> (uint16x8_t a, uint8x16_t b, uint8x16_t c)<span class="right">Unsigned absolute difference and accumulate long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Absolute difference and Accumulate Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabal-uabal2-unsigned-absolute-difference-and-accumulate-long">UABAL2</a> Vd.8H,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
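+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the ARM reference; lane values and the printed result are illustrative assumptions. Note that the difference is taken as an unsigned absolute value before accumulation.</p>
+<pre class="codeblock">/* Illustrative sketch; values are assumptions, not ARM reference material. */
+#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint16x8_t acc = vdupq_n_u16(100);  /* pre-loaded accumulator */
+    uint8x16_t b = vdupq_n_u8(2);
+    uint8x16_t c = vdupq_n_u8(9);
+    /* Upper eight lanes: |2 - 9| = 7, accumulated onto 100 in each lane. */
+    uint16x8_t r = vabal_high_u8(acc, b, c);
+    printf("%u\n", (unsigned)vgetq_lane_u16(r, 0));  /* prints 107 */
+    return 0;
+}</pre>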
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabal_high_u16" type="checkbox"><label for="vabal_high_u16"><div>uint32x4_t <b>vabal_high_u16</b> (uint32x4_t a, uint16x8_t b, uint16x8_t c)<span class="right">Unsigned absolute difference and accumulate long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Absolute difference and Accumulate Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabal-uabal2-unsigned-absolute-difference-and-accumulate-long">UABAL2</a> Vd.4S,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabal_high_u32" type="checkbox"><label for="vabal_high_u32"><div>uint64x2_t <b>vabal_high_u32</b> (uint64x2_t a, uint32x4_t b, uint32x4_t c)<span class="right">Unsigned absolute difference and accumulate long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Absolute difference and Accumulate Long. This instruction subtracts the vector elements in the lower or upper half of the second source SIMD&amp;FP register from the corresponding vector elements of the first source SIMD&amp;FP register, and accumulates the absolute values of the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uabal-uabal2-unsigned-absolute-difference-and-accumulate-long">UABAL2</a> Vd.2D,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) absdiff;
+
+result = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ absdiff = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element1-element2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + absdiff;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmax_s8" type="checkbox"><label for="vmax_s8"><div>int8x8_t <b>vmax_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Signed maximum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smax-signed-maximum-vector">SMAX</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
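+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the ARM reference: it assumes an AArch64 toolchain providing &lt;arm_neon.h&gt;; the input lanes and printed maxima are illustrative only.</p>
+<pre class="codeblock">/* Illustrative sketch; values are assumptions, not ARM reference material. */
+#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t xs[8] = {-5, 3, 0, 7, -1, 2, -8, 4};
+    int8_t ys[8] = { 1, -3, 2, 6, 0, 2, 9, -4};
+    /* Lane-wise signed maximum of the two 8-lane vectors. */
+    int8x8_t r = vmax_s8(vld1_s8(xs), vld1_s8(ys));
+    int8_t out[8];
+    vst1_s8(out, r);
+    for (int i = 0; i &lt; 8; i++) printf("%d ", out[i]);  /* 1 3 2 7 0 2 9 4 */
+    return 0;
+}</pre>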
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxq_s8" type="checkbox"><label for="vmaxq_s8"><div>int8x16_t <b>vmaxq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Signed maximum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smax-signed-maximum-vector">SMAX</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmax_s16" type="checkbox"><label for="vmax_s16"><div>int16x4_t <b>vmax_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Signed maximum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smax-signed-maximum-vector">SMAX</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxq_s16" type="checkbox"><label for="vmaxq_s16"><div>int16x8_t <b>vmaxq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Signed maximum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smax-signed-maximum-vector">SMAX</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmax_s32" type="checkbox"><label for="vmax_s32"><div>int32x2_t <b>vmax_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Signed maximum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smax-signed-maximum-vector">SMAX</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxq_s32" type="checkbox"><label for="vmaxq_s32"><div>int32x4_t <b>vmaxq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Signed maximum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smax-signed-maximum-vector">SMAX</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
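+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the ARM reference (lane values are illustrative), showing the 128-bit q-form operating on four 32-bit lanes.</p>
+<pre class="codeblock">/* Illustrative sketch; values are assumptions, not ARM reference material. */
+#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32_t xs[4] = {-100, 42, 7, -1};
+    int32_t ys[4] = {  50, 41, 7, -2};
+    /* Lane-wise signed maximum across all four 32-bit lanes. */
+    int32x4_t r = vmaxq_s32(vld1q_s32(xs), vld1q_s32(ys));
+    int32_t out[4];
+    vst1q_s32(out, r);
+    for (int i = 0; i &lt; 4; i++) printf("%d ", out[i]);  /* 50 42 7 -1 */
+    return 0;
+}</pre>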
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmax_u8" type="checkbox"><label for="vmax_u8"><div>uint8x8_t <b>vmax_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned maximum</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umax-unsigned-maximum-vector">UMAX</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxq_u8" type="checkbox"><label for="vmaxq_u8"><div>uint8x16_t <b>vmaxq_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned maximum</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umax-unsigned-maximum-vector">UMAX</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmax_u16" type="checkbox"><label for="vmax_u16"><div>uint16x4_t <b>vmax_u16</b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned maximum</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umax-unsigned-maximum-vector">UMAX</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxq_u16" type="checkbox"><label for="vmaxq_u16"><div>uint16x8_t <b>vmaxq_u16</b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned maximum</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umax-unsigned-maximum-vector">UMAX</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmax_u32" type="checkbox"><label for="vmax_u32"><div>uint32x2_t <b>vmax_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned maximum</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umax-unsigned-maximum-vector">UMAX</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxq_u32" type="checkbox"><label for="vmaxq_u32"><div>uint32x4_t <b>vmaxq_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned maximum</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the larger of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umax-unsigned-maximum-vector">UMAX</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmax_f32" type="checkbox"><label for="vmax_f32"><div>float32x2_t <b>vmax_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point maximum</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Maximum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the larger of each of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmax-vector-floating-point-maximum-vector">FMAX</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxq_f32" type="checkbox"><label for="vmaxq_f32"><div>float32x4_t <b>vmaxq_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point maximum</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Maximum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the larger of each of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmax-vector-floating-point-maximum-vector">FMAX</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmax_f64" type="checkbox"><label for="vmax_f64"><div>float64x1_t <b>vmax_f64</b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point maximum</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Maximum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the larger of each of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmax-vector-floating-point-maximum-vector">FMAX</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxq_f64" type="checkbox"><label for="vmaxq_f64"><div>float64x2_t <b>vmaxq_f64</b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point maximum</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Maximum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the larger of each of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmax-vector-floating-point-maximum-vector">FMAX</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmin_s8" type="checkbox"><label for="vmin_s8"><div>int8x8_t <b>vmin_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Signed minimum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smin-signed-minimum-vector">SMIN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vminq_s8" type="checkbox"><label for="vminq_s8"><div>int8x16_t <b>vminq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Signed minimum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smin-signed-minimum-vector">SMIN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmin_s16" type="checkbox"><label for="vmin_s16"><div>int16x4_t <b>vmin_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Signed minimum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smin-signed-minimum-vector">SMIN</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vminq_s16" type="checkbox"><label for="vminq_s16"><div>int16x8_t <b>vminq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Signed minimum</span></div></label><article> <h4>Description</h4><p class="aml">Signed Minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smin-signed-minimum-vector">SMIN</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmin_s32" type="checkbox"><label for="vmin_s32"><div>int32x2_t <b><b>vmin_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smin-signed-minimum-vector">SMIN</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vminq_s32" type="checkbox"><label for="vminq_s32"><div>int32x4_t <b><b>vminq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smin-signed-minimum-vector">SMIN</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmin_u8" type="checkbox"><label for="vmin_u8"><div>uint8x8_t <b><b>vmin_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the smaller of each of the two unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umin-unsigned-minimum-vector">UMIN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vminq_u8" type="checkbox"><label for="vminq_u8"><div>uint8x16_t <b><b>vminq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the smaller of each of the two unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umin-unsigned-minimum-vector">UMIN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmin_u16" type="checkbox"><label for="vmin_u16"><div>uint16x4_t <b><b>vmin_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the smaller of each of the two unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umin-unsigned-minimum-vector">UMIN</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vminq_u16" type="checkbox"><label for="vminq_u16"><div>uint16x8_t <b><b>vminq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the smaller of each of the two unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umin-unsigned-minimum-vector">UMIN</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmin_u32" type="checkbox"><label for="vmin_u32"><div>uint32x2_t <b><b>vmin_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the smaller of each of the two unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umin-unsigned-minimum-vector">UMIN</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vminq_u32" type="checkbox"><label for="vminq_u32"><div>uint32x4_t <b><b>vminq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, places the smaller of each of the two unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umin-unsigned-minimum-vector">UMIN</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmin_f32" type="checkbox"><label for="vmin_f32"><div>float32x2_t <b><b>vmin_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmin-vector-floating-point-minimum-vector">FMIN</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vminq_f32" type="checkbox"><label for="vminq_f32"><div>float32x4_t <b><b>vminq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmin-vector-floating-point-minimum-vector">FMIN</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmin_f64" type="checkbox"><label for="vmin_f64"><div>float64x1_t <b><b>vmin_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmin-vector-floating-point-minimum-vector">FMIN</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminq_f64" type="checkbox"><label for="vminq_f64"><div>float64x2_t <b><b>vminq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point minimum</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point minimum (vector). This instruction compares corresponding elements in the vectors in the two source SIMD&amp;FP registers, places the smaller of each of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmin-vector-floating-point-minimum-vector">FMIN</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxnm_f32" type="checkbox"><label for="vmaxnm_f32"><div>float32x2_t <b><b>vmaxnm_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point maximum number</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, writes the larger of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnm-vector-floating-point-maximum-number-vector">FMAXNM</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxnmq_f32" type="checkbox"><label for="vmaxnmq_f32"><div>float32x4_t <b><b>vmaxnmq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point maximum number</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, writes the larger of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnm-vector-floating-point-maximum-number-vector">FMAXNM</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vmaxnm_f64" type="checkbox"><label for="vmaxnm_f64"><div>float64x1_t <b><b>vmaxnm_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point maximum number</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, writes the larger of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnm-vector-floating-point-maximum-number-vector">FMAXNM</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxnmq_f64" type="checkbox"><label for="vmaxnmq_f64"><div>float64x2_t <b><b>vmaxnmq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point maximum number</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, writes the larger of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnm-vector-floating-point-maximum-number-vector">FMAXNM</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminnm_f32" type="checkbox"><label for="vminnm_f32"><div>float32x2_t <b><b>vminnm_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point minimum number</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, writes the smaller of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnm-vector-floating-point-minimum-number-vector">FMINNM</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vminnmq_f32" type="checkbox"><label for="vminnmq_f32"><div>float32x4_t <b><b>vminnmq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point minimum number</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, writes the smaller of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnm-vector-floating-point-minimum-number-vector">FMINNM</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vminnm_f64" type="checkbox"><label for="vminnm_f64"><div>float64x1_t <b><b>vminnm_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point minimum number</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, writes the smaller of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnm-vector-floating-point-minimum-number-vector">FMINNM</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminnmq_f64" type="checkbox"><label for="vminnmq_f64"><div>float64x2_t <b><b>vminnmq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point minimum number</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number (vector). This instruction compares corresponding vector elements in the two source SIMD&amp;FP registers, writes the smaller of the two floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnm-vector-floating-point-minimum-number-vector">FMINNM</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshl_s8" type="checkbox"><label for="vshl_s8"><div>int8x8_t <b><b>vshl_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_s8" type="checkbox"><label for="vshlq_s8"><div>int8x16_t <b><b>vshlq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_s16" type="checkbox"><label for="vshl_s16"><div>int16x4_t <b><b>vshl_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_s16" type="checkbox"><label for="vshlq_s16"><div>int16x8_t <b><b>vshlq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_s32" type="checkbox"><label for="vshl_s32"><div>int32x2_t <b><b>vshl_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_s32" type="checkbox"><label for="vshlq_s32"><div>int32x4_t <b><b>vshlq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_s64" type="checkbox"><label for="vshl_s64"><div>int64x1_t <b><b>vshl_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_s64" type="checkbox"><label for="vshlq_s64"><div>int64x2_t <b><b>vshlq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_u8" type="checkbox"><label for="vshl_u8"><div>uint8x8_t <b><b>vshl_u8</b></b> (uint8x8_t a, int8x8_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_u8" type="checkbox"><label for="vshlq_u8"><div>uint8x16_t <b><b>vshlq_u8</b></b> (uint8x16_t a, int8x16_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_u16" type="checkbox"><label for="vshl_u16"><div>uint16x4_t <b><b>vshl_u16</b></b> (uint16x4_t a, int16x4_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_u16" type="checkbox"><label for="vshlq_u16"><div>uint16x8_t <b><b>vshlq_u16</b></b> (uint16x8_t a, int16x8_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_u32" type="checkbox"><label for="vshl_u32"><div>uint32x2_t <b><b>vshl_u32</b></b> (uint32x2_t a, int32x2_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_u32" type="checkbox"><label for="vshlq_u32"><div>uint32x4_t <b><b>vshlq_u32</b></b> (uint32x4_t a, int32x4_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_u64" type="checkbox"><label for="vshl_u64"><div>uint64x1_t <b><b>vshl_u64</b></b> (uint64x1_t a, int64x1_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_u64" type="checkbox"><label for="vshlq_u64"><div>uint64x2_t <b><b>vshlq_u64</b></b> (uint64x2_t a, int64x2_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshld_s64" type="checkbox"><label for="vshld_s64"><div>int64_t <b><b>vshld_s64</b></b> (int64_t a, int64_t b)<span class="right">Signed shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts each value by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshl-signed-shift-left-register">SSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshld_u64" type="checkbox"><label for="vshld_u64"><div>uint64_t <b><b>vshld_u64</b></b> (uint64_t a, int64_t b)<span class="right">Unsigned shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushl-unsigned-shift-left-register">USHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshl_s8" type="checkbox"><label for="vqshl_s8"><div>int8x8_t <b><b>vqshl_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_s8" type="checkbox"><label for="vqshlq_s8"><div>int8x16_t <b><b>vqshlq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_s16" type="checkbox"><label for="vqshl_s16"><div>int16x4_t <b><b>vqshl_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_s16" type="checkbox"><label for="vqshlq_s16"><div>int16x8_t <b><b>vqshlq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_s32" type="checkbox"><label for="vqshl_s32"><div>int32x2_t <b><b>vqshl_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_s32" type="checkbox"><label for="vqshlq_s32"><div>int32x4_t <b><b>vqshlq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_s64" type="checkbox"><label for="vqshl_s64"><div>int64x1_t <b><b>vqshl_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_s64" type="checkbox"><label for="vqshlq_s64"><div>int64x2_t <b><b>vqshlq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_u8" type="checkbox"><label for="vqshl_u8"><div>uint8x8_t <b><b>vqshl_u8</b></b> (uint8x8_t a, int8x8_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_u8" type="checkbox"><label for="vqshlq_u8"><div>uint8x16_t <b><b>vqshlq_u8</b></b> (uint8x16_t a, int8x16_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_u16" type="checkbox"><label for="vqshl_u16"><div>uint16x4_t <b><b>vqshl_u16</b></b> (uint16x4_t a, int16x4_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_u16" type="checkbox"><label for="vqshlq_u16"><div>uint16x8_t <b><b>vqshlq_u16</b></b> (uint16x8_t a, int16x8_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_u32" type="checkbox"><label for="vqshl_u32"><div>uint32x2_t <b><b>vqshl_u32</b></b> (uint32x2_t a, int32x2_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_u32" type="checkbox"><label for="vqshlq_u32"><div>uint32x4_t <b><b>vqshlq_u32</b></b> (uint32x4_t a, int32x4_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_u64" type="checkbox"><label for="vqshl_u64"><div>uint64x1_t <b><b>vqshl_u64</b></b> (uint64x1_t a, int64x1_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_u64" type="checkbox"><label for="vqshlq_u64"><div>uint64x2_t <b><b>vqshlq_u64</b></b> (uint64x2_t a, int64x2_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlb_s8" type="checkbox"><label for="vqshlb_s8"><div>int8_t <b><b>vqshlb_s8</b></b> (int8_t a, int8_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Bd,Bn,Bm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+b &rarr; Bm </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlh_s16" type="checkbox"><label for="vqshlh_s16"><div>int16_t <b><b>vqshlh_s16</b></b> (int16_t a, int16_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshls_s32" type="checkbox"><label for="vqshls_s32"><div>int32_t <b><b>vqshls_s32</b></b> (int32_t a, int32_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshld_s64" type="checkbox"><label for="vqshld_s64"><div>int64_t <b><b>vqshld_s64</b></b> (int64_t a, int64_t b)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts each element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlb_u8" type="checkbox"><label for="vqshlb_u8"><div>uint8_t <b><b>vqshlb_u8</b></b> (uint8_t a, int8_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Bd,Bn,Bm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+b &rarr; Bm </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlh_u16" type="checkbox"><label for="vqshlh_u16"><div>uint16_t <b><b>vqshlh_u16</b></b> (uint16_t a, int16_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshls_u32" type="checkbox"><label for="vqshls_u32"><div>uint32_t <b><b>vqshls_u32</b></b> (uint32_t a, int32_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshld_u64" type="checkbox"><label for="vqshld_u64"><div>uint64_t <b><b>vqshld_u64</b></b> (uint64_t a, int64_t b)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshl_s8" type="checkbox"><label for="vrshl_s8"><div>int8x8_t <b>vrshl_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshlq_s8" type="checkbox"><label for="vrshlq_s8"><div>int8x16_t <b>vrshlq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshl_s16" type="checkbox"><label for="vrshl_s16"><div>int16x4_t <b>vrshl_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshlq_s16" type="checkbox"><label for="vrshlq_s16"><div>int16x8_t <b>vrshlq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshl_s32" type="checkbox"><label for="vrshl_s32"><div>int32x2_t <b>vrshl_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshlq_s32" type="checkbox"><label for="vrshlq_s32"><div>int32x4_t <b>vrshlq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshl_s64" type="checkbox"><label for="vrshl_s64"><div>int64x1_t <b>vrshl_s64</b> (int64x1_t a, int64x1_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshlq_s64" type="checkbox"><label for="vrshlq_s64"><div>int64x2_t <b>vrshlq_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshl_u8" type="checkbox"><label for="vrshl_u8"><div>uint8x8_t <b>vrshl_u8</b> (uint8x8_t a, int8x8_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshlq_u8" type="checkbox"><label for="vrshlq_u8"><div>uint8x16_t <b>vrshlq_u8</b> (uint8x16_t a, int8x16_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshl_u16" type="checkbox"><label for="vrshl_u16"><div>uint16x4_t <b>vrshl_u16</b> (uint16x4_t a, int16x4_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshlq_u16" type="checkbox"><label for="vrshlq_u16"><div>uint16x8_t <b>vrshlq_u16</b> (uint16x8_t a, int16x8_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
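+ <h4>Example</h4> <p>A minimal C sketch (an editor's illustration, not part of the ARM reference) of how this intrinsic behaves, assuming a NEON-capable toolchain and the standard arm_neon.h header. A negative shift element selects a rounding right shift, which is where the round_const term in the pseudocode above takes effect.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint16x8_t a = vdupq_n_u16(7);   /* every lane holds 7 */
+    int16x8_t  b = vdupq_n_s16(-1);  /* negative shift: rounding right shift by 1 */
+    uint16x8_t r = vrshlq_u16(a, b); /* (7 + 1) &gt;&gt; 1 = 4 in each lane */
+    printf("%u\n", (unsigned)vgetq_lane_u16(r, 0)); /* prints 4 */
+    return 0;
+}</pre>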
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshl_u32" type="checkbox"><label for="vrshl_u32"><div>uint32x2_t <b>vrshl_u32</b> (uint32x2_t a, int32x2_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshlq_u32" type="checkbox"><label for="vrshlq_u32"><div>uint32x4_t <b>vrshlq_u32</b> (uint32x4_t a, int32x4_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshl_u64" type="checkbox"><label for="vrshl_u64"><div>uint64x1_t <b>vrshl_u64</b> (uint64x1_t a, int64x1_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
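+ <h4>Example</h4> <p>A brief C sketch (an editor's illustration, assuming arm_neon.h) of the one-lane 64-bit form. With a non-negative shift the round_const in the pseudocode stays zero, so the operation reduces to a plain left shift.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;inttypes.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64x1_t a = vdup_n_u64(3);
+    int64x1_t  b = vdup_n_s64(2);   /* positive shift: no rounding involved */
+    uint64x1_t r = vrshl_u64(a, b); /* 3 &lt;&lt; 2 = 12 */
+    printf("%" PRIu64 "\n", vget_lane_u64(r, 0)); /* prints 12 */
+    return 0;
+}</pre>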
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshlq_u64" type="checkbox"><label for="vrshlq_u64"><div>uint64x2_t <b>vrshlq_u64</b> (uint64x2_t a, int64x2_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshld_s64" type="checkbox"><label for="vrshld_s64"><div>int64_t <b>vrshld_s64</b> (int64_t a, int64_t b)<span class="right">Signed rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed Rounding Shift Left (register). This instruction takes each signed integer value in the vector of the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshl-signed-rounding-shift-left-register">SRSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
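+ <h4>Example</h4> <p>A short C sketch (an editor's illustration, assuming arm_neon.h on an A64 target) of the scalar form. The rounding constant is added before the arithmetic right shift, so -5 shifted right by one rounds to -2 rather than the -3 a plain arithmetic shift would give.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;inttypes.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64_t r = vrshld_s64(-5, -1); /* rounding right shift: (-5 + 1) &gt;&gt; 1 = -2 */
+    printf("%" PRId64 "\n", r);     /* prints -2 */
+    return 0;
+}</pre>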
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshld_u64" type="checkbox"><label for="vrshld_u64"><div>uint64_t <b>vrshld_u64</b> (uint64_t a, int64_t b)<span class="right">Unsigned rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Rounding Shift Left (register). This instruction takes each element in the vector of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding element of the second source SIMD&amp;FP register, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshl-unsigned-rounding-shift-left-register">URSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshl_s8" type="checkbox"><label for="vqrshl_s8"><div>int8x8_t <b>vqrshl_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
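+ <h4>Example</h4> <p>A minimal C sketch (an editor's illustration, assuming arm_neon.h) showing the saturating behaviour. Shifting 100 left by one would give 200, which does not fit in int8_t, so the result saturates to 127 and FPSR.QC is set, as in the SatQ step of the pseudocode above.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8x8_t a = vdup_n_s8(100);
+    int8x8_t b = vdup_n_s8(1);    /* shift left by 1 */
+    int8x8_t r = vqrshl_s8(a, b); /* 200 saturates to 127; FPSR.QC set */
+    printf("%d\n", vget_lane_s8(r, 0)); /* prints 127 */
+    return 0;
+}</pre>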
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlq_s8" type="checkbox"><label for="vqrshlq_s8"><div>int8x16_t <b>vqrshlq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshl_s16" type="checkbox"><label for="vqrshl_s16"><div>int16x4_t <b>vqrshl_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlq_s16" type="checkbox"><label for="vqrshlq_s16"><div>int16x8_t <b>vqrshlq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshl_s32" type="checkbox"><label for="vqrshl_s32"><div>int32x2_t <b>vqrshl_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlq_s32" type="checkbox"><label for="vqrshlq_s32"><div>int32x4_t <b>vqrshlq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
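+<p>A brief usage sketch, not part of the ARM reference: it assumes the Rust binding core::arch::aarch64::vqrshlq_s32 that this stdarch tree provides, and reads lanes back with an illustrative transmute.</p>
+<pre class="codeblock">// Hedged sketch: negative lane values in `b` select a rounded right shift,
+// positive values a saturating left shift, matching the pseudocode above.
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vqrshlq_s32() {
+    use core::arch::aarch64::{vdupq_n_s32, vqrshlq_s32};
+    // Rounded right shift by 1: (5 + round_const 1) / 2 == 3 in every lane.
+    let r: [i32; 4] = core::mem::transmute(vqrshlq_s32(vdupq_n_s32(5), vdupq_n_s32(-1)));
+    assert_eq!(r, [3; 4]);
+    // An overflowing left shift saturates at i32::MAX (and sets FPSR.QC).
+    let s: [i32; 4] = core::mem::transmute(vqrshlq_s32(vdupq_n_s32(i32::MAX), vdupq_n_s32(1)));
+    assert_eq!(s, [i32::MAX; 4]);
+}</pre>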
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshl_s64" type="checkbox"><label for="vqrshl_s64"><div>int64x1_t <b>vqrshl_s64</b> (int64x1_t a, int64x1_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
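+<p>A brief usage sketch under the same assumptions (Rust's core::arch::aarch64 bindings; transmute used only to inspect the single lane):</p>
+<pre class="codeblock">// Hedged sketch: the Dd,Dn,Dm form operates on one 64-bit lane (int64x1_t).
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vqrshl_s64() {
+    use core::arch::aarch64::{vdup_n_s64, vqrshl_s64};
+    // Rounded right shift by 2: (11 + round_const 2) / 4 == 3.
+    let r: i64 = core::mem::transmute(vqrshl_s64(vdup_n_s64(11), vdup_n_s64(-2)));
+    assert_eq!(r, 3);
+}</pre>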
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlq_s64" type="checkbox"><label for="vqrshlq_s64"><div>int64x2_t <b>vqrshlq_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshl_u8" type="checkbox"><label for="vqrshl_u8"><div>uint8x8_t <b>vqrshl_u8</b> (uint8x8_t a, int8x8_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
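+<p>A brief usage sketch, again assuming the core::arch::aarch64 bindings; note the mixed signedness of the two arguments:</p>
+<pre class="codeblock">// Hedged sketch: the data vector is unsigned, the shift-count vector stays
+// signed, and saturation clamps to the unsigned lane range.
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vqrshl_u8() {
+    use core::arch::aarch64::{vdup_n_s8, vdup_n_u8, vqrshl_u8};
+    // 200 shifted left by 1 is 400, which saturates to u8::MAX.
+    let r: [u8; 8] = core::mem::transmute(vqrshl_u8(vdup_n_u8(200), vdup_n_s8(1)));
+    assert_eq!(r, [255; 8]);
+    // Rounded right shift by 3: (200 + round_const 4) / 8 == 25.
+    let s: [u8; 8] = core::mem::transmute(vqrshl_u8(vdup_n_u8(200), vdup_n_s8(-3)));
+    assert_eq!(s, [25; 8]);
+}</pre>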
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlq_u8" type="checkbox"><label for="vqrshlq_u8"><div>uint8x16_t <b>vqrshlq_u8</b> (uint8x16_t a, int8x16_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshl_u16" type="checkbox"><label for="vqrshl_u16"><div>uint16x4_t <b>vqrshl_u16</b> (uint16x4_t a, int16x4_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlq_u16" type="checkbox"><label for="vqrshlq_u16"><div>uint16x8_t <b>vqrshlq_u16</b> (uint16x8_t a, int16x8_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshl_u32" type="checkbox"><label for="vqrshl_u32"><div>uint32x2_t <b>vqrshl_u32</b> (uint32x2_t a, int32x2_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlq_u32" type="checkbox"><label for="vqrshlq_u32"><div>uint32x4_t <b>vqrshlq_u32</b> (uint32x4_t a, int32x4_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshl_u64" type="checkbox"><label for="vqrshl_u64"><div>uint64x1_t <b>vqrshl_u64</b> (uint64x1_t a, int64x1_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlq_u64" type="checkbox"><label for="vqrshlq_u64"><div>uint64x2_t <b>vqrshlq_u64</b> (uint64x2_t a, int64x2_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshlb_s8" type="checkbox"><label for="vqrshlb_s8"><div>int8_t <b>vqrshlb_s8</b> (int8_t a, int8_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Bd,Bn,Bm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+b &rarr; Bm </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
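+<p>A brief usage sketch assuming the core::arch::aarch64 binding; the scalar form needs no vector plumbing at all:</p>
+<pre class="codeblock">// Hedged sketch: the scalar B-register form takes and returns plain i8.
+#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vqrshlb_s8() {
+    use core::arch::aarch64::vqrshlb_s8;
+    assert_eq!(vqrshlb_s8(100, 1), 127);  // 200 saturates to i8::MAX
+    assert_eq!(vqrshlb_s8(7, -2), 2);     // (7 + round_const 2) / 4 == 2
+}</pre>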
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshlh_s16" type="checkbox"><label for="vqrshlh_s16"><div>int16_t <b>vqrshlh_s16</b> (int16_t a, int16_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
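+ <h4>Usage example</h4> <p>A minimal C sketch (illustrative only, assuming an AArch64 toolchain that provides arm_neon.h); the commented results follow from the operation pseudocode above.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 16384 &lt;&lt; 2 overflows int16_t, so the result saturates to 32767. */
+    int16_t sat = vqrshlh_s16(16384, 2);
+    /* A negative shift count shifts right with rounding: (5 + 1) &gt;&gt; 1 = 3. */
+    int16_t rnd = vqrshlh_s16(5, -1);
+    printf("%d %d\n", sat, rnd);   /* 32767 3 */
+    return 0;
+}</pre>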
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshls_s32" type="checkbox"><label for="vqrshls_s32"><div>int32_t <b><b>vqrshls_s32</b></b> (int32_t a, int32_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
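+ <h4>Usage example</h4> <p>The same behaviour at 32 bits; a brief C sketch, assuming arm_neon.h on an AArch64 target.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* (1 &lt;&lt; 30) &lt;&lt; 2 overflows int32_t and saturates to INT32_MAX. */
+    int32_t sat = vqrshls_s32(1 &lt;&lt; 30, 2);
+    /* Rounding right shift: (7 + 2) &gt;&gt; 2 = 2 (a truncating shift would give 1). */
+    int32_t rnd = vqrshls_s32(7, -2);
+    printf("%d %d\n", sat, rnd);   /* 2147483647 2 */
+    return 0;
+}</pre>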
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshld_s64" type="checkbox"><label for="vqrshld_s64"><div>int64_t <b><b>vqrshld_s64</b></b> (int64_t a, int64_t b)<span class="right">Signed saturating rounding shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Shift Left (register). This instruction takes each vector element in the first source SIMD&amp;FP register, shifts it by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshl-signed-saturating-rounding-shift-left-register">SQRSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
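+ <h4>Usage example</h4> <p>A short C sketch for the 64-bit scalar form, again assuming arm_neon.h on AArch64.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* (1 &lt;&lt; 62) &lt;&lt; 2 overflows int64_t and saturates to INT64_MAX. */
+    int64_t sat = vqrshld_s64(1LL &lt;&lt; 62, 2);
+    /* Rounding right shift: (3 + 1) &gt;&gt; 1 = 2. */
+    int64_t rnd = vqrshld_s64(3, -1);
+    printf("%lld %lld\n", (long long)sat, (long long)rnd);   /* 9223372036854775807 2 */
+    return 0;
+}</pre>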
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshlb_u8" type="checkbox"><label for="vqrshlb_u8"><div>uint8_t <b><b>vqrshlb_u8</b></b> (uint8_t a, int8_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Bd,Bn,Bm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+b &rarr; Bm </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
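+ <h4>Usage example</h4> <p>An illustrative C sketch of the unsigned 8-bit form (assuming arm_neon.h on AArch64); note that the shift count b is signed even though a is unsigned.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 200 &lt;&lt; 1 = 400 overflows uint8_t and saturates to 255. */
+    uint8_t sat = vqrshlb_u8(200, 1);
+    /* Rounding right shift: (5 + 1) &gt;&gt; 1 = 3. */
+    uint8_t rnd = vqrshlb_u8(5, -1);
+    printf("%u %u\n", (unsigned)sat, (unsigned)rnd);   /* 255 3 */
+    return 0;
+}</pre>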
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshlh_u16" type="checkbox"><label for="vqrshlh_u16"><div>uint16_t <b><b>vqrshlh_u16</b></b> (uint16_t a, int16_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Hd,Hn,Hm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+b &rarr; Hm </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
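+ <h4>Usage example</h4> <p>The 16-bit unsigned variant; a minimal sketch under the same arm_neon.h assumption.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 40000 &lt;&lt; 1 = 80000 saturates to UINT16_MAX (65535). */
+    uint16_t sat = vqrshlh_u16(40000, 1);
+    /* Rounding right shift: (9 + 2) &gt;&gt; 2 = 2. */
+    uint16_t rnd = vqrshlh_u16(9, -2);
+    printf("%u %u\n", (unsigned)sat, (unsigned)rnd);   /* 65535 2 */
+    return 0;
+}</pre>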
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshls_u32" type="checkbox"><label for="vqrshls_u32"><div>uint32_t <b><b>vqrshls_u32</b></b> (uint32_t a, int32_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
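+ <h4>Usage example</h4> <p>The 32-bit unsigned variant, sketched in C under the same assumptions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 3000000000 &lt;&lt; 1 overflows uint32_t and saturates to 4294967295. */
+    uint32_t sat = vqrshls_u32(3000000000u, 1);
+    /* Rounding right shift: (5 + 1) &gt;&gt; 1 = 3. */
+    uint32_t rnd = vqrshls_u32(5, -1);
+    printf("%u %u\n", (unsigned)sat, (unsigned)rnd);   /* 4294967295 3 */
+    return 0;
+}</pre>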
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshld_u64" type="checkbox"><label for="vqrshld_u64"><div>uint64_t <b><b>vqrshld_u64</b></b> (uint64_t a, int64_t b)<span class="right">Unsigned saturating rounding shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounding Shift Left (register). This instruction takes each vector element of the first source SIMD&amp;FP register, shifts the vector element by a value from the least significant byte of the corresponding vector element of the second source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshl-unsigned-saturating-rounding-shift-left-register">UQRSHL</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
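+ <h4>Usage example</h4> <p>The 64-bit unsigned variant; an illustrative sketch, assuming arm_neon.h on AArch64.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* (1 &lt;&lt; 63) &lt;&lt; 1 overflows uint64_t and saturates to UINT64_MAX. */
+    uint64_t sat = vqrshld_u64(1ULL &lt;&lt; 63, 1);
+    /* Rounding right shift: (7 + 2) &gt;&gt; 2 = 2. */
+    uint64_t rnd = vqrshld_u64(7, -2);
+    printf("%llu %llu\n", (unsigned long long)sat, (unsigned long long)rnd);
+    return 0;
+}</pre>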
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshr_n_s8" type="checkbox"><label for="vshr_n_s8"><div>int8x8_t <b><b>vshr_n_s8</b></b> (int8x8_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
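+ <h4>Usage example</h4> <p>A minimal C sketch (assuming arm_neon.h on an AArch64 or NEON-capable target); n must be a compile-time constant in the range shown above, and the arithmetic shift rounds toward minus infinity for negative lanes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8x8_t a = vdup_n_s8(-9);           /* all eight lanes hold -9 */
+    int8x8_t r = vshr_n_s8(a, 2);         /* each lane: -9 &gt;&gt; 2 = -3 */
+    printf("%d\n", vget_lane_s8(r, 0));   /* -3 */
+    return 0;
+}</pre>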
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrq_n_s8" type="checkbox"><label for="vshrq_n_s8"><div>int8x16_t <b><b>vshrq_n_s8</b></b> (int8x16_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
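+ <h4>Usage example</h4> <p>The 128-bit form operates on sixteen lanes at once; a brief sketch under the same assumptions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8x16_t a = vdupq_n_s8(100);
+    int8x16_t r = vshrq_n_s8(a, 3);        /* each lane: 100 &gt;&gt; 3 = 12 */
+    printf("%d\n", vgetq_lane_s8(r, 0));   /* 12 */
+    return 0;
+}</pre>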
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshr_n_s16" type="checkbox"><label for="vshr_n_s16"><div>int16x4_t <b><b>vshr_n_s16</b></b> (int16x4_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
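+ <h4>Usage example</h4> <p>A 16-bit sketch showing the behaviour on a negative lane, under the same arm_neon.h assumption.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16x4_t a = vdup_n_s16(-1000);
+    int16x4_t r = vshr_n_s16(a, 4);         /* each lane: -1000 &gt;&gt; 4 = -63 */
+    printf("%d\n", vget_lane_s16(r, 0));    /* -63 */
+    return 0;
+}</pre>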
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrq_n_s16" type="checkbox"><label for="vshrq_n_s16"><div>int16x8_t <b><b>vshrq_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
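+ <h4>Usage example</h4> <p>The eight-lane 16-bit form; an illustrative sketch.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16x8_t a = vdupq_n_s16(32767);
+    int16x8_t r = vshrq_n_s16(a, 8);         /* each lane: 32767 &gt;&gt; 8 = 127 */
+    printf("%d\n", vgetq_lane_s16(r, 0));    /* 127 */
+    return 0;
+}</pre>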
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshr_n_s32" type="checkbox"><label for="vshr_n_s32"><div>int32x2_t <b><b>vshr_n_s32</b></b> (int32x2_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
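+ <h4>Usage example</h4> <p>A two-lane 32-bit sketch under the same arm_neon.h assumption.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32x2_t a = vdup_n_s32(-100000);
+    int32x2_t r = vshr_n_s32(a, 5);         /* each lane: -100000 &gt;&gt; 5 = -3125 */
+    printf("%d\n", vget_lane_s32(r, 0));    /* -3125 */
+    return 0;
+}</pre>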
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrq_n_s32" type="checkbox"><label for="vshrq_n_s32"><div>int32x4_t <b><b>vshrq_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
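+ <h4>Usage example</h4> <p>The four-lane 32-bit form; a minimal sketch.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32x4_t a = vdupq_n_s32(1 &lt;&lt; 20);
+    int32x4_t r = vshrq_n_s32(a, 10);        /* each lane: (1 &lt;&lt; 20) &gt;&gt; 10 = 1024 */
+    printf("%d\n", vgetq_lane_s32(r, 0));    /* 1024 */
+    return 0;
+}</pre>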
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshr_n_s64" type="checkbox"><label for="vshr_n_s64"><div>int64x1_t <b><b>vshr_n_s64</b></b> (int64x1_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
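+ <h4>Usage example</h4> <p>The single-lane 64-bit form; shifting -1 right by 63 illustrates that the shift is arithmetic (the sign bit is replicated). Same assumptions as above.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64x1_t a = vdup_n_s64(-1);
+    int64x1_t r = vshr_n_s64(a, 63);         /* -1 &gt;&gt; 63 = -1 under an arithmetic shift */
+    printf("%lld\n", (long long)vget_lane_s64(r, 0));   /* -1 */
+    return 0;
+}</pre>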
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrq_n_s64" type="checkbox"><label for="vshrq_n_s64"><div>int64x2_t <b><b>vshrq_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
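+ <h4>Example</h4> <p>Editor's note: a minimal usage sketch, not part of the ARM reference. It assumes a C compiler targeting AArch64 with arm_neon.h available; the input values are arbitrary. The arithmetic right shift truncates towards minus infinity in each lane.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int64x2_t shift_right_demo(int64x2_t a) {
+    /* n must be a compile-time constant, 1 &le; n &le; 64 */
+    return vshrq_n_s64(a, 2);  /* {-9, 16} becomes {-3, 4} */
+}</pre>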
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshr_n_u8" type="checkbox"><label for="vshr_n_u8"><div>uint8x8_t <b><b>vshr_n_u8</b></b> (uint8x8_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
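+ <h4>Example</h4> <p>Editor's note: an illustrative sketch, not part of the ARM reference; it assumes arm_neon.h on an AArch64 target. The unsigned form shifts in zero bits, so each byte is divided by 2^n with the remainder discarded.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint8x8_t halve_bytes(uint8x8_t a) {
+    /* n must be a compile-time constant, 1 &le; n &le; 8 */
+    return vshr_n_u8(a, 1);  /* per lane: 255 &rarr; 127, 7 &rarr; 3 */
+}</pre>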
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrq_n_u8" type="checkbox"><label for="vshrq_n_u8"><div>uint8x16_t <b><b>vshrq_n_u8</b></b> (uint8x16_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshr_n_u16" type="checkbox"><label for="vshr_n_u16"><div>uint16x4_t <b><b>vshr_n_u16</b></b> (uint16x4_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrq_n_u16" type="checkbox"><label for="vshrq_n_u16"><div>uint16x8_t <b><b>vshrq_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshr_n_u32" type="checkbox"><label for="vshr_n_u32"><div>uint32x2_t <b><b>vshr_n_u32</b></b> (uint32x2_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrq_n_u32" type="checkbox"><label for="vshrq_n_u32"><div>uint32x4_t <b><b>vshrq_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshr_n_u64" type="checkbox"><label for="vshr_n_u64"><div>uint64x1_t <b><b>vshr_n_u64</b></b> (uint64x1_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrq_n_u64" type="checkbox"><label for="vshrq_n_u64"><div>uint64x2_t <b><b>vshrq_n_u64</b></b> (uint64x2_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrd_n_s64" type="checkbox"><label for="vshrd_n_s64"><div>int64_t <b><b>vshrd_n_s64</b></b> (int64_t a, const int n)<span class="right">Signed shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshr-signed-shift-right-immediate">SSHR</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
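+ <h4>Example</h4> <p>Editor's note: an illustrative sketch, not part of the ARM reference; it assumes arm_neon.h on an AArch64 target. The scalar form applies the same truncating arithmetic shift to a single 64-bit value held in a SIMD&amp;FP register.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int64_t scalar_shift_demo(int64_t a) {
+    /* n must be a compile-time constant, 1 &le; n &le; 64 */
+    return vshrd_n_s64(a, 3);  /* -24 becomes -3; for rounded results use vrshrd_n_s64 */
+}</pre>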
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshrd_n_u64" type="checkbox"><label for="vshrd_n_u64"><div>uint64_t <b><b>vshrd_n_u64</b></b> (uint64_t a, const int n)<span class="right">Unsigned shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushr-unsigned-shift-right-immediate">USHR</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshl_n_s8" type="checkbox"><label for="vshl_n_s8"><div>int8x8_t <b><b>vshl_n_s8</b></b> (int8x8_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
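+ <h4>Example</h4> <p>Editor's note: an illustrative sketch, not part of the ARM reference; it assumes arm_neon.h on an AArch64 target. The shift is a plain per-lane LSL, so bits shifted out of the top of each 8-bit element are discarded rather than saturated.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int8x8_t shift_left_demo(int8x8_t a) {
+    /* n must be a compile-time constant, 0 &le; n &le; 7 */
+    return vshl_n_s8(a, 2);  /* per lane: 3 &rarr; 12; 64 &rarr; 0 (overflow bits dropped) */
+}</pre>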
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_n_s8" type="checkbox"><label for="vshlq_n_s8"><div>int8x16_t <b><b>vshlq_n_s8</b></b> (int8x16_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_n_s16" type="checkbox"><label for="vshl_n_s16"><div>int16x4_t <b><b>vshl_n_s16</b></b> (int16x4_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_n_s16" type="checkbox"><label for="vshlq_n_s16"><div>int16x8_t <b><b>vshlq_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_n_s32" type="checkbox"><label for="vshl_n_s32"><div>int32x2_t <b><b>vshl_n_s32</b></b> (int32x2_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_n_s32" type="checkbox"><label for="vshlq_n_s32"><div>int32x4_t <b><b>vshlq_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_n_s64" type="checkbox"><label for="vshl_n_s64"><div>int64x1_t <b><b>vshl_n_s64</b></b> (int64x1_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_n_s64" type="checkbox"><label for="vshlq_n_s64"><div>int64x2_t <b><b>vshlq_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_n_u8" type="checkbox"><label for="vshl_n_u8"><div>uint8x8_t <b><b>vshl_n_u8</b></b> (uint8x8_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_n_u8" type="checkbox"><label for="vshlq_n_u8"><div>uint8x16_t <b><b>vshlq_n_u8</b></b> (uint8x16_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_n_u16" type="checkbox"><label for="vshl_n_u16"><div>uint16x4_t <b><b>vshl_n_u16</b></b> (uint16x4_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_n_u16" type="checkbox"><label for="vshlq_n_u16"><div>uint16x8_t <b><b>vshlq_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_n_u32" type="checkbox"><label for="vshl_n_u32"><div>uint32x2_t <b><b>vshl_n_u32</b></b> (uint32x2_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_n_u32" type="checkbox"><label for="vshlq_n_u32"><div>uint32x4_t <b><b>vshlq_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshl_n_u64" type="checkbox"><label for="vshl_n_u64"><div>uint64x1_t <b><b>vshl_n_u64</b></b> (uint64x1_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshlq_n_u64" type="checkbox"><label for="vshlq_n_u64"><div>uint64x2_t <b><b>vshlq_n_u64</b></b> (uint64x2_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshld_n_s64" type="checkbox"><label for="vshld_n_s64"><div>int64_t <b><b>vshld_n_s64</b></b> (int64_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshld_n_u64" type="checkbox"><label for="vshld_n_u64"><div>uint64_t <b><b>vshld_n_u64</b></b> (uint64_t a, const int n)<span class="right">Shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left (immediate). This instruction reads each value from a vector, left shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shl-shift-left-immediate">SHL</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshr_n_s8" type="checkbox"><label for="vrshr_n_s8"><div>int8x8_t <b><b>vrshr_n_s8</b></b> (int8x8_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrq_n_s8" type="checkbox"><label for="vrshrq_n_s8"><div>int8x16_t <b><b>vrshrq_n_s8</b></b> (int8x16_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshr_n_s16" type="checkbox"><label for="vrshr_n_s16"><div>int16x4_t <b><b>vrshr_n_s16</b></b> (int16x4_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrq_n_s16" type="checkbox"><label for="vrshrq_n_s16"><div>int16x8_t <b><b>vrshrq_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshr_n_s32" type="checkbox"><label for="vrshr_n_s32"><div>int32x2_t <b><b>vrshr_n_s32</b></b> (int32x2_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrq_n_s32" type="checkbox"><label for="vrshrq_n_s32"><div>int32x4_t <b><b>vrshrq_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshr_n_s64" type="checkbox"><label for="vrshr_n_s64"><div>int64x1_t <b><b>vrshr_n_s64</b></b> (int64x1_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrq_n_s64" type="checkbox"><label for="vrshrq_n_s64"><div>int64x2_t <b><b>vrshrq_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshr_n_u8" type="checkbox"><label for="vrshr_n_u8"><div>uint8x8_t <b><b>vrshr_n_u8</b></b> (uint8x8_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrq_n_u8" type="checkbox"><label for="vrshrq_n_u8"><div>uint8x16_t <b><b>vrshrq_n_u8</b></b> (uint8x16_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshr_n_u16" type="checkbox"><label for="vrshr_n_u16"><div>uint16x4_t <b><b>vrshr_n_u16</b></b> (uint16x4_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrq_n_u16" type="checkbox"><label for="vrshrq_n_u16"><div>uint16x8_t <b><b>vrshrq_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshr_n_u32" type="checkbox"><label for="vrshr_n_u32"><div>uint32x2_t <b><b>vrshr_n_u32</b></b> (uint32x2_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrq_n_u32" type="checkbox"><label for="vrshrq_n_u32"><div>uint32x4_t <b><b>vrshrq_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshr_n_u64" type="checkbox"><label for="vrshr_n_u64"><div>uint64x1_t <b><b>vrshr_n_u64</b></b> (uint64x1_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrq_n_u64" type="checkbox"><label for="vrshrq_n_u64"><div>uint64x2_t <b><b>vrshrq_n_u64</b></b> (uint64x2_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrd_n_s64" type="checkbox"><label for="vrshrd_n_s64"><div>int64_t <b><b>vrshrd_n_s64</b></b> (int64_t a, const int n)<span class="right">Signed rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srshr-signed-rounding-shift-right-immediate">SRSHR</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshrd_n_u64" type="checkbox"><label for="vrshrd_n_u64"><div>uint64_t <b><b>vrshrd_n_u64</b></b> (uint64_t a, const int n)<span class="right">Unsigned rounding shift right</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USHR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urshr-unsigned-rounding-shift-right-immediate">URSHR</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsra_n_s8" type="checkbox"><label for="vsra_n_s8"><div>int8x8_t <b><b>vsra_n_s8</b></b> (int8x8_t a, int8x8_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsraq_n_s8" type="checkbox"><label for="vsraq_n_s8"><div>int8x16_t <b><b>vsraq_n_s8</b></b> (int8x16_t a, int8x16_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsra_n_s16" type="checkbox"><label for="vsra_n_s16"><div>int16x4_t <b><b>vsra_n_s16</b></b> (int16x4_t a, int16x4_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsraq_n_s16" type="checkbox"><label for="vsraq_n_s16"><div>int16x8_t <b><b>vsraq_n_s16</b></b> (int16x8_t a, int16x8_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsra_n_s32" type="checkbox"><label for="vsra_n_s32"><div>int32x2_t <b><b>vsra_n_s32</b></b> (int32x2_t a, int32x2_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsraq_n_s32" type="checkbox"><label for="vsraq_n_s32"><div>int32x4_t <b><b>vsraq_n_s32</b></b> (int32x4_t a, int32x4_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsra_n_s64" type="checkbox"><label for="vsra_n_s64"><div>int64x1_t <b><b>vsra_n_s64</b></b> (int64x1_t a, int64x1_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsraq_n_s64" type="checkbox"><label for="vsraq_n_s64"><div>int64x2_t <b><b>vsraq_n_s64</b></b> (int64x2_t a, int64x2_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsra_n_u8" type="checkbox"><label for="vsra_n_u8"><div>uint8x8_t <b><b>vsra_n_u8</b></b> (uint8x8_t a, uint8x8_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsraq_n_u8" type="checkbox"><label for="vsraq_n_u8"><div>uint8x16_t <b><b>vsraq_n_u8</b></b> (uint8x16_t a, uint8x16_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsra_n_u16" type="checkbox"><label for="vsra_n_u16"><div>uint16x4_t <b><b>vsra_n_u16</b></b> (uint16x4_t a, uint16x4_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsraq_n_u16" type="checkbox"><label for="vsraq_n_u16"><div>uint16x8_t <b><b>vsraq_n_u16</b></b> (uint16x8_t a, uint16x8_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsra_n_u32" type="checkbox"><label for="vsra_n_u32"><div>uint32x2_t <b><b>vsra_n_u32</b></b> (uint32x2_t a, uint32x2_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsraq_n_u32" type="checkbox"><label for="vsraq_n_u32"><div>uint32x4_t <b><b>vsraq_n_u32</b></b> (uint32x4_t a, uint32x4_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsra_n_u64" type="checkbox"><label for="vsra_n_u64"><div>uint64x1_t <b><b>vsra_n_u64</b></b> (uint64x1_t a, uint64x1_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsraq_n_u64" type="checkbox"><label for="vsraq_n_u64"><div>uint64x2_t <b><b>vsraq_n_u64</b></b> (uint64x2_t a, uint64x2_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsrad_n_s64" type="checkbox"><label for="vsrad_n_s64"><div>int64_t <b><b>vsrad_n_s64</b></b> (int64_t a, int64_t b, const int n)<span class="right">Signed shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SRSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ssra-signed-shift-right-and-accumulate-immediate">SSRA</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
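+ <h4>Example</h4> <p>A minimal usage sketch (values and names are illustrative, assuming a toolchain that provides &lt;arm_neon.h&gt;): the source value is arithmetically shifted right with the shifted-out bits discarded, then added to the accumulator.</p>
+<pre>#include &lt;arm_neon.h&gt;
+
+int64_t sra_acc(void) {
+    int64_t acc = 100;  /* accumulator, maps to Dd */
+    int64_t val = -15;  /* shifted value, maps to Dn */
+    /* -15 &gt;&gt; 2 (arithmetic, truncated) = -4, so the result is 100 + (-4) = 96 */
+    return vsrad_n_s64(acc, val, 2);
+}</pre>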
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsrad_n_u64" type="checkbox"><label for="vsrad_n_u64"><div>uint64_t <b><b>vsrad_n_u64</b></b> (uint64_t a, uint64_t b, const int n)<span class="right">Unsigned shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">URSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/usra-unsigned-shift-right-and-accumulate-immediate">USRA</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
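+ <h4>Example</h4> <p>A minimal usage sketch (values and names are illustrative): the shift is logical for the unsigned form, and the accumulation wraps modulo 2^64 rather than saturating, as the element&lt;esize-1:0&gt; addition in the operation above shows.</p>
+<pre>#include &lt;arm_neon.h&gt;
+
+uint64_t usra_acc(void) {
+    uint64_t acc = ~0ULL;  /* accumulator at UINT64_MAX */
+    uint64_t val = 4;
+    /* 4 &gt;&gt; 2 = 1; UINT64_MAX + 1 wraps to 0 */
+    return vsrad_n_u64(acc, val, 2);
+}</pre>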
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsra_n_s8" type="checkbox"><label for="vrsra_n_s8"><div>int8x8_t <b><b>vrsra_n_s8</b></b> (int8x8_t a, int8x8_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
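+ <h4>Example</h4> <p>A minimal usage sketch (values and names are illustrative) contrasting the rounded shift with the truncated one: with n = 2 the rounding constant is 1 &lt;&lt; 1 = 2, so 7 becomes (7 + 2) &gt;&gt; 2 = 2, where SSRA would give 7 &gt;&gt; 2 = 1.</p>
+<pre>#include &lt;arm_neon.h&gt;
+
+int8x8_t rsra_acc(void) {
+    int8x8_t acc = vdup_n_s8(10);  /* all eight lanes = 10 */
+    int8x8_t val = vdup_n_s8(7);
+    /* each lane: 10 + ((7 + 2) &gt;&gt; 2) = 10 + 2 = 12 */
+    return vrsra_n_s8(acc, val, 2);
+}</pre>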
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsraq_n_s8" type="checkbox"><label for="vrsraq_n_s8"><div>int8x16_t <b><b>vrsraq_n_s8</b></b> (int8x16_t a, int8x16_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
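+ <h4>Example</h4> <p>A minimal usage sketch (values and names are illustrative): the q-form performs the same per-lane operation across all sixteen lanes of a 128-bit vector; note that the rounded arithmetic shift of a negative value still floors, so (-5 + 2) &gt;&gt; 2 = -1.</p>
+<pre>#include &lt;arm_neon.h&gt;
+
+int8x16_t rsraq_acc(void) {
+    int8x16_t acc = vdupq_n_s8(1);
+    int8x16_t val = vdupq_n_s8(-5);
+    /* each lane: 1 + ((-5 + 2) &gt;&gt; 2) = 1 + (-1) = 0 */
+    return vrsraq_n_s8(acc, val, 2);
+}</pre>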
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsra_n_s16" type="checkbox"><label for="vrsra_n_s16"><div>int16x4_t <b><b>vrsra_n_s16</b></b> (int16x4_t a, int16x4_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsraq_n_s16" type="checkbox"><label for="vrsraq_n_s16"><div>int16x8_t <b><b>vrsraq_n_s16</b></b> (int16x8_t a, int16x8_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsra_n_s32" type="checkbox"><label for="vrsra_n_s32"><div>int32x2_t <b><b>vrsra_n_s32</b></b> (int32x2_t a, int32x2_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsraq_n_s32" type="checkbox"><label for="vrsraq_n_s32"><div>int32x4_t <b><b>vrsraq_n_s32</b></b> (int32x4_t a, int32x4_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsra_n_s64" type="checkbox"><label for="vrsra_n_s64"><div>int64x1_t <b><b>vrsra_n_s64</b></b> (int64x1_t a, int64x1_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsraq_n_s64" type="checkbox"><label for="vrsraq_n_s64"><div>int64x2_t <b><b>vrsraq_n_s64</b></b> (int64x2_t a, int64x2_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsra_n_u8" type="checkbox"><label for="vrsra_n_u8"><div>uint8x8_t <b><b>vrsra_n_u8</b></b> (uint8x8_t a, uint8x8_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
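+ <h4>Example</h4> <p>A minimal usage sketch (values and names are illustrative): the rounding increment is applied before the shift at full precision, so 255 with n = 1 yields (255 + 1) &gt;&gt; 1 = 128, where the truncated USRA form would yield 127.</p>
+<pre>#include &lt;arm_neon.h&gt;
+
+uint8x8_t ursra_acc(void) {
+    uint8x8_t acc = vdup_n_u8(0);
+    uint8x8_t val = vdup_n_u8(255);
+    /* each lane: 0 + ((255 + 1) &gt;&gt; 1) = 128 */
+    return vrsra_n_u8(acc, val, 1);
+}</pre>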
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsraq_n_u8" type="checkbox"><label for="vrsraq_n_u8"><div>uint8x16_t <b><b>vrsraq_n_u8</b></b> (uint8x16_t a, uint8x16_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsra_n_u16" type="checkbox"><label for="vrsra_n_u16"><div>uint16x4_t <b><b>vrsra_n_u16</b></b> (uint16x4_t a, uint16x4_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsraq_n_u16" type="checkbox"><label for="vrsraq_n_u16"><div>uint16x8_t <b><b>vrsraq_n_u16</b></b> (uint16x8_t a, uint16x8_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsra_n_u32" type="checkbox"><label for="vrsra_n_u32"><div>uint32x2_t <b><b>vrsra_n_u32</b></b> (uint32x2_t a, uint32x2_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
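+ <h4>Usage example</h4> <p>A minimal C sketch of the per-lane behaviour, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; the lane values are illustrative only:</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t acc_vals[2] = { 100u, 200u };   /* accumulator lanes (Vd) */
+    uint32_t src_vals[2] = { 7u, 10u };      /* source lanes (Vn) */
+    uint32x2_t acc = vld1_u32(acc_vals);
+    uint32x2_t src = vld1_u32(src_vals);
+
+    /* Per lane: acc + ((src + (1 &lt;&lt; (n-1))) &gt;&gt; n) with n = 2,
+       i.e. acc + round(src / 4): 7/4 rounds to 2, 10/4 (a tie) rounds up to 3. */
+    uint32x2_t r = vrsra_n_u32(acc, src, 2);
+
+    printf("%u %u\n", vget_lane_u32(r, 0), vget_lane_u32(r, 1)); /* 102 203 */
+    return 0;
+}</pre>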
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsraq_n_u32" type="checkbox"><label for="vrsraq_n_u32"><div>uint32x4_t <b><b>vrsraq_n_u32</b></b> (uint32x4_t a, uint32x4_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsra_n_u64" type="checkbox"><label for="vrsra_n_u64"><div>uint64x1_t <b><b>vrsra_n_u64</b></b> (uint64x1_t a, uint64x1_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsraq_n_u64" type="checkbox"><label for="vrsraq_n_u64"><div>uint64x2_t <b><b>vrsraq_n_u64</b></b> (uint64x2_t a, uint64x2_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsrad_n_s64" type="checkbox"><label for="vrsrad_n_s64"><div>int64_t <b><b>vrsrad_n_s64</b></b> (int64_t a, int64_t b, const int n)<span class="right">Signed rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SSRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/srsra-signed-rounding-shift-right-and-accumulate-immediate">SRSRA</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
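+ <h4>Usage example</h4> <p>A minimal C sketch of the scalar form, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; the operand values are illustrative only:</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64_t a = 1000;   /* accumulator (Dd) */
+    int64_t b = -7;     /* value to be rounding-shifted (Dn) */
+
+    /* a + ((b + (1 &lt;&lt; (n-1))) &gt;&gt; n) with n = 2 and an arithmetic shift:
+       (-7 + 2) &gt;&gt; 2 = -2, i.e. round(-7 / 4), so r = 1000 - 2 = 998. */
+    int64_t r = vrsrad_n_s64(a, b, 2);
+
+    printf("%lld\n", (long long)r); /* 998 */
+    return 0;
+}</pre>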
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsrad_n_u64" type="checkbox"><label for="vrsrad_n_u64"><div>uint64_t <b><b>vrsrad_n_u64</b></b> (uint64_t a, uint64_t b, const int n)<span class="right">Unsigned rounding shift right and accumulate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Rounding Shift Right and Accumulate (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">USRA</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursra-unsigned-rounding-shift-right-and-accumulate-immediate">URSRA</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2;
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+operand2 = if accumulate then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] + element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshl_n_s8" type="checkbox"><label for="vqshl_n_s8"><div>int8x8_t <b><b>vqshl_n_s8</b></b> (int8x8_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each element by an immediate value, saturates each result that overflows the element size, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
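+ <h4>Usage example</h4> <p>A minimal C sketch showing the saturation behaviour, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; the lane values are illustrative only:</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t vals[8] = { 1, 20, 100, -100, 63, -64, 127, -128 };
+    int8x8_t a = vld1_s8(vals);
+
+    /* Each lane left-shifted by 1 and saturated to [-128, 127]:
+       1-&gt;2, 20-&gt;40, 100-&gt;127 (sat), -100-&gt;-128 (sat),
+       63-&gt;126, -64-&gt;-128 (exact), 127-&gt;127 (sat), -128-&gt;-128 (sat).
+       A saturating lane also sets the cumulative FPSR.QC flag. */
+    int8x8_t r = vqshl_n_s8(a, 1);
+
+    int8_t out[8];
+    vst1_s8(out, r);
+    for (int i = 0; i &lt; 8; i++) printf("%d ", out[i]);
+    printf("\n");
+    return 0;
+}</pre>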
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_n_s8" type="checkbox"><label for="vqshlq_n_s8"><div>int8x16_t <b><b>vqshlq_n_s8</b></b> (int8x16_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each element by an immediate value, saturates each result that overflows the element size, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_n_s16" type="checkbox"><label for="vqshl_n_s16"><div>int16x4_t <b><b>vqshl_n_s16</b></b> (int16x4_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each element by an immediate value, saturates each result that overflows the element size, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_n_s16" type="checkbox"><label for="vqshlq_n_s16"><div>int16x8_t <b><b>vqshlq_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each element by an immediate value, saturates each result that overflows the element size, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_n_s32" type="checkbox"><label for="vqshl_n_s32"><div>int32x2_t <b><b>vqshl_n_s32</b></b> (int32x2_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each element by an immediate value, saturates each result that overflows the element size, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_n_s32" type="checkbox"><label for="vqshlq_n_s32"><div>int32x4_t <b><b>vqshlq_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each element by an immediate value, saturates each result that overflows the element size, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
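+ <h4>Usage example</h4> <p>A minimal C sketch of the 128-bit form, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; the lane values are illustrative only:</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32_t vals[4] = { 1, -1, 0x20000000, INT32_MIN };
+    int32x4_t a = vld1q_s32(vals);
+
+    /* Each lane left-shifted by 3 and saturated to [INT32_MIN, INT32_MAX]:
+       1 -&gt; 8, -1 -&gt; -8, 0x20000000 overflows -&gt; INT32_MAX,
+       INT32_MIN overflows -&gt; INT32_MIN. */
+    int32x4_t r = vqshlq_n_s32(a, 3);
+
+    int32_t out[4];
+    vst1q_s32(out, r);
+    for (int i = 0; i &lt; 4; i++) printf("%d ", out[i]);
+    printf("\n");
+    return 0;
+}</pre>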
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_n_s64" type="checkbox"><label for="vqshl_n_s64"><div>int64x1_t <b><b>vqshl_n_s64</b></b> (int64x1_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each element by an immediate value, saturates each result that overflows the element size, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_n_s64" type="checkbox"><label for="vqshlq_n_s64"><div>int64x2_t <b><b>vqshlq_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each element by an immediate value, saturates each result that overflows the element size, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
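+ <h4>Example</h4> <p class="aml">A minimal C usage sketch (an illustrative addition, assuming a NEON-capable toolchain that provides arm_neon.h): shifting 2^62 left by 2 would produce 2^64, which does not fit in a signed 64-bit lane, so that lane saturates to INT64_MAX.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* illustrative values; n = 2 must be a compile-time constant in 0..63 */
+    int64_t in[2] = { 1000, INT64_C(1) &lt;&lt; 62 };
+    int64x2_t r = vqshlq_n_s64(vld1q_s64(in), 2);
+    printf("%lld %lld\n",
+           (long long)vgetq_lane_s64(r, 0),   /* 4000 */
+           (long long)vgetq_lane_s64(r, 1));  /* 9223372036854775807 (saturated) */
+    return 0;
+}</pre>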
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_n_u8" type="checkbox"><label for="vqshl_n_u8"><div>uint8x8_t <b>vqshl_n_u8</b> (uint8x8_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each element in the source SIMD&amp;FP register, shifts each value left by the immediate n, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If a shifted value overflows the element width, the result saturates to the unsigned element range and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
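+ <h4>Example</h4> <p class="aml">A short illustrative C sketch (assuming arm_neon.h is available): each byte is shifted left by 3, and any value that would exceed the unsigned 8-bit maximum of 255 saturates instead of wrapping.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* illustrative input values */
+    uint8_t in[8]  = { 1, 3, 32, 255, 0, 7, 64, 100 };
+    uint8_t out[8];
+    vst1_u8(out, vqshl_n_u8(vld1_u8(in), 3));
+    for (int i = 0; i &lt; 8; i++)
+        printf("%u ", out[i]);   /* 8 24 255 255 0 56 255 255 */
+    return 0;
+}</pre>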
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_n_u8" type="checkbox"><label for="vqshlq_n_u8"><div>uint8x16_t <b>vqshlq_n_u8</b> (uint8x16_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each element in the source SIMD&amp;FP register, shifts each value left by the immediate n, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If a shifted value overflows the element width, the result saturates to the unsigned element range and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_n_u16" type="checkbox"><label for="vqshl_n_u16"><div>uint16x4_t <b>vqshl_n_u16</b> (uint16x4_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each element in the source SIMD&amp;FP register, shifts each value left by the immediate n, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If a shifted value overflows the element width, the result saturates to the unsigned element range and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
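+ <h4>Example</h4> <p class="aml">An illustrative C sketch (assuming arm_neon.h): with n = 4, 5000 &lt;&lt; 4 = 80000 exceeds the 16-bit maximum of 65535, so that lane saturates.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* illustrative input values */
+    uint16_t in[4] = { 1, 1000, 5000, 65535 };
+    uint16_t out[4];
+    vst1_u16(out, vqshl_n_u16(vld1_u16(in), 4));
+    for (int i = 0; i &lt; 4; i++)
+        printf("%u ", out[i]);   /* 16 16000 65535 65535 */
+    return 0;
+}</pre>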
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_n_u16" type="checkbox"><label for="vqshlq_n_u16"><div>uint16x8_t <b>vqshlq_n_u16</b> (uint16x8_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each element in the source SIMD&amp;FP register, shifts each value left by the immediate n, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If a shifted value overflows the element width, the result saturates to the unsigned element range and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_n_u32" type="checkbox"><label for="vqshl_n_u32"><div>uint32x2_t <b>vqshl_n_u32</b> (uint32x2_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each element in the source SIMD&amp;FP register, shifts each value left by the immediate n, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If a shifted value overflows the element width, the result saturates to the unsigned element range and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
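+ <h4>Example</h4> <p class="aml">An illustrative C sketch (assuming arm_neon.h): 2^30 shifted left by 4 would be 2^34, which overflows an unsigned 32-bit lane and saturates to 4294967295.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* illustrative input values */
+    uint32_t in[2] = { 1u &lt;&lt; 20, 1u &lt;&lt; 30 };
+    uint32_t out[2];
+    vst1_u32(out, vqshl_n_u32(vld1_u32(in), 4));
+    printf("%u %u\n", out[0], out[1]);   /* 16777216 4294967295 */
+    return 0;
+}</pre>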
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_n_u32" type="checkbox"><label for="vqshlq_n_u32"><div>uint32x4_t <b>vqshlq_n_u32</b> (uint32x4_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each element in the source SIMD&amp;FP register, shifts each value left by the immediate n, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If a shifted value overflows the element width, the result saturates to the unsigned element range and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshl_n_u64" type="checkbox"><label for="vqshl_n_u64"><div>uint64x1_t <b>vqshl_n_u64</b> (uint64x1_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each element in the source SIMD&amp;FP register, shifts each value left by the immediate n, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If a shifted value overflows the element width, the result saturates to the unsigned element range and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
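+ <h4>Example</h4> <p class="aml">An illustrative C sketch for the single-element 64-bit form (assuming arm_neon.h): 2^63 shifted left by 1 cannot be represented in 64 bits, so the result saturates to UINT64_MAX.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* illustrative input value */
+    uint64_t in = UINT64_C(1) &lt;&lt; 63;
+    uint64_t out;
+    vst1_u64(&amp;out, vqshl_n_u64(vld1_u64(&amp;in), 1));
+    printf("%llu\n", (unsigned long long)out);   /* 18446744073709551615 */
+    return 0;
+}</pre>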
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlq_n_u64" type="checkbox"><label for="vqshlq_n_u64"><div>uint64x2_t <b>vqshlq_n_u64</b> (uint64x2_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each element in the source SIMD&amp;FP register, shifts each value left by the immediate n, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. If a shifted value overflows the element width, the result saturates to the unsigned element range and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlb_n_s8" type="checkbox"><label for="vqshlb_n_s8"><div>int8_t <b>vqshlb_n_s8</b> (int8_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Shift Left (immediate). This instruction reads the scalar value in the source SIMD&amp;FP register, shifts it left by the immediate n, and writes the result to the destination SIMD&amp;FP register. If the shifted value overflows, the result saturates to the signed range of the element and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Bd,Bn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
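+ <h4>Example</h4> <p class="aml">An illustrative C sketch for the scalar byte form (A64 only, assuming arm_neon.h): results that leave the signed 8-bit range [-128, 127] saturate to the nearest bound.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* illustrative values; n must be a compile-time constant in 0..7 */
+    printf("%d\n", vqshlb_n_s8(5, 4));     /*   80: 5 &lt;&lt; 4 fits       */
+    printf("%d\n", vqshlb_n_s8(100, 2));   /*  127: 400 saturates     */
+    printf("%d\n", vqshlb_n_s8(-100, 2));  /* -128: -400 saturates    */
+    return 0;
+}</pre>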
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlh_n_s16" type="checkbox"><label for="vqshlh_n_s16"><div>int16_t <b>vqshlh_n_s16</b> (int16_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Shift Left (immediate). This instruction reads the scalar value in the source SIMD&amp;FP register, shifts it left by the immediate n, and writes the result to the destination SIMD&amp;FP register. If the shifted value overflows, the result saturates to the signed range of the element and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Hd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshls_n_s32" type="checkbox"><label for="vqshls_n_s32"><div>int32_t <b>vqshls_n_s32</b> (int32_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Shift Left (immediate). This instruction reads the scalar value in the source SIMD&amp;FP register, shifts it left by the immediate n, and writes the result to the destination SIMD&amp;FP register. If the shifted value overflows, the result saturates to the signed range of the element and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Sd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer round_const = 0;
+integer shift;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ shift = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]&lt;7:0&gt;);
+ if rounding then
+ round_const = 1 &lt;&lt; (-shift - 1); // 0 for left shift, 2^(n-1) for right shift
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned) + round_const) &lt;&lt; shift;
+ if saturating then
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
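+ <h4>Example</h4> <p class="aml">An illustrative C sketch for the scalar 32-bit form (A64 only, assuming arm_neon.h): 2^28 &lt;&lt; 8 would be 2^36, which overflows int32_t and saturates to INT32_MAX.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* illustrative values; n must be a compile-time constant in 0..31 */
+    printf("%d\n", vqshls_n_s32(1 &lt;&lt; 20, 8));  /* 268435456 (fits)       */
+    printf("%d\n", vqshls_n_s32(1 &lt;&lt; 28, 8));  /* 2147483647 (saturated) */
+    return 0;
+}</pre>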
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshld_n_s64" type="checkbox"><label for="vqshld_n_s64"><div>int64_t <b>vqshld_n_s64</b> (int64_t a, const int n)<span class="right">Signed saturating shift left</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Shift Left (immediate). This instruction reads the scalar value in the source SIMD&amp;FP register, shifts it left by the immediate n, and writes the result to the destination SIMD&amp;FP register. If the shifted value overflows, the result saturates to the signed range of the element and the cumulative saturation bit FPSR.QC is set.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshl-register-signed-saturating-shift-left-register">SQSHL</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
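+<!-- Editor's sketch (non-normative): a minimal C call of this intrinsic,
+assuming an AArch64 toolchain providing arm_neon.h; the helper name and the
+shift amount 3 are illustrative only. n must be a constant in [0, 63].
+
+#include <arm_neon.h>
+
+int64_t scale_by_8_saturating(int64_t a) {
+    /* Shift left by 3 (multiply by 8); on overflow the result
+       saturates to INT64_MAX or INT64_MIN and sets FPSR.QC. */
+    return vqshld_n_s64(a, 3);
+}
+-->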
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlb_n_u8" type="checkbox"><label for="vqshlb_n_u8"><div>uint8_t <b><b>vqshlb_n_u8</b></b> (uint8_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each unsigned integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Bd,Bn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
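+<!-- Editor's sketch (non-normative): calling the unsigned byte form from C,
+assuming arm_neon.h on AArch64; the helper name and the shift amount are
+illustrative only. n must be a constant in [0, 7].
+
+#include <arm_neon.h>
+
+uint8_t scale_by_4_saturating_u8(uint8_t a) {
+    /* Shift left by 2; any result above 255 saturates to 255
+       instead of wrapping, and FPSR.QC is set on saturation. */
+    return vqshlb_n_u8(a, 2);
+}
+-->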
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlh_n_u16" type="checkbox"><label for="vqshlh_n_u16"><div>uint16_t <b><b>vqshlh_n_u16</b></b> (uint16_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each unsigned integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Hd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshls_n_u32" type="checkbox"><label for="vqshls_n_u32"><div>uint32_t <b><b>vqshls_n_u32</b></b> (uint32_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each unsigned integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Sd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshld_n_u64" type="checkbox"><label for="vqshld_n_u64"><div>uint64_t <b><b>vqshld_n_u64</b></b> (uint64_t a, const int n)<span class="right">Unsigned saturating shift left</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Left (immediate). This instruction reads each unsigned integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshl-register-unsigned-saturating-shift-left-register">UQSHL</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlu_n_s8" type="checkbox"><label for="vqshlu_n_s8"><div>uint8x8_t <b><b>vqshlu_n_s8</b></b> (int8x8_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
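+<!-- Editor's sketch (non-normative): SQSHLU maps signed lanes into the
+unsigned range; assuming arm_neon.h on AArch64, with an illustrative helper
+name and shift amount. n must be a constant in [0, 7].
+
+#include <arm_neon.h>
+
+uint8x8_t signed_to_unsigned_x2(int8x8_t a) {
+    /* Per lane: shift left by 1 and saturate to [0, 255];
+       negative lanes clamp to 0, overflowing lanes to 255. */
+    return vqshlu_n_s8(a, 1);
+}
+-->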
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshluq_n_s8" type="checkbox"><label for="vqshluq_n_s8"><div>uint8x16_t <b><b>vqshluq_n_s8</b></b> (int8x16_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
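+<!-- Editor's sketch (non-normative): the q suffix selects the 128-bit form
+of the same operation; assuming arm_neon.h, helper name illustrative.
+
+#include <arm_neon.h>
+
+uint8x16_t signed_to_unsigned_x2_q(int8x16_t a) {
+    /* Same per-lane behaviour as vqshlu_n_s8, but across a full
+       128-bit vector (16 byte lanes) in one instruction. */
+    return vqshluq_n_s8(a, 1);
+}
+-->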
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlu_n_s16" type="checkbox"><label for="vqshlu_n_s16"><div>uint16x4_t <b><b>vqshlu_n_s16</b></b> (int16x4_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshluq_n_s16" type="checkbox"><label for="vqshluq_n_s16"><div>uint16x8_t <b><b>vqshluq_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlu_n_s32" type="checkbox"><label for="vqshlu_n_s32"><div>uint32x2_t <b><b>vqshlu_n_s32</b></b> (int32x2_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshluq_n_s32" type="checkbox"><label for="vqshluq_n_s32"><div>uint32x4_t <b><b>vqshluq_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlu_n_s64" type="checkbox"><label for="vqshlu_n_s64"><div>uint64x1_t <b><b>vqshlu_n_s64</b></b> (int64x1_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshluq_n_s64" type="checkbox"><label for="vqshluq_n_s64"><div>uint64x2_t <b><b>vqshluq_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshlub_n_s8" type="checkbox"><label for="vqshlub_n_s8"><div>uint8_t <b><b>vqshlub_n_s8</b></b> (int8_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Bd,Bn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
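+<!-- Editor's sketch (non-normative): the scalar b-suffix form operates on a
+single byte value and is A64-only; assuming arm_neon.h, helper name and shift
+amount illustrative. n must be a constant in [0, 7].
+
+#include <arm_neon.h>
+
+uint8_t signed_byte_to_unsigned(int8_t a) {
+    /* Shift left by 2 and saturate to [0, 255]; a negative
+       input produces 0, an overflowing one produces 255. */
+    return vqshlub_n_s8(a, 2);
+}
+-->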
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshluh_n_s16" type="checkbox"><label for="vqshluh_n_s16"><div>uint16_t <b><b>vqshluh_n_s16</b></b> (int16_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Hd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlus_n_s32" type="checkbox"><label for="vqshlus_n_s32"><div>uint32_t <b><b>vqshlus_n_s32</b></b> (int32_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Sd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
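+ <h4>Usage example</h4> <p>A minimal C sketch added by the editor (not part of the ARM reference): it assumes an AArch64 toolchain providing <code>arm_neon.h</code>, and the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Shift a signed scalar left by 4, saturating to the unsigned 32-bit range. */
+uint32_t shift_saturate(int32_t a) {
+    /* Negative inputs saturate to 0 and set FPSR.QC; the shift amount
+       must be a compile-time constant in [0, 31]. */
+    return vqshlus_n_s32(a, 4);  /* e.g. a = -5 yields 0 */
+}</pre>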
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshlud_n_s64" type="checkbox"><label for="vqshlud_n_s64"><div>uint64_t <b><b>vqshlud_n_s64</b></b> (int64_t a, const int n)<span class="right">Signed saturating shift left unsigned</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Left Unsigned (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, shifts each value by an immediate value, saturates the shifted result to an unsigned integer value, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHL</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshlu-signed-saturating-shift-left-unsigned-immediate">SQSHLU</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], src_unsigned) &lt;&lt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, dst_unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshrn_n_s16" type="checkbox"><label for="vshrn_n_s16"><div>int8x8_t <b><b>vshrn_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
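+ <h4>Usage example</h4> <p>A minimal C sketch added by the editor (not part of the ARM reference); the helper name is illustrative. Note the result is truncated, not saturated:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Narrow eight 16-bit lanes to 8 bits after an arithmetic shift right by 4. */
+int8x8_t narrow4(int16x8_t a) {
+    /* Each result lane is the low 8 bits of (lane &gt;&gt; 4); values that do
+       not fit wrap around rather than saturate (compare vqshrn_n_s16). */
+    return vshrn_n_s16(a, 4);
+}</pre>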
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrn_n_s32" type="checkbox"><label for="vshrn_n_s32"><div>int16x4_t <b><b>vshrn_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrn_n_s64" type="checkbox"><label for="vshrn_n_s64"><div>int32x2_t <b><b>vshrn_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrn_n_u16" type="checkbox"><label for="vshrn_n_u16"><div>uint8x8_t <b><b>vshrn_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
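+ <h4>Usage example</h4> <p>A minimal C sketch added by the editor (not part of the ARM reference); the helper name is illustrative. Shifting by the full destination element width extracts the high byte of each lane:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Keep the high byte of each unsigned 16-bit lane, e.g. for a 16-bit to
+   8-bit depth reduction. */
+uint8x8_t high_bytes(uint16x8_t a) {
+    return vshrn_n_u16(a, 8);
+}</pre>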
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrn_n_u32" type="checkbox"><label for="vshrn_n_u32"><div>uint16x4_t <b><b>vshrn_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrn_n_u64" type="checkbox"><label for="vshrn_n_u64"><div>uint32x2_t <b><b>vshrn_n_u64</b></b> (uint64x2_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshrn_high_n_s16" type="checkbox"><label for="vshrn_high_n_s16"><div>int8x16_t <b><b>vshrn_high_n_s16</b></b> (int8x8_t r, int16x8_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
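+ <h4>Usage example</h4> <p>A minimal C sketch added by the editor (not part of the ARM reference); the helper name is illustrative. The <code>_high</code> form pairs with the non-<code>high</code> form to fill one 128-bit result:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Narrow two 16x8 vectors into a single 8x16 vector. */
+int8x16_t narrow_pair(int16x8_t lo, int16x8_t hi) {
+    int8x8_t low = vshrn_n_s16(lo, 4);    /* fills the low 8 lanes  */
+    return vshrn_high_n_s16(low, hi, 4);  /* fills the high 8 lanes */
+}</pre>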
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshrn_high_n_s32" type="checkbox"><label for="vshrn_high_n_s32"><div>int16x8_t <b><b>vshrn_high_n_s32</b></b> (int16x4_t r, int32x4_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshrn_high_n_s64" type="checkbox"><label for="vshrn_high_n_s64"><div>int32x4_t <b><b>vshrn_high_n_s64</b></b> (int32x2_t r, int64x2_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshrn_high_n_u16" type="checkbox"><label for="vshrn_high_n_u16"><div>uint8x16_t <b><b>vshrn_high_n_u16</b></b> (uint8x8_t r, uint16x8_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshrn_high_n_u32" type="checkbox"><label for="vshrn_high_n_u32"><div>uint16x8_t <b><b>vshrn_high_n_u32</b></b> (uint16x4_t r, uint32x4_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshrn_high_n_u64" type="checkbox"><label for="vshrn_high_n_u64"><div>uint32x4_t <b><b>vshrn_high_n_u64</b></b> (uint32x2_t r, uint64x2_t a, const int n)<span class="right">Shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">RSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shrn-shrn2-shift-right-narrow-immediate">SHRN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrun_n_s16" type="checkbox"><label for="vqshrun_n_s16"><div>uint8x8_t <b><b>vqshrun_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
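+ <h4>Usage example</h4> <p>A minimal C sketch added by the editor (not part of the ARM reference); the helper name and the Q8.8 fixed-point format are illustrative:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Convert signed Q8.8 fixed-point samples to saturated 8-bit pixel values. */
+uint8x8_t to_pixels(int16x8_t q8_8) {
+    /* Shifting right by 8 drops the fraction; negative inputs clamp to 0,
+       inputs above 255 clamp to 255, and any clamping sets FPSR.QC. */
+    return vqshrun_n_s16(q8_8, 8);
+}</pre>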
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrun_n_s32" type="checkbox"><label for="vqshrun_n_s32"><div>uint16x4_t <b><b>vqshrun_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrun_n_s64" type="checkbox"><label for="vqshrun_n_s64"><div>uint32x2_t <b><b>vqshrun_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrunh_n_s16" type="checkbox"><label for="vqshrunh_n_s16"><div>uint8_t <b><b>vqshrunh_n_s16</b></b> (int16_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN</a> Bd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
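+ <h4>Usage example</h4> <p>A minimal C sketch added by the editor (not part of the ARM reference); the helper name is illustrative. This is the scalar, A64-only form:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint8_t narrow_scalar(void) {
+    int16_t a = 1000;
+    /* 1000 &gt;&gt; 2 = 250, which fits in a uint8_t; 2000 &gt;&gt; 2 = 500 would
+       saturate to 255 and set FPSR.QC. */
+    return vqshrunh_n_s16(a, 2);
+}</pre>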
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshruns_n_s32" type="checkbox"><label for="vqshruns_n_s32"><div>uint16_t <b><b>vqshruns_n_s32</b></b> (int32_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN</a> Hd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrund_n_s64" type="checkbox"><label for="vqshrund_n_s64"><div>uint32_t <b><b>vqshrund_n_s64</b></b> (int64_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN</a> Sd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrun_high_n_s16" type="checkbox"><label for="vqshrun_high_n_s16"><div>uint8x16_t <b><b>vqshrun_high_n_s16</b></b> (uint8x8_t r, int16x8_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
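+<!-- A minimal Rust sketch (assuming an AArch64 target) of the usual pattern for
+the _high_ variants described above: the base intrinsic (SQSHRUN) fills the
+lower half of the destination and the _high_ form (SQSHRUN2) fills the upper
+half, narrowing two i16x8 vectors into one u8x16. The helper name narrow_pair
+is ours, not part of the crate.
+
+    #[cfg(target_arch = "aarch64")]
+    mod narrow {
+        use core::arch::aarch64::*;
+        /// Illustrative helper: narrows two i16x8 vectors into one u8x16
+        /// (SQSHRUN, then SQSHRUN2).
+        pub unsafe fn narrow_pair(lo: int16x8_t, hi: int16x8_t) -> uint8x16_t {
+            let low_half = vqshrun_n_s16::<4>(lo); // lower 8 lanes of Vd
+            vqshrun_high_n_s16::<4>(low_half, hi)  // upper 8 lanes of Vd
+        }
+    }
+-->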
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrun_high_n_s32" type="checkbox"><label for="vqshrun_high_n_s32"><div>uint16x8_t <b><b>vqshrun_high_n_s32</b></b> (uint16x4_t r, int32x4_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrun_high_n_s64" type="checkbox"><label for="vqshrun_high_n_s64"><div>uint32x4_t <b><b>vqshrun_high_n_s64</b></b> (uint32x2_t r, int64x2_t a, const int n)<span class="right">Signed saturating shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrun-sqshrun2-signed-saturating-shift-right-unsigned-narrow-immediate">SQSHRUN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrun_n_s16" type="checkbox"><label for="vqrshrun_n_s16"><div>uint8x8_t <b><b>vqrshrun_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
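+<!-- A minimal Rust sketch (assuming an AArch64 target, where NEON is baseline)
+of the truncating vs. rounding distinction described above, using the matching
+core::arch::aarch64 intrinsics. The function name is illustrative only.
+
+    #[cfg(target_arch = "aarch64")]
+    fn truncate_vs_round() {
+        // illustrative helper, not part of the crate
+        use core::arch::aarch64::*;
+        unsafe {
+            let a = vdupq_n_s16(300);
+            // SQSHRUN truncates: 300 >> 4 = 18 (18.75 rounded toward zero).
+            let t: uint8x8_t = vqshrun_n_s16::<4>(a);
+            // SQRSHRUN adds round_const first: (300 + 8) >> 4 = 19.
+            let r: uint8x8_t = vqrshrun_n_s16::<4>(a);
+            assert_eq!(vget_lane_u8::<0>(t), 18);
+            assert_eq!(vget_lane_u8::<0>(r), 19);
+            // UnsignedSatQ clamps out-of-range results: negative inputs go to 0.
+            assert_eq!(vget_lane_u8::<0>(vqshrun_n_s16::<4>(vdupq_n_s16(-5))), 0);
+        }
+    }
+-->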
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrun_n_s32" type="checkbox"><label for="vqrshrun_n_s32"><div>uint16x4_t <b><b>vqrshrun_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrun_n_s64" type="checkbox"><label for="vqrshrun_n_s64"><div>uint32x2_t <b><b>vqrshrun_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrunh_n_s16" type="checkbox"><label for="vqrshrunh_n_s16"><div>uint8_t <b><b>vqrshrunh_n_s16</b></b> (int16_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN</a> Bd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshruns_n_s32" type="checkbox"><label for="vqrshruns_n_s32"><div>uint16_t <b><b>vqrshruns_n_s32</b></b> (int32_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN</a> Hd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrund_n_s64" type="checkbox"><label for="vqrshrund_n_s64"><div>uint32_t <b><b>vqrshrund_n_s64</b></b> (int64_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN</a> Sd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrun_high_n_s16" type="checkbox"><label for="vqrshrun_high_n_s16"><div>uint8x16_t <b><b>vqrshrun_high_n_s16</b></b> (uint8x8_t r, int16x8_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrun_high_n_s32" type="checkbox"><label for="vqrshrun_high_n_s32"><div>uint16x8_t <b><b>vqrshrun_high_n_s32</b></b> (uint16x4_t r, int32x4_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrun_high_n_s64" type="checkbox"><label for="vqrshrun_high_n_s64"><div>uint32x4_t <b><b>vqrshrun_high_n_s64</b></b> (uint32x2_t r, int64x2_t a, const int n)<span class="right">Signed saturating rounded shift right unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Unsigned Narrow (immediate). This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, right shifts each value by an immediate value, saturates the result to an unsigned integer value that is half the original width, places the final result into a vector, and writes the vector to the destination SIMD&amp;FP register. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRUN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrun-sqrshrun2-signed-saturating-rounded-shift-right-unsigned-narrow-immediate">SQRSHRUN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_n_s16" type="checkbox"><label for="vqshrn_n_s16"><div>int8x8_t <b><b>vqshrn_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
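+<!-- A minimal Rust sketch (assuming an AArch64 target) contrasting SQSHRN with
+SQSHRUN above: here SatQ saturates to the signed destination range, so negative
+inputs survive the narrowing instead of clamping to zero. The function name is
+illustrative only.
+
+    #[cfg(target_arch = "aarch64")]
+    fn signed_narrow() {
+        // illustrative helper, not part of the crate
+        use core::arch::aarch64::*;
+        unsafe {
+            let v: int8x8_t = vqshrn_n_s16::<4>(vdupq_n_s16(-300));
+            // -300 >> 4 = -19 (arithmetic shift), within the i8 range.
+            assert_eq!(vget_lane_s8::<0>(v), -19);
+        }
+    }
+-->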
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_n_s32" type="checkbox"><label for="vqshrn_n_s32"><div>int16x4_t <b><b>vqshrn_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_n_s64" type="checkbox"><label for="vqshrn_n_s64"><div>int32x2_t <b><b>vqshrn_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_n_u16" type="checkbox"><label for="vqshrn_n_u16"><div>uint8x8_t <b><b>vqshrn_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_n_u32" type="checkbox"><label for="vqshrn_n_u32"><div>uint16x4_t <b><b>vqshrn_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_n_u64" type="checkbox"><label for="vqshrn_n_u64"><div>uint32x2_t <b><b>vqshrn_n_u64</b></b> (uint64x2_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqshrnh_n_s16" type="checkbox"><label for="vqshrnh_n_s16"><div>int8_t <b><b>vqshrnh_n_s16</b></b> (int16_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN</a> Bd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrns_n_s32" type="checkbox"><label for="vqshrns_n_s32"><div>int16_t <b><b>vqshrns_n_s32</b></b> (int32_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN</a> Hd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrnd_n_s64" type="checkbox"><label for="vqshrnd_n_s64"><div>int32_t <b><b>vqshrnd_n_s64</b></b> (int64_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN</a> Sd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrnh_n_u16" type="checkbox"><label for="vqshrnh_n_u16"><div>uint8_t <b><b>vqshrnh_n_u16</b></b> (uint16_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN</a> Bd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrns_n_u32" type="checkbox"><label for="vqshrns_n_u32"><div>uint16_t <b><b>vqshrns_n_u32</b></b> (uint32_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN</a> Hd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrnd_n_u64" type="checkbox"><label for="vqshrnd_n_u64"><div>uint32_t <b><b>vqshrnd_n_u64</b></b> (uint64_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN</a> Sd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_high_n_s16" type="checkbox"><label for="vqshrn_high_n_s16"><div>int8x16_t <b><b>vqshrn_high_n_s16</b></b> (int8x8_t r, int16x8_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_high_n_s32" type="checkbox"><label for="vqshrn_high_n_s32"><div>int16x8_t <b><b>vqshrn_high_n_s32</b></b> (int16x4_t r, int32x4_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_high_n_s64" type="checkbox"><label for="vqshrn_high_n_s64"><div>int32x4_t <b><b>vqshrn_high_n_s64</b></b> (int32x2_t r, int64x2_t a, const int n)<span class="right">Signed saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts and truncates each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqshrn-sqshrn2-signed-saturating-shift-right-narrow-immediate">SQSHRN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_high_n_u16" type="checkbox"><label for="vqshrn_high_n_u16"><div>uint8x16_t <b><b>vqshrn_high_n_u16</b></b> (uint8x8_t r, uint16x8_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_high_n_u32" type="checkbox"><label for="vqshrn_high_n_u32"><div>uint16x8_t <b><b>vqshrn_high_n_u32</b></b> (uint16x4_t r, uint32x4_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqshrn_high_n_u64" type="checkbox"><label for="vqshrn_high_n_u64"><div>uint32x4_t <b><b>vqshrn_high_n_u64</b></b> (uint32x2_t r, uint64x2_t a, const int n)<span class="right">Unsigned saturating shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are truncated. For rounded results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQRSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqshrn-uqshrn2-unsigned-saturating-shift-right-narrow-immediate">UQSHRN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_n_s16" type="checkbox"><label for="vrshrn_n_s16"><div>int8x8_t <b><b>vrshrn_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
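+<p>A minimal C sketch, assuming an AArch64 toolchain with <code>&lt;arm_neon.h&gt;</code>; it contrasts the rounded narrow with the truncating <code>vshrn_n_s16</code>:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16x8_t a = vdupq_n_s16(6);
+    /* round_const = 1 &lt;&lt; (2 - 1) = 2, so each lane is (6 + 2) &gt;&gt; 2 = 2;
+       the truncating vshrn_n_s16 would give 6 &gt;&gt; 2 = 1 */
+    int8x8_t v = vrshrn_n_s16(a, 2);
+    printf("%d\n", vget_lane_s8(v, 0)); /* prints: 2 */
+    return 0;
+}</pre>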
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_n_s32" type="checkbox"><label for="vrshrn_n_s32"><div>int16x4_t <b><b>vrshrn_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_n_s64" type="checkbox"><label for="vrshrn_n_s64"><div>int32x2_t <b><b>vrshrn_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_n_u16" type="checkbox"><label for="vrshrn_n_u16"><div>uint8x8_t <b><b>vrshrn_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_n_u32" type="checkbox"><label for="vrshrn_n_u32"><div>uint16x4_t <b><b>vrshrn_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_n_u64" type="checkbox"><label for="vrshrn_n_u64"><div>uint32x2_t <b><b>vrshrn_n_u64</b></b> (uint64x2_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_high_n_s16" type="checkbox"><label for="vrshrn_high_n_s16"><div>int8x16_t <b><b>vrshrn_high_n_s16</b></b> (int8x8_t r, int16x8_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
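+<p>A minimal C sketch, assuming an AArch64 toolchain with <code>&lt;arm_neon.h&gt;</code>; it shows <code>r</code> passing through as the lower half while the rounded, narrowed lanes land in the upper half:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8x8_t  r = vdup_n_s8(7);       /* becomes lanes 0..7 of the result */
+    int16x8_t a = vdupq_n_s16(100);
+    /* lanes 8..15: (100 + (1 &lt;&lt; 2)) &gt;&gt; 3 = 13 (truncation would give 12) */
+    int8x16_t v = vrshrn_high_n_s16(r, a, 3);
+    printf("%d %d\n", vgetq_lane_s8(v, 0), vgetq_lane_s8(v, 8)); /* prints: 7 13 */
+    return 0;
+}</pre>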
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_high_n_s32" type="checkbox"><label for="vrshrn_high_n_s32"><div>int16x8_t <b><b>vrshrn_high_n_s32</b></b> (int16x4_t r, int32x4_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_high_n_s64" type="checkbox"><label for="vrshrn_high_n_s64"><div>int32x4_t <b><b>vrshrn_high_n_s64</b></b> (int32x2_t r, int64x2_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_high_n_u16" type="checkbox"><label for="vrshrn_high_n_u16"><div>uint8x16_t <b><b>vrshrn_high_n_u16</b></b> (uint8x8_t r, uint16x8_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_high_n_u32" type="checkbox"><label for="vrshrn_high_n_u32"><div>uint16x8_t <b><b>vrshrn_high_n_u32</b></b> (uint16x4_t r, uint32x4_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrshrn_high_n_u64" type="checkbox"><label for="vrshrn_high_n_u64"><div>uint32x4_t <b><b>vrshrn_high_n_u64</b></b> (uint32x2_t r, uint64x2_t a, const int n)<span class="right">Rounding shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Rounding Shift Right Narrow (immediate). This instruction reads each unsigned integer value from the vector in the source SIMD&amp;FP register, right shifts each result by an immediate value, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rshrn-rshrn2-rounding-shift-right-narrow-immediate">RSHRN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize]) + round_const) &gt;&gt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_n_s16" type="checkbox"><label for="vqrshrn_n_s16"><div>int8x8_t <b><b>vqrshrn_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
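+<p>A minimal C sketch, assuming an AArch64 toolchain with <code>&lt;arm_neon.h&gt;</code>; the input is chosen so the rounded, narrowed value saturates:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16x8_t a = vdupq_n_s16(1000);
+    /* (1000 + (1 &lt;&lt; 1)) &gt;&gt; 2 = 250 exceeds the int8_t range, so each
+       lane saturates to 127 and FPSR.QC is set */
+    int8x8_t v = vqrshrn_n_s16(a, 2);
+    printf("%d\n", vget_lane_s8(v, 0)); /* prints: 127 */
+    return 0;
+}</pre>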
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_n_s32" type="checkbox"><label for="vqrshrn_n_s32"><div>int16x4_t <b><b>vqrshrn_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_n_s64" type="checkbox"><label for="vqrshrn_n_s64"><div>int32x2_t <b><b>vqrshrn_n_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_n_u16" type="checkbox"><label for="vqrshrn_n_u16"><div>uint8x8_t <b><b>vqrshrn_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN</a> Vd.8B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
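+<p>A minimal C sketch, assuming an AArch64 toolchain with <code>&lt;arm_neon.h&gt;</code>; the input rounds up to exactly the unsigned 8-bit maximum, so no saturation occurs:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint16x8_t a = vdupq_n_u16(509);
+    /* (509 + 1) &gt;&gt; 1 = 255 just fits in uint8_t; any input of 511 or
+       more would saturate to 255 and set FPSR.QC */
+    uint8x8_t v = vqrshrn_n_u16(a, 1);
+    printf("%u\n", (unsigned)vget_lane_u8(v, 0)); /* prints: 255 */
+    return 0;
+}</pre>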
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_n_u32" type="checkbox"><label for="vqrshrn_n_u32"><div>uint16x4_t <b><b>vqrshrn_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN</a> Vd.4H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
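+ <h4>Example</h4> <p class="aml">Illustrative sketch (hand-worked values): each u32 lane becomes sat_u16((x + 128) &gt;&gt; 8).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Lane 100000 -&gt; 391; lane 70000000 -&gt; 65535 (saturated). */
+uint16x4_t narrow_u32_by8(uint32x4_t a) { return vqrshrn_n_u32(a, 8); }</pre>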
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_n_u64" type="checkbox"><label for="vqrshrn_n_u64"><div>uint32x2_t <b><b>vqrshrn_n_u64</b></b> (uint64x2_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN</a> Vd.2S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
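+ <h4>Example</h4> <p class="aml">Illustrative sketch (hand-worked values): each u64 lane becomes sat_u32((x + 32768) &gt;&gt; 16).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Lane 2^40 -&gt; 16777216 (fits in u32, no saturation). */
+uint32x2_t narrow_u64_by16(uint64x2_t a) { return vqrshrn_n_u64(a, 16); }</pre>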
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrshrnh_n_s16" type="checkbox"><label for="vqrshrnh_n_s16"><div>int8_t <b><b>vqrshrnh_n_s16</b></b> (int16_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN</a> Bd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
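+ <h4>Example</h4> <p class="aml">Illustrative sketch of the scalar form (hand-worked values): the result is sat_s8((a + 4) &gt;&gt; 3) for n = 3.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* -1000 -&gt; -125; 2000 -&gt; 127 (saturated, FPSR.QC set). */
+int8_t narrow_h_by3(int16_t a) { return vqrshrnh_n_s16(a, 3); }</pre>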
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrns_n_s32" type="checkbox"><label for="vqrshrns_n_s32"><div>int16_t <b><b>vqrshrns_n_s32</b></b> (int32_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN</a> Hd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
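+ <h4>Example</h4> <p class="aml">Illustrative sketch (hand-worked values): sat_s16((a + 512) &gt;&gt; 10) for n = 10.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 1000000 -&gt; 977; -100000000 -&gt; -32768 (saturated). */
+int16_t narrow_s_by10(int32_t a) { return vqrshrns_n_s32(a, 10); }</pre>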
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrnd_n_s64" type="checkbox"><label for="vqrshrnd_n_s64"><div>int32_t <b><b>vqrshrnd_n_s64</b></b> (int64_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN</a> Sd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
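+ <h4>Example</h4> <p class="aml">Illustrative sketch (hand-worked values): sat_s32((a + 2147483648) &gt;&gt; 32) for n = 32.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 10000000000 -&gt; 2 (10^10 / 2^32 is about 2.33, which rounds to 2). */
+int32_t narrow_d_by32(int64_t a) { return vqrshrnd_n_s64(a, 32); }</pre>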
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrnh_n_u16" type="checkbox"><label for="vqrshrnh_n_u16"><div>uint8_t <b><b>vqrshrnh_n_u16</b></b> (uint16_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN</a> Bd,Hn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
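+ <h4>Example</h4> <p class="aml">Illustrative sketch (hand-worked values): sat_u8((a + 128) &gt;&gt; 8) for n = 8.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 33000 -&gt; 129; 65535 -&gt; 255 (saturated). */
+uint8_t narrow_h_by8(uint16_t a) { return vqrshrnh_n_u16(a, 8); }</pre>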
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrns_n_u32" type="checkbox"><label for="vqrshrns_n_u32"><div>uint16_t <b><b>vqrshrns_n_u32</b></b> (uint32_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN</a> Hd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
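+ <h4>Example</h4> <p class="aml">Illustrative sketch (hand-worked values): sat_u16((a + 32768) &gt;&gt; 16) for n = 16.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 16777216 -&gt; 256 (the quotient is exact, so the rounding constant does not carry). */
+uint16_t narrow_s_by16(uint32_t a) { return vqrshrns_n_u32(a, 16); }</pre>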
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrnd_n_u64" type="checkbox"><label for="vqrshrnd_n_u64"><div>uint32_t <b><b>vqrshrnd_n_u64</b></b> (uint64_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN</a> Sd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
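+ <h4>Example</h4> <p class="aml">Illustrative sketch (hand-worked values): sat_u32((a + 2147483648) &gt;&gt; 32) for n = 32.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 2^40 -&gt; 256; UINT64_MAX -&gt; 4294967295 (saturated, FPSR.QC set). */
+uint32_t narrow_d_by32u(uint64_t a) { return vqrshrnd_n_u64(a, 32); }</pre>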
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_high_n_s16" type="checkbox"><label for="vqrshrn_high_n_s16"><div>int8x16_t <b><b>vqrshrn_high_n_s16</b></b> (int8x8_t r, int16x8_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
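+ <h4>Example</h4> <p class="aml">Illustrative sketch of the usual pairing: the non-high form fills the low half of the destination, then this intrinsic (SQRSHRN2) writes the high half without disturbing it.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Narrow sixteen s16 values (two vectors) into one s8 vector. */
+int8x16_t narrow_pair_s16(int16x8_t lo, int16x8_t hi) {
+    int8x8_t low = vqrshrn_n_s16(lo, 4);
+    return vqrshrn_high_n_s16(low, hi, 4);
+}</pre>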
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_high_n_s32" type="checkbox"><label for="vqrshrn_high_n_s32"><div>int16x8_t <b><b>vqrshrn_high_n_s32</b></b> (int16x4_t r, int32x4_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
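+ <h4>Example</h4> <p class="aml">Illustrative sketch, the same pairing pattern at 32-bit source width:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int16x8_t narrow_pair_s32(int32x4_t lo, int32x4_t hi) {
+    return vqrshrn_high_n_s32(vqrshrn_n_s32(lo, 8), hi, 8);
+}</pre>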
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_high_n_s64" type="checkbox"><label for="vqrshrn_high_n_s64"><div>int32x4_t <b><b>vqrshrn_high_n_s64</b></b> (int32x2_t r, int64x2_t a, const int n)<span class="right">Signed saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, saturates each shifted result to a value that is half the original width, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are signed integer values. The destination vector elements are half as long as the source vector elements. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">SQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrshrn-sqrshrn2-signed-saturating-rounded-shift-right-narrow-immediate">SQRSHRN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
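+ <h4>Example</h4> <p class="aml">Illustrative sketch, the pairing pattern at 64-bit source width:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x4_t narrow_pair_s64(int64x2_t lo, int64x2_t hi) {
+    return vqrshrn_high_n_s64(vqrshrn_n_s64(lo, 16), hi, 16);
+}</pre>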
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_high_n_u16" type="checkbox"><label for="vqrshrn_high_n_u16"><div>uint8x16_t <b><b>vqrshrn_high_n_u16</b></b> (uint8x8_t r, uint16x8_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN2</a> Vd.16B,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
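+ <h4>Example</h4> <p class="aml">Illustrative sketch, the unsigned pairing (UQRSHRN then UQRSHRN2):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint8x16_t narrow_pair_u16(uint16x8_t lo, uint16x8_t hi) {
+    return vqrshrn_high_n_u16(vqrshrn_n_u16(lo, 4), hi, 4);
+}</pre>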
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_high_n_u32" type="checkbox"><label for="vqrshrn_high_n_u32"><div>uint16x8_t <b><b>vqrshrn_high_n_u32</b></b> (uint16x4_t r, uint32x4_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN2</a> Vd.8H,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
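+ <h4>Example</h4> <p class="aml">Illustrative sketch, the unsigned pairing at 32-bit source width:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint16x8_t narrow_pair_u32(uint32x4_t lo, uint32x4_t hi) {
+    return vqrshrn_high_n_u32(vqrshrn_n_u32(lo, 8), hi, 8);
+}</pre>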
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrshrn_high_n_u64" type="checkbox"><label for="vqrshrn_high_n_u64"><div>uint32x4_t <b><b>vqrshrn_high_n_u64</b></b> (uint32x2_t r, uint64x2_t a, const int n)<span class="right">Unsigned saturating rounded shift right narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating Rounded Shift Right Narrow (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each result by an immediate value, puts the final result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values. The results are rounded. For truncated results, see <a class="armarm-xref" title="Reference to ARM ARM section">UQSHRN</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqrshrn-uqrshrn2-unsigned-saturating-rounded-shift-right-narrow-immediate">UQRSHRN2</a> Vd.4S,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize*2) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer round_const = if round then (1 &lt;&lt; (shift - 1)) else 0;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], unsigned) + round_const) &gt;&gt; shift;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(element, esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
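+ <h4>Example</h4> <p class="aml">Illustrative sketch, the unsigned pairing at 64-bit source width:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint32x4_t narrow_pair_u64(uint64x2_t lo, uint64x2_t hi) {
+    return vqrshrn_high_n_u64(vqrshrn_n_u64(lo, 16), hi, 16);
+}</pre>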
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_s8" type="checkbox"><label for="vshll_n_s8"><div>int16x8_t <b><b>vshll_n_s8</b></b> (int8x8_t a, const int n)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL</a> Vd.8H,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
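+ <h4>Example</h4> <p class="aml">Illustrative sketch (hand-worked values): each s8 lane is sign-extended to 16 bits and shifted left, so x becomes x * 2^n with no saturation possible.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Lane -3 -&gt; -48; lane 100 -&gt; 1600. */
+int16x8_t widen_s8_by4(int8x8_t a) { return vshll_n_s8(a, 4); }</pre>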
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_s16" type="checkbox"><label for="vshll_n_s16"><div>int32x4_t <b><b>vshll_n_s16</b></b> (int16x4_t a, const int n)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL</a> Vd.4S,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
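+ <h4>Example</h4> <p class="aml">Illustrative sketch at 16-bit source width (hand-worked values):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Lane -2 -&gt; -512; lane 300 -&gt; 76800. */
+int32x4_t widen_s16_by8(int16x4_t a) { return vshll_n_s16(a, 8); }</pre>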
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_s32" type="checkbox"><label for="vshll_n_s32"><div>int64x2_t <b><b>vshll_n_s32</b></b> (int32x2_t a, const int n)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL</a> Vd.2D,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_u8" type="checkbox"><label for="vshll_n_u8"><div>uint16x8_t <b><b>vshll_n_u8</b></b> (uint8x8_t a, const int n)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL</a> Vd.8H,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
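+ <h4>Example</h4> <p class="aml">Illustrative only, not upstream ARM data: a hedged Rust sketch against the crate's core::arch::aarch64 binding, highlighting that the widening step is a zero-extension for the unsigned form.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vshll_n_u8() {
+    use core::arch::aarch64::*;
+    // Eight unsigned 8-bit lanes, each holding 0x80 (128).
+    let a: uint8x8_t = vdup_n_u8(0x80);
+    // Zero-extend each lane to 16 bits, then shift left by 1: 128 &lt;&lt; 1 = 256.
+    let r: uint16x8_t = vshll_n_u8::&lt;1&gt;(a);
+    assert_eq!(vgetq_lane_u16::&lt;0&gt;(r), 256);
+}</pre>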
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_u16" type="checkbox"><label for="vshll_n_u16"><div>uint32x4_t <b><b>vshll_n_u16</b></b> (uint16x4_t a, const int n)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL</a> Vd.4S,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_u32" type="checkbox"><label for="vshll_n_u32"><div>uint64x2_t <b><b>vshll_n_u32</b></b> (uint32x2_t a, const int n)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL</a> Vd.2D,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_s8" type="checkbox"><label for="vshll_high_n_s8"><div>int16x8_t <b><b>vshll_high_n_s8</b></b> (int8x16_t a, const int n)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL2</a> Vd.8H,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
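+ <h4>Example</h4> <p class="aml">Illustrative Rust sketch (not upstream ARM data), assuming the core::arch::aarch64 binding: the "high" form reads only the upper eight lanes of the 16-byte source.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vshll_high_n_s8() {
+    use core::arch::aarch64::*;
+    // Sixteen signed 8-bit lanes; only the upper eight feed SSHLL2.
+    let a: int8x16_t = vdupq_n_s8(5);
+    // Widen the upper half to 16 bits and shift left by 3: 5 &lt;&lt; 3 = 40.
+    let r: int16x8_t = vshll_high_n_s8::&lt;3&gt;(a);
+    assert_eq!(vgetq_lane_s16::&lt;0&gt;(r), 40);
+}</pre>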
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_s16" type="checkbox"><label for="vshll_high_n_s16"><div>int32x4_t <b><b>vshll_high_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL2</a> Vd.4S,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_s32" type="checkbox"><label for="vshll_high_n_s32"><div>int64x2_t <b><b>vshll_high_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL2</a> Vd.2D,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_u8" type="checkbox"><label for="vshll_high_n_u8"><div>uint16x8_t <b><b>vshll_high_n_u8</b></b> (uint8x16_t a, const int n)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL2</a> Vd.8H,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
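+ <h4>Example</h4> <p class="aml">Illustrative Rust sketch (not upstream ARM data), assuming the crate's core::arch::aarch64 binding: the unsigned "high" form zero-extends the upper eight source lanes.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vshll_high_n_u8() {
+    use core::arch::aarch64::*;
+    // Only lanes 8..15 of the 16-byte source are widened by USHLL2.
+    let a: uint8x16_t = vdupq_n_u8(200);
+    // Zero-extend the upper half to 16 bits and shift left by 2: 200 &lt;&lt; 2 = 800.
+    let r: uint16x8_t = vshll_high_n_u8::&lt;2&gt;(a);
+    assert_eq!(vgetq_lane_u16::&lt;0&gt;(r), 800);
+}</pre>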
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_u16" type="checkbox"><label for="vshll_high_n_u16"><div>uint32x4_t <b><b>vshll_high_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL2</a> Vd.4S,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_u32" type="checkbox"><label for="vshll_high_n_u32"><div>uint64x2_t <b><b>vshll_high_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL2</a> Vd.2D,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_s8" type="checkbox"><label for="vshll_n_s8"><div>int16x8_t <b><b>vshll_n_s8</b></b> (int8x8_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL</a> Vd.8H,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+8 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
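+ <h4>Example</h4> <p class="aml">Illustrative Rust sketch (not upstream ARM data), assuming the core::arch::aarch64 binding accepts a shift equal to the element size, which corresponds to this SHLL form rather than SSHLL.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vshll_n_s8_by_size() {
+    use core::arch::aarch64::*;
+    let a: int8x8_t = vdup_n_s8(3);
+    // Shift by the element size (8): each widened lane is scaled by 256,
+    // so 3 &lt;&lt; 8 = 768.
+    let r: int16x8_t = vshll_n_s8::&lt;8&gt;(a);
+    assert_eq!(vgetq_lane_s16::&lt;0&gt;(r), 768);
+}</pre>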
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_s16" type="checkbox"><label for="vshll_n_s16"><div>int32x4_t <b><b>vshll_n_s16</b></b> (int16x4_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL</a> Vd.4S,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+16 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_s32" type="checkbox"><label for="vshll_n_s32"><div>int64x2_t <b><b>vshll_n_s32</b></b> (int32x2_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL</a> Vd.2D,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+32 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_u8" type="checkbox"><label for="vshll_n_u8"><div>uint16x8_t <b><b>vshll_n_u8</b></b> (uint8x8_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL</a> Vd.8H,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+8 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
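+ <h4>Example</h4> <p class="aml">Illustrative Rust sketch (not upstream ARM data), assuming the core::arch::aarch64 binding accepts the element-size shift that selects the SHLL form for the unsigned intrinsic.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vshll_n_u8_by_size() {
+    use core::arch::aarch64::*;
+    let a: uint8x8_t = vdup_n_u8(0xFF);
+    // Shift by the element size (8): 255 &lt;&lt; 8 = 0xFF00.
+    let r: uint16x8_t = vshll_n_u8::&lt;8&gt;(a);
+    assert_eq!(vgetq_lane_u16::&lt;0&gt;(r), 0xFF00);
+}</pre>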
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_u16" type="checkbox"><label for="vshll_n_u16"><div>uint32x4_t <b><b>vshll_n_u16</b></b> (uint16x4_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL</a> Vd.4S,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+16 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_n_u32" type="checkbox"><label for="vshll_n_u32"><div>uint64x2_t <b><b>vshll_n_u32</b></b> (uint32x2_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL</a> Vd.2D,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+32 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_s8" type="checkbox"><label for="vshll_high_n_s8"><div>int16x8_t <b><b>vshll_high_n_s8</b></b> (int8x16_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL2</a> Vd.8H,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+8 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
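+ <h4>Example</h4> <p class="aml">Illustrative Rust sketch (not upstream ARM data), assuming the core::arch::aarch64 binding: the element-size shift on the upper half corresponds to this SHLL2 form, and sign extension happens before the shift.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+unsafe fn demo_vshll_high_n_s8_by_size() {
+    use core::arch::aarch64::*;
+    let a: int8x16_t = vdupq_n_s8(-1);
+    // Upper half, shift by the element size: -1 sign-extends to -1,
+    // then -1 &lt;&lt; 8 = -256.
+    let r: int16x8_t = vshll_high_n_s8::&lt;8&gt;(a);
+    assert_eq!(vgetq_lane_s16::&lt;0&gt;(r), -256);
+}</pre>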
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_s16" type="checkbox"><label for="vshll_high_n_s16"><div>int32x4_t <b><b>vshll_high_n_s16</b></b> (int16x8_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL2</a> Vd.4S,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+16 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_s32" type="checkbox"><label for="vshll_high_n_s32"><div>int64x2_t <b><b>vshll_high_n_s32</b></b> (int32x4_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL2</a> Vd.2D,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+32 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_u8" type="checkbox"><label for="vshll_high_n_u8"><div>uint16x8_t <b><b>vshll_high_n_u8</b></b> (uint8x16_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL2</a> Vd.8H,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+8 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_u16" type="checkbox"><label for="vshll_high_n_u16"><div>uint32x4_t <b><b>vshll_high_n_u16</b></b> (uint16x8_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL2</a> Vd.4S,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+16 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vshll_high_n_u32" type="checkbox"><label for="vshll_high_n_u32"><div>uint64x2_t <b><b>vshll_high_n_u32</b></b> (uint32x4_t a, const int n)<span class="right">Shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left Long (by element size). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, left shifts each result by the element size, writes the final result to a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/shll-shll2-shift-left-long-by-element-size">SHLL2</a> Vd.2D,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+32 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_s8" type="checkbox"><label for="vsri_n_s8"><div>int8x8_t <b><b>vsri_n_s8</b></b> (int8x8_t a, int8x8_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
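+ <h4>Example</h4>
+<p class="aml">A minimal C sketch, assuming an AArch64 target with arm_neon.h; the sample values are illustrative only, not part of the ARM reference. With n = 4, each destination lane keeps its top 4 bits and receives the top 4 bits of the corresponding source lane in its low half.</p>
+<pre class="codeblock">// Illustrative sketch; assumes &lt;arm_neon.h&gt; on AArch64.
+#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8x8_t a = vdup_n_s8((int8_t)0xF0);  // destination lanes: 1111 0000
+    int8x8_t b = vdup_n_s8((int8_t)0xAA);  // source lanes:      1010 1010
+    // Keep the top 4 bits of a, insert LSR(b, 4) = 0000 1010 below them.
+    int8x8_t r = vsri_n_s8(a, b, 4);       // each lane: 1111 1010 (0xFA)
+    printf("%d\n", vget_lane_s8(r, 0));    // prints -6
+    return 0;
+}</pre>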
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_s8" type="checkbox"><label for="vsriq_n_s8"><div>int8x16_t <b><b>vsriq_n_s8</b></b> (int8x16_t a, int8x16_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_s16" type="checkbox"><label for="vsri_n_s16"><div>int16x4_t <b><b>vsri_n_s16</b></b> (int16x4_t a, int16x4_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_s16" type="checkbox"><label for="vsriq_n_s16"><div>int16x8_t <b><b>vsriq_n_s16</b></b> (int16x8_t a, int16x8_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_s32" type="checkbox"><label for="vsri_n_s32"><div>int32x2_t <b><b>vsri_n_s32</b></b> (int32x2_t a, int32x2_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_s32" type="checkbox"><label for="vsriq_n_s32"><div>int32x4_t <b><b>vsriq_n_s32</b></b> (int32x4_t a, int32x4_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_s64" type="checkbox"><label for="vsri_n_s64"><div>int64x1_t <b><b>vsri_n_s64</b></b> (int64x1_t a, int64x1_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_s64" type="checkbox"><label for="vsriq_n_s64"><div>int64x2_t <b><b>vsriq_n_s64</b></b> (int64x2_t a, int64x2_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_u8" type="checkbox"><label for="vsri_n_u8"><div>uint8x8_t <b><b>vsri_n_u8</b></b> (uint8x8_t a, uint8x8_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_u8" type="checkbox"><label for="vsriq_n_u8"><div>uint8x16_t <b><b>vsriq_n_u8</b></b> (uint8x16_t a, uint8x16_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_u16" type="checkbox"><label for="vsri_n_u16"><div>uint16x4_t <b><b>vsri_n_u16</b></b> (uint16x4_t a, uint16x4_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_u16" type="checkbox"><label for="vsriq_n_u16"><div>uint16x8_t <b><b>vsriq_n_u16</b></b> (uint16x8_t a, uint16x8_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_u32" type="checkbox"><label for="vsri_n_u32"><div>uint32x2_t <b><b>vsri_n_u32</b></b> (uint32x2_t a, uint32x2_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_u32" type="checkbox"><label for="vsriq_n_u32"><div>uint32x4_t <b><b>vsriq_n_u32</b></b> (uint32x4_t a, uint32x4_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_u64" type="checkbox"><label for="vsri_n_u64"><div>uint64x1_t <b><b>vsri_n_u64</b></b> (uint64x1_t a, uint64x1_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_u64" type="checkbox"><label for="vsriq_n_u64"><div>uint64x2_t <b><b>vsriq_n_u64</b></b> (uint64x2_t a, uint64x2_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_p64" type="checkbox"><label for="vsri_n_p64"><div>poly64x1_t <b><b>vsri_n_p64</b></b> (poly64x1_t a, poly64x1_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_p64" type="checkbox"><label for="vsriq_n_p64"><div>poly64x2_t <b><b>vsriq_n_p64</b></b> (poly64x2_t a, poly64x2_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_p8" type="checkbox"><label for="vsri_n_p8"><div>poly8x8_t <b><b>vsri_n_p8</b></b> (poly8x8_t a, poly8x8_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_p8" type="checkbox"><label for="vsriq_n_p8"><div>poly8x16_t <b><b>vsriq_n_p8</b></b> (poly8x16_t a, poly8x16_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+1 &le; n &le; 8 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsri_n_p16" type="checkbox"><label for="vsri_n_p16"><div>poly16x4_t <b><b>vsri_n_p16</b></b> (poly16x4_t a, poly16x4_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsriq_n_p16" type="checkbox"><label for="vsriq_n_p16"><div>poly16x8_t <b><b>vsriq_n_p16</b></b> (poly16x8_t a, poly16x8_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+1 &le; n &le; 16 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsrid_n_s64" type="checkbox"><label for="vsrid_n_s64"><div>int64_t <b><b>vsrid_n_s64</b></b> (int64_t a, int64_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsrid_n_u64" type="checkbox"><label for="vsrid_n_u64"><div>uint64_t <b><b>vsrid_n_u64</b></b> (uint64_t a, uint64_t b, const int n)<span class="right">Shift right and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Right and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, right shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the right of each vector element of the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sri-shift-right-and-insert-immediate">SRI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_s8" type="checkbox"><label for="vsli_n_s8"><div>int8x8_t <b><b>vsli_n_s8</b></b> (int8x8_t a, int8x8_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_s8" type="checkbox"><label for="vsliq_n_s8"><div>int8x16_t <b><b>vsliq_n_s8</b></b> (int8x16_t a, int8x16_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_s16" type="checkbox"><label for="vsli_n_s16"><div>int16x4_t <b><b>vsli_n_s16</b></b> (int16x4_t a, int16x4_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_s16" type="checkbox"><label for="vsliq_n_s16"><div>int16x8_t <b><b>vsliq_n_s16</b></b> (int16x8_t a, int16x8_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_s32" type="checkbox"><label for="vsli_n_s32"><div>int32x2_t <b><b>vsli_n_s32</b></b> (int32x2_t a, int32x2_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_s32" type="checkbox"><label for="vsliq_n_s32"><div>int32x4_t <b><b>vsliq_n_s32</b></b> (int32x4_t a, int32x4_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_s64" type="checkbox"><label for="vsli_n_s64"><div>int64x1_t <b><b>vsli_n_s64</b></b> (int64x1_t a, int64x1_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_s64" type="checkbox"><label for="vsliq_n_s64"><div>int64x2_t <b><b>vsliq_n_s64</b></b> (int64x2_t a, int64x2_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_u8" type="checkbox"><label for="vsli_n_u8"><div>uint8x8_t <b><b>vsli_n_u8</b></b> (uint8x8_t a, uint8x8_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_u8" type="checkbox"><label for="vsliq_n_u8"><div>uint8x16_t <b><b>vsliq_n_u8</b></b> (uint8x16_t a, uint8x16_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_u16" type="checkbox"><label for="vsli_n_u16"><div>uint16x4_t <b><b>vsli_n_u16</b></b> (uint16x4_t a, uint16x4_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_u16" type="checkbox"><label for="vsliq_n_u16"><div>uint16x8_t <b><b>vsliq_n_u16</b></b> (uint16x8_t a, uint16x8_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_u32" type="checkbox"><label for="vsli_n_u32"><div>uint32x2_t <b><b>vsli_n_u32</b></b> (uint32x2_t a, uint32x2_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_u32" type="checkbox"><label for="vsliq_n_u32"><div>uint32x4_t <b><b>vsliq_n_u32</b></b> (uint32x4_t a, uint32x4_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+0 &le; n &le; 31 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_u64" type="checkbox"><label for="vsli_n_u64"><div>uint64x1_t <b><b>vsli_n_u64</b></b> (uint64x1_t a, uint64x1_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_u64" type="checkbox"><label for="vsliq_n_u64"><div>uint64x2_t <b><b>vsliq_n_u64</b></b> (uint64x2_t a, uint64x2_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_p64" type="checkbox"><label for="vsli_n_p64"><div>poly64x1_t <b><b>vsli_n_p64</b></b> (poly64x1_t a, poly64x1_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_p64" type="checkbox"><label for="vsliq_n_p64"><div>poly64x2_t <b><b>vsliq_n_p64</b></b> (poly64x2_t a, poly64x2_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_p8" type="checkbox"><label for="vsli_n_p8"><div>poly8x8_t <b><b>vsli_n_p8</b></b> (poly8x8_t a, poly8x8_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.8B,Vn.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_p8" type="checkbox"><label for="vsliq_n_p8"><div>poly8x16_t <b><b>vsliq_n_p8</b></b> (poly8x16_t a, poly8x16_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.16B,Vn.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsli_n_p16" type="checkbox"><label for="vsli_n_p16"><div>poly16x4_t <b><b>vsli_n_p16</b></b> (poly16x4_t a, poly16x4_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.4H,Vn.4H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsliq_n_p16" type="checkbox"><label for="vsliq_n_p16"><div>poly16x8_t <b><b>vsliq_n_p16</b></b> (poly16x8_t a, poly16x8_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Vd.8H,Vn.8H,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vslid_n_s64" type="checkbox"><label for="vslid_n_s64"><div>int64_t <b><b>vslid_n_s64</b></b> (int64_t a, int64_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vslid_n_u64" type="checkbox"><label for="vslid_n_u64"><div>uint64_t <b><b>vslid_n_u64</b></b> (uint64_t a, uint64_t b, const int n)<span class="right">Shift left and insert</span></div></label><article> <h4>Description</h4><p><p class="aml">Shift Left and Insert (immediate). This instruction reads each vector element in the source SIMD&amp;FP register, left shifts each vector element by an immediate value, and inserts the result into the corresponding vector element in the destination SIMD&amp;FP register such that the new zero bits created by the shift are not inserted but retain their existing value. Bits shifted out of the left of each vector element in the source register are lost.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sli-shift-left-and-insert-immediate">SLI</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Dn <br />
+0 &le; n &le; 63 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) mask = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.1" title="function: bits(N) Ones(integer N)">Ones</a>(esize), shift);
+bits(esize) shifted;
+
+for e = 0 to elements-1
+ shifted = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSL.2" title="function: bits(N) LSL(bits(N) x, integer shift)">LSL</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], shift);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize] AND NOT(mask)) OR shifted;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_s32_f32" type="checkbox"><label for="vcvt_s32_f32"><div>int32x2_t <b><b>vcvt_s32_f32</b></b> (float32x2_t a)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_s32_f32" type="checkbox"><label for="vcvtq_s32_f32"><div>int32x4_t <b><b>vcvtq_s32_f32</b></b> (float32x4_t a)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvt_u32_f32" type="checkbox"><label for="vcvt_u32_f32"><div>uint32x2_t <b><b>vcvt_u32_f32</b></b> (float32x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_u32_f32" type="checkbox"><label for="vcvtq_u32_f32"><div>uint32x4_t <b><b>vcvtq_u32_f32</b></b> (float32x4_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtn_s32_f32" type="checkbox"><label for="vcvtn_s32_f32"><div>int32x2_t <b><b>vcvtn_s32_f32</b></b> (float32x2_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtns-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtnq_s32_f32" type="checkbox"><label for="vcvtnq_s32_f32"><div>int32x4_t <b><b>vcvtnq_s32_f32</b></b> (float32x4_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtns-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtn_u32_f32" type="checkbox"><label for="vcvtn_u32_f32"><div>uint32x2_t <b><b>vcvtn_u32_f32</b></b> (float32x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtnu-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNU</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
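+ <h4>Usage sketch (Rust)</h4> <p>Editorial addition, not part of the ARM reference data: a minimal sketch of the same-named intrinsic this tree exposes under core::arch::aarch64, assuming an aarch64 target (where NEON is unconditionally available); the demo function name and input values are invented.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn vcvtn_u32_f32_demo() {
+    use core::arch::aarch64::{vcvtn_u32_f32, vld1_f32, vst1_u32};
+    let input = [0.5f32, 1.5];
+    let mut out = [0u32; 2];
+    // SAFETY: NEON is mandatory on aarch64 and both pointers are valid.
+    unsafe {
+        vst1_u32(out.as_mut_ptr(), vcvtn_u32_f32(vld1_f32(input.as_ptr())));
+    }
+    assert_eq!(out, [0, 2]); // both ties round to the nearest even integer
+}</pre>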
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtnq_u32_f32" type="checkbox"><label for="vcvtnq_u32_f32"><div>uint32x4_t <b>vcvtnq_u32_f32</b> (float32x4_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtnu-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNU</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtm_s32_f32" type="checkbox"><label for="vcvtm_s32_f32"><div>int32x2_t <b>vcvtm_s32_f32</b> (float32x2_t a)<span class="right">Floating-point convert to signed integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtms-vector-floating-point-convert-to-signed-integer-rounding-toward-minus-infinity-vector">FCVTMS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
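+ <h4>Usage sketch (Rust)</h4> <p>Editorial addition (not ARM reference data): rounding toward minus infinity acts as a floor, which this invented-value sketch checks through the same-named core::arch::aarch64 intrinsic.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn vcvtm_s32_f32_demo() {
+    use core::arch::aarch64::{vcvtm_s32_f32, vld1_f32, vst1_s32};
+    let input = [-1.5f32, 1.5];
+    let mut out = [0i32; 2];
+    // SAFETY: NEON is mandatory on aarch64 and both pointers are valid.
+    unsafe {
+        vst1_s32(out.as_mut_ptr(), vcvtm_s32_f32(vld1_f32(input.as_ptr())));
+    }
+    assert_eq!(out, [-2, 1]); // floor: -1.5 drops to -2, 1.5 drops to 1
+}</pre>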
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtmq_s32_f32" type="checkbox"><label for="vcvtmq_s32_f32"><div>int32x4_t <b>vcvtmq_s32_f32</b> (float32x4_t a)<span class="right">Floating-point convert to signed integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtms-vector-floating-point-convert-to-signed-integer-rounding-toward-minus-infinity-vector">FCVTMS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtm_u32_f32" type="checkbox"><label for="vcvtm_u32_f32"><div>uint32x2_t <b>vcvtm_u32_f32</b> (float32x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtmu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-minus-infinity-vector">FCVTMU</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtmq_u32_f32" type="checkbox"><label for="vcvtmq_u32_f32"><div>uint32x4_t <b>vcvtmq_u32_f32</b> (float32x4_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtmu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-minus-infinity-vector">FCVTMU</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
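+ <h4>Usage sketch (Rust)</h4> <p>Editorial addition (not ARM reference data): in the unsigned forms, FPToFixed saturates results that fall outside the destination range, so a negative input floors to 0; the q-form sketch below uses invented values.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn vcvtmq_u32_f32_demo() {
+    use core::arch::aarch64::{vcvtmq_u32_f32, vld1q_f32, vst1q_u32};
+    let input = [-0.5f32, 0.5, 1.9, 2.0];
+    let mut out = [0u32; 4];
+    // SAFETY: NEON is mandatory on aarch64 and both pointers are valid.
+    unsafe {
+        vst1q_u32(out.as_mut_ptr(), vcvtmq_u32_f32(vld1q_f32(input.as_ptr())));
+    }
+    assert_eq!(out, [0, 0, 1, 2]); // floor(-0.5) = -1 saturates to 0 unsigned
+}</pre>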
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtp_s32_f32" type="checkbox"><label for="vcvtp_s32_f32"><div>int32x2_t <b>vcvtp_s32_f32</b> (float32x2_t a)<span class="right">Floating-point convert to signed integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtps-vector-floating-point-convert-to-signed-integer-rounding-toward-plus-infinity-vector">FCVTPS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
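+ <h4>Usage sketch (Rust)</h4> <p>Editorial addition (not ARM reference data): rounding toward plus infinity acts as a ceiling; invented values, same-named core::arch::aarch64 intrinsic.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn vcvtp_s32_f32_demo() {
+    use core::arch::aarch64::{vcvtp_s32_f32, vld1_f32, vst1_s32};
+    let input = [-1.5f32, 1.1];
+    let mut out = [0i32; 2];
+    // SAFETY: NEON is mandatory on aarch64 and both pointers are valid.
+    unsafe {
+        vst1_s32(out.as_mut_ptr(), vcvtp_s32_f32(vld1_f32(input.as_ptr())));
+    }
+    assert_eq!(out, [-1, 2]); // ceiling: -1.5 rises to -1, 1.1 rises to 2
+}</pre>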
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtpq_s32_f32" type="checkbox"><label for="vcvtpq_s32_f32"><div>int32x4_t <b>vcvtpq_s32_f32</b> (float32x4_t a)<span class="right">Floating-point convert to signed integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtps-vector-floating-point-convert-to-signed-integer-rounding-toward-plus-infinity-vector">FCVTPS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtp_u32_f32" type="checkbox"><label for="vcvtp_u32_f32"><div>uint32x2_t <b>vcvtp_u32_f32</b> (float32x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtpu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-plus-infinity-vector">FCVTPU</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtpq_u32_f32" type="checkbox"><label for="vcvtpq_u32_f32"><div>uint32x4_t <b>vcvtpq_u32_f32</b> (float32x4_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtpu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-plus-infinity-vector">FCVTPU</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvta_s32_f32" type="checkbox"><label for="vcvta_s32_f32"><div>int32x2_t <b>vcvta_s32_f32</b> (float32x2_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to a signed integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtas-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
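+ <h4>Usage sketch (Rust)</h4> <p>Editorial addition (not ARM reference data): ties-to-away differs from FCVTNS exactly on halfway values; invented values below, same-named core::arch::aarch64 intrinsic.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn vcvta_s32_f32_demo() {
+    use core::arch::aarch64::{vcvta_s32_f32, vld1_f32, vst1_s32};
+    let input = [-2.5f32, 2.5];
+    let mut out = [0i32; 2];
+    // SAFETY: NEON is mandatory on aarch64 and both pointers are valid.
+    unsafe {
+        vst1_s32(out.as_mut_ptr(), vcvta_s32_f32(vld1_f32(input.as_ptr())));
+    }
+    assert_eq!(out, [-3, 3]); // halfway values round away from zero
+}</pre>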
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtaq_s32_f32" type="checkbox"><label for="vcvtaq_s32_f32"><div>int32x4_t <b>vcvtaq_s32_f32</b> (float32x4_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to a signed integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtas-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvta_u32_f32" type="checkbox"><label for="vcvta_u32_f32"><div>uint32x2_t <b>vcvta_u32_f32</b> (float32x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtau-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAU</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtaq_u32_f32" type="checkbox"><label for="vcvtaq_u32_f32"><div>uint32x4_t <b>vcvtaq_u32_f32</b> (float32x4_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtau-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAU</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvts_s32_f32" type="checkbox"><label for="vcvts_s32_f32"><div>int32_t <b>vcvts_s32_f32</b> (float32_t a)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
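+ <h4>Usage sketch (Rust)</h4> <p>Editorial addition (not ARM reference data): the scalar forms take and return plain values in Rust, so no load/store helpers are needed; values invented.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn vcvts_s32_f32_demo() {
+    use core::arch::aarch64::vcvts_s32_f32;
+    // SAFETY: NEON is mandatory on aarch64.
+    unsafe {
+        assert_eq!(vcvts_s32_f32(2.9), 2); // truncates toward zero
+        assert_eq!(vcvts_s32_f32(-1.7), -1); // toward zero for negatives too
+    }
+}</pre>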
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvts_u32_f32" type="checkbox"><label for="vcvts_u32_f32"><div>uint32_t <b>vcvts_u32_f32</b> (float32_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
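+ <h4>Usage sketch (Rust)</h4> <p>Editorial addition (not ARM reference data): the unsigned scalar form truncates toward zero and saturates out-of-range negative results at 0; values invented.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn vcvts_u32_f32_demo() {
+    use core::arch::aarch64::vcvts_u32_f32;
+    // SAFETY: NEON is mandatory on aarch64.
+    unsafe {
+        assert_eq!(vcvts_u32_f32(3.9), 3); // truncates toward zero
+        assert_eq!(vcvts_u32_f32(-7.5), 0); // out of range: saturates to 0
+    }
+}</pre>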
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtns_s32_f32" type="checkbox"><label for="vcvtns_s32_f32"><div>int32_t <b>vcvtns_s32_f32</b> (float32_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtns-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNS</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
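+ <h4>Usage sketch (Rust)</h4> <p>Editorial addition (not ARM reference data): ties go to the even neighbour, unlike the FCVTAS family; the contrasting vcvtas_s32_f32 call below is the same-named scalar intrinsic from core::arch::aarch64, and all values are invented.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn vcvtns_s32_f32_demo() {
+    use core::arch::aarch64::{vcvtas_s32_f32, vcvtns_s32_f32};
+    // SAFETY: NEON is mandatory on aarch64.
+    unsafe {
+        assert_eq!(vcvtns_s32_f32(2.5), 2); // tie: nearest even is 2
+        assert_eq!(vcvtns_s32_f32(3.5), 4); // tie: nearest even is 4
+        assert_eq!(vcvtas_s32_f32(2.5), 3); // ties-to-away rounds up instead
+    }
+}</pre>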
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtns_u32_f32" type="checkbox"><label for="vcvtns_u32_f32"><div>uint32_t <b>vcvtns_u32_f32</b> (float32_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtnu-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNU</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtms_s32_f32" type="checkbox"><label for="vcvtms_s32_f32"><div>int32_t <b>vcvtms_s32_f32</b> (float32_t a)<span class="right">Floating-point convert to signed integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Signed integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtms-vector-floating-point-convert-to-signed-integer-rounding-toward-minus-infinity-vector">FCVTMS</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtms_u32_f32" type="checkbox"><label for="vcvtms_u32_f32"><div>uint32_t <b>vcvtms_u32_f32</b> (float32_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtmu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-minus-infinity-vector">FCVTMU</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
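+ <h4>Example</h4> <p>An illustrative sketch added by the editor (not part of the ARM reference). It assumes an AArch64 target with &lt;arm_neon.h&gt; available; the wrapper function name is ours. Rounding toward minus infinity floors the value, and a negative input saturates to 0 in the unsigned result.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_fcvtmu(void) {
+    uint32_t a = vcvtms_u32_f32(2.7f);  /* floor(2.7) = 2 */
+    uint32_t b = vcvtms_u32_f32(-0.5f); /* floor(-0.5) is -1; unsigned result saturates to 0 */
+    (void)a; (void)b;
+}
+</pre>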
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtps_s32_f32" type="checkbox"><label for="vcvtps_s32_f32"><div>int32_t <b><b>vcvtps_s32_f32</b></b> (float32_t a)<span class="right">Floating-point convert to signed integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtps-vector-floating-point-convert-to-signed-integer-rounding-toward-plus-infinity-vector">FCVTPS</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtps_u32_f32" type="checkbox"><label for="vcvtps_u32_f32"><div>uint32_t <b><b>vcvtps_u32_f32</b></b> (float32_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtpu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-plus-infinity-vector">FCVTPU</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
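+ <h4>Example</h4> <p>An illustrative sketch added by the editor (not part of the ARM reference); it assumes an AArch64 target and &lt;arm_neon.h&gt;, and the wrapper function name is ours. Rounding toward plus infinity is a ceiling operation, for both the signed and the unsigned variant.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_fcvtps(void) {
+    int32_t  s = vcvtps_s32_f32(2.1f);  /* ceil(2.1) = 3 */
+    int32_t  t = vcvtps_s32_f32(-2.1f); /* ceil(-2.1) = -2 */
+    uint32_t u = vcvtps_u32_f32(2.1f);  /* 3 */
+    (void)s; (void)t; (void)u;
+}
+</pre>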
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtas_s32_f32" type="checkbox"><label for="vcvtas_s32_f32"><div>int32_t <b><b>vcvtas_s32_f32</b></b> (float32_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to Away (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round to Nearest with Ties to Away rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtas-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAS</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtas_u32_f32" type="checkbox"><label for="vcvtas_u32_f32"><div>uint32_t <b><b>vcvtas_u32_f32</b></b> (float32_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest with Ties to Away rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtau-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAU</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
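+ <h4>Example</h4> <p>An illustrative sketch added by the editor (not part of the ARM reference); it assumes an AArch64 target and &lt;arm_neon.h&gt;, and the wrapper function name is ours. Ties-to-away rounds halfway cases away from zero, unlike the default ties-to-even mode.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_fcvtas(void) {
+    int32_t  a = vcvtas_s32_f32(2.5f);  /* halfway case rounds away from zero: 3 */
+    int32_t  b = vcvtas_s32_f32(-2.5f); /* -3 */
+    uint32_t c = vcvtas_u32_f32(2.4f);  /* nearest integer: 2 */
+    (void)a; (void)b; (void)c;
+}
+</pre>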
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_s64_f64" type="checkbox"><label for="vcvt_s64_f64"><div>int64x1_t <b><b>vcvt_s64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_s64_f64" type="checkbox"><label for="vcvtq_s64_f64"><div>int64x2_t <b><b>vcvtq_s64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_u64_f64" type="checkbox"><label for="vcvt_u64_f64"><div>uint64x1_t <b><b>vcvt_u64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_u64_f64" type="checkbox"><label for="vcvtq_u64_f64"><div>uint64x2_t <b><b>vcvtq_u64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
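+ <h4>Example</h4> <p>An illustrative sketch added by the editor (not part of the ARM reference); it assumes an AArch64 target and &lt;arm_neon.h&gt;, and the function and variable names are ours. The round-toward-zero variants truncate, matching a C cast from double to an integer type, on one lane (Dd) or two (Vd.2D).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_fcvtz(void) {
+    double in[2] = { 2.9, -2.9 };
+    float64x1_t d  = vld1_f64(in);     /* one lane: 2.9 */
+    float64x2_t q  = vld1q_f64(in);    /* two lanes: 2.9, -2.9 */
+    int64x1_t   r1 = vcvt_s64_f64(d);  /* truncate toward zero: 2 */
+    int64x2_t   r2 = vcvtq_s64_f64(q); /* lanes: 2, -2 */
+    uint64x1_t  r3 = vcvt_u64_f64(d);  /* 2 */
+    (void)r1; (void)r2; (void)r3;
+}
+</pre>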
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtn_s64_f64" type="checkbox"><label for="vcvtn_s64_f64"><div>int64x1_t <b><b>vcvtn_s64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtns-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtnq_s64_f64" type="checkbox"><label for="vcvtnq_s64_f64"><div>int64x2_t <b><b>vcvtnq_s64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtns-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNS</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtn_u64_f64" type="checkbox"><label for="vcvtn_u64_f64"><div>uint64x1_t <b><b>vcvtn_u64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtnu-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtnq_u64_f64" type="checkbox"><label for="vcvtnq_u64_f64"><div>uint64x2_t <b><b>vcvtnq_u64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtnu-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNU</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
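+ <h4>Example</h4> <p>An illustrative sketch added by the editor (not part of the ARM reference); it assumes an AArch64 target and &lt;arm_neon.h&gt;, and the function and variable names are ours. Round to Nearest breaks ties toward the even integer, so 2.5 and 3.5 land on 2 and 4 respectively.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_fcvtn(void) {
+    double in[2] = { 2.5, 3.5 };
+    float64x2_t q = vld1q_f64(in);
+    int64x2_t   r = vcvtnq_s64_f64(q);
+    int64_t lo = vgetq_lane_s64(r, 0); /* 2.5 ties to even: 2 */
+    int64_t hi = vgetq_lane_s64(r, 1); /* 3.5 ties to even: 4 */
+    (void)lo; (void)hi;
+}
+</pre>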
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtm_s64_f64" type="checkbox"><label for="vcvtm_s64_f64"><div>int64x1_t <b><b>vcvtm_s64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to signed integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtms-vector-floating-point-convert-to-signed-integer-rounding-toward-minus-infinity-vector">FCVTMS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtmq_s64_f64" type="checkbox"><label for="vcvtmq_s64_f64"><div>int64x2_t <b><b>vcvtmq_s64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to signed integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtms-vector-floating-point-convert-to-signed-integer-rounding-toward-minus-infinity-vector">FCVTMS</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtm_u64_f64" type="checkbox"><label for="vcvtm_u64_f64"><div>uint64x1_t <b><b>vcvtm_u64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtmu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-minus-infinity-vector">FCVTMU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtmq_u64_f64" type="checkbox"><label for="vcvtmq_u64_f64"><div>uint64x2_t <b><b>vcvtmq_u64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtmu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-minus-infinity-vector">FCVTMU</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
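+ <h4>Example</h4> <p>An illustrative sketch added by the editor (not part of the ARM reference); it assumes an AArch64 target and &lt;arm_neon.h&gt;, and the function and variable names are ours. Rounding toward minus infinity floors each lane; in the unsigned variant, a lane that floors to a negative value saturates to 0.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_fcvtm(void) {
+    double in[2] = { 2.9, -2.1 };
+    float64x2_t q = vld1q_f64(in);
+    int64x2_t  s = vcvtmq_s64_f64(q); /* floor per lane: 2, -3 */
+    uint64x2_t u = vcvtmq_u64_f64(q); /* 2; the negative lane saturates to 0 */
+    (void)s; (void)u;
+}
+</pre>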
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtp_s64_f64" type="checkbox"><label for="vcvtp_s64_f64"><div>int64x1_t <b><b>vcvtp_s64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to signed integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtps-vector-floating-point-convert-to-signed-integer-rounding-toward-plus-infinity-vector">FCVTPS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtpq_s64_f64" type="checkbox"><label for="vcvtpq_s64_f64"><div>int64x2_t <b><b>vcvtpq_s64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to signed integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtps-vector-floating-point-convert-to-signed-integer-rounding-toward-plus-infinity-vector">FCVTPS</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtp_u64_f64" type="checkbox"><label for="vcvtp_u64_f64"><div>uint64x1_t <b><b>vcvtp_u64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtpu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-plus-infinity-vector">FCVTPU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtpq_u64_f64" type="checkbox"><label for="vcvtpq_u64_f64"><div>uint64x2_t <b><b>vcvtpq_u64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtpu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-plus-infinity-vector">FCVTPU</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
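+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of vcvtp_u64_f64 and vcvtpq_u64_f64, assuming an AArch64 target whose compiler provides arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Round toward plus infinity: 1.25 rounds up to 2. */
+    uint64x1_t r = vcvtp_u64_f64(vdup_n_f64(1.25));
+    /* Negative inputs saturate to 0 in the unsigned result. */
+    double src[2] = { 0.5, -3.75 };
+    uint64x2_t rq = vcvtpq_u64_f64(vld1q_f64(src));
+    printf("%llu %llu %llu\n",
+           (unsigned long long)vget_lane_u64(r, 0),    /* 2 */
+           (unsigned long long)vgetq_lane_u64(rq, 0),  /* 1 */
+           (unsigned long long)vgetq_lane_u64(rq, 1)); /* 0 */
+    return 0;
+}</pre>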
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvta_s64_f64" type="checkbox"><label for="vcvta_s64_f64"><div>int64x1_t <b><b>vcvta_s64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to a signed integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtas-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtaq_s64_f64" type="checkbox"><label for="vcvtaq_s64_f64"><div>int64x2_t <b><b>vcvtaq_s64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to a signed integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtas-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAS</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
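+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of vcvta_s64_f64 and vcvtaq_s64_f64, assuming an AArch64 target whose compiler provides arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Ties round away from zero: 2.5 becomes 3, -2.5 becomes -3. */
+    int64x1_t r = vcvta_s64_f64(vdup_n_f64(2.5));
+    double src[2] = { -2.5, 0.49 };
+    int64x2_t rq = vcvtaq_s64_f64(vld1q_f64(src));
+    printf("%lld %lld %lld\n",
+           (long long)vget_lane_s64(r, 0),    /* 3 */
+           (long long)vgetq_lane_s64(rq, 0),  /* -3 */
+           (long long)vgetq_lane_s64(rq, 1)); /* 0 */
+    return 0;
+}</pre>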
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvta_u64_f64" type="checkbox"><label for="vcvta_u64_f64"><div>uint64x1_t <b><b>vcvta_u64_f64</b></b> (float64x1_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtau-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtaq_u64_f64" type="checkbox"><label for="vcvtaq_u64_f64"><div>uint64x2_t <b><b>vcvtaq_u64_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtau-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAU</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
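+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of vcvta_u64_f64 and vcvtaq_u64_f64, assuming an AArch64 target whose compiler provides arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Unsigned ties-to-away: 1.5 becomes 2; negatives saturate to 0. */
+    uint64x1_t r = vcvta_u64_f64(vdup_n_f64(1.5));
+    double src[2] = { 2.5, -0.6 };
+    uint64x2_t rq = vcvtaq_u64_f64(vld1q_f64(src));
+    printf("%llu %llu %llu\n",
+           (unsigned long long)vget_lane_u64(r, 0),    /* 2 */
+           (unsigned long long)vgetq_lane_u64(rq, 0),  /* 3 */
+           (unsigned long long)vgetq_lane_u64(rq, 1)); /* 0 */
+    return 0;
+}</pre>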
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtd_s64_f64" type="checkbox"><label for="vcvtd_s64_f64"><div>int64_t <b><b>vcvtd_s64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtd_u64_f64" type="checkbox"><label for="vcvtd_u64_f64"><div>uint64_t <b><b>vcvtd_u64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
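+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of the scalar vcvtd_s64_f64 and vcvtd_u64_f64, assuming an AArch64 target whose compiler provides arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Truncation toward zero, like a C cast but saturating. */
+    int64_t  s = vcvtd_s64_f64(-1.9);  /* -1 */
+    uint64_t u = vcvtd_u64_f64(1.9);   /* 1 */
+    uint64_t z = vcvtd_u64_f64(-0.9);  /* negative saturates to 0 */
+    printf("%lld %llu %llu\n", (long long)s,
+           (unsigned long long)u, (unsigned long long)z);
+    return 0;
+}</pre>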
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtnd_s64_f64" type="checkbox"><label for="vcvtnd_s64_f64"><div>int64_t <b><b>vcvtnd_s64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtns-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtnd_u64_f64" type="checkbox"><label for="vcvtnd_u64_f64"><div>uint64_t <b><b>vcvtnd_u64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to even (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtnu-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-even-vector">FCVTNU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
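+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of vcvtnd_s64_f64 and vcvtnd_u64_f64, assuming an AArch64 target whose compiler provides arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Ties go to the even integer: 2.5 becomes 2, 3.5 becomes 4. */
+    printf("%lld %lld %llu\n",
+           (long long)vcvtnd_s64_f64(2.5),           /* 2 */
+           (long long)vcvtnd_s64_f64(3.5),           /* 4 */
+           (unsigned long long)vcvtnd_u64_f64(0.5)); /* 0 */
+    return 0;
+}</pre>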
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtmd_s64_f64" type="checkbox"><label for="vcvtmd_s64_f64"><div>int64_t <b><b>vcvtmd_s64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to signed integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtms-vector-floating-point-convert-to-signed-integer-rounding-toward-minus-infinity-vector">FCVTMS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtmd_u64_f64" type="checkbox"><label for="vcvtmd_u64_f64"><div>uint64_t <b><b>vcvtmd_u64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Minus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtmu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-minus-infinity-vector">FCVTMU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
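+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of vcvtmd_s64_f64 and vcvtmd_u64_f64, assuming an AArch64 target whose compiler provides arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Floor semantics: -1.1 becomes -2; unsigned negatives saturate to 0. */
+    printf("%lld %llu %llu\n",
+           (long long)vcvtmd_s64_f64(-1.1),           /* -2 */
+           (unsigned long long)vcvtmd_u64_f64(1.9),   /* 1 */
+           (unsigned long long)vcvtmd_u64_f64(-0.5)); /* 0 */
+    return 0;
+}</pre>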
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtpd_s64_f64" type="checkbox"><label for="vcvtpd_s64_f64"><div>int64_t <b><b>vcvtpd_s64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to signed integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtps-vector-floating-point-convert-to-signed-integer-rounding-toward-plus-infinity-vector">FCVTPS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtpd_u64_f64" type="checkbox"><label for="vcvtpd_u64_f64"><div>uint64_t <b><b>vcvtpd_u64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to unsigned integer, rounding toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Plus infinity (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtpu-vector-floating-point-convert-to-unsigned-integer-rounding-toward-plus-infinity-vector">FCVTPU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
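+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of vcvtpd_s64_f64 and vcvtpd_u64_f64, assuming an AArch64 target whose compiler provides arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Ceiling semantics: 1.1 becomes 2, -1.9 becomes -1. */
+    printf("%lld %lld %llu\n",
+           (long long)vcvtpd_s64_f64(1.1),           /* 2 */
+           (long long)vcvtpd_s64_f64(-1.9),          /* -1 */
+           (unsigned long long)vcvtpd_u64_f64(0.2)); /* 1 */
+    return 0;
+}</pre>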
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtad_s64_f64" type="checkbox"><label for="vcvtad_s64_f64"><div>int64_t <b><b>vcvtad_s64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to signed integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to a signed integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtas-vector-floating-point-convert-to-signed-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtad_u64_f64" type="checkbox"><label for="vcvtad_u64_f64"><div>uint64_t <b><b>vcvtad_u64_f64</b></b> (float64_t a)<span class="right">Floating-point convert to unsigned integer, rounding to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). This instruction converts each element in a vector from a floating-point value to an unsigned integer value using the Round to Nearest with Ties to Away rounding mode and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtau-vector-floating-point-convert-to-unsigned-integer-rounding-to-nearest-with-ties-to-away-vector">FCVTAU</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
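+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of vcvtad_s64_f64 and vcvtad_u64_f64, assuming an AArch64 target whose compiler provides arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Ties away from zero: 0.5 becomes 1, -0.5 becomes -1. */
+    printf("%lld %lld %llu\n",
+           (long long)vcvtad_s64_f64(0.5),           /* 1 */
+           (long long)vcvtad_s64_f64(-0.5),          /* -1 */
+           (unsigned long long)vcvtad_u64_f64(2.5)); /* 3 */
+    return 0;
+}</pre>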
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_n_s32_f32" type="checkbox"><label for="vcvt_n_s32_f32"><div>int32x2_t <b><b>vcvt_n_s32_f32</b></b> (float32x2_t a, const int n)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, fracbits, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_n_s32_f32" type="checkbox"><label for="vcvtq_n_s32_f32"><div>int32x4_t <b><b>vcvtq_n_s32_f32</b></b> (float32x4_t a, const int n)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, fracbits, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
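+ <h4>Example</h4> <p>Non-normative illustration (not part of the ARM documentation): a minimal C sketch of the fixed-point converts vcvt_n_s32_f32 and vcvtq_n_s32_f32, assuming an AArch64 target whose compiler provides arm_neon.h. n must be a compile-time constant with 1 &le; n &le; 32; the result is the input scaled by 2^n and truncated toward zero.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Q8 fixed point: 1.5 * 2^8 = 384. */
+    int32x2_t r = vcvt_n_s32_f32(vdup_n_f32(1.5f), 8);
+    float vals[4] = { 0.25f, -0.5f, 1.0f, 2.75f };
+    int32x4_t rq = vcvtq_n_s32_f32(vld1q_f32(vals), 2);
+    printf("%d %d %d\n",
+           vget_lane_s32(r, 0),    /* 384 */
+           vgetq_lane_s32(rq, 1),  /* -2: -0.5 * 2^2 */
+           vgetq_lane_s32(rq, 3)); /* 11: 2.75 * 2^2 */
+    return 0;
+}</pre>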
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvt_n_u32_f32" type="checkbox"><label for="vcvt_n_u32_f32"><div>uint32x2_t <b><b>vcvt_n_u32_f32</b></b> (float32x2_t a, const int n)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, fracbits, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_n_u32_f32" type="checkbox"><label for="vcvtq_n_u32_f32"><div>uint32x4_t <b><b>vcvtq_n_u32_f32</b></b> (float32x4_t a, const int n)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, fracbits, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvts_n_s32_f32" type="checkbox"><label for="vcvts_n_s32_f32"><div>int32_t <b><b>vcvts_n_s32_f32</b></b> (float32_t a, const int n)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Sd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvts_n_u32_f32" type="checkbox"><label for="vcvts_n_u32_f32"><div>uint32_t <b><b>vcvts_n_u32_f32</b></b> (float32_t a, const int n)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Sd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_n_s64_f64" type="checkbox"><label for="vcvt_n_s64_f64"><div>int64x1_t <b><b>vcvt_n_s64_f64</b></b> (float64x1_t a, const int n)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_n_s64_f64" type="checkbox"><label for="vcvtq_n_s64_f64"><div>int64x2_t <b><b>vcvtq_n_s64_f64</b></b> (float64x2_t a, const int n)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_n_u64_f64" type="checkbox"><label for="vcvt_n_u64_f64"><div>uint64x1_t <b><b>vcvt_n_u64_f64</b></b> (float64x1_t a, const int n)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_n_u64_f64" type="checkbox"><label for="vcvtq_n_u64_f64"><div>uint64x2_t <b><b>vcvtq_n_u64_f64</b></b> (float64x2_t a, const int n)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtd_n_s64_f64" type="checkbox"><label for="vcvtd_n_s64_f64"><div>int64_t <b><b>vcvtd_n_s64_f64</b></b> (float64_t a, const int n)<span class="right">Floating-point convert to signed integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Signed integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to a signed integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzs-vector-integer-floating-point-convert-to-signed-integer-rounding-toward-zero-vector">FCVTZS</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtd_n_u64_f64" type="checkbox"><label for="vcvtd_n_u64_f64"><div>uint64_t <b><b>vcvtd_n_u64_f64</b></b> (float64_t a, const int n)<span class="right">Floating-point convert to unsigned integer, rounding toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to Unsigned integer, rounding toward Zero (vector). This instruction converts a scalar or each element in a vector from a floating-point value to an unsigned integer value using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtzu-vector-integer-floating-point-convert-to-unsigned-integer-rounding-toward-zero-vector">FCVTZU</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPToFixed.5" title="function: bits(M) FPToFixed(bits(N) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FPToFixed</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_f32_s32" type="checkbox"><label for="vcvt_f32_s32"><div>float32x2_t <b><b>vcvt_f32_s32</b></b> (int32x2_t a)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_f32_s32" type="checkbox"><label for="vcvtq_f32_s32"><div>float32x4_t <b><b>vcvtq_f32_s32</b></b> (int32x4_t a)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvt_f32_u32" type="checkbox"><label for="vcvt_f32_u32"><div>float32x2_t <b><b>vcvt_f32_u32</b></b> (uint32x2_t a)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_f32_u32" type="checkbox"><label for="vcvtq_f32_u32"><div>float32x4_t <b><b>vcvtq_f32_u32</b></b> (uint32x4_t a)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvts_f32_s32" type="checkbox"><label for="vcvts_f32_s32"><div>float32_t <b><b>vcvts_f32_s32</b></b> (int32_t a)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvts_f32_u32" type="checkbox"><label for="vcvts_f32_u32"><div>float32_t <b><b>vcvts_f32_u32</b></b> (uint32_t a)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_f64_s64" type="checkbox"><label for="vcvt_f64_s64"><div>float64x1_t <b><b>vcvt_f64_s64</b></b> (int64x1_t a)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_f64_s64" type="checkbox"><label for="vcvtq_f64_s64"><div>float64x2_t <b><b>vcvtq_f64_s64</b></b> (int64x2_t a)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_f64_u64" type="checkbox"><label for="vcvt_f64_u64"><div>float64x1_t <b><b>vcvt_f64_u64</b></b> (uint64x1_t a)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_f64_u64" type="checkbox"><label for="vcvtq_f64_u64"><div>float64x2_t <b><b>vcvtq_f64_u64</b></b> (uint64x2_t a)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtd_f64_s64" type="checkbox"><label for="vcvtd_f64_s64"><div>float64_t <b><b>vcvtd_f64_s64</b></b> (int64_t a)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
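+ <h4>Usage (illustrative)</h4>
+<p class="aml">A minimal sketch of the scalar form, assuming an AArch64 C compiler with &lt;arm_neon.h&gt;; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Convert one int64_t to one float64_t (SCVTF Dd,Dn).
+float64_t to_f64(int64_t a) {
+    return vcvtd_f64_s64(a);   // e.g. -7 -&gt; -7.0
+}</pre>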
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtd_f64_u64" type="checkbox"><label for="vcvtd_f64_u64"><div>float64_t <b><b>vcvtd_f64_u64</b></b> (uint64_t a)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_n_f32_s32" type="checkbox"><label for="vcvt_n_f32_s32"><div>float32x2_t <b><b>vcvt_n_f32_s32</b></b> (int32x2_t a, const int n)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
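+ <h4>Usage (illustrative)</h4>
+<p class="aml">For the _n_ forms the immediate n gives the number of fraction bits, so each lane is read as a fixed-point value and scaled by 2^-n. A minimal sketch, assuming an AArch64 C compiler with &lt;arm_neon.h&gt;; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Read each i32 lane as Q16.16 fixed-point and convert to float.
+// Example: lanes {98304, -32768} -&gt; {1.5f, -0.5f}
+float32x2_t q16_to_float(int32x2_t a) {
+    return vcvt_n_f32_s32(a, 16);   // SCVTF Vd.2S,Vn.2S,#16: lane / 65536.0f
+}</pre>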
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_n_f32_s32" type="checkbox"><label for="vcvtq_n_f32_s32"><div>float32x4_t <b><b>vcvtq_n_f32_s32</b></b> (int32x4_t a, const int n)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvt_n_f32_u32" type="checkbox"><label for="vcvt_n_f32_u32"><div>float32x2_t <b><b>vcvt_n_f32_u32</b></b> (uint32x2_t a, const int n)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Vd.2S,Vn.2S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
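+ <h4>Usage (illustrative)</h4>
+<p class="aml">A sketch of the unsigned fixed-point form, assuming an AArch64 C compiler with &lt;arm_neon.h&gt;; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Read each u32 lane as UQ16.16 fixed-point and convert to float.
+float32x2_t uq16_to_float(uint32x2_t a) {
+    return vcvt_n_f32_u32(a, 16);   // UCVTF Vd.2S,Vn.2S,#16: lane / 65536.0f
+}</pre>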
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_n_f32_u32" type="checkbox"><label for="vcvtq_n_f32_u32"><div>float32x4_t <b><b>vcvtq_n_f32_u32</b></b> (uint32x4_t a, const int n)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Vd.4S,Vn.4S,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvts_n_f32_s32" type="checkbox"><label for="vcvts_n_f32_s32"><div>float32_t <b><b>vcvts_n_f32_s32</b></b> (int32_t a, const int n)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Sd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
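+ <h4>Usage (illustrative)</h4>
+<p class="aml">A sketch of the scalar fixed-point form, assuming an AArch64 C compiler with &lt;arm_neon.h&gt;; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// One Q8.24 fixed-point value to float (SCVTF Sd,Sn,#24).
+float32_t q24_to_float(int32_t a) {
+    return vcvts_n_f32_s32(a, 24);   // a / 16777216.0f
+}</pre>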
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvts_n_f32_u32" type="checkbox"><label for="vcvts_n_f32_u32"><div>float32_t <b><b>vcvts_n_f32_u32</b></b> (uint32_t a, const int n)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Sd,Sn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+1 &le; n &le; 32 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_n_f64_s64" type="checkbox"><label for="vcvt_n_f64_s64"><div>float64x1_t <b><b>vcvt_n_f64_s64</b></b> (int64x1_t a, const int n)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_n_f64_s64" type="checkbox"><label for="vcvtq_n_f64_s64"><div>float64x2_t <b><b>vcvtq_n_f64_s64</b></b> (int64x2_t a, const int n)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_n_f64_u64" type="checkbox"><label for="vcvt_n_f64_u64"><div>float64x1_t <b><b>vcvt_n_f64_u64</b></b> (uint64x1_t a, const int n)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtq_n_f64_u64" type="checkbox"><label for="vcvtq_n_f64_u64"><div>float64x2_t <b><b>vcvtq_n_f64_u64</b></b> (uint64x2_t a, const int n)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Vd.2D,Vn.2D,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtd_n_f64_s64" type="checkbox"><label for="vcvtd_n_f64_s64"><div>float64_t <b><b>vcvtd_n_f64_s64</b></b> (int64_t a, const int n)<span class="right">Signed integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed integer Convert to Floating-point (vector). This instruction converts each element in a vector from signed integer to floating-point using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/scvtf-vector-integer-signed-integer-convert-to-floating-point-vector">SCVTF</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
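+ <h4>Usage (illustrative)</h4>
+<p class="aml">A sketch of the 64-bit scalar fixed-point form, assuming an AArch64 C compiler with &lt;arm_neon.h&gt;; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// One Q32.32 fixed-point value to double (SCVTF Dd,Dn,#32).
+float64_t q32_to_double(int64_t a) {
+    return vcvtd_n_f64_s64(a, 32);   // a / 4294967296.0
+}</pre>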
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtd_n_f64_u64" type="checkbox"><label for="vcvtd_n_f64_u64"><div>float64_t <b><b>vcvtd_n_f64_u64</b></b> (uint64_t a, const int n)<span class="right">Unsigned integer convert to floating-point</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned integer Convert to Floating-point (vector). This instruction converts each element in a vector from an unsigned integer value to a floating-point value using the rounding mode that is specified by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ucvtf-vector-integer-unsigned-integer-convert-to-floating-point-vector">UCVTF</a> Dd,Dn,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+1 &le; n &le; 64 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding</a> rounding = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundingMode.1" title="function: FPRounding FPRoundingMode(FPCRType fpcr)">FPRoundingMode</a>(FPCR);
+bits(esize) element;
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FixedToFP.5" title="function: bits(N) FixedToFP(bits(M) op, integer fbits, boolean unsigned, FPCRType fpcr, FPRounding rounding)">FixedToFP</a>(element, 0, unsigned, FPCR, rounding);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_f16_f32" type="checkbox"><label for="vcvt_f16_f32"><div>float16x4_t <b><b>vcvt_f16_f32</b></b> (float32x4_t a)<span class="right">Floating-point convert to lower precision narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to lower precision Narrow (vector). This instruction reads each vector element in the SIMD&amp;FP source register, converts each result to half the precision of the source element, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The rounding mode is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtn-fcvtn2-floating-point-convert-to-lower-precision-narrow-vector">FCVTN</a> Vd.4H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.2" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvt_high_f16_f32" type="checkbox"><label for="vcvt_high_f16_f32"><div>float16x8_t <b><b>vcvt_high_f16_f32</b></b> (float16x4_t r, float32x4_t a)<span class="right">Floating-point convert to lower precision narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to lower precision Narrow (vector). This instruction reads each vector element in the SIMD&amp;FP source register, converts each result to half the precision of the source element, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The rounding mode is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtn-fcvtn2-floating-point-convert-to-lower-precision-narrow-vector">FCVTN2</a> Vd.8H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.2" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_f32_f64" type="checkbox"><label for="vcvt_f32_f64"><div>float32x2_t <b><b>vcvt_f32_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to lower precision narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to lower precision Narrow (vector). This instruction reads each vector element in the SIMD&amp;FP source register, converts each result to half the precision of the source element, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The rounding mode is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtn-fcvtn2-floating-point-convert-to-lower-precision-narrow-vector">FCVTN</a> Vd.2S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.2" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_high_f32_f64" type="checkbox"><label for="vcvt_high_f32_f64"><div>float32x4_t <b><b>vcvt_high_f32_f64</b></b> (float32x2_t r, float64x2_t a)<span class="right">Floating-point convert to lower precision narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to lower precision Narrow (vector). This instruction reads each vector element in the SIMD&amp;FP source register, converts each result to half the precision of the source element, writes the final result to a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. The rounding mode is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtn-fcvtn2-floating-point-convert-to-lower-precision-narrow-vector">FCVTN2</a> Vd.4S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.2" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_f32_f16" type="checkbox"><label for="vcvt_f32_f16"><div>float32x4_t <b><b>vcvt_f32_f16</b></b> (float16x4_t a)<span class="right">Floating-point convert to higher precision long</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to higher precision Long (vector). This instruction reads each element in a vector in the SIMD&amp;FP source register, converts each value to double the precision of the source element using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes each result to the equivalent element of the vector in the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtl-fcvtl2-floating-point-convert-to-higher-precision-long-vector">FCVTL</a> Vd.4S,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.2" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcvt_high_f32_f16" type="checkbox"><label for="vcvt_high_f32_f16"><div>float32x4_t <b><b>vcvt_high_f32_f16</b></b> (float16x8_t a)<span class="right">Floating-point convert to higher precision long</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to higher precision Long (vector). This instruction reads each element in a vector in the SIMD&amp;FP source register, converts each value to double the precision of the source element using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes each result to the equivalent element of the vector in the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtl-fcvtl2-floating-point-convert-to-higher-precision-long-vector">FCVTL2</a> Vd.4S,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.2" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_f64_f32" type="checkbox"><label for="vcvt_f64_f32"><div>float64x2_t <b><b>vcvt_f64_f32</b></b> (float32x2_t a)<span class="right">Floating-point convert to higher precision long</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to higher precision Long (vector). This instruction reads each element in a vector in the SIMD&amp;FP source register, converts each value to double the precision of the source element using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes each result to the equivalent element of the vector in the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtl-fcvtl2-floating-point-convert-to-higher-precision-long-vector">FCVTL</a> Vd.2D,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.2" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvt_high_f64_f32" type="checkbox"><label for="vcvt_high_f64_f32"><div>float64x2_t <b><b>vcvt_high_f64_f32</b></b> (float32x4_t a)<span class="right">Floating-point convert to higher precision long</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to higher precision Long (vector). This instruction reads each element in a vector in the SIMD&amp;FP source register, converts each value to double the precision of the source element using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes each result to the equivalent element of the vector in the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtl-fcvtl2-floating-point-convert-to-higher-precision-long-vector">FCVTL2</a> Vd.2D,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(2*datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.2" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtx_f32_f64" type="checkbox"><label for="vcvtx_f32_f64"><div>float32x2_t <b><b>vcvtx_f32_f64</b></b> (float64x2_t a)<span class="right">Floating-point convert to lower precision narrow, rounding to odd</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to lower precision Narrow, rounding to odd (vector). This instruction reads each vector element in the source SIMD&amp;FP register, narrows each value to half the precision of the source element using the Round to Odd rounding mode, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtxn-fcvtxn2-floating-point-convert-to-lower-precision-narrow-rounding-to-odd-vector">FCVTXN</a> Vd.2S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.3" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr, FPRounding rounding)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], FPCR, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding_ODD" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding_ODD</a>);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtxd_f32_f64" type="checkbox"><label for="vcvtxd_f32_f64"><div>float32_t <b><b>vcvtxd_f32_f64</b></b> (float64_t a)<span class="right">Floating-point convert to lower precision narrow, rounding to odd</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to lower precision Narrow, rounding to odd (vector). This instruction reads each vector element in the source SIMD&amp;FP register, narrows each value to half the precision of the source element using the Round to Odd rounding mode, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtxn-fcvtxn2-floating-point-convert-to-lower-precision-narrow-rounding-to-odd-vector">FCVTXN</a> Sd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.3" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr, FPRounding rounding)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], FPCR, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding_ODD" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding_ODD</a>);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcvtx_high_f32_f64" type="checkbox"><label for="vcvtx_high_f32_f64"><div>float32x4_t <b><b>vcvtx_high_f32_f64</b></b> (float32x2_t r, float64x2_t a)<span class="right">Floating-point convert to lower precision narrow, rounding to odd</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Convert to lower precision Narrow, rounding to odd (vector). This instruction reads each vector element in the source SIMD&amp;FP register, narrows each value to half the precision of the source element using the Round to Odd rounding mode, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fcvtxn-fcvtxn2-floating-point-convert-to-lower-precision-narrow-rounding-to-odd-vector">FCVTXN2</a> Vd.4S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPConvert.3" title="function: bits(M) FPConvert(bits(N) op, FPCRType fpcr, FPRounding rounding)">FPConvert</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize], FPCR, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#FPRounding_ODD" title="enumeration FPRounding {FPRounding_TIEEVEN, FPRounding_POSINF,
+ FPRounding_NEGINF, FPRounding_ZERO,
+ FPRounding_TIEAWAY, FPRounding_ODD}">FPRounding_ODD</a>);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrnd_f32" type="checkbox"><label for="vrnd_f32"><div>float32x2_t <b><b>vrnd_f32</b></b> (float32x2_t a)<span class="right">Floating-point round to integral, toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Zero (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintz-vector-floating-point-round-to-integral-toward-zero-vector">FRINTZ</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndq_f32" type="checkbox"><label for="vrndq_f32"><div>float32x4_t <b><b>vrndq_f32</b></b> (float32x4_t a)<span class="right">Floating-point round to integral, toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Zero (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintz-vector-floating-point-round-to-integral-toward-zero-vector">FRINTZ</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrnd_f64" type="checkbox"><label for="vrnd_f64"><div>float64x1_t <b><b>vrnd_f64</b></b> (float64x1_t a)<span class="right">Floating-point round to integral, toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Zero (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintz-vector-floating-point-round-to-integral-toward-zero-vector">FRINTZ</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndq_f64" type="checkbox"><label for="vrndq_f64"><div>float64x2_t <b><b>vrndq_f64</b></b> (float64x2_t a)<span class="right">Floating-point round to integral, toward zero</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Zero (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Zero rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintz-vector-floating-point-round-to-integral-toward-zero-vector">FRINTZ</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndn_f32" type="checkbox"><label for="vrndn_f32"><div>float32x2_t <b><b>vrndn_f32</b></b> (float32x2_t a)<span class="right">Floating-point round to integral, to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to even (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintn-vector-floating-point-round-to-integral-to-nearest-with-ties-to-even-vector">FRINTN</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndnq_f32" type="checkbox"><label for="vrndnq_f32"><div>float32x4_t <b><b>vrndnq_f32</b></b> (float32x4_t a)<span class="right">Floating-point round to integral, to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to even (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintn-vector-floating-point-round-to-integral-to-nearest-with-ties-to-even-vector">FRINTN</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndn_f64" type="checkbox"><label for="vrndn_f64"><div>float64x1_t <b><b>vrndn_f64</b></b> (float64x1_t a)<span class="right">Floating-point round to integral, to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to even (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintn-vector-floating-point-round-to-integral-to-nearest-with-ties-to-even-vector">FRINTN</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndnq_f64" type="checkbox"><label for="vrndnq_f64"><div>float64x2_t <b><b>vrndnq_f64</b></b> (float64x2_t a)<span class="right">Floating-point round to integral, to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to even (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintn-vector-floating-point-round-to-integral-to-nearest-with-ties-to-even-vector">FRINTN</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndns_f32" type="checkbox"><label for="vrndns_f32"><div>float32_t <b><b>vrndns_f32</b></b> (float32_t a)<span class="right">Floating-point round to integral, to nearest with ties to even</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to even (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintn-vector-floating-point-round-to-integral-to-nearest-with-ties-to-even-vector">FRINTN</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndm_f32" type="checkbox"><label for="vrndm_f32"><div>float32x2_t <b><b>vrndm_f32</b></b> (float32x2_t a)<span class="right">Floating-point round to integral, toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Minus infinity (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintm-vector-floating-point-round-to-integral-toward-minus-infinity-vector">FRINTM</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndmq_f32" type="checkbox"><label for="vrndmq_f32"><div>float32x4_t <b><b>vrndmq_f32</b></b> (float32x4_t a)<span class="right">Floating-point round to integral, toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Minus infinity (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintm-vector-floating-point-round-to-integral-toward-minus-infinity-vector">FRINTM</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndm_f64" type="checkbox"><label for="vrndm_f64"><div>float64x1_t <b><b>vrndm_f64</b></b> (float64x1_t a)<span class="right">Floating-point round to integral, toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Minus infinity (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintm-vector-floating-point-round-to-integral-toward-minus-infinity-vector">FRINTM</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndmq_f64" type="checkbox"><label for="vrndmq_f64"><div>float64x2_t <b><b>vrndmq_f64</b></b> (float64x2_t a)<span class="right">Floating-point round to integral, toward minus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Minus infinity (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Minus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintm-vector-floating-point-round-to-integral-toward-minus-infinity-vector">FRINTM</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndp_f32" type="checkbox"><label for="vrndp_f32"><div>float32x2_t <b><b>vrndp_f32</b></b> (float32x2_t a)<span class="right">Floating-point round to integral, toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Plus infinity (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintp-vector-floating-point-round-to-integral-toward-plus-infinity-vector">FRINTP</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndpq_f32" type="checkbox"><label for="vrndpq_f32"><div>float32x4_t <b><b>vrndpq_f32</b></b> (float32x4_t a)<span class="right">Floating-point round to integral, toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Plus infinity (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintp-vector-floating-point-round-to-integral-toward-plus-infinity-vector">FRINTP</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndp_f64" type="checkbox"><label for="vrndp_f64"><div>float64x1_t <b><b>vrndp_f64</b></b> (float64x1_t a)<span class="right">Floating-point round to integral, toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Plus infinity (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintp-vector-floating-point-round-to-integral-toward-plus-infinity-vector">FRINTP</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndpq_f64" type="checkbox"><label for="vrndpq_f64"><div>float64x2_t <b><b>vrndpq_f64</b></b> (float64x2_t a)<span class="right">Floating-point round to integral, toward plus infinity</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, toward Plus infinity (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round towards Plus Infinity rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintp-vector-floating-point-round-to-integral-toward-plus-infinity-vector">FRINTP</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrnda_f32" type="checkbox"><label for="vrnda_f32"><div>float32x2_t <b><b>vrnda_f32</b></b> (float32x2_t a)<span class="right">Floating-point round to integral, to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to Away (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest with Ties to Away rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frinta-vector-floating-point-round-to-integral-to-nearest-with-ties-to-away-vector">FRINTA</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndaq_f32" type="checkbox"><label for="vrndaq_f32"><div>float32x4_t <b><b>vrndaq_f32</b></b> (float32x4_t a)<span class="right">Floating-point round to integral, to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to Away (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest with Ties to Away rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frinta-vector-floating-point-round-to-integral-to-nearest-with-ties-to-away-vector">FRINTA</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrnda_f64" type="checkbox"><label for="vrnda_f64"><div>float64x1_t <b><b>vrnda_f64</b></b> (float64x1_t a)<span class="right">Floating-point round to integral, to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to Away (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest with Ties to Away rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frinta-vector-floating-point-round-to-integral-to-nearest-with-ties-to-away-vector">FRINTA</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndaq_f64" type="checkbox"><label for="vrndaq_f64"><div>float64x2_t <b><b>vrndaq_f64</b></b> (float64x2_t a)<span class="right">Floating-point round to integral, to nearest with ties to away</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, to nearest with ties to Away (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the Round to Nearest with Ties to Away rounding mode, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frinta-vector-floating-point-round-to-integral-to-nearest-with-ties-to-away-vector">FRINTA</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndi_f32" type="checkbox"><label for="vrndi_f32"><div>float32x2_t <b><b>vrndi_f32</b></b> (float32x2_t a)<span class="right">Floating-point round to integral, using current rounding mode</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, using current rounding mode (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frinti-vector-floating-point-round-to-integral-using-current-rounding-mode-vector">FRINTI</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndiq_f32" type="checkbox"><label for="vrndiq_f32"><div>float32x4_t <b><b>vrndiq_f32</b></b> (float32x4_t a)<span class="right">Floating-point round to integral, using current rounding mode</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, using current rounding mode (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frinti-vector-floating-point-round-to-integral-using-current-rounding-mode-vector">FRINTI</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndi_f64" type="checkbox"><label for="vrndi_f64"><div>float64x1_t <b><b>vrndi_f64</b></b> (float64x1_t a)<span class="right">Floating-point round to integral, using current rounding mode</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, using current rounding mode (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frinti-vector-floating-point-round-to-integral-using-current-rounding-mode-vector">FRINTI</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndiq_f64" type="checkbox"><label for="vrndiq_f64"><div>float64x2_t <b><b>vrndiq_f64</b></b> (float64x2_t a)<span class="right">Floating-point round to integral, using current rounding mode</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral, using current rounding mode (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frinti-vector-floating-point-round-to-integral-using-current-rounding-mode-vector">FRINTI</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndx_f32" type="checkbox"><label for="vrndx_f32"><div>float32x2_t <b><b>vrndx_f32</b></b> (float32x2_t a)<span class="right">Floating-point round to integral exact, using current rounding mode</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral exact, using current rounding mode (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintx-vector-floating-point-round-to-integral-exact-using-current-rounding-mode-vector">FRINTX</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndxq_f32" type="checkbox"><label for="vrndxq_f32"><div>float32x4_t <b><b>vrndxq_f32</b></b> (float32x4_t a)<span class="right">Floating-point round to integral exact, using current rounding mode</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral exact, using current rounding mode (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintx-vector-floating-point-round-to-integral-exact-using-current-rounding-mode-vector">FRINTX</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vrndx_f64" type="checkbox"><label for="vrndx_f64"><div>float64x1_t <b><b>vrndx_f64</b></b> (float64x1_t a)<span class="right">Floating-point round to integral exact, using current rounding mode</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral exact, using current rounding mode (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintx-vector-floating-point-round-to-integral-exact-using-current-rounding-mode-vector">FRINTX</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrndxq_f64" type="checkbox"><label for="vrndxq_f64"><div>float64x2_t <b><b>vrndxq_f64</b></b> (float64x2_t a)<span class="right">Floating-point round to integral exact, using current rounding mode</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Round to Integral exact, using current rounding mode (vector). This instruction rounds a vector of floating-point values in the SIMD&amp;FP source register to integral floating-point values of the same size using the rounding mode that is determined by the <a class="armarm-xref" title="Reference to ARM ARM section">FPCR</a>, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frintx-vector-floating-point-round-to-integral-exact-using-current-rounding-mode-vector">FRINTX</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRoundInt.4" title="function: bits(N) FPRoundInt(bits(N) op, FPCRType fpcr, FPRounding rounding, boolean exact)">FPRoundInt</a>(element, FPCR, rounding, exact);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovn_s16" type="checkbox"><label for="vmovn_s16"><div>int8x8_t <b><b>vmovn_s16</b></b> (int16x8_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN</a> Vd.8B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovn_s32" type="checkbox"><label for="vmovn_s32"><div>int16x4_t <b><b>vmovn_s32</b></b> (int32x4_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN</a> Vd.4H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovn_s64" type="checkbox"><label for="vmovn_s64"><div>int32x2_t <b><b>vmovn_s64</b></b> (int64x2_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN</a> Vd.2S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovn_u16" type="checkbox"><label for="vmovn_u16"><div>uint8x8_t <b><b>vmovn_u16</b></b> (uint16x8_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN</a> Vd.8B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovn_u32" type="checkbox"><label for="vmovn_u32"><div>uint16x4_t <b><b>vmovn_u32</b></b> (uint32x4_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN</a> Vd.4H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovn_u64" type="checkbox"><label for="vmovn_u64"><div>uint32x2_t <b><b>vmovn_u64</b></b> (uint64x2_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN</a> Vd.2S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovn_high_s16" type="checkbox"><label for="vmovn_high_s16"><div>int8x16_t <b><b>vmovn_high_s16</b></b> (int8x8_t r, int16x8_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN2</a> Vd.16B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovn_high_s32" type="checkbox"><label for="vmovn_high_s32"><div>int16x8_t <b><b>vmovn_high_s32</b></b> (int16x4_t r, int32x4_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN2</a> Vd.8H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovn_high_s64" type="checkbox"><label for="vmovn_high_s64"><div>int32x4_t <b><b>vmovn_high_s64</b></b> (int32x2_t r, int64x2_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN2</a> Vd.4S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovn_high_u16" type="checkbox"><label for="vmovn_high_u16"><div>uint8x16_t <b><b>vmovn_high_u16</b></b> (uint8x8_t r, uint16x8_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN2</a> Vd.16B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovn_high_u32" type="checkbox"><label for="vmovn_high_u32"><div>uint16x8_t <b><b>vmovn_high_u32</b></b> (uint16x4_t r, uint32x4_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN2</a> Vd.8H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovn_high_u64" type="checkbox"><label for="vmovn_high_u64"><div>uint32x4_t <b><b>vmovn_high_u64</b></b> (uint32x2_t r, uint64x2_t a)<span class="right">Extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, narrows each value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/xtn-xtn2-extract-narrow">XTN2</a> Vd.4S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovl_s8" type="checkbox"><label for="vmovl_s8"><div>int16x8_t <b><b>vmovl_s8</b></b> (int8x8_t a)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL</a> Vd.8H,Vn.8B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovl_s16" type="checkbox"><label for="vmovl_s16"><div>int32x4_t <b><b>vmovl_s16</b></b> (int16x4_t a)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL</a> Vd.4S,Vn.4H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovl_s32" type="checkbox"><label for="vmovl_s32"><div>int64x2_t <b><b>vmovl_s32</b></b> (int32x2_t a)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL</a> Vd.2D,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovl_u8" type="checkbox"><label for="vmovl_u8"><div>uint16x8_t <b><b>vmovl_u8</b></b> (uint8x8_t a)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL</a> Vd.8H,Vn.8B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovl_u16" type="checkbox"><label for="vmovl_u16"><div>uint32x4_t <b><b>vmovl_u16</b></b> (uint16x4_t a)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL</a> Vd.4S,Vn.4H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovl_u32" type="checkbox"><label for="vmovl_u32"><div>uint64x2_t <b><b>vmovl_u32</b></b> (uint32x2_t a)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL</a> Vd.2D,Vn.2S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovl_high_s8" type="checkbox"><label for="vmovl_high_s8"><div>int16x8_t <b><b>vmovl_high_s8</b></b> (int8x16_t a)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL2</a> Vd.8H,Vn.16B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovl_high_s16" type="checkbox"><label for="vmovl_high_s16"><div>int32x4_t <b><b>vmovl_high_s16</b></b> (int16x8_t a)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element from the source SIMD&amp;FP register, left shifts each vector element by the specified shift amount, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL2</a> Vd.4S,Vn.8H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovl_high_s32" type="checkbox"><label for="vmovl_high_s32"><div>int64x2_t <b><b>vmovl_high_s32</b></b> (int32x4_t a)<span class="right">Signed shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts each value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sshll-sshll2-signed-shift-left-long-immediate">SSHLL2</a> Vd.2D,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovl_high_u8" type="checkbox"><label for="vmovl_high_u8"><div>uint16x8_t <b><b>vmovl_high_u8</b></b> (uint8x16_t a)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL2</a> Vd.8H,Vn.16B,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovl_high_u16" type="checkbox"><label for="vmovl_high_u16"><div>uint32x4_t <b><b>vmovl_high_u16</b></b> (uint16x8_t a)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL2</a> Vd.4S,Vn.8H,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovl_high_u32" type="checkbox"><label for="vmovl_high_u32"><div>uint64x2_t <b><b>vmovl_high_u32</b></b> (uint32x4_t a)<span class="right">Unsigned shift left long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Shift Left Long (immediate). This instruction reads each vector element in the lower or upper half of the source SIMD&amp;FP register, shifts the unsigned integer value left by the specified number of bits, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ushll-ushll2-unsigned-shift-left-long-immediate">USHLL2</a> Vd.2D,Vn.4S,#0
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize*2) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned) &lt;&lt; shift;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = element&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_s16" type="checkbox"><label for="vqmovn_s16"><div>int8x8_t <b><b>vqmovn_s16</b></b> (int16x8_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN</a> Vd.8B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_s32" type="checkbox"><label for="vqmovn_s32"><div>int16x4_t <b><b>vqmovn_s32</b></b> (int32x4_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN</a> Vd.4H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_s64" type="checkbox"><label for="vqmovn_s64"><div>int32x2_t <b><b>vqmovn_s64</b></b> (int64x2_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN</a> Vd.2S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_u16" type="checkbox"><label for="vqmovn_u16"><div>uint8x8_t <b><b>vqmovn_u16</b></b> (uint16x8_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN</a> Vd.8B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_u32" type="checkbox"><label for="vqmovn_u32"><div>uint16x4_t <b><b>vqmovn_u32</b></b> (uint32x4_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN</a> Vd.4H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_u64" type="checkbox"><label for="vqmovn_u64"><div>uint32x2_t <b><b>vqmovn_u64</b></b> (uint64x2_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN</a> Vd.2S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovnh_s16" type="checkbox"><label for="vqmovnh_s16"><div>int8_t <b><b>vqmovnh_s16</b></b> (int16_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN</a> Bd,Hn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovns_s32" type="checkbox"><label for="vqmovns_s32"><div>int16_t <b><b>vqmovns_s32</b></b> (int32_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN</a> Hd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovnd_s64" type="checkbox"><label for="vqmovnd_s64"><div>int32_t <b><b>vqmovnd_s64</b></b> (int64_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN</a> Sd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovnh_u16" type="checkbox"><label for="vqmovnh_u16"><div>uint8_t <b><b>vqmovnh_u16</b></b> (uint16_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN</a> Bd,Hn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovns_u32" type="checkbox"><label for="vqmovns_u32"><div>uint16_t <b><b>vqmovns_u32</b></b> (uint32_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN</a> Hd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovnd_u64" type="checkbox"><label for="vqmovnd_u64"><div>uint32_t <b><b>vqmovnd_u64</b></b> (uint64_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN</a> Sd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_high_s16" type="checkbox"><label for="vqmovn_high_s16"><div>int8x16_t <b><b>vqmovn_high_s16</b></b> (int8x8_t r, int16x8_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN2</a> Vd.16B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_high_s32" type="checkbox"><label for="vqmovn_high_s32"><div>int16x8_t <b><b>vqmovn_high_s32</b></b> (int16x4_t r, int32x4_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN2</a> Vd.8H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_high_s64" type="checkbox"><label for="vqmovn_high_s64"><div>int32x4_t <b><b>vqmovn_high_s64</b></b> (int32x2_t r, int64x2_t a)<span class="right">Signed saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates the value to half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtn-sqxtn2-signed-saturating-extract-narrow">SQXTN2</a> Vd.4S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_high_u16" type="checkbox"><label for="vqmovn_high_u16"><div>uint8x16_t <b><b>vqmovn_high_u16</b></b> (uint8x8_t r, uint16x8_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN2</a> Vd.16B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_high_u32" type="checkbox"><label for="vqmovn_high_u32"><div>uint16x8_t <b><b>vqmovn_high_u32</b></b> (uint16x4_t r, uint32x4_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN2</a> Vd.8H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovn_high_u64" type="checkbox"><label for="vqmovn_high_u64"><div>uint32x4_t <b><b>vqmovn_high_u64</b></b> (uint32x2_t r, uint64x2_t a)<span class="right">Unsigned saturating extract narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned saturating extract Narrow. This instruction reads each vector element from the source SIMD&amp;FP register, saturates each value to half the original width, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uqxtn-uqxtn2-unsigned-saturating-extract-narrow">UQXTN2</a> Vd.4S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SatQ.3" title="function: (bits(N), boolean) SatQ(integer i, integer N, boolean unsigned)">SatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(element, unsigned), esize, unsigned);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovun_s16" type="checkbox"><label for="vqmovun_s16"><div>uint8x8_t <b><b>vqmovun_s16</b></b> (int16x8_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN</a> Vd.8B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovun_s32" type="checkbox"><label for="vqmovun_s32"><div>uint16x4_t <b><b>vqmovun_s32</b></b> (int32x4_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN</a> Vd.4H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovun_s64" type="checkbox"><label for="vqmovun_s64"><div>uint32x2_t <b><b>vqmovun_s64</b></b> (int64x2_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN</a> Vd.2S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqmovunh_s16" type="checkbox"><label for="vqmovunh_s16"><div>uint8_t <b><b>vqmovunh_s16</b></b> (int16_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN</a> Bd,Hn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovuns_s32" type="checkbox"><label for="vqmovuns_s32"><div>uint16_t <b><b>vqmovuns_s32</b></b> (int32_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN</a> Hd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovund_s64" type="checkbox"><label for="vqmovund_s64"><div>uint32_t <b><b>vqmovund_s64</b></b> (int64_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN</a> Sd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovun_high_s16" type="checkbox"><label for="vqmovun_high_s16"><div>uint8x16_t <b><b>vqmovun_high_s16</b></b> (uint8x8_t r, int16x8_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN2</a> Vd.16B,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.8B <br />
+a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovun_high_s32" type="checkbox"><label for="vqmovun_high_s32"><div>uint16x8_t <b><b>vqmovun_high_s32</b></b> (uint16x4_t r, int32x4_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN2</a> Vd.8H,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.4H <br />
+a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqmovun_high_s64" type="checkbox"><label for="vqmovun_high_s64"><div>uint32x4_t <b><b>vqmovun_high_s64</b></b> (uint32x2_t r, int64x2_t a)<span class="right">Signed saturating extract unsigned narrow</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating extract Unsigned Narrow. This instruction reads each signed integer value in the vector of the source SIMD&amp;FP register, saturates the value to an unsigned integer value that is half the original width, places the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&amp;FP register. The destination vector elements are half as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqxtun-sqxtun2-signed-saturating-extract-unsigned-narrow">SQXTUN2</a> Vd.4S,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>r &rarr; Vd.2S <br />
+a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(2*datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(2*esize) element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 2*esize];
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedSatQ.2" title="function: (bits(N), boolean) UnsignedSatQ(integer i, integer N)">UnsignedSatQ</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(element), esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.write.2" title="accessor: Vpart[integer n, integer part] = bits(width) value">Vpart</a>[d, part] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmla_lane_s16" type="checkbox"><label for="vmla_lane_s16"><div>int16x4_t <b><b>vmla_lane_s16</b></b> (int16x4_t a, int16x4_t b, int16x4_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_lane_s16" type="checkbox"><label for="vmlaq_lane_s16"><div>int16x8_t <b><b>vmlaq_lane_s16</b></b> (int16x8_t a, int16x8_t b, int16x4_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_lane_s32" type="checkbox"><label for="vmla_lane_s32"><div>int32x2_t <b><b>vmla_lane_s32</b></b> (int32x2_t a, int32x2_t b, int32x2_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_lane_s32" type="checkbox"><label for="vmlaq_lane_s32"><div>int32x4_t <b><b>vmlaq_lane_s32</b></b> (int32x4_t a, int32x4_t b, int32x2_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_lane_u16" type="checkbox"><label for="vmla_lane_u16"><div>uint16x4_t <b><b>vmla_lane_u16</b></b> (uint16x4_t a, uint16x4_t b, uint16x4_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_lane_u16" type="checkbox"><label for="vmlaq_lane_u16"><div>uint16x8_t <b><b>vmlaq_lane_u16</b></b> (uint16x8_t a, uint16x8_t b, uint16x4_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_lane_u32" type="checkbox"><label for="vmla_lane_u32"><div>uint32x2_t <b><b>vmla_lane_u32</b></b> (uint32x2_t a, uint32x2_t b, uint32x2_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_lane_u32" type="checkbox"><label for="vmlaq_lane_u32"><div>uint32x4_t <b><b>vmlaq_lane_u32</b></b> (uint32x4_t a, uint32x4_t b, uint32x2_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_lane_f32" type="checkbox"><label for="vmla_lane_f32"><div>float32x2_t <b><b>vmla_lane_f32</b></b> (float32x2_t a, float32x2_t b, float32x2_t v, const int lane)<span class="right">Undefined</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_lane_f32" type="checkbox"><label for="vmlaq_lane_f32"><div>float32x4_t <b><b>vmlaq_lane_f32</b></b> (float32x4_t a, float32x4_t b, float32x2_t v, const int lane)<span class="right">Undefined</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3
+</pre> <h4>Argument Preparation</h4><pre>0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_laneq_s16" type="checkbox"><label for="vmla_laneq_s16"><div>int16x4_t <b><b>vmla_laneq_s16</b></b> (int16x4_t a, int16x4_t b, int16x8_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_laneq_s16" type="checkbox"><label for="vmlaq_laneq_s16"><div>int16x8_t <b><b>vmlaq_laneq_s16</b></b> (int16x8_t a, int16x8_t b, int16x8_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmla_laneq_s32" type="checkbox"><label for="vmla_laneq_s32"><div>int32x2_t <b><b>vmla_laneq_s32</b></b> (int32x2_t a, int32x2_t b, int32x4_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_laneq_s32" type="checkbox"><label for="vmlaq_laneq_s32"><div>int32x4_t <b><b>vmlaq_laneq_s32</b></b> (int32x4_t a, int32x4_t b, int32x4_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmla_laneq_u16" type="checkbox"><label for="vmla_laneq_u16"><div>uint16x4_t <b><b>vmla_laneq_u16</b></b> (uint16x4_t a, uint16x4_t b, uint16x8_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_laneq_u16" type="checkbox"><label for="vmlaq_laneq_u16"><div>uint16x8_t <b><b>vmlaq_laneq_u16</b></b> (uint16x8_t a, uint16x8_t b, uint16x8_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmla_laneq_u32" type="checkbox"><label for="vmla_laneq_u32"><div>uint32x2_t <b><b>vmla_laneq_u32</b></b> (uint32x2_t a, uint32x2_t b, uint32x4_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_laneq_u32" type="checkbox"><label for="vmlaq_laneq_u32"><div>uint32x4_t <b><b>vmlaq_laneq_u32</b></b> (uint32x4_t a, uint32x4_t b, uint32x4_t v, const int lane)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmla_laneq_f32" type="checkbox"><label for="vmla_laneq_f32"><div>float32x2_t <b><b>vmla_laneq_f32</b></b> (float32x2_t a, float32x2_t b, float32x4_t v, const int lane)<span class="right">Undefined</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] + (b[i] * v[lane]) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_laneq_f32" type="checkbox"><label for="vmlaq_laneq_f32"><div>float32x4_t <b><b>vmlaq_laneq_f32</b></b> (float32x4_t a, float32x4_t b, float32x4_t v, const int lane)<span class="right">Undefined</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] + (b[i] * v[lane]) for i = 0 to 3
+</pre> <h4>Argument Preparation</h4><pre>0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>N/A &rarr; result</pre>
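+ <h4>Example</h4> <p>An illustrative sketch for the floating-point forms (not from the ARM reference), assuming arm_neon.h on AArch64; names are placeholders. Per the formula above it computes a[i] + (b[i] * v[lane]); lane must be a compile-time constant in 0..3.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* One axpy-style step: out[i] = acc[i] + x[i] * coeff[2] over four lanes. */
+float32x4_t axpy_lane(float32x4_t acc, float32x4_t x, float32x4_t coeff)
+{
+    return vmlaq_laneq_f32(acc, x, coeff, 2);
+}</pre>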
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_lane_s16" type="checkbox"><label for="vmlal_lane_s16"><div>int32x4_t <b><b>vmlal_lane_s16</b></b> (int32x4_t a, int16x4_t b, int16x4_t v, const int lane)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_lane_s32" type="checkbox"><label for="vmlal_lane_s32"><div>int64x2_t <b><b>vmlal_lane_s32</b></b> (int64x2_t a, int32x2_t b, int32x2_t v, const int lane)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_lane_u16" type="checkbox"><label for="vmlal_lane_u16"><div>uint32x4_t <b><b>vmlal_lane_u16</b></b> (uint32x4_t a, uint16x4_t b, uint16x4_t v, const int lane)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_lane_u32" type="checkbox"><label for="vmlal_lane_u32"><div>uint64x2_t <b><b>vmlal_lane_u32</b></b> (uint64x2_t a, uint32x2_t b, uint32x2_t v, const int lane)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_lane_s16" type="checkbox"><label for="vmlal_high_lane_s16"><div>int32x4_t <b><b>vmlal_high_lane_s16</b></b> (int32x4_t a, int16x8_t b, int16x4_t v, const int lane)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_lane_s32" type="checkbox"><label for="vmlal_high_lane_s32"><div>int64x2_t <b><b>vmlal_high_lane_s32</b></b> (int64x2_t a, int32x4_t b, int32x2_t v, const int lane)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_lane_u16" type="checkbox"><label for="vmlal_high_lane_u16"><div>uint32x4_t <b><b>vmlal_high_lane_u16</b></b> (uint32x4_t a, uint16x8_t b, uint16x4_t v, const int lane)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_lane_u32" type="checkbox"><label for="vmlal_high_lane_u32"><div>uint64x2_t <b><b>vmlal_high_lane_u32</b></b> (uint64x2_t a, uint32x4_t b, uint32x2_t v, const int lane)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_laneq_s16" type="checkbox"><label for="vmlal_laneq_s16"><div>int32x4_t <b><b>vmlal_laneq_s16</b></b> (int32x4_t a, int16x4_t b, int16x8_t v, const int lane)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
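+ <h4>Usage sketch (Rust)</h4> <p class="aml">A minimal, non-normative sketch of calling this intrinsic from Rust via core::arch::aarch64, assuming the stdarch binding takes the lane as a const generic; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..4: a[i] + (b[i] as i32) * (v[LANE] as i32).
+unsafe fn mlal_lane3(a: int32x4_t, b: int16x4_t, v: int16x8_t) -&gt; int32x4_t {
+    vmlal_laneq_s16::&lt;3&gt;(a, b, v) // LANE = 3 chosen arbitrarily; 0 &lt;= LANE &lt;= 7
+}</pre>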
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_laneq_s32" type="checkbox"><label for="vmlal_laneq_s32"><div>int64x2_t <b><b>vmlal_laneq_s32</b></b> (int64x2_t a, int32x2_t b, int32x4_t v, const int lane)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
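+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch of the 32-bit variant, assuming the stdarch binding vmlal_laneq_s32 with a const-generic lane; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..2: a[i] + (b[i] as i64) * (v[LANE] as i64).
+unsafe fn mlal_lane1(a: int64x2_t, b: int32x2_t, v: int32x4_t) -&gt; int64x2_t {
+    vmlal_laneq_s32::&lt;1&gt;(a, b, v) // 0 &lt;= LANE &lt;= 3
+}</pre>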
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_laneq_u16" type="checkbox"><label for="vmlal_laneq_u16"><div>uint32x4_t <b><b>vmlal_laneq_u16</b></b> (uint32x4_t a, uint16x4_t b, uint16x8_t v, const int lane)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
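+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch of the unsigned variant, assuming the stdarch binding vmlal_laneq_u16; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..4: a[i] + (b[i] as u32) * (v[LANE] as u32).
+unsafe fn umlal_lane5(a: uint32x4_t, b: uint16x4_t, v: uint16x8_t) -&gt; uint32x4_t {
+    vmlal_laneq_u16::&lt;5&gt;(a, b, v) // 0 &lt;= LANE &lt;= 7
+}</pre>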
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_laneq_u32" type="checkbox"><label for="vmlal_laneq_u32"><div>uint64x2_t <b><b>vmlal_laneq_u32</b></b> (uint64x2_t a, uint32x2_t b, uint32x4_t v, const int lane)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
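+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch, assuming the stdarch binding vmlal_laneq_u32; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..2: a[i] + (b[i] as u64) * (v[LANE] as u64).
+unsafe fn umlal_lane2(a: uint64x2_t, b: uint32x2_t, v: uint32x4_t) -&gt; uint64x2_t {
+    vmlal_laneq_u32::&lt;2&gt;(a, b, v) // 0 &lt;= LANE &lt;= 3
+}</pre>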
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_laneq_s16" type="checkbox"><label for="vmlal_high_laneq_s16"><div>int32x4_t <b><b>vmlal_high_laneq_s16</b></b> (int32x4_t a, int16x8_t b, int16x8_t v, const int lane)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
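+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch of the SMLAL2 form, assuming the stdarch binding vmlal_high_laneq_s16, which reads the upper four lanes of b; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..4: a[i] + (b[i + 4] as i32) * (v[LANE] as i32).
+unsafe fn mlal2_lane7(a: int32x4_t, b: int16x8_t, v: int16x8_t) -&gt; int32x4_t {
+    vmlal_high_laneq_s16::&lt;7&gt;(a, b, v) // 0 &lt;= LANE &lt;= 7
+}</pre>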
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_laneq_s32" type="checkbox"><label for="vmlal_high_laneq_s32"><div>int64x2_t <b><b>vmlal_high_laneq_s32</b></b> (int64x2_t a, int32x4_t b, int32x4_t v, const int lane)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
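+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch, assuming the stdarch binding vmlal_high_laneq_s32, which reads the upper two lanes of b; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..2: a[i] + (b[i + 2] as i64) * (v[LANE] as i64).
+unsafe fn mlal2_lane3(a: int64x2_t, b: int32x4_t, v: int32x4_t) -&gt; int64x2_t {
+    vmlal_high_laneq_s32::&lt;3&gt;(a, b, v) // 0 &lt;= LANE &lt;= 3
+}</pre>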
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_laneq_u16" type="checkbox"><label for="vmlal_high_laneq_u16"><div>uint32x4_t <b><b>vmlal_high_laneq_u16</b></b> (uint32x4_t a, uint16x8_t b, uint16x8_t v, const int lane)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
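+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch of the unsigned UMLAL2 form, assuming the stdarch binding vmlal_high_laneq_u16; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..4: a[i] + (b[i + 4] as u32) * (v[LANE] as u32).
+unsafe fn umlal2_lane6(a: uint32x4_t, b: uint16x8_t, v: uint16x8_t) -&gt; uint32x4_t {
+    vmlal_high_laneq_u16::&lt;6&gt;(a, b, v) // 0 &lt;= LANE &lt;= 7
+}</pre>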
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_laneq_u32" type="checkbox"><label for="vmlal_high_laneq_u32"><div>uint64x2_t <b><b>vmlal_high_laneq_u32</b></b> (uint64x2_t a, uint32x4_t b, uint32x4_t v, const int lane)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
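+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch, assuming the stdarch binding vmlal_high_laneq_u32; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..2: a[i] + (b[i + 2] as u64) * (v[LANE] as u64).
+unsafe fn umlal2_lane0(a: uint64x2_t, b: uint32x4_t, v: uint32x4_t) -&gt; uint64x2_t {
+    vmlal_high_laneq_u32::&lt;0&gt;(a, b, v) // 0 &lt;= LANE &lt;= 3
+}</pre>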
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_lane_s16" type="checkbox"><label for="vqdmlal_lane_s16"><div>int32x4_t <b><b>vqdmlal_lane_s16</b></b> (int32x4_t a, int16x4_t b, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
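+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch of the saturating doubling form, assuming the stdarch binding vqdmlal_lane_s16; saturation in either step sets FPSR.QC. The helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..4: sat32(a[i] + sat32(2 * b[i] * v[LANE])).
+unsafe fn qdmlal_lane0(a: int32x4_t, b: int16x4_t, v: int16x4_t) -&gt; int32x4_t {
+    vqdmlal_lane_s16::&lt;0&gt;(a, b, v) // 0 &lt;= LANE &lt;= 3
+}</pre>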
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_lane_s32" type="checkbox"><label for="vqdmlal_lane_s32"><div>int64x2_t <b><b>vqdmlal_lane_s32</b></b> (int64x2_t a, int32x2_t b, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
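+ <h4>Usage sketch (Rust)</h4> <p class="aml">A hedged sketch of the 32-bit saturating doubling form, assuming the stdarch binding vqdmlal_lane_s32; the helper name is hypothetical.</p>
+<pre class="codeblock">use core::arch::aarch64::*;
+
+// For each i in 0..2: sat64(a[i] + sat64(2 * b[i] * v[LANE])).
+unsafe fn qdmlal_lane1(a: int64x2_t, b: int32x2_t, v: int32x2_t) -&gt; int64x2_t {
+    vqdmlal_lane_s32::&lt;1&gt;(a, b, v) // 0 &lt;= LANE &lt;= 1
+}</pre>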
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlalh_lane_s16" type="checkbox"><label for="vqdmlalh_lane_s16"><div>int32_t <b><b>vqdmlalh_lane_s16</b></b> (int32_t a, int16_t b, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Sd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Hn <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) operand3 = V[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+    element1 = SInt(Elem[operand1, e, esize]);
+    element2 = SInt(Elem[operand2, e, esize]);
+    (product, sat1) = SignedSatQ(2 * element1 * element2, 2 * esize);
+    if sub_op then
+        accum = SInt(Elem[operand3, e, 2*esize]) - SInt(product);
+    else
+        accum = SInt(Elem[operand3, e, 2*esize]) + SInt(product);
+    (Elem[result, e, 2*esize], sat2) = SignedSatQ(accum, 2 * esize);
+    if sat1 || sat2 then FPSR.QC = '1';
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlals_lane_s32" type="checkbox"><label for="vqdmlals_lane_s32"><div>int64_t <b>vqdmlals_lane_s32</b> (int64_t a, int32_t b, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Dd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_high_lane_s16" type="checkbox"><label for="vqdmlal_high_lane_s16"><div>int32x4_t <b>vqdmlal_high_lane_s16</b> (int32x4_t a, int16x8_t b, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_high_lane_s32" type="checkbox"><label for="vqdmlal_high_lane_s32"><div>int64x2_t <b>vqdmlal_high_lane_s32</b> (int64x2_t a, int32x4_t b, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_laneq_s16" type="checkbox"><label for="vqdmlal_laneq_s16"><div>int32x4_t <b>vqdmlal_laneq_s16</b> (int32x4_t a, int16x4_t b, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_laneq_s32" type="checkbox"><label for="vqdmlal_laneq_s32"><div>int64x2_t <b>vqdmlal_laneq_s32</b> (int64x2_t a, int32x2_t b, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlalh_laneq_s16" type="checkbox"><label for="vqdmlalh_laneq_s16"><div>int32_t <b>vqdmlalh_laneq_s16</b> (int32_t a, int16_t b, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Sd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Hn <br />
+v &rarr; Vm.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlals_laneq_s32" type="checkbox"><label for="vqdmlals_laneq_s32"><div>int64_t <b>vqdmlals_laneq_s32</b> (int64_t a, int32_t b, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Dd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_high_laneq_s16" type="checkbox"><label for="vqdmlal_high_laneq_s16"><div>int32x4_t <b>vqdmlal_high_laneq_s16</b> (int32x4_t a, int16x8_t b, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_high_laneq_s32" type="checkbox"><label for="vqdmlal_high_laneq_s32"><div>int64x2_t <b>vqdmlal_high_laneq_s32</b> (int64x2_t a, int32x4_t b, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
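+ <h4>Example</h4> <p>Not part of the ARM reference: a minimal C sketch of one way to exercise this intrinsic, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; the variable names and values are invented. Each output element is a[i] + sat(2 * b[2+i] * v[lane]), with b's <i>high</i> half (elements 2 and 3) feeding the multiply.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64_t av[2] = {1000, 2000};          /* 64-bit accumulators        */
+    int32_t bv[4] = {0, 0, 10, 20};        /* only the high half is used */
+    int32_t vv[4] = {1, 2, 3, 7};          /* 128-bit lane source        */
+    /* lane = 3 selects v[3] = 7: r[i] = sat(a[i] + 2*b[2+i]*7) */
+    int64x2_t r = vqdmlal_high_laneq_s32(vld1q_s64(av), vld1q_s32(bv),
+                                         vld1q_s32(vv), 3);
+    printf("%lld %lld\n", (long long)vgetq_lane_s64(r, 0),
+                          (long long)vgetq_lane_s64(r, 1)); /* 1140 2280 */
+    return 0;
+}</pre>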
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmls_lane_s16" type="checkbox"><label for="vmls_lane_s16"><div>int16x4_t <b><b>vmls_lane_s16</b></b> (int16x4_t a, int16x4_t b, int16x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
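+ <h4>Example</h4> <p>Not part of the ARM reference: a minimal C sketch (invented names and values), assuming a NEON-capable toolchain with &lt;arm_neon.h&gt;. Note that, per the &lt;esize-1:0&gt; slice in the pseudocode above, the product is truncated to 16 bits before the subtraction, so the arithmetic wraps rather than saturates.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16_t av[4] = {100, 100, 100, 100};
+    int16_t bv[4] = {1, 2, 3, 4};
+    int16_t vv[4] = {0, 5, 0, 0};
+    /* lane = 1 broadcasts v[1] = 5: r[i] = a[i] - b[i]*5 */
+    int16x4_t r = vmls_lane_s16(vld1_s16(av), vld1_s16(bv), vld1_s16(vv), 1);
+    int16_t out[4];
+    vst1_s16(out, r);
+    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 95 90 85 80 */
+    return 0;
+}</pre>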
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_lane_s16" type="checkbox"><label for="vmlsq_lane_s16"><div>int16x8_t <b><b>vmlsq_lane_s16</b></b> (int16x8_t a, int16x8_t b, int16x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_lane_s32" type="checkbox"><label for="vmls_lane_s32"><div>int32x2_t <b><b>vmls_lane_s32</b></b> (int32x2_t a, int32x2_t b, int32x2_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_lane_s32" type="checkbox"><label for="vmlsq_lane_s32"><div>int32x4_t <b><b>vmlsq_lane_s32</b></b> (int32x4_t a, int32x4_t b, int32x2_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
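+ <h4>Example</h4> <p>Not part of the ARM reference: the q form applies the same per-element computation across four 32-bit lanes. The C sketch below (invented values, NEON-capable toolchain assumed) checks the intrinsic against the equivalent scalar loop; values are kept small so the scalar expression cannot overflow.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;assert.h&gt;
+
+int main(void) {
+    int32_t av[4] = {7, -3, 1000000, 0};
+    int32_t bv[4] = {2, 9, 3, -4};
+    int32_t vv[2] = {-6, 11};
+    /* lane = 0 broadcasts v[0] = -6 */
+    int32x4_t r = vmlsq_lane_s32(vld1q_s32(av), vld1q_s32(bv),
+                                 vld1_s32(vv), 0);
+    int32_t out[4];
+    vst1q_s32(out, r);
+    for (int i = 0; i &lt; 4; i++)
+        assert(out[i] == av[i] - bv[i] * vv[0]); /* same element-wise math */
+    return 0;
+}</pre>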
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_lane_u16" type="checkbox"><label for="vmls_lane_u16"><div>uint16x4_t <b><b>vmls_lane_u16</b></b> (uint16x4_t a, uint16x4_t b, uint16x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_lane_u16" type="checkbox"><label for="vmlsq_lane_u16"><div>uint16x8_t <b><b>vmlsq_lane_u16</b></b> (uint16x8_t a, uint16x8_t b, uint16x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_lane_u32" type="checkbox"><label for="vmls_lane_u32"><div>uint32x2_t <b><b>vmls_lane_u32</b></b> (uint32x2_t a, uint32x2_t b, uint32x2_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_lane_u32" type="checkbox"><label for="vmlsq_lane_u32"><div>uint32x4_t <b><b>vmlsq_lane_u32</b></b> (uint32x4_t a, uint32x4_t b, uint32x2_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
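+ <h4>Example</h4> <p>Not part of the ARM reference: a minimal C sketch with invented values (NEON-capable toolchain assumed). Although the pseudocode reads elements with UInt, truncating the product to esize bits makes the result bit-identical to the signed form; subtraction below zero simply wraps modulo 2<sup>32</sup>.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t av[4] = {1, 1, 1, 1};
+    uint32_t bv[4] = {2, 0, 5, 0};
+    uint32_t vv[2] = {1, 3};
+    /* lane = 1 broadcasts v[1] = 3 */
+    uint32x4_t r = vmlsq_lane_u32(vld1q_u32(av), vld1q_u32(bv),
+                                  vld1_u32(vv), 1);
+    uint32_t out[4];
+    vst1q_u32(out, r);
+    /* 1 - 2*3 and 1 - 5*3 wrap: 4294967291 1 4294967282 1 */
+    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
+    return 0;
+}</pre>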
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_lane_f32" type="checkbox"><label for="vmls_lane_f32"><div>float32x2_t <b><b>vmls_lane_f32</b></b> (float32x2_t a, float32x2_t b, float32x2_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] - (b[i] * v[lane]) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_lane_f32" type="checkbox"><label for="vmlsq_lane_f32"><div>float32x4_t <b><b>vmlsq_lane_f32</b></b> (float32x4_t a, float32x4_t b, float32x2_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] - (b[i] * v[lane]) for i = 0 to 3
+</pre> <h4>Argument Preparation</h4><pre>0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_laneq_s16" type="checkbox"><label for="vmls_laneq_s16"><div>int16x4_t <b><b>vmls_laneq_s16</b></b> (int16x4_t a, int16x4_t b, int16x8_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
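+ <h4>Example</h4> <p>Not part of the ARM reference: the laneq forms take the scalar from a 128-bit vector, so lane may range up to 7 here. A minimal C sketch (invented values, AArch64 toolchain assumed):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16_t av[4] = {50, 60, 70, 80};
+    int16_t bv[4] = {1, 2, 3, 4};
+    int16_t vv[8] = {0, 0, 0, 0, 0, 0, 0, 9}; /* 128-bit lane source */
+    /* lane = 7 selects v[7] = 9: r[i] = a[i] - b[i]*9 */
+    int16x4_t r = vmls_laneq_s16(vld1_s16(av), vld1_s16(bv),
+                                 vld1q_s16(vv), 7);
+    int16_t out[4];
+    vst1_s16(out, r);
+    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 41 42 43 44 */
+    return 0;
+}</pre>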
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_laneq_s16" type="checkbox"><label for="vmlsq_laneq_s16"><div>int16x8_t <b><b>vmlsq_laneq_s16</b></b> (int16x8_t a, int16x8_t b, int16x8_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmls_laneq_s32" type="checkbox"><label for="vmls_laneq_s32"><div>int32x2_t <b><b>vmls_laneq_s32</b></b> (int32x2_t a, int32x2_t b, int32x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
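+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); the values and the chosen lane are hypothetical, and the code assumes an AArch64 target. The by-element form multiplies every element of b by one lane of v and subtracts the products from a.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x2_t example_vmls_laneq_s32(void) {
+    int32x2_t a = vdup_n_s32(100);            /* accumulator {100, 100} */
+    int32x2_t b = vdup_n_s32(2);              /* multiplicand {2, 2} */
+    const int32_t vv[4] = {5, 6, 7, 8};
+    int32x4_t v = vld1q_s32(vv);              /* lane source */
+    /* r[i] = a[i] - b[i] * v[3] = 100 - 2*8 = 84 in both lanes */
+    return vmls_laneq_s32(a, b, v, 3);
+}</pre>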
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_laneq_s32" type="checkbox"><label for="vmlsq_laneq_s32"><div>int32x4_t <b><b>vmlsq_laneq_s32</b></b> (int32x4_t a, int32x4_t b, int32x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
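+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical. The q form performs the same by-element operation across all four 32-bit lanes of a 128-bit vector.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x4_t example_vmlsq_laneq_s32(void) {
+    int32x4_t a = vdupq_n_s32(50);
+    int32x4_t b = vdupq_n_s32(3);
+    const int32_t vv[4] = {1, 2, 3, 4};
+    int32x4_t v = vld1q_s32(vv);
+    /* r[i] = a[i] - b[i] * v[2] = 50 - 3*3 = 41 in all four lanes */
+    return vmlsq_laneq_s32(a, b, v, 2);
+}</pre>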
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmls_laneq_u16" type="checkbox"><label for="vmls_laneq_u16"><div>uint16x4_t <b><b>vmls_laneq_u16</b></b> (uint16x4_t a, uint16x4_t b, uint16x8_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
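+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical. It shows that the unsigned 16-bit result wraps modulo 2^16 when the products exceed the accumulator.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint16x4_t example_vmls_laneq_u16(void) {
+    uint16x4_t a = vdup_n_u16(10);
+    uint16x4_t b = vdup_n_u16(4);
+    const uint16_t vv[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16x8_t v = vld1q_u16(vv);
+    /* r[i] = a[i] - b[i] * v[7] = 10 - 28, which wraps modulo 2^16 to 65518 */
+    return vmls_laneq_u16(a, b, v, 7);
+}</pre>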
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_laneq_u16" type="checkbox"><label for="vmlsq_laneq_u16"><div>uint16x8_t <b><b>vmlsq_laneq_u16</b></b> (uint16x8_t a, uint16x8_t b, uint16x8_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
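+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical. The q form applies the by-element multiply-subtract across all eight 16-bit lanes.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint16x8_t example_vmlsq_laneq_u16(void) {
+    uint16x8_t a = vdupq_n_u16(100);
+    uint16x8_t b = vdupq_n_u16(5);
+    const uint16_t vv[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+    uint16x8_t v = vld1q_u16(vv);
+    /* r[i] = a[i] - b[i] * v[1] = 100 - 5*2 = 90 in all eight lanes */
+    return vmlsq_laneq_u16(a, b, v, 1);
+}</pre>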
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmls_laneq_u32" type="checkbox"><label for="vmls_laneq_u32"><div>uint32x2_t <b><b>vmls_laneq_u32</b></b> (uint32x2_t a, uint32x2_t b, uint32x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
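+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint32x2_t example_vmls_laneq_u32(void) {
+    uint32x2_t a = vdup_n_u32(1000);
+    uint32x2_t b = vdup_n_u32(10);
+    const uint32_t vv[4] = {10, 20, 30, 40};
+    uint32x4_t v = vld1q_u32(vv);
+    /* r[i] = a[i] - b[i] * v[2] = 1000 - 10*30 = 700 in both lanes */
+    return vmls_laneq_u32(a, b, v, 2);
+}</pre>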
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_laneq_u32" type="checkbox"><label for="vmlsq_laneq_u32"><div>uint32x4_t <b><b>vmlsq_laneq_u32</b></b> (uint32x4_t a, uint32x4_t b, uint32x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
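+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint32x4_t example_vmlsq_laneq_u32(void) {
+    uint32x4_t a = vdupq_n_u32(9);
+    uint32x4_t b = vdupq_n_u32(1);
+    const uint32_t vv[4] = {1, 2, 3, 4};
+    uint32x4_t v = vld1q_u32(vv);
+    /* r[i] = a[i] - b[i] * v[0] = 9 - 1*1 = 8 in all four lanes */
+    return vmlsq_laneq_u32(a, b, v, 0);
+}</pre>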
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmls_laneq_f32" type="checkbox"><label for="vmls_laneq_f32"><div>float32x2_t <b><b>vmls_laneq_f32</b></b> (float32x2_t a, float32x2_t b, float32x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply-Subtract from accumulator (by element). Each element of the first source vector is multiplied by the selected lane of the second source vector, and the product is subtracted from the corresponding element of the accumulator.</p></p> <h4>Operation</h4><pre>RESULT[i] = a[i] - (b[i] * v[lane]) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_laneq_f32" type="checkbox"><label for="vmlsq_laneq_f32"><div>float32x4_t <b><b>vmlsq_laneq_f32</b></b> (float32x4_t a, float32x4_t b, float32x4_t v, const int lane)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply-Subtract from accumulator (by element). Each element of the first source vector is multiplied by the selected lane of the second source vector, and the product is subtracted from the corresponding element of the accumulator.</p></p> <h4>Operation</h4><pre>RESULT[i] = a[i] - (b[i] * v[lane]) for i = 0 to 3
+</pre> <h4>Argument Preparation</h4><pre>0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_lane_s16" type="checkbox"><label for="vmlsl_lane_s16"><div>int32x4_t <b><b>vmlsl_lane_s16</b></b> (int32x4_t a, int16x4_t b, int16x4_t v, const int lane)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
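+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical. It shows the widening behaviour: the 16-bit elements are multiplied at 32-bit width before the subtraction.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x4_t example_vmlsl_lane_s16(void) {
+    int32x4_t a = vdupq_n_s32(200000);        /* 32-bit accumulator */
+    int16x4_t b = vdup_n_s16(1000);
+    const int16_t vv[4] = {1, 2, 3, 100};
+    int16x4_t v = vld1_s16(vv);
+    /* products are formed at 32-bit width: 1000*100 = 100000 does not
+       overflow, so r[i] = 200000 - 100000 = 100000 */
+    return vmlsl_lane_s16(a, b, v, 3);
+}</pre>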
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_lane_s32" type="checkbox"><label for="vmlsl_lane_s32"><div>int64x2_t <b><b>vmlsl_lane_s32</b></b> (int64x2_t a, int32x2_t b, int32x2_t v, const int lane)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
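+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical. Here the product would overflow a 32-bit element but fits the widened 64-bit result.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int64x2_t example_vmlsl_lane_s32(void) {
+    int64x2_t a = vdupq_n_s64(5000000000LL);  /* 64-bit accumulator */
+    int32x2_t b = vdup_n_s32(100000);
+    const int32_t vv[2] = {1, 30000};
+    int32x2_t v = vld1_s32(vv);
+    /* 100000*30000 = 3000000000 exceeds INT32_MAX but fits the widened
+       64-bit product, so r[i] = 5000000000 - 3000000000 = 2000000000 */
+    return vmlsl_lane_s32(a, b, v, 1);
+}</pre>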
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_lane_u16" type="checkbox"><label for="vmlsl_lane_u16"><div>uint32x4_t <b><b>vmlsl_lane_u16</b></b> (uint32x4_t a, uint16x4_t b, uint16x4_t v, const int lane)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
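+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical. The unsigned 16-bit operands are multiplied at 32-bit width.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint32x4_t example_vmlsl_lane_u16(void) {
+    uint32x4_t a = vdupq_n_u32(1000000);
+    uint16x4_t b = vdup_n_u16(2000);
+    const uint16_t vv[4] = {0, 400, 0, 0};
+    uint16x4_t v = vld1_u16(vv);
+    /* r[i] = 1000000 - 2000*400 = 200000, computed at 32-bit width */
+    return vmlsl_lane_u16(a, b, v, 1);
+}</pre>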
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_lane_u32" type="checkbox"><label for="vmlsl_lane_u32"><div>uint64x2_t <b><b>vmlsl_lane_u32</b></b> (uint64x2_t a, uint32x2_t b, uint32x2_t v, const int lane)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
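+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint64x2_t example_vmlsl_lane_u32(void) {
+    uint64x2_t a = vdupq_n_u64(10000000000ULL);
+    uint32x2_t b = vdup_n_u32(100000);
+    const uint32_t vv[2] = {1, 50000};
+    uint32x2_t v = vld1_u32(vv);
+    /* r[i] = 10000000000 - 100000*50000 = 5000000000, at 64-bit width */
+    return vmlsl_lane_u32(a, b, v, 1);
+}</pre>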
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_lane_s16" type="checkbox"><label for="vmlsl_high_lane_s16"><div>int32x4_t <b><b>vmlsl_high_lane_s16</b></b> (int32x4_t a, int16x8_t b, int16x4_t v, const int lane)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
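+ <h4>Usage example</h4><p class="aml">A minimal C sketch added for illustration (not part of the ARM reference); values and lane are hypothetical. The _high form reads only the upper half of the 128-bit multiplicand vector.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x4_t example_vmlsl_high_lane_s16(void) {
+    int32x4_t a = vdupq_n_s32(1000);
+    const int16_t hi[4] = {1, 2, 3, 4};
+    /* only the upper half of b (elements 4..7) participates */
+    int16x8_t b = vcombine_s16(vdup_n_s16(0), vld1_s16(hi));
+    int16x4_t v = vdup_n_s16(10);
+    /* r[i] = 1000 - b[4+i]*10 = {990, 980, 970, 960} */
+    return vmlsl_high_lane_s16(a, b, v, 0);
+}</pre>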
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_lane_s32" type="checkbox"><label for="vmlsl_high_lane_s32"><div>int64x2_t <b><b>vmlsl_high_lane_s32</b></b> (int64x2_t a, int32x4_t b, int32x2_t v, const int lane)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
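+ <h4>Usage example</h4><p class="aml">Editorial sketch, not part of the ARM reference: a minimal C helper (the name demo_mlsl_high_lane is hypothetical) showing a typical call on an AArch64 target with arm_neon.h. The lane argument must be a constant expression within the range given under Argument Preparation.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* For i = 0..1: acc[i] - (int64_t)b[2 + i] * (int64_t)v[1].
+   Only the upper half of b is read (the "high" form). */
+int64x2_t demo_mlsl_high_lane(int64x2_t acc, int32x4_t b, int32x2_t v)
+{
+    return vmlsl_high_lane_s32(acc, b, v, 1);
+}
+</pre>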
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_lane_u16" type="checkbox"><label for="vmlsl_high_lane_u16"><div>uint32x4_t <b><b>vmlsl_high_lane_u16</b></b> (uint32x4_t a, uint16x8_t b, uint16x4_t v, const int lane)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_lane_u32" type="checkbox"><label for="vmlsl_high_lane_u32"><div>uint64x2_t <b><b>vmlsl_high_lane_u32</b></b> (uint64x2_t a, uint32x4_t b, uint32x2_t v, const int lane)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_laneq_s16" type="checkbox"><label for="vmlsl_laneq_s16"><div>int32x4_t <b><b>vmlsl_laneq_s16</b></b> (int32x4_t a, int16x4_t b, int16x8_t v, const int lane)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
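+ <h4>Usage example</h4><p class="aml">Editorial sketch, not part of the ARM reference: the laneq form takes the scalar multiplier from a full 128-bit vector, so lane may select any of the eight halfword elements. The helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* For i = 0..3: acc[i] - (int32_t)b[i] * (int32_t)v[5]. */
+int32x4_t demo_mlsl_laneq(int32x4_t acc, int16x4_t b, int16x8_t v)
+{
+    return vmlsl_laneq_s16(acc, b, v, 5);
+}
+</pre>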
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_laneq_s32" type="checkbox"><label for="vmlsl_laneq_s32"><div>int64x2_t <b><b>vmlsl_laneq_s32</b></b> (int64x2_t a, int32x2_t b, int32x4_t v, const int lane)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_laneq_u16" type="checkbox"><label for="vmlsl_laneq_u16"><div>uint32x4_t <b><b>vmlsl_laneq_u16</b></b> (uint32x4_t a, uint16x4_t b, uint16x8_t v, const int lane)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_laneq_u32" type="checkbox"><label for="vmlsl_laneq_u32"><div>uint64x2_t <b><b>vmlsl_laneq_u32</b></b> (uint64x2_t a, uint32x2_t b, uint32x4_t v, const int lane)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
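+ <h4>Usage example</h4><p class="aml">Editorial sketch, not part of the ARM reference: the unsigned variant forms 64-bit products from 32-bit elements and subtracts them from 64-bit accumulator lanes. The helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* For i = 0..1: acc[i] - (uint64_t)b[i] * (uint64_t)v[2]. */
+uint64x2_t demo_mlsl_laneq_u32(uint64x2_t acc, uint32x2_t b, uint32x4_t v)
+{
+    return vmlsl_laneq_u32(acc, b, v, 2);
+}
+</pre>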
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_laneq_s16" type="checkbox"><label for="vmlsl_high_laneq_s16"><div>int32x4_t <b><b>vmlsl_high_laneq_s16</b></b> (int32x4_t a, int16x8_t b, int16x8_t v, const int lane)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
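+ <h4>Usage example</h4><p class="aml">Editorial sketch, not part of the ARM reference: the high_laneq form combines both extensions above, reading the upper half of b and selecting from all eight lanes of v. The helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* For i = 0..3: acc[i] - (int32_t)b[4 + i] * (int32_t)v[7]. */
+int32x4_t demo_mlsl_high_laneq(int32x4_t acc, int16x8_t b, int16x8_t v)
+{
+    return vmlsl_high_laneq_s16(acc, b, v, 7);
+}
+</pre>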
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_laneq_s32" type="checkbox"><label for="vmlsl_high_laneq_s32"><div>int64x2_t <b><b>vmlsl_high_laneq_s32</b></b> (int64x2_t a, int32x4_t b, int32x4_t v, const int lane)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_laneq_u16" type="checkbox"><label for="vmlsl_high_laneq_u16"><div>uint32x4_t <b><b>vmlsl_high_laneq_u16</b></b> (uint32x4_t a, uint16x8_t b, uint16x8_t v, const int lane)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_laneq_u32" type="checkbox"><label for="vmlsl_high_laneq_u32"><div>uint64x2_t <b><b>vmlsl_high_laneq_u32</b></b> (uint64x2_t a, uint32x4_t b, uint32x4_t v, const int lane)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_lane_s16" type="checkbox"><label for="vqdmlsl_lane_s16"><div>int32x4_t <b><b>vqdmlsl_lane_s16</b></b> (int32x4_t a, int16x4_t b, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_lane_s32" type="checkbox"><label for="vqdmlsl_lane_s32"><div>int64x2_t <b><b>vqdmlsl_lane_s32</b></b> (int64x2_t a, int32x2_t b, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlslh_lane_s16" type="checkbox"><label for="vqdmlslh_lane_s16"><div>int32_t <b><b>vqdmlslh_lane_s16</b></b> (int32_t a, int16_t b, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Sd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Hn <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsls_lane_s32" type="checkbox"><label for="vqdmlsls_lane_s32"><div>int64_t <b><b>vqdmlsls_lane_s32</b></b> (int64_t a, int32_t b, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Dd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_high_lane_s16" type="checkbox"><label for="vqdmlsl_high_lane_s16"><div>int32x4_t <b><b>vqdmlsl_high_lane_s16</b></b> (int32x4_t a, int16x8_t b, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_high_lane_s32" type="checkbox"><label for="vqdmlsl_high_lane_s32"><div>int64x2_t <b><b>vqdmlsl_high_lane_s32</b></b> (int64x2_t a, int32x4_t b, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_laneq_s16" type="checkbox"><label for="vqdmlsl_laneq_s16"><div>int32x4_t <b><b>vqdmlsl_laneq_s16</b></b> (int32x4_t a, int16x4_t b, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_laneq_s32" type="checkbox"><label for="vqdmlsl_laneq_s32"><div>int64x2_t <b><b>vqdmlsl_laneq_s32</b></b> (int64x2_t a, int32x2_t b, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlslh_laneq_s16" type="checkbox"><label for="vqdmlslh_laneq_s16"><div>int32_t <b><b>vqdmlslh_laneq_s16</b></b> (int32_t a, int16_t b, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Sd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sd <br />
+b &rarr; Hn <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsls_laneq_s32" type="checkbox"><label for="vqdmlsls_laneq_s32"><div>int64_t <b><b>vqdmlsls_laneq_s32</b></b> (int64_t a, int32_t b, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Dd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dd <br />
+b &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_high_laneq_s16" type="checkbox"><label for="vqdmlsl_high_laneq_s16"><div>int32x4_t <b><b>vqdmlsl_high_laneq_s16</b></b> (int32x4_t a, int16x8_t b, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_high_laneq_s32" type="checkbox"><label for="vqdmlsl_high_laneq_s32"><div>int64x2_t <b><b>vqdmlsl_high_laneq_s32</b></b> (int64x2_t a, int32x4_t b, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_n_s16" type="checkbox"><label for="vmul_n_s16"><div>int16x4_t <b><b>vmul_n_s16</b></b> (int16x4_t a, int16_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4H,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_n_s16" type="checkbox"><label for="vmulq_n_s16"><div>int16x8_t <b><b>vmulq_n_s16</b></b> (int16x8_t a, int16_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8H,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_n_s32" type="checkbox"><label for="vmul_n_s32"><div>int32x2_t <b><b>vmul_n_s32</b></b> (int32x2_t a, int32_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_n_s32" type="checkbox"><label for="vmulq_n_s32"><div>int32x4_t <b><b>vmulq_n_s32</b></b> (int32x4_t a, int32_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_n_u16" type="checkbox"><label for="vmul_n_u16"><div>uint16x4_t <b><b>vmul_n_u16</b></b> (uint16x4_t a, uint16_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4H,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_n_u16" type="checkbox"><label for="vmulq_n_u16"><div>uint16x8_t <b><b>vmulq_n_u16</b></b> (uint16x8_t a, uint16_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8H,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_n_u32" type="checkbox"><label for="vmul_n_u32"><div>uint32x2_t <b><b>vmul_n_u32</b></b> (uint32x2_t a, uint32_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_n_u32" type="checkbox"><label for="vmulq_n_u32"><div>uint32x4_t <b><b>vmulq_n_u32</b></b> (uint32x4_t a, uint32_t b)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_n_f32" type="checkbox"><label for="vmul_n_f32"><div>float32x2_t <b><b>vmul_n_f32</b></b> (float32x2_t a, float32_t b)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_n_f32" type="checkbox"><label for="vmulq_n_f32"><div>float32x4_t <b><b>vmulq_n_f32</b></b> (float32x4_t a, float32_t b)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_n_f64" type="checkbox"><label for="vmul_n_f64"><div>float64x1_t <b><b>vmul_n_f64</b></b> (float64x1_t a, float64_t b)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Dd,Dn,Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Vm.D[0] </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_n_f64" type="checkbox"><label for="vmulq_n_f64"><div>float64x2_t <b><b>vmulq_n_f64</b></b> (float64x2_t a, float64_t b)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.2D,Vn.2D,Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.D[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_lane_s16" type="checkbox"><label for="vmul_lane_s16"><div>int16x4_t <b><b>vmul_lane_s16</b></b> (int16x4_t a, int16x4_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_lane_s16" type="checkbox"><label for="vmulq_lane_s16"><div>int16x8_t <b><b>vmulq_lane_s16</b></b> (int16x8_t a, int16x4_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_lane_s32" type="checkbox"><label for="vmul_lane_s32"><div>int32x2_t <b><b>vmul_lane_s32</b></b> (int32x2_t a, int32x2_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_lane_s32" type="checkbox"><label for="vmulq_lane_s32"><div>int32x4_t <b><b>vmulq_lane_s32</b></b> (int32x4_t a, int32x2_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_lane_u16" type="checkbox"><label for="vmul_lane_u16"><div>uint16x4_t <b><b>vmul_lane_u16</b></b> (uint16x4_t a, uint16x4_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_lane_u16" type="checkbox"><label for="vmulq_lane_u16"><div>uint16x8_t <b><b>vmulq_lane_u16</b></b> (uint16x8_t a, uint16x4_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_lane_u32" type="checkbox"><label for="vmul_lane_u32"><div>uint32x2_t <b><b>vmul_lane_u32</b></b> (uint32x2_t a, uint32x2_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_lane_u32" type="checkbox"><label for="vmulq_lane_u32"><div>uint32x4_t <b><b>vmulq_lane_u32</b></b> (uint32x4_t a, uint32x2_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_lane_f32" type="checkbox"><label for="vmul_lane_f32"><div>float32x2_t <b><b>vmul_lane_f32</b></b> (float32x2_t a, float32x2_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmulq_lane_f32" type="checkbox"><label for="vmulq_lane_f32"><div>float32x4_t <b><b>vmulq_lane_f32</b></b> (float32x4_t a, float32x2_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmul_lane_f64" type="checkbox"><label for="vmul_lane_f64"><div>float64x1_t <b><b>vmul_lane_f64</b></b> (float64x1_t a, float64x1_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+v &rarr; Vm.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_lane_f64" type="checkbox"><label for="vmulq_lane_f64"><div>float64x2_t <b><b>vmulq_lane_f64</b></b> (float64x2_t a, float64x1_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.2D,Vn.2D,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+v &rarr; Vm.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmuls_lane_f32" type="checkbox"><label for="vmuls_lane_f32"><div>float32_t <b><b>vmuls_lane_f32</b></b> (float32_t a, float32x2_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
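+ <h4>Example</h4> <p>A minimal C sketch (editorial addition, not part of the ARM reference; assumes an AArch64 C compiler with arm_neon.h available). Despite the generic vector description above, the by-lane form multiplies the scalar a by the single vector lane selected by lane, which must be a compile-time constant in the range shown under Argument Preparation.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const float32_t data[2] = {2.0f, 4.0f};
+    float32x2_t v = vld1_f32(data);           /* lane 0 = 2.0, lane 1 = 4.0 */
+    float32_t r = vmuls_lane_f32(3.0f, v, 1); /* 3.0 * lane 1 = 12.0 */
+    printf("%f\n", r);                        /* prints 12.000000 */
+    return 0;
+}</pre>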
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmuld_lane_f64" type="checkbox"><label for="vmuld_lane_f64"><div>float64_t <b><b>vmuld_lane_f64</b></b> (float64_t a, float64x1_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+v &rarr; Vm.1D <br />
+0 &lt;&lt; lane &lt;&lt; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_laneq_s16" type="checkbox"><label for="vmul_laneq_s16"><div>int16x4_t <b><b>vmul_laneq_s16</b></b> (int16x4_t a, int16x8_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;&lt; lane &lt;&lt; 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
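+ <h4>Example</h4> <p>A short C sketch (editorial addition, not part of the ARM reference; assumes an AArch64 compiler with arm_neon.h). The laneq form selects the lane from a 128-bit vector, so indices up to 7 are valid here; each of the four lanes of a is multiplied by that one lane of v.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int main(void) {
+    const int16_t da[4] = {1, 2, 3, 4};
+    const int16_t dv[8] = {0, 0, 0, 0, 0, 0, 0, 10}; /* lane 7 holds 10 */
+    int16x4_t a = vld1_s16(da);
+    int16x8_t v = vld1q_s16(dv);
+    int16x4_t r = vmul_laneq_s16(a, v, 7);   /* r = {10, 20, 30, 40} */
+    int16_t out[4];
+    vst1_s16(out, r);
+    return out[3] == 40 ? 0 : 1;
+}</pre>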
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_laneq_s16" type="checkbox"><label for="vmulq_laneq_s16"><div>int16x8_t <b><b>vmulq_laneq_s16</b></b> (int16x8_t a, int16x8_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;&lt; lane &lt;&lt; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_laneq_s32" type="checkbox"><label for="vmul_laneq_s32"><div>int32x2_t <b><b>vmul_laneq_s32</b></b> (int32x2_t a, int32x4_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_laneq_s32" type="checkbox"><label for="vmulq_laneq_s32"><div>int32x4_t <b><b>vmulq_laneq_s32</b></b> (int32x4_t a, int32x4_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_laneq_u16" type="checkbox"><label for="vmul_laneq_u16"><div>uint16x4_t <b><b>vmul_laneq_u16</b></b> (uint16x4_t a, uint16x8_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;&lt; lane &lt;&lt; 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_laneq_u16" type="checkbox"><label for="vmulq_laneq_u16"><div>uint16x8_t <b><b>vmulq_laneq_u16</b></b> (uint16x8_t a, uint16x8_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;&lt; lane &lt;&lt; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_laneq_u32" type="checkbox"><label for="vmul_laneq_u32"><div>uint32x2_t <b><b>vmul_laneq_u32</b></b> (uint32x2_t a, uint32x4_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_laneq_u32" type="checkbox"><label for="vmulq_laneq_u32"><div>uint32x4_t <b><b>vmulq_laneq_u32</b></b> (uint32x4_t a, uint32x4_t v, const int lane)<span class="right">Multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mul-vector-multiply-vector">MUL</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if poly then
+ product = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2)&lt;esize-1:0&gt;;
+ else
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
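+ <h4>Example</h4> <p>A C sketch (editorial addition, not part of the ARM reference; assumes an AArch64 compiler with arm_neon.h) showing the &lt;esize-1:0&gt; truncation from the pseudocode above: products are kept modulo 2^32, so an overflowing lane wraps rather than saturating.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int main(void) {
+    const uint32_t da[4] = {0x80000000u, 10u, 100u, 1000u};
+    const uint32_t dv[4] = {1u, 2u, 3u, 2u};  /* lane 3 holds 2 */
+    uint32x4_t a = vld1q_u32(da);
+    uint32x4_t v = vld1q_u32(dv);
+    uint32x4_t r = vmulq_laneq_u32(a, v, 3);
+    uint32_t out[4];
+    vst1q_u32(out, r); /* out = {0, 20, 200, 2000}: 0x80000000*2 wraps mod 2^32 */
+    return out[0] == 0u ? 0 : 1;
+}</pre>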
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_laneq_f32" type="checkbox"><label for="vmul_laneq_f32"><div>float32x2_t <b><b>vmul_laneq_f32</b></b> (float32x2_t a, float32x4_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_laneq_f32" type="checkbox"><label for="vmulq_laneq_f32"><div>float32x4_t <b><b>vmulq_laneq_f32</b></b> (float32x4_t a, float32x4_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
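+ <h4>Example</h4> <p>A C sketch (editorial addition, not part of the ARM reference; assumes an AArch64 compiler with arm_neon.h). Scaling a whole vector by one coefficient lane is the usual building block of a 4x4 matrix-vector product, where each matrix column is multiplied by one lane of the input vector.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int main(void) {
+    const float32_t row[4]  = {1.0f, 2.0f, 3.0f, 4.0f};
+    const float32_t coef[4] = {0.5f, 1.5f, 2.5f, 3.5f};
+    float32x4_t a = vld1q_f32(row);
+    float32x4_t c = vld1q_f32(coef);
+    float32x4_t r = vmulq_laneq_f32(a, c, 1); /* every lane times coef[1] = 1.5 */
+    float32_t out[4];
+    vst1q_f32(out, r);                        /* out = {1.5, 3.0, 4.5, 6.0} */
+    return 0;
+}</pre>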
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmul_laneq_f64" type="checkbox"><label for="vmul_laneq_f64"><div>float64x1_t <b><b>vmul_laneq_f64</b></b> (float64x1_t a, float64x2_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Dd,Dn,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+v &rarr; Vm.2D <br />
+0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmulq_laneq_f64" type="checkbox"><label for="vmulq_laneq_f64"><div>float64x2_t <b><b>vmulq_laneq_f64</b></b> (float64x2_t a, float64x2_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Vd.2D,Vn.2D,Vm.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+v &rarr; Vm.2D <br />
+0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMul.3" title="function: bits(N) FPMul(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMul</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmuls_laneq_f32" type="checkbox"><label for="vmuls_laneq_f32"><div>float32_t <b><b>vmuls_laneq_f32</b></b> (float32_t a, float32x4_t v, const int lane)<span class="right">Floating-point multiply</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Multiply (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmul-vector-floating-point-multiply-vector">FMUL</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+    element1 = Elem[operand1, e, esize];
+    element2 = Elem[operand2, e, esize];
+    Elem[result, e, esize] = FPMul(element1, element2, FPCR);
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div>
+<div class="intrinsic"><div>float64_t <b>vmuld_laneq_f64</b> (float64_t a, float64x2_t v, const int lane)<span class="right">Floating-point multiply</span></div><article> <h4>Description</h4><p class="aml">Floating-point Multiply (by element). This instruction multiplies the floating-point value in the first source SIMD&amp;FP register by the selected lane of the second source SIMD&amp;FP register, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre>FMUL Dd,Dn,Vm.D[lane]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Dn
+v &rarr; Vm.2D
+0 &lt;= lane &lt;= 1</pre>
+ <h4>Results</h4><pre>Dd &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = V[n];
+bits(datasize) operand2 = V[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+    element1 = Elem[operand1, e, esize];
+    element2 = Elem[operand2, e, esize];
+    Elem[result, e, esize] = FPMul(element1, element2, FPCR);
+
+V[d] = result;</pre>
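+ <h4>Example</h4><p>A minimal C sketch (not part of the ARM reference), assuming a GCC or Clang AArch64 toolchain whose arm_neon.h provides this intrinsic; brace initialization of NEON vector types is a GCC/Clang extension, and the variable names are illustrative:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float64x2_t v = {3.0, 5.0};  /* Vm.2D */
+    float64_t a = 2.0;           /* Dn */
+    /* Multiply the scalar a by lane 1 of v: 2.0 * 5.0 == 10.0. */
+    float64_t r = vmuld_laneq_f64(a, v, 1);
+    printf("%f\n", r);
+    return 0;
+}</pre>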
+<div class="intrinsic"><div>int32x4_t <b>vmull_n_s16</b> (int16x4_t a, int16_t b)<span class="right">Signed multiply long</span></div><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre>SMULL Vd.4S,Vn.4H,Vm.H[0]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H
+b &rarr; Vm.H[0]</pre>
+ <h4>Results</h4><pre>Vd.4S &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>int64x2_t <b>vmull_n_s32</b> (int32x2_t a, int32_t b)<span class="right">Signed multiply long</span></div><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre>SMULL Vd.2D,Vn.2S,Vm.S[0]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S
+b &rarr; Vm.S[0]</pre>
+ <h4>Results</h4><pre>Vd.2D &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>uint32x4_t <b>vmull_n_u16</b> (uint16x4_t a, uint16_t b)<span class="right">Unsigned multiply long</span></div><article> <h4>Description</h4><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre>UMULL Vd.4S,Vn.4H,Vm.H[0]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H
+b &rarr; Vm.H[0]</pre>
+ <h4>Results</h4><pre>Vd.4S &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>uint64x2_t <b>vmull_n_u32</b> (uint32x2_t a, uint32_t b)<span class="right">Unsigned multiply long</span></div><article> <h4>Description</h4><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre>UMULL Vd.2D,Vn.2S,Vm.S[0]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S
+b &rarr; Vm.S[0]</pre>
+ <h4>Results</h4><pre>Vd.2D &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
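+ <h4>Example</h4><p>A short C sketch (not part of the ARM reference) showing the widening multiply-by-scalar form; it assumes an AArch64 or ARMv7 toolchain with arm_neon.h, and the function name is illustrative:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Each 16-bit lane of a is multiplied by the scalar b, producing
+   full 32-bit products, so no intermediate truncation can occur. */
+int32x4_t scale_s16(int16x4_t a, int16_t b) {
+    return vmull_n_s16(a, b);  /* SMULL Vd.4S,Vn.4H,Vm.H[0] */
+}</pre>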
+<div class="intrinsic"><div>int32x4_t <b>vmull_high_n_s16</b> (int16x8_t a, int16_t b)<span class="right">Signed multiply long</span></div><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre>SMULL2 Vd.4S,Vn.8H,Vm.H[0]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H
+b &rarr; Vm.H[0]</pre>
+ <h4>Results</h4><pre>Vd.4S &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div>
+<div class="intrinsic"><div>int64x2_t <b>vmull_high_n_s32</b> (int32x4_t a, int32_t b)<span class="right">Signed multiply long</span></div><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre>SMULL2 Vd.2D,Vn.4S,Vm.S[0]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S
+b &rarr; Vm.S[0]</pre>
+ <h4>Results</h4><pre>Vd.2D &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div>
+<div class="intrinsic"><div>uint32x4_t <b>vmull_high_n_u16</b> (uint16x8_t a, uint16_t b)<span class="right">Unsigned multiply long</span></div><article> <h4>Description</h4><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre>UMULL2 Vd.4S,Vn.8H,Vm.H[0]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H
+b &rarr; Vm.H[0]</pre>
+ <h4>Results</h4><pre>Vd.4S &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div>
+<div class="intrinsic"><div>uint64x2_t <b>vmull_high_n_u32</b> (uint32x4_t a, uint32_t b)<span class="right">Unsigned multiply long</span></div><article> <h4>Description</h4><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre>UMULL2 Vd.2D,Vn.4S,Vm.S[0]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S
+b &rarr; Vm.S[0]</pre>
+ <h4>Results</h4><pre>Vd.2D &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div>
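+ <h4>Example</h4><p>A C sketch (not part of the ARM reference) pairing the low-half and high-half forms to widen-multiply all eight lanes of a 128-bit vector; it assumes an AArch64 toolchain with arm_neon.h, and the function name is illustrative:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* lo receives the products of lanes 0..3 (SMULL on the low half),
+   hi the products of lanes 4..7 (SMULL2); together they cover the
+   whole input vector without truncation. */
+void widen_mul_s16(int16x8_t a, int16_t b, int32x4_t *lo, int32x4_t *hi) {
+    *lo = vmull_n_s16(vget_low_s16(a), b);
+    *hi = vmull_high_n_s16(a, b);
+}</pre>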
+<div class="intrinsic"><div>int32x4_t <b>vmull_lane_s16</b> (int16x4_t a, int16x4_t v, const int lane)<span class="right">Signed multiply long</span></div><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre>SMULL Vd.4S,Vn.4H,Vm.H[lane]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H
+v &rarr; Vm.4H
+0 &lt;= lane &lt;= 3</pre>
+ <h4>Results</h4><pre>Vd.4S &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>int64x2_t <b>vmull_lane_s32</b> (int32x2_t a, int32x2_t v, const int lane)<span class="right">Signed multiply long</span></div><article> <h4>Description</h4><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre>SMULL Vd.2D,Vn.2S,Vm.S[lane]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S
+v &rarr; Vm.2S
+0 &lt;= lane &lt;= 1</pre>
+ <h4>Results</h4><pre>Vd.2D &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>uint32x4_t <b>vmull_lane_u16</b> (uint16x4_t a, uint16x4_t v, const int lane)<span class="right">Unsigned multiply long</span></div><article> <h4>Description</h4><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre>UMULL Vd.4S,Vn.4H,Vm.H[lane]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H
+v &rarr; Vm.4H
+0 &lt;= lane &lt;= 3</pre>
+ <h4>Results</h4><pre>Vd.4S &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>uint64x2_t <b>vmull_lane_u32</b> (uint32x2_t a, uint32x2_t v, const int lane)<span class="right">Unsigned multiply long</span></div><article> <h4>Description</h4><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre>UMULL Vd.2D,Vn.2S,Vm.S[lane]</pre>
+ <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S
+v &rarr; Vm.2S
+0 &lt;= lane &lt;= 1</pre>
+ <h4>Results</h4><pre>Vd.2D &rarr; result</pre>
+ <h4>Operation</h4>
+<pre class="codeblock">CheckFPAdvSIMDEnabled64();
+bits(datasize) operand1 = Vpart[n, part];
+bits(datasize) operand2 = Vpart[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+    element1 = Int(Elem[operand1, e, esize], unsigned);
+    element2 = Int(Elem[operand2, e, esize], unsigned);
+    Elem[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+V[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
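+ <h4>Example</h4><p>A C sketch (not part of the ARM reference) of the lane form; the lane index must be an integer constant expression within the documented range, here 0..3. It assumes arm_neon.h, and the function name is illustrative:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Broadcast lane 2 of v and widen-multiply it against every lane of a.
+   Passing a non-constant lane, or one outside 0..3, is a compile error. */
+int32x4_t mul_by_lane2(int16x4_t a, int16x4_t v) {
+    return vmull_lane_s16(a, v, 2);  /* SMULL Vd.4S,Vn.4H,Vm.H[2] */
+}</pre>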
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_lane_s16" type="checkbox"><label for="vmull_high_lane_s16"><div>int32x4_t <b><b>vmull_high_lane_s16</b></b> (int16x8_t a, int16x4_t v, const int lane)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_lane_s32" type="checkbox"><label for="vmull_high_lane_s32"><div>int64x2_t <b><b>vmull_high_lane_s32</b></b> (int32x4_t a, int32x2_t v, const int lane)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_lane_u16" type="checkbox"><label for="vmull_high_lane_u16"><div>uint32x4_t <b><b>vmull_high_lane_u16</b></b> (uint16x8_t a, uint16x4_t v, const int lane)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;&lt; lane &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_lane_u32" type="checkbox"><label for="vmull_high_lane_u32"><div>uint64x2_t <b><b>vmull_high_lane_u32</b></b> (uint32x4_t a, uint32x2_t v, const int lane)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_laneq_s16" type="checkbox"><label for="vmull_laneq_s16"><div>int32x4_t <b><b>vmull_laneq_s16</b></b> (int16x4_t a, int16x8_t v, const int lane)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_laneq_s32" type="checkbox"><label for="vmull_laneq_s32"><div>int64x2_t <b><b>vmull_laneq_s32</b></b> (int32x2_t a, int32x4_t v, const int lane)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_laneq_u16" type="checkbox"><label for="vmull_laneq_u16"><div>uint32x4_t <b><b>vmull_laneq_u16</b></b> (uint16x4_t a, uint16x8_t v, const int lane)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_laneq_u32" type="checkbox"><label for="vmull_laneq_u32"><div>uint64x2_t <b><b>vmull_laneq_u32</b></b> (uint32x2_t a, uint32x4_t v, const int lane)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_laneq_s16" type="checkbox"><label for="vmull_high_laneq_s16"><div>int32x4_t <b><b>vmull_high_laneq_s16</b></b> (int16x8_t a, int16x8_t v, const int lane)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_laneq_s32" type="checkbox"><label for="vmull_high_laneq_s32"><div>int64x2_t <b><b>vmull_high_laneq_s32</b></b> (int32x4_t a, int32x4_t v, const int lane)<span class="right">Signed multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smull-smull2-vector-signed-multiply-long-vector">SMULL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_laneq_u16" type="checkbox"><label for="vmull_high_laneq_u16"><div>uint32x4_t <b><b>vmull_high_laneq_u16</b></b> (uint16x8_t a, uint16x8_t v, const int lane)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_laneq_u32" type="checkbox"><label for="vmull_high_laneq_u32"><div>uint64x2_t <b><b>vmull_high_laneq_u32</b></b> (uint32x4_t a, uint32x4_t v, const int lane)<span class="right">Unsigned multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umull-umull2-vector-unsigned-multiply-long-vector">UMULL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = (element1*element2)&lt;2*esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_n_s16" type="checkbox"><label for="vqdmull_n_s16"><div>int32x4_t <b><b>vqdmull_n_s16</b></b> (int16x4_t a, int16_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Vd.4S,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_n_s32" type="checkbox"><label for="vqdmull_n_s32"><div>int64x2_t <b><b>vqdmull_n_s32</b></b> (int32x2_t a, int32_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Vd.2D,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_high_n_s16" type="checkbox"><label for="vqdmull_high_n_s16"><div>int32x4_t <b><b>vqdmull_high_n_s16</b></b> (int16x8_t a, int16_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL2</a> Vd.4S,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_high_n_s32" type="checkbox"><label for="vqdmull_high_n_s32"><div>int64x2_t <b><b>vqdmull_high_n_s32</b></b> (int32x4_t a, int32_t b)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL2</a> Vd.2D,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_lane_s16" type="checkbox"><label for="vqdmull_lane_s16"><div>int32x4_t <b><b>vqdmull_lane_s16</b></b> (int16x4_t a, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_lane_s32" type="checkbox"><label for="vqdmull_lane_s32"><div>int64x2_t <b><b>vqdmull_lane_s32</b></b> (int32x2_t a, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmullh_lane_s16" type="checkbox"><label for="vqdmullh_lane_s16"><div>int32_t <b><b>vqdmullh_lane_s16</b></b> (int16_t a, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Sd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+v &rarr; Vm.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulls_lane_s32" type="checkbox"><label for="vqdmulls_lane_s32"><div>int64_t <b><b>vqdmulls_lane_s32</b></b> (int32_t a, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Dd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_high_lane_s16" type="checkbox"><label for="vqdmull_high_lane_s16"><div>int32x4_t <b><b>vqdmull_high_lane_s16</b></b> (int16x8_t a, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_high_lane_s32" type="checkbox"><label for="vqdmull_high_lane_s32"><div>int64x2_t <b><b>vqdmull_high_lane_s32</b></b> (int32x4_t a, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_laneq_s16" type="checkbox"><label for="vqdmull_laneq_s16"><div>int32x4_t <b><b>vqdmull_laneq_s16</b></b> (int16x4_t a, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Vd.4S,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_laneq_s32" type="checkbox"><label for="vqdmull_laneq_s32"><div>int64x2_t <b><b>vqdmull_laneq_s32</b></b> (int32x2_t a, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Vd.2D,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmullh_laneq_s16" type="checkbox"><label for="vqdmullh_laneq_s16"><div>int32_t <b><b>vqdmullh_laneq_s16</b></b> (int16_t a, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Sd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+v &rarr; Vm.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulls_laneq_s32" type="checkbox"><label for="vqdmulls_laneq_s32"><div>int64_t <b><b>vqdmulls_laneq_s32</b></b> (int32_t a, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL</a> Dd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_high_laneq_s16" type="checkbox"><label for="vqdmull_high_laneq_s16"><div>int32x4_t <b><b>vqdmull_high_laneq_s16</b></b> (int16x8_t a, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL2</a> Vd.4S,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmull_high_laneq_s32" type="checkbox"><label for="vqdmull_high_laneq_s32"><div>int64x2_t <b><b>vqdmull_high_laneq_s32</b></b> (int32x4_t a, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply Long. This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, doubles the results, places the final results in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmull-sqdmull2-vector-signed-saturating-doubling-multiply-long">SQDMULL2</a> Vd.2D,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = product;
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulh_n_s16" type="checkbox"><label for="vqdmulh_n_s16"><div>int16x4_t <b><b>vqdmulh_n_s16</b></b> (int16x4_t a, int16_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.4H,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhq_n_s16" type="checkbox"><label for="vqdmulhq_n_s16"><div>int16x8_t <b><b>vqdmulhq_n_s16</b></b> (int16x8_t a, int16_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.8H,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulh_n_s32" type="checkbox"><label for="vqdmulh_n_s32"><div>int32x2_t <b><b>vqdmulh_n_s32</b></b> (int32x2_t a, int32_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhq_n_s32" type="checkbox"><label for="vqdmulhq_n_s32"><div>int32x4_t <b><b>vqdmulhq_n_s32</b></b> (int32x4_t a, int32_t b)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulh_lane_s16" type="checkbox"><label for="vqdmulh_lane_s16"><div>int16x4_t <b><b>vqdmulh_lane_s16</b></b> (int16x4_t a, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhq_lane_s16" type="checkbox"><label for="vqdmulhq_lane_s16"><div>int16x8_t <b><b>vqdmulhq_lane_s16</b></b> (int16x8_t a, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulh_lane_s32" type="checkbox"><label for="vqdmulh_lane_s32"><div>int32x2_t <b><b>vqdmulh_lane_s32</b></b> (int32x2_t a, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhq_lane_s32" type="checkbox"><label for="vqdmulhq_lane_s32"><div>int32x4_t <b><b>vqdmulhq_lane_s32</b></b> (int32x4_t a, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhh_lane_s16" type="checkbox"><label for="vqdmulhh_lane_s16"><div>int16_t <b><b>vqdmulhh_lane_s16</b></b> (int16_t a, int16x4_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Hd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhs_lane_s32" type="checkbox"><label for="vqdmulhs_lane_s32"><div>int32_t <b><b>vqdmulhs_lane_s32</b></b> (int32_t a, int32x2_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulh_laneq_s16" type="checkbox"><label for="vqdmulh_laneq_s16"><div>int16x4_t <b><b>vqdmulh_laneq_s16</b></b> (int16x4_t a, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhq_laneq_s16" type="checkbox"><label for="vqdmulhq_laneq_s16"><div>int16x8_t <b><b>vqdmulhq_laneq_s16</b></b> (int16x8_t a, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulh_laneq_s32" type="checkbox"><label for="vqdmulh_laneq_s32"><div>int32x2_t <b><b>vqdmulh_laneq_s32</b></b> (int32x2_t a, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhq_laneq_s32" type="checkbox"><label for="vqdmulhq_laneq_s32"><div>int32x4_t <b><b>vqdmulhq_laneq_s32</b></b> (int32x4_t a, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhh_laneq_s16" type="checkbox"><label for="vqdmulhh_laneq_s16"><div>int16_t <b><b>vqdmulhh_laneq_s16</b></b> (int16_t a, int16x8_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Hd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmulhs_laneq_s32" type="checkbox"><label for="vqdmulhs_laneq_s32"><div>int32_t <b><b>vqdmulhs_laneq_s32</b></b> (int32_t a, int32x4_t v, const int lane)<span class="right">Signed saturating doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmulh-vector-signed-saturating-doubling-multiply-returning-high-half">SQDMULH</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulh_n_s16" type="checkbox"><label for="vqrdmulh_n_s16"><div>int16x4_t <b><b>vqrdmulh_n_s16</b></b> (int16x4_t a, int16_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.4H,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhq_n_s16" type="checkbox"><label for="vqrdmulhq_n_s16"><div>int16x8_t <b><b>vqrdmulhq_n_s16</b></b> (int16x8_t a, int16_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.8H,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulh_n_s32" type="checkbox"><label for="vqrdmulh_n_s32"><div>int32x2_t <b><b>vqrdmulh_n_s32</b></b> (int32x2_t a, int32_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhq_n_s32" type="checkbox"><label for="vqrdmulhq_n_s32"><div>int32x4_t <b><b>vqrdmulhq_n_s32</b></b> (int32x4_t a, int32_t b)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulh_lane_s16" type="checkbox"><label for="vqrdmulh_lane_s16"><div>int16x4_t <b><b>vqrdmulh_lane_s16</b></b> (int16x4_t a, int16x4_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhq_lane_s16" type="checkbox"><label for="vqrdmulhq_lane_s16"><div>int16x8_t <b><b>vqrdmulhq_lane_s16</b></b> (int16x8_t a, int16x4_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulh_lane_s32" type="checkbox"><label for="vqrdmulh_lane_s32"><div>int32x2_t <b><b>vqrdmulh_lane_s32</b></b> (int32x2_t a, int32x2_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhq_lane_s32" type="checkbox"><label for="vqrdmulhq_lane_s32"><div>int32x4_t <b><b>vqrdmulhq_lane_s32</b></b> (int32x4_t a, int32x2_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhh_lane_s16" type="checkbox"><label for="vqrdmulhh_lane_s16"><div>int16_t <b><b>vqrdmulhh_lane_s16</b></b> (int16_t a, int16x4_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Hd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+v &rarr; Vm.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhs_lane_s32" type="checkbox"><label for="vqrdmulhs_lane_s32"><div>int32_t <b><b>vqrdmulhs_lane_s32</b></b> (int32_t a, int32x2_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulh_laneq_s16" type="checkbox"><label for="vqrdmulh_laneq_s16"><div>int16x4_t <b><b>vqrdmulh_laneq_s16</b></b> (int16x4_t a, int16x8_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.4H,Vn.4H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhq_laneq_s16" type="checkbox"><label for="vqrdmulhq_laneq_s16"><div>int16x8_t <b><b>vqrdmulhq_laneq_s16</b></b> (int16x8_t a, int16x8_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.8H,Vn.8H,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulh_laneq_s32" type="checkbox"><label for="vqrdmulh_laneq_s32"><div>int32x2_t <b><b>vqrdmulh_laneq_s32</b></b> (int32x2_t a, int32x4_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.2S,Vn.2S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
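+ <h4>Example</h4> <p>A minimal C usage sketch, added for illustration only: it assumes arm_neon.h on an AArch64 toolchain, and the wrapper name is hypothetical rather than part of the ARM reference.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Q31 fixed-point: scale two lanes of a by the coefficient in coeffs[1],
+   using the doubling, rounding, saturating high-half multiply (SQRDMULH). */
+int32x2_t scale_q31x2(int32x2_t a, int32x4_t coeffs) {
+    return vqrdmulh_laneq_s32(a, coeffs, 1); /* lane must be a constant in 0..3 */
+}
+</pre>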
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhq_laneq_s32" type="checkbox"><label for="vqrdmulhq_laneq_s32"><div>int32x4_t <b>vqrdmulhq_laneq_s32</b> (int32x4_t a, int32x4_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Vd.4S,Vn.4S,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
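+ <h4>Example</h4> <p>An illustrative sketch under the same assumptions as above (arm_neon.h, hypothetical helper name); the 128-bit form processes four lanes at once.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Q31 fixed-point: scale four lanes by coeffs[1]. */
+int32x4_t scale_q31x4(int32x4_t a, int32x4_t coeffs) {
+    return vqrdmulhq_laneq_s32(a, coeffs, 1); /* lane constant in 0..3 */
+}
+</pre>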
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhh_laneq_s16" type="checkbox"><label for="vqrdmulhh_laneq_s16"><div>int16_t <b>vqrdmulhh_laneq_s16</b> (int16_t a, int16x8_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Hd,Hn,Vm.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn <br />
+v &rarr; Vm.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
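+ <h4>Example</h4> <p>An illustrative sketch (assumes arm_neon.h; helper name hypothetical) for the scalar 16-bit form.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Q15 fixed-point: multiply one scalar by the coefficient in coeffs[5]. */
+int16_t scale_q15(int16_t x, int16x8_t coeffs) {
+    return vqrdmulhh_laneq_s16(x, coeffs, 5); /* lane constant in 0..7 */
+}
+</pre>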
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqrdmulhs_laneq_s32" type="checkbox"><label for="vqrdmulhs_laneq_s32"><div>int32_t <b>vqrdmulhs_laneq_s32</b> (int32_t a, int32x4_t v, const int lane)<span class="right">Signed saturating rounding doubling multiply returning high half</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Rounding Doubling Multiply returning High half. This instruction multiplies the values of corresponding elements of the two source SIMD&amp;FP registers, doubles the results, places the most significant half of the final results into a vector, and writes the vector to the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqrdmulh-vector-signed-saturating-rounding-doubling-multiply-returning-high-half">SQRDMULH</a> Sd,Sn,Vm.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+v &rarr; Vm.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+integer round_const = if rounding then 1 &lt;&lt; (esize - 1) else 0;
+integer element1;
+integer element2;
+integer product;
+boolean sat;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ product = (2 * element1 * element2) + round_const;
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(product &gt;&gt; esize, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
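+ <h4>Example</h4> <p>An illustrative sketch (assumes arm_neon.h; helper name hypothetical) for the scalar 32-bit form.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Q31 fixed-point scalar multiply by coeffs[0]. The one overflow case,
+   INT32_MIN * INT32_MIN, saturates to INT32_MAX and sets FPSR.QC. */
+int32_t scale_q31(int32_t x, int32x4_t coeffs) {
+    return vqrdmulhs_laneq_s32(x, coeffs, 0);
+}
+</pre>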
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmla_n_s16" type="checkbox"><label for="vmla_n_s16"><div>int16x4_t <b>vmla_n_s16</b> (int16x4_t a, int16x4_t b, int16_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4H,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
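+ <h4>Example</h4> <p>A minimal C sketch (illustrative; assumes arm_neon.h, helper name hypothetical). Note the products are truncated to 16 bits, i.e. the arithmetic wraps rather than saturates.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* acc[i] + x[i]*gain for four int16 lanes; wraps modulo 2^16. */
+int16x4_t mla4_s16(int16x4_t acc, int16x4_t x, int16_t gain) {
+    return vmla_n_s16(acc, x, gain);
+}
+</pre>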
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_n_s16" type="checkbox"><label for="vmlaq_n_s16"><div>int16x8_t <b>vmlaq_n_s16</b> (int16x8_t a, int16x8_t b, int16_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8H,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
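+ <h4>Example</h4> <p>An illustrative buffer loop (assumes arm_neon.h and stddef.h; function and parameter names are hypothetical), processing eight lanes per iteration.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* dst[i] += src[i] * gain; n must be a multiple of 8. */
+void mla_buf_s16(int16_t *dst, const int16_t *src, int16_t gain, size_t n) {
+    for (size_t i = 0; i &lt; n; i += 8) {
+        int16x8_t acc = vld1q_s16(dst + i);
+        vst1q_s16(dst + i, vmlaq_n_s16(acc, vld1q_s16(src + i), gain));
+    }
+}
+</pre>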
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_n_s32" type="checkbox"><label for="vmla_n_s32"><div>int32x2_t <b>vmla_n_s32</b> (int32x2_t a, int32x2_t b, int32_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
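+ <h4>Example</h4> <p>An illustrative sketch (assumes arm_neon.h; helper name hypothetical) for the two-lane 32-bit form.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* acc[i] + x[i]*gain for two int32 lanes; wraps modulo 2^32. */
+int32x2_t mla2_s32(int32x2_t acc, int32x2_t x, int32_t gain) {
+    return vmla_n_s32(acc, x, gain);
+}
+</pre>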
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_n_s32" type="checkbox"><label for="vmlaq_n_s32"><div>int32x4_t <b>vmlaq_n_s32</b> (int32x4_t a, int32x4_t b, int32_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
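+ <h4>Example</h4> <p>An illustrative sketch (assumes arm_neon.h; helper name hypothetical), e.g. one tap of a fixed-point filter accumulated across four lanes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* One accumulation step: acc[i] + x[i]*coeff for four int32 lanes. */
+int32x4_t filter_tap(int32x4_t acc, int32x4_t x, int32_t coeff) {
+    return vmlaq_n_s32(acc, x, coeff);
+}
+</pre>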
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_n_u16" type="checkbox"><label for="vmla_n_u16"><div>uint16x4_t <b>vmla_n_u16</b> (uint16x4_t a, uint16x4_t b, uint16_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4H,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
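+ <h4>Example</h4> <p>An illustrative sketch (assumes arm_neon.h; helper name hypothetical); the unsigned forms use the same wrapping arithmetic as the signed ones.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* acc[i] + x[i]*w for four uint16 lanes; wraps modulo 2^16. */
+uint16x4_t mla4_u16(uint16x4_t acc, uint16x4_t x, uint16_t w) {
+    return vmla_n_u16(acc, x, w);
+}
+</pre>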
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_n_u16" type="checkbox"><label for="vmlaq_n_u16"><div>uint16x8_t <b>vmlaq_n_u16</b> (uint16x8_t a, uint16x8_t b, uint16_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.8H,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
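+ <h4>Example</h4> <p>An illustrative sketch (assumes arm_neon.h; helper name hypothetical): adding a weighted row of eight 16-bit pixels to an accumulator row.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* acc[i] + row[i]*weight for eight uint16 lanes. */
+uint16x8_t add_weighted_row(uint16x8_t acc, uint16x8_t row, uint16_t weight) {
+    return vmlaq_n_u16(acc, row, weight);
+}
+</pre>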
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_n_u32" type="checkbox"><label for="vmla_n_u32"><div>uint32x2_t <b>vmla_n_u32</b> (uint32x2_t a, uint32x2_t b, uint32_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
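+ <h4>Example</h4> <p>An illustrative sketch (assumes arm_neon.h; helper name hypothetical) for the two-lane unsigned form.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* acc[i] + x[i]*scale for two uint32 lanes; wraps modulo 2^32. */
+uint32x2_t mla2_u32(uint32x2_t acc, uint32x2_t x, uint32_t scale) {
+    return vmla_n_u32(acc, x, scale);
+}
+</pre>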
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_n_u32" type="checkbox"><label for="vmlaq_n_u32"><div>uint32x4_t <b>vmlaq_n_u32</b> (uint32x4_t a, uint32x4_t b, uint32_t c)<span class="right">Multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p class="aml">Multiply-Add to accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mla-vector-multiply-add-to-accumulator-vector">MLA</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
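+ <h4>Example</h4> <p>An illustrative buffer loop (assumes arm_neon.h, stddef.h and stdint.h; names hypothetical), four uint32 lanes per iteration.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+#include &lt;stdint.h&gt;
+
+/* dst[i] += src[i] * scale; n must be a multiple of 4. */
+void mla_buf_u32(uint32_t *dst, const uint32_t *src, uint32_t scale, size_t n) {
+    for (size_t i = 0; i &lt; n; i += 4) {
+        uint32x4_t acc = vld1q_u32(dst + i);
+        vst1q_u32(dst + i, vmlaq_n_u32(acc, vld1q_u32(src + i), scale));
+    }
+}
+</pre>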
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmla_n_f32" type="checkbox"><label for="vmla_n_f32"><div>float32x2_t <b>vmla_n_f32</b> (float32x2_t a, float32x2_t b, float32_t c)<span class="right">Undefined</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] + (b[i] * c) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlaq_n_f32" type="checkbox"><label for="vmlaq_n_f32"><div>float32x4_t <b>vmlaq_n_f32</b> (float32x4_t a, float32x4_t b, float32_t c)<span class="right">Undefined</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] + (b[i] * c) for i = 0 to 3
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_n_s16" type="checkbox"><label for="vmlal_n_s16"><div>int32x4_t <b>vmlal_n_s16</b> (int32x4_t a, int16x4_t b, int16_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.4S,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_n_s32" type="checkbox"><label for="vmlal_n_s32"><div>int64x2_t <b><b>vmlal_n_s32</b></b> (int64x2_t a, int32x2_t b, int32_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL</a> Vd.2D,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_n_u16" type="checkbox"><label for="vmlal_n_u16"><div>uint32x4_t <b><b>vmlal_n_u16</b></b> (uint32x4_t a, uint16x4_t b, uint16_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.4S,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_n_u32" type="checkbox"><label for="vmlal_n_u32"><div>uint64x2_t <b><b>vmlal_n_u32</b></b> (uint64x2_t a, uint32x2_t b, uint32_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL</a> Vd.2D,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_n_s16" type="checkbox"><label for="vmlal_high_n_s16"><div>int32x4_t <b><b>vmlal_high_n_s16</b></b> (int32x4_t a, int16x8_t b, int16_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.4S,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_n_s32" type="checkbox"><label for="vmlal_high_n_s32"><div>int64x2_t <b><b>vmlal_high_n_s32</b></b> (int64x2_t a, int32x4_t b, int32_t c)<span class="right">Signed multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Add Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlal-smlal2-vector-signed-multiply-add-long-vector">SMLAL2</a> Vd.2D,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_n_u16" type="checkbox"><label for="vmlal_high_n_u16"><div>uint32x4_t <b><b>vmlal_high_n_u16</b></b> (uint32x4_t a, uint16x8_t b, uint16_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.4S,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlal_high_n_u32" type="checkbox"><label for="vmlal_high_n_u32"><div>uint64x2_t <b><b>vmlal_high_n_u32</b></b> (uint64x2_t a, uint32x4_t b, uint32_t c)<span class="right">Unsigned multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Multiply-Add Long (vector). This instruction multiplies the vector elements in the lower or upper half of the first source SIMD&amp;FP register by the corresponding vector elements of the second source SIMD&amp;FP register, and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlal-umlal2-vector-unsigned-multiply-add-long-vector">UMLAL2</a> Vd.2D,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_n_s16" type="checkbox"><label for="vqdmlal_n_s16"><div>int32x4_t <b><b>vqdmlal_n_s16</b></b> (int32x4_t a, int16x4_t b, int16_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Vd.4S,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_n_s32" type="checkbox"><label for="vqdmlal_n_s32"><div>int64x2_t <b><b>vqdmlal_n_s32</b></b> (int64x2_t a, int32x2_t b, int32_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL</a> Vd.2D,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_high_n_s16" type="checkbox"><label for="vqdmlal_high_n_s16"><div>int32x4_t <b><b>vqdmlal_high_n_s16</b></b> (int32x4_t a, int16x8_t b, int16_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL2</a> Vd.4S,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlal_high_n_s32" type="checkbox"><label for="vqdmlal_high_n_s32"><div>int64x2_t <b><b>vqdmlal_high_n_s32</b></b> (int64x2_t a, int32x4_t b, int32_t c)<span class="right">Signed saturating doubling multiply-add long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Doubling Multiply-Add Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and accumulates the final results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlal-sqdmlal2-vector-signed-saturating-doubling-multiply-add-long">SQDMLAL2</a> Vd.2D,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmls_n_s16" type="checkbox"><label for="vmls_n_s16"><div>int16x4_t <b><b>vmls_n_s16</b></b> (int16x4_t a, int16x4_t b, int16_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4H,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_n_s16" type="checkbox"><label for="vmlsq_n_s16"><div>int16x8_t <b><b>vmlsq_n_s16</b></b> (int16x8_t a, int16x8_t b, int16_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8H,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_n_s32" type="checkbox"><label for="vmls_n_s32"><div>int32x2_t <b><b>vmls_n_s32</b></b> (int32x2_t a, int32x2_t b, int32_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_n_s32" type="checkbox"><label for="vmlsq_n_s32"><div>int32x4_t <b><b>vmlsq_n_s32</b></b> (int32x4_t a, int32x4_t b, int32_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_n_u16" type="checkbox"><label for="vmls_n_u16"><div>uint16x4_t <b><b>vmls_n_u16</b></b> (uint16x4_t a, uint16x4_t b, uint16_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4H,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_n_u16" type="checkbox"><label for="vmlsq_n_u16"><div>uint16x8_t <b><b>vmlsq_n_u16</b></b> (uint16x8_t a, uint16x8_t b, uint16_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.8H,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_n_u32" type="checkbox"><label for="vmls_n_u32"><div>uint32x2_t <b><b>vmls_n_u32</b></b> (uint32x2_t a, uint32x2_t b, uint32_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_n_u32" type="checkbox"><label for="vmlsq_n_u32"><div>uint32x4_t <b><b>vmlsq_n_u32</b></b> (uint32x4_t a, uint32x4_t b, uint32_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding elements in the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mls-vector-multiply-subtract-from-accumulator-vector">MLS</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+bits(esize) product;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ product = (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element1)*<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(element2))&lt;esize-1:0&gt;;
+ if sub_op then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] - product;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize] + product;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmls_n_f32" type="checkbox"><label for="vmls_n_f32"><div>float32x2_t <b><b>vmls_n_f32</b></b> (float32x2_t a, float32x2_t b, float32_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] - (b[i] * c) for i = 0 to 1
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsq_n_f32" type="checkbox"><label for="vmlsq_n_f32"><div>float32x4_t <b><b>vmlsq_n_f32</b></b> (float32x4_t a, float32x4_t b, float32_t c)<span class="right">Multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre>RESULT[i] = a[i] - (b[i] * c) for i = 0 to 3
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; N/A <br />
+b &rarr; N/A <br />
+c &rarr; N/A </pre> <h4>Results</h4> <pre>N/A &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_n_s16" type="checkbox"><label for="vmlsl_n_s16"><div>int32x4_t <b><b>vmlsl_n_s16</b></b> (int32x4_t a, int16x4_t b, int16_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.4S,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_n_s32" type="checkbox"><label for="vmlsl_n_s32"><div>int64x2_t <b><b>vmlsl_n_s32</b></b> (int64x2_t a, int32x2_t b, int32_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL</a> Vd.2D,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_n_u16" type="checkbox"><label for="vmlsl_n_u16"><div>uint32x4_t <b>vmlsl_n_u16</b> (uint32x4_t a, uint16x4_t b, uint16_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.4S,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_n_u32" type="checkbox"><label for="vmlsl_n_u32"><div>uint64x2_t <b>vmlsl_n_u32</b> (uint64x2_t a, uint32x2_t b, uint32_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL</a> Vd.2D,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_n_s16" type="checkbox"><label for="vmlsl_high_n_s16"><div>int32x4_t <b>vmlsl_high_n_s16</b> (int32x4_t a, int16x8_t b, int16_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.4S,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_n_s32" type="checkbox"><label for="vmlsl_high_n_s32"><div>int64x2_t <b>vmlsl_high_n_s32</b> (int64x2_t a, int32x4_t b, int32_t c)<span class="right">Signed multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed Multiply-Subtract Long (vector). This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smlsl-smlsl2-vector-signed-multiply-subtract-long-vector">SMLSL2</a> Vd.2D,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_n_u16" type="checkbox"><label for="vmlsl_high_n_u16"><div>uint32x4_t <b>vmlsl_high_n_u16</b> (uint32x4_t a, uint16x8_t b, uint16_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.4S,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmlsl_high_n_u32" type="checkbox"><label for="vmlsl_high_n_u32"><div>uint64x2_t <b>vmlsl_high_n_u32</b> (uint64x2_t a, uint32x4_t b, uint32_t c)<span class="right">Unsigned multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Multiply-Subtract Long (vector). This instruction multiplies corresponding vector elements in the lower or upper half of the two source SIMD&amp;FP registers, and subtracts the results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied. All the values in this instruction are unsigned integer values.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umlsl-umlsl2-vector-unsigned-multiply-subtract-long-vector">UMLSL2</a> Vd.2D,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+bits(2*esize) accum;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize], unsigned);
+ product = (element1*element2)&lt;2*esize-1:0&gt;;
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] - product;
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize] + product;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = accum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_n_s16" type="checkbox"><label for="vqdmlsl_n_s16"><div>int32x4_t <b>vqdmlsl_n_s16</b> (int32x4_t a, int16x4_t b, int16_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Vd.4S,Vn.4H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_n_s32" type="checkbox"><label for="vqdmlsl_n_s32"><div>int64x2_t <b>vqdmlsl_n_s32</b> (int64x2_t a, int32x2_t b, int32_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL</a> Vd.2D,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_high_n_s16" type="checkbox"><label for="vqdmlsl_high_n_s16"><div>int32x4_t <b>vqdmlsl_high_n_s16</b> (int32x4_t a, int16x8_t b, int16_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL2</a> Vd.4S,Vn.8H,Vm.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H <br />
+c &rarr; Vm.H[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqdmlsl_high_n_s32" type="checkbox"><label for="vqdmlsl_high_n_s32"><div>int64x2_t <b>vqdmlsl_high_n_s32</b> (int64x2_t a, int32x4_t b, int32_t c)<span class="right">Signed saturating doubling multiply-subtract long</span></div></label><article> <h4>Description</h4><p class="aml">Signed saturating Doubling Multiply-Subtract Long. This instruction multiplies corresponding signed integer values in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, doubles the results, and subtracts the final results from the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqdmlsl-sqdmlsl2-vector-signed-saturating-doubling-multiply-subtract-long">SQDMLSL2</a> Vd.2D,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S <br />
+c &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(2*datasize) result;
+integer element1;
+integer element2;
+bits(2*esize) product;
+integer accum;
+boolean sat1;
+boolean sat2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize]);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize]);
+ (product, sat1) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(2 * element1 * element2, 2 * esize);
+ if sub_op then
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) - <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ else
+ accum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, 2*esize]) + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(product);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize], sat2) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(accum, 2 * esize);
+ if sat1 || sat2 then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabs_s8" type="checkbox"><label for="vabs_s8"><div>int8x8_t <b><b>vabs_s8</b></b> (int8x8_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabsq_s8" type="checkbox"><label for="vabsq_s8"><div>int8x16_t <b><b>vabsq_s8</b></b> (int8x16_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabs_s16" type="checkbox"><label for="vabs_s16"><div>int16x4_t <b><b>vabs_s16</b></b> (int16x4_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabsq_s16" type="checkbox"><label for="vabsq_s16"><div>int16x8_t <b><b>vabsq_s16</b></b> (int16x8_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabs_s32" type="checkbox"><label for="vabs_s32"><div>int32x2_t <b><b>vabs_s32</b></b> (int32x2_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabsq_s32" type="checkbox"><label for="vabsq_s32"><div>int32x4_t <b><b>vabsq_s32</b></b> (int32x4_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabs_f32" type="checkbox"><label for="vabs_f32"><div>float32x2_t <b><b>vabs_f32</b></b> (float32x2_t a)<span class="right">Floating-point absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabs-vector-floating-point-absolute-value-vector">FABS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ if neg then
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element);
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabsq_f32" type="checkbox"><label for="vabsq_f32"><div>float32x4_t <b><b>vabsq_f32</b></b> (float32x4_t a)<span class="right">Floating-point absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabs-vector-floating-point-absolute-value-vector">FABS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ if neg then
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element);
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vabs_s64" type="checkbox"><label for="vabs_s64"><div>int64x1_t <b><b>vabs_s64</b></b> (int64x1_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabsd_s64" type="checkbox"><label for="vabsd_s64"><div>int64_t <b><b>vabsd_s64</b></b> (int64_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabsq_s64" type="checkbox"><label for="vabsq_s64"><div>int64x2_t <b><b>vabsq_s64</b></b> (int64x2_t a)<span class="right">Absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/abs-absolute-value-vector">ABS</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabs_f64" type="checkbox"><label for="vabs_f64"><div>float64x1_t <b><b>vabs_f64</b></b> (float64x1_t a)<span class="right">Floating-point absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabs-vector-floating-point-absolute-value-vector">FABS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ if neg then
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element);
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vabsq_f64" type="checkbox"><label for="vabsq_f64"><div>float64x2_t <b><b>vabsq_f64</b></b> (float64x2_t a)<span class="right">Floating-point absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Absolute value (vector). This instruction calculates the absolute value of each vector element in the source SIMD&amp;FP register, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fabs-vector-floating-point-absolute-value-vector">FABS</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ if neg then
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element);
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqabs_s8" type="checkbox"><label for="vqabs_s8"><div>int8x8_t <b><b>vqabs_s8</b></b> (int8x8_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqabsq_s8" type="checkbox"><label for="vqabsq_s8"><div>int8x16_t <b><b>vqabsq_s8</b></b> (int8x16_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqabs_s16" type="checkbox"><label for="vqabs_s16"><div>int16x4_t <b><b>vqabs_s16</b></b> (int16x4_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqabsq_s16" type="checkbox"><label for="vqabsq_s16"><div>int16x8_t <b><b>vqabsq_s16</b></b> (int16x8_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqabs_s32" type="checkbox"><label for="vqabs_s32"><div>int32x2_t <b><b>vqabs_s32</b></b> (int32x2_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqabsq_s32" type="checkbox"><label for="vqabsq_s32"><div>int32x4_t <b><b>vqabsq_s32</b></b> (int32x4_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqabs_s64" type="checkbox"><label for="vqabs_s64"><div>int64x1_t <b><b>vqabs_s64</b></b> (int64x1_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqabsq_s64" type="checkbox"><label for="vqabsq_s64"><div>int64x2_t <b><b>vqabsq_s64</b></b> (int64x2_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqabsb_s8" type="checkbox"><label for="vqabsb_s8"><div>int8_t <b><b>vqabsb_s8</b></b> (int8_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Bd,Bn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqabsh_s16" type="checkbox"><label for="vqabsh_s16"><div>int16_t <b><b>vqabsh_s16</b></b> (int16_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Hd,Hn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqabss_s32" type="checkbox"><label for="vqabss_s32"><div>int32_t <b><b>vqabss_s32</b></b> (int32_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqabsd_s64" type="checkbox"><label for="vqabsd_s64"><div>int64_t <b><b>vqabsd_s64</b></b> (int64_t a)<span class="right">Signed saturating absolute value</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Absolute value. This instruction reads each vector element from the source SIMD&amp;FP register, puts the absolute value of the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqabs-signed-saturating-absolute-value">SQABS</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vneg_s8" type="checkbox"><label for="vneg_s8"><div>int8x8_t <b><b>vneg_s8</b></b> (int8x8_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vnegq_s8" type="checkbox"><label for="vnegq_s8"><div>int8x16_t <b><b>vnegq_s8</b></b> (int8x16_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vneg_s16" type="checkbox"><label for="vneg_s16"><div>int16x4_t <b><b>vneg_s16</b></b> (int16x4_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vnegq_s16" type="checkbox"><label for="vnegq_s16"><div>int16x8_t <b><b>vnegq_s16</b></b> (int16x8_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vneg_s32" type="checkbox"><label for="vneg_s32"><div>int32x2_t <b><b>vneg_s32</b></b> (int32x2_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vnegq_s32" type="checkbox"><label for="vnegq_s32"><div>int32x4_t <b><b>vnegq_s32</b></b> (int32x4_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vneg_f32" type="checkbox"><label for="vneg_f32"><div>float32x2_t <b><b>vneg_f32</b></b> (float32x2_t a)<span class="right">Floating-point negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Negate (vector). This instruction negates the value of each vector element in the source SIMD&amp;FP register, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fneg-vector-floating-point-negate-vector">FNEG</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ if neg then
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element);
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vnegq_f32" type="checkbox"><label for="vnegq_f32"><div>float32x4_t <b><b>vnegq_f32</b></b> (float32x4_t a)<span class="right">Floating-point negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Negate (vector). This instruction negates the value of each vector element in the source SIMD&amp;FP register, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fneg-vector-floating-point-negate-vector">FNEG</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ if neg then
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element);
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vneg_s64" type="checkbox"><label for="vneg_s64"><div>int64x1_t <b><b>vneg_s64</b></b> (int64x1_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vnegd_s64" type="checkbox"><label for="vnegd_s64"><div>int64_t <b><b>vnegd_s64</b></b> (int64_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vnegq_s64" type="checkbox"><label for="vnegq_s64"><div>int64x2_t <b><b>vnegq_s64</b></b> (int64x2_t a)<span class="right">Negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Negate (vector). This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, puts the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/neg-vector-negate-vector">NEG</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vneg_f64" type="checkbox"><label for="vneg_f64"><div>float64x1_t <b><b>vneg_f64</b></b> (float64x1_t a)<span class="right">Floating-point negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Negate (vector). This instruction negates the value of each vector element in the source SIMD&amp;FP register, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fneg-vector-floating-point-negate-vector">FNEG</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ if neg then
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element);
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vnegq_f64" type="checkbox"><label for="vnegq_f64"><div>float64x2_t <b><b>vnegq_f64</b></b> (float64x2_t a)<span class="right">Floating-point negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Negate (vector). This instruction negates the value of each vector element in the source SIMD&amp;FP register, writes the result to a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fneg-vector-floating-point-negate-vector">FNEG</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ if neg then
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element);
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAbs.1" title="function: bits(N) FPAbs(bits(N) op)">FPAbs</a>(element);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqneg_s8" type="checkbox"><label for="vqneg_s8"><div>int8x8_t <b><b>vqneg_s8</b></b> (int8x8_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqnegq_s8" type="checkbox"><label for="vqnegq_s8"><div>int8x16_t <b><b>vqnegq_s8</b></b> (int8x16_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqneg_s16" type="checkbox"><label for="vqneg_s16"><div>int16x4_t <b><b>vqneg_s16</b></b> (int16x4_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqnegq_s16" type="checkbox"><label for="vqnegq_s16"><div>int16x8_t <b><b>vqnegq_s16</b></b> (int16x8_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqneg_s32" type="checkbox"><label for="vqneg_s32"><div>int32x2_t <b><b>vqneg_s32</b></b> (int32x2_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqnegq_s32" type="checkbox"><label for="vqnegq_s32"><div>int32x4_t <b><b>vqnegq_s32</b></b> (int32x4_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqneg_s64" type="checkbox"><label for="vqneg_s64"><div>int64x1_t <b><b>vqneg_s64</b></b> (int64x1_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqnegq_s64" type="checkbox"><label for="vqnegq_s64"><div>int64x2_t <b><b>vqnegq_s64</b></b> (int64x2_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqnegb_s8" type="checkbox"><label for="vqnegb_s8"><div>int8_t <b><b>vqnegb_s8</b></b> (int8_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Bd,Bn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Bn </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqnegh_s16" type="checkbox"><label for="vqnegh_s16"><div>int16_t <b><b>vqnegh_s16</b></b> (int16_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Hd,Hn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Hn </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqnegs_s32" type="checkbox"><label for="vqnegs_s32"><div>int32_t <b><b>vqnegs_s32</b></b> (int32_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqnegd_s64" type="checkbox"><label for="vqnegd_s64"><div>int64_t <b><b>vqnegd_s64</b></b> (int64_t a)<span class="right">Signed saturating negate</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed saturating Negate. This instruction reads each vector element from the source SIMD&amp;FP register, negates each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sqneg-signed-saturating-negate">SQNEG</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element;
+boolean sat;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SInt.1" title="function: integer SInt(bits(N) x)">SInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ if neg then
+ element = -element;
+ else
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Abs.1" title="function: integer Abs(integer x)">Abs</a>(element);
+ (<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize], sat) = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignedSatQ.2" title="function: (bits(N), boolean) SignedSatQ(integer i, integer N)">SignedSatQ</a>(element, esize);
+ if sat then FPSR.QC = '1';
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcls_s8" type="checkbox"><label for="vcls_s8"><div>int8x8_t <b><b>vcls_s8</b></b> (int8x8_t a)<span class="right">Count leading sign bits</span></div></label><article> <h4>Description</h4><p><p class="aml">Count Leading Sign bits (vector). This instruction counts the number of consecutive bits following the most significant bit that are the same as the most significant bit in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The count does not include the most significant bit itself.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cls-vector-count-leading-sign-bits-vector">CLS</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclsq_s8" type="checkbox"><label for="vclsq_s8"><div>int8x16_t <b><b>vclsq_s8</b></b> (int8x16_t a)<span class="right">Count leading sign bits</span></div></label><article> <h4>Description</h4><p><p class="aml">Count Leading Sign bits (vector). This instruction counts the number of consecutive bits following the most significant bit that are the same as the most significant bit in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The count does not include the most significant bit itself.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cls-vector-count-leading-sign-bits-vector">CLS</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcls_s16" type="checkbox"><label for="vcls_s16"><div>int16x4_t <b><b>vcls_s16</b></b> (int16x4_t a)<span class="right">Count leading sign bits</span></div></label><article> <h4>Description</h4><p><p class="aml">Count Leading Sign bits (vector). This instruction counts the number of consecutive bits following the most significant bit that are the same as the most significant bit in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The count does not include the most significant bit itself.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cls-vector-count-leading-sign-bits-vector">CLS</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
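+ <h4>Example</h4> <p>A minimal C sketch of the per-lane behaviour described above, assuming a toolchain that provides &lt;arm_neon.h&gt;; the same pattern applies to the other vcls/vclsq variants:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 0x0001 has 14 copies of the sign bit after the MSB; -1 (0xFFFF) has 15. */
+    int16_t in[4] = { 1, -1, 0, 0x7FFF };
+    int16x4_t counts = vcls_s16(vld1_s16(in));  /* expected lanes: 14, 15, 15, 0 */
+    int16_t out[4];
+    vst1_s16(out, counts);
+    for (int i = 0; i &lt; 4; i++)
+        printf("vcls(%d) = %d\n", in[i], out[i]);
+    return 0;
+}</pre>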
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclsq_s16" type="checkbox"><label for="vclsq_s16"><div>int16x8_t <b>vclsq_s16</b> (int16x8_t a)<span class="right">Count leading sign bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Sign bits (vector). This instruction counts the number of consecutive bits following the most significant bit that are the same as the most significant bit in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The count does not include the most significant bit itself.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cls-vector-count-leading-sign-bits-vector">CLS</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcls_s32" type="checkbox"><label for="vcls_s32"><div>int32x2_t <b>vcls_s32</b> (int32x2_t a)<span class="right">Count leading sign bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Sign bits (vector). This instruction counts the number of consecutive bits following the most significant bit that are the same as the most significant bit in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The count does not include the most significant bit itself.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cls-vector-count-leading-sign-bits-vector">CLS</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclsq_s32" type="checkbox"><label for="vclsq_s32"><div>int32x4_t <b>vclsq_s32</b> (int32x4_t a)<span class="right">Count leading sign bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Sign bits (vector). This instruction counts the number of consecutive bits following the most significant bit that are the same as the most significant bit in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The count does not include the most significant bit itself.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cls-vector-count-leading-sign-bits-vector">CLS</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclz_s8" type="checkbox"><label for="vclz_s8"><div>int8x8_t <b>vclz_s8</b> (int8x8_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
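+ <h4>Example</h4> <p>A minimal C sketch of the per-lane zero count, assuming a toolchain that provides &lt;arm_neon.h&gt;; the same pattern applies to the other vclz/vclzq variants, signed and unsigned:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* A zero element yields the full element width (8 for byte lanes). */
+    int8_t in[8] = { 1, 2, 16, -1, 0, 0x40, 8, 3 };
+    int8x8_t counts = vclz_s8(vld1_s8(in));  /* expected lanes: 7, 6, 3, 0, 8, 1, 4, 6 */
+    int8_t out[8];
+    vst1_s8(out, counts);
+    for (int i = 0; i &lt; 8; i++)
+        printf("vclz(%d) = %d\n", in[i], out[i]);
+    return 0;
+}</pre>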
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclzq_s8" type="checkbox"><label for="vclzq_s8"><div>int8x16_t <b>vclzq_s8</b> (int8x16_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclz_s16" type="checkbox"><label for="vclz_s16"><div>int16x4_t <b>vclz_s16</b> (int16x4_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclzq_s16" type="checkbox"><label for="vclzq_s16"><div>int16x8_t <b>vclzq_s16</b> (int16x8_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclz_s32" type="checkbox"><label for="vclz_s32"><div>int32x2_t <b>vclz_s32</b> (int32x2_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclzq_s32" type="checkbox"><label for="vclzq_s32"><div>int32x4_t <b>vclzq_s32</b> (int32x4_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclz_u8" type="checkbox"><label for="vclz_u8"><div>uint8x8_t <b>vclz_u8</b> (uint8x8_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclzq_u8" type="checkbox"><label for="vclzq_u8"><div>uint8x16_t <b>vclzq_u8</b> (uint8x16_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclz_u16" type="checkbox"><label for="vclz_u16"><div>uint16x4_t <b>vclz_u16</b> (uint16x4_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclzq_u16" type="checkbox"><label for="vclzq_u16"><div>uint16x8_t <b>vclzq_u16</b> (uint16x8_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclz_u32" type="checkbox"><label for="vclz_u32"><div>uint32x2_t <b>vclz_u32</b> (uint32x2_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vclzq_u32" type="checkbox"><label for="vclzq_u32"><div>uint32x4_t <b>vclzq_u32</b> (uint32x4_t a)<span class="right">Count leading zero bits</span></div></label><article> <h4>Description</h4><p class="aml">Count Leading Zero bits (vector). This instruction counts the number of consecutive zeros, starting from the most significant bit, in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/clz-vector-count-leading-zero-bits-vector">CLZ</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ if countop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#CountOp_CLS" title="enumeration CountOp {CountOp_CLZ, CountOp_CLS, CountOp_CNT}">CountOp_CLS</a> then
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingSignBits.1" title="function: integer CountLeadingSignBits(bits(N) x)">CountLeadingSignBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ else
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.CountLeadingZeroBits.1" title="function: integer CountLeadingZeroBits(bits(N) x)">CountLeadingZeroBits</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcnt_s8" type="checkbox"><label for="vcnt_s8"><div>int8x8_t <b>vcnt_s8</b> (int8x8_t a)<span class="right">Population count per byte</span></div></label><article> <h4>Description</h4><p class="aml">Population Count per byte. This instruction counts the number of bits that have a value of one in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cnt-population-count-per-byte">CNT</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitCount.1" title="function: integer BitCount(bits(N) x)">BitCount</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
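+ <h4>Example</h4> <p>A minimal usage sketch, added for illustration and not part of the ARM reference; it assumes a C toolchain targeting AArch64 with &lt;arm_neon.h&gt; available. The unsigned and polynomial variants below behave identically per byte:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t in[8] = {0, 1, 3, 7, 15, 31, 63, -1};  /* -1 is 0xFF: all 8 bits set */
+    int8x8_t counts = vcnt_s8(vld1_s8(in));       /* per-byte population count */
+    int8_t out[8];
+    vst1_s8(out, counts);
+    for (int i = 0; i != 8; ++i)
+        printf("%d ", out[i]);                    /* prints: 0 1 2 3 4 5 6 8 */
+    printf("\n");
+    return 0;
+}</pre>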
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcntq_s8" type="checkbox"><label for="vcntq_s8"><div>int8x16_t <b><b>vcntq_s8</b></b> (int8x16_t a)<span class="right">Population count per byte</span></div></label><article> <h4>Description</h4><p><p class="aml">Population Count per byte. This instruction counts the number of bits that have a value of one in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cnt-population-count-per-byte">CNT</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitCount.1" title="function: integer BitCount(bits(N) x)">BitCount</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcnt_u8" type="checkbox"><label for="vcnt_u8"><div>uint8x8_t <b><b>vcnt_u8</b></b> (uint8x8_t a)<span class="right">Population count per byte</span></div></label><article> <h4>Description</h4><p><p class="aml">Population Count per byte. This instruction counts the number of bits that have a value of one in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cnt-population-count-per-byte">CNT</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitCount.1" title="function: integer BitCount(bits(N) x)">BitCount</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcntq_u8" type="checkbox"><label for="vcntq_u8"><div>uint8x16_t <b><b>vcntq_u8</b></b> (uint8x16_t a)<span class="right">Population count per byte</span></div></label><article> <h4>Description</h4><p><p class="aml">Population Count per byte. This instruction counts the number of bits that have a value of one in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cnt-population-count-per-byte">CNT</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitCount.1" title="function: integer BitCount(bits(N) x)">BitCount</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
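+ <h4>Example</h4> <p>A common composition, sketched here as an illustrative addition rather than ARM reference material (the helper name is ours, and vaddvq_u8 requires an A64 target): per-byte CNT followed by a horizontal add yields the population count of a whole 16-byte block.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Population count of 16 bytes: per-byte CNT, then a horizontal add.
+   The 16 per-byte counts sum to at most 128, so uint8_t cannot overflow. */
+static inline unsigned popcount16(const uint8_t *p) {
+    uint8x16_t v = vld1q_u8(p);
+    return vaddvq_u8(vcntq_u8(v));
+}</pre>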
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcnt_p8" type="checkbox"><label for="vcnt_p8"><div>poly8x8_t <b><b>vcnt_p8</b></b> (poly8x8_t a)<span class="right">Population count per byte</span></div></label><article> <h4>Description</h4><p><p class="aml">Population Count per byte. This instruction counts the number of bits that have a value of one in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cnt-population-count-per-byte">CNT</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitCount.1" title="function: integer BitCount(bits(N) x)">BitCount</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcntq_p8" type="checkbox"><label for="vcntq_p8"><div>poly8x16_t <b><b>vcntq_p8</b></b> (poly8x16_t a)<span class="right">Population count per byte</span></div></label><article> <h4>Description</h4><p><p class="aml">Population Count per byte. This instruction counts the number of bits that have a value of one in each vector element in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cnt-population-count-per-byte">CNT</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+integer count;
+for e = 0 to elements-1
+ count = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitCount.1" title="function: integer BitCount(bits(N) x)">BitCount</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize]);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = count&lt;esize-1:0&gt;;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrecpe_u32" type="checkbox"><label for="vrecpe_u32"><div>uint32x2_t <b><b>vrecpe_u32</b></b> (uint32x2_t a)<span class="right">Unsigned reciprocal estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Reciprocal Estimate. This instruction reads each vector element from the source SIMD&amp;FP register, calculates an approximate inverse for the unsigned integer value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urecpe-unsigned-reciprocal-estimate">URECPE</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(32) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 32];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 32] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedRecipEstimate.1" title="function: bits(N) UnsignedRecipEstimate(bits(N) operand)">UnsignedRecipEstimate</a>(element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
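+ <h4>Example</h4> <p>An illustrative sketch, not part of the ARM reference (it assumes a C toolchain with &lt;arm_neon.h&gt;). URECPE treats each 32-bit lane as an unsigned fixed-point value in the range [0.5, 1.0); per the pseudocode above, lanes with the top bit clear saturate to all-ones.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t in[2] = {0x80000000u, 0xC0000000u};  /* fixed-point 0.5 and 0.75 */
+    uint32x2_t est = vrecpe_u32(vld1_u32(in));    /* coarse estimates of 1/0.5 and 1/0.75 */
+    uint32_t out[2];
+    vst1_u32(out, est);
+    printf("%#010x %#010x\n", out[0], out[1]);
+    return 0;
+}</pre>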
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrecpeq_u32" type="checkbox"><label for="vrecpeq_u32"><div>uint32x4_t <b><b>vrecpeq_u32</b></b> (uint32x4_t a)<span class="right">Unsigned reciprocal estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Reciprocal Estimate. This instruction reads each vector element from the source SIMD&amp;FP register, calculates an approximate inverse for the unsigned integer value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/urecpe-unsigned-reciprocal-estimate">URECPE</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(32) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 32];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 32] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedRecipEstimate.1" title="function: bits(N) UnsignedRecipEstimate(bits(N) operand)">UnsignedRecipEstimate</a>(element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrecpe_f32" type="checkbox"><label for="vrecpe_f32"><div>float32x2_t <b><b>vrecpe_f32</b></b> (float32x2_t a)<span class="right">Floating-point reciprocal estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Estimate. This instruction finds an approximate reciprocal estimate for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecpe-floating-point-reciprocal-estimate">FRECPE</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRecipEstimate.2" title="function: bits(N) FPRecipEstimate(bits(N) operand, FPCRType fpcr)">FPRecipEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
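+ <h4>Example</h4> <p>An illustrative sketch, not from the ARM reference (it assumes a C toolchain with &lt;arm_neon.h&gt;). The raw estimate is accurate to only about 8 bits; the reciprocal-step entries below show how it is usually refined.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float in[2] = {2.0f, 3.0f};
+    float32x2_t est = vrecpe_f32(vld1_f32(in));  /* coarse 1/x per lane */
+    float out[2];
+    vst1_f32(out, est);
+    printf("%f %f\n", out[0], out[1]);           /* roughly 0.5 and 0.333 */
+    return 0;
+}</pre>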
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrecpeq_f32" type="checkbox"><label for="vrecpeq_f32"><div>float32x4_t <b><b>vrecpeq_f32</b></b> (float32x4_t a)<span class="right">Floating-point reciprocal estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Estimate. This instruction finds an approximate reciprocal estimate for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecpe-floating-point-reciprocal-estimate">FRECPE</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRecipEstimate.2" title="function: bits(N) FPRecipEstimate(bits(N) operand, FPCRType fpcr)">FPRecipEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrecpe_f64" type="checkbox"><label for="vrecpe_f64"><div>float64x1_t <b><b>vrecpe_f64</b></b> (float64x1_t a)<span class="right">Floating-point reciprocal estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Estimate. This instruction finds an approximate reciprocal estimate for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecpe-floating-point-reciprocal-estimate">FRECPE</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRecipEstimate.2" title="function: bits(N) FPRecipEstimate(bits(N) operand, FPCRType fpcr)">FPRecipEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecpeq_f64" type="checkbox"><label for="vrecpeq_f64"><div>float64x2_t <b><b>vrecpeq_f64</b></b> (float64x2_t a)<span class="right">Floating-point reciprocal estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Estimate. This instruction finds an approximate reciprocal estimate for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecpe-floating-point-reciprocal-estimate">FRECPE</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRecipEstimate.2" title="function: bits(N) FPRecipEstimate(bits(N) operand, FPCRType fpcr)">FPRecipEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecpes_f32" type="checkbox"><label for="vrecpes_f32"><div>float32_t <b><b>vrecpes_f32</b></b> (float32_t a)<span class="right">Floating-point reciprocal estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Estimate. This instruction finds an approximate reciprocal estimate for the scalar value in the source SIMD&amp;FP register and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecpe-floating-point-reciprocal-estimate">FRECPE</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRecipEstimate.2" title="function: bits(N) FPRecipEstimate(bits(N) operand, FPCRType fpcr)">FPRecipEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecped_f64" type="checkbox"><label for="vrecped_f64"><div>float64_t <b><b>vrecped_f64</b></b> (float64_t a)<span class="right">Floating-point reciprocal estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Estimate. This instruction finds an approximate reciprocal estimate for the scalar value in the source SIMD&amp;FP register and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecpe-floating-point-reciprocal-estimate">FRECPE</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRecipEstimate.2" title="function: bits(N) FPRecipEstimate(bits(N) operand, FPCRType fpcr)">FPRecipEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecps_f32" type="checkbox"><label for="vrecps_f32"><div>float32x2_t <b><b>vrecps_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point reciprocal step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Step. This instruction multiplies the corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 2.0, places the resulting floating-point values in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecps-floating-point-reciprocal-step">FRECPS</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRecipStepFused.2" title="function: bits(N) FPRecipStepFused(bits(N) op1, bits(N) op2)">FPRecipStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
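+ <h4>Example</h4> <p>FRECPS exists to drive Newton-Raphson refinement of FRECPE's estimate: each step computes x * (2 - a*x), with the (2 - a*x) term fused. A sketch of the usual idiom, added for illustration (the helper name is ours; it assumes a C toolchain with &lt;arm_neon.h&gt;):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Approximate 1/a: FRECPE estimate refined by two Newton-Raphson steps.
+   Each step roughly doubles the number of correct bits. */
+static inline float32x2_t reciprocal_f32(float32x2_t a) {
+    float32x2_t x = vrecpe_f32(a);       /* initial estimate */
+    x = vmul_f32(x, vrecps_f32(a, x));   /* x = x * (2 - a*x) */
+    x = vmul_f32(x, vrecps_f32(a, x));   /* second refinement */
+    return x;
+}</pre>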
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrecpsq_f32" type="checkbox"><label for="vrecpsq_f32"><div>float32x4_t <b><b>vrecpsq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point reciprocal step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Step. This instruction multiplies the corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 2.0, places the resulting floating-point values in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecps-floating-point-reciprocal-step">FRECPS</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRecipStepFused.2" title="function: bits(N) FPRecipStepFused(bits(N) op1, bits(N) op2)">FPRecipStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrecps_f64" type="checkbox"><label for="vrecps_f64"><div>float64x1_t <b><b>vrecps_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point reciprocal step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Step. This instruction multiplies the corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 2.0, places the resulting floating-point values in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecps-floating-point-reciprocal-step">FRECPS</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRecipStepFused.2" title="function: bits(N) FPRecipStepFused(bits(N) op1, bits(N) op2)">FPRecipStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecpsq_f64" type="checkbox"><label for="vrecpsq_f64"><div>float64x2_t <b><b>vrecpsq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point reciprocal step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Step. This instruction multiplies the corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 2.0, places the resulting floating-point values in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecps-floating-point-reciprocal-step">FRECPS</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRecipStepFused.2" title="function: bits(N) FPRecipStepFused(bits(N) op1, bits(N) op2)">FPRecipStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecpss_f32" type="checkbox"><label for="vrecpss_f32"><div>float32_t <b><b>vrecpss_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point reciprocal step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Step. This instruction multiplies the scalar floating-point values of the two source SIMD&amp;FP registers, subtracts the product from 2.0, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecps-floating-point-reciprocal-step">FRECPS</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRecipStepFused.2" title="function: bits(N) FPRecipStepFused(bits(N) op1, bits(N) op2)">FPRecipStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecpsd_f64" type="checkbox"><label for="vrecpsd_f64"><div>float64_t <b><b>vrecpsd_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point reciprocal step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Step. This instruction multiplies the scalar floating-point values of the two source SIMD&amp;FP registers, subtracts the product from 2.0, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecps-floating-point-reciprocal-step">FRECPS</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRecipStepFused.2" title="function: bits(N) FPRecipStepFused(bits(N) op1, bits(N) op2)">FPRecipStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqrt_f32" type="checkbox"><label for="vsqrt_f32"><div>float32x2_t <b><b>vsqrt_f32</b></b> (float32x2_t a)<span class="right">Floating-point square root</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Square Root (vector). This instruction calculates the square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
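+<p class="aml">Taken together, FRECPE and FRECPS implement Newton-Raphson reciprocal refinement: if x approximates 1/a, then x * (2.0 - a*x) is a closer approximation, and FRECPS supplies the (2.0 - a*x) factor in a single fused instruction. A minimal C sketch of the usual pairing (assuming arm_neon.h on an AArch64 target; the helper name recip_f32 and the choice of two refinement steps are illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Approximate 1.0/a per lane: coarse FRECPE estimate, then two
+   FRECPS refinements, each computing x *= (2.0 - a*x). */
+float32x2_t recip_f32(float32x2_t a) {
+    float32x2_t x = vrecpe_f32(a);
+    x = vmul_f32(x, vrecps_f32(a, x));
+    x = vmul_f32(x, vrecps_f32(a, x));
+    return x;
+}</pre>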
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fsqrt-vector-floating-point-square-root-vector">FSQRT</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSqrt.2" title="function: bits(N) FPSqrt(bits(N) op, FPCRType fpcr)">FPSqrt</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqrtq_f32" type="checkbox"><label for="vsqrtq_f32"><div>float32x4_t <b><b>vsqrtq_f32</b></b> (float32x4_t a)<span class="right">Floating-point square root</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Square Root (vector). This instruction calculates the square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fsqrt-vector-floating-point-square-root-vector">FSQRT</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSqrt.2" title="function: bits(N) FPSqrt(bits(N) op, FPCRType fpcr)">FPSqrt</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqrt_f64" type="checkbox"><label for="vsqrt_f64"><div>float64x1_t <b><b>vsqrt_f64</b></b> (float64x1_t a)<span class="right">Floating-point square root</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Square Root (vector). This instruction calculates the square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fsqrt-vector-floating-point-square-root-vector">FSQRT</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSqrt.2" title="function: bits(N) FPSqrt(bits(N) op, FPCRType fpcr)">FPSqrt</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsqrtq_f64" type="checkbox"><label for="vsqrtq_f64"><div>float64x2_t <b><b>vsqrtq_f64</b></b> (float64x2_t a)<span class="right">Floating-point square root</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Square Root (vector). This instruction calculates the square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fsqrt-vector-floating-point-square-root-vector">FSQRT</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPSqrt.2" title="function: bits(N) FPSqrt(bits(N) op, FPCRType fpcr)">FPSqrt</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsqrte_u32" type="checkbox"><label for="vrsqrte_u32"><div>uint32x2_t <b><b>vrsqrte_u32</b></b> (uint32x2_t a)<span class="right">Unsigned reciprocal square root estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Reciprocal Square Root Estimate. This instruction reads each vector element from the source SIMD&amp;FP register, calculates an approximate inverse square root for each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
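+<p class="aml">Unlike the estimate/step intrinsics, FSQRT returns the correctly rounded IEEE 754 square root directly, at a correspondingly higher latency. A brief C sketch (assuming arm_neon.h on an AArch64 target; the helper name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Exact per-lane square roots: {4, 9, 16, 25} becomes {2, 3, 4, 5}. */
+float32x4_t sqrt_demo(void) {
+    const float in[4] = { 4.0f, 9.0f, 16.0f, 25.0f };
+    return vsqrtq_f32(vld1q_f32(in));
+}</pre>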
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursqrte-unsigned-reciprocal-square-root-estimate">URSQRTE</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(32) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 32];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 32] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedRSqrtEstimate.1" title="function: bits(N) UnsignedRSqrtEstimate(bits(N) operand)">UnsignedRSqrtEstimate</a>(element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsqrteq_u32" type="checkbox"><label for="vrsqrteq_u32"><div>uint32x4_t <b><b>vrsqrteq_u32</b></b> (uint32x4_t a)<span class="right">Unsigned reciprocal square root estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Reciprocal Square Root Estimate. This instruction reads each vector element from the source SIMD&amp;FP register, calculates an approximate inverse square root for each value, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ursqrte-unsigned-reciprocal-square-root-estimate">URSQRTE</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(32) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, 32];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 32] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UnsignedRSqrtEstimate.1" title="function: bits(N) UnsignedRSqrtEstimate(bits(N) operand)">UnsignedRSqrtEstimate</a>(element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsqrte_f32" type="checkbox"><label for="vrsqrte_f32"><div>float32x2_t <b><b>vrsqrte_f32</b></b> (float32x2_t a)<span class="right">Floating-point reciprocal square root estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Estimate. This instruction calculates an approximate reciprocal square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
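+<p class="aml">URSQRTE works on unsigned fixed-point data rather than floating point; per the linked UnsignedRSqrtEstimate pseudocode, elements whose top two bits are zero fall below the estimable range and yield an all-ones result, so only large 32-bit values produce a usable estimate. A minimal C sketch (assuming arm_neon.h on an AArch64 target; the input value and the printing are illustrative only):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 0x40000000 has bit 30 set, so it is inside the estimated range. */
+    uint32x2_t a = vdup_n_u32(0x40000000u);
+    uint32x2_t e = vrsqrte_u32(a);
+    printf("estimate: 0x%08x\n", (unsigned)vget_lane_u32(e, 0));
+    return 0;
+}</pre>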
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrte-floating-point-reciprocal-square-root-estimate">FRSQRTE</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRSqrtEstimate.2" title="function: bits(N) FPRSqrtEstimate(bits(N) operand, FPCRType fpcr)">FPRSqrtEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsqrteq_f32" type="checkbox"><label for="vrsqrteq_f32"><div>float32x4_t <b><b>vrsqrteq_f32</b></b> (float32x4_t a)<span class="right">Floating-point reciprocal square root estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Estimate. This instruction calculates an approximate reciprocal square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrte-floating-point-reciprocal-square-root-estimate">FRSQRTE</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRSqrtEstimate.2" title="function: bits(N) FPRSqrtEstimate(bits(N) operand, FPCRType fpcr)">FPRSqrtEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsqrte_f64" type="checkbox"><label for="vrsqrte_f64"><div>float64x1_t <b><b>vrsqrte_f64</b></b> (float64x1_t a)<span class="right">Floating-point reciprocal square root estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Estimate. This instruction calculates an approximate reciprocal square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrte-floating-point-reciprocal-square-root-estimate">FRSQRTE</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRSqrtEstimate.2" title="function: bits(N) FPRSqrtEstimate(bits(N) operand, FPCRType fpcr)">FPRSqrtEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsqrteq_f64" type="checkbox"><label for="vrsqrteq_f64"><div>float64x2_t <b><b>vrsqrteq_f64</b></b> (float64x2_t a)<span class="right">Floating-point reciprocal square root estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Estimate. This instruction calculates an approximate reciprocal square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrte-floating-point-reciprocal-square-root-estimate">FRSQRTE</a> Vd.2D,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRSqrtEstimate.2" title="function: bits(N) FPRSqrtEstimate(bits(N) operand, FPCRType fpcr)">FPRSqrtEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsqrtes_f32" type="checkbox"><label for="vrsqrtes_f32"><div>float32_t <b><b>vrsqrtes_f32</b></b> (float32_t a)<span class="right">Floating-point reciprocal square root estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Estimate. This instruction calculates an approximate reciprocal square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrte-floating-point-reciprocal-square-root-estimate">FRSQRTE</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRSqrtEstimate.2" title="function: bits(N) FPRSqrtEstimate(bits(N) operand, FPCRType fpcr)">FPRSqrtEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsqrted_f64" type="checkbox"><label for="vrsqrted_f64"><div>float64_t <b><b>vrsqrted_f64</b></b> (float64_t a)<span class="right">Floating-point reciprocal square root estimate</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Estimate. This instruction calculates an approximate reciprocal square root for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrte-floating-point-reciprocal-square-root-estimate">FRSQRTE</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRSqrtEstimate.2" title="function: bits(N) FPRSqrtEstimate(bits(N) operand, FPCRType fpcr)">FPRSqrtEstimate</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsqrts_f32" type="checkbox"><label for="vrsqrts_f32"><div>float32x2_t <b><b>vrsqrts_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point reciprocal square root step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Step. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 3.0, divides these results by 2.0, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
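+<p class="aml">On its own the FRSQRTE estimate carries only a handful of significant bits, so it is either used raw where a rough 1/sqrt(a) suffices or refined with FRSQRTS, as in the sketch after the FRSQRTS entries below. A one-line C use of the raw estimate (assuming arm_neon.h on an AArch64 target; the helper name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Coarse per-lane reciprocal square root, no refinement. */
+float32x4_t rsqrt_coarse(float32x4_t a) {
+    return vrsqrteq_f32(a);
+}</pre>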
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrts-floating-point-reciprocal-square-root-step">FRSQRTS</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRSqrtStepFused.2" title="function: bits(N) FPRSqrtStepFused(bits(N) op1, bits(N) op2)">FPRSqrtStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsqrtsq_f32" type="checkbox"><label for="vrsqrtsq_f32"><div>float32x4_t <b><b>vrsqrtsq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point reciprocal square root step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Step. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 3.0, divides these results by 2.0, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrts-floating-point-reciprocal-square-root-step">FRSQRTS</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRSqrtStepFused.2" title="function: bits(N) FPRSqrtStepFused(bits(N) op1, bits(N) op2)">FPRSqrtStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrsqrts_f64" type="checkbox"><label for="vrsqrts_f64"><div>float64x1_t <b><b>vrsqrts_f64</b></b> (float64x1_t a, float64x1_t b)<span class="right">Floating-point reciprocal square root step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Step. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 3.0, divides these results by 2.0, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrts-floating-point-reciprocal-square-root-step">FRSQRTS</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRSqrtStepFused.2" title="function: bits(N) FPRSqrtStepFused(bits(N) op1, bits(N) op2)">FPRSqrtStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsqrtsq_f64" type="checkbox"><label for="vrsqrtsq_f64"><div>float64x2_t <b><b>vrsqrtsq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point reciprocal square root step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Step. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 3.0, divides these results by 2.0, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrts-floating-point-reciprocal-square-root-step">FRSQRTS</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRSqrtStepFused.2" title="function: bits(N) FPRSqrtStepFused(bits(N) op1, bits(N) op2)">FPRSqrtStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsqrtss_f32" type="checkbox"><label for="vrsqrtss_f32"><div>float32_t <b><b>vrsqrtss_f32</b></b> (float32_t a, float32_t b)<span class="right">Floating-point reciprocal square root step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Step. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 3.0, divides these results by 2.0, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrts-floating-point-reciprocal-square-root-step">FRSQRTS</a> Sd,Sn,Sm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn <br />
+b &rarr; Sm </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRSqrtStepFused.2" title="function: bits(N) FPRSqrtStepFused(bits(N) op1, bits(N) op2)">FPRSqrtStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrsqrtsd_f64" type="checkbox"><label for="vrsqrtsd_f64"><div>float64_t <b><b>vrsqrtsd_f64</b></b> (float64_t a, float64_t b)<span class="right">Floating-point reciprocal square root step</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal Square Root Step. This instruction multiplies corresponding floating-point values in the vectors of the two source SIMD&amp;FP registers, subtracts each of the products from 3.0, divides these results by 2.0, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frsqrts-floating-point-reciprocal-square-root-step">FRSQRTS</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.FPRSqrtStepFused.2" title="function: bits(N) FPRSqrtStepFused(bits(N) op1, bits(N) op2)">FPRSqrtStepFused</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
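+<p class="aml">A minimal C usage sketch, assuming an AArch64 compiler and &lt;arm_neon.h&gt;; the helper name rsqrt_f64 is illustrative, not an ARM API. It shows the usual role of this intrinsic: FRSQRTS computes the Newton-Raphson correction factor (3.0 - a*b)/2.0 that refines a reciprocal square-root estimate.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Approximate 1/sqrt(x): take the hardware estimate (FRSQRTE), then
+   apply two refinement steps. Each step multiplies the current
+   estimate y by vrsqrtsd_f64(x*y, y) = (3.0 - x*y*y) / 2.0. */
+static float64_t rsqrt_f64(float64_t x) {
+    float64_t y = vrsqrted_f64(x);        /* initial estimate  */
+    y = y * vrsqrtsd_f64(x * y, y);       /* first Newton step */
+    y = y * vrsqrtsd_f64(x * y, y);       /* second Newton step */
+    return y;
+}</pre>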
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmvn_s8" type="checkbox"><label for="vmvn_s8"><div>int8x8_t <b><b>vmvn_s8</b></b> (int8x8_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvnq_s8" type="checkbox"><label for="vmvnq_s8"><div>int8x16_t <b><b>vmvnq_s8</b></b> (int8x16_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvn_s16" type="checkbox"><label for="vmvn_s16"><div>int16x4_t <b><b>vmvn_s16</b></b> (int16x4_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvnq_s16" type="checkbox"><label for="vmvnq_s16"><div>int16x8_t <b><b>vmvnq_s16</b></b> (int16x8_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvn_s32" type="checkbox"><label for="vmvn_s32"><div>int32x2_t <b><b>vmvn_s32</b></b> (int32x2_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvnq_s32" type="checkbox"><label for="vmvnq_s32"><div>int32x4_t <b><b>vmvnq_s32</b></b> (int32x4_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvn_u8" type="checkbox"><label for="vmvn_u8"><div>uint8x8_t <b><b>vmvn_u8</b></b> (uint8x8_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvnq_u8" type="checkbox"><label for="vmvnq_u8"><div>uint8x16_t <b><b>vmvnq_u8</b></b> (uint8x16_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvn_u16" type="checkbox"><label for="vmvn_u16"><div>uint16x4_t <b><b>vmvn_u16</b></b> (uint16x4_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvnq_u16" type="checkbox"><label for="vmvnq_u16"><div>uint16x8_t <b><b>vmvnq_u16</b></b> (uint16x8_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvn_u32" type="checkbox"><label for="vmvn_u32"><div>uint32x2_t <b><b>vmvn_u32</b></b> (uint32x2_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvnq_u32" type="checkbox"><label for="vmvnq_u32"><div>uint32x4_t <b><b>vmvnq_u32</b></b> (uint32x4_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvn_p8" type="checkbox"><label for="vmvn_p8"><div>poly8x8_t <b><b>vmvn_p8</b></b> (poly8x8_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmvnq_p8" type="checkbox"><label for="vmvnq_p8"><div>poly8x16_t <b><b>vmvnq_p8</b></b> (poly8x16_t a)<span class="right">Bitwise NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise NOT (vector). This instruction reads each vector element from the source SIMD&amp;FP register, places the inverse of each value into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/mvn-bitwise-not-vector-an-alias-of-not">MVN</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4><p>The description of
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-simd-and-floating-point-instructions-alphabetic-order/not-bitwise-not-vector">NOT</a>
+ gives the operational pseudocode for this instruction.</p> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vand_s8" type="checkbox"><label for="vand_s8"><div>int8x8_t <b><b>vand_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
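+<p class="aml">A short C sketch of the vmvn_* family listed above (an illustrative example; vne_u8 is not an ARM intrinsic). Because NEON has no direct compare-not-equal, a common idiom builds one by inverting the result of a compare-equal with bitwise NOT:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Per-lane "not equal": vceq_u8 yields all-ones where a == b and
+   all-zeros elsewhere; vmvn_u8 (MVN Vd.8B,Vn.8B) flips every bit,
+   giving all-ones exactly where a != b. */
+uint8x8_t vne_u8(uint8x8_t a, uint8x8_t b) {
+    return vmvn_u8(vceq_u8(a, b));
+}</pre>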
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
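+<p class="aml">A minimal C sketch of a typical use of the vand_* family (the unsigned variant vand_u8 is documented below; the helper name low_nibbles is illustrative): AND against a broadcast constant keeps only selected bits in every lane.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Keep only the low nibble of each of the 8 bytes. vdup_n_u8
+   broadcasts 0x0F to all lanes; vand_u8 emits AND Vd.8B,Vn.8B,Vm.8B. */
+uint8x8_t low_nibbles(uint8x8_t bytes) {
+    return vand_u8(bytes, vdup_n_u8(0x0F));
+}</pre>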
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vandq_s8" type="checkbox"><label for="vandq_s8"><div>int8x16_t <b><b>vandq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vand_s16" type="checkbox"><label for="vand_s16"><div>int16x4_t <b><b>vand_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vandq_s16" type="checkbox"><label for="vandq_s16"><div>int16x8_t <b><b>vandq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vand_s32" type="checkbox"><label for="vand_s32"><div>int32x2_t <b><b>vand_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vandq_s32" type="checkbox"><label for="vandq_s32"><div>int32x4_t <b><b>vandq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vand_s64" type="checkbox"><label for="vand_s64"><div>int64x1_t <b><b>vand_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Dd,Dn,Dm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn <br />
+b &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vandq_s64" type="checkbox"><label for="vandq_s64"><div>int64x2_t <b><b>vandq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vand_u8" type="checkbox"><label for="vand_u8"><div>uint8x8_t <b><b>vand_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vandq_u8" type="checkbox"><label for="vandq_u8"><div>uint8x16_t <b><b>vandq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vand_u16" type="checkbox"><label for="vand_u16"><div>uint16x4_t <b><b>vand_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vandq_u16" type="checkbox"><label for="vandq_u16"><div>uint16x8_t <b><b>vandq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vand_u32" type="checkbox"><label for="vand_u32"><div>uint32x2_t <b><b>vand_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vandq_u32" type="checkbox"><label for="vandq_u32"><div>uint32x4_t <b><b>vandq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vand_u64" type="checkbox"><label for="vand_u64"><div>uint64x1_t <b><b>vand_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vandq_u64" type="checkbox"><label for="vandq_u64"><div>uint64x2_t <b><b>vandq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Bitwise AND</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise AND (vector). This instruction performs a bitwise AND between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/and-vector-bitwise-and-vector">AND</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorr_s8" type="checkbox"><label for="vorr_s8"><div>int8x8_t <b><b>vorr_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
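+<p class="aml">A minimal C sketch of a typical use of the vorr_* family (vorr_u8 is the unsigned variant; set_high_bits is an illustrative helper name): OR against a broadcast constant forces selected bits on in every lane.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Set bit 7 of every byte, e.g. to tag all 8 lanes. vorr_u8 emits
+   ORR Vd.8B,Vn.8B,Vm.8B against the broadcast constant 0x80. */
+uint8x8_t set_high_bits(uint8x8_t bytes) {
+    return vorr_u8(bytes, vdup_n_u8(0x80));
+}</pre>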
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorrq_s8" type="checkbox"><label for="vorrq_s8"><div>int8x16_t <b><b>vorrq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorr_s16" type="checkbox"><label for="vorr_s16"><div>int16x4_t <b><b>vorr_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorrq_s16" type="checkbox"><label for="vorrq_s16"><div>int16x8_t <b><b>vorrq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorr_s32" type="checkbox"><label for="vorr_s32"><div>int32x2_t <b><b>vorr_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorrq_s32" type="checkbox"><label for="vorrq_s32"><div>int32x4_t <b><b>vorrq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorr_s64" type="checkbox"><label for="vorr_s64"><div>int64x1_t <b><b>vorr_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorrq_s64" type="checkbox"><label for="vorrq_s64"><div>int64x2_t <b><b>vorrq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorr_u8" type="checkbox"><label for="vorr_u8"><div>uint8x8_t <b><b>vorr_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorrq_u8" type="checkbox"><label for="vorrq_u8"><div>uint8x16_t <b><b>vorrq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorr_u16" type="checkbox"><label for="vorr_u16"><div>uint16x4_t <b><b>vorr_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorrq_u16" type="checkbox"><label for="vorrq_u16"><div>uint16x8_t <b><b>vorrq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorr_u32" type="checkbox"><label for="vorr_u32"><div>uint32x2_t <b><b>vorr_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorrq_u32" type="checkbox"><label for="vorrq_u32"><div>uint32x4_t <b><b>vorrq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorr_u64" type="checkbox"><label for="vorr_u64"><div>uint64x1_t <b><b>vorr_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorrq_u64" type="checkbox"><label for="vorrq_u64"><div>uint64x2_t <b><b>vorrq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Bitwise inclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR (vector, register). This instruction performs a bitwise OR between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orr-vector-register-bitwise-inclusive-or-vector-register">ORR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veor_s8" type="checkbox"><label for="veor_s8"><div>int8x8_t <b><b>veor_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veorq_s8" type="checkbox"><label for="veorq_s8"><div>int8x16_t <b><b>veorq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veor_s16" type="checkbox"><label for="veor_s16"><div>int16x4_t <b><b>veor_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veorq_s16" type="checkbox"><label for="veorq_s16"><div>int16x8_t <b><b>veorq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veor_s32" type="checkbox"><label for="veor_s32"><div>int32x2_t <b><b>veor_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veorq_s32" type="checkbox"><label for="veorq_s32"><div>int32x4_t <b><b>veorq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veor_s64" type="checkbox"><label for="veor_s64"><div>int64x1_t <b><b>veor_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veorq_s64" type="checkbox"><label for="veorq_s64"><div>int64x2_t <b><b>veorq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veor_u8" type="checkbox"><label for="veor_u8"><div>uint8x8_t <b><b>veor_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veorq_u8" type="checkbox"><label for="veorq_u8"><div>uint8x16_t <b><b>veorq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veor_u16" type="checkbox"><label for="veor_u16"><div>uint16x4_t <b><b>veor_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veorq_u16" type="checkbox"><label for="veorq_u16"><div>uint16x8_t <b><b>veorq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veor_u32" type="checkbox"><label for="veor_u32"><div>uint32x2_t <b>veor_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veorq_u32" type="checkbox"><label for="veorq_u32"><div>uint32x4_t <b>veorq_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veor_u64" type="checkbox"><label for="veor_u64"><div>uint64x1_t <b>veor_u64</b> (uint64x1_t a, uint64x1_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="veorq_u64" type="checkbox"><label for="veorq_u64"><div>uint64x2_t <b>veorq_u64</b> (uint64x2_t a, uint64x2_t b)<span class="right">Bitwise exclusive OR</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise Exclusive OR (vector). This instruction performs a bitwise Exclusive OR operation between the two source SIMD&amp;FP registers, and places the result in the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/eor-vector-bitwise-exclusive-or-vector">EOR</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand2;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Ones.0" title="function: bits(N) Ones()">Ones</a>();
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand2 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbic_s8" type="checkbox"><label for="vbic_s8"><div>int8x8_t <b>vbic_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbicq_s8" type="checkbox"><label for="vbicq_s8"><div>int8x16_t <b>vbicq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbic_s16" type="checkbox"><label for="vbic_s16"><div>int16x4_t <b>vbic_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbicq_s16" type="checkbox"><label for="vbicq_s16"><div>int16x8_t <b>vbicq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbic_s32" type="checkbox"><label for="vbic_s32"><div>int32x2_t <b>vbic_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbicq_s32" type="checkbox"><label for="vbicq_s32"><div>int32x4_t <b>vbicq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbic_s64" type="checkbox"><label for="vbic_s64"><div>int64x1_t <b>vbic_s64</b> (int64x1_t a, int64x1_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbicq_s64" type="checkbox"><label for="vbicq_s64"><div>int64x2_t <b>vbicq_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbic_u8" type="checkbox"><label for="vbic_u8"><div>uint8x8_t <b>vbic_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbicq_u8" type="checkbox"><label for="vbicq_u8"><div>uint8x16_t <b>vbicq_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbic_u16" type="checkbox"><label for="vbic_u16"><div>uint16x4_t <b>vbic_u16</b> (uint16x4_t a, uint16x4_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbicq_u16" type="checkbox"><label for="vbicq_u16"><div>uint16x8_t <b>vbicq_u16</b> (uint16x8_t a, uint16x8_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbic_u32" type="checkbox"><label for="vbic_u32"><div>uint32x2_t <b>vbic_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbicq_u32" type="checkbox"><label for="vbicq_u32"><div>uint32x4_t <b>vbicq_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbic_u64" type="checkbox"><label for="vbic_u64"><div>uint64x1_t <b>vbic_u64</b> (uint64x1_t a, uint64x1_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbicq_u64" type="checkbox"><label for="vbicq_u64"><div>uint64x2_t <b>vbicq_u64</b> (uint64x2_t a, uint64x2_t b)<span class="right">Bitwise bit clear</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise bit Clear (vector, register). This instruction performs a bitwise AND between the first source SIMD&amp;FP register and the complement of the second source SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bic-vector-register-bitwise-bit-clear-vector-register">BIC</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 AND operand2;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorn_s8" type="checkbox"><label for="vorn_s8"><div>int8x8_t <b>vorn_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vornq_s8" type="checkbox"><label for="vornq_s8"><div>int8x16_t <b>vornq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorn_s16" type="checkbox"><label for="vorn_s16"><div>int16x4_t <b>vorn_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vornq_s16" type="checkbox"><label for="vornq_s16"><div>int16x8_t <b>vornq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorn_s32" type="checkbox"><label for="vorn_s32"><div>int32x2_t <b>vorn_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vornq_s32" type="checkbox"><label for="vornq_s32"><div>int32x4_t <b>vornq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorn_s64" type="checkbox"><label for="vorn_s64"><div>int64x1_t <b><b>vorn_s64</b></b> (int64x1_t a, int64x1_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vornq_s64" type="checkbox"><label for="vornq_s64"><div>int64x2_t <b><b>vornq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorn_u8" type="checkbox"><label for="vorn_u8"><div>uint8x8_t <b><b>vorn_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vornq_u8" type="checkbox"><label for="vornq_u8"><div>uint8x16_t <b><b>vornq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorn_u16" type="checkbox"><label for="vorn_u16"><div>uint16x4_t <b><b>vorn_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vornq_u16" type="checkbox"><label for="vornq_u16"><div>uint16x8_t <b><b>vornq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorn_u32" type="checkbox"><label for="vorn_u32"><div>uint32x2_t <b><b>vorn_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vornq_u32" type="checkbox"><label for="vornq_u32"><div>uint32x4_t <b><b>vornq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vorn_u64" type="checkbox"><label for="vorn_u64"><div>uint64x1_t <b><b>vorn_u64</b></b> (uint64x1_t a, uint64x1_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vornq_u64" type="checkbox"><label for="vornq_u64"><div>uint64x2_t <b><b>vornq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Bitwise inclusive OR NOT</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise inclusive OR NOT (vector). This instruction performs a bitwise OR NOT between the two source SIMD&amp;FP registers, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/orn-vector-bitwise-inclusive-or-not-vector">ORN</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+operand2 = NOT(operand2);
+
+result = operand1 OR operand2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_s8" type="checkbox"><label for="vbsl_s8"><div>int8x8_t <b><b>vbsl_s8</b></b> (uint8x8_t a, int8x8_t b, int8x8_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_s8" type="checkbox"><label for="vbslq_s8"><div>int8x16_t <b><b>vbslq_s8</b></b> (uint8x16_t a, int8x16_t b, int8x16_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_s16" type="checkbox"><label for="vbsl_s16"><div>int16x4_t <b><b>vbsl_s16</b></b> (uint16x4_t a, int16x4_t b, int16x4_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_s16" type="checkbox"><label for="vbslq_s16"><div>int16x8_t <b><b>vbslq_s16</b></b> (uint16x8_t a, int16x8_t b, int16x8_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_s32" type="checkbox"><label for="vbsl_s32"><div>int32x2_t <b><b>vbsl_s32</b></b> (uint32x2_t a, int32x2_t b, int32x2_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_s32" type="checkbox"><label for="vbslq_s32"><div>int32x4_t <b><b>vbslq_s32</b></b> (uint32x4_t a, int32x4_t b, int32x4_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_s64" type="checkbox"><label for="vbsl_s64"><div>int64x1_t <b><b>vbsl_s64</b></b> (uint64x1_t a, int64x1_t b, int64x1_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_s64" type="checkbox"><label for="vbslq_s64"><div>int64x2_t <b><b>vbslq_s64</b></b> (uint64x2_t a, int64x2_t b, int64x2_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_u8" type="checkbox"><label for="vbsl_u8"><div>uint8x8_t <b><b>vbsl_u8</b></b> (uint8x8_t a, uint8x8_t b, uint8x8_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_u8" type="checkbox"><label for="vbslq_u8"><div>uint8x16_t <b><b>vbslq_u8</b></b> (uint8x16_t a, uint8x16_t b, uint8x16_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_u16" type="checkbox"><label for="vbsl_u16"><div>uint16x4_t <b><b>vbsl_u16</b></b> (uint16x4_t a, uint16x4_t b, uint16x4_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_u16" type="checkbox"><label for="vbslq_u16"><div>uint16x8_t <b><b>vbslq_u16</b></b> (uint16x8_t a, uint16x8_t b, uint16x8_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_u32" type="checkbox"><label for="vbsl_u32"><div>uint32x2_t <b><b>vbsl_u32</b></b> (uint32x2_t a, uint32x2_t b, uint32x2_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_u32" type="checkbox"><label for="vbslq_u32"><div>uint32x4_t <b><b>vbslq_u32</b></b> (uint32x4_t a, uint32x4_t b, uint32x4_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_u64" type="checkbox"><label for="vbsl_u64"><div>uint64x1_t <b><b>vbsl_u64</b></b> (uint64x1_t a, uint64x1_t b, uint64x1_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_u64" type="checkbox"><label for="vbslq_u64"><div>uint64x2_t <b><b>vbslq_u64</b></b> (uint64x2_t a, uint64x2_t b, uint64x2_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_p64" type="checkbox"><label for="vbsl_p64"><div>poly64x1_t <b><b>vbsl_p64</b></b> (poly64x1_t a, poly64x1_t b, poly64x1_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_p64" type="checkbox"><label for="vbslq_p64"><div>poly64x2_t <b><b>vbslq_p64</b></b> (poly64x2_t a, poly64x2_t b, poly64x2_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_f32" type="checkbox"><label for="vbsl_f32"><div>float32x2_t <b><b>vbsl_f32</b></b> (uint32x2_t a, float32x2_t b, float32x2_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_f32" type="checkbox"><label for="vbslq_f32"><div>float32x4_t <b><b>vbslq_f32</b></b> (uint32x4_t a, float32x4_t b, float32x4_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
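In practice the mask argument usually comes from a vector comparison, which produces all-ones or all-zeros lanes, so BSL then behaves as a per-lane select. A short sketch under that assumption (the helper name `select_max` is ours):

```rust
// Per-lane maximum via compare + bitwise select (aarch64 only).
#[cfg(target_arch = "aarch64")]
unsafe fn select_max(
    x: core::arch::aarch64::float32x4_t,
    y: core::arch::aarch64::float32x4_t,
) -> core::arch::aarch64::float32x4_t {
    use core::arch::aarch64::*;
    let mask = vcgtq_f32(x, y); // all-ones lanes where x > y
    vbslq_f32(mask, x, y)       // x where the lane mask is set, else y
}
```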
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_p8" type="checkbox"><label for="vbsl_p8"><div>poly8x8_t <b><b>vbsl_p8</b></b> (uint8x8_t a, poly8x8_t b, poly8x8_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_p8" type="checkbox"><label for="vbslq_p8"><div>poly8x16_t <b><b>vbslq_p8</b></b> (uint8x16_t a, poly8x16_t b, poly8x16_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_p16" type="checkbox"><label for="vbsl_p16"><div>poly16x4_t <b><b>vbsl_p16</b></b> (uint16x4_t a, poly16x4_t b, poly16x4_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbslq_p16" type="checkbox"><label for="vbslq_p16"><div>poly16x8_t <b><b>vbslq_p16</b></b> (uint16x8_t a, poly16x8_t b, poly16x8_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vbsl_f64" type="checkbox"><label for="vbsl_f64"><div>float64x1_t <b><b>vbsl_f64</b></b> (uint64x1_t a, float64x1_t b, float64x1_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+b &rarr; Vn.8B <br />
+c &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vbslq_f64" type="checkbox"><label for="vbslq_f64"><div>float64x2_t <b><b>vbslq_f64</b></b> (uint64x2_t a, float64x2_t b, float64x2_t c)<span class="right">Bitwise select</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Select. This instruction sets each bit in the destination SIMD&amp;FP register to the corresponding bit from the first source SIMD&amp;FP register when the original destination bit was 1, otherwise from the second source SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bsl-bitwise-select">BSL</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+b &rarr; Vn.16B <br />
+c &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
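The same Operation block repeats for every element type because BSL is type-agnostic: only the bytes matter. The identity it relies on can be checked in plain scalar Rust; this is our model of the pseudocode, not code from the diff:

```rust
// Scalar model of `operand1 EOR ((operand1 EOR operand4) AND operand3)`,
// with operand1 = c, operand4 = b, operand3 = a (the mask).
fn bsl_bits(a: u64, b: u64, c: u64) -> u64 {
    c ^ ((c ^ b) & a)
}

fn main() {
    let (a, b, c) = (0xFF00_FF00_FF00_FF00_u64, 0x1234_5678_9ABC_DEF0, 0xFEDC_BA98_7654_3210);
    assert_eq!(bsl_bits(a, b, c), (a & b) | (!a & c)); // textbook select form
}
```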
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_s8" type="checkbox"><label for="vcopy_lane_s8"><div>int8x8_t <b><b>vcopy_lane_s8</b></b> (int8x8_t a, const int lane1, int8x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+0 &lt;= lane1 &lt;= 7 <br />
+b &rarr; Vn.8B <br />
+0 &lt;= lane2 &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
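Unlike the C signature above, stdarch exposes the two lane numbers as const generics rather than runtime `const int` arguments. A sketch, assuming the const-generic form used by `core::arch::aarch64` (`copy_lane_demo` is an illustrative name):

```rust
#[cfg(target_arch = "aarch64")]
fn copy_lane_demo() {
    use core::arch::aarch64::*;
    unsafe {
        let a = vdup_n_s8(1);
        let b = vdup_n_s8(9);
        // INS Vd.B[3], Vn.B[0]: lane 3 of `a` now holds lane 0 of `b`.
        let r = vcopy_lane_s8::<3, 0>(a, b);
        assert_eq!(vget_lane_s8::<3>(r), 9);
        assert_eq!(vget_lane_s8::<0>(r), 1); // other lanes keep `a`'s values
    }
}
```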
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_s8" type="checkbox"><label for="vcopyq_lane_s8"><div>int8x16_t <b><b>vcopyq_lane_s8</b></b> (int8x16_t a, const int lane1, int8x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+0 &lt;= lane1 &lt;= 15 <br />
+b &rarr; Vn.8B <br />
+0 &lt;= lane2 &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_s16" type="checkbox"><label for="vcopy_lane_s16"><div>int16x4_t <b><b>vcopy_lane_s16</b></b> (int16x4_t a, const int lane1, int16x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+0 &lt;= lane1 &lt;= 3 <br />
+b &rarr; Vn.4H <br />
+0 &lt;= lane2 &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_s16" type="checkbox"><label for="vcopyq_lane_s16"><div>int16x8_t <b><b>vcopyq_lane_s16</b></b> (int16x8_t a, const int lane1, int16x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+0 &lt;= lane1 &lt;= 7 <br />
+b &rarr; Vn.4H <br />
+0 &lt;= lane2 &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_s32" type="checkbox"><label for="vcopy_lane_s32"><div>int32x2_t <b><b>vcopy_lane_s32</b></b> (int32x2_t a, const int lane1, int32x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.2S <br />
+0 &lt;= lane2 &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_s32" type="checkbox"><label for="vcopyq_lane_s32"><div>int32x4_t <b><b>vcopyq_lane_s32</b></b> (int32x4_t a, const int lane1, int32x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+0 &lt;= lane1 &lt;= 3 <br />
+b &rarr; Vn.2S <br />
+0 &lt;= lane2 &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_s64" type="checkbox"><label for="vcopy_lane_s64"><div>int64x1_t <b><b>vcopy_lane_s64</b></b> (int64x1_t a, const int lane1, int64x1_t b, const int lane2)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element of the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; UNUSED <br />
+0 &lt;= lane1 &lt;= 0 <br />
+b &rarr; Vn.1D <br />
+0 &lt;= lane2 &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
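The s64 case is the degenerate one: both vectors have a single lane (both lane arguments must be 0, and `a` is UNUSED above), so the copy reduces to returning `b` and can be emitted as the DUP shown. A sketch under the same const-generic assumption as before:

```rust
#[cfg(target_arch = "aarch64")]
fn copy_lane_s64_demo() {
    use core::arch::aarch64::*;
    unsafe {
        let a = vdup_n_s64(7);  // unused by the operation itself
        let b = vdup_n_s64(42);
        let r = vcopy_lane_s64::<0, 0>(a, b);
        assert_eq!(vget_lane_s64::<0>(r), 42);
    }
}
```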
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_s64" type="checkbox"><label for="vcopyq_lane_s64"><div>int64x2_t <b><b>vcopyq_lane_s64</b></b> (int64x2_t a, const int lane1, int64x1_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane1],Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.1D <br />
+0 &lt;= lane2 &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_u8" type="checkbox"><label for="vcopy_lane_u8"><div>uint8x8_t <b><b>vcopy_lane_u8</b></b> (uint8x8_t a, const int lane1, uint8x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+0 &lt;= lane1 &lt;= 7 <br />
+b &rarr; Vn.8B <br />
+0 &lt;= lane2 &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_u8" type="checkbox"><label for="vcopyq_lane_u8"><div>uint8x16_t <b><b>vcopyq_lane_u8</b></b> (uint8x16_t a, const int lane1, uint8x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+0 &lt;= lane1 &lt;= 15 <br />
+b &rarr; Vn.8B <br />
+0 &lt;= lane2 &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_u16" type="checkbox"><label for="vcopy_lane_u16"><div>uint16x4_t <b><b>vcopy_lane_u16</b></b> (uint16x4_t a, const int lane1, uint16x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+0 &lt;= lane1 &lt;= 3 <br />
+b &rarr; Vn.4H <br />
+0 &lt;= lane2 &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_u16" type="checkbox"><label for="vcopyq_lane_u16"><div>uint16x8_t <b>vcopyq_lane_u16</b> (uint16x8_t a, const int lane1, uint16x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+0 &lt;= lane1 &lt;= 7 <br />
+b &rarr; Vn.4H <br />
+0 &lt;= lane2 &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_u32" type="checkbox"><label for="vcopy_lane_u32"><div>uint32x2_t <b>vcopy_lane_u32</b> (uint32x2_t a, const int lane1, uint32x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.2S <br />
+0 &lt;= lane2 &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_u32" type="checkbox"><label for="vcopyq_lane_u32"><div>uint32x4_t <b>vcopyq_lane_u32</b> (uint32x4_t a, const int lane1, uint32x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+0 &lt;= lane1 &lt;= 3 <br />
+b &rarr; Vn.2S <br />
+0 &lt;= lane2 &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
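+ <h4>Example</h4> <p>A usage sketch (an editorial addition, not part of the ARM reference): the q form inserts into a 128-bit destination from a 64-bit source vector.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Replace lane 2 of the 4-lane accumulator with lane 0 of the 2-lane input. */
+uint32x4_t insert_low_lane(uint32x4_t acc, uint32x2_t in) {
+    return vcopyq_lane_u32(acc, 2, in, 0);
+}</pre>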
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_u64" type="checkbox"><label for="vcopy_lane_u64"><div>uint64x1_t <b>vcopy_lane_u64</b> (uint64x1_t a, const int lane1, uint64x1_t b, const int lane2)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element of the source SIMD&amp;FP register into a scalar or each element of a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; UNUSED <br />
+0 &lt;= lane1 &lt;= 0 <br />
+b &rarr; Vn.1D <br />
+0 &lt;= lane2 &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
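+ <h4>Example</h4> <p>A usage sketch (an editorial addition, not part of the ARM reference): with one-lane vectors the only valid indices are 0, so the call simply yields <code>b</code>.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Both vectors have a single lane; lane1 and lane2 must be 0. */
+uint64x1_t replace_scalar(uint64x1_t a, uint64x1_t b) {
+    return vcopy_lane_u64(a, 0, b, 0); /* result equals b */
+}</pre>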
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_u64" type="checkbox"><label for="vcopyq_lane_u64"><div>uint64x2_t <b>vcopyq_lane_u64</b> (uint64x2_t a, const int lane1, uint64x1_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane1],Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.1D <br />
+0 &lt;= lane2 &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_p64" type="checkbox"><label for="vcopy_lane_p64"><div>poly64x1_t <b>vcopy_lane_p64</b> (poly64x1_t a, const int lane1, poly64x1_t b, const int lane2)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element of the source SIMD&amp;FP register into a scalar or each element of a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; UNUSED <br />
+0 &lt;= lane1 &lt;= 0 <br />
+b &rarr; Vn.1D <br />
+0 &lt;= lane2 &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_p64" type="checkbox"><label for="vcopyq_lane_p64"><div>poly64x2_t <b>vcopyq_lane_p64</b> (poly64x2_t a, const int lane1, poly64x1_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane1],Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.1D <br />
+0 &lt;= lane2 &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_f32" type="checkbox"><label for="vcopy_lane_f32"><div>float32x2_t <b>vcopy_lane_f32</b> (float32x2_t a, const int lane1, float32x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.2S <br />
+0 &lt;= lane2 &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_f32" type="checkbox"><label for="vcopyq_lane_f32"><div>float32x4_t <b>vcopyq_lane_f32</b> (float32x4_t a, const int lane1, float32x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+0 &lt;= lane1 &lt;= 3 <br />
+b &rarr; Vn.2S <br />
+0 &lt;= lane2 &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
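+ <h4>Example</h4> <p>A usage sketch (an editorial addition, not part of the ARM reference): a lane-to-lane move that avoids a round trip through scalar floating-point registers.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Put lane 1 of the 2-lane vector into lane 3 of the 4-lane vector. */
+float32x4_t set_top_lane(float32x4_t v, float32x2_t w) {
+    return vcopyq_lane_f32(v, 3, w, 1);
+}</pre>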
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_f64" type="checkbox"><label for="vcopy_lane_f64"><div>float64x1_t <b>vcopy_lane_f64</b> (float64x1_t a, const int lane1, float64x1_t b, const int lane2)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element of the source SIMD&amp;FP register into a scalar or each element of a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; UNUSED <br />
+0 &lt;= lane1 &lt;= 0 <br />
+b &rarr; Vn.1D <br />
+0 &lt;= lane2 &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_f64" type="checkbox"><label for="vcopyq_lane_f64"><div>float64x2_t <b>vcopyq_lane_f64</b> (float64x2_t a, const int lane1, float64x1_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane1],Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.1D <br />
+0 &lt;= lane2 &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_p8" type="checkbox"><label for="vcopy_lane_p8"><div>poly8x8_t <b>vcopy_lane_p8</b> (poly8x8_t a, const int lane1, poly8x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+0 &lt;= lane1 &lt;= 7 <br />
+b &rarr; Vn.8B <br />
+0 &lt;= lane2 &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_p8" type="checkbox"><label for="vcopyq_lane_p8"><div>poly8x16_t <b>vcopyq_lane_p8</b> (poly8x16_t a, const int lane1, poly8x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+0 &lt;= lane1 &lt;= 15 <br />
+b &rarr; Vn.8B <br />
+0 &lt;= lane2 &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_lane_p16" type="checkbox"><label for="vcopy_lane_p16"><div>poly16x4_t <b>vcopy_lane_p16</b> (poly16x4_t a, const int lane1, poly16x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+0 &lt;= lane1 &lt;= 3 <br />
+b &rarr; Vn.4H <br />
+0 &lt;= lane2 &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_lane_p16" type="checkbox"><label for="vcopyq_lane_p16"><div>poly16x8_t <b>vcopyq_lane_p16</b> (poly16x8_t a, const int lane1, poly16x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+0 &lt;= lane1 &lt;= 7 <br />
+b &rarr; Vn.4H <br />
+0 &lt;= lane2 &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_s8" type="checkbox"><label for="vcopy_laneq_s8"><div>int8x8_t <b>vcopy_laneq_s8</b> (int8x8_t a, const int lane1, int8x16_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+0 &lt;= lane1 &lt;= 7 <br />
+b &rarr; Vn.16B <br />
+0 &lt;= lane2 &lt;= 15 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
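+ <h4>Example</h4> <p>A usage sketch (an editorial addition, not part of the ARM reference): the laneq form reads its source lane from a 128-bit vector, so lane2 may range up to 15.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy the highest byte of the 16-lane source into lane 0 of the 8-lane destination. */
+int8x8_t take_high_byte(int8x8_t dst, int8x16_t src) {
+    return vcopy_laneq_s8(dst, 0, src, 15);
+}</pre>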
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_s8" type="checkbox"><label for="vcopyq_laneq_s8"><div>int8x16_t <b>vcopyq_laneq_s8</b> (int8x16_t a, const int lane1, int8x16_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+0 &lt;= lane1 &lt;= 15 <br />
+b &rarr; Vn.16B <br />
+0 &lt;= lane2 &lt;= 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_s16" type="checkbox"><label for="vcopy_laneq_s16"><div>int16x4_t <b>vcopy_laneq_s16</b> (int16x4_t a, const int lane1, int16x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+0 &lt;= lane1 &lt;= 3 <br />
+b &rarr; Vn.8H <br />
+0 &lt;= lane2 &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_s16" type="checkbox"><label for="vcopyq_laneq_s16"><div>int16x8_t <b>vcopyq_laneq_s16</b> (int16x8_t a, const int lane1, int16x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+0 &lt;= lane1 &lt;= 7 <br />
+b &rarr; Vn.8H <br />
+0 &lt;= lane2 &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_s32" type="checkbox"><label for="vcopy_laneq_s32"><div>int32x2_t <b>vcopy_laneq_s32</b> (int32x2_t a, const int lane1, int32x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.4S <br />
+0 &lt;= lane2 &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_s32" type="checkbox"><label for="vcopyq_laneq_s32"><div>int32x4_t <b>vcopyq_laneq_s32</b> (int32x4_t a, const int lane1, int32x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+0 &lt;= lane1 &lt;= 3 <br />
+b &rarr; Vn.4S <br />
+0 &lt;= lane2 &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_s64" type="checkbox"><label for="vcopy_laneq_s64"><div>int64x1_t <b>vcopy_laneq_s64</b> (int64x1_t a, const int lane1, int64x2_t b, const int lane2)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element of the source SIMD&amp;FP register into a scalar or each element of a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; UNUSED <br />
+0 &lt;= lane1 &lt;= 0 <br />
+b &rarr; Vn.2D <br />
+0 &lt;= lane2 &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_s64" type="checkbox"><label for="vcopyq_laneq_s64"><div>int64x2_t <b>vcopyq_laneq_s64</b> (int64x2_t a, const int lane1, int64x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane1],Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+0 &lt;= lane1 &lt;= 1 <br />
+b &rarr; Vn.2D <br />
+0 &lt;= lane2 &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_u8" type="checkbox"><label for="vcopy_laneq_u8"><div>uint8x8_t <b><b>vcopy_laneq_u8</b></b> (uint8x8_t a, const int lane1, uint8x16_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+0 &lt;&lt; lane1 &lt;&lt; 7 <br />
+b &rarr; Vn.16B <br />
+0 &lt;&lt; lane2 &lt;&lt; 15 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_u8" type="checkbox"><label for="vcopyq_laneq_u8"><div>uint8x16_t <b><b>vcopyq_laneq_u8</b></b> (uint8x16_t a, const int lane1, uint8x16_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+0 &lt;&lt; lane1 &lt;&lt; 15 <br />
+b &rarr; Vn.16B <br />
+0 &lt;&lt; lane2 &lt;&lt; 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_u16" type="checkbox"><label for="vcopy_laneq_u16"><div>uint16x4_t <b><b>vcopy_laneq_u16</b></b> (uint16x4_t a, const int lane1, uint16x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+0 &lt;&lt; lane1 &lt;&lt; 3 <br />
+b &rarr; Vn.8H <br />
+0 &lt;&lt; lane2 &lt;&lt; 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_u16" type="checkbox"><label for="vcopyq_laneq_u16"><div>uint16x8_t <b><b>vcopyq_laneq_u16</b></b> (uint16x8_t a, const int lane1, uint16x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+0 &lt;&lt; lane1 &lt;&lt; 7 <br />
+b &rarr; Vn.8H <br />
+0 &lt;&lt; lane2 &lt;&lt; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_u32" type="checkbox"><label for="vcopy_laneq_u32"><div>uint32x2_t <b><b>vcopy_laneq_u32</b></b> (uint32x2_t a, const int lane1, uint32x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+0 &lt;&lt; lane1 &lt;&lt; 1 <br />
+b &rarr; Vn.4S <br />
+0 &lt;&lt; lane2 &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_u32" type="checkbox"><label for="vcopyq_laneq_u32"><div>uint32x4_t <b><b>vcopyq_laneq_u32</b></b> (uint32x4_t a, const int lane1, uint32x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+0 &lt;&lt; lane1 &lt;&lt; 3 <br />
+b &rarr; Vn.4S <br />
+0 &lt;&lt; lane2 &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_u64" type="checkbox"><label for="vcopy_laneq_u64"><div>uint64x1_t <b><b>vcopy_laneq_u64</b></b> (uint64x1_t a, const int lane1, uint64x2_t b, const int lane2)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; UNUSED <br />
+0 &lt;&lt; lane1 &lt;&lt; 0 <br />
+b &rarr; Vn.2D <br />
+0 &lt;&lt; lane2 &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_u64" type="checkbox"><label for="vcopyq_laneq_u64"><div>uint64x2_t <b><b>vcopyq_laneq_u64</b></b> (uint64x2_t a, const int lane1, uint64x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane1],Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+0 &lt;&lt; lane1 &lt;&lt; 1 <br />
+b &rarr; Vn.2D <br />
+0 &lt;&lt; lane2 &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_p64" type="checkbox"><label for="vcopy_laneq_p64"><div>poly64x1_t <b><b>vcopy_laneq_p64</b></b> (poly64x1_t a, const int lane1, poly64x2_t b, const int lane2)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; UNUSED <br />
+0 &lt;&lt; lane1 &lt;&lt; 0 <br />
+b &rarr; Vn.2D <br />
+0 &lt;&lt; lane2 &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_p64" type="checkbox"><label for="vcopyq_laneq_p64"><div>poly64x2_t <b><b>vcopyq_laneq_p64</b></b> (poly64x2_t a, const int lane1, poly64x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane1],Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+0 &lt;&lt; lane1 &lt;&lt; 1 <br />
+b &rarr; Vn.2D <br />
+0 &lt;&lt; lane2 &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_f32" type="checkbox"><label for="vcopy_laneq_f32"><div>float32x2_t <b><b>vcopy_laneq_f32</b></b> (float32x2_t a, const int lane1, float32x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+0 &lt;&lt; lane1 &lt;&lt; 1 <br />
+b &rarr; Vn.4S <br />
+0 &lt;&lt; lane2 &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_f32" type="checkbox"><label for="vcopyq_laneq_f32"><div>float32x4_t <b><b>vcopyq_laneq_f32</b></b> (float32x4_t a, const int lane1, float32x4_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane1],Vn.S[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+0 &lt;&lt; lane1 &lt;&lt; 3 <br />
+b &rarr; Vn.4S <br />
+0 &lt;&lt; lane2 &lt;&lt; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_f64" type="checkbox"><label for="vcopy_laneq_f64"><div>float64x1_t <b><b>vcopy_laneq_f64</b></b> (float64x1_t a, const int lane1, float64x2_t b, const int lane2)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; UNUSED <br />
+0 &lt;&lt; lane1 &lt;&lt; 0 <br />
+b &rarr; Vn.2D <br />
+0 &lt;&lt; lane2 &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_f64" type="checkbox"><label for="vcopyq_laneq_f64"><div>float64x2_t <b><b>vcopyq_laneq_f64</b></b> (float64x2_t a, const int lane1, float64x2_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane1],Vn.D[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+0 &lt;&lt; lane1 &lt;&lt; 1 <br />
+b &rarr; Vn.2D <br />
+0 &lt;&lt; lane2 &lt;&lt; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_p8" type="checkbox"><label for="vcopy_laneq_p8"><div>poly8x8_t <b><b>vcopy_laneq_p8</b></b> (poly8x8_t a, const int lane1, poly8x16_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+0 &lt;&lt; lane1 &lt;&lt; 7 <br />
+b &rarr; Vn.16B <br />
+0 &lt;&lt; lane2 &lt;&lt; 15 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_p8" type="checkbox"><label for="vcopyq_laneq_p8"><div>poly8x16_t <b><b>vcopyq_laneq_p8</b></b> (poly8x16_t a, const int lane1, poly8x16_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane1],Vn.B[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+0 &lt;&lt; lane1 &lt;&lt; 15 <br />
+b &rarr; Vn.16B <br />
+0 &lt;&lt; lane2 &lt;&lt; 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopy_laneq_p16" type="checkbox"><label for="vcopy_laneq_p16"><div>poly16x4_t <b><b>vcopy_laneq_p16</b></b> (poly16x4_t a, const int lane1, poly16x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+0 &lt;&lt; lane1 &lt;&lt; 3 <br />
+b &rarr; Vn.8H <br />
+0 &lt;&lt; lane2 &lt;&lt; 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcopyq_laneq_p16" type="checkbox"><label for="vcopyq_laneq_p16"><div>poly16x8_t <b><b>vcopyq_laneq_p16</b></b> (poly16x8_t a, const int lane1, poly16x8_t b, const int lane2)<span class="right">Insert vector element from another vector element</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from another vector element. This instruction copies the vector element of the source SIMD&amp;FP register to the specified vector element of the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane1],Vn.H[lane2]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+0 &lt;&lt; lane1 &lt;&lt; 7 <br />
+b &rarr; Vn.8H <br />
+0 &lt;&lt; lane2 &lt;&lt; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], lane2, esize];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrbit_s8" type="checkbox"><label for="vrbit_s8"><div>int8x8_t <b><b>vrbit_s8</b></b> (int8x8_t a)<span class="right">Reverse bit order</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse Bit order (vector). This instruction reads each vector element from the source SIMD&amp;FP register, reverses the bits of the element, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rbit-vector-reverse-bit-order-vector">RBIT</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+bits(esize) rev;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ for i = 0 to esize-1
+ rev&lt;esize-1-i&gt; = element&lt;i&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = rev;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrbitq_s8" type="checkbox"><label for="vrbitq_s8"><div>int8x16_t <b><b>vrbitq_s8</b></b> (int8x16_t a)<span class="right">Reverse bit order</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse Bit order (vector). This instruction reads each vector element from the source SIMD&amp;FP register, reverses the bits of the element, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rbit-vector-reverse-bit-order-vector">RBIT</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+bits(esize) rev;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ for i = 0 to esize-1
+ rev&lt;esize-1-i&gt; = element&lt;i&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = rev;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrbit_u8" type="checkbox"><label for="vrbit_u8"><div>uint8x8_t <b><b>vrbit_u8</b></b> (uint8x8_t a)<span class="right">Reverse bit order</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse Bit order (vector). This instruction reads each vector element from the source SIMD&amp;FP register, reverses the bits of the element, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rbit-vector-reverse-bit-order-vector">RBIT</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+bits(esize) rev;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ for i = 0 to esize-1
+ rev&lt;esize-1-i&gt; = element&lt;i&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = rev;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrbitq_u8" type="checkbox"><label for="vrbitq_u8"><div>uint8x16_t <b><b>vrbitq_u8</b></b> (uint8x16_t a)<span class="right">Reverse bit order</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse Bit order (vector). This instruction reads each vector element from the source SIMD&amp;FP register, reverses the bits of the element, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rbit-vector-reverse-bit-order-vector">RBIT</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+bits(esize) rev;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ for i = 0 to esize-1
+ rev&lt;esize-1-i&gt; = element&lt;i&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = rev;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrbit_p8" type="checkbox"><label for="vrbit_p8"><div>poly8x8_t <b><b>vrbit_p8</b></b> (poly8x8_t a)<span class="right">Reverse bit order</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse Bit order (vector). This instruction reads each vector element from the source SIMD&amp;FP register, reverses the bits of the element, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rbit-vector-reverse-bit-order-vector">RBIT</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+bits(esize) rev;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ for i = 0 to esize-1
+ rev&lt;esize-1-i&gt; = element&lt;i&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = rev;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrbitq_p8" type="checkbox"><label for="vrbitq_p8"><div>poly8x16_t <b><b>vrbitq_p8</b></b> (poly8x16_t a)<span class="right">Reverse bit order</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse Bit order (vector). This instruction reads each vector element from the source SIMD&amp;FP register, reverses the bits of the element, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rbit-vector-reverse-bit-order-vector">RBIT</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+bits(esize) rev;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ for i = 0 to esize-1
+ rev&lt;esize-1-i&gt; = element&lt;i&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = rev;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcreate_s8" type="checkbox"><label for="vcreate_s8"><div>int8x8_t <b><b>vcreate_s8</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_s16" type="checkbox"><label for="vcreate_s16"><div>int16x4_t <b><b>vcreate_s16</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_s32" type="checkbox"><label for="vcreate_s32"><div>int32x2_t <b><b>vcreate_s32</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_s64" type="checkbox"><label for="vcreate_s64"><div>int64x1_t <b><b>vcreate_s64</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_u8" type="checkbox"><label for="vcreate_u8"><div>uint8x8_t <b><b>vcreate_u8</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_u16" type="checkbox"><label for="vcreate_u16"><div>uint16x4_t <b><b>vcreate_u16</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_u32" type="checkbox"><label for="vcreate_u32"><div>uint32x2_t <b><b>vcreate_u32</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_u64" type="checkbox"><label for="vcreate_u64"><div>uint64x1_t <b><b>vcreate_u64</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_p64" type="checkbox"><label for="vcreate_p64"><div>poly64x1_t <b><b>vcreate_p64</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_f16" type="checkbox"><label for="vcreate_f16"><div>float16x4_t <b><b>vcreate_f16</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_f32" type="checkbox"><label for="vcreate_f32"><div>float32x2_t <b><b>vcreate_f32</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_p8" type="checkbox"><label for="vcreate_p8"><div>poly8x8_t <b><b>vcreate_p8</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_p16" type="checkbox"><label for="vcreate_p16"><div>poly16x4_t <b><b>vcreate_p16</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcreate_f64" type="checkbox"><label for="vcreate_f64"><div>float64x1_t <b><b>vcreate_f64</b></b> (uint64_t a)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_s8" type="checkbox"><label for="vdup_n_s8"><div>int8x8_t <b><b>vdup_n_s8</b></b> (int8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_s8" type="checkbox"><label for="vdupq_n_s8"><div>int8x16_t <b><b>vdupq_n_s8</b></b> (int8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_s16" type="checkbox"><label for="vdup_n_s16"><div>int16x4_t <b><b>vdup_n_s16</b></b> (int16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_s16" type="checkbox"><label for="vdupq_n_s16"><div>int16x8_t <b><b>vdupq_n_s16</b></b> (int16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_s32" type="checkbox"><label for="vdup_n_s32"><div>int32x2_t <b><b>vdup_n_s32</b></b> (int32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_s32" type="checkbox"><label for="vdupq_n_s32"><div>int32x4_t <b><b>vdupq_n_s32</b></b> (int32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_s64" type="checkbox"><label for="vdup_n_s64"><div>int64x1_t <b><b>vdup_n_s64</b></b> (int64_t value)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[0],Xn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; Xn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_s64" type="checkbox"><label for="vdupq_n_s64"><div>int64x2_t <b><b>vdupq_n_s64</b></b> (int64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_u8" type="checkbox"><label for="vdup_n_u8"><div>uint8x8_t <b><b>vdup_n_u8</b></b> (uint8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_u8" type="checkbox"><label for="vdupq_n_u8"><div>uint8x16_t <b><b>vdupq_n_u8</b></b> (uint8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_u16" type="checkbox"><label for="vdup_n_u16"><div>uint16x4_t <b><b>vdup_n_u16</b></b> (uint16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_u16" type="checkbox"><label for="vdupq_n_u16"><div>uint16x8_t <b><b>vdupq_n_u16</b></b> (uint16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_u32" type="checkbox"><label for="vdup_n_u32"><div>uint32x2_t <b><b>vdup_n_u32</b></b> (uint32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_u32" type="checkbox"><label for="vdupq_n_u32"><div>uint32x4_t <b><b>vdupq_n_u32</b></b> (uint32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_u64" type="checkbox"><label for="vdup_n_u64"><div>uint64x1_t <b><b>vdup_n_u64</b></b> (uint64_t value)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Dd.D[0],xn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_u64" type="checkbox"><label for="vdupq_n_u64"><div>uint64x2_t <b><b>vdupq_n_u64</b></b> (uint64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_p64" type="checkbox"><label for="vdup_n_p64"><div>poly64x1_t <b><b>vdup_n_p64</b></b> (poly64_t value)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Dd.D[0],xn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_p64" type="checkbox"><label for="vdupq_n_p64"><div>poly64x2_t <b><b>vdupq_n_p64</b></b> (poly64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_f32" type="checkbox"><label for="vdup_n_f32"><div>float32x2_t <b><b>vdup_n_f32</b></b> (float32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_f32" type="checkbox"><label for="vdupq_n_f32"><div>float32x4_t <b><b>vdupq_n_f32</b></b> (float32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_p8" type="checkbox"><label for="vdup_n_p8"><div>poly8x8_t <b><b>vdup_n_p8</b></b> (poly8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_p8" type="checkbox"><label for="vdupq_n_p8"><div>poly8x16_t <b><b>vdupq_n_p8</b></b> (poly8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_p16" type="checkbox"><label for="vdup_n_p16"><div>poly16x4_t <b><b>vdup_n_p16</b></b> (poly16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_p16" type="checkbox"><label for="vdupq_n_p16"><div>poly16x8_t <b><b>vdupq_n_p16</b></b> (poly16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_n_f64" type="checkbox"><label for="vdup_n_f64"><div>float64x1_t <b><b>vdup_n_f64</b></b> (float64_t value)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Dd.D[0],xn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_n_f64" type="checkbox"><label for="vdupq_n_f64"><div>float64x2_t <b><b>vdupq_n_f64</b></b> (float64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_s8" type="checkbox"><label for="vmov_n_s8"><div>int8x8_t <b><b>vmov_n_s8</b></b> (int8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_s8" type="checkbox"><label for="vmovq_n_s8"><div>int8x16_t <b><b>vmovq_n_s8</b></b> (int8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_s16" type="checkbox"><label for="vmov_n_s16"><div>int16x4_t <b><b>vmov_n_s16</b></b> (int16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_s16" type="checkbox"><label for="vmovq_n_s16"><div>int16x8_t <b><b>vmovq_n_s16</b></b> (int16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_s32" type="checkbox"><label for="vmov_n_s32"><div>int32x2_t <b><b>vmov_n_s32</b></b> (int32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_s32" type="checkbox"><label for="vmovq_n_s32"><div>int32x4_t <b><b>vmovq_n_s32</b></b> (int32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_s64" type="checkbox"><label for="vmov_n_s64"><div>int64x1_t <b><b>vmov_n_s64</b></b> (int64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_s64" type="checkbox"><label for="vmovq_n_s64"><div>int64x2_t <b><b>vmovq_n_s64</b></b> (int64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_u8" type="checkbox"><label for="vmov_n_u8"><div>uint8x8_t <b><b>vmov_n_u8</b></b> (uint8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_u8" type="checkbox"><label for="vmovq_n_u8"><div>uint8x16_t <b><b>vmovq_n_u8</b></b> (uint8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_u16" type="checkbox"><label for="vmov_n_u16"><div>uint16x4_t <b><b>vmov_n_u16</b></b> (uint16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_u16" type="checkbox"><label for="vmovq_n_u16"><div>uint16x8_t <b><b>vmovq_n_u16</b></b> (uint16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_u32" type="checkbox"><label for="vmov_n_u32"><div>uint32x2_t <b><b>vmov_n_u32</b></b> (uint32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_u32" type="checkbox"><label for="vmovq_n_u32"><div>uint32x4_t <b><b>vmovq_n_u32</b></b> (uint32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_u64" type="checkbox"><label for="vmov_n_u64"><div>uint64x1_t <b><b>vmov_n_u64</b></b> (uint64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_u64" type="checkbox"><label for="vmovq_n_u64"><div>uint64x2_t <b><b>vmovq_n_u64</b></b> (uint64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_f32" type="checkbox"><label for="vmov_n_f32"><div>float32x2_t <b><b>vmov_n_f32</b></b> (float32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_f32" type="checkbox"><label for="vmovq_n_f32"><div>float32x4_t <b><b>vmovq_n_f32</b></b> (float32_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
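+ <h4>Usage example</h4> <p class="aml">A brief, non-normative C sketch (assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; function and buffer names are illustrative): replicating a scalar into all four 32-bit lanes is a common way to prepare a constant operand for lane-wise arithmetic.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Scale every element of a length-4 float buffer by the same factor. */
+void scale4(float *buf, float factor) {
+    float32x4_t f = vmovq_n_f32(factor);  /* {factor, factor, factor, factor} */
+    float32x4_t x = vld1q_f32(buf);       /* load four floats */
+    vst1q_f32(buf, vmulq_f32(x, f));      /* multiply lane-wise and store back */
+}</pre>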
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_p8" type="checkbox"><label for="vmov_n_p8"><div>poly8x8_t <b><b>vmov_n_p8</b></b> (poly8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_p8" type="checkbox"><label for="vmovq_n_p8"><div>poly8x16_t <b><b>vmovq_n_p8</b></b> (poly8_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_p16" type="checkbox"><label for="vmov_n_p16"><div>poly16x4_t <b><b>vmov_n_p16</b></b> (poly16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_p16" type="checkbox"><label for="vmovq_n_p16"><div>poly16x8_t <b><b>vmovq_n_p16</b></b> (poly16_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vmov_n_f64" type="checkbox"><label for="vmov_n_f64"><div>float64x1_t <b><b>vmov_n_f64</b></b> (float64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
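+ <h4>Usage example</h4> <p class="aml">A short, non-normative C sketch (A64 only, assuming &lt;arm_neon.h&gt;; the wrapper name is illustrative): float64x1_t holds a single lane, so the broadcast simply moves the scalar into the SIMD&amp;FP register file.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Wrap a double in a one-lane vector so it can feed other float64x1_t operations. */
+float64x1_t make_lane(double x) {
+    return vmov_n_f64(x);  /* single lane containing x */
+}</pre>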
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmovq_n_f64" type="checkbox"><label for="vmovq_n_f64"><div>float64x2_t <b><b>vmovq_n_f64</b></b> (float64_t value)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,rn
+</pre> <h4>Argument Preparation</h4><pre>value &rarr; rn </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_s8" type="checkbox"><label for="vdup_lane_s8"><div>int8x8_t <b><b>vdup_lane_s8</b></b> (int8x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.8B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
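+ <h4>Usage example</h4> <p class="aml">A non-normative C sketch (assuming &lt;arm_neon.h&gt;; the wrapper name is illustrative): the lane index must be a compile-time constant in the stated range; here lane 3 is broadcast to all eight lanes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Broadcast lane 3 of the input vector across all eight result lanes. */
+int8x8_t splat_lane3(int8x8_t v) {
+    return vdup_lane_s8(v, 3);  /* every lane = v[3] */
+}</pre>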
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_s8" type="checkbox"><label for="vdupq_lane_s8"><div>int8x16_t <b><b>vdupq_lane_s8</b></b> (int8x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.16B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_s16" type="checkbox"><label for="vdup_lane_s16"><div>int16x4_t <b><b>vdup_lane_s16</b></b> (int16x4_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.4H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_s16" type="checkbox"><label for="vdupq_lane_s16"><div>int16x8_t <b><b>vdupq_lane_s16</b></b> (int16x4_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.8H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
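+ <h4>Usage example</h4> <p class="aml">A non-normative C sketch (assuming &lt;arm_neon.h&gt;; the wrapper name is illustrative): the source is a 64-bit vector but the result is 128-bit, so one 16-bit lane fans out into eight lanes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Widening broadcast: take lane 1 of a 4-lane vector into all 8 lanes. */
+int16x8_t splat_lane1(int16x4_t v) {
+    return vdupq_lane_s16(v, 1);  /* every lane = v[1] */
+}</pre>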
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_s32" type="checkbox"><label for="vdup_lane_s32"><div>int32x2_t <b><b>vdup_lane_s32</b></b> (int32x2_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.2S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_s32" type="checkbox"><label for="vdupq_lane_s32"><div>int32x4_t <b><b>vdupq_lane_s32</b></b> (int32x2_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.4S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_s64" type="checkbox"><label for="vdup_lane_s64"><div>int64x1_t <b><b>vdup_lane_s64</b></b> (int64x1_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_s64" type="checkbox"><label for="vdupq_lane_s64"><div>int64x2_t <b><b>vdupq_lane_s64</b></b> (int64x1_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.2D,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_u8" type="checkbox"><label for="vdup_lane_u8"><div>uint8x8_t <b><b>vdup_lane_u8</b></b> (uint8x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.8B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_u8" type="checkbox"><label for="vdupq_lane_u8"><div>uint8x16_t <b><b>vdupq_lane_u8</b></b> (uint8x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.16B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_u16" type="checkbox"><label for="vdup_lane_u16"><div>uint16x4_t <b><b>vdup_lane_u16</b></b> (uint16x4_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.4H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_u16" type="checkbox"><label for="vdupq_lane_u16"><div>uint16x8_t <b><b>vdupq_lane_u16</b></b> (uint16x4_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-element-duplicate-vector-element-to-vector-or-scalar">DUP</a> Vd.8H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_u32" type="checkbox"><label for="vdup_lane_u32"><div>uint32x2_t <b>vdup_lane_u32</b> (uint32x2_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_u32" type="checkbox"><label for="vdupq_lane_u32"><div>uint32x4_t <b>vdupq_lane_u32</b> (uint32x2_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
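+ <h4>Example</h4> <p class="aml">An illustrative C sketch (not part of the ARM reference): the q-form reads a lane of a 64-bit source and fills a 128-bit result. The function name is hypothetical.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Widening broadcast: lane 1 of the 64-bit source fills all four
+   32-bit lanes of the 128-bit result (0 &lt;= lane &lt;= 1). */
+uint32x4_t splat_high(uint32x2_t v) {
+    return vdupq_lane_u32(v, 1); /* {a,b} -&gt; {b,b,b,b} */
+}</pre>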
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_u64" type="checkbox"><label for="vdup_lane_u64"><div>uint64x1_t <b>vdup_lane_u64</b> (uint64x1_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_u64" type="checkbox"><label for="vdupq_lane_u64"><div>uint64x2_t <b>vdupq_lane_u64</b> (uint64x1_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_p64" type="checkbox"><label for="vdup_lane_p64"><div>poly64x1_t <b>vdup_lane_p64</b> (poly64x1_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_p64" type="checkbox"><label for="vdupq_lane_p64"><div>poly64x2_t <b>vdupq_lane_p64</b> (poly64x1_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_f32" type="checkbox"><label for="vdup_lane_f32"><div>float32x2_t <b>vdup_lane_f32</b> (float32x2_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
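+ <h4>Example</h4> <p class="aml">An illustrative C sketch (not part of the ARM reference): a common use of the float variant is to broadcast one coefficient before a vector multiply. Names are hypothetical; note that vmul_lane_f32 fuses these two steps into one intrinsic.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Scale both lanes of x by coeffs[0]: duplicate the lane, then multiply. */
+float32x2_t scale_by_c0(float32x2_t x, float32x2_t coeffs) {
+    float32x2_t c = vdup_lane_f32(coeffs, 0); /* {c0, c0} */
+    return vmul_f32(x, c);
+}</pre>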
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_f32" type="checkbox"><label for="vdupq_lane_f32"><div>float32x4_t <b>vdupq_lane_f32</b> (float32x2_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_p8" type="checkbox"><label for="vdup_lane_p8"><div>poly8x8_t <b>vdup_lane_p8</b> (poly8x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_p8" type="checkbox"><label for="vdupq_lane_p8"><div>poly8x16_t <b>vdupq_lane_p8</b> (poly8x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_p16" type="checkbox"><label for="vdup_lane_p16"><div>poly16x4_t <b>vdup_lane_p16</b> (poly16x4_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_p16" type="checkbox"><label for="vdupq_lane_p16"><div>poly16x8_t <b>vdupq_lane_p16</b> (poly16x4_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_lane_f64" type="checkbox"><label for="vdup_lane_f64"><div>float64x1_t <b>vdup_lane_f64</b> (float64x1_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_lane_f64" type="checkbox"><label for="vdupq_lane_f64"><div>float64x2_t <b>vdupq_lane_f64</b> (float64x1_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
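+ <h4>Example</h4> <p class="aml">An illustrative C sketch (not part of the ARM reference): the float64 q-form is A64-only, so it must be compiled for an AArch64 target. The function name is hypothetical.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* AArch64 only: fill both 64-bit lanes from the single lane of v
+   (the only valid lane index is 0). */
+float64x2_t splat_f64(float64x1_t v) {
+    return vdupq_lane_f64(v, 0);
+}</pre>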
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_s8" type="checkbox"><label for="vdup_laneq_s8"><div>int8x8_t <b>vdup_laneq_s8</b> (int8x16_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_s8" type="checkbox"><label for="vdupq_laneq_s8"><div>int8x16_t <b>vdupq_laneq_s8</b> (int8x16_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_s16" type="checkbox"><label for="vdup_laneq_s16"><div>int16x4_t <b>vdup_laneq_s16</b> (int16x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
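+ <h4>Example</h4> <p class="aml">An illustrative C sketch (not part of the ARM reference): the laneq form indexes a 128-bit source, so the lane range doubles while the result stays 64-bit. The function name is hypothetical.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Narrowing broadcast: pick lane 5 of the 128-bit source and fill the
+   four 16-bit lanes of a 64-bit result (0 &lt;= lane &lt;= 7). */
+int16x4_t splat_from_q(int16x8_t q) {
+    return vdup_laneq_s16(q, 5);
+}</pre>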
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_s16" type="checkbox"><label for="vdupq_laneq_s16"><div>int16x8_t <b>vdupq_laneq_s16</b> (int16x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_s32" type="checkbox"><label for="vdup_laneq_s32"><div>int32x2_t <b>vdup_laneq_s32</b> (int32x4_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_s32" type="checkbox"><label for="vdupq_laneq_s32"><div>int32x4_t <b>vdupq_laneq_s32</b> (int32x4_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_s64" type="checkbox"><label for="vdup_laneq_s64"><div>int64x1_t <b>vdup_laneq_s64</b> (int64x2_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_s64" type="checkbox"><label for="vdupq_laneq_s64"><div>int64x2_t <b>vdupq_laneq_s64</b> (int64x2_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_u8" type="checkbox"><label for="vdup_laneq_u8"><div>uint8x8_t <b>vdup_laneq_u8</b> (uint8x16_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_u8" type="checkbox"><label for="vdupq_laneq_u8"><div>uint8x16_t <b>vdupq_laneq_u8</b> (uint8x16_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_u16" type="checkbox"><label for="vdup_laneq_u16"><div>uint16x4_t <b>vdup_laneq_u16</b> (uint16x8_t vec, const int lane)<span class="right">Duplicate vector element to vector or scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = Elem[V[n], lane, esize];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_u16" type="checkbox"><label for="vdupq_laneq_u16"><div>uint16x8_t <b><b>vdupq_laneq_u16</b></b> (uint16x8_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_u32" type="checkbox"><label for="vdup_laneq_u32"><div>uint32x2_t <b><b>vdup_laneq_u32</b></b> (uint32x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_u32" type="checkbox"><label for="vdupq_laneq_u32"><div>uint32x4_t <b><b>vdupq_laneq_u32</b></b> (uint32x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_u64" type="checkbox"><label for="vdup_laneq_u64"><div>uint64x1_t <b><b>vdup_laneq_u64</b></b> (uint64x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_u64" type="checkbox"><label for="vdupq_laneq_u64"><div>uint64x2_t <b><b>vdupq_laneq_u64</b></b> (uint64x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_p64" type="checkbox"><label for="vdup_laneq_p64"><div>poly64x1_t <b><b>vdup_laneq_p64</b></b> (poly64x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_p64" type="checkbox"><label for="vdupq_laneq_p64"><div>poly64x2_t <b><b>vdupq_laneq_p64</b></b> (poly64x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_f32" type="checkbox"><label for="vdup_laneq_f32"><div>float32x2_t <b><b>vdup_laneq_f32</b></b> (float32x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_f32" type="checkbox"><label for="vdupq_laneq_f32"><div>float32x4_t <b><b>vdupq_laneq_f32</b></b> (float32x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4S,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_p8" type="checkbox"><label for="vdup_laneq_p8"><div>poly8x8_t <b><b>vdup_laneq_p8</b></b> (poly8x16_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_p8" type="checkbox"><label for="vdupq_laneq_p8"><div>poly8x16_t <b><b>vdupq_laneq_p8</b></b> (poly8x16_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.16B,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_p16" type="checkbox"><label for="vdup_laneq_p16"><div>poly16x4_t <b><b>vdup_laneq_p16</b></b> (poly16x8_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.4H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_p16" type="checkbox"><label for="vdupq_laneq_p16"><div>poly16x8_t <b><b>vdupq_laneq_p16</b></b> (poly16x8_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.8H,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdup_laneq_f64" type="checkbox"><label for="vdup_laneq_f64"><div>float64x1_t <b><b>vdup_laneq_f64</b></b> (float64x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupq_laneq_f64" type="checkbox"><label for="vdupq_laneq_f64"><div>float64x2_t <b><b>vdupq_laneq_f64</b></b> (float64x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.2D,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vcombine_s8" type="checkbox"><label for="vcombine_s8"><div>int8x16_t <b><b>vcombine_s8</b></b> (int8x8_t low, int8x8_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.8B <br />
+high &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_s16" type="checkbox"><label for="vcombine_s16"><div>int16x8_t <b><b>vcombine_s16</b></b> (int16x4_t low, int16x4_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.4H <br />
+high &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_s32" type="checkbox"><label for="vcombine_s32"><div>int32x4_t <b><b>vcombine_s32</b></b> (int32x2_t low, int32x2_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.2S <br />
+high &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_s64" type="checkbox"><label for="vcombine_s64"><div>int64x2_t <b><b>vcombine_s64</b></b> (int64x1_t low, int64x1_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.1D <br />
+high &rarr; Vm.1D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_u8" type="checkbox"><label for="vcombine_u8"><div>uint8x16_t <b><b>vcombine_u8</b></b> (uint8x8_t low, uint8x8_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.8B <br />
+high &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_u16" type="checkbox"><label for="vcombine_u16"><div>uint16x8_t <b><b>vcombine_u16</b></b> (uint16x4_t low, uint16x4_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.4H <br />
+high &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_u32" type="checkbox"><label for="vcombine_u32"><div>uint32x4_t <b><b>vcombine_u32</b></b> (uint32x2_t low, uint32x2_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.2S <br />
+high &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_u64" type="checkbox"><label for="vcombine_u64"><div>uint64x2_t <b><b>vcombine_u64</b></b> (uint64x1_t low, uint64x1_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.1D <br />
+high &rarr; Vm.1D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_p64" type="checkbox"><label for="vcombine_p64"><div>poly64x2_t <b><b>vcombine_p64</b></b> (poly64x1_t low, poly64x1_t high)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.1D <br />
+high &rarr; Vm.1D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_f16" type="checkbox"><label for="vcombine_f16"><div>float16x8_t <b><b>vcombine_f16</b></b> (float16x4_t low, float16x4_t high)<span class="right">Join two smaller vectors into a single larger vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Join two smaller vectors into a single larger vector. This intrinsic concatenates its two 64-bit inputs: low is written to the lower half and high to the upper half of the 128-bit destination SIMD&amp;FP register. The operation below shows the pseudocode of the underlying INS instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.4H <br />
+high &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_f32" type="checkbox"><label for="vcombine_f32"><div>float32x4_t <b><b>vcombine_f32</b></b> (float32x2_t low, float32x2_t high)<span class="right">Join two smaller vectors into a single larger vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Join two smaller vectors into a single larger vector. This intrinsic concatenates its two 64-bit inputs: low is written to the lower half and high to the upper half of the 128-bit destination SIMD&amp;FP register. The operation below shows the pseudocode of the underlying INS instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.2S <br />
+high &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_p8" type="checkbox"><label for="vcombine_p8"><div>poly8x16_t <b><b>vcombine_p8</b></b> (poly8x8_t low, poly8x8_t high)<span class="right">Join two smaller vectors into a single larger vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Join two smaller vectors into a single larger vector. This intrinsic concatenates its two 64-bit inputs: low is written to the lower half and high to the upper half of the 128-bit destination SIMD&amp;FP register. The operation below shows the pseudocode of the underlying INS instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.8B <br />
+high &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_p16" type="checkbox"><label for="vcombine_p16"><div>poly16x8_t <b><b>vcombine_p16</b></b> (poly16x4_t low, poly16x4_t high)<span class="right">Join two smaller vectors into a single larger vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Join two smaller vectors into a single larger vector. This intrinsic concatenates its two 64-bit inputs: low is written to the lower half and high to the upper half of the 128-bit destination SIMD&amp;FP register. The operation below shows the pseudocode of the underlying INS instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.4H <br />
+high &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vcombine_f64" type="checkbox"><label for="vcombine_f64"><div>float64x2_t <b><b>vcombine_f64</b></b> (float64x1_t low, float64x1_t high)<span class="right">Join two smaller vectors into a single larger vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Join two smaller vectors into a single larger vector. This intrinsic concatenates its two 64-bit inputs: low is written to the lower half and high to the upper half of the 128-bit destination SIMD&amp;FP register. The operation below shows the pseudocode of the underlying INS instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[1],Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>low &rarr; Vn.1D <br />
+high &rarr; Vm.1D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vget_high_s8" type="checkbox"><label for="vget_high_s8"><div>int8x8_t <b><b>vget_high_s8</b></b> (int8x16_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_s16" type="checkbox"><label for="vget_high_s16"><div>int16x4_t <b><b>vget_high_s16</b></b> (int16x8_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_s32" type="checkbox"><label for="vget_high_s32"><div>int32x2_t <b><b>vget_high_s32</b></b> (int32x4_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_s64" type="checkbox"><label for="vget_high_s64"><div>int64x1_t <b><b>vget_high_s64</b></b> (int64x2_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_u8" type="checkbox"><label for="vget_high_u8"><div>uint8x8_t <b><b>vget_high_u8</b></b> (uint8x16_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_u16" type="checkbox"><label for="vget_high_u16"><div>uint16x4_t <b><b>vget_high_u16</b></b> (uint16x8_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_u32" type="checkbox"><label for="vget_high_u32"><div>uint32x2_t <b><b>vget_high_u32</b></b> (uint32x4_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_u64" type="checkbox"><label for="vget_high_u64"><div>uint64x1_t <b><b>vget_high_u64</b></b> (uint64x2_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_p64" type="checkbox"><label for="vget_high_p64"><div>poly64x1_t <b><b>vget_high_p64</b></b> (poly64x2_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_f16" type="checkbox"><label for="vget_high_f16"><div>float16x4_t <b><b>vget_high_f16</b></b> (float16x8_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vget_high_f32" type="checkbox"><label for="vget_high_f32"><div>float32x2_t <b><b>vget_high_f32</b></b> (float32x4_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_p8" type="checkbox"><label for="vget_high_p8"><div>poly8x8_t <b><b>vget_high_p8</b></b> (poly8x16_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_p16" type="checkbox"><label for="vget_high_p16"><div>poly16x4_t <b><b>vget_high_p16</b></b> (poly16x8_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_high_f64" type="checkbox"><label for="vget_high_f64"><div>float64x1_t <b><b>vget_high_f64</b></b> (float64x2_t a)<span class="right">Extract the upper half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the upper half of a vector. This intrinsic returns the upper 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[1] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[1]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vget_low_s8" type="checkbox"><label for="vget_low_s8"><div>int8x8_t <b><b>vget_low_s8</b></b> (int8x16_t a)<span class="right">Extract the lower half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the lower half of a vector. This intrinsic returns the lower 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[0] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_s16" type="checkbox"><label for="vget_low_s16"><div>int16x4_t <b><b>vget_low_s16</b></b> (int16x8_t a)<span class="right">Extract the lower half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the lower half of a vector. This intrinsic returns the lower 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[0] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_s32" type="checkbox"><label for="vget_low_s32"><div>int32x2_t <b><b>vget_low_s32</b></b> (int32x4_t a)<span class="right">Extract the lower half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the lower half of a vector. This intrinsic returns the lower 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[0] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_s64" type="checkbox"><label for="vget_low_s64"><div>int64x1_t <b><b>vget_low_s64</b></b> (int64x2_t a)<span class="right">Extract the lower half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the lower half of a vector. This intrinsic returns the lower 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[0] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_u8" type="checkbox"><label for="vget_low_u8"><div>uint8x8_t <b><b>vget_low_u8</b></b> (uint8x16_t a)<span class="right">Extract the lower half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the lower half of a vector. This intrinsic returns the lower 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[0] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_u16" type="checkbox"><label for="vget_low_u16"><div>uint16x4_t <b><b>vget_low_u16</b></b> (uint16x8_t a)<span class="right">Extract the lower half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the lower half of a vector. This intrinsic returns the lower 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[0] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_u32" type="checkbox"><label for="vget_low_u32"><div>uint32x2_t <b><b>vget_low_u32</b></b> (uint32x4_t a)<span class="right">Extract the lower half of a vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract the lower half of a vector. This intrinsic returns the lower 64 bits of the 128-bit source vector as a 64-bit vector; the listed A64 instruction is a DUP of doubleword D[0] of the source SIMD&amp;FP register. The operation below shows the pseudocode of the underlying DUP instruction.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_u64" type="checkbox"><label for="vget_low_u64"><div>uint64x1_t <b><b>vget_low_u64</b></b> (uint64x2_t a)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_p64" type="checkbox"><label for="vget_low_p64"><div>poly64x1_t <b><b>vget_low_p64</b></b> (poly64x2_t a)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_f16" type="checkbox"><label for="vget_low_f16"><div>float16x4_t <b><b>vget_low_f16</b></b> (float16x8_t a)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vget_low_f32" type="checkbox"><label for="vget_low_f32"><div>float32x2_t <b><b>vget_low_f32</b></b> (float32x4_t a)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_p8" type="checkbox"><label for="vget_low_p8"><div>poly8x8_t <b><b>vget_low_p8</b></b> (poly8x16_t a)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_p16" type="checkbox"><label for="vget_low_p16"><div>poly16x4_t <b><b>vget_low_p16</b></b> (poly16x8_t a)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_low_f64" type="checkbox"><label for="vget_low_f64"><div>float64x1_t <b><b>vget_low_f64</b></b> (float64x2_t a)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Vd.1D,Vn.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupb_lane_s8" type="checkbox"><label for="vdupb_lane_s8"><div>int8_t <b><b>vdupb_lane_s8</b></b> (int8x8_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Bd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
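+ <h4>Example</h4> <p>Illustrative only (not part of the upstream ARM text): the <code>lane</code> argument must be a compile-time constant in the documented range, and the intrinsic yields that lane of <code>vec</code> as a scalar. A minimal sketch; the function name is ours:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Extract lane 3 of an int8x8_t as a scalar int8_t.
+   The lane index must be a constant in 0..7. */
+int8_t third_lane(int8x8_t v)
+{
+    return vdupb_lane_s8(v, 3);
+}</pre>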
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vduph_lane_s16" type="checkbox"><label for="vduph_lane_s16"><div>int16_t <b><b>vduph_lane_s16</b></b> (int16x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Hd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdups_lane_s32" type="checkbox"><label for="vdups_lane_s32"><div>int32_t <b><b>vdups_lane_s32</b></b> (int32x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Sd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupd_lane_s64" type="checkbox"><label for="vdupd_lane_s64"><div>int64_t <b><b>vdupd_lane_s64</b></b> (int64x1_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupb_lane_u8" type="checkbox"><label for="vdupb_lane_u8"><div>uint8_t <b><b>vdupb_lane_u8</b></b> (uint8x8_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Bd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vduph_lane_u16" type="checkbox"><label for="vduph_lane_u16"><div>uint16_t <b><b>vduph_lane_u16</b></b> (uint16x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Hd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdups_lane_u32" type="checkbox"><label for="vdups_lane_u32"><div>uint32_t <b><b>vdups_lane_u32</b></b> (uint32x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Sd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupd_lane_u64" type="checkbox"><label for="vdupd_lane_u64"><div>uint64_t <b><b>vdupd_lane_u64</b></b> (uint64x1_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdups_lane_f32" type="checkbox"><label for="vdups_lane_f32"><div>float32_t <b><b>vdups_lane_f32</b></b> (float32x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Sd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
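+ <h4>Example</h4> <p>Illustrative only (not part of the upstream ARM text): a common use of the scalar <code>vdups_lane_f32</code> form is feeding one lane of a vector into a by-scalar operation. A minimal sketch; <code>scale_by_lane0</code> is our name:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Multiply every lane of v by v's own lane 0. */
+float32x2_t scale_by_lane0(float32x2_t v)
+{
+    float32_t s = vdups_lane_f32(v, 0); /* lane 0 as a scalar */
+    return vmul_n_f32(v, s);
+}</pre>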
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupd_lane_f64" type="checkbox"><label for="vdupd_lane_f64"><div>float64_t <b><b>vdupd_lane_f64</b></b> (float64x1_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupb_lane_p8" type="checkbox"><label for="vdupb_lane_p8"><div>poly8_t <b><b>vdupb_lane_p8</b></b> (poly8x8_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Bd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vduph_lane_p16" type="checkbox"><label for="vduph_lane_p16"><div>poly16_t <b><b>vduph_lane_p16</b></b> (poly16x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Hd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupb_laneq_s8" type="checkbox"><label for="vdupb_laneq_s8"><div>int8_t <b><b>vdupb_laneq_s8</b></b> (int8x16_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Bd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
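+ <h4>Example</h4> <p>Illustrative only (not part of the upstream ARM text): the <code>_laneq</code> variants index a 128-bit source vector, so the constant lane index ranges over 0..15 here rather than 0..7. A minimal sketch; the function name is ours:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Extract the last lane of a 128-bit int8x16_t. */
+int8_t last_lane(int8x16_t v)
+{
+    return vdupb_laneq_s8(v, 15);
+}</pre>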
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vduph_laneq_s16" type="checkbox"><label for="vduph_laneq_s16"><div>int16_t <b><b>vduph_laneq_s16</b></b> (int16x8_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Hd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdups_laneq_s32" type="checkbox"><label for="vdups_laneq_s32"><div>int32_t <b><b>vdups_laneq_s32</b></b> (int32x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Sd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupd_laneq_s64" type="checkbox"><label for="vdupd_laneq_s64"><div>int64_t <b><b>vdupd_laneq_s64</b></b> (int64x2_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupb_laneq_u8" type="checkbox"><label for="vdupb_laneq_u8"><div>uint8_t <b><b>vdupb_laneq_u8</b></b> (uint8x16_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Bd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vduph_laneq_u16" type="checkbox"><label for="vduph_laneq_u16"><div>uint16_t <b><b>vduph_laneq_u16</b></b> (uint16x8_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Hd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdups_laneq_u32" type="checkbox"><label for="vdups_laneq_u32"><div>uint32_t <b><b>vdups_laneq_u32</b></b> (uint32x4_t vec, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Sd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupd_laneq_u64" type="checkbox"><label for="vdupd_laneq_u64"><div>uint64_t <b><b>vdupd_laneq_u64</b></b> (uint64x2_t vec, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdups_laneq_f32" type="checkbox"><label for="vdups_laneq_f32"><div>float32_t <b><b>vdups_laneq_f32</b></b> (float32x4_t vec, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Sd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupd_laneq_f64" type="checkbox"><label for="vdupd_laneq_f64"><div>float64_t <b><b>vdupd_laneq_f64</b></b> (float64x2_t vec, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
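+ <h4>Example (illustrative)</h4> <p class="aml">An illustrative C fragment, not from the reference; the helper name is assumed. The scalar-returning dup forms are convenient when one element of a vector feeds ordinary floating-point code; for a scalar result this is equivalent in effect to vgetq_lane_f64.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Return the upper double of a 128-bit vector: DUP Dd,Vn.D[1]. */
+static inline double high_lane(float64x2_t v) {
+    return vdupd_laneq_f64(v, 1);  /* lane index: constant, 0..1 */
+}</pre>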
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vdupb_laneq_p8" type="checkbox"><label for="vdupb_laneq_p8"><div>poly8_t <b><b>vdupb_laneq_p8</b></b> (poly8x16_t vec, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Bd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vduph_laneq_p16" type="checkbox"><label for="vduph_laneq_p16"><div>poly16_t <b><b>vduph_laneq_p16</b></b> (poly16x8_t vec, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to vector or scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Hd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1_s8" type="checkbox"><label for="vld1_s8"><div>int8x8_t <b><b>vld1_s8</b></b> (int8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
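+ <h4>Example (illustrative)</h4> <p class="aml">A small C sketch, not from the reference; the function and buffer names are illustrative, and vadd_s8 and vst1_s8 are sibling intrinsics not documented in this excerpt. It loads eight signed bytes with one 64-bit load, operates on them, and stores them back; the vld1q_* forms do the same for a full 128-bit register, sixteen bytes at a time.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Double eight bytes in place using one NEON add. */
+void double_bytes(int8_t buf[8]) {
+    int8x8_t v = vld1_s8(buf);   /* LD1 {Vt.8B},[Xn] */
+    v = vadd_s8(v, v);           /* elementwise; wraps on overflow */
+    vst1_s8(buf, v);             /* store the eight results back */
+}</pre>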
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s8" type="checkbox"><label for="vld1q_s8"><div>int8x16_t <b><b>vld1q_s8</b></b> (int8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s16" type="checkbox"><label for="vld1_s16"><div>int16x4_t <b><b>vld1_s16</b></b> (int16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
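+ <h4>Example (illustrative)</h4> <p class="aml">A hedged C sketch, not from the reference; vmovl_s16 is a sibling widening intrinsic not documented in this excerpt. The 64-bit vld1_* loads pair naturally with widening operations, since widening doubles the element size and so fills a full 128-bit result.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Load four int16 values and sign-extend them to int32 lanes. */
+int32x4_t load_widen(const int16_t src[4]) {
+    int16x4_t v = vld1_s16(src);  /* LD1 {Vt.4H},[Xn] */
+    return vmovl_s16(v);          /* SXTL: each lane widened to 32 bits */
+}</pre>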
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s16" type="checkbox"><label for="vld1q_s16"><div>int16x8_t <b><b>vld1q_s16</b></b> (int16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s32" type="checkbox"><label for="vld1_s32"><div>int32x2_t <b><b>vld1_s32</b></b> (int32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
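+ <h4>Example (illustrative)</h4> <p class="aml">An illustrative combination, not from the reference; the function name is assumed, and vdup_lane_s32 and vmul_s32 are related intrinsics not shown in this excerpt. It loads a pair of int32 values, broadcasts lane 0, and scales the vector by its own first element.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* result[i] = src[i] * src[0] for i in {0, 1}. */
+int32x2_t scale_by_first(const int32_t src[2]) {
+    int32x2_t v = vld1_s32(src);        /* LD1 {Vt.2S},[Xn] */
+    int32x2_t s = vdup_lane_s32(v, 0);  /* broadcast lane 0 */
+    return vmul_s32(v, s);              /* elementwise multiply */
+}</pre>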
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s32" type="checkbox"><label for="vld1q_s32"><div>int32x4_t <b>vld1q_s32</b> (int32_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
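Editorial aside, not part of the committed file: the pseudocode above branches three ways, a whole-register load (memop == MemOp_LOAD without replicate), a load-and-replicate (the replicate branch, the LD1R form), and a single-lane insert (Elem[rval, index, esize] = Mem[...]). A minimal sketch of how the matching core::arch::aarch64 intrinsics exercise those branches from Rust, assuming an aarch64 target; the function name demo and the buffer data are invented for the sketch:

```rust
// Hypothetical illustration only: the three LD1 behaviours described by the
// pseudocode, driven through core::arch::aarch64 intrinsics.
#[cfg(target_arch = "aarch64")]
unsafe fn demo() {
    use std::arch::aarch64::*;
    let data: [i32; 4] = [10, 20, 30, 40];
    // Whole-register load, LD1 {Vt.4S},[Xn]: all four lanes come from memory.
    let v: int32x4_t = vld1q_s32(data.as_ptr());
    // Replicate branch (LD1R): one element is loaded and copied to every lane.
    let r: int32x4_t = vld1q_dup_s32(data.as_ptr());
    // Lane-insert branch: Elem[rval, index, esize] = Mem[...]; only lane 2 of
    // `v` is overwritten (with data[1]), the other lanes are preserved.
    let l: int32x4_t = vld1q_lane_s32::<2>(data.as_ptr().add(1), v);
    assert_eq!(vgetq_lane_s32::<3>(v), 40);
    assert_eq!(vgetq_lane_s32::<3>(r), 10);
    assert_eq!(vgetq_lane_s32::<2>(l), 20);
}

fn main() {
    #[cfg(target_arch = "aarch64")]
    if std::arch::is_aarch64_feature_detected!("neon") {
        unsafe { demo() };
    }
}
```

The runtime feature check keeps the sketch safe to run on any aarch64 system before the unsafe intrinsic calls are reached.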
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s64" type="checkbox"><label for="vld1_s64"><div>int64x1_t <b>vld1_s64</b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s64" type="checkbox"><label for="vld1q_s64"><div>int64x2_t <b>vld1q_s64</b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
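Editorial aside: to make the size arithmetic in the block above concrete, for LD1 {Vt.2D} the element size is esize = 64, so ebytes = esize DIV 8 = 8 and the replicate branch fills datasize DIV esize = 128 DIV 64 = 2 lanes; the 64-bit vld1_s64 form has datasize = 64 and therefore a single lane.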
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u8" type="checkbox"><label for="vld1_u8"><div>uint8x8_t <b>vld1_u8</b> (uint8_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
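Editorial aside, not part of the committed file: the same load exists at two vector widths, vld1_u8 filling a 64-bit D register (eight u8 lanes, Vt.8B) and vld1q_u8 a 128-bit Q register (sixteen lanes, Vt.16B). A hedged sketch under the same aarch64 assumption, with an invented buffer name:

```rust
// Hypothetical illustration only: the 64-bit vs 128-bit forms of the load.
#[cfg(target_arch = "aarch64")]
unsafe fn widths() {
    use std::arch::aarch64::*;
    let bytes: [u8; 16] = *b"0123456789abcdef";
    // datasize = 64: eight u8 lanes, loaded from the first 8 bytes.
    let d: uint8x8_t = vld1_u8(bytes.as_ptr());
    // datasize = 128: sixteen u8 lanes, the whole buffer.
    let q: uint8x16_t = vld1q_u8(bytes.as_ptr());
    assert_eq!(vget_lane_u8::<7>(d), b'7');
    assert_eq!(vgetq_lane_u8::<15>(q), b'f');
}
```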
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u8" type="checkbox"><label for="vld1q_u8"><div>uint8x16_t <b>vld1q_u8</b> (uint8_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u16" type="checkbox"><label for="vld1_u16"><div>uint16x4_t <b>vld1_u16</b> (uint16_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u16" type="checkbox"><label for="vld1q_u16"><div>uint16x8_t <b>vld1q_u16</b> (uint16_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
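+ <h4>Example</h4> <p>A minimal C sketch, not part of the ARM reference, of how <b>vld1q_u16</b> might be called; it assumes an AArch64 toolchain providing arm_neon.h, and the buffer name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint16_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};    /* illustrative data */
+    uint16x8_t v = vld1q_u16(buf);                  /* load all eight 16-bit lanes */
+    v = vaddq_u16(v, v);                            /* double each lane */
+    printf("%u\n", (unsigned)vgetq_lane_u16(v, 3)); /* prints 8 */
+    return 0;
+}</pre>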
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u32" type="checkbox"><label for="vld1_u32"><div>uint32x2_t <b>vld1_u32</b> (uint32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
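+ <h4>Example</h4> <p>A minimal C sketch, not part of the ARM reference, of how <b>vld1_u32</b> might be called; it assumes an AArch64 toolchain providing arm_neon.h, and the buffer name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t buf[2] = {10, 20};   /* illustrative data */
+    uint32x2_t v = vld1_u32(buf); /* load both 32-bit lanes */
+    printf("%u %u\n", vget_lane_u32(v, 0), vget_lane_u32(v, 1)); /* prints 10 20 */
+    return 0;
+}</pre>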
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u32" type="checkbox"><label for="vld1q_u32"><div>uint32x4_t <b>vld1q_u32</b> (uint32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
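+ <h4>Example</h4> <p>A minimal C sketch, not part of the ARM reference, of how <b>vld1q_u32</b> might be paired with a store; it assumes an AArch64 toolchain providing arm_neon.h, and the array names are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t in[4] = {1, 2, 3, 4}, out[4];  /* illustrative data */
+    uint32x4_t v = vld1q_u32(in);           /* load four 32-bit lanes */
+    vst1q_u32(out, vmulq_n_u32(v, 3));      /* scale each lane and store */
+    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); /* prints 3 6 9 12 */
+    return 0;
+}</pre>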
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u64" type="checkbox"><label for="vld1_u64"><div>uint64x1_t <b>vld1_u64</b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
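+ <h4>Example</h4> <p>A minimal C sketch, not part of the ARM reference, of how <b>vld1_u64</b> might be called; it assumes an AArch64 toolchain providing arm_neon.h, and the variable name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t x = 42;             /* illustrative data */
+    uint64x1_t v = vld1_u64(&amp;x); /* load the single 64-bit lane */
+    printf("%llu\n", (unsigned long long)vget_lane_u64(v, 0)); /* prints 42 */
+    return 0;
+}</pre>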
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u64" type="checkbox"><label for="vld1q_u64"><div>uint64x2_t <b>vld1q_u64</b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
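+ <h4>Example</h4> <p>A minimal C sketch, not part of the ARM reference, of how <b>vld1q_u64</b> might be called; it assumes an AArch64 toolchain providing arm_neon.h, and the buffer name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t buf[2] = {1, 2};      /* illustrative data */
+    uint64x2_t v = vld1q_u64(buf); /* load two 64-bit lanes */
+    v = vaddq_u64(v, v);           /* double each lane */
+    printf("%llu\n", (unsigned long long)vgetq_lane_u64(v, 1)); /* prints 4 */
+    return 0;
+}</pre>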
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p64" type="checkbox"><label for="vld1_p64"><div>poly64x1_t <b>vld1_p64</b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
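+ <h4>Example</h4> <p>A minimal C sketch, not part of the ARM reference, of how <b>vld1_p64</b> might be called; it assumes an AArch64 toolchain providing arm_neon.h and built with the Crypto extension (required for the poly64 types), and the variable name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly64_t p = 0x1b;           /* illustrative polynomial value */
+    poly64x1_t v = vld1_p64(&amp;p); /* load the single 64-bit lane */
+    printf("%llx\n", (unsigned long long)vget_lane_p64(v, 0)); /* prints 1b */
+    return 0;
+}</pre>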
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p64" type="checkbox"><label for="vld1q_p64"><div>poly64x2_t <b>vld1q_p64</b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f16" type="checkbox"><label for="vld1_f16"><div>float16x4_t <b><b>vld1_f16</b></b> (float16_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f16" type="checkbox"><label for="vld1q_f16"><div>float16x8_t <b><b>vld1q_f16</b></b> (float16_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f32" type="checkbox"><label for="vld1_f32"><div>float32x2_t <b><b>vld1_f32</b></b> (float32_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f32" type="checkbox"><label for="vld1q_f32"><div>float32x4_t <b><b>vld1q_f32</b></b> (float32_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p8" type="checkbox"><label for="vld1_p8"><div>poly8x8_t <b><b>vld1_p8</b></b> (poly8_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p8" type="checkbox"><label for="vld1q_p8"><div>poly8x16_t <b><b>vld1q_p8</b></b> (poly8_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p16" type="checkbox"><label for="vld1_p16"><div>poly16x4_t <b><b>vld1_p16</b></b> (poly16_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to the SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p16" type="checkbox"><label for="vld1q_p16"><div>poly16x8_t <b><b>vld1q_p16</b></b> (poly16_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to one SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f64" type="checkbox"><label for="vld1_f64"><div>float64x1_t <b><b>vld1_f64</b></b> (float64_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to one SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f64" type="checkbox"><label for="vld1q_f64"><div>float64x2_t <b><b>vld1q_f64</b></b> (float64_t const * ptr)<span class="right">Load multiple single-element structures to one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one register. This instruction loads multiple single-element structures from memory and writes the result to one SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_s8" type="checkbox"><label for="vld1_lane_s8"><div>int8x8_t <b><b>vld1_lane_s8</b></b> (int8_t const * ptr, int8x8_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_s8" type="checkbox"><label for="vld1q_lane_s8"><div>int8x16_t <b><b>vld1q_lane_s8</b></b> (int8_t const * ptr, int8x16_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vt.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_s16" type="checkbox"><label for="vld1_lane_s16"><div>int16x4_t <b><b>vld1_lane_s16</b></b> (int16_t const * ptr, int16x4_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.H}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_s16" type="checkbox"><label for="vld1q_lane_s16"><div>int16x8_t <b>vld1q_lane_s16</b> (int16_t const * ptr, int16x8_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.H}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_s32" type="checkbox"><label for="vld1_lane_s32"><div>int32x2_t <b>vld1_lane_s32</b> (int32_t const * ptr, int32x2_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.S}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_s32" type="checkbox"><label for="vld1q_lane_s32"><div>int32x4_t <b>vld1q_lane_s32</b> (int32_t const * ptr, int32x4_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.S}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_s64" type="checkbox"><label for="vld1_lane_s64"><div>int64x1_t <b>vld1_lane_s64</b> (int64_t const * ptr, int64x1_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.D}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_s64" type="checkbox"><label for="vld1q_lane_s64"><div>int64x2_t <b>vld1q_lane_s64</b> (int64_t const * ptr, int64x2_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.D}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_u8" type="checkbox"><label for="vld1_lane_u8"><div>uint8x8_t <b>vld1_lane_u8</b> (uint8_t const * ptr, uint8x8_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.B}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_u8" type="checkbox"><label for="vld1q_lane_u8"><div>uint8x16_t <b>vld1q_lane_u8</b> (uint8_t const * ptr, uint8x16_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.B}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Vt.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_u16" type="checkbox"><label for="vld1_lane_u16"><div>uint16x4_t <b><b>vld1_lane_u16</b></b> (uint16_t const * ptr, uint16x4_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.H}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_u16" type="checkbox"><label for="vld1q_lane_u16"><div>uint16x8_t <b><b>vld1q_lane_u16</b></b> (uint16_t const * ptr, uint16x8_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.H}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_u32" type="checkbox"><label for="vld1_lane_u32"><div>uint32x2_t <b><b>vld1_lane_u32</b></b> (uint32_t const * ptr, uint32x2_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.S}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_u32" type="checkbox"><label for="vld1q_lane_u32"><div>uint32x4_t <b><b>vld1q_lane_u32</b></b> (uint32_t const * ptr, uint32x4_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.S}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_u64" type="checkbox"><label for="vld1_lane_u64"><div>uint64x1_t <b><b>vld1_lane_u64</b></b> (uint64_t const * ptr, uint64x1_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.D}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_u64" type="checkbox"><label for="vld1q_lane_u64"><div>uint64x2_t <b><b>vld1q_lane_u64</b></b> (uint64_t const * ptr, uint64x2_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.D}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
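+ <h4>Example</h4> <p>Editorial sketch, not part of the ARM reference: minimal C usage on an AArch64 toolchain with arm_neon.h; the buffer value and helper code are illustrative assumptions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t mem = 42;                 // single element in memory
+    uint64x2_t v = vdupq_n_u64(7);     // v = {7, 7}
+    // lane must be a compile-time constant in [0, 1]
+    v = vld1q_lane_u64(&amp;mem, v, 1);    // v = {7, 42}; lane 0 is untouched
+    printf("%llu %llu\n",
+           (unsigned long long)vgetq_lane_u64(v, 0),
+           (unsigned long long)vgetq_lane_u64(v, 1));
+    return 0;
+}
+</pre>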
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_p64" type="checkbox"><label for="vld1_lane_p64"><div>poly64x1_t <b><b>vld1_lane_p64</b></b> (poly64_t const * ptr, poly64x1_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.D}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
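+ <h4>Example</h4> <p>Editorial sketch: for the 64x1 form the only valid lane is 0. The helper name is hypothetical, and the crypto-capable target assumed for the poly64 types (e.g. -march=armv8-a+crypto) varies by toolchain.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Replace the single lane of src with the value at p.
+poly64x1_t load_p64(const poly64_t *p, poly64x1_t src) {
+    return vld1_lane_p64(p, src, 0);   // lane must be the constant 0
+}
+</pre>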
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_p64" type="checkbox"><label for="vld1q_lane_p64"><div>poly64x2_t <b><b>vld1q_lane_p64</b></b> (poly64_t const * ptr, poly64x2_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.D}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
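+ <h4>Example</h4> <p>Editorial sketch (hypothetical helper, same crypto-target assumption as above): patch only the high lane of a 128-bit poly64 vector.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+poly64x2_t patch_high_p64(const poly64_t *p, poly64x2_t src) {
+    return vld1q_lane_p64(p, src, 1);  // replace lane 1, keep lane 0
+}
+</pre>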
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_f16" type="checkbox"><label for="vld1_lane_f16"><div>float16x4_t <b><b>vld1_lane_f16</b></b> (float16_t const * ptr, float16x4_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.H}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
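+ <h4>Example</h4> <p>Editorial sketch; assumes a toolchain whose arm_neon.h provides float16_t (NEON fp16 storage support). The helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float16x4_t load_h_lane2(const float16_t *p, float16x4_t src) {
+    return vld1_lane_f16(p, src, 2);   // lane: compile-time constant, 0..3
+}
+</pre>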
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_f16" type="checkbox"><label for="vld1q_lane_f16"><div>float16x8_t <b><b>vld1q_lane_f16</b></b> (float16_t const * ptr, float16x8_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.H}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
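+ <h4>Example</h4> <p>Editorial sketch, under the same fp16-storage assumption as the 64-bit form above.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float16x8_t load_h_lane7(const float16_t *p, float16x8_t src) {
+    return vld1q_lane_f16(p, src, 7);  // lane: compile-time constant, 0..7
+}
+</pre>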
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_f32" type="checkbox"><label for="vld1_lane_f32"><div>float32x2_t <b><b>vld1_lane_f32</b></b> (float32_t const * ptr, float32x2_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.S}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
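+ <h4>Example</h4> <p>Editorial sketch with illustrative values, showing that the untouched lane keeps its old contents.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float x = 3.5f;
+    float32x2_t v = vdup_n_f32(1.0f);  // v = {1.0, 1.0}
+    v = vld1_lane_f32(&amp;x, v, 0);       // v = {3.5, 1.0}; lane 1 unchanged
+    printf("%f %f\n", vget_lane_f32(v, 0), vget_lane_f32(v, 1));
+    return 0;
+}
+</pre>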
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_f32" type="checkbox"><label for="vld1q_lane_f32"><div>float32x4_t <b><b>vld1q_lane_f32</b></b> (float32_t const * ptr, float32x4_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.S}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_p8" type="checkbox"><label for="vld1_lane_p8"><div>poly8x8_t <b><b>vld1_lane_p8</b></b> (poly8_t const * ptr, poly8x8_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.B}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_p8" type="checkbox"><label for="vld1q_lane_p8"><div>poly8x16_t <b><b>vld1q_lane_p8</b></b> (poly8_t const * ptr, poly8x16_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.B}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vt.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_p16" type="checkbox"><label for="vld1_lane_p16"><div>poly16x4_t <b><b>vld1_lane_p16</b></b> (poly16_t const * ptr, poly16x4_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.H}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_p16" type="checkbox"><label for="vld1q_lane_p16"><div>poly16x8_t <b><b>vld1q_lane_p16</b></b> (poly16_t const * ptr, poly16x8_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.H}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_lane_f64" type="checkbox"><label for="vld1_lane_f64"><div>float64x1_t <b><b>vld1_lane_f64</b></b> (float64_t const * ptr, float64x1_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.D}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1q_lane_f64" type="checkbox"><label for="vld1q_lane_f64"><div>float64x2_t <b><b>vld1q_lane_f64</b></b> (float64_t const * ptr, float64x2_t src, const int lane)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.D}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_s8" type="checkbox"><label for="vld1_dup_s8"><div>int8x8_t <b><b>vld1_dup_s8</b></b> (int8_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure and Replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
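+ <h4>Usage Example</h4><p>Editor's sketch, not part of the ARM reference: minimal C usage, assuming a toolchain that provides &lt;arm_neon.h&gt; and a NEON-capable AArch64 target; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: broadcast the byte at *p into all 8 lanes. */
+int8x8_t splat_s8(const int8_t *p) {
+    return vld1_dup_s8(p);  /* typically compiles to LD1R {Vt.8B},[Xn] */
+}</pre>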
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_s8" type="checkbox"><label for="vld1q_dup_s8"><div>int8x16_t <b>vld1q_dup_s8</b> (int8_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
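+ <h4>Usage Example</h4><p>Editor's sketch (hypothetical helper, assumes &lt;arm_neon.h&gt;): a common use is broadcasting a delimiter so all 16 input bytes can be compared at once.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: 0xFF in each lane where chunk equals *delim. */
+uint8x16_t match_delim(int8x16_t chunk, const int8_t *delim) {
+    int8x16_t d = vld1q_dup_s8(delim);  /* delimiter in every lane */
+    return vceqq_s8(chunk, d);
+}</pre>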
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_s16" type="checkbox"><label for="vld1_dup_s16"><div>int16x4_t <b>vld1_dup_s16</b> (int16_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
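+ <h4>Usage Example</h4><p>Editor's sketch (assumes &lt;arm_neon.h&gt;; helper name illustrative): broadcasting a filter coefficient before a vector multiply.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: scale four 16-bit samples by one coefficient. */
+int16x4_t scale4(int16x4_t samples, const int16_t *coeff) {
+    return vmul_s16(samples, vld1_dup_s16(coeff));
+}</pre>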
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_s16" type="checkbox"><label for="vld1q_dup_s16"><div>int16x8_t <b>vld1q_dup_s16</b> (int16_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
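+ <h4>Usage Example</h4><p>Editor's sketch (assumes &lt;arm_neon.h&gt;; helper name illustrative): replicating a threshold so eight values can be clamped in one operation.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: clamp eight 16-bit values to a lower bound. */
+int16x8_t clamp_floor8(int16x8_t v, const int16_t *lo) {
+    return vmaxq_s16(v, vld1q_dup_s16(lo));
+}</pre>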
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_s32" type="checkbox"><label for="vld1_dup_s32"><div>int32x2_t <b>vld1_dup_s32</b> (int32_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
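+ <h4>Usage Example</h4><p>Editor's sketch (assumes &lt;arm_neon.h&gt;; helper name illustrative): adding one bias value, loaded once from memory, to both lanes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: add the same 32-bit bias to both lanes. */
+int32x2_t add_bias2(int32x2_t v, const int32_t *bias) {
+    return vadd_s32(v, vld1_dup_s32(bias));
+}</pre>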
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_s32" type="checkbox"><label for="vld1q_dup_s32"><div>int32x4_t <b>vld1q_dup_s32</b> (int32_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
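+ <h4>Usage Example</h4><p>Editor's sketch (assumes &lt;arm_neon.h&gt;; helper name illustrative): seeding four running sums with the same initial value.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: start four accumulators from one seed value. */
+int32x4_t init_acc(const int32_t *seed) {
+    return vld1q_dup_s32(seed);
+}</pre>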
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_s64" type="checkbox"><label for="vld1_dup_s64"><div>int64x1_t <b>vld1_dup_s64</b> (int64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+
+int64x2_t vld1q_dup_s64 (int64_t const * ptr)
+    Load one single-element structure and replicate to all lanes (of one
+    register): a single 64-bit element is loaded from memory and copied into
+    both lanes of the SIMD&FP register.
+    A64 Instruction: LD1R {Vt.2D},[Xn]
+    Argument Preparation: ptr → Xn
+    Results: Vt.2D → result
+    Operation: identical to the shared LD1R pseudocode above.
+    Supported architectures: v7/A32/A64
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+
+uint8x8_t vld1_dup_u8 (uint8_t const * ptr)
+    Same load-and-replicate behaviour, for eight 8-bit lanes.
+    A64 Instruction: LD1R {Vt.8B},[Xn]
+    Argument Preparation: ptr → Xn
+    Results: Vt.8B → result
+    Operation: identical to the shared LD1R pseudocode above.
+    Supported architectures: v7/A32/A64
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+
+uint8x16_t vld1q_dup_u8 (uint8_t const * ptr)
+    Same load-and-replicate behaviour, for sixteen 8-bit lanes.
+    A64 Instruction: LD1R {Vt.16B},[Xn]
+    Argument Preparation: ptr → Xn
+    Results: Vt.16B → result
+    Operation: identical to the shared LD1R pseudocode above.
+    Supported architectures: v7/A32/A64
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+
+uint16x4_t vld1_dup_u16 (uint16_t const * ptr)
+    Same load-and-replicate behaviour, for four 16-bit lanes.
+    A64 Instruction: LD1R {Vt.4H},[Xn]
+    Argument Preparation: ptr → Xn
+    Results: Vt.4H → result
+    Operation: identical to the shared LD1R pseudocode above.
+    Supported architectures: v7/A32/A64
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+
+uint16x8_t vld1q_dup_u16 (uint16_t const * ptr)
+    Same load-and-replicate behaviour, for eight 16-bit lanes.
+    A64 Instruction: LD1R {Vt.8H},[Xn]
+    Argument Preparation: ptr → Xn
+    Results: Vt.8H → result
+    Operation: identical to the shared LD1R pseudocode above.
+    Supported architectures: v7/A32/A64
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+
+uint32x2_t vld1_dup_u32 (uint32_t const * ptr)
+    Same load-and-replicate behaviour, for two 32-bit lanes.
+    A64 Instruction: LD1R {Vt.2S},[Xn]
+    Argument Preparation: ptr → Xn
+    Results: Vt.2S → result
+    Operation: identical to the shared LD1R pseudocode above.
+    Supported architectures: v7/A32/A64
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_u32" type="checkbox"><label for="vld1q_dup_u32"><div>uint32x4_t <b>vld1q_dup_u32</b> (uint32_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
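+ <h4>Example</h4> <p>A minimal C usage sketch (an editorial addition, not part of the ARM reference; the helper name is illustrative), assuming a compiler that provides arm_neon.h:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Broadcast the 32-bit value at p into all four lanes of a uint32x4_t. */
+uint32x4_t broadcast_u32(const uint32_t *p) {
+    return vld1q_dup_u32(p); /* typically lowers to LD1R {Vt.4S},[Xn] on A64 */
+}
+</pre>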
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_u64" type="checkbox"><label for="vld1_dup_u64"><div>uint64x1_t <b>vld1_dup_u64</b> (uint64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
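+ <h4>Example</h4> <p>A minimal C usage sketch (an editorial addition; the helper name is illustrative): a uint64x1_t has a single lane, so this load fills the whole vector. Assumes arm_neon.h is available.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load the 64-bit value at p into the single lane of a uint64x1_t. */
+uint64x1_t load_one_u64(const uint64_t *p) {
+    return vld1_dup_u64(p); /* typically lowers to LD1 {Vt.1D},[Xn] on A64 */
+}
+</pre>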
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_u64" type="checkbox"><label for="vld1q_dup_u64"><div>uint64x2_t <b>vld1q_dup_u64</b> (uint64_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
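+ <h4>Example</h4> <p>A minimal C usage sketch (an editorial addition; the helper name is illustrative): broadcasting one 64-bit value to both lanes, assuming arm_neon.h is available.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Broadcast the 64-bit value at p into both lanes of a uint64x2_t. */
+uint64x2_t broadcast_u64(const uint64_t *p) {
+    return vld1q_dup_u64(p); /* typically lowers to LD1R {Vt.2D},[Xn] on A64 */
+}
+</pre>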
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_p64" type="checkbox"><label for="vld1_dup_p64"><div>poly64x1_t <b>vld1_dup_p64</b> (poly64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
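+ <h4>Example</h4> <p>A minimal C usage sketch (an editorial addition; the helper name is illustrative). The poly64 types are only defined on targets with the relevant extensions, so this assumes such a target and an arm_neon.h that provides them.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load the 64-bit polynomial at p into the single lane of a poly64x1_t. */
+poly64x1_t load_one_p64(const poly64_t *p) {
+    return vld1_dup_p64(p); /* typically lowers to LD1 {Vt.1D},[Xn] on A64 */
+}
+</pre>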
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_p64" type="checkbox"><label for="vld1q_dup_p64"><div>poly64x2_t <b>vld1q_dup_p64</b> (poly64_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
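+ <h4>Example</h4> <p>A minimal C usage sketch (an editorial addition; the helper name is illustrative), again assuming a target where the poly64 types are available:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Broadcast the 64-bit polynomial at p into both lanes of a poly64x2_t. */
+poly64x2_t broadcast_p64(const poly64_t *p) {
+    return vld1q_dup_p64(p); /* typically lowers to LD1R {Vt.2D},[Xn] on A64 */
+}
+</pre>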
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_f16" type="checkbox"><label for="vld1_dup_f16"><div>float16x4_t <b>vld1_dup_f16</b> (float16_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
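+ <h4>Example</h4> <p>A minimal C usage sketch (an editorial addition; the helper name is illustrative), assuming a toolchain where float16_t is defined in arm_neon.h, i.e. with half-precision support enabled:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Broadcast the half-precision value at p into all four lanes of a float16x4_t. */
+float16x4_t broadcast_f16(const float16_t *p) {
+    return vld1_dup_f16(p); /* typically lowers to LD1R {Vt.4H},[Xn] on A64 */
+}
+</pre>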
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_f16" type="checkbox"><label for="vld1q_dup_f16"><div>float16x8_t <b>vld1q_dup_f16</b> (float16_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_f32" type="checkbox"><label for="vld1_dup_f32"><div>float32x2_t <b>vld1_dup_f32</b> (float32_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_f32" type="checkbox"><label for="vld1q_dup_f32"><div>float32x4_t <b>vld1q_dup_f32</b> (float32_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_p8" type="checkbox"><label for="vld1_dup_p8"><div>poly8x8_t <b>vld1_dup_p8</b> (poly8_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_p8" type="checkbox"><label for="vld1q_dup_p8"><div>poly8x16_t <b>vld1q_dup_p8</b> (poly8_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_p16" type="checkbox"><label for="vld1_dup_p16"><div>poly16x4_t <b>vld1_dup_p16</b> (poly16_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_p16" type="checkbox"><label for="vld1q_dup_p16"><div>poly16x8_t <b>vld1q_dup_p16</b> (poly16_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_dup_f64" type="checkbox"><label for="vld1_dup_f64"><div>float64x1_t <b>vld1_dup_f64</b> (float64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
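+ <h4>Usage sketch</h4> <p>Editorial aside, not part of the ARM reference: a minimal Rust sketch of this intrinsic as exposed by core::arch::aarch64, assuming an AArch64 target with NEON; the demo wrapper and sample values are illustrative, and vst1_f64 is used only to read the result back out.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo() {
+    use core::arch::aarch64::{vld1_dup_f64, vst1_f64};
+    let src = [2.5_f64];
+    let mut dst = [0.0_f64];
+    // Vt.1D has a single lane, so this load fills the whole 64-bit vector.
+    let v = unsafe { vld1_dup_f64(src.as_ptr()) };
+    unsafe { vst1_f64(dst.as_mut_ptr(), v) };
+    assert_eq!(dst, src);
+}
+</pre>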
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1q_dup_f64" type="checkbox"><label for="vld1q_dup_f64"><div>float64x2_t <b>vld1q_dup_f64</b> (float64_t const * ptr)<span class="right">Load one single-element structure and replicate to all lanes (of one register)</span></div></label><article> <h4>Description</h4><p class="aml">Load one single-element structure and replicate to all lanes (of one register). This instruction loads a single-element structure from memory and replicates the structure to all the lanes of the SIMD&amp;FP register.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1r-load-one-single-element-structure-and-replicate-to-all-lanes-of-one-register">LD1R</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
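+ <h4>Usage sketch</h4> <p>Editorial aside, not part of the ARM reference: a minimal Rust sketch via core::arch::aarch64, assuming an AArch64 target with NEON; the demo wrapper is illustrative, and vst1q_f64 is used only to inspect the lanes.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo() {
+    use core::arch::aarch64::{vld1q_dup_f64, vst1q_f64};
+    let src = [2.5_f64];
+    let mut dst = [0.0_f64; 2];
+    // LD1R semantics: the one element at src[0] is replicated into both D lanes.
+    let v = unsafe { vld1q_dup_f64(src.as_ptr()) };
+    unsafe { vst1q_f64(dst.as_mut_ptr(), v) };
+    assert_eq!(dst, [2.5, 2.5]);
+}
+</pre>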
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1_s8" type="checkbox"><label for="vst1_s8"><div>void <b>vst1_s8</b> (int8_t * ptr, int8x8_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
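+ <h4>Usage sketch</h4> <p>Editorial aside, not part of the ARM reference: a minimal Rust sketch via core::arch::aarch64, assuming an AArch64 target with NEON; vdup_n_s8 only seeds the register, and the demo wrapper and values are illustrative.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo() {
+    use core::arch::aarch64::{vdup_n_s8, vst1_s8};
+    let mut dst = [0_i8; 8];
+    let v = unsafe { vdup_n_s8(7) };
+    // ST1 {Vt.8B}: all eight byte lanes are written out to dst.
+    unsafe { vst1_s8(dst.as_mut_ptr(), v) };
+    assert_eq!(dst, [7_i8; 8]);
+}
+</pre>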
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s8" type="checkbox"><label for="vst1q_s8"><div>void <b>vst1q_s8</b> (int8_t * ptr, int8x16_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
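+ <h4>Usage sketch</h4> <p>Editorial aside, not part of the ARM reference: a minimal Rust sketch via core::arch::aarch64, assuming an AArch64 target with NEON; vdupq_n_s8 only seeds the register, and the demo wrapper and values are illustrative.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo() {
+    use core::arch::aarch64::{vdupq_n_s8, vst1q_s8};
+    let mut dst = [0_i8; 16];
+    let v = unsafe { vdupq_n_s8(7) };
+    // ST1 {Vt.16B}: all sixteen byte lanes are written out to dst.
+    unsafe { vst1q_s8(dst.as_mut_ptr(), v) };
+    assert_eq!(dst, [7_i8; 16]);
+}
+</pre>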
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s16" type="checkbox"><label for="vst1_s16"><div>void <b>vst1_s16</b> (int16_t * ptr, int16x4_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
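+ <h4>Usage sketch</h4> <p>Editorial aside, not part of the ARM reference: a minimal Rust sketch via core::arch::aarch64, assuming an AArch64 target with NEON; vdup_n_s16 only seeds the register, and the demo wrapper and values are illustrative.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo() {
+    use core::arch::aarch64::{vdup_n_s16, vst1_s16};
+    let mut dst = [0_i16; 4];
+    let v = unsafe { vdup_n_s16(7) };
+    // ST1 {Vt.4H}: all four halfword lanes are written out to dst.
+    unsafe { vst1_s16(dst.as_mut_ptr(), v) };
+    assert_eq!(dst, [7_i16; 4]);
+}
+</pre>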
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s16" type="checkbox"><label for="vst1q_s16"><div>void <b>vst1q_s16</b> (int16_t * ptr, int16x8_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
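+ <h4>Usage sketch</h4> <p>Editorial aside, not part of the ARM reference: a minimal Rust sketch via core::arch::aarch64, assuming an AArch64 target with NEON; vdupq_n_s16 only seeds the register, and the demo wrapper and values are illustrative.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo() {
+    use core::arch::aarch64::{vdupq_n_s16, vst1q_s16};
+    let mut dst = [0_i16; 8];
+    let v = unsafe { vdupq_n_s16(7) };
+    // ST1 {Vt.8H}: all eight halfword lanes are written out to dst.
+    unsafe { vst1q_s16(dst.as_mut_ptr(), v) };
+    assert_eq!(dst, [7_i16; 8]);
+}
+</pre>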
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s32" type="checkbox"><label for="vst1_s32"><div>void <b>vst1_s32</b> (int32_t * ptr, int32x2_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
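+ <h4>Usage sketch (Rust)</h4> <p>Since this file ships with a Rust crate, here is a minimal, hedged sketch of calling the corresponding <code>core::arch::aarch64</code> binding; the demo function and its values are illustrative, not part of the ARM reference.</p>
+ <pre class="codeblock">// Sketch: broadcast a value into an int32x2_t, then store it with vst1_s32.
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+unsafe fn demo_vst1_s32() {
+    use core::arch::aarch64::{vdup_n_s32, vst1_s32};
+    let mut out = [0i32; 2];
+    let v = vdup_n_s32(7);          // both lanes = 7
+    vst1_s32(out.as_mut_ptr(), v);  // ST1 {Vt.2S},[Xn]
+    assert_eq!(out, [7, 7]);
+}</pre>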
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s32" type="checkbox"><label for="vst1q_s32"><div>void <b>vst1q_s32</b> (int32_t * ptr, int32x4_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation pseudocode: identical to the block shown above for vst1_s32.</pre>
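+ <h4>Usage sketch (Rust)</h4> <p>The same pattern for the 128-bit form, hedged as before; the helper name and values are made up for illustration.</p>
+ <pre class="codeblock">// Sketch: store an int32x4_t with vst1q_s32.
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+unsafe fn demo_vst1q_s32() {
+    use core::arch::aarch64::{vdupq_n_s32, vst1q_s32};
+    let mut out = [0i32; 4];
+    vst1q_s32(out.as_mut_ptr(), vdupq_n_s32(-1));  // ST1 {Vt.4S},[Xn]
+    assert_eq!(out, [-1; 4]);
+}</pre>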
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s64" type="checkbox"><label for="vst1_s64"><div>void <b>vst1_s64</b> (int64_t * ptr, int64x1_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation pseudocode: identical to the block shown above for vst1_s32.</pre>
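+ <h4>Usage sketch (Rust)</h4> <p>A hedged one-lane example; <code>int64x1_t</code> holds a single element, so the store writes one doubleword.</p>
+ <pre class="codeblock">// Sketch: store the single lane of an int64x1_t with vst1_s64.
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+unsafe fn demo_vst1_s64() {
+    use core::arch::aarch64::{vdup_n_s64, vst1_s64};
+    let mut out = [0i64; 1];
+    vst1_s64(out.as_mut_ptr(), vdup_n_s64(42));  // ST1 {Vt.1D},[Xn]
+    assert_eq!(out, [42]);
+}</pre>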
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s64" type="checkbox"><label for="vst1q_s64"><div>void <b>vst1q_s64</b> (int64_t * ptr, int64x2_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation pseudocode: identical to the block shown above for vst1_s32.</pre>
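+ <h4>Usage sketch (Rust)</h4> <p>The two-lane 64-bit variant, sketched under the same assumptions as the examples above.</p>
+ <pre class="codeblock">// Sketch: store an int64x2_t with vst1q_s64.
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+unsafe fn demo_vst1q_s64() {
+    use core::arch::aarch64::{vdupq_n_s64, vst1q_s64};
+    let mut out = [0i64; 2];
+    vst1q_s64(out.as_mut_ptr(), vdupq_n_s64(9));  // ST1 {Vt.2D},[Xn]
+    assert_eq!(out, [9, 9]);
+}</pre>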
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u8" type="checkbox"><label for="vst1_u8"><div>void <b>vst1_u8</b> (uint8_t * ptr, uint8x8_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation pseudocode: identical to the block shown above for vst1_s32.</pre>
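+ <h4>Usage sketch (Rust)</h4> <p>An illustrative unsigned-byte store; as before, the function and data are hypothetical.</p>
+ <pre class="codeblock">// Sketch: store a uint8x8_t with vst1_u8.
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+unsafe fn demo_vst1_u8() {
+    use core::arch::aarch64::{vdup_n_u8, vst1_u8};
+    let mut out = [0u8; 8];
+    vst1_u8(out.as_mut_ptr(), vdup_n_u8(0xAB));  // ST1 {Vt.8B},[Xn]
+    assert_eq!(out, [0xAB; 8]);
+}</pre>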
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u8" type="checkbox"><label for="vst1q_u8"><div>void <b>vst1q_u8</b> (uint8_t * ptr, uint8x16_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation pseudocode: identical to the block shown above for vst1_s32.</pre>
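+ <h4>Usage sketch (Rust)</h4> <p>The full 16-byte register store, again as a hedged sketch rather than normative documentation.</p>
+ <pre class="codeblock">// Sketch: store a uint8x16_t with vst1q_u8.
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+unsafe fn demo_vst1q_u8() {
+    use core::arch::aarch64::{vdupq_n_u8, vst1q_u8};
+    let mut out = [0u8; 16];
+    vst1q_u8(out.as_mut_ptr(), vdupq_n_u8(0x5A));  // ST1 {Vt.16B},[Xn]
+    assert_eq!(out, [0x5A; 16]);
+}</pre>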
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u16" type="checkbox"><label for="vst1_u16"><div>void <b>vst1_u16</b> (uint16_t * ptr, uint16x4_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation pseudocode: identical to the block shown above for vst1_s32.</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u16" type="checkbox"><label for="vst1q_u16"><div>void <b>vst1q_u16</b> (uint16_t * ptr, uint16x8_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores elements to memory from a SIMD&amp;FP register, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
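+<h4>Example</h4> <p>A minimal Rust sketch, added for illustration and not part of the upstream ARM data: assuming this crate's core::arch::aarch64 bindings and a hypothetical buffer named buf, it shows that vst1q_u16 writes all eight 16-bit lanes of the source vector, not a single lane.</p>
+<pre class="codeblock">// Hedged sketch: broadcast one value, then store every u16 lane to memory.
+#[cfg(target_arch = "aarch64")]
+fn demo_vst1q_u16() {
+    use core::arch::aarch64::{vdupq_n_u16, vst1q_u16};
+    let mut buf = [0u16; 8]; // hypothetical destination buffer
+    // SAFETY: `buf` is valid, writable storage for eight contiguous u16s.
+    unsafe { vst1q_u16(buf.as_mut_ptr(), vdupq_n_u16(7)) };
+    assert_eq!(buf, [7u16; 8]); // all eight lanes were stored
+}
+</pre>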
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u32" type="checkbox"><label for="vst1_u32"><div>void <b>vst1_u32</b> (uint32_t * ptr, uint32x2_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores elements to memory from a SIMD&amp;FP register, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
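+<h4>Example</h4> <p>A minimal Rust sketch, illustrative only (the array names a, b, and out are hypothetical): load two u32 lanes from each input, add them element-wise, and store the resulting 64-bit vector with vst1_u32.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vst1_u32() {
+    use core::arch::aarch64::{vadd_u32, vld1_u32, vst1_u32};
+    let a = [1u32, 2];
+    let b = [10u32, 20];
+    let mut out = [0u32; 2];
+    // SAFETY: every pointer refers to a valid two-element u32 array.
+    unsafe {
+        let sum = vadd_u32(vld1_u32(a.as_ptr()), vld1_u32(b.as_ptr()));
+        vst1_u32(out.as_mut_ptr(), sum); // stores both lanes contiguously
+    }
+    assert_eq!(out, [11, 22]);
+}
+</pre>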
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u32" type="checkbox"><label for="vst1q_u32"><div>void <b>vst1q_u32</b> (uint32_t * ptr, uint32x4_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores elements to memory from a SIMD&amp;FP register, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u64" type="checkbox"><label for="vst1_u64"><div>void <b>vst1_u64</b> (uint64_t * ptr, uint64x1_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores elements to memory from a SIMD&amp;FP register, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
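+<h4>Example</h4> <p>A minimal Rust sketch, illustrative only: build a one-lane u64 vector with vcreate_u64 and store it; for the .1D arrangement the single lane is the whole 64-bit register, so exactly one doubleword is written.</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn demo_vst1_u64() {
+    use core::arch::aarch64::{vcreate_u64, vst1_u64};
+    let mut out = [0u64; 1];
+    // SAFETY: `out` is valid storage for a single u64.
+    unsafe { vst1_u64(out.as_mut_ptr(), vcreate_u64(0xDEAD_BEEF)) };
+    assert_eq!(out[0], 0xDEAD_BEEF);
+}
+</pre>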
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u64" type="checkbox"><label for="vst1q_u64"><div>void <b>vst1q_u64</b> (uint64_t * ptr, uint64x2_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores elements to memory from a SIMD&amp;FP register, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p64" type="checkbox"><label for="vst1_p64"><div>void <b>vst1_p64</b> (poly64_t * ptr, poly64x1_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores elements to memory from a SIMD&amp;FP register, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p64" type="checkbox"><label for="vst1q_p64"><div>void <b>vst1q_p64</b> (poly64_t * ptr, poly64x2_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores elements to memory from a SIMD&amp;FP register, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f16" type="checkbox"><label for="vst1_f16"><div>void <b><b>vst1_f16</b></b> (float16_t * ptr, float16x4_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores every element of a SIMD&amp;FP register to memory, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f16" type="checkbox"><label for="vst1q_f16"><div>void <b><b>vst1q_f16</b></b> (float16_t * ptr, float16x8_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores every element of a SIMD&amp;FP register to memory, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f32" type="checkbox"><label for="vst1_f32"><div>void <b><b>vst1_f32</b></b> (float32_t * ptr, float32x2_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores every element of a SIMD&amp;FP register to memory, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f32" type="checkbox"><label for="vst1q_f32"><div>void <b><b>vst1q_f32</b></b> (float32_t * ptr, float32x4_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores every element of a SIMD&amp;FP register to memory, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p8" type="checkbox"><label for="vst1_p8"><div>void <b><b>vst1_p8</b></b> (poly8_t * ptr, poly8x8_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores every element of a SIMD&amp;FP register to memory, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p8" type="checkbox"><label for="vst1q_p8"><div>void <b><b>vst1q_p8</b></b> (poly8_t * ptr, poly8x16_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores every element of a SIMD&amp;FP register to memory, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p16" type="checkbox"><label for="vst1_p16"><div>void <b>vst1_p16</b> (poly16_t * ptr, poly16x4_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores the elements of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
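+ <h4>Example</h4> <p class="aml">A minimal C usage sketch, not part of the ARM reference: the function and buffer names are hypothetical, and an AArch64 target with the arm_neon.h header is assumed. It stores all four 16-bit lanes of the vector:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Splat one value into every lane, then store the whole vector to dst. */
+void store_p16x4(poly16_t *dst) {
+    poly16x4_t v = vdup_n_p16(0x1234);
+    vst1_p16(dst, v);  /* compiles to ST1 {Vt.4H},[Xn] */
+}</pre>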
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p16" type="checkbox"><label for="vst1q_p16"><div>void <b>vst1q_p16</b> (poly16_t * ptr, poly16x8_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores the elements of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
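+ <h4>Example</h4> <p class="aml">A minimal C sketch along the same lines (hypothetical names; AArch64 with arm_neon.h assumed), storing all eight 16-bit lanes of the 128-bit vector:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store a full 128-bit poly16 vector to dst. */
+void store_p16x8(poly16_t *dst) {
+    poly16x8_t v = vdupq_n_p16(0xABCD);
+    vst1q_p16(dst, v);  /* compiles to ST1 {Vt.8H},[Xn] */
+}</pre>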
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f64" type="checkbox"><label for="vst1_f64"><div>void <b>vst1_f64</b> (float64_t * ptr, float64x1_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores the elements of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
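+ <h4>Example</h4> <p class="aml">A minimal C sketch (hypothetical names; A64-only target with arm_neon.h assumed), storing the single double-precision lane:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store the one float64 lane of a 64-bit vector to dst. */
+void store_f64x1(float64_t *dst) {
+    float64x1_t v = vdup_n_f64(3.5);
+    vst1_f64(dst, v);  /* compiles to ST1 {Vt.1D},[Xn] */
+}</pre>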
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f64" type="checkbox"><label for="vst1q_f64"><div>void <b>vst1q_f64</b> (float64_t * ptr, float64x2_t val)<span class="right">Store multiple single-element structures from one register</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one register. This instruction stores the elements of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
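+ <h4>Example</h4> <p class="aml">A minimal C sketch (hypothetical names; A64-only target with arm_neon.h assumed), storing both double-precision lanes:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store a full 128-bit float64 vector to dst. */
+void store_f64x2(float64_t *dst) {
+    float64x2_t v = vdupq_n_f64(2.0);
+    vst1q_f64(dst, v);  /* compiles to ST1 {Vt.2D},[Xn] */
+}</pre>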
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_s8" type="checkbox"><label for="vst1_lane_s8"><div>void <b>vst1_lane_s8</b> (int8_t * ptr, int8x8_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
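+ <h4>Example</h4> <p class="aml">A minimal C sketch (hypothetical names; AArch64 with arm_neon.h assumed). Only the selected lane is written to memory, so lane must be an integer constant expression in the range shown above:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store only lane 3 of v; the other seven lanes are not written. */
+void store_lane3_s8(int8_t *dst, int8x8_t v) {
+    vst1_lane_s8(dst, v, 3);  /* lane is a constant in 0..7 */
+}</pre>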
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_s8" type="checkbox"><label for="vst1q_lane_s8"><div>void <b>vst1q_lane_s8</b> (int8_t * ptr, int8x16_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
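+ <h4>Example</h4> <p class="aml">A minimal C sketch (hypothetical names; AArch64 with arm_neon.h assumed), storing only the highest lane of a 128-bit vector:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store only lane 15 of v; the other fifteen lanes are not written. */
+void store_lane15_s8(int8_t *dst, int8x16_t v) {
+    vst1q_lane_s8(dst, v, 15);  /* lane is a constant in 0..15 */
+}</pre>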
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_s16" type="checkbox"><label for="vst1_lane_s16"><div>void <b>vst1_lane_s16</b> (int16_t * ptr, int16x4_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_s16" type="checkbox"><label for="vst1q_lane_s16"><div>void <b>vst1q_lane_s16</b> (int16_t * ptr, int16x8_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_s32" type="checkbox"><label for="vst1_lane_s32"><div>void <b>vst1_lane_s32</b> (int32_t * ptr, int32x2_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_s32" type="checkbox"><label for="vst1q_lane_s32"><div>void <b>vst1q_lane_s32</b> (int32_t * ptr, int32x4_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_s64" type="checkbox"><label for="vst1_lane_s64"><div>void <b>vst1_lane_s64</b> (int64_t * ptr, int64x1_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_s64" type="checkbox"><label for="vst1q_lane_s64"><div>void <b>vst1q_lane_s64</b> (int64_t * ptr, int64x2_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_u8" type="checkbox"><label for="vst1_lane_u8"><div>void <b>vst1_lane_u8</b> (uint8_t * ptr, uint8x8_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_u8" type="checkbox"><label for="vst1q_lane_u8"><div>void <b>vst1q_lane_u8</b> (uint8_t * ptr, uint8x16_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
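+ <h4>Usage example</h4><p>A minimal C sketch (illustrative only, not part of the Arm reference; the function and buffer names are hypothetical): store one lane of a loaded vector. The lane index must be a compile-time constant with 0 &lt;= lane &lt;= 15.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy src[15] to *dst via a single-lane store; only one byte is written. */
+void copy_last_byte(const uint8_t src[16], uint8_t *dst) {
+    uint8x16_t v = vld1q_u8(src);  /* load all 16 lanes */
+    vst1q_lane_u8(dst, v, 15);     /* store lane 15 only */
+}</pre>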
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_u16" type="checkbox"><label for="vst1_lane_u16"><div>void <b>vst1_lane_u16</b> (uint16_t * ptr, uint16x4_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
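+ <h4>Usage example</h4><p>A minimal C sketch (illustrative only, not part of the Arm reference; names are hypothetical): broadcast a value into a 4-lane vector and store a single 16-bit lane. The lane index must be a compile-time constant with 0 &lt;= lane &lt;= 3.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Writes exactly two bytes: *dst = 0xBEEF. */
+void store_one_u16(uint16_t *dst) {
+    uint16x4_t v = vdup_n_u16(0xBEEFu);  /* all 4 lanes = 0xBEEF */
+    vst1_lane_u16(dst, v, 2);            /* store lane 2 only */
+}</pre>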
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_u16" type="checkbox"><label for="vst1q_lane_u16"><div>void <b>vst1q_lane_u16</b> (uint16_t * ptr, uint16x8_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
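+ <h4>Usage example</h4><p>A minimal C sketch (illustrative only, not part of the Arm reference; names are hypothetical): scatter two lanes of an 8-lane vector to separate destinations. Each lane index must be a compile-time constant with 0 &lt;= lane &lt;= 7.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* *first = src[0] and *last = src[7], each via a single-lane store. */
+void store_ends(const uint16_t src[8], uint16_t *first, uint16_t *last) {
+    uint16x8_t v = vld1q_u16(src);
+    vst1q_lane_u16(first, v, 0);
+    vst1q_lane_u16(last, v, 7);
+}</pre>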
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_u32" type="checkbox"><label for="vst1_lane_u32"><div>void <b>vst1_lane_u32</b> (uint32_t * ptr, uint32x2_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
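+ <h4>Usage example</h4><p>A minimal C sketch (illustrative only, not part of the Arm reference; names are hypothetical): store the upper lane of a 2-lane vector. The lane index must be a compile-time constant, here either 0 or 1.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* *dst = src[1]; only four bytes are written. */
+void store_high_u32(const uint32_t src[2], uint32_t *dst) {
+    uint32x2_t v = vld1_u32(src);
+    vst1_lane_u32(dst, v, 1);  /* store lane 1 only */
+}</pre>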
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_u32" type="checkbox"><label for="vst1q_lane_u32"><div>void <b>vst1q_lane_u32</b> (uint32_t * ptr, uint32x4_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
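+ <h4>Usage example</h4><p>A minimal C sketch (illustrative only, not part of the Arm reference; names are hypothetical): add two 4-lane vectors and store just one lane of the result. The lane index must be a compile-time constant with 0 &lt;= lane &lt;= 3.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* *dst = a[3] + b[3], computed in SIMD registers, stored as one lane. */
+void store_sum_lane3(const uint32_t a[4], const uint32_t b[4], uint32_t *dst) {
+    uint32x4_t s = vaddq_u32(vld1q_u32(a), vld1q_u32(b));
+    vst1q_lane_u32(dst, s, 3);
+}</pre>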
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_u64" type="checkbox"><label for="vst1_lane_u64"><div>void <b>vst1_lane_u64</b> (uint64_t * ptr, uint64x1_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
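+ <h4>Usage example</h4><p>A minimal C sketch (illustrative only, not part of the Arm reference; names are hypothetical): uint64x1_t has a single lane, so the only valid lane index is 0.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* *dst = *src, moved through a SIMD&amp;FP register. */
+void store_u64(const uint64_t *src, uint64_t *dst) {
+    uint64x1_t v = vld1_u64(src);
+    vst1_lane_u64(dst, v, 0);  /* lane 0 is the only lane */
+}</pre>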
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_u64" type="checkbox"><label for="vst1q_lane_u64"><div>void <b>vst1q_lane_u64</b> (uint64_t * ptr, uint64x2_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
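+ <h4>Usage sketch</h4> <p>A minimal C example added for illustration; it is not part of the ARM reference. It loads both 64-bit lanes and then stores only lane 1 through vst1q_lane_u64, assuming an AArch64 toolchain providing &lt;arm_neon.h&gt;; the function name, buffer, and values are placeholders.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store the upper 64-bit lane of a uint64x2_t; the lane index
+   must be a compile-time constant in the range 0..1. */
+void store_high_lane(uint64_t *out)
+{
+    uint64_t data[2] = { 10u, 20u };
+    uint64x2_t v = vld1q_u64(data); /* v = { 10, 20 } */
+    vst1q_lane_u64(out, v, 1);      /* *out == 20 */
+}
+</pre>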
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_p64" type="checkbox"><label for="vst1_lane_p64"><div>void <b><b>vst1_lane_p64</b></b> (poly64_t * ptr, poly64x1_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
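+ <h4>Usage sketch</h4> <p>An editorial C sketch, not part of the ARM reference: poly64 types are only available on targets with the relevant crypto/AES support, which this example assumes. Since poly64x1_t has a single lane, 0 is the only valid lane index.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Build a one-lane poly64 vector and store its only lane. */
+void store_poly(poly64_t *out)
+{
+    poly64x1_t v = vcreate_p64(0x1234u);
+    vst1_lane_p64(out, v, 0); /* lane must be 0 */
+}
+</pre>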
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_p64" type="checkbox"><label for="vst1q_lane_p64"><div>void <b><b>vst1q_lane_p64</b></b> (poly64_t * ptr, poly64x2_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
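+ <h4>Usage sketch</h4> <p>As above, an illustrative C fragment rather than ARM's own material, and again assuming a target where the poly64 types exist: the q-form vector holds two poly64 lanes, so the constant lane index may be 0 or 1.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store lane 1 of a two-lane poly64 vector loaded from memory. */
+void store_poly_lane(poly64_t *out, const poly64_t in[2])
+{
+    poly64x2_t v = vld1q_p64(in);
+    vst1q_lane_p64(out, v, 1); /* writes in[1] to *out */
+}
+</pre>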
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_f16" type="checkbox"><label for="vst1_lane_f16"><div>void <b><b>vst1_lane_f16</b></b> (float16_t * ptr, float16x4_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
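+ <h4>Usage sketch</h4> <p>An editor-supplied C sketch, under the assumption that the toolchain provides the half-precision storage type float16_t (__fp16); it is not part of the ARM reference. The vector has four half-precision lanes, so the lane constant ranges over 0..3.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store the last of four half-precision lanes. */
+void store_half(float16_t *out, const float16_t in[4])
+{
+    float16x4_t v = vld1_f16(in);
+    vst1_lane_f16(out, v, 3); /* writes in[3] to *out */
+}
+</pre>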
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_f16" type="checkbox"><label for="vst1q_lane_f16"><div>void <b><b>vst1q_lane_f16</b></b> (float16_t * ptr, float16x8_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
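+ <h4>Usage sketch</h4> <p>The q-form counterpart of the previous sketch, with the same caveats (editorial illustration; float16_t support assumed): eight half-precision lanes, lane constant in 0..7.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store one chosen lane out of eight half-precision lanes. */
+void store_half_q(float16_t *out, const float16_t in[8])
+{
+    float16x8_t v = vld1q_f16(in);
+    vst1q_lane_f16(out, v, 7); /* writes in[7] to *out */
+}
+</pre>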
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_lane_f32" type="checkbox"><label for="vst1_lane_f32"><div>void <b><b>vst1_lane_f32</b></b> (float32_t * ptr, float32x2_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
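+ <h4>Usage sketch</h4> <p>A short illustrative C example (editorial, not ARM's): float32_t is the ACLE typedef for float, and the two-lane vector accepts lane constants 0 or 1. Names and values are placeholders.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store only the second single-precision lane. */
+void store_float_lane(float32_t *out)
+{
+    float32_t data[2] = { 1.0f, 2.0f };
+    float32x2_t v = vld1_f32(data); /* v = { 1.0, 2.0 } */
+    vst1_lane_f32(out, v, 1);       /* *out == 2.0f */
+}
+</pre>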
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_lane_f32" type="checkbox"><label for="vst1q_lane_f32"><div>void <b><b>vst1q_lane_f32</b></b> (float32_t * ptr, float32x4_t val, const int lane)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Vt.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
+<h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>void <b>vst1_lane_p8</b> (poly8_t * ptr, poly8x8_t val, const int lane) <span class="right">Store a single-element structure from one lane of one register</span></div><article>
+<h4>Description</h4>
+<p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4>
+<pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.b}[lane],[Xn]</pre>
+<h4>Argument Preparation</h4>
+<pre>ptr &rarr; Xn
+val &rarr; Vt.8B
+0 &lt;= lane &lt;= 7</pre>
+<h4>Results</h4>
+<pre>void &rarr; result</pre>
+<h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
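+<h4>Example (non-normative)</h4>
+<p>A minimal C call site, added here for illustration only; it is not part of the ARM reference. The helper name is ours, and it assumes a toolchain providing &lt;arm_neon.h&gt; for an AArch64 or ARMv7 NEON target.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store byte lane 3 of an 8-lane vector: exactly one byte of memory is
+   written, matching the ST1 {Vt.b}[lane],[Xn] form shown above. */
+void store_poly8_lane3(poly8_t *dst, poly8x8_t v) {
+    vst1_lane_p8(dst, v, 3);  /* lane must be a compile-time constant in [0, 7] */
+}</pre>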
+<h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>void <b>vst1q_lane_p8</b> (poly8_t * ptr, poly8x16_t val, const int lane) <span class="right">Store a single-element structure from one lane of one register</span></div><article>
+<h4>Description</h4>
+<p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4>
+<pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.b}[lane],[Xn]</pre>
+<h4>Argument Preparation</h4>
+<pre>ptr &rarr; Xn
+val &rarr; Vt.16B
+0 &lt;= lane &lt;= 15</pre>
+<h4>Results</h4>
+<pre>void &rarr; result</pre>
+<h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
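+<h4>Example (non-normative)</h4>
+<p>An illustrative sketch, not part of the ARM reference (the helper name is ours). The q-form takes a 16-lane vector, so the lane immediate ranges over [0, 15]:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store the highest byte lane of a 128-bit vector; only one byte is written. */
+void store_poly8q_top_lane(poly8_t *dst, poly8x16_t v) {
+    vst1q_lane_p8(dst, v, 15);
+}</pre>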
+<h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>void <b>vst1_lane_p16</b> (poly16_t * ptr, poly16x4_t val, const int lane) <span class="right">Store a single-element structure from one lane of one register</span></div><article>
+<h4>Description</h4>
+<p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4>
+<pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.h}[lane],[Xn]</pre>
+<h4>Argument Preparation</h4>
+<pre>ptr &rarr; Xn
+val &rarr; Vt.4H
+0 &lt;= lane &lt;= 3</pre>
+<h4>Results</h4>
+<pre>void &rarr; result</pre>
+<h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
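+<h4>Example (non-normative)</h4>
+<p>Illustration only, with a helper name of our choosing: each poly16 lane is two bytes wide, so this call writes exactly two bytes of memory.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void store_poly16_lane2(poly16_t *dst, poly16x4_t v) {
+    vst1_lane_p16(dst, v, 2);  /* lane in [0, 3], compile-time constant */
+}</pre>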
+<h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>void <b>vst1q_lane_p16</b> (poly16_t * ptr, poly16x8_t val, const int lane) <span class="right">Store a single-element structure from one lane of one register</span></div><article>
+<h4>Description</h4>
+<p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4>
+<pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.h}[lane],[Xn]</pre>
+<h4>Argument Preparation</h4>
+<pre>ptr &rarr; Xn
+val &rarr; Vt.8H
+0 &lt;= lane &lt;= 7</pre>
+<h4>Results</h4>
+<pre>void &rarr; result</pre>
+<h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
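+<h4>Example (non-normative)</h4>
+<p>A sketch of our own, not from the ARM reference. Because the lane index must be an immediate, scattering several lanes means one call per lane rather than a runtime loop:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Scatter two chosen halfword lanes to separate destinations. */
+void scatter_poly16q(poly16_t *a, poly16_t *b, poly16x8_t v) {
+    vst1q_lane_p16(a, v, 0);
+    vst1q_lane_p16(b, v, 5);  /* lane in [0, 7], compile-time constant */
+}</pre>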
+<h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><div>void <b>vst1_lane_f64</b> (float64_t * ptr, float64x1_t val, const int lane) <span class="right">Store a single-element structure from one lane of one register</span></div><article>
+<h4>Description</h4>
+<p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4>
+<pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.d}[lane],[Xn]</pre>
+<h4>Argument Preparation</h4>
+<pre>ptr &rarr; Xn
+val &rarr; Vt.1D
+0 &lt;= lane &lt;= 0</pre>
+<h4>Results</h4>
+<pre>void &rarr; result</pre>
+<h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
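+<h4>Example (non-normative)</h4>
+<p>Illustration only (helper name ours); this intrinsic and the float64x1_t type are A64-only, so it assumes an AArch64 target:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* With a 1-element vector, lane 0 is the only valid index; this stores
+   the full 8-byte double. */
+void store_f64(float64_t *dst, float64x1_t v) {
+    vst1_lane_f64(dst, v, 0);
+}</pre>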
+<h4>Supported architectures</h4> <p>A64</p> </article> </div>
+<div class="intrinsic"><div>void <b>vst1q_lane_f64</b> (float64_t * ptr, float64x2_t val, const int lane) <span class="right">Store a single-element structure from one lane of one register</span></div><article>
+<h4>Description</h4>
+<p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4>
+<pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.d}[lane],[Xn]</pre>
+<h4>Argument Preparation</h4>
+<pre>ptr &rarr; Xn
+val &rarr; Vt.2D
+0 &lt;= lane &lt;= 1</pre>
+<h4>Results</h4>
+<pre>void &rarr; result</pre>
+<h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
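+<h4>Example (non-normative)</h4>
+<p>An illustrative, A64-only sketch that is not part of the ARM reference (helper name ours):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store only the upper double of a 2-lane vector, e.g. the imaginary
+   half of a packed complex number. */
+void store_high_f64(float64_t *dst, float64x2_t v) {
+    vst1q_lane_f64(dst, v, 1);  /* lane is 0 or 1 */
+}</pre>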
+<h4>Supported architectures</h4> <p>A64</p> </article> </div>
+<div class="intrinsic"><div>int8x8x2_t <b>vld2_s8</b> (int8_t const * ptr) <span class="right">Load multiple 2-element structures to two registers</span></div><article>
+<h4>Description</h4>
+<p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+<h4>A64 Instruction</h4>
+<pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.8B - Vt2.8B},[Xn]</pre>
+<h4>Argument Preparation</h4>
+<pre>ptr &rarr; Xn</pre>
+<h4>Results</h4>
+<pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]</pre>
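+<h4>Example (non-normative)</h4>
+<p>Illustration only (helper name ours): vld2_s8 reads 16 bytes and de-interleaves them, which is handy for splitting packed pairs such as interleaved stereo samples.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* result.val[0] receives bytes 0,2,4,...,14 and result.val[1] bytes 1,3,...,15. */
+int8x8x2_t split_pairs(const int8_t interleaved[16]) {
+    return vld2_s8(interleaved);
+}</pre>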
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
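+<h4>Example</h4><p class="aml">A minimal C sketch (illustrative, not from the ARM reference), assuming an AArch64 toolchain with arm_neon.h; the helper name split_even_odd is hypothetical. It shows the de-interleaving effect: 16 contiguous bytes a0,b0,a1,b1,... land as the a-bytes in val[0] and the b-bytes in val[1].</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: split 16 interleaved bytes into two 8-byte streams. */
+void split_even_odd(const int8_t *src, int8_t *a, int8_t *b) {
+    int8x8x2_t v = vld2_s8(src);   /* val[0] = even-index bytes, val[1] = odd-index bytes */
+    vst1_s8(a, v.val[0]);
+    vst1_s8(b, v.val[1]);
+}
+</pre>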
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_s8" type="checkbox"><label for="vld2q_s8"><div>int8x16x2_t <b><b>vld2q_s8</b></b> (int8_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
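+<h4>Example</h4><p class="aml">An illustrative C sketch (not from the ARM reference) of the 128-bit form, assuming an AArch64 toolchain with arm_neon.h; the helper name is hypothetical. It consumes 32 contiguous bytes and de-interleaves them into two 16-byte registers.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: split 32 interleaved bytes into two 16-byte streams. */
+void split_even_odd_q(const int8_t *src, int8_t *a, int8_t *b) {
+    int8x16x2_t v = vld2q_s8(src);   /* reads 32 bytes: val[0] even indices, val[1] odd */
+    vst1q_s8(a, v.val[0]);
+    vst1q_s8(b, v.val[1]);
+}
+</pre>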
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_s16" type="checkbox"><label for="vld2_s16"><div>int16x4x2_t <b><b>vld2_s16</b></b> (int16_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
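+<h4>Example</h4><p class="aml">An illustrative C sketch (not from the ARM reference): one common use of this form is splitting interleaved stereo samples into separate channels. Assumes an AArch64 toolchain with arm_neon.h; the helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: split four interleaved L/R 16-bit frames into channel buffers. */
+void split_stereo4(const int16_t *frames, int16_t *left, int16_t *right) {
+    int16x4x2_t v = vld2_s16(frames);  /* val[0] = L samples, val[1] = R samples */
+    vst1_s16(left,  v.val[0]);
+    vst1_s16(right, v.val[1]);
+}
+</pre>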
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_s16" type="checkbox"><label for="vld2q_s16"><div>int16x8x2_t <b><b>vld2q_s16</b></b> (int16_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
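+<h4>Example</h4><p class="aml">An illustrative C sketch (not from the ARM reference) of the 128-bit form in a loop, eight interleaved pairs per iteration. Assumes an AArch64 toolchain with arm_neon.h; the helper name and the requirement that n be a multiple of 8 are choices of this sketch.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Hypothetical helper: de-interleave n L/R frames (n a multiple of 8). */
+void split_stereo_q(const int16_t *frames, int16_t *l, int16_t *r, size_t n) {
+    for (size_t i = 0; i &lt; n; i += 8) {
+        int16x8x2_t v = vld2q_s16(frames + 2 * i);  /* 8 frames = 16 int16 values */
+        vst1q_s16(l + i, v.val[0]);
+        vst1q_s16(r + i, v.val[1]);
+    }
+}
+</pre>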
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_s32" type="checkbox"><label for="vld2_s32"><div>int32x2x2_t <b><b>vld2_s32</b></b> (int32_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
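+<h4>Example</h4><p class="aml">An illustrative C sketch (not from the ARM reference): de-interleaving packed (re, im) 32-bit pairs, a layout typical of complex integer data. Assumes an AArch64 toolchain with arm_neon.h; the helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: split two packed (re, im) pairs into separate arrays. */
+void split_complex2(const int32_t *z, int32_t *re, int32_t *im) {
+    int32x2x2_t v = vld2_s32(z);   /* val[0] = {re0, re1}, val[1] = {im0, im1} */
+    vst1_s32(re, v.val[0]);
+    vst1_s32(im, v.val[1]);
+}
+</pre>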
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_s32" type="checkbox"><label for="vld2q_s32"><div>int32x4x2_t <b><b>vld2q_s32</b></b> (int32_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
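+<h4>Example</h4><p class="aml">An illustrative C sketch (not from the ARM reference) of the 128-bit form; the helper name is hypothetical. With optimization enabled, compilers typically lower this intrinsic to the LD2 encoding listed above, though that is compiler behavior rather than a guarantee of this reference.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: split four packed (re, im) pairs into separate arrays. */
+void split_complex4(const int32_t *z, int32_t *re, int32_t *im) {
+    int32x4x2_t v = vld2q_s32(z);  /* reads 8 int32 values */
+    vst1q_s32(re, v.val[0]);
+    vst1q_s32(im, v.val[1]);
+}
+</pre>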
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_u8" type="checkbox"><label for="vld2_u8"><div>uint8x8x2_t <b><b>vld2_u8</b></b> (uint8_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
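+ <h4>Example</h4> <p>Editorial sketch, not part of the ARM reference: a minimal C use of vld2_u8, assuming an AArch64 toolchain with arm_neon.h; the helper name and data layout are illustrative.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* De-interleave 16 alternating x,y bytes: val[0] receives x0..x7, val[1] receives y0..y7. */
+void split_xy_u8(const uint8_t interleaved[16], uint8_t x[8], uint8_t y[8]) {
+    uint8x8x2_t r = vld2_u8(interleaved);  /* compiles to LD2 {Vt.8B - Vt2.8B},[Xn] */
+    vst1_u8(x, r.val[0]);
+    vst1_u8(y, r.val[1]);
+}
+</pre>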
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_u8" type="checkbox"><label for="vld2q_u8"><div>uint8x16x2_t <b><b>vld2q_u8</b></b> (uint8_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
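+ <h4>Example</h4> <p>Editorial sketch (assumed AArch64 C toolchain, illustrative names): vld2q_u8 splits 32 interleaved bytes into two 16-byte streams, here followed by a horizontal sum of the even stream.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sum the even-indexed bytes of 32 interleaved (even,odd) bytes. */
+unsigned sum_even_bytes(const uint8_t interleaved[32]) {
+    uint8x16x2_t r = vld2q_u8(interleaved);  /* LD2 {Vt.16B - Vt2.16B},[Xn] */
+    uint16x8_t acc = vpaddlq_u8(r.val[0]);   /* widening pairwise add */
+    return vaddvq_u16(acc);                  /* across-vector sum (A64) */
+}
+</pre>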
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_u16" type="checkbox"><label for="vld2_u16"><div>uint16x4x2_t <b><b>vld2_u16</b></b> (uint16_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
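+ <h4>Example</h4> <p>Editorial sketch (assumptions as above): de-interleaving four stereo audio frames with vld2_u16.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Split 4 interleaved L,R sample pairs into per-channel buffers. */
+void split_stereo_u16(const uint16_t frames[8], uint16_t left[4], uint16_t right[4]) {
+    uint16x4x2_t r = vld2_u16(frames);  /* LD2 {Vt.4H - Vt2.4H},[Xn] */
+    vst1_u16(left,  r.val[0]);
+    vst1_u16(right, r.val[1]);
+}
+</pre>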
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_u16" type="checkbox"><label for="vld2q_u16"><div>uint16x8x2_t <b><b>vld2q_u16</b></b> (uint16_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
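+ <h4>Example</h4> <p>Editorial sketch (assumptions as above): pairing vld2q_u16 with its store counterpart vst2q_u16 swaps the two interleaved streams in place.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Swap the even/odd 16-bit streams of 16 interleaved elements. */
+void swap_streams_u16(uint16_t buf[16]) {
+    uint16x8x2_t r = vld2q_u16(buf);  /* LD2 {Vt.8H - Vt2.8H},[Xn] */
+    uint16x8_t tmp = r.val[0];
+    r.val[0] = r.val[1];
+    r.val[1] = tmp;
+    vst2q_u16(buf, r);                /* re-interleave with ST2 */
+}
+</pre>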
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_u32" type="checkbox"><label for="vld2_u32"><div>uint32x2x2_t <b><b>vld2_u32</b></b> (uint32_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
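+ <h4>Example</h4> <p>Editorial sketch (assumptions as above): accumulating interleaved (x,y) coordinate pairs with vld2_u32.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Add two interleaved (x,y) u32 pairs into separate x/y accumulators. */
+void accumulate_xy_u32(const uint32_t pairs[4], uint32x2_t *xacc, uint32x2_t *yacc) {
+    uint32x2x2_t r = vld2_u32(pairs);  /* LD2 {Vt.2S - Vt2.2S},[Xn] */
+    *xacc = vadd_u32(*xacc, r.val[0]);
+    *yacc = vadd_u32(*yacc, r.val[1]);
+}
+</pre>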
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_u32" type="checkbox"><label for="vld2q_u32"><div>uint32x4x2_t <b><b>vld2q_u32</b></b> (uint32_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
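+ <h4>Example</h4> <p>Editorial sketch (assumptions as above): XOR-folding the two streams produced by vld2q_u32.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* XOR the even and odd streams of 8 interleaved u32 words. */
+uint32x4_t fold_streams_u32(const uint32_t words[8]) {
+    uint32x4x2_t r = vld2q_u32(words);  /* LD2 {Vt.4S - Vt2.4S},[Xn] */
+    return veorq_u32(r.val[0], r.val[1]);
+}
+</pre>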
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_f16" type="checkbox"><label for="vld2_f16"><div>float16x4x2_t <b><b>vld2_f16</b></b> (float16_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_f16" type="checkbox"><label for="vld2q_f16"><div>float16x8x2_t <b><b>vld2q_f16</b></b> (float16_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
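+ <h4>Example</h4> <p>A minimal C usage sketch, not part of the ARM reference. It assumes a toolchain with Armv8.2-A half-precision support (for example, compiling with an fp16-enabled target), and the helper name deinterleave_f16 is illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Split eight interleaved (a,b) half-precision pairs into planar arrays. */
+void deinterleave_f16(const float16_t *src, float16_t *a, float16_t *b) {
+    float16x8x2_t v = vld2q_f16(src); /* loads 16 halfwords, de-interleaved */
+    vst1q_f16(a, v.val[0]);           /* even-indexed elements */
+    vst1q_f16(b, v.val[1]);           /* odd-indexed elements  */
+}
+</pre>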
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_f32" type="checkbox"><label for="vld2_f32"><div>float32x2x2_t <b><b>vld2_f32</b></b> (float32_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
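+ <h4>Example</h4> <p>A minimal C usage sketch, not part of the ARM reference: the 64-bit form de-interleaves two complex floats stored as (re0, im0, re1, im1). The helper name split_complex2 is illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* src holds two complex floats as (re0, im0, re1, im1). */
+void split_complex2(const float32_t *src, float32_t re[2], float32_t im[2]) {
+    float32x2x2_t v = vld2_f32(src); /* de-interleaves 4 floats */
+    vst1_f32(re, v.val[0]);          /* re0, re1 */
+    vst1_f32(im, v.val[1]);          /* im0, im1 */
+}
+</pre>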
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_f32" type="checkbox"><label for="vld2q_f32"><div>float32x4x2_t <b><b>vld2q_f32</b></b> (float32_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
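+ <h4>Example</h4> <p>A minimal C usage sketch, not part of the ARM reference: the 128-bit form splits interleaved stereo samples into separate left/right planes, four frames per iteration. The helper name split_stereo and the divisibility assumption on n are illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* frames holds interleaved stereo samples L0 R0 L1 R1 ... (n a multiple of 4). */
+void split_stereo(const float32_t *frames, float32_t *left, float32_t *right, int n) {
+    for (int i = 0; i &lt; n; i += 4) {
+        float32x4x2_t v = vld2q_f32(frames + 2 * i); /* 8 floats -&gt; 4 L + 4 R */
+        vst1q_f32(left + i,  v.val[0]);
+        vst1q_f32(right + i, v.val[1]);
+    }
+}
+</pre>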
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_p8" type="checkbox"><label for="vld2_p8"><div>poly8x8x2_t <b><b>vld2_p8</b></b> (poly8_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
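+ <h4>Example</h4> <p>A minimal C usage sketch, not part of the ARM reference: the poly8 variant pairs naturally with the carry-less (polynomial) multiply. The helper name pmul_pairs is illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* GF(2)[x] demo: multiply even- and odd-indexed polynomial bytes pairwise. */
+poly8x8_t pmul_pairs(const poly8_t *src) {
+    poly8x8x2_t v = vld2_p8(src);       /* de-interleaves 16 poly8 values */
+    return vmul_p8(v.val[0], v.val[1]); /* carry-less multiply per lane   */
+}
+</pre>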
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_p8" type="checkbox"><label for="vld2q_p8"><div>poly8x16x2_t <b><b>vld2q_p8</b></b> (poly8_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
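+ <h4>Example</h4> <p>A minimal C usage sketch, not part of the ARM reference: the 128-bit poly8 form splits 32 interleaved bytes into two 16-byte planes. The helper name split_bytes is illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Split 32 interleaved bytes into two 16-byte planes. */
+void split_bytes(const poly8_t *src, poly8_t *even, poly8_t *odd) {
+    poly8x16x2_t v = vld2q_p8(src); /* de-interleaves 32 poly8 values */
+    vst1q_p8(even, v.val[0]);       /* even-indexed bytes */
+    vst1q_p8(odd,  v.val[1]);       /* odd-indexed bytes  */
+}
+</pre>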
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_p16" type="checkbox"><label for="vld2_p16"><div>poly16x4x2_t <b><b>vld2_p16</b></b> (poly16_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
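+ <h4>Example</h4> <p>A minimal C usage sketch, not part of the ARM reference: the 64-bit poly16 form de-interleaves four (a,b) pairs into planar form. The helper name deinterleave_p16 is illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* De-interleave four (a,b) poly16 pairs into planar arrays. */
+void deinterleave_p16(const poly16_t *src, poly16_t a[4], poly16_t b[4]) {
+    poly16x4x2_t v = vld2_p16(src); /* loads 8 halfwords, de-interleaved */
+    vst1_p16(a, v.val[0]);          /* even-indexed elements */
+    vst1_p16(b, v.val[1]);          /* odd-indexed elements  */
+}
+</pre>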
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_p16" type="checkbox"><label for="vld2q_p16"><div>poly16x8x2_t <b><b>vld2q_p16</b></b> (poly16_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
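+ <h4>Example</h4>
+ <p class="aml">A minimal C usage sketch (the helper name and data layout are illustrative, assuming only &lt;arm_neon.h&gt;): vld2q_p16 de-interleaves sixteen contiguous poly16_t values into two eight-lane vectors.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void split_pairs(const poly16_t in[16], poly16_t even[8], poly16_t odd[8]) {
+    // in = {a0,b0,a1,b1,...}; val[0] gathers the a-lanes, val[1] the b-lanes.
+    poly16x8x2_t r = vld2q_p16(in);
+    vst1q_p16(even, r.val[0]);  // in[0], in[2], ..., in[14]
+    vst1q_p16(odd,  r.val[1]);  // in[1], in[3], ..., in[15]
+}</pre>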
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_s64" type="checkbox"><label for="vld2_s64"><div>int64x1x2_t <b><b>vld2_s64</b></b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
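+ <h4>Example</h4>
+ <p class="aml">A minimal C sketch (the helper name is illustrative): for one-lane 64-bit vectors the de-interleave degenerates to splitting a consecutive pair.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int64x1x2_t load_pair(const int64_t p[2]) {
+    // val[0] = {p[0]}, val[1] = {p[1]}
+    return vld2_s64(p);
+}</pre>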
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_u64" type="checkbox"><label for="vld2_u64"><div>uint64x1x2_t <b><b>vld2_u64</b></b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_p64" type="checkbox"><label for="vld2_p64"><div>poly64x1x2_t <b><b>vld2_p64</b></b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_s64" type="checkbox"><label for="vld2q_s64"><div>int64x2x2_t <b><b>vld2q_s64</b></b> (int64_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
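+ <h4>Example</h4>
+ <p class="aml">A minimal C sketch (the guard and helper name are illustrative): this intrinsic is A64-only, so gate its use on the target.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+#if defined(__aarch64__)
+int64_t sum_even_elements(const int64_t p[4]) {
+    // val[0] = {p[0], p[2]}, val[1] = {p[1], p[3]}
+    int64x2x2_t r = vld2q_s64(p);
+    return vgetq_lane_s64(r.val[0], 0) + vgetq_lane_s64(r.val[0], 1);
+}
+#endif</pre>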
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_u64" type="checkbox"><label for="vld2q_u64"><div>uint64x2x2_t <b><b>vld2q_u64</b></b> (uint64_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
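+ <h4>Example</h4>
+ <p class="aml">A minimal C sketch (the helper name is illustrative): vst2q_u64 re-interleaves, so a load/store round trip copies the four values unchanged.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+#if defined(__aarch64__)
+void copy_interleaved(const uint64_t src[4], uint64_t dst[4]) {
+    uint64x2x2_t r = vld2q_u64(src);  // de-interleave: {src[0],src[2]}, {src[1],src[3]}
+    vst2q_u64(dst, r);                // interleave back: dst[i] == src[i]
+}
+#endif</pre>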
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_p64" type="checkbox"><label for="vld2q_p64"><div>poly64x2x2_t <b><b>vld2q_p64</b></b> (poly64_t const * ptr)<span class="right">Load multiple 2-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 2-element structures to two registers. This instruction loads multiple 2-element structures from memory and writes the result to the two SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><div>float64x1x2_t <b>vld2_f64</b> (float64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div><article> <h4>Description</h4><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+ <h4>A64 Instruction</h4><pre>LD1 {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><div>float64x2x2_t <b>vld2q_f64</b> (float64_t const * ptr)<span class="right">Load single 2-element structure to one lane of two registers</span></div><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre>LD2 {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><div>int8x8x3_t <b>vld3_s8</b> (int8_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre>LD3 {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>int8x16x3_t <b>vld3q_s8</b> (int8_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre>LD3 {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>int16x4x3_t <b>vld3_s16</b> (int16_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre>LD3 {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>int16x8x3_t <b>vld3q_s16</b> (int16_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre>LD3 {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>int32x2x3_t <b>vld3_s32</b> (int32_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre>LD3 {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
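+ <h4>Example</h4> <p>A minimal C sketch of calling this intrinsic; the helper name and buffer layout are illustrative assumptions, not part of the ARM reference.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* De-interleave six packed int32 values (x0,y0,z0,x1,y1,z1) into
+   val[0] = {x0,x1}, val[1] = {y0,y1}, val[2] = {z0,z1}. */
+int32x2x3_t demo_vld3_s32(const int32_t buf[6]) {
+    return vld3_s32(buf);
+}</pre>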
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_s32" type="checkbox"><label for="vld3q_s32"><div>int32x4x3_t <b><b>vld3q_s32</b></b> (int32_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
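+ <h4>Example</h4> <p>A hypothetical C sketch (names assumed for illustration): loading twelve packed int32 values as four de-interleaved (x,y,z) triples.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* val[0] = {buf[0],buf[3],buf[6],buf[9]}; val[1] and val[2] likewise. */
+int32x4x3_t demo_vld3q_s32(const int32_t buf[12]) {
+    return vld3q_s32(buf);
+}</pre>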
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_u8" type="checkbox"><label for="vld3_u8"><div>uint8x8x3_t <b><b>vld3_u8</b></b> (uint8_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
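+ <h4>Example</h4> <p>An illustrative C sketch, assuming an interleaved RGB byte stream; the helper and layout are not from the ARM reference.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* Split 8 RGB pixels (24 bytes: R0 G0 B0 R1 G1 B1 ...) into
+   separate channel vectors: val[0]=R, val[1]=G, val[2]=B. */
+uint8x8x3_t demo_vld3_u8(const uint8_t rgb[24]) {
+    return vld3_u8(rgb);
+}</pre>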
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_u8" type="checkbox"><label for="vld3q_u8"><div>uint8x16x3_t <b><b>vld3q_u8</b></b> (uint8_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
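+ <h4>Example</h4> <p>A brief, hedged C sketch: the 128-bit variant de-interleaves 16 RGB pixels per call (function name and buffer are assumptions for illustration).</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* 48 interleaved bytes in, three 16-lane channel vectors out. */
+uint8x16x3_t demo_vld3q_u8(const uint8_t rgb[48]) {
+    return vld3q_u8(rgb);
+}</pre>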
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_u16" type="checkbox"><label for="vld3_u16"><div>uint16x4x3_t <b><b>vld3_u16</b></b> (uint16_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
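+ <h4>Example</h4> <p>A minimal C sketch under assumed names: loading four 16-bit (x,y,z) triples into de-interleaved vectors.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* val[0] = {buf[0],buf[3],buf[6],buf[9]}; val[1], val[2] analogous. */
+uint16x4x3_t demo_vld3_u16(const uint16_t buf[12]) {
+    return vld3_u16(buf);
+}</pre>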
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_u16" type="checkbox"><label for="vld3q_u16"><div>uint16x8x3_t <b><b>vld3q_u16</b></b> (uint16_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
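+ <h4>Example</h4> <p>An illustrative C sketch (helper name assumed): the 128-bit variant consumes eight 16-bit triples at once.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* 24 interleaved uint16 values in, three 8-lane vectors out. */
+uint16x8x3_t demo_vld3q_u16(const uint16_t buf[24]) {
+    return vld3q_u16(buf);
+}</pre>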
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_u32" type="checkbox"><label for="vld3_u32"><div>uint32x2x3_t <b><b>vld3_u32</b></b> (uint32_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
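+ <h4>Example</h4> <p>A minimal C usage sketch (not part of the ARM reference text; it assumes an AArch64 or NEON-enabled build and the helper intrinsic vget_lane_u32 from the same &lt;arm_neon.h&gt; header):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    // Two x/y/z triples stored interleaved in memory.
+    uint32_t buf[6] = {1, 2, 3, 4, 5, 6};
+    // vld3_u32 de-interleaves: val[0] = {1,4}, val[1] = {2,5}, val[2] = {3,6}.
+    uint32x2x3_t v = vld3_u32(buf);
+    printf("x = {%u, %u}\n", vget_lane_u32(v.val[0], 0), vget_lane_u32(v.val[0], 1));
+    printf("y = {%u, %u}\n", vget_lane_u32(v.val[1], 0), vget_lane_u32(v.val[1], 1));
+    printf("z = {%u, %u}\n", vget_lane_u32(v.val[2], 0), vget_lane_u32(v.val[2], 1));
+    return 0;
+}</pre>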
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_u32" type="checkbox"><label for="vld3q_u32"><div>uint32x4x3_t <b>vld3q_u32</b> (uint32_t const * ptr)<span class="right">Load multiple 3-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 3-element structures to three registers. This instruction loads multiple 3-element structures from memory and writes the result to the three SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_f16" type="checkbox"><label for="vld3_f16"><div>float16x4x3_t <b>vld3_f16</b> (float16_t const * ptr)<span class="right">Load multiple 3-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 3-element structures to three registers. This instruction loads multiple 3-element structures from memory and writes the result to the three SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_f16" type="checkbox"><label for="vld3q_f16"><div>float16x8x3_t <b>vld3q_f16</b> (float16_t const * ptr)<span class="right">Load multiple 3-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 3-element structures to three registers. This instruction loads multiple 3-element structures from memory and writes the result to the three SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_f32" type="checkbox"><label for="vld3_f32"><div>float32x2x3_t <b>vld3_f32</b> (float32_t const * ptr)<span class="right">Load multiple 3-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 3-element structures to three registers. This instruction loads multiple 3-element structures from memory and writes the result to the three SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_f32" type="checkbox"><label for="vld3q_f32"><div>float32x4x3_t <b>vld3q_f32</b> (float32_t const * ptr)<span class="right">Load multiple 3-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 3-element structures to three registers. This instruction loads multiple 3-element structures from memory and writes the result to the three SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
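+ <h4>Example</h4> <p>A hedged C sketch for the four-lane float form; the horizontal-add intrinsic vaddvq_f32 used here is an assumption from &lt;arm_neon.h&gt; and is A64-only:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    // Four x/y/z points stored interleaved in memory.
+    float pts[12] = {1, 0, 0,  0, 1, 0,  0, 0, 1,  1, 1, 1};
+    float32x4x3_t v = vld3q_f32(pts);  // val[0] = xs, val[1] = ys, val[2] = zs
+    float sx = vaddvq_f32(v.val[0]);   // horizontal sum of the four x lanes (A64-only)
+    float sy = vaddvq_f32(v.val[1]);
+    float sz = vaddvq_f32(v.val[2]);
+    printf("component sums: %.1f %.1f %.1f\n", sx, sy, sz);  // prints 2.0 2.0 2.0
+    return 0;
+}</pre>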
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_p8" type="checkbox"><label for="vld3_p8"><div>poly8x8x3_t <b>vld3_p8</b> (poly8_t const * ptr)<span class="right">Load multiple 3-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 3-element structures to three registers. This instruction loads multiple 3-element structures from memory and writes the result to the three SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_p8" type="checkbox"><label for="vld3q_p8"><div>poly8x16x3_t <b><b>vld3q_p8</b></b> (poly8_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
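+<p>As a concrete illustration, the sketch below loads through this intrinsic from C; the function and buffer names are invented, and an AArch64 toolchain with &lt;arm_neon.h&gt; is assumed.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load 48 interleaved poly8 values (16 groups of 3) so that
+   r.val[0], r.val[1], r.val[2] each hold one 16-lane stream,
+   matching the Vt..Vt3 result mapping listed above. */
+poly8x16x3_t load_triples(const poly8_t buf[48])
+{
+    return vld3q_p8(buf);
+}
+</pre>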
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_p16" type="checkbox"><label for="vld3_p16"><div>poly16x4x3_t <b><b>vld3_p16</b></b> (poly16_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
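+<p>A short C sketch of the 64-bit form (names are illustrative; assumes &lt;arm_neon.h&gt; on AArch64):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 12 interleaved poly16 values become three 4-lane vectors,
+   mapped to result.val[0..2] as listed above. */
+poly16x4x3_t load_triples_p16(const poly16_t buf[12])
+{
+    return vld3_p16(buf);
+}
+</pre>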
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_p16" type="checkbox"><label for="vld3q_p16"><div>poly16x8x3_t <b><b>vld3q_p16</b></b> (poly16_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
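+<p>The 128-bit form can be sketched the same way in C (hypothetical names; AArch64 with &lt;arm_neon.h&gt; assumed), here also copying the three result vectors out:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 24 interleaved poly16 values split into three 8-lane vectors. */
+void load_triples_p16q(const poly16_t buf[24], poly16x8_t out[3])
+{
+    poly16x8x3_t r = vld3q_p16(buf);
+    out[0] = r.val[0];
+    out[1] = r.val[1];
+    out[2] = r.val[2];
+}
+</pre>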
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_s64" type="checkbox"><label for="vld3_s64"><div>int64x1x3_t <b><b>vld3_s64</b></b> (int64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
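+<p>Since each result vector has a single 64-bit lane, this variant simply loads three consecutive values; a minimal C sketch (illustrative names, &lt;arm_neon.h&gt; on AArch64 assumed):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* buf[0], buf[1], buf[2] land in r.val[0], r.val[1], r.val[2]. */
+int64x1x3_t load3_s64(const int64_t buf[3])
+{
+    return vld3_s64(buf);
+}
+</pre>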
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_u64" type="checkbox"><label for="vld3_u64"><div>uint64x1x3_t <b><b>vld3_u64</b></b> (uint64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
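+<p>The unsigned counterpart is used identically; a brief C sketch under the same assumptions (&lt;arm_neon.h&gt;, AArch64, invented names):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Three consecutive uint64 values, one per single-lane vector. */
+uint64x1x3_t load3_u64(const uint64_t buf[3])
+{
+    return vld3_u64(buf);
+}
+</pre>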
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_p64" type="checkbox"><label for="vld3_p64"><div>poly64x1x3_t <b><b>vld3_p64</b></b> (poly64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
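+<p>A C sketch for the poly64 variant (names are illustrative; besides &lt;arm_neon.h&gt; on AArch64, availability of the poly64 types may depend on the toolchain's crypto-extension support):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Three consecutive poly64 values into val[0..2]; no de-interleaving
+   is needed for single-lane vectors. */
+poly64x1x3_t load3_p64(const poly64_t buf[3])
+{
+    return vld3_p64(buf);
+}
+</pre>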
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_s64" type="checkbox"><label for="vld3q_s64"><div>int64x2x3_t <b><b>vld3q_s64</b></b> (int64_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_u64" type="checkbox"><label for="vld3q_u64"><div>uint64x2x3_t <b><b>vld3q_u64</b></b> (uint64_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_p64" type="checkbox"><label for="vld3q_p64"><div>poly64x2x3_t <b><b>vld3q_p64</b></b> (poly64_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3_f64" type="checkbox"><label for="vld3_f64"><div>float64x1x3_t <b><b>vld3_f64</b></b> (float64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_f64" type="checkbox"><label for="vld3q_f64"><div>float64x2x3_t <b><b>vld3q_f64</b></b> (float64_t const * ptr)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_s8" type="checkbox"><label for="vld4_s8"><div>int8x8x4_t <b><b>vld4_s8</b></b> (int8_t const * ptr)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_s8" type="checkbox"><label for="vld4q_s8"><div>int8x16x4_t <b><b>vld4q_s8</b></b> (int8_t const * ptr)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_s16" type="checkbox"><label for="vld4_s16"><div>int16x4x4_t <b><b>vld4_s16</b></b> (int16_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
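+<h4>Example</h4><p>A minimal usage sketch, assuming &lt;arm_neon.h&gt; is available and <code>samples</code> points to at least 16 valid int16_t values; the helper name is illustrative, not part of the ARM reference:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* De-interleave four 4-channel int16 samples; s.val[k] holds channel k. */
+int16x4_t load_channel0(const int16_t *samples) {
+    int16x4x4_t s = vld4_s16(samples);  /* reads 16 consecutive elements */
+    return s.val[0];
+}
+</pre>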
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_s16" type="checkbox"><label for="vld4q_s16"><div>int16x8x4_t <b><b>vld4q_s16</b></b> (int16_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
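+<h4>Example</h4><p>A minimal usage sketch, assuming &lt;arm_neon.h&gt; is available and <code>samples</code> points to at least 32 valid int16_t values; the helper name is illustrative, not part of the ARM reference:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Element-wise sum of channels 0 and 1 across eight 4-channel samples. */
+int16x8_t sum_ch0_ch1(const int16_t *samples) {
+    int16x8x4_t s = vld4q_s16(samples);  /* reads 32 consecutive elements */
+    return vaddq_s16(s.val[0], s.val[1]);
+}
+</pre>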
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_s32" type="checkbox"><label for="vld4_s32"><div>int32x2x4_t <b><b>vld4_s32</b></b> (int32_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
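+<h4>Example</h4><p>A minimal usage sketch, assuming &lt;arm_neon.h&gt; is available and <code>xyzw</code> points to at least 8 valid int32_t values; the helper name is illustrative, not part of the ARM reference:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Gather the y components of two interleaved xyzw int32 vertices. */
+int32x2_t gather_y(const int32_t *xyzw) {
+    int32x2x4_t v = vld4_s32(xyzw);  /* reads 8 consecutive elements */
+    return v.val[1];
+}
+</pre>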
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_s32" type="checkbox"><label for="vld4q_s32"><div>int32x4x4_t <b><b>vld4q_s32</b></b> (int32_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
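+<h4>Example</h4><p>A minimal usage sketch, assuming &lt;arm_neon.h&gt; is available and <code>xyzw</code> points to at least 16 valid int32_t values; the helper name is illustrative, not part of the ARM reference:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Compute x*x + y*y for four interleaved xyzw int32 vertices. */
+int32x4_t xy_sq_norm(const int32_t *xyzw) {
+    int32x4x4_t v = vld4q_s32(xyzw);              /* reads 16 elements */
+    int32x4_t xx = vmulq_s32(v.val[0], v.val[0]);
+    return vmlaq_s32(xx, v.val[1], v.val[1]);     /* xx + y*y */
+}
+</pre>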
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_u8" type="checkbox"><label for="vld4_u8"><div>uint8x8x4_t <b><b>vld4_u8</b></b> (uint8_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
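+<h4>Example</h4><p>A minimal usage sketch, assuming &lt;arm_neon.h&gt; is available and <code>rgba</code> points to at least 32 valid bytes; the helper name is illustrative, not part of the ARM reference:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Extract the alpha channel of eight interleaved RGBA pixels. */
+uint8x8_t alpha_of_8px(const uint8_t *rgba) {
+    uint8x8x4_t px = vld4_u8(rgba);  /* reads 32 bytes, de-interleaved */
+    return px.val[3];                /* val[0]=R, val[1]=G, val[2]=B, val[3]=A */
+}
+</pre>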
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_u8" type="checkbox"><label for="vld4q_u8"><div>uint8x16x4_t <b><b>vld4q_u8</b></b> (uint8_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_u16" type="checkbox"><label for="vld4_u16"><div>uint16x4x4_t <b><b>vld4_u16</b></b> (uint16_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
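+ <h4>Usage sketch</h4> <p>Illustrative C usage, not part of the ARM reference: a minimal sketch assuming an AArch64 compiler providing arm_neon.h; the helper and variable names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* De-interleave four 4-field uint16_t records: memory holds
+   a0,b0,c0,d0, a1,b1,c1,d1, ... and each val[k] collects one field. */
+void split_u16x4(const uint16_t in[16],
+                 uint16x4_t *a, uint16x4_t *b,
+                 uint16x4_t *c, uint16x4_t *d) {
+    uint16x4x4_t v = vld4_u16(in);   /* one LD4 {Vt.4H - Vt4.4H},[Xn] */
+    *a = v.val[0];   /* a0,a1,a2,a3 */
+    *b = v.val[1];
+    *c = v.val[2];
+    *d = v.val[3];
+}
+</pre>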
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_u16" type="checkbox"><label for="vld4q_u16"><div>uint16x8x4_t <b><b>vld4q_u16</b></b> (uint16_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
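+ <h4>Usage sketch</h4> <p>Illustrative C usage, not part of the ARM reference: the q-form de-interleaves eight records per load. The reduction intrinsics used here (vpaddlq_u16, vaddvq_u32) are standard arm_neon.h calls; vaddvq_u32 is A64-only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sum field 0 across eight interleaved 4-field uint16_t records.
+   vpaddlq_u16 widens pairwise to u32 before the horizontal add. */
+uint32_t sum_field0(const uint16_t rec[32]) {
+    uint16x8x4_t v = vld4q_u16(rec);          /* de-interleave 8 records */
+    uint32x4_t wide = vpaddlq_u16(v.val[0]);  /* avoid 16-bit overflow */
+    return vaddvq_u32(wide);
+}
+</pre>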
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_u32" type="checkbox"><label for="vld4_u32"><div>uint32x2x4_t <b><b>vld4_u32</b></b> (uint32_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
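+ <h4>Usage sketch</h4> <p>Illustrative C usage, not part of the ARM reference: the 64-bit form loads two records, so each val[k] holds one field from each record. Names below are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load two 4-field uint32_t records; val[k] holds {rec0.k, rec1.k}.
+   Return field 2 of the second record. */
+uint32_t field2_of_second(const uint32_t rec[8]) {
+    uint32x2x4_t v = vld4_u32(rec);
+    return vget_lane_u32(v.val[2], 1);
+}
+</pre>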
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_u32" type="checkbox"><label for="vld4q_u32"><div>uint32x4x4_t <b><b>vld4q_u32</b></b> (uint32_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
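+ <h4>Usage sketch</h4> <p>Illustrative C usage, not part of the ARM reference: a common pattern is converting interleaved (array-of-structures) data to planar (structure-of-arrays) form with one LD4 plus plain ST1 stores.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Convert four interleaved 4-field uint32_t records to planar layout. */
+void interleaved_to_planar(const uint32_t in[16], uint32_t out[16]) {
+    uint32x4x4_t v = vld4q_u32(in);
+    vst1q_u32(out +  0, v.val[0]);
+    vst1q_u32(out +  4, v.val[1]);
+    vst1q_u32(out +  8, v.val[2]);
+    vst1q_u32(out + 12, v.val[3]);
+}
+</pre>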
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_f16" type="checkbox"><label for="vld4_f16"><div>float16x4x4_t <b><b>vld4_f16</b></b> (float16_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
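+ <h4>Usage sketch</h4> <p>Illustrative C usage, not part of the ARM reference: this assumes the toolchain exposes the float16_t storage type (for example, compiling with -march=armv8.2-a+fp16 on GCC/Clang); the helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Swap fields 0 and 1 of four interleaved float16_t records in place. */
+void swap_fields01(float16_t buf[16]) {
+    float16x4x4_t v = vld4_f16(buf);   /* de-interleave 4 records */
+    float16x4_t tmp = v.val[0];
+    v.val[0] = v.val[1];
+    v.val[1] = tmp;
+    vst4_f16(buf, v);                  /* re-interleave, fields swapped */
+}
+</pre>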
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_f16" type="checkbox"><label for="vld4q_f16"><div>float16x8x4_t <b><b>vld4q_f16</b></b> (float16_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
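+ <h4>Usage sketch</h4> <p>Illustrative C usage, not part of the ARM reference: a strided loop over a larger buffer, again assuming float16_t support in the toolchain. Names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Copy field 3 of every record into a planar array, eight records
+   per iteration; count must be a multiple of 8. */
+void extract_field3(const float16_t *rec, float16_t *out, size_t count) {
+    for (size_t i = 0; i + 8 &lt;= count; i += 8) {
+        float16x8x4_t v = vld4q_f16(rec + 4 * i);  /* 8 records = 32 halves */
+        vst1q_f16(out + i, v.val[3]);
+    }
+}
+</pre>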
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_f32" type="checkbox"><label for="vld4_f32"><div>float32x2x4_t <b><b>vld4_f32</b></b> (float32_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
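+ <h4>Example</h4> <p>A minimal C sketch, assuming an AArch64 toolchain with &lt;arm_neon.h&gt; and the lane mapping shown under Results; the buffer and its contents are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Illustrative data: two interleaved (x, y, z, w) tuples. */
+    float32_t src[8] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
+    /* De-interleave: val[0] = {0, 4}, val[1] = {1, 5}, val[2] = {2, 6}, val[3] = {3, 7}. */
+    float32x2x4_t v = vld4_f32(src);
+    printf("%f %f\n", (double)vget_lane_f32(v.val[0], 0),
+                      (double)vget_lane_f32(v.val[3], 1));  /* 0.0 and 7.0 */
+    return 0;
+}</pre>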
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_f32" type="checkbox"><label for="vld4q_f32"><div>float32x4x4_t <b><b>vld4q_f32</b></b> (float32_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
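+ <h4>Example</h4> <p>A short C sketch, assuming an AArch64 target (vaddvq_f32 is A64-only): de-interleaving four RGBA pixels into per-channel vectors; the pixel values are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Four RGBA pixels, interleaved in memory (illustrative values). */
+    float32_t rgba[16] = {1, 0, 0, 1,  0, 1, 0, 1,  0, 0, 1, 1,  1, 1, 1, 1};
+    float32x4x4_t px = vld4q_f32(rgba);
+    /* px.val[0] now holds the four R values; sum them across lanes. */
+    printf("sum of R: %f\n", (double)vaddvq_f32(px.val[0]));  /* 2.0 */
+    return 0;
+}</pre>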
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_p8" type="checkbox"><label for="vld4_p8"><div>poly8x8x4_t <b><b>vld4_p8</b></b> (poly8_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
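+ <h4>Example</h4> <p>A hedged C sketch, assuming &lt;arm_neon.h&gt;; the byte pattern is made up to show which source index feeds which lane.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly8_t bytes[32];
+    for (int i = 0; i &lt; 32; i++) bytes[i] = (poly8_t)i;
+    poly8x8x4_t v = vld4_p8(bytes);
+    /* Lane j of val[k] comes from bytes[4*j + k]. */
+    printf("%u\n", (unsigned)vget_lane_p8(v.val[1], 2));  /* bytes[9] = 9 */
+    return 0;
+}</pre>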
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_p8" type="checkbox"><label for="vld4q_p8"><div>poly8x16x4_t <b><b>vld4q_p8</b></b> (poly8_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
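+ <h4>Example</h4> <p>The quadword variant in a minimal C sketch (assumed toolchain with &lt;arm_neon.h&gt;): 64 bytes in, four 16-byte vectors out.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly8_t buf[64];
+    for (int i = 0; i &lt; 64; i++) buf[i] = (poly8_t)i;
+    poly8x16x4_t v = vld4q_p8(buf);
+    /* Lane 15 of val[0] comes from buf[4*15 + 0] = buf[60]. */
+    printf("%u\n", (unsigned)vgetq_lane_p8(v.val[0], 15));  /* 60 */
+    return 0;
+}</pre>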
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_p16" type="checkbox"><label for="vld4_p16"><div>poly16x4x4_t <b><b>vld4_p16</b></b> (poly16_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
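+ <h4>Example</h4> <p>A minimal C sketch for the 16-bit polynomial variant, under the same assumptions as above; sixteen halfwords feed four 4-lane vectors.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly16_t buf[16];
+    for (int i = 0; i &lt; 16; i++) buf[i] = (poly16_t)i;
+    poly16x4x4_t v = vld4_p16(buf);
+    /* val[3] collects buf[3], buf[7], buf[11], buf[15]. */
+    printf("%u\n", (unsigned)vget_lane_p16(v.val[3], 0));  /* buf[3] = 3 */
+    return 0;
+}</pre>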
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_p16" type="checkbox"><label for="vld4q_p16"><div>poly16x8x4_t <b><b>vld4q_p16</b></b> (poly16_t const * ptr)<span class="right">Load multiple 4-element structures to four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple 4-element structures to four registers. This instruction loads multiple 4-element structures from memory and writes the result to the four SIMD&amp;FP registers, with de-interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
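+ <h4>Example</h4> <p>A hedged C sketch of the quadword 16-bit form (assumed &lt;arm_neon.h&gt; toolchain); 32 halfwords are split into four 8-lane vectors.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly16_t buf[32];
+    for (int i = 0; i &lt; 32; i++) buf[i] = (poly16_t)i;
+    poly16x8x4_t v = vld4q_p16(buf);
+    /* Lane 7 of val[2] comes from buf[4*7 + 2] = buf[30]. */
+    printf("%u\n", (unsigned)vgetq_lane_p16(v.val[2], 7));  /* 30 */
+    return 0;
+}</pre>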
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_s64" type="checkbox"><label for="vld4_s64"><div>int64x1x4_t <b><b>vld4_s64</b></b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_u64" type="checkbox"><label for="vld4_u64"><div>uint64x1x4_t <b><b>vld4_u64</b></b> (uint64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
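+ <h4>Usage example</h4> <p>A minimal, non-normative C sketch of how vld4_u64 might be called, assuming an AArch64 toolchain with arm_neon.h; the buffer contents and variable names are illustrative, not part of the ARM reference. Because each destination vector holds a single 64-bit element, there is no interleaving to undo: the four consecutive values simply land in val[0] through val[3].</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;   /* NEON intrinsics; assumes an AArch64 target */
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t buf[4] = {1, 2, 3, 4};    /* illustrative input */
+    uint64x1x4_t r = vld4_u64(buf);    /* buf[i] ends up in r.val[i] */
+    printf("%llu %llu\n",
+           (unsigned long long)vget_lane_u64(r.val[0], 0),   /* 1 */
+           (unsigned long long)vget_lane_u64(r.val[3], 0));  /* 4 */
+    return 0;
+}</pre>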
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_p64" type="checkbox"><label for="vld4_p64"><div>poly64x1x4_t <b><b>vld4_p64</b></b> (poly64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_s64" type="checkbox"><label for="vld4q_s64"><div>int64x2x4_t <b><b>vld4q_s64</b></b> (int64_t const * ptr)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
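+ <h4>Usage example</h4> <p>A minimal, non-normative C sketch of vld4q_s64 (A64 only), assuming arm_neon.h on an AArch64 toolchain; the buffer contents and names are illustrative. It shows the de-interleaving behaviour: eight consecutive int64_t values are split so that val[k] receives elements k and k+4.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* two interleaved 4-element structures: a0 b0 c0 d0, a1 b1 c1 d1 */
+    int64_t buf[8] = {0, 1, 2, 3, 10, 11, 12, 13};
+    int64x2x4_t r = vld4q_s64(buf);
+    /* r.val[0] = {0, 10}, r.val[1] = {1, 11}, ..., r.val[3] = {3, 13} */
+    printf("%lld %lld\n",
+           (long long)vgetq_lane_s64(r.val[0], 1),   /* 10 */
+           (long long)vgetq_lane_s64(r.val[3], 0));  /* 3  */
+    return 0;
+}</pre>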
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_u64" type="checkbox"><label for="vld4q_u64"><div>uint64x2x4_t <b><b>vld4q_u64</b></b> (uint64_t const * ptr)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
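+ <h4>Usage example</h4> <p>A minimal, non-normative C sketch pairing vld4q_u64 with its store counterpart vst4q_u64 (both A64 only), assuming arm_neon.h on an AArch64 toolchain; the data and names are illustrative. Loading de-interleaves the elements and the immediate store re-interleaves them, so the round trip leaves the data unchanged.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint64_t dst[8] = {0};
+    uint64x2x4_t r = vld4q_u64(src);   /* de-interleave into r.val[0..3] */
+    vst4q_u64(dst, r);                 /* re-interleave back to memory */
+    printf("%llu %llu\n",              /* dst matches src: 0 ... 7 */
+           (unsigned long long)dst[0],
+           (unsigned long long)dst[7]);
+    return 0;
+}</pre>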
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_p64" type="checkbox"><label for="vld4q_p64"><div>poly64x2x4_t <b><b>vld4q_p64</b></b> (poly64_t const * ptr)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_f64" type="checkbox"><label for="vld4_f64"><div>float64x1x4_t <b><b>vld4_f64</b></b> (float64_t const * ptr)<span class="right">Load one single-element structure to one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load one single-element structure to one lane of one register. This instruction loads a single-element structure from memory and writes the result to the specified lane of the SIMD&amp;FP register without affecting the other bits of the register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
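+ <h4>Usage example</h4> <p>A minimal, non-normative C sketch of vld4_f64 (A64 only), assuming arm_neon.h on an AArch64 toolchain; the values and names are illustrative. As with the other 64-bit-element forms, each destination vector holds one element, so four consecutive doubles are split across val[0] through val[3].</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    double buf[4] = {0.5, 1.5, 2.5, 3.5};   /* float64_t is double here */
+    float64x1x4_t r = vld4_f64(buf);        /* buf[i] ends up in r.val[i] */
+    printf("%f %f\n",
+           vget_lane_f64(r.val[0], 0),      /* 0.500000 */
+           vget_lane_f64(r.val[2], 0));     /* 2.500000 */
+    return 0;
+}</pre>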
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_f64" type="checkbox"><label for="vld4q_f64"><div>float64x2x4_t <b><b>vld4q_f64</b></b> (float64_t const * ptr)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
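+ <h4>Example</h4><p>A minimal Rust sketch, assuming a vld4q_f64 binding in core::arch::aarch64 on an AArch64 toolchain that performs the de-interleaving load of 8 consecutive doubles into the four result vectors; availability and exact behaviour on a given toolchain are assumptions, not part of this reference:</p>
+<pre class="codeblock">// Assumes src points at 8 readable f64 values laid out as
+// [a0, b0, c0, d0, a1, b1, c1, d1]; the load de-interleaves them
+// so that r.0 = [a0, a1], r.1 = [b0, b1], and so on.
+use core::arch::aarch64::{vld4q_f64, vst1q_f64};
+
+unsafe fn first_stream(src: *const f64, dst: *mut f64) {
+    let r = vld4q_f64(src);
+    vst1q_f64(dst, r.0); // store the a-stream: [a0, a1]
+}
+</pre>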
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_s8" type="checkbox"><label for="vld2_dup_s8"><div>int8x8x2_t <b><b>vld2_dup_s8</b></b> (int8_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
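+ <h4>Example</h4><p>A minimal Rust sketch of the replicate behaviour, assuming the vld2_dup_s8 and vst1_s8 bindings in core::arch::aarch64 are available on the toolchain in use:</p>
+<pre class="codeblock">// Reads two bytes once; r.0 becomes 8 copies of pair[0] and
+// r.1 becomes 8 copies of pair[1].
+use core::arch::aarch64::{vld2_dup_s8, vst1_s8};
+
+unsafe fn broadcast_pair(pair: *const i8, out: *mut i8) {
+    let r = vld2_dup_s8(pair);
+    vst1_s8(out, r.0); // out[0..8] all hold the first byte
+}
+</pre>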
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_s8" type="checkbox"><label for="vld2q_dup_s8"><div>int8x16x2_t <b><b>vld2q_dup_s8</b></b> (int8_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
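+ <h4>Example</h4><p>The q form fills full 128-bit registers, so each byte is replicated into 16 lanes rather than 8. A minimal Rust sketch, assuming the vld2q_dup_s8 and vst1q_s8 bindings in core::arch::aarch64:</p>
+<pre class="codeblock">use core::arch::aarch64::{vld2q_dup_s8, vst1q_s8};
+
+unsafe fn broadcast_pair_wide(pair: *const i8, out: *mut i8) {
+    let r = vld2q_dup_s8(pair);
+    vst1q_s8(out, r.1); // out[0..16] all hold the second byte
+}
+</pre>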
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_s16" type="checkbox"><label for="vld2_dup_s16"><div>int16x4x2_t <b><b>vld2_dup_s16</b></b> (int16_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
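+ <h4>Example</h4><p>Because LD2R replicates a constant pair across all lanes, it combines naturally with per-lane arithmetic. A minimal Rust sketch, assuming the listed bindings in core::arch::aarch64:</p>
+<pre class="codeblock">// Applies y = x * scale + bias to four i16 lanes, with the
+// (scale, bias) pair stored interleaved at params.
+use core::arch::aarch64::{vadd_s16, vld1_s16, vld2_dup_s16, vmul_s16, vst1_s16};
+
+unsafe fn scale_bias(data: *const i16, params: *const i16, out: *mut i16) {
+    let p = vld2_dup_s16(params); // p.0: 4 copies of scale, p.1: 4 copies of bias
+    let x = vld1_s16(data);
+    vst1_s16(out, vadd_s16(vmul_s16(x, p.0), p.1));
+}
+</pre>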
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_s16" type="checkbox"><label for="vld2q_dup_s16"><div>int16x8x2_t <b><b>vld2q_dup_s16</b></b> (int16_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
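+ <h4>Example</h4><p>The replicated pair only needs to be loaded once, so it can be hoisted out of a loop over packed data. A minimal Rust sketch, assuming the listed bindings in core::arch::aarch64:</p>
+<pre class="codeblock">use core::arch::aarch64::{vld1q_s16, vld2q_dup_s16, vqaddq_s16, vst1q_s16};
+
+// Saturating-adds pair[0] to every element of buf, 8 lanes at a time
+// (any tail shorter than 8 elements is left untouched here).
+unsafe fn add_first_sat(buf: *mut i16, len: usize, pair: *const i16) {
+    let p = vld2q_dup_s16(pair); // p.0 holds 8 copies of pair[0]
+    for chunk in 0..len / 8 {
+        let at = buf.add(chunk * 8);
+        vst1q_s16(at, vqaddq_s16(vld1q_s16(at), p.0));
+    }
+}
+</pre>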
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_s32" type="checkbox"><label for="vld2_dup_s32"><div>int32x2x2_t <b><b>vld2_dup_s32</b></b> (int32_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
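+ <h4>Example</h4><p>A minimal Rust sketch that splits an interleaved pair into two replicated vectors, assuming the vld2_dup_s32 and vst1_s32 bindings in core::arch::aarch64:</p>
+<pre class="codeblock">use core::arch::aarch64::{vld2_dup_s32, vst1_s32};
+
+unsafe fn splat_xy(xy: *const i32, xs: *mut i32, ys: *mut i32) {
+    let r = vld2_dup_s32(xy); // r.0 = [x, x], r.1 = [y, y]
+    vst1_s32(xs, r.0);
+    vst1_s32(ys, r.1);
+}
+</pre>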
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_s32" type="checkbox"><label for="vld2q_dup_s32"><div>int32x4x2_t <b><b>vld2q_dup_s32</b></b> (int32_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
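+ <p>The replicate branch above is the path every vld2*_dup_* intrinsic on this page exercises. As a rough, non-normative C model of that branch only (a sketch: memcpy stands in for the Mem[] accessor, a plain byte array stands in for V[t], and selem is fixed at 2 as LD2R requires):</p>
+<pre class="codeblock">#include &lt;stdint.h&gt;
+#include &lt;string.h&gt;
+
+/* Hypothetical model of the "load and replicate to all elements" loop
+   for LD2R: selem = 2 structure elements, ebytes = esize DIV 8 bytes
+   each, lanes = datasize DIV esize lanes per destination register. */
+static void ld2r_model(uint8_t dst[2][16],     /* V[t] and V[t+1]       */
+                       const uint8_t *address, /* X[n] or SP            */
+                       int ebytes, int lanes)
+{
+    int offs = 0;                              /* offs = Zeros()        */
+    for (int s = 0; s &lt; 2; s++) {              /* for s = 0 to selem-1  */
+        /* V[t] = Replicate(element, datasize DIV esize) */
+        for (int lane = 0; lane &lt; lanes; lane++)
+            memcpy(&amp;dst[s][lane * ebytes], address + offs, ebytes);
+        offs += ebytes;                        /* offs = offs + ebytes  */
+    }
+}
+</pre>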
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_u8" type="checkbox"><label for="vld2_dup_u8"><div>uint8x8x2_t <b>vld2_dup_u8</b> (uint8_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
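+ <p>A minimal usage sketch (the data and checks are illustrative, not part of the reference): load one {a, b} byte pair and confirm that every lane of result.val[0] holds a and every lane of result.val[1] holds b.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;assert.h&gt;
+#include &lt;stdint.h&gt;
+
+int main(void)
+{
+    const uint8_t pair[2] = { 0x11, 0x22 };  /* one 2-element structure    */
+    uint8x8x2_t r = vld2_dup_u8(pair);       /* LD2R {Vt.8B - Vt2.8B},[Xn] */
+
+    uint8_t lanes0[8], lanes1[8];
+    vst1_u8(lanes0, r.val[0]);
+    vst1_u8(lanes1, r.val[1]);
+    for (int i = 0; i &lt; 8; i++) {
+        assert(lanes0[i] == 0x11);           /* pair[0] in every lane */
+        assert(lanes1[i] == 0x22);           /* pair[1] in every lane */
+    }
+    return 0;
+}
+</pre>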
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_u8" type="checkbox"><label for="vld2q_dup_u8"><div>uint8x16x2_t <b>vld2q_dup_u8</b> (uint8_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
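+ <p>The q-form differs from vld2_dup_u8 only in replication width, not in the memory access: both read exactly one 2-byte structure, then fill 16 lanes per register instead of 8 (datasize 128 rather than 64 in the pseudocode above). An illustrative comparison:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void)
+{
+    const uint8_t pair[2] = { 0xAB, 0xCD };
+
+    uint8x8x2_t  d = vld2_dup_u8(pair);   /* 2 bytes read, 8 lanes each  */
+    uint8x16x2_t q = vld2q_dup_u8(pair);  /* same 2 bytes, 16 lanes each */
+
+    /* The highest lane of each register holds the same broadcast values. */
+    printf("%d %d %d %d\n",
+           vget_lane_u8(d.val[0], 7),   vget_lane_u8(d.val[1], 7),
+           vgetq_lane_u8(q.val[0], 15), vgetq_lane_u8(q.val[1], 15));
+    return 0;
+}
+</pre>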
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_u16" type="checkbox"><label for="vld2_dup_u16"><div>uint16x4x2_t <b>vld2_dup_u16</b> (uint16_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
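+ <p>Where LD2R is unavailable, the same register contents can be produced with two scalar loads and per-element broadcasts; the intrinsic form needs only one memory access. A hypothetical fallback for comparison:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Sketch of an equivalent result built without LD2R: broadcast each
+   element of the 2-element structure separately with vdup_n_u16. */
+static uint16x4x2_t ld2_dup_u16_fallback(const uint16_t *ptr)
+{
+    uint16x4x2_t r;
+    r.val[0] = vdup_n_u16(ptr[0]);
+    r.val[1] = vdup_n_u16(ptr[1]);
+    return r;
+}
+</pre>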
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_u16" type="checkbox"><label for="vld2q_dup_u16"><div>uint16x8x2_t <b>vld2q_dup_u16</b> (uint16_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
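+ <p>A common reason to broadcast a structure rather than a single element is to keep a pair of related constants in step. Illustrative only (the function and weighting scheme are assumptions, not part of the reference): broadcast a {wa, wb} weight pair once, then mix two sample streams as out = a*wa + b*wb, wrapping modulo 2^16, with tail handling omitted.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+void weighted_mix_u16(uint16_t *out, const uint16_t *a, const uint16_t *b,
+                      const uint16_t weights[2], int n)
+{
+    uint16x8x2_t w = vld2q_dup_u16(weights);  /* val[0]=wa, val[1]=wb */
+    for (int i = 0; i + 8 &lt;= n; i += 8) {
+        uint16x8_t acc = vmulq_u16(vld1q_u16(a + i), w.val[0]);
+        acc = vmlaq_u16(acc, vld1q_u16(b + i), w.val[1]); /* acc += b*wb */
+        vst1q_u16(out + i, acc);
+    }
+}
+</pre>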
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_u32" type="checkbox"><label for="vld2_dup_u32"><div>uint32x2x2_t <b>vld2_dup_u32</b> (uint32_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
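+ <p>Another illustrative use (assumed, not from the reference): keep a {lo, hi} bound pair in memory as one 2-element structure and broadcast it once, so the same clamp applies to every lane of a stream.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Clamp n values (n even here; tail handling omitted) to [lo, hi]. */
+void clamp_u32(uint32_t *v, const uint32_t bounds[2], int n)
+{
+    uint32x2x2_t b = vld2_dup_u32(bounds); /* val[0]=lo, val[1]=hi lanes */
+    for (int i = 0; i + 2 &lt;= n; i += 2) {
+        uint32x2_t x = vld1_u32(v + i);
+        x = vmax_u32(x, b.val[0]);         /* x = max(x, lo) */
+        x = vmin_u32(x, b.val[1]);         /* x = min(x, hi) */
+        vst1_u32(v + i, x);
+    }
+}
+</pre>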
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_u32" type="checkbox"><label for="vld2q_dup_u32"><div>uint32x4x2_t <b>vld2q_dup_u32</b> (uint32_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_f16" type="checkbox"><label for="vld2_dup_f16"><div>float16x4x2_t <b>vld2_dup_f16</b> (float16_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_f16" type="checkbox"><label for="vld2q_dup_f16"><div>float16x8x2_t <b>vld2q_dup_f16</b> (float16_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_f32" type="checkbox"><label for="vld2_dup_f32"><div>float32x2x2_t <b>vld2_dup_f32</b> (float32_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_f32" type="checkbox"><label for="vld2q_dup_f32"><div>float32x4x2_t <b>vld2q_dup_f32</b> (float32_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_p8" type="checkbox"><label for="vld2_dup_p8"><div>poly8x8x2_t <b>vld2_dup_p8</b> (poly8_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_p8" type="checkbox"><label for="vld2q_dup_p8"><div>poly8x16x2_t <b>vld2q_dup_p8</b> (poly8_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_p16" type="checkbox"><label for="vld2_dup_p16"><div>poly16x4x2_t <b>vld2_dup_p16</b> (poly16_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
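+<h4>Usage example</h4> <p>A minimal C usage sketch (illustrative wrapper name, assuming an AArch64 compiler that provides &lt;arm_neon.h&gt;):</p> <pre>
+#include &lt;arm_neon.h&gt;
+
+// Load the pair {p[0], p[1]} once and broadcast it:
+// result.val[0] holds four copies of p[0],
+// result.val[1] holds four copies of p[1].
+poly16x4x2_t broadcast_p16_pair(const poly16_t *p) {
+    return vld2_dup_p16(p);
+}
+</pre>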
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_p16" type="checkbox"><label for="vld2q_dup_p16"><div>poly16x8x2_t <b>vld2q_dup_p16</b> (poly16_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
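+<h4>Usage example</h4> <p>A sketch of the q-form in C (illustrative wrapper, same toolchain assumptions as above):</p> <pre>
+#include &lt;arm_neon.h&gt;
+
+// Same broadcast as vld2_dup_p16, but each half of the result
+// is a full 128-bit vector: eight copies of each element.
+poly16x8x2_t broadcast_p16_pair_q(const poly16_t *p) {
+    return vld2q_dup_p16(p);
+}
+</pre>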
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_s64" type="checkbox"><label for="vld2_dup_s64"><div>int64x1x2_t <b>vld2_dup_s64</b> (int64_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
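+<h4>Usage example</h4> <p>A minimal C sketch (illustrative wrapper, assuming &lt;arm_neon.h&gt; is available):</p> <pre>
+#include &lt;arm_neon.h&gt;
+
+// With one 64-bit lane per vector, replication degenerates into
+// a plain de-interleaving load: val[0] = p[0], val[1] = p[1].
+int64x1x2_t load_s64_pair(const int64_t *p) {
+    return vld2_dup_s64(p);
+}
+</pre>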
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_u64" type="checkbox"><label for="vld2_dup_u64"><div>uint64x1x2_t <b>vld2_dup_u64</b> (uint64_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
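+<h4>Usage example</h4> <p>The unsigned counterpart, sketched under the same assumptions (wrapper name illustrative):</p> <pre>
+#include &lt;arm_neon.h&gt;
+
+// One 64-bit lane per vector: val[0] = p[0], val[1] = p[1].
+uint64x1x2_t load_u64_pair(const uint64_t *p) {
+    return vld2_dup_u64(p);
+}
+</pre>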
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_p64" type="checkbox"><label for="vld2_dup_p64"><div>poly64x1x2_t <b>vld2_dup_p64</b> (poly64_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
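+<h4>Usage example</h4> <p>A C sketch (illustrative wrapper); note that poly64 types and intrinsics may additionally require a crypto/AES target feature, depending on the toolchain, which is an assumption worth checking for your compiler:</p> <pre>
+#include &lt;arm_neon.h&gt;
+
+// One 64-bit polynomial lane per vector: val[0] = p[0], val[1] = p[1].
+poly64x1x2_t load_p64_pair(const poly64_t *p) {
+    return vld2_dup_p64(p);
+}
+</pre>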
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_s64" type="checkbox"><label for="vld2q_dup_s64"><div>int64x2x2_t <b>vld2q_dup_s64</b> (int64_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
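+<h4>Usage example</h4> <p>A sketch of this A64-only form in C (illustrative wrapper, assuming an AArch64 compiler with &lt;arm_neon.h&gt;):</p> <pre>
+#include &lt;arm_neon.h&gt;
+
+// Two 64-bit lanes per vector: each input value is duplicated
+// into both lanes of its result vector.
+int64x2x2_t broadcast_s64_pair(const int64_t *p) {
+    return vld2q_dup_s64(p);
+}
+</pre>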
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_u64" type="checkbox"><label for="vld2q_dup_u64"><div>uint64x2x2_t <b>vld2q_dup_u64</b> (uint64_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_p64" type="checkbox"><label for="vld2q_dup_p64"><div>poly64x2x2_t <b>vld2q_dup_p64</b> (poly64_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
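+ <h4>Example</h4> <p>A minimal C sketch of one way this intrinsic can be used, assuming GCC or Clang targeting AArch64 with arm_neon.h (poly64 vector support may additionally require the crypto/AES extension on some toolchains); the array name and values below are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Two consecutive 64-bit polynomial values in memory. */
+    poly64_t pair[2] = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
+
+    /* LD2R semantics: pair[0] fills both lanes of val[0],
+       pair[1] fills both lanes of val[1]. */
+    poly64x2x2_t r = vld2q_dup_p64(pair);
+
+    printf("%llx %llx\n",
+           (unsigned long long)vgetq_lane_p64(r.val[0], 1),
+           (unsigned long long)vgetq_lane_p64(r.val[1], 1));
+    return 0;
+}
+</pre>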
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2_dup_f64" type="checkbox"><label for="vld2_dup_f64"><div>float64x1x2_t <b>vld2_dup_f64</b> (float64_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
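+ <h4>Example</h4> <p>A minimal C sketch, assuming GCC or Clang targeting AArch64 with arm_neon.h; the values are illustrative. Because each float64x1_t has a single lane, the "replicate" step is trivial here: the intrinsic simply places one element of the structure in each register.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    double xy[2] = {1.5, -2.25};
+
+    /* val[0] holds xy[0], val[1] holds xy[1], one lane each. */
+    float64x1x2_t r = vld2_dup_f64(xy);
+
+    printf("%f %f\n",
+           vget_lane_f64(r.val[0], 0),
+           vget_lane_f64(r.val[1], 0));   /* 1.500000 -2.250000 */
+    return 0;
+}
+</pre>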
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_dup_f64" type="checkbox"><label for="vld2q_dup_f64"><div>float64x2x2_t <b>vld2q_dup_f64</b> (float64_t const * ptr)<span class="right">Load single 2-element structure and replicate to all lanes of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure and Replicate to all lanes of two registers. This instruction loads a 2-element structure from memory and replicates the structure to all the lanes of the two SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2r-load-single-2-element-structure-and-replicate-to-all-lanes-of-two-registers">LD2R</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
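+ <h4>Example</h4> <p>A minimal C sketch, assuming GCC or Clang targeting AArch64 with arm_neon.h; the data is illustrative. Broadcasting one {re, im} pair across both lanes of two registers is a typical use, e.g. before multiplying a complex scalar against a vector of complex numbers.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    double c[2] = {0.5, 2.0};          /* one {re, im} pair */
+
+    /* c[0] fills both lanes of val[0], c[1] both lanes of val[1]. */
+    float64x2x2_t r = vld2q_dup_f64(c);
+
+    printf("%f %f\n", vgetq_lane_f64(r.val[0], 0), vgetq_lane_f64(r.val[0], 1));
+    printf("%f %f\n", vgetq_lane_f64(r.val[1], 0), vgetq_lane_f64(r.val[1], 1));
+    return 0;
+}
+</pre>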
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_s8" type="checkbox"><label for="vld3_dup_s8"><div>int8x8x3_t <b>vld3_dup_s8</b> (int8_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
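+ <h4>Example</h4> <p>A minimal C sketch, assuming GCC or Clang with arm_neon.h (this variant is also available on ARMv7/A32); the triple is illustrative. One 3-byte structure is broadcast to all 8 lanes of three registers.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t rgb[3] = {10, -20, 30};
+
+    /* Every lane of r.val[k] equals rgb[k]. */
+    int8x8x3_t r = vld3_dup_s8(rgb);
+
+    printf("%d %d %d\n",
+           vget_lane_s8(r.val[0], 7),
+           vget_lane_s8(r.val[1], 7),
+           vget_lane_s8(r.val[2], 7));   /* 10 -20 30 */
+    return 0;
+}
+</pre>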
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_s8" type="checkbox"><label for="vld3q_dup_s8"><div>int8x16x3_t <b>vld3q_dup_s8</b> (int8_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
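+ <h4>Example</h4> <p>A short C sketch, assuming GCC or Clang with arm_neon.h; the buffer contents are illustrative. A replicated 3-byte structure pairs naturally with the deinterleaving loads and stores, e.g. to apply a per-channel bias to interleaved pixel data.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t bias[3] = {1, 2, 3};
+    int8x16x3_t b = vld3q_dup_s8(bias);    /* bias[k] in all 16 lanes of val[k] */
+
+    int8_t pixels[48];                     /* 16 interleaved {c0,c1,c2} triples */
+    for (int i = 0; i &lt; 48; i++) pixels[i] = (int8_t)i;
+
+    int8x16x3_t p = vld3q_s8(pixels);      /* deinterleave */
+    p.val[0] = vaddq_s8(p.val[0], b.val[0]);
+    p.val[1] = vaddq_s8(p.val[1], b.val[1]);
+    p.val[2] = vaddq_s8(p.val[2], b.val[2]);
+    vst3q_s8(pixels, p);                   /* re-interleave */
+
+    printf("%d %d %d\n", pixels[0], pixels[1], pixels[2]);   /* 1 3 5 */
+    return 0;
+}
+</pre>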
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_s16" type="checkbox"><label for="vld3_dup_s16"><div>int16x4x3_t <b>vld3_dup_s16</b> (int16_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
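+ <h4>Example</h4> <p>A minimal C sketch, assuming GCC or Clang with arm_neon.h; the coefficients are illustrative. One 3-element structure of 16-bit values is broadcast to all 4 lanes of three registers.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16_t coeff[3] = {100, 200, 300};
+
+    /* Every lane of r.val[k] equals coeff[k]. */
+    int16x4x3_t r = vld3_dup_s16(coeff);
+
+    printf("%d %d %d\n",
+           vget_lane_s16(r.val[0], 3),
+           vget_lane_s16(r.val[1], 3),
+           vget_lane_s16(r.val[2], 3));   /* 100 200 300 */
+    return 0;
+}
+</pre>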
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_s16" type="checkbox"><label for="vld3q_dup_s16"><div>int16x8x3_t <b>vld3q_dup_s16</b> (int16_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
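+<h4>Usage example</h4> <p>A minimal C sketch (editorial addition, not part of the ARM reference): it assumes an AArch64 toolchain where arm_neon.h provides this intrinsic; the demo function name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Load one 3-element structure {p[0], p[1], p[2]} and broadcast it.
+int16_t demo_vld3q_dup_s16(const int16_t *p) {
+    int16x8x3_t r = vld3q_dup_s16(p);    // r.val[k] holds p[k] in every lane
+    return vgetq_lane_s16(r.val[2], 7);  // == p[2]
+}
+</pre>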
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>int32x2x3_t <b>vld3_dup_s32</b> (int32_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
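+<h4>Usage example</h4> <p>A minimal C sketch (editorial addition, not part of the ARM reference; the demo function name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Broadcast one 3-element structure across 64-bit (2-lane) registers.
+int32_t demo_vld3_dup_s32(const int32_t *p) {
+    int32x2x3_t r = vld3_dup_s32(p);    // r.val[k] holds p[k] in both lanes
+    return vget_lane_s32(r.val[1], 1);  // == p[1]
+}
+</pre>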
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>int32x4x3_t <b>vld3q_dup_s32</b> (int32_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
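+<h4>Usage example</h4> <p>A minimal C sketch (editorial addition, not part of the ARM reference; the demo function name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Broadcast one 3-element structure across 128-bit (4-lane) registers.
+int32_t demo_vld3q_dup_s32(const int32_t *p) {
+    int32x4x3_t r = vld3q_dup_s32(p);   // r.val[k] holds p[k] in every lane
+    return vgetq_lane_s32(r.val[0], 3); // == p[0]
+}
+</pre>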
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>uint8x8x3_t <b>vld3_dup_u8</b> (uint8_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
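+<h4>Usage example</h4> <p>A minimal C sketch (editorial addition, not part of the ARM reference; the demo function name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Broadcast one 3-byte structure across 64-bit (8-lane) registers.
+uint8_t demo_vld3_dup_u8(const uint8_t *p) {
+    uint8x8x3_t r = vld3_dup_u8(p);    // r.val[k] holds p[k] in every lane
+    return vget_lane_u8(r.val[2], 5);  // == p[2]
+}
+</pre>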
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>uint8x16x3_t <b>vld3q_dup_u8</b> (uint8_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
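+<h4>Usage example</h4> <p>A minimal C sketch (editorial addition, not part of the ARM reference; the demo function name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Broadcast one 3-byte structure across 128-bit (16-lane) registers.
+uint8_t demo_vld3q_dup_u8(const uint8_t *p) {
+    uint8x16x3_t r = vld3q_dup_u8(p);   // r.val[k] holds p[k] in every lane
+    return vgetq_lane_u8(r.val[1], 15); // == p[1]
+}
+</pre>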
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>uint16x4x3_t <b>vld3_dup_u16</b> (uint16_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
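+<h4>Usage example</h4> <p>A minimal C sketch (editorial addition, not part of the ARM reference; the demo function name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Broadcast one 3-element structure across 64-bit (4-lane) registers.
+uint16_t demo_vld3_dup_u16(const uint16_t *p) {
+    uint16x4x3_t r = vld3_dup_u16(p);   // r.val[k] holds p[k] in every lane
+    return vget_lane_u16(r.val[0], 3);  // == p[0]
+}
+</pre>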
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>uint16x8x3_t <b>vld3q_dup_u16</b> (uint16_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
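+ <h4>Example</h4> <p>A minimal sketch, assuming a C toolchain that provides &lt;arm_neon.h&gt;; the helper name is illustrative. The intrinsic reads one 3-element structure and broadcasts each element across all eight lanes of one result vector:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: broadcast an {a, b, c} triple of uint16_t. */
+uint16x8x3_t splat_triple_u16(const uint16_t abc[3])
+{
+    uint16x8x3_t v = vld3q_dup_u16(abc);
+    /* v.val[0] = 8 copies of abc[0], v.val[1] = 8 copies of abc[1],
+       v.val[2] = 8 copies of abc[2] */
+    return v;
+}</pre>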
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_u32" type="checkbox"><label for="vld3_dup_u32"><div>uint32x2x3_t <b>vld3_dup_u32</b> (uint32_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
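+ <h4>Example</h4> <p>A minimal sketch, assuming &lt;arm_neon.h&gt;; the helper name is illustrative. The 64-bit form broadcasts each element of the triple into both lanes of one result vector:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: duplicate an {x, y, z} triple of uint32_t. */
+uint32x2x3_t splat_triple_u32(const uint32_t xyz[3])
+{
+    uint32x2x3_t v = vld3_dup_u32(xyz);
+    /* v.val[0] = {xyz[0], xyz[0]}, v.val[1] = {xyz[1], xyz[1]},
+       v.val[2] = {xyz[2], xyz[2]} */
+    return v;
+}</pre>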
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_u32" type="checkbox"><label for="vld3q_dup_u32"><div>uint32x4x3_t <b>vld3q_dup_u32</b> (uint32_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
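+ <h4>Example</h4> <p>A sketch of one common pattern, assuming &lt;arm_neon.h&gt;; the function and variable names are illustrative. The replicated reference triple is compared lane-wise against deinterleaved packed data:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative: compare four packed {r, g, b} uint32_t pixels
+   against one reference colour, returning a per-lane mask. */
+uint32x4_t match_rgb(const uint32_t *pixels, const uint32_t ref[3])
+{
+    uint32x4x3_t p = vld3q_u32(pixels);    /* deinterleave 4 pixels   */
+    uint32x4x3_t r = vld3q_dup_u32(ref);   /* broadcast the reference */
+    uint32x4_t m = vceqq_u32(p.val[0], r.val[0]);
+    m = vandq_u32(m, vceqq_u32(p.val[1], r.val[1]));
+    m = vandq_u32(m, vceqq_u32(p.val[2], r.val[2]));
+    return m;                              /* all-ones where equal    */
+}</pre>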
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_f16" type="checkbox"><label for="vld3_dup_f16"><div>float16x4x3_t <b>vld3_dup_f16</b> (float16_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
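+ <h4>Example</h4> <p>A minimal sketch; it assumes &lt;arm_neon.h&gt; and a target on which the toolchain defines float16_t (half-precision storage support), and the helper name is illustrative:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: broadcast three half-precision coefficients
+   into all four lanes of each result vector. */
+float16x4x3_t splat_coeffs_f16(const float16_t k[3])
+{
+    return vld3_dup_f16(k);   /* val[i] = 4 copies of k[i] */
+}</pre>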
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_f16" type="checkbox"><label for="vld3q_dup_f16"><div>float16x8x3_t <b>vld3q_dup_f16</b> (float16_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
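+ <h4>Example</h4> <p>A minimal sketch of the 128-bit form, assuming &lt;arm_neon.h&gt; and a toolchain that defines float16_t; the helper name is illustrative:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative helper: broadcast three half-precision coefficients
+   into all eight lanes of each result vector. */
+float16x8x3_t splat_coeffs_f16q(const float16_t k[3])
+{
+    return vld3q_dup_f16(k);   /* val[i] = 8 copies of k[i] */
+}</pre>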
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_f32" type="checkbox"><label for="vld3_dup_f32"><div>float32x2x3_t <b>vld3_dup_f32</b> (float32_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
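+ <h4>Example</h4> <p>A sketch of one common pattern, assuming &lt;arm_neon.h&gt;; the names and pixel layout are illustrative. The broadcast gains multiply deinterleaved channel data:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative: scale two packed {r, g, b} float pixels in place
+   by per-channel gains. */
+void scale_rgb2(float *pix, const float gain[3])
+{
+    float32x2x3_t p = vld3_f32(pix);       /* deinterleave 2 pixels */
+    float32x2x3_t g = vld3_dup_f32(gain);  /* broadcast the gains   */
+    p.val[0] = vmul_f32(p.val[0], g.val[0]);
+    p.val[1] = vmul_f32(p.val[1], g.val[1]);
+    p.val[2] = vmul_f32(p.val[2], g.val[2]);
+    vst3_f32(pix, p);                      /* re-interleave + store */
+}</pre>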
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_f32" type="checkbox"><label for="vld3q_dup_f32"><div>float32x4x3_t <b>vld3q_dup_f32</b> (float32_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_p8" type="checkbox"><label for="vld3_dup_p8"><div>poly8x8x3_t <b>vld3_dup_p8</b> (poly8_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_p8" type="checkbox"><label for="vld3q_dup_p8"><div>poly8x16x3_t <b>vld3q_dup_p8</b> (poly8_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_p16" type="checkbox"><label for="vld3_dup_p16"><div>poly16x4x3_t <b>vld3_dup_p16</b> (poly16_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_p16" type="checkbox"><label for="vld3q_dup_p16"><div>poly16x8x3_t <b>vld3q_dup_p16</b> (poly16_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_s64" type="checkbox"><label for="vld3_dup_s64"><div>int64x1x3_t <b>vld3_dup_s64</b> (int64_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_u64" type="checkbox"><label for="vld3_dup_u64"><div>uint64x1x3_t <b>vld3_dup_u64</b> (uint64_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_p64" type="checkbox"><label for="vld3_dup_p64"><div>poly64x1x3_t <b>vld3_dup_p64</b> (poly64_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
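+ <h4>Example</h4> <p>An illustrative C sketch, not part of the Arm reference: it assumes an AArch64 target whose toolchain exposes the poly64 types in arm_neon.h, and the helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load the 3-element structure {1, 2, 3} and replicate it: each
+   single-lane vector r.val[i] then holds src[i]. */
+poly64x1x3_t load_poly_triple(void) {
+    const poly64_t src[3] = {1, 2, 3};
+    return vld3_dup_p64(src);
+}
+</pre>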
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_s64" type="checkbox"><label for="vld3q_dup_s64"><div>int64x2x3_t <b>vld3q_dup_s64</b> (int64_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
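+ <h4>Example</h4> <p>An illustrative C sketch, not part of the Arm reference: assuming an AArch64 target with arm_neon.h, it checks that the replicated lanes match the source structure (function name hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Both lanes of r.val[i] receive src[i], so lane 0 and lane 1 agree. */
+int replicates_ok(const int64_t src[3]) {
+    int64x2x3_t r = vld3q_dup_s64(src);
+    return vgetq_lane_s64(r.val[1], 0) == src[1]
+        &amp;&amp; vgetq_lane_s64(r.val[1], 1) == src[1];
+}
+</pre>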
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_u64" type="checkbox"><label for="vld3q_dup_u64"><div>uint64x2x3_t <b>vld3q_dup_u64</b> (uint64_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
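+ <h4>Example</h4> <p>An illustrative C sketch, not part of the Arm reference: assuming an AArch64 target with arm_neon.h, it reads one lane back from each replicated vector (function name hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Every lane of r.val[i] equals src[i], so any lane can be read back. */
+uint64_t sum_of_structure(const uint64_t src[3]) {
+    uint64x2x3_t r = vld3q_dup_u64(src);
+    return vgetq_lane_u64(r.val[0], 0)
+         + vgetq_lane_u64(r.val[1], 0)
+         + vgetq_lane_u64(r.val[2], 1);   /* lane 1 holds the same value */
+}
+</pre>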
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_p64" type="checkbox"><label for="vld3q_dup_p64"><div>poly64x2x3_t <b>vld3q_dup_p64</b> (poly64_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
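+ <h4>Example</h4> <p>An illustrative C sketch, not part of the Arm reference: it assumes an AArch64 target whose toolchain exposes the poly64 vector types in arm_neon.h (often gated on the crypto extensions); the helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Both 64-bit lanes of r.val[i] receive src[i]. */
+poly64x2x3_t load_poly_triple_q(const poly64_t src[3]) {
+    return vld3q_dup_p64(src);
+}
+</pre>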
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3_dup_f64" type="checkbox"><label for="vld3_dup_f64"><div>float64x1x3_t <b>vld3_dup_f64</b> (float64_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
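+ <h4>Example</h4> <p>An illustrative C sketch, not part of the Arm reference: assuming an AArch64 target with arm_neon.h, it unpacks a 3-element double structure back into scalars (function and parameter names hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Each single-lane vector r.val[i] holds src[i]. */
+void unpack3(const double src[3], double out[3]) {
+    float64x1x3_t r = vld3_dup_f64(src);
+    out[0] = vget_lane_f64(r.val[0], 0);
+    out[1] = vget_lane_f64(r.val[1], 0);
+    out[2] = vget_lane_f64(r.val[2], 0);
+}
+</pre>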
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_dup_f64" type="checkbox"><label for="vld3q_dup_f64"><div>float64x2x3_t <b>vld3q_dup_f64</b> (float64_t const * ptr)<span class="right">Load single 3-element structure and replicate to all lanes of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure and Replicate to all lanes of three registers. This instruction loads a 3-element structure from memory and replicates the structure to all the lanes of the three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3r-load-single-3-element-structure-and-replicate-to-all-lanes-of-three-registers">LD3R</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
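+ <h4>Example</h4> <p>An illustrative C sketch, not part of the Arm reference: assuming an AArch64 target with arm_neon.h, it uses the replicated vectors as broadcast coefficients for a fused multiply-add (function and variable names hypothetical).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* coeff[0..2] are broadcast into both lanes of c.val[0..2], giving
+   per-vector constants for y = c0 + c1*x + c2*x*x on two lanes at once. */
+float64x2_t poly2_eval(const double coeff[3], float64x2_t x) {
+    float64x2x3_t c = vld3q_dup_f64(coeff);
+    float64x2_t y = vfmaq_f64(c.val[1], c.val[2], x);  /* c1 + c2*x */
+    return vfmaq_f64(c.val[0], y, x);                  /* c0 + y*x  */
+}
+</pre>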
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_s8" type="checkbox"><label for="vld4_dup_s8"><div>int8x8x4_t <b>vld4_dup_s8</b> (int8_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_s8" type="checkbox"><label for="vld4q_dup_s8"><div>int8x16x4_t <b>vld4q_dup_s8</b> (int8_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
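+<h4>Usage Sketch</h4><p>A minimal C sketch of calling this intrinsic (an editorial addition, not part of the ARM reference; it assumes a toolchain that provides arm_neon.h):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    // One 4-element structure in memory: {1, 2, 3, 4}.
+    const int8_t src[4] = {1, 2, 3, 4};
+
+    // Each src[i] is broadcast to all 16 lanes of r.val[i].
+    int8x16x4_t r = vld4q_dup_s8(src);
+
+    // Lane 0 and lane 15 of each vector hold the same element.
+    printf("%d %d\n", vgetq_lane_s8(r.val[0], 0), vgetq_lane_s8(r.val[0], 15)); // 1 1
+    printf("%d %d\n", vgetq_lane_s8(r.val[3], 0), vgetq_lane_s8(r.val[3], 15)); // 4 4
+    return 0;
+}
+</pre>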
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_s16" type="checkbox"><label for="vld4_dup_s16"><div>int16x4x4_t <b>vld4_dup_s16</b> (int16_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
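+<h4>Usage Sketch</h4><p>A minimal C sketch (editorial addition, assuming arm_neon.h) showing the register-to-field mapping listed above:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// r.val[0] holds four copies of s[0], r.val[1] four copies of s[1],
+// and so on, matching the Vt..Vt4 to result.val[0..3] mapping above.
+int16x4x4_t broadcast4(const int16_t s[4]) {
+    return vld4_dup_s16(s);
+}
+</pre>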
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_s16" type="checkbox"><label for="vld4q_dup_s16"><div>int16x8x4_t <b>vld4q_dup_s16</b> (int16_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
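+<h4>Usage Sketch</h4><p>A hedged C sketch (editorial addition, assuming arm_neon.h) of one way the broadcast result can feed lane-wise arithmetic:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Broadcast four offsets once, then reuse them across all 8 lanes.
+void add_first_offset(int16_t *data, const int16_t offs[4]) {
+    int16x8x4_t k = vld4q_dup_s16(offs);
+    int16x8_t v = vld1q_s16(data);
+    v = vaddq_s16(v, k.val[0]);   // add offs[0] to every lane
+    vst1q_s16(data, v);
+}
+</pre>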
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_s32" type="checkbox"><label for="vld4_dup_s32"><div>int32x2x4_t <b>vld4_dup_s32</b> (int32_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
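+<h4>Usage Sketch</h4><p>An editorial equivalence sketch (assuming arm_neon.h): semantically, though not necessarily in generated code, the intrinsic behaves like four scalar loads each broadcast with vdup_n_s32:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Same result as vld4_dup_s32(s), built from per-element broadcasts.
+int32x2x4_t dup_by_hand(const int32_t s[4]) {
+    int32x2x4_t r;
+    r.val[0] = vdup_n_s32(s[0]);
+    r.val[1] = vdup_n_s32(s[1]);
+    r.val[2] = vdup_n_s32(s[2]);
+    r.val[3] = vdup_n_s32(s[3]);
+    return r;
+}
+</pre>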
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_s32" type="checkbox"><label for="vld4q_dup_s32"><div>int32x4x4_t <b>vld4q_dup_s32</b> (int32_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
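+<h4>Usage Sketch</h4><p>A brief C sketch (editorial addition, assuming arm_neon.h); the function name and scenario are illustrative only:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Multiply a vector by the first of four broadcast coefficients;
+// k.val[1..3] remain available for further stages.
+int32x4_t scale_by_first(int32x4_t v, const int32_t coeff[4]) {
+    int32x4x4_t k = vld4q_dup_s32(coeff);
+    return vmulq_s32(v, k.val[0]);
+}
+</pre>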
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_u8" type="checkbox"><label for="vld4_dup_u8"><div>uint8x8x4_t <b>vld4_dup_u8</b> (uint8_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
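+<h4>Usage Sketch</h4><p>A hedged C sketch (editorial addition, assuming arm_neon.h); the RGBA framing is an illustrative assumption, not from the ARM reference:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Splat one RGBA pixel: each of the four channels fills its own
+// 8-lane vector, ready for channel-wise work on packed pixels.
+uint8x8x4_t splat_pixel(const uint8_t rgba[4]) {
+    return vld4_dup_u8(rgba);
+}
+</pre>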
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_u8" type="checkbox"><label for="vld4q_dup_u8"><div>uint8x16x4_t <b>vld4q_dup_u8</b> (uint8_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
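+ <h4>Example</h4> <p>A usage sketch, not part of the ARM reference: it pairs vld4q_dup_u8 with vst4q_u8 to fill a pixel buffer with one RGBA colour, 16 pixels per iteration. The helper name fill_rgba and the multiple-of-16 pixel count are assumptions made for illustration.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Fill n RGBA pixels (n assumed to be a multiple of 16) with the colour
+   stored at rgba[0..3]. vld4q_dup_u8 broadcasts each of the four bytes
+   into its own 16-lane register; vst4q_u8 re-interleaves them, so every
+   store writes the colour 16 times. */
+static void fill_rgba(uint8_t *dst, const uint8_t rgba[4], size_t n)
+{
+    uint8x16x4_t c = vld4q_dup_u8(rgba); /* val[0] = 16 copies of R, ... */
+    for (size_t i = 0; i &lt; n; i += 16, dst += 64)
+        vst4q_u8(dst, c);                /* store R,G,B,A interleaved */
+}</pre>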
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_u16" type="checkbox"><label for="vld4_dup_u16"><div>uint16x4x4_t <b>vld4_dup_u16</b> (uint16_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
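+ <h4>Example</h4> <p>A usage sketch, not part of the ARM reference: the four broadcast registers act as one gain per channel for interleaved 4-channel u16 samples. The helper name scale4, the gains pointer, and the multiple-of-4 structure count are assumptions made for illustration.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Scale n interleaved 4-channel u16 structures (n a multiple of 4) by one
+   gain per channel, modulo 2^16. vld4_dup_u16 broadcasts gains[0..3], one
+   value per 4-lane register. */
+static void scale4(uint16_t *buf, const uint16_t gains[4], size_t n)
+{
+    uint16x4x4_t g = vld4_dup_u16(gains);
+    for (size_t i = 0; i &lt; n; i += 4, buf += 16) {
+        uint16x4x4_t s = vld4_u16(buf);      /* de-interleave 4 structures */
+        for (int c = 0; c &lt; 4; c++)
+            s.val[c] = vmul_u16(s.val[c], g.val[c]);
+        vst4_u16(buf, s);                    /* re-interleave and store */
+    }
+}</pre>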
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_u16" type="checkbox"><label for="vld4q_dup_u16"><div>uint16x8x4_t <b>vld4q_dup_u16</b> (uint16_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
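+ <h4>Example</h4> <p>A usage sketch, not part of the ARM reference: the 128-bit form of the same pattern, here combined with saturating addition. The helper name add_offsets and the multiple-of-8 structure count are assumptions made for illustration.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Add one saturating offset per channel to n interleaved 4-channel u16
+   structures (n a multiple of 8), 8 structures (32 values) per pass. */
+static void add_offsets(uint16_t *buf, const uint16_t offs[4], size_t n)
+{
+    uint16x8x4_t o = vld4q_dup_u16(offs);    /* offs[c] in every lane of val[c] */
+    for (size_t i = 0; i &lt; n; i += 8, buf += 32) {
+        uint16x8x4_t s = vld4q_u16(buf);
+        for (int c = 0; c &lt; 4; c++)
+            s.val[c] = vqaddq_u16(s.val[c], o.val[c]);
+        vst4q_u16(buf, s);
+    }
+}</pre>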
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_u32" type="checkbox"><label for="vld4_dup_u32"><div>uint32x2x4_t <b>vld4_dup_u32</b> (uint32_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
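+ <h4>Example</h4> <p>A usage sketch, not part of the ARM reference: a broadcast 4-field u32 structure used as a per-field bias for an array of records. The helper name add_bias and the even record count are assumptions made for illustration.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Add a 4-word bias, field by field, to n interleaved 4-field u32 records
+   (n assumed even), two records per iteration. */
+static void add_bias(uint32_t *recs, const uint32_t bias[4], size_t n)
+{
+    uint32x2x4_t b = vld4_dup_u32(bias);   /* val[f] = 2 copies of bias[f] */
+    for (size_t i = 0; i &lt; n; i += 2, recs += 8) {
+        uint32x2x4_t r = vld4_u32(recs);
+        for (int f = 0; f &lt; 4; f++)
+            r.val[f] = vadd_u32(r.val[f], b.val[f]);
+        vst4_u32(recs, r);
+    }
+}</pre>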
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_u32" type="checkbox"><label for="vld4q_dup_u32"><div>uint32x4x4_t <b>vld4q_dup_u32</b> (uint32_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
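+ <h4>Example</h4> <p>A usage sketch, not part of the ARM reference: the 128-bit form, XORing a broadcast 4-word key into interleaved records. The helper name xor_key and the multiple-of-4 record count are assumptions made for illustration.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* XOR a 4-word key, field by field, into n interleaved 4-field u32
+   records (n a multiple of 4), four records (16 words) per iteration. */
+static void xor_key(uint32_t *recs, const uint32_t key[4], size_t n)
+{
+    uint32x4x4_t k = vld4q_dup_u32(key);   /* key[f] in every lane of val[f] */
+    for (size_t i = 0; i &lt; n; i += 4, recs += 16) {
+        uint32x4x4_t r = vld4q_u32(recs);
+        for (int f = 0; f &lt; 4; f++)
+            r.val[f] = veorq_u32(r.val[f], k.val[f]);
+        vst4q_u32(recs, r);
+    }
+}</pre>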
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_f16" type="checkbox"><label for="vld4_dup_f16"><div>float16x4x4_t <b>vld4_dup_f16</b> (float16_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
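+ <h4>Example</h4> <p>A usage sketch, not part of the ARM reference, and one that assumes a toolchain with float16_t storage support (for example, an AArch64 target with the fp16 extension enabled); it only moves data, so no FP16 arithmetic extension is needed. The helper name splat_f16x4 and the multiple-of-4 structure count are assumptions made for illustration.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Replicate one 4-element f16 structure at src[0..3] across n interleaved
+   structures of dst (n a multiple of 4), without touching the values. */
+static void splat_f16x4(float16_t *dst, const float16_t src[4], size_t n)
+{
+    float16x4x4_t v = vld4_dup_f16(src);   /* val[e] = 4 copies of src[e] */
+    for (size_t i = 0; i &lt; n; i += 4, dst += 16)
+        vst4_f16(dst, v);                  /* store interleaved */
+}</pre>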
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_f16" type="checkbox"><label for="vld4q_dup_f16"><div>float16x8x4_t <b>vld4q_dup_f16</b> (float16_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
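+<h4>Example</h4><p class="aml">A minimal C usage sketch, assuming a toolchain whose &lt;arm_neon.h&gt; provides the float16_t type; the demo wrapper and its pointer argument are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vld4q_dup_f16(const float16_t *ptr) {
+    /* ptr is assumed (illustratively) to address one 4-element structure. */
+    float16x8x4_t r = vld4q_dup_f16(ptr);
+    /* All 8 lanes of r.val[k] now hold ptr[k], for k = 0..3. */
+    (void)r;
+}</pre>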
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_f32" type="checkbox"><label for="vld4_dup_f32"><div>float32x2x4_t <b>vld4_dup_f32</b> (float32_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
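+<h4>Example</h4><p class="aml">A minimal C usage sketch, assuming &lt;arm_neon.h&gt; on a NEON-enabled target; the wrapper function and buffer values are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vld4_dup_f32(void) {
+    /* One illustrative 4-element structure in memory. */
+    const float32_t quad[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    float32x2x4_t r = vld4_dup_f32(quad);
+    /* Both lanes of r.val[0] hold 1.0f, both lanes of
+       r.val[1] hold 2.0f, and so on. */
+    (void)r;
+}</pre>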
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_f32" type="checkbox"><label for="vld4q_dup_f32"><div>float32x4x4_t <b>vld4q_dup_f32</b> (float32_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
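+<h4>Example</h4><p class="aml">A minimal C usage sketch, assuming &lt;arm_neon.h&gt;; the buffer values are illustrative, and one lane is read back with vgetq_lane_f32 to show the replication.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vld4q_dup_f32(void) {
+    /* One illustrative 4-element structure in memory. */
+    const float32_t quad[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    float32x4x4_t r = vld4q_dup_f32(quad);
+    /* Every lane of r.val[2] holds quad[2]. */
+    float32_t x = vgetq_lane_f32(r.val[2], 1); /* x == 3.0f */
+    (void)x;
+}</pre>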
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_p8" type="checkbox"><label for="vld4_dup_p8"><div>poly8x8x4_t <b>vld4_dup_p8</b> (poly8_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
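+<h4>Example</h4><p class="aml">A minimal C usage sketch, assuming &lt;arm_neon.h&gt;; the polynomial coefficient values are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vld4_dup_p8(void) {
+    /* Illustrative polynomial coefficients forming one 4-element structure. */
+    const poly8_t coeffs[4] = {0x1b, 0x2c, 0x3d, 0x4e};
+    poly8x8x4_t r = vld4_dup_p8(coeffs);
+    /* All 8 lanes of r.val[k] hold coeffs[k], for k = 0..3. */
+    (void)r;
+}</pre>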
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_p8" type="checkbox"><label for="vld4q_dup_p8"><div>poly8x16x4_t <b>vld4q_dup_p8</b> (poly8_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
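+<h4>Example</h4><p class="aml">A minimal C usage sketch, assuming &lt;arm_neon.h&gt;; the wrapper and its pointer argument are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vld4q_dup_p8(const poly8_t *ptr) {
+    /* ptr is assumed (illustratively) to address one 4-element structure. */
+    poly8x16x4_t r = vld4q_dup_p8(ptr);
+    /* All 16 lanes of r.val[k] hold ptr[k], for k = 0..3. */
+    (void)r;
+}</pre>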
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_p16" type="checkbox"><label for="vld4_dup_p16"><div>poly16x4x4_t <b>vld4_dup_p16</b> (poly16_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
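+<h4>Example</h4><p class="aml">A minimal C usage sketch, assuming &lt;arm_neon.h&gt;; the element values are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo_vld4_dup_p16(void) {
+    /* One illustrative 4-element structure of 16-bit polynomial values. */
+    const poly16_t h[4] = {0x0011, 0x0022, 0x0033, 0x0044};
+    poly16x4x4_t r = vld4_dup_p16(h);
+    /* All 4 lanes of r.val[k] hold h[k], for k = 0..3. */
+    (void)r;
+}</pre>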
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_p16" type="checkbox"><label for="vld4q_dup_p16"><div>poly16x8x4_t <b>vld4q_dup_p16</b> (poly16_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_s64" type="checkbox"><label for="vld4_dup_s64"><div>int64x1x4_t <b>vld4_dup_s64</b> (int64_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_u64" type="checkbox"><label for="vld4_dup_u64"><div>uint64x1x4_t <b>vld4_dup_u64</b> (uint64_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_p64" type="checkbox"><label for="vld4_dup_p64"><div>poly64x1x4_t <b>vld4_dup_p64</b> (poly64_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_s64" type="checkbox"><label for="vld4q_dup_s64"><div>int64x2x4_t <b>vld4q_dup_s64</b> (int64_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_u64" type="checkbox"><label for="vld4q_dup_u64"><div>uint64x2x4_t <b>vld4q_dup_u64</b> (uint64_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_p64" type="checkbox"><label for="vld4q_dup_p64"><div>poly64x2x4_t <b>vld4q_dup_p64</b> (poly64_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_dup_f64" type="checkbox"><label for="vld4_dup_f64"><div>float64x1x4_t <b>vld4_dup_f64</b> (float64_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
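+ <h4>Example</h4> <p>A minimal usage sketch (an illustration, assuming an AArch64 toolchain where this intrinsic is provided through &lt;arm_neon.h&gt;; the source values are arbitrary):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* One 4-element structure in memory. */
+    const float64_t src[4] = {1.0, 2.0, 3.0, 4.0};
+    /* Each structure element is replicated to every lane of its register;
+       the non-q _f64 variant has a single 64-bit lane per register. */
+    float64x1x4_t v = vld4_dup_f64(src);
+    printf("%f %f %f %f\n",
+           vget_lane_f64(v.val[0], 0), vget_lane_f64(v.val[1], 0),
+           vget_lane_f64(v.val[2], 0), vget_lane_f64(v.val[3], 0));
+    return 0;
+}</pre>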
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_dup_f64" type="checkbox"><label for="vld4q_dup_f64"><div>float64x2x4_t <b>vld4q_dup_f64</b> (float64_t const * ptr)<span class="right">Load single 4-element structure and replicate to all lanes of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure and Replicate to all lanes of four registers. This instruction loads a 4-element structure from memory and replicates the structure to all the lanes of the four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4r-load-single-4-element-structure-and-replicate-to-all-lanes-of-four-registers">LD4R</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
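+ <h4>Example</h4> <p>A minimal usage sketch (an illustration, assuming an AArch64 toolchain where this intrinsic is provided through &lt;arm_neon.h&gt;):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const float64_t src[4] = {1.0, 2.0, 3.0, 4.0};
+    /* Each of the four 128-bit registers holds two lanes, both
+       replicated from the corresponding structure element. */
+    float64x2x4_t v = vld4q_dup_f64(src);
+    for (int i = 0; i &lt; 4; i++)
+        printf("val[%d] = {%f, %f}\n", i,
+               vgetq_lane_f64(v.val[i], 0), vgetq_lane_f64(v.val[i], 1));
+    return 0;
+}</pre>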
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2_s8" type="checkbox"><label for="vst2_s8"><div>void <b>vst2_s8</b> (int8_t * ptr, int8x8x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures to memory from corresponding elements of two SIMD&amp;FP registers, interleaving the elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
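+ <h4>Example</h4> <p>A minimal usage sketch (an illustration, assuming a toolchain with NEON support through &lt;arm_neon.h&gt;; it shows the interleaved store order):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const int8_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    const int8_t b[8] = {10, 11, 12, 13, 14, 15, 16, 17};
+    int8x8x2_t v;
+    v.val[0] = vld1_s8(a);
+    v.val[1] = vld1_s8(b);
+    int8_t out[16];
+    /* Elements of val[0] and val[1] end up interleaved in memory:
+       out = {0, 10, 1, 11, 2, 12, ...}. */
+    vst2_s8(out, v);
+    for (int i = 0; i &lt; 16; i++)
+        printf("%d ", out[i]);
+    printf("\n");
+    return 0;
+}</pre>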
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_s8" type="checkbox"><label for="vst2q_s8"><div>void <b>vst2q_s8</b> (int8_t * ptr, int8x16x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures to memory from corresponding elements of two SIMD&amp;FP registers, interleaving the elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
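+ <h4>Example</h4> <p>A minimal usage sketch (an illustration, assuming a toolchain with NEON support through &lt;arm_neon.h&gt;):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t a[16], b[16], out[32];
+    for (int i = 0; i &lt; 16; i++) { a[i] = (int8_t)i; b[i] = (int8_t)(i + 100); }
+    int8x16x2_t v;
+    v.val[0] = vld1q_s8(a);
+    v.val[1] = vld1q_s8(b);
+    /* The q form interleaves all sixteen element pairs:
+       out = {0, 100, 1, 101, 2, 102, ...}. */
+    vst2q_s8(out, v);
+    for (int i = 0; i &lt; 32; i++)
+        printf("%d ", out[i]);
+    printf("\n");
+    return 0;
+}</pre>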
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_s16" type="checkbox"><label for="vst2_s16"><div>void <b>vst2_s16</b> (int16_t * ptr, int16x4x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures to memory from corresponding elements of two SIMD&amp;FP registers, interleaving the elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
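+ <h4>Example</h4> <p>A minimal usage sketch (an illustration, assuming a toolchain with NEON support through &lt;arm_neon.h&gt;):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const int16_t a[4] = {0, 1, 2, 3};
+    const int16_t b[4] = {100, 101, 102, 103};
+    int16x4x2_t v;
+    v.val[0] = vld1_s16(a);
+    v.val[1] = vld1_s16(b);
+    int16_t out[8];
+    /* out = {0, 100, 1, 101, 2, 102, 3, 103}. */
+    vst2_s16(out, v);
+    for (int i = 0; i &lt; 8; i++)
+        printf("%d ", out[i]);
+    printf("\n");
+    return 0;
+}</pre>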
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_s16" type="checkbox"><label for="vst2q_s16"><div>void <b>vst2q_s16</b> (int16_t * ptr, int16x8x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures to memory from corresponding elements of two SIMD&amp;FP registers, interleaving the elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
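+ <h4>Example</h4> <p>A minimal usage sketch (an illustration, assuming a toolchain with NEON support through &lt;arm_neon.h&gt;):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16_t a[8], b[8], out[16];
+    for (int i = 0; i &lt; 8; i++) { a[i] = (int16_t)i; b[i] = (int16_t)(i + 100); }
+    int16x8x2_t v;
+    v.val[0] = vld1q_s16(a);
+    v.val[1] = vld1q_s16(b);
+    /* out = {0, 100, 1, 101, 2, 102, ...}. */
+    vst2q_s16(out, v);
+    for (int i = 0; i &lt; 16; i++)
+        printf("%d ", out[i]);
+    printf("\n");
+    return 0;
+}</pre>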
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_s32" type="checkbox"><label for="vst2_s32"><div>void <b>vst2_s32</b> (int32_t * ptr, int32x2x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures to memory from corresponding elements of two SIMD&amp;FP registers, interleaving the elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
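+ <h4>Usage example</h4><p class="aml">The wback path above models the post-index form of the instruction, in which the base register is advanced past the bytes just stored. The C intrinsics expose no separate post-index variant; a compiler may fold a pointer increment into that addressing mode. A minimal sketch, assuming a C toolchain with arm_neon.h (function and buffer names are illustrative, not from this reference):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Stream n4 blocks of (a,b) pairs to dst. The dst += 8 step is the
+   C-level analogue of the writeback (address + offs) computed above. */
+void stream_pairs(int32_t *dst, const int32_t *a, const int32_t *b, size_t n4)
+{
+    for (size_t i = 0; i &lt; n4; i++) {
+        int32x4x2_t v;
+        v.val[0] = vld1q_s32(a + 4 * i);
+        v.val[1] = vld1q_s32(b + 4 * i);
+        vst2q_s32(dst, v);   /* ST2 {Vt.4S - Vt2.4S},[Xn] */
+        dst += 8;            /* candidate for post-index writeback */
+    }
+}</pre>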
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_s32" type="checkbox"><label for="vst2q_s32"><div>void <b><b>vst2q_s32</b></b> (int32_t * ptr, int32x4x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 2-element structures from two registers. This instruction interleaves corresponding elements from two SIMD&amp;FP registers and stores them to memory as 2-element structures.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
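+ <h4>Usage example</h4><p class="aml">A minimal, self-contained sketch of vst2q_s32 (assuming a C toolchain with arm_neon.h; array names are illustrative). It shows the interleaved memory layout the store produces:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void)
+{
+    int32_t a[4] = { 0, 1, 2, 3 };      /* e.g. real parts */
+    int32_t b[4] = { 10, 11, 12, 13 };  /* e.g. imaginary parts */
+    int32_t out[8];
+
+    int32x4x2_t v;
+    v.val[0] = vld1q_s32(a);
+    v.val[1] = vld1q_s32(b);
+    vst2q_s32(out, v);  /* out = a0,b0,a1,b1,a2,b2,a3,b3 */
+
+    for (int i = 0; i &lt; 8; i++)
+        printf("%d ", out[i]);  /* prints: 0 10 1 11 2 12 3 13 */
+    return 0;
+}</pre>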
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_u8" type="checkbox"><label for="vst2_u8"><div>void <b><b>vst2_u8</b></b> (uint8_t * ptr, uint8x8x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 2-element structures from two registers. This instruction interleaves corresponding elements from two SIMD&amp;FP registers and stores them to memory as 2-element structures.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
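+ <h4>Usage example</h4><p class="aml">One common use of vst2_u8 is merging two separate byte planes into an interleaved stream. A short sketch under the same assumptions as above (names are illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Interleave 8 bytes from each plane: p0[0],p1[0],p0[1],p1[1],...
+   Writes 16 bytes to dst. */
+void interleave8(uint8_t *dst, const uint8_t *p0, const uint8_t *p1)
+{
+    uint8x8x2_t v;
+    v.val[0] = vld1_u8(p0);
+    v.val[1] = vld1_u8(p1);
+    vst2_u8(dst, v);
+}</pre>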
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_u8" type="checkbox"><label for="vst2q_u8"><div>void <b><b>vst2q_u8</b></b> (uint8_t * ptr, uint8x16x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 2-element structures from two registers. This instruction interleaves corresponding elements from two SIMD&amp;FP registers and stores them to memory as 2-element structures.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
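+ <h4>Usage example</h4><p class="aml">vst2q_u8 is the inverse of the de-interleaving load vld2q_u8, so a load/store round trip reproduces the input. A minimal sketch (assumes arm_neon.h and a hosted C environment):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;assert.h&gt;
+#include &lt;string.h&gt;
+
+int main(void)
+{
+    uint8_t src[32], dst[32];
+    for (int i = 0; i &lt; 32; i++)
+        src[i] = (uint8_t)i;
+
+    uint8x16x2_t pair = vld2q_u8(src);  /* LD2: de-interleave 32 bytes */
+    vst2q_u8(dst, pair);                /* ST2: re-interleave them */
+
+    assert(memcmp(src, dst, 32) == 0);  /* round trip is lossless */
+    return 0;
+}</pre>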
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_u16" type="checkbox"><label for="vst2_u16"><div>void <b><b>vst2_u16</b></b> (uint16_t * ptr, uint16x4x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 2-element structures from two registers. This instruction interleaves corresponding elements from two SIMD&amp;FP registers and stores them to memory as 2-element structures.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
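+ <h4>Usage example</h4><p class="aml">A sketch of vst2_u16 packing separate left/right 16-bit sample blocks into interleaved frames (illustrative names, assuming arm_neon.h):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Pack 4 left and 4 right samples as L,R,L,R,...; writes 8 uint16_t. */
+void pack_stereo4(uint16_t *frames, const uint16_t *left, const uint16_t *right)
+{
+    uint16x4x2_t v;
+    v.val[0] = vld1_u16(left);
+    v.val[1] = vld1_u16(right);
+    vst2_u16(frames, v);
+}</pre>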
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_u16" type="checkbox"><label for="vst2q_u16"><div>void <b><b>vst2q_u16</b></b> (uint16_t * ptr, uint16x8x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 2-element structures from two registers. This instruction interleaves corresponding elements from two SIMD&amp;FP registers and stores them to memory as 2-element structures.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
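+ <h4>Usage example</h4><p class="aml">vst2q_u16 writes 8 two-element structures (16 halfwords) per call, so bulk loops need a scalar tail. A minimal sketch, assuming arm_neon.h (names are illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Fill n_pairs (key,value) halfword pairs starting at dst. */
+void fill_pairs(uint16_t *dst, size_t n_pairs, uint16_t key, uint16_t value)
+{
+    uint16x8x2_t v;
+    v.val[0] = vdupq_n_u16(key);
+    v.val[1] = vdupq_n_u16(value);
+
+    size_t i = 0;
+    for (; i + 8 &lt;= n_pairs; i += 8)
+        vst2q_u16(dst + 2 * i, v);  /* 8 pairs = 16 uint16_t per store */
+    for (; i &lt; n_pairs; i++) {      /* scalar tail */
+        dst[2 * i]     = key;
+        dst[2 * i + 1] = value;
+    }
+}</pre>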
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_u32" type="checkbox"><label for="vst2_u32"><div>void <b><b>vst2_u32</b></b> (uint32_t * ptr, uint32x2x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 2-element structures from two registers. This instruction interleaves corresponding elements from two SIMD&amp;FP registers and stores them to memory as 2-element structures.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
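+ <h4>Usage example</h4><p class="aml">A short sketch of vst2_u32 storing two (index, value) pairs contiguously (assuming arm_neon.h; names are illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store idx0,val0,idx1,val1; writes 4 uint32_t values. */
+void store_pairs2(uint32_t *dst, uint32x2_t idx, uint32x2_t val)
+{
+    uint32x2x2_t v;
+    v.val[0] = idx;
+    v.val[1] = val;
+    vst2_u32(dst, v);
+}</pre>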
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_u32" type="checkbox"><label for="vst2q_u32"><div>void <b><b>vst2q_u32</b></b> (uint32_t * ptr, uint32x4x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 2-element structures from two registers. This instruction interleaves corresponding elements from two SIMD&amp;FP registers and stores them to memory as 2-element structures.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
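+ <h4>Example</h4> <p>A minimal C sketch (editor's illustration, not part of the ARM reference; the helper name interleave_u32 is hypothetical) showing the interleaving effect of the store: memory receives a0, b0, a1, b1, and so on.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* dst[0..7] receives {a0,b0,a1,b1,a2,b2,a3,b3}. */
+void interleave_u32(uint32_t *dst, uint32x4_t a, uint32x4_t b) {
+    uint32x4x2_t pair = { { a, b } };
+    vst2q_u32(dst, pair);
+}</pre>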
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_f16" type="checkbox"><label for="vst2_f16"><div>void <b>vst2_f16</b> (float16_t * ptr, float16x4x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures from two SIMD&amp;FP registers to memory, with interleaving. Every element of the registers is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
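+ <h4>Example</h4> <p>A C sketch (editor's illustration, not part of the ARM reference; the helper name interleave_f16 is hypothetical). It assumes the toolchain defines float16_t in arm_neon.h, which requires half-precision storage support on the target.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[0..7] receives {a0,b0,a1,b1,a2,b2,a3,b3} as half-precision values. */
+void interleave_f16(float16_t *dst, float16x4_t a, float16x4_t b) {
+    float16x4x2_t v = { { a, b } };
+    vst2_f16(dst, v);
+}</pre>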
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_f16" type="checkbox"><label for="vst2q_f16"><div>void <b>vst2q_f16</b> (float16_t * ptr, float16x8x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures from two SIMD&amp;FP registers to memory, with interleaving. Every element of the registers is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_f32" type="checkbox"><label for="vst2_f32"><div>void <b>vst2_f32</b> (float32_t * ptr, float32x2x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures from two SIMD&amp;FP registers to memory, with interleaving. Every element of the registers is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
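+ <h4>Example</h4> <p>A C sketch (editor's illustration, not part of the ARM reference; the helper name store_complex2 is hypothetical): two complex numbers kept as separate real/imaginary vectors are written out in the conventional interleaved {re,im} layout.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[0..3] receives {re0,im0,re1,im1}. */
+void store_complex2(float *dst, float32x2_t re, float32x2_t im) {
+    float32x2x2_t z = { { re, im } };
+    vst2_f32(dst, z);
+}</pre>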
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_f32" type="checkbox"><label for="vst2q_f32"><div>void <b>vst2q_f32</b> (float32_t * ptr, float32x4x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures from two SIMD&amp;FP registers to memory, with interleaving. Every element of the registers is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
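+ <h4>Example</h4> <p>A C sketch (editor's illustration, not part of the ARM reference; the helper name store_stereo4 is hypothetical): four stereo sample frames held as separate left/right channel vectors are stored interleaved.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[0..7] receives {L0,R0,L1,R1,L2,R2,L3,R3}. */
+void store_stereo4(float *dst, float32x4_t left, float32x4_t right) {
+    float32x4x2_t frames = { { left, right } };
+    vst2q_f32(dst, frames);
+}</pre>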
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_p8" type="checkbox"><label for="vst2_p8"><div>void <b>vst2_p8</b> (poly8_t * ptr, poly8x8x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures from two SIMD&amp;FP registers to memory, with interleaving. Every element of the registers is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
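+ <h4>Example</h4> <p>A C sketch (editor's illustration, not part of the ARM reference; the helper name interleave_p8 is hypothetical): two 8-byte polynomial vectors are merged into one 16-byte interleaved buffer.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* dst[0..15] receives {a0,b0,a1,b1,...,a7,b7}. */
+void interleave_p8(poly8_t *dst, poly8x8_t a, poly8x8_t b) {
+    poly8x8x2_t v = { { a, b } };
+    vst2_p8(dst, v);
+}</pre>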
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_p8" type="checkbox"><label for="vst2q_p8"><div>void <b>vst2q_p8</b> (poly8_t * ptr, poly8x16x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures from two SIMD&amp;FP registers to memory, with interleaving. Every element of the registers is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_p16" type="checkbox"><label for="vst2_p16"><div>void <b>vst2_p16</b> (poly16_t * ptr, poly16x4x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures to memory from two SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre>ST2 {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr → Xn
+val.val[1] → Vt2.4H
+val.val[0] → Vt.4H</pre> <h4>Results</h4> <pre>void → result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
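+<p>A minimal usage sketch (illustrative only, not part of the patched file; assumes an AArch64 or ARMv7 NEON toolchain providing arm_neon.h): vst2_p16 interleaves the lanes of the two source vectors into memory, which is the ST2 behaviour shown above.</p>
+<pre class="codeblock">#include <arm_neon.h>
+#include <stdio.h>
+
+int main(void) {
+    poly16_t buf[8];
+    poly16x4x2_t pair;
+    /* lanes 0..3 of each source vector; lane 0 sits in the low 16 bits */
+    pair.val[0] = vcreate_p16(0x0003000200010000ULL);
+    pair.val[1] = vcreate_p16(0x000d000c000b000aULL);
+    /* interleaved store: buf = {0x0, 0xa, 0x1, 0xb, 0x2, 0xc, 0x3, 0xd} */
+    vst2_p16(buf, pair);
+    for (int i = 0; i < 8; i++)
+        printf("%#x ", (unsigned)buf[i]);
+    putchar('\n');
+    return 0;
+}</pre>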
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_p16" type="checkbox"><label for="vst2q_p16"><div>void <b>vst2q_p16</b> (poly16_t * ptr, poly16x8x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures to memory from two SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre>ST2 {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr → Xn
+val.val[1] → Vt2.8H
+val.val[0] → Vt.8H</pre> <h4>Results</h4> <pre>void → result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_s64" type="checkbox"><label for="vst2_s64"><div>void <b>vst2_s64</b> (int64_t * ptr, int64x1x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre>ST1 {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr → Xn
+val.val[1] → Vt2.1D
+val.val[0] → Vt.1D</pre> <h4>Results</h4> <pre>void → result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
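+<p>A sketch of the 64-bit case (illustrative only, not from the patched file; assumes arm_neon.h): with one-lane source vectors there is nothing to interleave, so vst2_s64 simply stores the two registers back to back, matching the ST1 encoding listed above.</p>
+<pre class="codeblock">#include <arm_neon.h>
+
+/* Hypothetical helper: stores a then b to out[0], out[1] via vst2_s64. */
+void store_pair_s64(int64_t out[2], int64_t a, int64_t b) {
+    int64x1x2_t v;
+    v.val[0] = vdup_n_s64(a);  /* single-lane vector holding a */
+    v.val[1] = vdup_n_s64(b);  /* single-lane vector holding b */
+    vst2_s64(out, v);          /* out[0] = a, out[1] = b */
+}</pre>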
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_u64" type="checkbox"><label for="vst2_u64"><div>void <b>vst2_u64</b> (uint64_t * ptr, uint64x1x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre>ST1 {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr → Xn
+val.val[1] → Vt2.1D
+val.val[0] → Vt.1D</pre> <h4>Results</h4> <pre>void → result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_p64" type="checkbox"><label for="vst2_p64"><div>void <b>vst2_p64</b> (poly64_t * ptr, poly64x1x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre>ST1 {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr → Xn
+val.val[1] → Vt2.1D
+val.val[0] → Vt.1D</pre> <h4>Results</h4> <pre>void → result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_s64" type="checkbox"><label for="vst2q_s64"><div>void <b>vst2q_s64</b> (int64_t * ptr, int64x2x2_t val)<span class="right">Store multiple 2-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 2-element structures from two registers. This instruction stores multiple 2-element structures to memory from two SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre>ST2 {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr → Xn
+val.val[1] → Vt2.2D
+val.val[0] → Vt.2D</pre> <h4>Results</h4> <pre>void → result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_u64" type="checkbox"><label for="vst2q_u64"><div>void <b>vst2q_u64</b> (uint64_t * ptr, uint64x2x2_t val)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
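+ <h4>Example</h4><p class="aml">A minimal usage sketch (editorial addition, not part of the Arm reference; the function and buffer names are illustrative). The store interleaves the two vectors, so lane i of val.val[0] and lane i of val.val[1] land next to each other in memory.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Interleave two uint64x2_t vectors: out = {a0, b0, a1, b1}. */
+void demo_vst2q_u64(uint64_t out[4]) {
+    const uint64_t a[2] = {1, 2}, b[2] = {3, 4};
+    uint64x2x2_t pair;
+    pair.val[0] = vld1q_u64(a);
+    pair.val[1] = vld1q_u64(b);
+    vst2q_u64(out, pair);   /* out == {1, 3, 2, 4} */
+}
+</pre>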
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_p64" type="checkbox"><label for="vst2q_p64"><div>void <b>vst2q_p64</b> (poly64_t * ptr, poly64x2x2_t val)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
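+ <h4>Example</h4><p class="aml">A minimal usage sketch (editorial addition; names are illustrative). The polynomial variant behaves like the other 64-bit forms; note that the poly64 types are only available when the compiler targets the polynomial/crypto extension, so treat the required target flags as an assumption.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Interleave two poly64x2_t vectors: out = {a0, b0, a1, b1}. */
+void demo_vst2q_p64(poly64_t out[4]) {
+    const poly64_t a[2] = {1, 2}, b[2] = {3, 4};
+    poly64x2x2_t pair;
+    pair.val[0] = vld1q_p64(a);
+    pair.val[1] = vld1q_p64(b);
+    vst2q_p64(out, pair);   /* out == {1, 3, 2, 4} */
+}
+</pre>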
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2_f64" type="checkbox"><label for="vst2_f64"><div>void <b>vst2_f64</b> (float64_t * ptr, float64x1x2_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
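+ <h4>Example</h4><p class="aml">A minimal usage sketch (editorial addition; names are illustrative). With one lane per vector there is nothing to interleave, so the two halves are simply stored back to back, which is why this form lowers to ST1.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store the two 1-lane vectors back to back: out = {a, b}. */
+void demo_vst2_f64(float64_t out[2]) {
+    const float64_t a = 1.0, b = 2.0;
+    float64x1x2_t pair;
+    pair.val[0] = vld1_f64(&amp;a);
+    pair.val[1] = vld1_f64(&amp;b);
+    vst2_f64(out, pair);   /* out == {1.0, 2.0} */
+}
+</pre>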
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_f64" type="checkbox"><label for="vst2q_f64"><div>void <b>vst2q_f64</b> (float64_t * ptr, float64x2x2_t val)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
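+ <h4>Example</h4><p class="aml">A minimal usage sketch (editorial addition; names are illustrative): the double-precision analogue of the integer forms above.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Interleave two float64x2_t vectors: out = {a0, b0, a1, b1}. */
+void demo_vst2q_f64(float64_t out[4]) {
+    const float64_t a[2] = {1.0, 2.0}, b[2] = {3.0, 4.0};
+    float64x2x2_t pair;
+    pair.val[0] = vld1q_f64(a);
+    pair.val[1] = vld1q_f64(b);
+    vst2q_f64(out, pair);   /* out == {1.0, 3.0, 2.0, 4.0} */
+}
+</pre>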
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3_s8" type="checkbox"><label for="vst3_s8"><div>void <b>vst3_s8</b> (int8_t * ptr, int8x8x3_t val)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
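+ <h4>Example</h4><p class="aml">A minimal usage sketch (editorial addition; names are illustrative). A common use is re-interleaving three planar channels (for example R, G, B) into packed pixels: lane i of each of the three vectors forms one 3-element structure.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Pack three 8-lane planes into r0,g0,b0, r1,g1,b1, ... (24 bytes). */
+void demo_vst3_s8(int8_t out[24], const int8_t r[8],
+                  const int8_t g[8], const int8_t b[8]) {
+    int8x8x3_t rgb;
+    rgb.val[0] = vld1_s8(r);
+    rgb.val[1] = vld1_s8(g);
+    rgb.val[2] = vld1_s8(b);
+    vst3_s8(out, rgb);
+}
+</pre>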
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_s8" type="checkbox"><label for="vst3q_s8"><div>void <b>vst3q_s8</b> (int8_t * ptr, int8x16x3_t val)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
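+ <h4>Example</h4><p class="aml">A minimal usage sketch (editorial addition; names are illustrative): the 128-bit form packs sixteen 3-element structures, i.e. 48 bytes, per call.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Pack three 16-lane planes into 48 interleaved bytes. */
+void demo_vst3q_s8(int8_t out[48], const int8_t r[16],
+                   const int8_t g[16], const int8_t b[16]) {
+    int8x16x3_t rgb;
+    rgb.val[0] = vld1q_s8(r);
+    rgb.val[1] = vld1q_s8(g);
+    rgb.val[2] = vld1q_s8(b);
+    vst3q_s8(out, rgb);
+}
+</pre>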
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_s16" type="checkbox"><label for="vst3_s16"><div>void <b>vst3_s16</b> (int16_t * ptr, int16x4x3_t val)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
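+ <h4>Example</h4> <p class="aml">A minimal C usage sketch, not part of the ARM reference: the three source vectors are interleaved into memory as x0,y0,z0,x1,y1,z1 and so on. The function name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Interleave three 4-lane vectors into dst; writes 12 int16_t values. */
+void interleave3_s16(int16_t *dst, int16x4_t x, int16x4_t y, int16x4_t z)
+{
+    int16x4x3_t v;
+    v.val[0] = x;      /* maps to Vt  */
+    v.val[1] = y;      /* maps to Vt2 */
+    v.val[2] = z;      /* maps to Vt3 */
+    vst3_s16(dst, v);
+}</pre>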
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_s16" type="checkbox"><label for="vst3q_s16"><div>void <b><b>vst3q_s16</b></b> (int16_t * ptr, int16x8x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_s32" type="checkbox"><label for="vst3_s32"><div>void <b><b>vst3_s32</b></b> (int32_t * ptr, int32x2x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_s32" type="checkbox"><label for="vst3q_s32"><div>void <b><b>vst3q_s32</b></b> (int32_t * ptr, int32x4x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
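+ <h4>Example</h4> <p class="aml">A minimal C usage sketch, not part of the ARM reference: the q-form operates on 128-bit registers, so one call writes four 3-element structures (12 int32_t values). The helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store four {x, y, z} triples; dst must hold 12 int32_t values. */
+void store_xyz_q(int32_t *dst, int32x4_t x, int32x4_t y, int32x4_t z)
+{
+    int32x4x3_t v = { { x, y, z } };  /* val[0]=x, val[1]=y, val[2]=z */
+    vst3q_s32(dst, v);                /* memory: x0,y0,z0,x1,y1,z1,... */
+}</pre>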
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_u8" type="checkbox"><label for="vst3_u8"><div>void <b><b>vst3_u8</b></b> (uint8_t * ptr, uint8x8x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
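+ <h4>Example</h4> <p class="aml">A minimal C usage sketch, not part of the ARM reference: a common use is converting planar R/G/B data to interleaved RGB bytes, eight pixels per call. The function name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Pack 8 pixels from planar r/g/b buffers into 24 interleaved RGB bytes. */
+void rgb_pack8(uint8_t *rgb, const uint8_t *r, const uint8_t *g,
+               const uint8_t *b)
+{
+    uint8x8x3_t px;
+    px.val[0] = vld1_u8(r);
+    px.val[1] = vld1_u8(g);
+    px.val[2] = vld1_u8(b);
+    vst3_u8(rgb, px);   /* memory: r0,g0,b0,r1,g1,b1,... */
+}</pre>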
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_u8" type="checkbox"><label for="vst3q_u8"><div>void <b><b>vst3q_u8</b></b> (uint8_t * ptr, uint8x16x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_u16" type="checkbox"><label for="vst3_u16"><div>void <b><b>vst3_u16</b></b> (uint16_t * ptr, uint16x4x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
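+ <h4>Usage example (sketch)</h4> <p class="aml">A minimal usage sketch for the intrinsic documented above (vst3_u16, inferred from the surrounding sequence), not taken from the Arm reference data; it assumes an AArch64 toolchain providing &lt;arm_neon.h&gt;, and the buffer name and contents are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Three 4-lane vectors; the store interleaves them in memory
+       as a0,b0,c0,a1,b1,c1,... (four 3-element structures). */
+    uint16x4x3_t v;
+    v.val[0] = vdup_n_u16(1);
+    v.val[1] = vdup_n_u16(2);
+    v.val[2] = vdup_n_u16(3);
+
+    uint16_t out[12];
+    vst3_u16(out, v);
+
+    for (int i = 0; i &lt; 12; i++)
+        printf("%u ", (unsigned)out[i]);   /* 1 2 3 1 2 3 1 2 3 1 2 3 */
+    printf("\n");
+    return 0;
+}
+</pre>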
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_u16" type="checkbox"><label for="vst3q_u16"><div>void <b><b>vst3q_u16</b></b> (uint16_t * ptr, uint16x8x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
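+ <h4>Usage example (sketch)</h4> <p class="aml">A minimal sketch, not from the Arm reference data: a deinterleave/re-interleave round trip showing that vst3q_u16 is the inverse of vld3q_u16; it assumes an AArch64 toolchain with &lt;arm_neon.h&gt;, and the buffer names and sizes are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;string.h&gt;
+#include &lt;assert.h&gt;
+
+int main(void) {
+    uint16_t src[24], dst[24];              /* 8 structures x 3 members */
+    for (int i = 0; i &lt; 24; i++) src[i] = (uint16_t)i;
+
+    /* vld3q_u16 deinterleaves into three 8-lane vectors;
+       vst3q_u16 re-interleaves them, so the copy is lossless. */
+    uint16x8x3_t v = vld3q_u16(src);
+    vst3q_u16(dst, v);
+
+    assert(memcmp(src, dst, sizeof src) == 0);
+    return 0;
+}
+</pre>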
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_u32" type="checkbox"><label for="vst3_u32"><div>void <b><b>vst3_u32</b></b> (uint32_t * ptr, uint32x2x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
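+ <h4>Usage example (sketch)</h4> <p class="aml">A minimal sketch, not from the Arm reference data: interleaving three planar uint32_t arrays into an array of 3-element structures with vst3_u32, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; the function name and the even-length requirement are illustrative choices.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Interleave planar x/y/z arrays into {x,y,z} triples.
+   Each vst3_u32 call emits two structures, so n is assumed even. */
+void interleave_u32(uint32_t *out, const uint32_t *x,
+                    const uint32_t *y, const uint32_t *z, int n) {
+    for (int i = 0; i &lt; n; i += 2) {
+        uint32x2x3_t v;
+        v.val[0] = vld1_u32(x + i);
+        v.val[1] = vld1_u32(y + i);
+        v.val[2] = vld1_u32(z + i);
+        vst3_u32(out + 3 * i, v);   /* writes x0,y0,z0,x1,y1,z1 */
+    }
+}
+</pre>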
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_u32" type="checkbox"><label for="vst3q_u32"><div>void <b><b>vst3q_u32</b></b> (uint32_t * ptr, uint32x4x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
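+ <h4>Usage example (sketch)</h4> <p class="aml">A minimal sketch, not from the Arm reference data: the q-form writes four structures (48 bytes) per call, so the destination pointer advances by 12 uint32_t elements each iteration; it assumes an AArch64 toolchain with &lt;arm_neon.h&gt;, and all names are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Repeatedly store the same three 4-lane vectors as interleaved
+   triples; one vst3q_u32 call writes 4 structures = 12 elements. */
+void fill_triples(uint32_t *out, uint32x4_t a, uint32x4_t b,
+                  uint32x4_t c, int calls) {
+    uint32x4x3_t v = { { a, b, c } };
+    for (int i = 0; i &lt; calls; i++) {
+        vst3q_u32(out, v);
+        out += 12;              /* advance past 4 structures x 3 members */
+    }
+}
+</pre>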
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_f16" type="checkbox"><label for="vst3_f16"><div>void <b><b>vst3_f16</b></b> (float16_t * ptr, float16x4x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
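+ <h4>Usage example (sketch)</h4> <p class="aml">A minimal sketch, not from the Arm reference data: a deinterleave/re-interleave round trip using only storage operations; float16_t availability is toolchain-dependent (assumed here: an AArch64 compiler with FP16 support, for example -march=armv8.2-a+fp16).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* vld3_f16 splits 4 half-precision triples into three 4-lane vectors;
+   vst3_f16 interleaves them back, so dst ends up equal to src. */
+void roundtrip_f16(const float16_t *src, float16_t *dst) {
+    float16x4x3_t v = vld3_f16(src);   /* reads 12 halfwords */
+    vst3_f16(dst, v);                  /* writes 12 halfwords */
+}
+</pre>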
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_f16" type="checkbox"><label for="vst3q_f16"><div>void <b><b>vst3q_f16</b></b> (float16_t * ptr, float16x8x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
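+ <h4>Usage example (sketch)</h4> <p class="aml">A minimal sketch, not from the Arm reference data: copying interleaved half-precision triples eight structures at a time; remainder handling is omitted, and FP16 toolchain support is assumed as in the previous sketch.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Each vld3q_f16/vst3q_f16 pair moves 8 structures x 3 members
+   = 24 halfwords (48 bytes); leftover triples need scalar code. */
+void copy_triples_f16(const float16_t *src, float16_t *dst, int triples) {
+    for (int i = 0; i + 8 &lt;= triples; i += 8) {
+        float16x8x3_t v = vld3q_f16(src + 3 * i);
+        vst3q_f16(dst + 3 * i, v);
+    }
+}
+</pre>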
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_f32" type="checkbox"><label for="vst3_f32"><div>void <b><b>vst3_f32</b></b> (float32_t * ptr, float32x2x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
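+ <h4>Usage example (sketch)</h4> <p class="aml">A minimal sketch, not from the Arm reference data: converting structure-of-arrays vertex data to interleaved {x,y,z} triples two vertices per call, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; the names are illustrative and n is assumed even.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* SoA to AoS: load two lanes from each component array and let
+   vst3_f32 interleave them as x0,y0,z0,x1,y1,z1 in memory. */
+void soa_to_aos(float *out, const float *x, const float *y,
+                const float *z, int n) {
+    for (int i = 0; i &lt; n; i += 2) {
+        float32x2x3_t v;
+        v.val[0] = vld1_f32(x + i);
+        v.val[1] = vld1_f32(y + i);
+        v.val[2] = vld1_f32(z + i);
+        vst3_f32(out + 3 * i, v);
+    }
+}
+</pre>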
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_f32" type="checkbox"><label for="vst3q_f32"><div>void <b><b>vst3q_f32</b></b> (float32_t * ptr, float32x4x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_p8" type="checkbox"><label for="vst3_p8"><div>void <b><b>vst3_p8</b></b> (poly8_t * ptr, poly8x8x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_p8" type="checkbox"><label for="vst3q_p8"><div>void <b><b>vst3q_p8</b></b> (poly8_t * ptr, poly8x16x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_p16" type="checkbox"><label for="vst3_p16"><div>void <b><b>vst3_p16</b></b> (poly16_t * ptr, poly16x4x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_p16" type="checkbox"><label for="vst3q_p16"><div>void <b><b>vst3q_p16</b></b> (poly16_t * ptr, poly16x8x3_t val)<span class="right">Store multiple 3-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 3-element structures from three registers. This instruction stores multiple 3-element structures to memory from three SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_s64" type="checkbox"><label for="vst3_s64"><div>void <b><b>vst3_s64</b></b> (int64_t * ptr, int64x1x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_u64" type="checkbox"><label for="vst3_u64"><div>void <b><b>vst3_u64</b></b> (uint64_t * ptr, uint64x1x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_p64" type="checkbox"><label for="vst3_p64"><div>void <b>vst3_p64</b> (poly64_t * ptr, poly64x1x3_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_s64" type="checkbox"><label for="vst3q_s64"><div>void <b>vst3q_s64</b> (int64_t * ptr, int64x2x3_t val)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3q_u64" type="checkbox"><label for="vst3q_u64"><div>void <b>vst3q_u64</b> (uint64_t * ptr, uint64x2x3_t val)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3q_p64" type="checkbox"><label for="vst3q_p64"><div>void <b>vst3q_p64</b> (poly64_t * ptr, poly64x2x3_t val)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3_f64" type="checkbox"><label for="vst3_f64"><div>void <b>vst3_f64</b> (float64_t * ptr, float64x1x3_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3q_f64" type="checkbox"><label for="vst3q_f64"><div>void <b>vst3q_f64</b> (float64_t * ptr, float64x2x3_t val)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4_s8" type="checkbox"><label for="vst4_s8"><div>void <b>vst4_s8</b> (int8_t * ptr, int8x8x4_t val)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8B <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_s8" type="checkbox"><label for="vst4q_s8"><div>void <b><b>vst4q_s8</b></b> (int8_t * ptr, int8x16x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_s16" type="checkbox"><label for="vst4_s16"><div>void <b><b>vst4_s16</b></b> (int16_t * ptr, int16x4x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_s16" type="checkbox"><label for="vst4q_s16"><div>void <b><b>vst4q_s16</b></b> (int16_t * ptr, int16x8x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_s32" type="checkbox"><label for="vst4_s32"><div>void <b><b>vst4_s32</b></b> (int32_t * ptr, int32x2x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_s32" type="checkbox"><label for="vst4q_s32"><div>void <b><b>vst4q_s32</b></b> (int32_t * ptr, int32x4x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_u8" type="checkbox"><label for="vst4_u8"><div>void <b><b>vst4_u8</b></b> (uint8_t * ptr, uint8x8x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8B <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
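+ <h4>Example</h4> <p>An editorial C sketch, not part of the ARM reference: assuming the intrinsic documented above is the 64-bit four-register form taking a uint8x8x4_t (vst4_u8), this shows the interleaving effect. The helper name and buffer contract are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Interleave four 8-byte planes into one 32-byte buffer;
+   dst must have room for 32 bytes. */
+void store4_u8(uint8_t *dst, uint8x8_t a, uint8x8_t b,
+               uint8x8_t c, uint8x8_t d) {
+    uint8x8x4_t v = { { a, b, c, d } };
+    vst4_u8(dst, v);  /* a[i], b[i], c[i], d[i] become adjacent */
+}</pre>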
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_u8" type="checkbox"><label for="vst4q_u8"><div>void <b><b>vst4q_u8</b></b> (uint8_t * ptr, uint8x16x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
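+ <h4>Example</h4> <p>An editorial C sketch, not part of the ARM reference: packing planar R, G, B and A bytes into interleaved RGBA with vst4q_u8. The helper name pack_rgba16 and the planar-buffer layout are illustrative assumptions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Pack 16 pixels from planar R,G,B,A arrays into interleaved RGBA.
+   out must have room for 64 bytes. */
+void pack_rgba16(uint8_t *out, const uint8_t *r, const uint8_t *g,
+                 const uint8_t *b, const uint8_t *a) {
+    uint8x16x4_t px;
+    px.val[0] = vld1q_u8(r);
+    px.val[1] = vld1q_u8(g);
+    px.val[2] = vld1q_u8(b);
+    px.val[3] = vld1q_u8(a);
+    vst4q_u8(out, px);  /* memory: R0 G0 B0 A0 R1 G1 B1 A1 ... */
+}</pre>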
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_u16" type="checkbox"><label for="vst4_u16"><div>void <b><b>vst4_u16</b></b> (uint16_t * ptr, uint16x4x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
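+ <h4>Example</h4> <p>An editorial C sketch, not part of the ARM reference: vst4_u16 paired with its load counterpart vld4_u16 for an in-place deinterleave, modify, reinterleave round trip. The helper name and the bias operation are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Add a bias to the first channel of four interleaved u16 structures
+   in place; buf holds 16 uint16_t values (4 structures x 4 elements). */
+void bias_channel0(uint16_t *buf, uint16_t bias) {
+    uint16x4x4_t v = vld4_u16(buf);            /* deinterleave */
+    v.val[0] = vadd_u16(v.val[0], vdup_n_u16(bias));
+    vst4_u16(buf, v);                          /* reinterleave */
+}</pre>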
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_u16" type="checkbox"><label for="vst4q_u16"><div>void <b><b>vst4q_u16</b></b> (uint16_t * ptr, uint16x8x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
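+ <h4>Example</h4> <p>An editorial C sketch, not part of the ARM reference: driving vst4q_u16 in a loop over whole buffers. The function name, the requirement that n be a multiple of 8, and the non-overlap contract are illustrative assumptions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Interleave four planar u16 buffers of length n (n a multiple of 8)
+   into dst, which must hold 4*n elements and not overlap the inputs. */
+void interleave_u16(uint16_t *dst, const uint16_t *p0, const uint16_t *p1,
+                    const uint16_t *p2, const uint16_t *p3, int n) {
+    for (int i = 0; i &lt; n; i += 8) {
+        uint16x8x4_t v;
+        v.val[0] = vld1q_u16(p0 + i);
+        v.val[1] = vld1q_u16(p1 + i);
+        v.val[2] = vld1q_u16(p2 + i);
+        v.val[3] = vld1q_u16(p3 + i);
+        vst4q_u16(dst + 4 * i, v);  /* 8 structures = 32 elements */
+    }
+}</pre>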
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_u32" type="checkbox"><label for="vst4_u32"><div>void <b><b>vst4_u32</b></b> (uint32_t * ptr, uint32x2x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
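+ <h4>Example</h4> <p>An editorial C sketch, not part of the ARM reference: using vst4_u32 to write two identical 4-element structures built from scalars, which makes the interleaved memory layout easy to see. The helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store two 4-element u32 structures; dst must hold 8 uint32_t values. */
+void store2_quads(uint32_t *dst, uint32_t x, uint32_t y,
+                  uint32_t z, uint32_t w) {
+    uint32x2x4_t v;
+    v.val[0] = vdup_n_u32(x);
+    v.val[1] = vdup_n_u32(y);
+    v.val[2] = vdup_n_u32(z);
+    v.val[3] = vdup_n_u32(w);
+    vst4_u32(dst, v);  /* memory: x y z w x y z w */
+}</pre>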
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_u32" type="checkbox"><label for="vst4q_u32"><div>void <b><b>vst4q_u32</b></b> (uint32_t * ptr, uint32x4x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
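+ <h4>Example</h4> <p>An editorial C sketch, not part of the ARM reference: because vst4q_u32 writes element i of all four registers before element i+1, storing four row vectors through it yields a 4x4 transpose. The function name and buffer contract are illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Read a row-major 4x4 u32 matrix and write its transpose;
+   src and dst each hold 16 values and must not overlap. */
+void transpose4x4_u32(uint32_t *dst, const uint32_t *src) {
+    uint32x4x4_t m;
+    m.val[0] = vld1q_u32(src);
+    m.val[1] = vld1q_u32(src + 4);
+    m.val[2] = vld1q_u32(src + 8);
+    m.val[3] = vld1q_u32(src + 12);
+    vst4q_u32(dst, m);  /* column 0, then column 1, ... */
+}</pre>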
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_f16" type="checkbox"><label for="vst4_f16"><div>void <b><b>vst4_f16</b></b> (float16_t * ptr, float16x4x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
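+<h4>Usage example</h4> <p>A minimal, non-normative C sketch, assuming the usual arm_neon.h prototype void vst4_f16(float16_t *, float16x4x4_t) and a toolchain with FP16 storage support; the helper name and buffers are illustrative:</p>
+<pre class="codeblock">#include <arm_neon.h>
+
+// Interleave four half-precision planes (4 lanes each) into dst:
+// dst = {a0,b0,c0,d0, a1,b1,c1,d1, ...} (16 float16_t values).
+void interleave4_f16(float16_t *dst, const float16_t *a, const float16_t *b,
+                     const float16_t *c, const float16_t *d) {
+    float16x4x4_t v;
+    v.val[0] = vld1_f16(a);
+    v.val[1] = vld1_f16(b);
+    v.val[2] = vld1_f16(c);
+    v.val[3] = vld1_f16(d);
+    vst4_f16(dst, v);  // four 4-element structures, stored interleaved
+}</pre>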
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><input id="vst4q_f16" type="checkbox"><label for="vst4q_f16"><div>void <b>vst4q_f16</b> (float16_t * ptr, float16x8x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback && n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
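+<h4>Usage example</h4> <p>A short, non-normative C sketch of the intrinsic defined above (the function name and the vld4q_f16 round trip are illustrative):</p>
+<pre class="codeblock">#include <arm_neon.h>
+
+// Copy 32 interleaved float16 values (8 structures of 4 elements):
+// vld4q_f16 deinterleaves into v.val[0..3], vst4q_f16 re-interleaves.
+void copy_structs_f16(float16_t *dst, const float16_t *src) {
+    float16x8x4_t v = vld4q_f16(src);
+    vst4q_f16(dst, v);
+}</pre>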
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><input id="vst4_f32" type="checkbox"><label for="vst4_f32"><div>void <b>vst4_f32</b> (float32_t * ptr, float32x2x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation: identical to the pseudocode shown for vst4q_f16 above.</pre>
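+<h4>Usage example</h4> <p>A non-normative C sketch; the point layout and helper name are assumptions made for illustration:</p>
+<pre class="codeblock">#include <arm_neon.h>
+
+// Pack two {x,y,z,w} points from separate component arrays into an
+// array-of-structures layout: dst = {x0,y0,z0,w0, x1,y1,z1,w1}.
+void pack_points2(float *dst, const float x[2], const float y[2],
+                  const float z[2], const float w[2]) {
+    float32x2x4_t p;
+    p.val[0] = vld1_f32(x);
+    p.val[1] = vld1_f32(y);
+    p.val[2] = vld1_f32(z);
+    p.val[3] = vld1_f32(w);
+    vst4_f32(dst, p);  // stores the two structures interleaved
+}</pre>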
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><input id="vst4q_f32" type="checkbox"><label for="vst4q_f32"><div>void <b>vst4q_f32</b> (float32_t * ptr, float32x4x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation: identical to the pseudocode shown for vst4q_f16 above.</pre>
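+<h4>Usage example</h4> <p>A non-normative C sketch; planar RGBA input and the requirement that n be a multiple of 4 are assumptions of this example, not of the intrinsic:</p>
+<pre class="codeblock">#include <arm_neon.h>
+
+// Convert planar RGBA (four separate float channels) into interleaved
+// RGBA, four pixels per iteration.
+void planar_to_rgba(float *out, const float *r, const float *g,
+                    const float *b, const float *a, int n) {
+    for (int i = 0; i < n; i += 4) {
+        float32x4x4_t px;
+        px.val[0] = vld1q_f32(r + i);
+        px.val[1] = vld1q_f32(g + i);
+        px.val[2] = vld1q_f32(b + i);
+        px.val[3] = vld1q_f32(a + i);
+        vst4q_f32(out + 4 * i, px);  // r,g,b,a written interleaved
+    }
+}</pre>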
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><input id="vst4_p8" type="checkbox"><label for="vst4_p8"><div>void <b>vst4_p8</b> (poly8_t * ptr, poly8x8x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8B <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation: identical to the pseudocode shown for vst4q_f16 above.</pre>
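+<h4>Usage example</h4> <p>A non-normative C sketch pairing this store with its vld4_p8 counterpart; the helper name is illustrative:</p>
+<pre class="codeblock">#include <arm_neon.h>
+
+// Round-trip 32 poly8 values: vld4_p8 splits the four structure
+// members into separate vectors, vst4_p8 interleaves them again.
+void roundtrip_p8(poly8_t *dst, const poly8_t *src) {
+    poly8x8x4_t v = vld4_p8(src);
+    vst4_p8(dst, v);
+}</pre>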
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><input id="vst4q_p8" type="checkbox"><label for="vst4q_p8"><div>void <b>vst4q_p8</b> (poly8_t * ptr, poly8x16x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation: identical to the pseudocode shown for vst4q_f16 above.</pre>
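+<h4>Usage example</h4> <p>A non-normative C sketch; the member swap is an arbitrary transformation chosen to show the val[] accessors:</p>
+<pre class="codeblock">#include <arm_neon.h>
+
+// Load 64 interleaved poly8 values, swap structure members 0 and 3,
+// and store them back interleaved in place.
+void swap_members_p8(poly8_t *buf) {
+    poly8x16x4_t v = vld4q_p8(buf);
+    poly8x16_t tmp = v.val[0];
+    v.val[0] = v.val[3];
+    v.val[3] = tmp;
+    vst4q_p8(buf, v);
+}</pre>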
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><input id="vst4_p16" type="checkbox"><label for="vst4_p16"><div>void <b>vst4_p16</b> (poly16_t * ptr, poly16x4x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">// Operation: identical to the pseudocode shown for vst4q_f16 above.</pre>
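+<h4>Usage example</h4> <p>A non-normative C sketch; filling a buffer with one repeated structure is just one way to build the poly16x4x4_t argument:</p>
+<pre class="codeblock">#include <arm_neon.h>
+
+// Write four copies of the structure {m0,m1,m2,m3} to dst:
+// dst = {m0,m1,m2,m3, m0,m1,m2,m3, ...} (16 poly16_t values).
+void fill_struct_p16(poly16_t *dst, poly16_t m0, poly16_t m1,
+                     poly16_t m2, poly16_t m3) {
+    poly16x4x4_t v;
+    v.val[0] = vdup_n_p16(m0);
+    v.val[1] = vdup_n_p16(m1);
+    v.val[2] = vdup_n_p16(m2);
+    v.val[3] = vdup_n_p16(m3);
+    vst4_p16(dst, v);
+}</pre>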
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div>
+<div class="intrinsic"><input id="vst4q_p16" type="checkbox"><label for="vst4q_p16"><div>void <b>vst4q_p16</b> (poly16_t * ptr, poly16x8x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback && n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
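+ <h4>Example</h4> <p>A short C usage sketch, added editorially and not part of the ARM reference; the wrapper name and buffer size are illustrative.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: interleave four poly16x8_t vectors into a
+   caller-provided buffer of 4 * 8 = 32 poly16_t elements. */
+void store_interleaved_p16(poly16_t out[32], poly16x8x4_t v) {
+    /* After the call, out[4*i + j] holds lane i of v.val[j]. */
+    vst4q_p16(out, v);
+}</pre>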
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_s64" type="checkbox"><label for="vst4_s64"><div>void <b>vst4_s64</b> (int64_t * ptr, int64x1x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
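+ <h4>Example</h4> <p>A short C usage sketch (editorial addition; the wrapper name and buffer are illustrative, not part of the ARM reference).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: store four one-lane int64x1_t vectors
+   consecutively into a buffer of 4 int64_t elements. */
+void store_s64_x4(int64_t out[4], int64x1x4_t v) {
+    /* After the call, out[j] holds the single element of v.val[j]. */
+    vst4_s64(out, v);
+}</pre>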
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_u64" type="checkbox"><label for="vst4_u64"><div>void <b>vst4_u64</b> (uint64_t * ptr, uint64x1x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
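+ <h4>Example</h4> <p>An analogous unsigned C sketch (editorial addition; names and sizes are illustrative).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: store four one-lane uint64x1_t vectors
+   consecutively into a buffer of 4 uint64_t elements. */
+void store_u64_x4(uint64_t out[4], uint64x1x4_t v) {
+    /* After the call, out[j] holds the single element of v.val[j]. */
+    vst4_u64(out, v);
+}</pre>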
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_p64" type="checkbox"><label for="vst4_p64"><div>void <b>vst4_p64</b> (poly64_t * ptr, poly64x1x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
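+ <h4>Example</h4> <p>A short C usage sketch (editorial addition). The wrapper name is illustrative, and the note about toolchain requirements for poly64 types is an assumption, not part of the ARM reference.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper; poly64 types typically require building for a
+   target with the crypto/AES extension (an assumption here). */
+void store_p64_x4(poly64_t out[4], poly64x1x4_t v) {
+    /* out[j] receives the single element of v.val[j]. */
+    vst4_p64(out, v);
+}</pre>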
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_s64" type="checkbox"><label for="vst4q_s64"><div>void <b>vst4q_s64</b> (int64_t * ptr, int64x2x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
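+ <h4>Example</h4> <p>A short C usage sketch (editorial addition; names and sizes are illustrative). This intrinsic is A64-only, so the sketch assumes an AArch64 target.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: interleave four two-lane int64x2_t vectors
+   into a buffer of 4 * 2 = 8 int64_t elements. */
+void store_interleaved_s64(int64_t out[8], int64x2x4_t v) {
+    /* After the call, out[4*i + j] holds lane i of v.val[j]. */
+    vst4q_s64(out, v);
+}</pre>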
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_u64" type="checkbox"><label for="vst4q_u64"><div>void <b>vst4q_u64</b> (uint64_t * ptr, uint64x2x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
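+ <h4>Example</h4> <p>The unsigned counterpart, sketched editorially under the same assumptions (illustrative names, AArch64 target).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: interleave four two-lane uint64x2_t vectors
+   into a buffer of 8 uint64_t elements. */
+void store_interleaved_u64(uint64_t out[8], uint64x2x4_t v) {
+    /* After the call, out[4*i + j] holds lane i of v.val[j]. */
+    vst4q_u64(out, v);
+}</pre>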
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_p64" type="checkbox"><label for="vst4q_p64"><div>void <b>vst4q_p64</b> (poly64_t * ptr, poly64x2x4_t val)<span class="right">Store multiple 4-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple 4-element structures from four registers. This instruction stores multiple 4-element structures to memory from four SIMD&amp;FP registers, with interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4_f64" type="checkbox"><label for="vst4_f64"><div>void <b><b>vst4_f64</b></b> (float64_t * ptr, float64x1x4_t val)<span class="right">Store a single-element structure from one lane of one register</span></div></label><article> <h4>Description</h4><p><p class="aml">Store a single-element structure from one lane of one register. This instruction stores the specified element of a SIMD&amp;FP register to memory.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
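+ <h4>Usage Example</h4> <p>A minimal C sketch, added for illustration and not part of the ARM reference; the function and buffer names are assumptions. It stores one element from each of the four 1D input registers to four consecutive doubles.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void example_vst4_f64(float64_t *out) { /* out must hold 4 doubles */
+    float64x1x4_t v;
+    v.val[0] = vdup_n_f64(1.0);
+    v.val[1] = vdup_n_f64(2.0);
+    v.val[2] = vdup_n_f64(3.0);
+    v.val[3] = vdup_n_f64(4.0);
+    /* one element from each register, stored in register order */
+    vst4_f64(out, v); /* out = {1.0, 2.0, 3.0, 4.0} */
+}
+</pre>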
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_f64" type="checkbox"><label for="vst4q_f64"><div>void <b><b>vst4q_f64</b></b> (float64_t * ptr, float64x2x4_t val)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
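+ <h4>Usage Example</h4> <p>A minimal C sketch (illustrative only, not part of the ARM reference; names are assumptions). Both lanes of each 2D register are stored, interleaved element-by-element across the four registers.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void example_vst4q_f64(float64_t *out) { /* out must hold 8 doubles */
+    float64x2x4_t v;
+    v.val[0] = vdupq_n_f64(1.0);
+    v.val[1] = vdupq_n_f64(2.0);
+    v.val[2] = vdupq_n_f64(3.0);
+    v.val[3] = vdupq_n_f64(4.0);
+    /* lane 0 of all four registers, then lane 1 of all four */
+    vst4q_f64(out, v); /* out = {1,2,3,4, 1,2,3,4} */
+}
+</pre>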
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_s16" type="checkbox"><label for="vld2_lane_s16"><div>int16x4x2_t <b><b>vld2_lane_s16</b></b> (int16_t const * ptr, int16x4x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
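+ <h4>Usage Example</h4> <p>A minimal C sketch (illustrative only, not part of the ARM reference; names and values are assumptions). Two consecutive int16 values are loaded into lane 1 of the two result registers; all other lanes keep the values passed in via src.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int16x4x2_t example_vld2_lane_s16(const int16_t *pair) { /* pair holds 2 values */
+    int16x4x2_t v;
+    v.val[0] = vdup_n_s16(0);
+    v.val[1] = vdup_n_s16(0);
+    /* pair[0] goes to lane 1 of val[0], pair[1] to lane 1 of val[1];
+       lane must be a compile-time constant in 0..3 */
+    return vld2_lane_s16(pair, v, 1);
+}
+</pre>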
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_s16" type="checkbox"><label for="vld2q_lane_s16"><div>int16x8x2_t <b><b>vld2q_lane_s16</b></b> (int16_t const * ptr, int16x8x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
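+ <h4>Usage Example</h4> <p>A minimal C sketch (illustrative only, not part of the ARM reference; names are assumptions), the 128-bit counterpart of the previous example: the same 2-element load targets one lane of two 8-lane registers.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int16x8x2_t example_vld2q_lane_s16(const int16_t *pair) { /* pair holds 2 values */
+    int16x8x2_t v;
+    v.val[0] = vdupq_n_s16(0);
+    v.val[1] = vdupq_n_s16(0);
+    /* pair[0] goes to lane 4 of val[0], pair[1] to lane 4 of val[1];
+       lane must be a compile-time constant in 0..7 */
+    return vld2q_lane_s16(pair, v, 4);
+}
+</pre>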
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_s32" type="checkbox"><label for="vld2_lane_s32"><div>int32x2x2_t <b><b>vld2_lane_s32</b></b> (int32_t const * ptr, int32x2x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
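+ <h4>Usage Example</h4> <p>A minimal C sketch (illustrative only, not part of the ARM reference; names are assumptions) for the 32-bit variant: a 2-element structure lands in one lane of two 2-lane registers.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x2x2_t example_vld2_lane_s32(const int32_t *pair) { /* pair holds 2 values */
+    int32x2x2_t v;
+    v.val[0] = vdup_n_s32(0);
+    v.val[1] = vdup_n_s32(0);
+    /* pair[0] goes to lane 1 of val[0], pair[1] to lane 1 of val[1];
+       lane must be a compile-time constant in 0..1 */
+    return vld2_lane_s32(pair, v, 1);
+}
+</pre>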
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_s32" type="checkbox"><label for="vld2q_lane_s32"><div>int32x4x2_t <b><b>vld2q_lane_s32</b></b> (int32_t const * ptr, int32x4x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
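+ <h4>Usage Example</h4> <p>A minimal C sketch (illustrative only, not part of the ARM reference; names are assumptions), the 128-bit counterpart: the 2-element load targets one lane of two 4-lane registers.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int32x4x2_t example_vld2q_lane_s32(const int32_t *pair) { /* pair holds 2 values */
+    int32x4x2_t v;
+    v.val[0] = vdupq_n_s32(0);
+    v.val[1] = vdupq_n_s32(0);
+    /* pair[0] goes to lane 2 of val[0], pair[1] to lane 2 of val[1];
+       lane must be a compile-time constant in 0..3 */
+    return vld2q_lane_s32(pair, v, 2);
+}
+</pre>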
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_u16" type="checkbox"><label for="vld2_lane_u16"><div>uint16x4x2_t <b>vld2_lane_u16</b> (uint16_t const * ptr, uint16x4x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_u16" type="checkbox"><label for="vld2q_lane_u16"><div>uint16x8x2_t <b>vld2q_lane_u16</b> (uint16_t const * ptr, uint16x8x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_u32" type="checkbox"><label for="vld2_lane_u32"><div>uint32x2x2_t <b>vld2_lane_u32</b> (uint32_t const * ptr, uint32x2x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_u32" type="checkbox"><label for="vld2q_lane_u32"><div>uint32x4x2_t <b>vld2q_lane_u32</b> (uint32_t const * ptr, uint32x4x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_f16" type="checkbox"><label for="vld2_lane_f16"><div>float16x4x2_t <b>vld2_lane_f16</b> (float16_t const * ptr, float16x4x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_f16" type="checkbox"><label for="vld2q_lane_f16"><div>float16x8x2_t <b>vld2q_lane_f16</b> (float16_t const * ptr, float16x8x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_f32" type="checkbox"><label for="vld2_lane_f32"><div>float32x2x2_t <b>vld2_lane_f32</b> (float32_t const * ptr, float32x2x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_f32" type="checkbox"><label for="vld2q_lane_f32"><div>float32x4x2_t <b>vld2q_lane_f32</b> (float32_t const * ptr, float32x4x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_p16" type="checkbox"><label for="vld2_lane_p16"><div>poly16x4x2_t <b>vld2_lane_p16</b> (poly16_t const * ptr, poly16x4x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_p16" type="checkbox"><label for="vld2q_lane_p16"><div>poly16x8x2_t <b>vld2q_lane_p16</b> (poly16_t const * ptr, poly16x8x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_s8" type="checkbox"><label for="vld2_lane_s8"><div>int8x8x2_t <b>vld2_lane_s8</b> (int8_t const * ptr, int8x8x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_u8" type="checkbox"><label for="vld2_lane_u8"><div>uint8x8x2_t <b>vld2_lane_u8</b> (uint8_t const * ptr, uint8x8x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_p8" type="checkbox"><label for="vld2_lane_p8"><div>poly8x8x2_t <b>vld2_lane_p8</b> (poly8_t const * ptr, poly8x8x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_s8" type="checkbox"><label for="vld2q_lane_s8"><div>int8x16x2_t <b><b>vld2q_lane_s8</b></b> (int8_t const * ptr, int8x16x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
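+<h4>Example</h4> <p>A minimal C usage sketch (illustrative only, not part of the ARM reference; the function name and lane value are hypothetical). The lane argument must be a compile-time constant in [0, 15], and the intrinsic requires an AArch64 target.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* ptr[0] replaces lane 3 of src.val[0] and ptr[1] replaces lane 3 of
+   src.val[1]; all other lanes of both vectors are preserved. */
+int8x16x2_t demo_vld2q_lane_s8(const int8_t *ptr, int8x16x2_t src) {
+    return vld2q_lane_s8(ptr, src, 3);
+}</pre>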
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_u8" type="checkbox"><label for="vld2q_lane_u8"><div>uint8x16x2_t <b><b>vld2q_lane_u8</b></b> (uint8_t const * ptr, uint8x16x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
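+<h4>Example</h4> <p>A minimal C sketch for the unsigned variant (hypothetical names; not from the ARM reference):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load the interleaved pair ptr[0], ptr[1] into lane 7 of the two
+   vectors; the lane must be a constant in [0, 15]. */
+uint8x16x2_t demo_vld2q_lane_u8(const uint8_t *ptr, uint8x16x2_t src) {
+    return vld2q_lane_u8(ptr, src, 7);
+}</pre>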
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_p8" type="checkbox"><label for="vld2q_lane_p8"><div>poly8x16x2_t <b><b>vld2q_lane_p8</b></b> (poly8_t const * ptr, poly8x16x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
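+<h4>Example</h4> <p>A minimal C sketch for the polynomial variant (hypothetical names; not from the ARM reference). The pattern is the same as for the s8/u8 forms:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Overwrite lane 0 of both poly8 vectors with ptr[0] and ptr[1];
+   the lane must be a constant in [0, 15]. */
+poly8x16x2_t demo_vld2q_lane_p8(const poly8_t *ptr, poly8x16x2_t src) {
+    return vld2q_lane_p8(ptr, src, 0);
+}</pre>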
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_s64" type="checkbox"><label for="vld2_lane_s64"><div>int64x1x2_t <b><b>vld2_lane_s64</b></b> (int64_t const * ptr, int64x1x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
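+<h4>Example</h4> <p>A minimal C sketch (hypothetical names; not from the ARM reference). With one 64-bit lane per vector, 0 is the only valid lane value:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* For the 64x1 form the call simply loads ptr[0] into val[0] and
+   ptr[1] into val[1]; lane must be the constant 0. */
+int64x1x2_t demo_vld2_lane_s64(const int64_t *ptr, int64x1x2_t src) {
+    return vld2_lane_s64(ptr, src, 0);
+}</pre>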
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_s64" type="checkbox"><label for="vld2q_lane_s64"><div>int64x2x2_t <b><b>vld2q_lane_s64</b></b> (int64_t const * ptr, int64x2x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
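+<h4>Example</h4> <p>A minimal C sketch (hypothetical names; not from the ARM reference). The 128-bit form has two 64-bit lanes, so the constant lane may be 0 or 1:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* ptr[0] replaces lane 1 of src.val[0] and ptr[1] replaces lane 1
+   of src.val[1]; lane 0 of both vectors is preserved. */
+int64x2x2_t demo_vld2q_lane_s64(const int64_t *ptr, int64x2x2_t src) {
+    return vld2q_lane_s64(ptr, src, 1);
+}</pre>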
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_u64" type="checkbox"><label for="vld2_lane_u64"><div>uint64x1x2_t <b><b>vld2_lane_u64</b></b> (uint64_t const * ptr, uint64x1x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
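+<h4>Example</h4> <p>A minimal C sketch for the unsigned 64x1 form (hypothetical names; not from the ARM reference); as with vld2_lane_s64, only lane 0 is valid:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Loads ptr[0] and ptr[1] into the single lane of each vector. */
+uint64x1x2_t demo_vld2_lane_u64(const uint64_t *ptr, uint64x1x2_t src) {
+    return vld2_lane_u64(ptr, src, 0);
+}</pre>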
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_u64" type="checkbox"><label for="vld2q_lane_u64"><div>uint64x2x2_t <b><b>vld2q_lane_u64</b></b> (uint64_t const * ptr, uint64x2x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
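+<h4>Example</h4><p>For illustration only (not part of the ARM reference): a minimal C sketch, assuming &lt;arm_neon.h&gt; on an AArch64 target; the function and variable names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load the pair buf[0], buf[1] into lane 1 of acc.val[0] and acc.val[1]
+   respectively; all other lanes keep their previous contents.
+   The lane argument must be a compile-time constant in [0, 1]. */
+uint64x2x2_t load_u64_pair_lane1(const uint64_t *buf, uint64x2x2_t acc) {
+    return vld2q_lane_u64(buf, acc, 1);
+}</pre>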
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_p64" type="checkbox"><label for="vld2_lane_p64"><div>poly64x1x2_t <b><b>vld2_lane_p64</b></b> (poly64_t const * ptr, poly64x1x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
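+<h4>Example</h4><p>For illustration only (not part of the ARM reference): a minimal C sketch, assuming &lt;arm_neon.h&gt; and a toolchain that exposes the poly64 types (typically behind the AES/crypto extension); names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 64x1 vectors have a single lane, so the lane constant must be 0:
+   buf[0] replaces acc.val[0] and buf[1] replaces acc.val[1]. */
+poly64x1x2_t load_p64_pair(const poly64_t *buf, poly64x1x2_t acc) {
+    return vld2_lane_p64(buf, acc, 0);
+}</pre>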
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_p64" type="checkbox"><label for="vld2q_lane_p64"><div>poly64x2x2_t <b><b>vld2q_lane_p64</b></b> (poly64_t const * ptr, poly64x2x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
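+<h4>Example</h4><p>For illustration only (not part of the ARM reference): a minimal C sketch under the same assumptions as above (&lt;arm_neon.h&gt;, poly64 support); names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* De-interleave buf[0] into lane 0 of acc.val[0] and buf[1] into lane 0
+   of acc.val[1]; lane 1 of both vectors is preserved. */
+poly64x2x2_t load_p64_pair_lane0(const poly64_t *buf, poly64x2x2_t acc) {
+    return vld2q_lane_p64(buf, acc, 0);
+}</pre>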
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2_lane_f64" type="checkbox"><label for="vld2_lane_f64"><div>float64x1x2_t <b><b>vld2_lane_f64</b></b> (float64_t const * ptr, float64x1x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
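+<h4>Example</h4><p>For illustration only (not part of the ARM reference): a minimal C sketch, assuming &lt;arm_neon.h&gt; on an AArch64 target; names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* For 1D vectors the only valid lane is 0: buf[0] becomes acc.val[0]
+   and buf[1] becomes acc.val[1]. */
+float64x1x2_t load_f64_pair(const float64_t *buf, float64x1x2_t acc) {
+    return vld2_lane_f64(buf, acc, 0);
+}</pre>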
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld2q_lane_f64" type="checkbox"><label for="vld2q_lane_f64"><div>float64x2x2_t <b><b>vld2q_lane_f64</b></b> (float64_t const * ptr, float64x2x2_t src, const int lane)<span class="right">Load single 2-element structure to one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 2-element structure to one lane of two registers. This instruction loads a 2-element structure from memory and writes the result to the corresponding elements of the two SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld2-single-structure-load-single-2-element-structure-to-one-lane-of-two-registers">LD2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
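+<h4>Example</h4><p>For illustration only (not part of the ARM reference): a minimal C sketch, assuming &lt;arm_neon.h&gt; on an AArch64 target; names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load buf[0] and buf[1] into lane 1 of acc.val[0] and acc.val[1];
+   lane 0 of both vectors keeps its previous contents. */
+float64x2x2_t load_f64_pair_lane1(const float64_t *buf, float64x2x2_t acc) {
+    return vld2q_lane_f64(buf, acc, 1);
+}</pre>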
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_s16" type="checkbox"><label for="vld3_lane_s16"><div>int16x4x3_t <b><b>vld3_lane_s16</b></b> (int16_t const * ptr, int16x4x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.4H <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
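+<h4>Example</h4><p>For illustration only (not part of the ARM reference): a minimal C sketch, assuming &lt;arm_neon.h&gt; (this intrinsic is also available on Armv7/A32); names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load the three consecutive values buf[0..2] into lane 2 of acc.val[0],
+   acc.val[1] and acc.val[2]; the lane constant must lie in [0, 3]. */
+int16x4x3_t load_s16_triple_lane2(const int16_t *buf, int16x4x3_t acc) {
+    return vld3_lane_s16(buf, acc, 2);
+}</pre>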
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_s16" type="checkbox"><label for="vld3q_lane_s16"><div>int16x8x3_t <b><b>vld3q_lane_s16</b></b> (int16_t const * ptr, int16x8x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.8H <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
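+ <h4>Usage sketch</h4> <p class="aml">A minimal C sketch of calling this intrinsic (illustrative only; the function and buffer names are hypothetical). The lane argument must be a compile-time constant, here in the range 0-7 for the .8H form.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Reload lane 2 of each of the three int16x8_t vectors from the
+   interleaved triple at ptr, leaving the other seven lanes of src
+   unchanged. */
+int16x8x3_t reload_lane2(const int16_t *ptr, int16x8x3_t src)
+{
+    return vld3q_lane_s16(ptr, src, 2);
+}</pre>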
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_s32" type="checkbox"><label for="vld3_lane_s32"><div>int32x2x3_t <b><b>vld3_lane_s32</b></b> (int32_t const * ptr, int32x2x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.2S <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
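+ <h4>Usage sketch</h4> <p class="aml">A minimal C sketch (hypothetical names) showing how vld3_lane_s32 inserts one interleaved (x, y, z) triple into the chosen lane of three 2-lane vectors; for the .2S form the constant lane may only be 0 or 1.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Put the triple xyz[0..2] into lane 0 of acc.val[0..2]; lane 1 of
+   each vector keeps its previous contents. */
+int32x2x3_t insert_triple(const int32_t *xyz, int32x2x3_t acc)
+{
+    return vld3_lane_s32(xyz, acc, 0);
+}</pre>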
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_s32" type="checkbox"><label for="vld3q_lane_s32"><div>int32x4x3_t <b><b>vld3q_lane_s32</b></b> (int32_t const * ptr, int32x4x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.4S <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
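+ <h4>Usage sketch</h4> <p class="aml">A C sketch (hypothetical helper; assumes rows of interleaved x/y/z triples spaced stride elements apart) that fills all four lanes of the result one triple at a time; note that each lane index must be a separate literal constant.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* Gather one (x, y, z) triple per lane from four rows of a table. */
+int32x4x3_t gather_rows(const int32_t *base, ptrdiff_t stride)
+{
+    int32x4x3_t v = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0) };
+    v = vld3q_lane_s32(base + 0 * stride, v, 0);
+    v = vld3q_lane_s32(base + 1 * stride, v, 1);
+    v = vld3q_lane_s32(base + 2 * stride, v, 2);
+    v = vld3q_lane_s32(base + 3 * stride, v, 3);
+    return v;
+}</pre>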
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_u16" type="checkbox"><label for="vld3_lane_u16"><div>uint16x4x3_t <b><b>vld3_lane_u16</b></b> (uint16_t const * ptr, uint16x4x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.4H <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
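+ <h4>Usage sketch</h4> <p class="aml">A minimal C sketch (hypothetical names): treating memory as interleaved 16-bit R, G, B samples, vld3_lane_u16 deposits one pixel into lane 1 of three planar vectors.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Insert the pixel at rgb (three consecutive uint16_t values) into
+   lane 1 of the planar r/g/b vectors carried in planes. */
+uint16x4x3_t set_pixel1(const uint16_t *rgb, uint16x4x3_t planes)
+{
+    return vld3_lane_u16(rgb, planes, 1);
+}</pre>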
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_u16" type="checkbox"><label for="vld3q_lane_u16"><div>uint16x8x3_t <b><b>vld3q_lane_u16</b></b> (uint16_t const * ptr, uint16x8x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.8H <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
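+ <h4>Usage sketch</h4> <p class="aml">A minimal C sketch (hypothetical names); with the .8H form the constant lane ranges over 0-7, so the last lane is selected with the literal 7.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Overwrite only the last lane (index 7) of the three vectors with
+   the interleaved triple at ptr. */
+uint16x8x3_t patch_last_lane(const uint16_t *ptr, uint16x8x3_t src)
+{
+    return vld3q_lane_u16(ptr, src, 7);
+}</pre>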
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_u32" type="checkbox"><label for="vld3_lane_u32"><div>uint32x2x3_t <b><b>vld3_lane_u32</b></b> (uint32_t const * ptr, uint32x2x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.2S <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
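+ <h4>Usage sketch</h4> <p class="aml">A minimal C sketch (hypothetical names) pairing the lane load with a full deinterleaving load: vld3_u32 loads the first two triples, then vld3_lane_u32 replaces the triple in lane 1 from another address.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load two triples from a, then overwrite lane 1 with the triple at b. */
+uint32x2x3_t load_then_patch(const uint32_t *a, const uint32_t *b)
+{
+    uint32x2x3_t v = vld3_u32(a);
+    return vld3_lane_u32(b, v, 1);
+}</pre>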
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_u32" type="checkbox"><label for="vld3q_lane_u32"><div>uint32x4x3_t <b><b>vld3q_lane_u32</b></b> (uint32_t const * ptr, uint32x4x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.4S <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_f16" type="checkbox"><label for="vld3_lane_f16"><div>float16x4x3_t <b>vld3_lane_f16</b> (float16_t const * ptr, float16x4x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.4H <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
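+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the Arm reference: it assumes &lt;arm_neon.h&gt; and half-precision (float16) support, and the helper name and lane index are purely illustrative.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Overwrite lane 2 of an already-deinterleaved triple with the three
+   consecutive float16 values at ptr; all other lanes are preserved.
+   The lane argument must be a compile-time constant in 0..3. */
+float16x4x3_t refresh_lane2(float16_t const *ptr, float16x4x3_t acc)
+{
+    return vld3_lane_f16(ptr, acc, 2);
+}</pre>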
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_f16" type="checkbox"><label for="vld3q_lane_f16"><div>float16x8x3_t <b>vld3q_lane_f16</b> (float16_t const * ptr, float16x8x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.8H <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
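+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the Arm reference; the helper name and lane index are illustrative, and &lt;arm_neon.h&gt; plus float16 support are assumed.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Patch lane 5 of a 128-bit deinterleaved triple from memory. For
+   this 8-lane variant the lane must be a compile-time constant
+   in 0..7. */
+float16x8x3_t patch_lane5(float16_t const *ptr, float16x8x3_t acc)
+{
+    return vld3q_lane_f16(ptr, acc, 5);
+}</pre>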
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_f32" type="checkbox"><label for="vld3_lane_f32"><div>float32x2x3_t <b>vld3_lane_f32</b> (float32_t const * ptr, float32x2x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.2S <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
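+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the Arm reference; the interleaved x,y,z point layout and the helper name are assumptions made for illustration.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+
+/* pts holds interleaved x,y,z triples. Load point i into lane 1 of
+   the deinterleaved accumulator; for this 2-lane variant the lane
+   must be a compile-time constant in 0..1. */
+float32x2x3_t load_point_into_lane1(float32_t const *pts, size_t i,
+                                    float32x2x3_t acc)
+{
+    return vld3_lane_f32(pts + 3 * i, acc, 1);
+}</pre>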
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_f32" type="checkbox"><label for="vld3q_lane_f32"><div>float32x4x3_t <b>vld3q_lane_f32</b> (float32_t const * ptr, float32x4x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.4S <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
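+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the Arm reference: it gathers four scattered x,y,z triples into one deinterleaved register set, lane by lane; the pointer layout and helper name are assumptions.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Gather the triples at p0..p3 into lanes 0..3. Starting from a
+   zeroed struct keeps the not-yet-written lanes well defined. */
+float32x4x3_t gather4(float32_t const *p0, float32_t const *p1,
+                      float32_t const *p2, float32_t const *p3)
+{
+    float32x4x3_t v = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f),
+                          vdupq_n_f32(0.0f) } };
+    v = vld3q_lane_f32(p0, v, 0);
+    v = vld3q_lane_f32(p1, v, 1);
+    v = vld3q_lane_f32(p2, v, 2);
+    return vld3q_lane_f32(p3, v, 3);
+}</pre>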
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_p16" type="checkbox"><label for="vld3_lane_p16"><div>poly16x4x3_t <b>vld3_lane_p16</b> (poly16_t const * ptr, poly16x4x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.4H <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
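+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the Arm reference; the helper name is illustrative. Polynomial vectors load the same bit patterns as unsigned ones; only their arithmetic interpretation differs.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Replace lane 0 of a poly16 triple with the three consecutive
+   16-bit values at ptr (lane constant, range 0..3 here). */
+poly16x4x3_t reload_lane0(poly16_t const *ptr, poly16x4x3_t acc)
+{
+    return vld3_lane_p16(ptr, acc, 0);
+}</pre>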
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_p16" type="checkbox"><label for="vld3q_lane_p16"><div>poly16x8x3_t <b>vld3q_lane_p16</b> (poly16_t const * ptr, poly16x8x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.8H <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_s8" type="checkbox"><label for="vld3_lane_s8"><div>int8x8x3_t <b><b>vld3_lane_s8</b></b> (int8_t const * ptr, int8x8x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.8B <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
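+ <h4>Usage sketch</h4> <p>A minimal, illustrative example, not part of the Arm reference; the helper name is hypothetical. The lane index must be a compile-time constant in the range given under Argument Preparation.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load the 3-element structure at p into lane 3 of each of the three
+   8-lane vectors in acc; the other seven lanes are preserved. */
+int8x8x3_t update_lane3(const int8_t *p, int8x8x3_t acc) {
+    return vld3_lane_s8(p, acc, 3);
+}
+</pre>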
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_u8" type="checkbox"><label for="vld3_lane_u8"><div>uint8x8x3_t <b><b>vld3_lane_u8</b></b> (uint8_t const * ptr, uint8x8x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.8B <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
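+ <h4>Usage sketch</h4> <p>An illustrative example, not from the Arm reference; the helper name is hypothetical. Here the three interleaved bytes of one RGB pixel land in lane 0 of the R, G and B channel vectors.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* rgb.val[0..2] hold the R, G and B channels of eight pixels.
+   Overwrite pixel 0 in all three channels from the three consecutive
+   bytes at px; lanes 1..7 keep their previous values. */
+uint8x8x3_t set_pixel0(const uint8_t *px, uint8x8x3_t rgb) {
+    return vld3_lane_u8(px, rgb, 0);
+}
+</pre>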
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_p8" type="checkbox"><label for="vld3_lane_p8"><div>poly8x8x3_t <b><b>vld3_lane_p8</b></b> (poly8_t const * ptr, poly8x8x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.8B <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
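+ <h4>Usage sketch</h4> <p>A brief sketch with a hypothetical helper name, not part of the Arm reference; the lane index must be a constant between 0 and 7.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Insert three poly8_t values read from p into lane 5 of the three
+   polynomial vectors; every other lane is left unchanged. */
+poly8x8x3_t insert_lane5(const poly8_t *p, poly8x8x3_t acc) {
+    return vld3_lane_p8(p, acc, 5);
+}
+</pre>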
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_s8" type="checkbox"><label for="vld3q_lane_s8"><div>int8x16x3_t <b><b>vld3q_lane_s8</b></b> (int8_t const * ptr, int8x16x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.16B <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
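+ <h4>Usage sketch</h4> <p>A minimal sketch with a hypothetical helper name, not from the Arm reference. For this 16-byte form the constant lane index ranges from 0 to 15.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load a 3-element structure into lane 15 (the last lane) of each of
+   the three 16-lane vectors in acc. */
+int8x16x3_t update_last_lane(const int8_t *p, int8x16x3_t acc) {
+    return vld3q_lane_s8(p, acc, 15);
+}
+</pre>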
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_u8" type="checkbox"><label for="vld3q_lane_u8"><div>uint8x16x3_t <b><b>vld3q_lane_u8</b></b> (uint8_t const * ptr, uint8x16x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.16B <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
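+ <h4>Usage sketch</h4> <p>An illustrative example, not part of the Arm reference; the helper name is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sixteen interleaved RGB pixels are held as separate channel vectors
+   in rgb. Refresh pixel 7 of all three channels from the three
+   consecutive bytes at src; the other fifteen lanes are preserved. */
+uint8x16x3_t refresh_pixel7(const uint8_t *src, uint8x16x3_t rgb) {
+    return vld3q_lane_u8(src, rgb, 7);
+}
+</pre>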
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_p8" type="checkbox"><label for="vld3q_lane_p8"><div>poly8x16x3_t <b><b>vld3q_lane_p8</b></b> (poly8_t const * ptr, poly8x16x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.16B <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
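+ <h4>Usage sketch</h4> <p>A short sketch with a hypothetical helper name, not from the Arm reference; the constant lane index may be 0 to 15.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Overwrite lane 0 of the three 16-lane polynomial vectors with the
+   3-element structure at p; lanes 1..15 are untouched. */
+poly8x16x3_t set_lane0(const poly8_t *p, poly8x16x3_t acc) {
+    return vld3q_lane_p8(p, acc, 0);
+}
+</pre>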
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_s64" type="checkbox"><label for="vld3_lane_s64"><div>int64x1x3_t <b><b>vld3_lane_s64</b></b> (int64_t const * ptr, int64x1x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.1D <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
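+ <h4>Example</h4> <p>A minimal, hypothetical C sketch (not part of the ARM reference) showing the intended use: three vectors carry prior contents, and one 3-element structure is de-interleaved from memory into lane 0 of each. Buffer contents and variable names are illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64_t mem[3] = {10, 20, 30};      /* one 3-element structure in memory */
+    int64x1x3_t src;                    /* prior register contents to preserve */
+    src.val[0] = vdup_n_s64(-1);
+    src.val[1] = vdup_n_s64(-2);
+    src.val[2] = vdup_n_s64(-3);
+    /* lane must be 0 for 64x1 vectors; mem[0..2] go to val[0..2] */
+    int64x1x3_t r = vld3_lane_s64(mem, src, 0);
+    printf("%lld %lld %lld\n",
+           (long long)vget_lane_s64(r.val[0], 0),   /* 10 */
+           (long long)vget_lane_s64(r.val[1], 0),   /* 20 */
+           (long long)vget_lane_s64(r.val[2], 0));  /* 30 */
+    return 0;
+}</pre>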
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_s64" type="checkbox"><label for="vld3q_lane_s64"><div>int64x2x3_t <b><b>vld3q_lane_s64</b></b> (int64_t const * ptr, int64x2x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.2D <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
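+ <h4>Example</h4> <p>A hypothetical sketch (not from the ARM reference) for the 128-bit form, where lane may be 0 or 1: the structure is written into lane 1 while lane 0 keeps its previous value.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64_t mem[3] = {10, 20, 30};
+    int64x2x3_t src;
+    src.val[0] = vdupq_n_s64(-1);
+    src.val[1] = vdupq_n_s64(-2);
+    src.val[2] = vdupq_n_s64(-3);
+    int64x2x3_t r = vld3q_lane_s64(mem, src, 1);   /* write lane 1 only */
+    printf("val[0]: lane0=%lld lane1=%lld\n",
+           (long long)vgetq_lane_s64(r.val[0], 0),  /* still -1 */
+           (long long)vgetq_lane_s64(r.val[0], 1)); /* now 10 */
+    return 0;
+}</pre>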
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_u64" type="checkbox"><label for="vld3_lane_u64"><div>uint64x1x3_t <b><b>vld3_lane_u64</b></b> (uint64_t const * ptr, uint64x1x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.1D <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
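+ <h4>Example</h4> <p>An assumed sketch (not from the reference) for the unsigned variant; apart from the element type it behaves exactly like vld3_lane_s64.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t mem[3] = {1u, 2u, 3u};
+    uint64x1x3_t src;
+    src.val[0] = vdup_n_u64(0);
+    src.val[1] = vdup_n_u64(0);
+    src.val[2] = vdup_n_u64(0);
+    uint64x1x3_t r = vld3_lane_u64(mem, src, 0);
+    printf("%llu %llu %llu\n",
+           (unsigned long long)vget_lane_u64(r.val[0], 0),
+           (unsigned long long)vget_lane_u64(r.val[1], 0),
+           (unsigned long long)vget_lane_u64(r.val[2], 0));
+    return 0;
+}</pre>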
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_u64" type="checkbox"><label for="vld3q_lane_u64"><div>uint64x2x3_t <b><b>vld3q_lane_u64</b></b> (uint64_t const * ptr, uint64x2x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.2D <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
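+ <h4>Example</h4> <p>A hypothetical sketch (not from the reference) for the 128-bit unsigned form; it highlights that the three consecutive memory elements land in val[0], val[1] and val[2] of the chosen lane.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t mem[3] = {100u, 200u, 300u};
+    uint64x2x3_t src;
+    src.val[0] = vdupq_n_u64(0);
+    src.val[1] = vdupq_n_u64(0);
+    src.val[2] = vdupq_n_u64(0);
+    uint64x2x3_t r = vld3q_lane_u64(mem, src, 0);
+    printf("%llu %llu %llu\n",
+           (unsigned long long)vgetq_lane_u64(r.val[0], 0),  /* 100 */
+           (unsigned long long)vgetq_lane_u64(r.val[1], 0),  /* 200 */
+           (unsigned long long)vgetq_lane_u64(r.val[2], 0)); /* 300 */
+    return 0;
+}</pre>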
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_p64" type="checkbox"><label for="vld3_lane_p64"><div>poly64x1x3_t <b><b>vld3_lane_p64</b></b> (poly64_t const * ptr, poly64x1x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.1D <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
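+ <h4>Example</h4> <p>An assumed sketch (not from the reference) for the poly64 variant. poly64 types generally require a crypto-capable target (the build flag shown is an assumption); the bit pattern is otherwise loaded exactly as in the u64 case.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+/* build (assumption): cc -march=armv8-a+crypto example.c */
+int main(void) {
+    poly64_t mem[3] = {1, 2, 3};
+    poly64x1x3_t src;
+    src.val[0] = vdup_n_p64(0);
+    src.val[1] = vdup_n_p64(0);
+    src.val[2] = vdup_n_p64(0);
+    poly64x1x3_t r = vld3_lane_p64(mem, src, 0);
+    printf("%llu\n", (unsigned long long)vget_lane_p64(r.val[2], 0)); /* 3 */
+    return 0;
+}</pre>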
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_p64" type="checkbox"><label for="vld3q_lane_p64"><div>poly64x2x3_t <b><b>vld3q_lane_p64</b></b> (poly64_t const * ptr, poly64x2x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.2D <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
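+ <h4>Example</h4> <p>A hypothetical sketch (not from the reference) for the 128-bit poly64 form; as above, a crypto-capable target is assumed, and lane may be 0 or 1.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly64_t mem[3] = {5, 6, 7};
+    poly64x2x3_t src;
+    src.val[0] = vdupq_n_p64(0);
+    src.val[1] = vdupq_n_p64(0);
+    src.val[2] = vdupq_n_p64(0);
+    poly64x2x3_t r = vld3q_lane_p64(mem, src, 1);  /* fill lane 1 */
+    printf("%llu\n", (unsigned long long)vgetq_lane_p64(r.val[1], 1)); /* 6 */
+    return 0;
+}</pre>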
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3_lane_f64" type="checkbox"><label for="vld3_lane_f64"><div>float64x1x3_t <b><b>vld3_lane_f64</b></b> (float64_t const * ptr, float64x1x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.1D <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
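+ <h4>Example</h4> <p>The C fragment below is an editor's illustrative sketch rather than part of the ARM reference; it assumes an AArch64 toolchain providing &lt;arm_neon.h&gt; and uses a small stack buffer as the memory operand.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* One interleaved 3-element structure {x, y, z} in memory. */
+    float64_t buf[3] = {1.0, 2.0, 3.0};
+    /* Prior register contents; only the selected lane is replaced. */
+    float64x1x3_t src = {{ vdup_n_f64(-1.0), vdup_n_f64(-1.0), vdup_n_f64(-1.0) }};
+    /* The d-register form has a single lane, so lane must be 0. */
+    float64x1x3_t r = vld3_lane_f64(buf, src, 0);
+    printf("%.1f %.1f %.1f\n",
+           vget_lane_f64(r.val[0], 0),
+           vget_lane_f64(r.val[1], 0),
+           vget_lane_f64(r.val[2], 0)); /* prints 1.0 2.0 3.0 */
+    return 0;
+}</pre>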
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld3q_lane_f64" type="checkbox"><label for="vld3q_lane_f64"><div>float64x2x3_t <b><b>vld3q_lane_f64</b></b> (float64_t const * ptr, float64x2x3_t src, const int lane)<span class="right">Load single 3-element structure to one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 3-element structure to one lane of three registers. This instruction loads a 3-element structure from memory and writes the result to the corresponding elements of the three SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld3-single-structure-load-single-3-element-structure-to-one-lane-of-three-registers">LD3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[2] &rarr; Vt3.2D <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
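+ <h4>Example</h4> <p>An illustrative sketch by the editor, not ARM's text: assuming an AArch64 compiler with &lt;arm_neon.h&gt;, it loads one structure into lane 1 and shows that lane 0 is preserved.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float64_t buf[3] = {1.0, 2.0, 3.0};   /* one 3-element structure */
+    float64x2x3_t src = {{ vdupq_n_f64(9.0), vdupq_n_f64(9.0), vdupq_n_f64(9.0) }};
+    /* The q-register form has two lanes; load into lane 1. */
+    float64x2x3_t r = vld3q_lane_f64(buf, src, 1);
+    /* Lane 0 keeps the old 9.0; lane 1 of val[0] receives 1.0. */
+    printf("%.1f %.1f\n",
+           vgetq_lane_f64(r.val[0], 0),
+           vgetq_lane_f64(r.val[0], 1));
+    return 0;
+}</pre>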
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_s16" type="checkbox"><label for="vld4_lane_s16"><div>int16x4x4_t <b><b>vld4_lane_s16</b></b> (int16_t const * ptr, int16x4x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.4H <br />
+src.val[2] &rarr; Vt3.4H <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
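+ <h4>Example</h4> <p>A hedged usage sketch added by the editor (not from the ARM reference), assuming &lt;arm_neon.h&gt; on an AArch64 target; the lane argument must be a compile-time constant in the range given above.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16_t buf[4] = {10, 20, 30, 40};   /* one 4-element structure */
+    int16x4x4_t src = {{ vdup_n_s16(0), vdup_n_s16(0), vdup_n_s16(0), vdup_n_s16(0) }};
+    /* Insert the structure into lane 2 (valid lanes are 0..3). */
+    int16x4x4_t r = vld4_lane_s16(buf, src, 2);
+    printf("%d %d %d %d\n",
+           vget_lane_s16(r.val[0], 2), vget_lane_s16(r.val[1], 2),
+           vget_lane_s16(r.val[2], 2), vget_lane_s16(r.val[3], 2));
+    return 0;   /* prints 10 20 30 40 */
+}</pre>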
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_s16" type="checkbox"><label for="vld4q_lane_s16"><div>int16x8x4_t <b><b>vld4q_lane_s16</b></b> (int16_t const * ptr, int16x8x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.8H <br />
+src.val[2] &rarr; Vt3.8H <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
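+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM documentation; it assumes &lt;arm_neon.h&gt; and demonstrates that lanes other than the selected one keep their previous contents.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16_t buf[4] = {1, 2, 3, 4};       /* one 4-element structure */
+    int16x8x4_t src = {{ vdupq_n_s16(-1), vdupq_n_s16(-1), vdupq_n_s16(-1), vdupq_n_s16(-1) }};
+    /* Valid lanes are 0..7 for the q-register form; use lane 5. */
+    int16x8x4_t r = vld4q_lane_s16(buf, src, 5);
+    /* Lane 5 of val[0] is now 1; lane 0 still holds -1. */
+    printf("%d %d\n",
+           vgetq_lane_s16(r.val[0], 5),
+           vgetq_lane_s16(r.val[0], 0));
+    return 0;
+}</pre>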
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_s32" type="checkbox"><label for="vld4_lane_s32"><div>int32x2x4_t <b><b>vld4_lane_s32</b></b> (int32_t const * ptr, int32x2x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.2S <br />
+src.val[2] &rarr; Vt3.2S <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
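+ <h4>Example</h4> <p>Illustrative editor's sketch (not ARM's), assuming &lt;arm_neon.h&gt;; a typical use is refreshing one lane of de-interleaved accumulators from an interleaved stream.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32_t buf[4] = {100, 200, 300, 400};   /* one 4-element structure */
+    int32x2x4_t src = {{ vdup_n_s32(0), vdup_n_s32(0), vdup_n_s32(0), vdup_n_s32(0) }};
+    /* Valid lanes are 0..1 for the d-register form; load lane 1. */
+    int32x2x4_t r = vld4_lane_s32(buf, src, 1);
+    printf("%d %d %d %d\n",
+           vget_lane_s32(r.val[0], 1), vget_lane_s32(r.val[1], 1),
+           vget_lane_s32(r.val[2], 1), vget_lane_s32(r.val[3], 1));
+    return 0;   /* prints 100 200 300 400 */
+}</pre>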
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_s32" type="checkbox"><label for="vld4q_lane_s32"><div>int32x4x4_t <b><b>vld4q_lane_s32</b></b> (int32_t const * ptr, int32x4x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.4S <br />
+src.val[2] &rarr; Vt3.4S <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
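+ <h4>Example</h4> <p>A minimal sketch from the editor rather than the ARM reference, assuming &lt;arm_neon.h&gt;; it fills one lane of four q-register accumulators from an interleaved buffer.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32_t buf[4] = {7, 8, 9, 10};      /* one 4-element structure */
+    int32x4x4_t src = {{ vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0) }};
+    /* Valid lanes are 0..3 for the q-register form; load lane 3. */
+    int32x4x4_t r = vld4q_lane_s32(buf, src, 3);
+    printf("%d %d %d %d\n",
+           vgetq_lane_s32(r.val[0], 3), vgetq_lane_s32(r.val[1], 3),
+           vgetq_lane_s32(r.val[2], 3), vgetq_lane_s32(r.val[3], 3));
+    return 0;   /* prints 7 8 9 10 */
+}</pre>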
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_u16" type="checkbox"><label for="vld4_lane_u16"><div>uint16x4x4_t <b><b>vld4_lane_u16</b></b> (uint16_t const * ptr, uint16x4x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.4H <br />
+src.val[2] &rarr; Vt3.4H <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_u16" type="checkbox"><label for="vld4q_lane_u16"><div>uint16x8x4_t <b>vld4q_lane_u16</b> (uint16_t const * ptr, uint16x8x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.8H <br />
+src.val[2] &rarr; Vt3.8H <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_u32" type="checkbox"><label for="vld4_lane_u32"><div>uint32x2x4_t <b>vld4_lane_u32</b> (uint32_t const * ptr, uint32x2x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.2S <br />
+src.val[2] &rarr; Vt3.2S <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_u32" type="checkbox"><label for="vld4q_lane_u32"><div>uint32x4x4_t <b>vld4q_lane_u32</b> (uint32_t const * ptr, uint32x4x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.4S <br />
+src.val[2] &rarr; Vt3.4S <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_f16" type="checkbox"><label for="vld4_lane_f16"><div>float16x4x4_t <b>vld4_lane_f16</b> (float16_t const * ptr, float16x4x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.4H <br />
+src.val[2] &rarr; Vt3.4H <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_f16" type="checkbox"><label for="vld4q_lane_f16"><div>float16x8x4_t <b>vld4q_lane_f16</b> (float16_t const * ptr, float16x8x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.8H <br />
+src.val[2] &rarr; Vt3.8H <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_f32" type="checkbox"><label for="vld4_lane_f32"><div>float32x2x4_t <b>vld4_lane_f32</b> (float32_t const * ptr, float32x2x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.2S <br />
+src.val[2] &rarr; Vt3.2S <br />
+src.val[1] &rarr; Vt2.2S <br />
+src.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_f32" type="checkbox"><label for="vld4q_lane_f32"><div>float32x4x4_t <b><b>vld4q_lane_f32</b></b> (float32_t const * ptr, float32x4x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.4S <br />
+src.val[2] &rarr; Vt3.4S <br />
+src.val[1] &rarr; Vt2.4S <br />
+src.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
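+ <h4>Usage Sketch</h4> <p>A minimal C sketch, added for illustration and not part of the ARM reference: it shows one de-interleaving step with this intrinsic. The buffer contents and the lane choice (2) are arbitrary assumptions; the helper intrinsics used are standard arm_neon.h functions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* One 4-element structure in memory: four consecutive floats. */
+    const float buf[4] = {10.0f, 20.0f, 30.0f, 40.0f};
+    /* Zeroed destination vectors; only lane 2 of each is overwritten. */
+    float32x4x4_t v;
+    v.val[0] = vdupq_n_f32(0.0f); v.val[1] = vdupq_n_f32(0.0f);
+    v.val[2] = vdupq_n_f32(0.0f); v.val[3] = vdupq_n_f32(0.0f);
+    v = vld4q_lane_f32(buf, v, 2);  /* lane must be a constant in 0..3 */
+    /* Prints 10 20 30 40: buf[i] landed in lane 2 of v.val[i]. */
+    printf("%.0f %.0f %.0f %.0f\n",
+           vgetq_lane_f32(v.val[0], 2), vgetq_lane_f32(v.val[1], 2),
+           vgetq_lane_f32(v.val[2], 2), vgetq_lane_f32(v.val[3], 2));
+    return 0;
+}
+</pre>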
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_p16" type="checkbox"><label for="vld4_lane_p16"><div>poly16x4x4_t <b><b>vld4_lane_p16</b></b> (poly16_t const * ptr, poly16x4x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.4H <br />
+src.val[2] &rarr; Vt3.4H <br />
+src.val[1] &rarr; Vt2.4H <br />
+src.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
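+ <h4>Usage Sketch</h4> <p>A minimal C sketch, added for illustration and not part of the ARM reference: the 16-bit polynomial values and the lane choice (1) are arbitrary assumptions, chosen only to show that element i of the structure lands in the selected lane of val[i].</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* One 4-element structure of 16-bit polynomial values. */
+    const poly16_t buf[4] = {0x11, 0x22, 0x33, 0x44};
+    poly16x4x4_t v;
+    v.val[0] = vdup_n_p16(0); v.val[1] = vdup_n_p16(0);
+    v.val[2] = vdup_n_p16(0); v.val[3] = vdup_n_p16(0);
+    v = vld4_lane_p16(buf, v, 1);  /* lane must be a constant in 0..3 */
+    /* Prints 11 22 33 44: buf[i] landed in lane 1 of v.val[i]. */
+    printf("%x %x %x %x\n", (unsigned)vget_lane_p16(v.val[0], 1),
+           (unsigned)vget_lane_p16(v.val[1], 1),
+           (unsigned)vget_lane_p16(v.val[2], 1),
+           (unsigned)vget_lane_p16(v.val[3], 1));
+    return 0;
+}
+</pre>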
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_p16" type="checkbox"><label for="vld4q_lane_p16"><div>poly16x8x4_t <b><b>vld4q_lane_p16</b></b> (poly16_t const * ptr, poly16x8x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.8H <br />
+src.val[2] &rarr; Vt3.8H <br />
+src.val[1] &rarr; Vt2.8H <br />
+src.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
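+ <h4>Usage Sketch</h4> <p>A minimal C sketch, added for illustration and not part of the ARM reference: same pattern as the 64-bit variant, but the 128-bit vectors give eight lanes to choose from. The data values and the lane choice (5) are assumptions.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const poly16_t buf[4] = {0xA1, 0xB2, 0xC3, 0xD4};
+    poly16x8x4_t v;
+    v.val[0] = vdupq_n_p16(0); v.val[1] = vdupq_n_p16(0);
+    v.val[2] = vdupq_n_p16(0); v.val[3] = vdupq_n_p16(0);
+    v = vld4q_lane_p16(buf, v, 5);  /* lane must be a constant in 0..7 */
+    /* Prints a1 b2 c3 d4: buf[i] landed in lane 5 of v.val[i]. */
+    printf("%x %x %x %x\n", (unsigned)vgetq_lane_p16(v.val[0], 5),
+           (unsigned)vgetq_lane_p16(v.val[1], 5),
+           (unsigned)vgetq_lane_p16(v.val[2], 5),
+           (unsigned)vgetq_lane_p16(v.val[3], 5));
+    return 0;
+}
+</pre>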
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_s8" type="checkbox"><label for="vld4_lane_s8"><div>int8x8x4_t <b><b>vld4_lane_s8</b></b> (int8_t const * ptr, int8x8x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.8B <br />
+src.val[2] &rarr; Vt3.8B <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
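+ <h4>Usage Sketch</h4> <p>A minimal C sketch, added for illustration and not part of the ARM reference: the signed byte values and the lane choice (3) are assumptions, chosen to show that only the selected lane of each destination vector changes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* One 4-element structure of signed bytes. */
+    const int8_t buf[4] = {-1, -2, -3, -4};
+    int8x8x4_t v;
+    v.val[0] = vdup_n_s8(0); v.val[1] = vdup_n_s8(0);
+    v.val[2] = vdup_n_s8(0); v.val[3] = vdup_n_s8(0);
+    v = vld4_lane_s8(buf, v, 3);  /* lane must be a constant in 0..7 */
+    /* Prints -1 -2 -3 -4: buf[i] landed in lane 3 of v.val[i]. */
+    printf("%d %d %d %d\n", (int)vget_lane_s8(v.val[0], 3),
+           (int)vget_lane_s8(v.val[1], 3), (int)vget_lane_s8(v.val[2], 3),
+           (int)vget_lane_s8(v.val[3], 3));
+    return 0;
+}
+</pre>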
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_u8" type="checkbox"><label for="vld4_lane_u8"><div>uint8x8x4_t <b><b>vld4_lane_u8</b></b> (uint8_t const * ptr, uint8x8x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.8B <br />
+src.val[2] &rarr; Vt3.8B <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
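+ <h4>Usage Sketch</h4> <p>A minimal C sketch, added for illustration and not part of the ARM reference: it frames the load as splitting one interleaved RGBA pixel (an assumed use case, with made-up byte values) into per-channel vectors at lane 0.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* One interleaved RGBA pixel; LD4 splits it across four vectors. */
+    const uint8_t pixel[4] = {0xDE, 0xAD, 0xBE, 0xEF};
+    uint8x8x4_t planes;
+    planes.val[0] = vdup_n_u8(0); planes.val[1] = vdup_n_u8(0);
+    planes.val[2] = vdup_n_u8(0); planes.val[3] = vdup_n_u8(0);
+    planes = vld4_lane_u8(pixel, planes, 0);  /* constant lane in 0..7 */
+    /* Prints de ad be ef: pixel[i] landed in lane 0 of planes.val[i]. */
+    printf("%x %x %x %x\n", (unsigned)vget_lane_u8(planes.val[0], 0),
+           (unsigned)vget_lane_u8(planes.val[1], 0),
+           (unsigned)vget_lane_u8(planes.val[2], 0),
+           (unsigned)vget_lane_u8(planes.val[3], 0));
+    return 0;
+}
+</pre>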
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_p8" type="checkbox"><label for="vld4_lane_p8"><div>poly8x8x4_t <b><b>vld4_lane_p8</b></b> (poly8_t const * ptr, poly8x8x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.8B <br />
+src.val[2] &rarr; Vt3.8B <br />
+src.val[1] &rarr; Vt2.8B <br />
+src.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_s8" type="checkbox"><label for="vld4q_lane_s8"><div>int8x16x4_t <b><b>vld4q_lane_s8</b></b> (int8_t const * ptr, int8x16x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.16B <br />
+src.val[2] &rarr; Vt3.16B <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
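+ <h4>Usage Example</h4> <p>The following is an illustrative sketch, not part of the original ARM reference: it assumes an AArch64 compiler with arm_neon.h, and the helper name and lane choice are invented for illustration. It refreshes lane 3 of all four vectors in one call, leaving the other fifteen lanes of each vector untouched.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Re-load the 4-element structure at 'group' into lane 3 of the
+   four accumulator vectors; all other lanes are preserved. */
+int8x16x4_t refresh_lane3(const int8_t *group, int8x16x4_t acc)
+{
+    return vld4q_lane_s8(group, acc, 3);  /* lane must be a constant in 0..15 */
+}
+</pre>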
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_u8" type="checkbox"><label for="vld4q_lane_u8"><div>uint8x16x4_t <b><b>vld4q_lane_u8</b></b> (uint8_t const * ptr, uint8x16x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.16B <br />
+src.val[2] &rarr; Vt3.16B <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
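+ <h4>Usage Example</h4> <p>An illustrative sketch, not from the original ARM reference (assumes arm_neon.h on AArch64; the planar-pixel framing and helper name are invented): with de-interleaved R/G/B/A planes held in a uint8x16x4_t, one call overwrites pixel 0 of each plane from a single packed 4-byte RGBA group in memory.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* 'rgba' points at one packed pixel: {r, g, b, a}. */
+uint8x16x4_t set_pixel0(const uint8_t *rgba, uint8x16x4_t planes)
+{
+    return vld4q_lane_u8(rgba, planes, 0);  /* constant lane in 0..15 */
+}
+</pre>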
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_p8" type="checkbox"><label for="vld4q_lane_p8"><div>poly8x16x4_t <b><b>vld4q_lane_p8</b></b> (poly8_t const * ptr, poly8x16x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.16B <br />
+src.val[2] &rarr; Vt3.16B <br />
+src.val[1] &rarr; Vt2.16B <br />
+src.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
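+ <h4>Usage Example</h4> <p>A minimal sketch under assumed conditions (arm_neon.h, AArch64), not part of the original page: poly8_t lanes carry polynomial coefficients over GF(2), and the invented helper patches lane 7 of each of the four vectors from one 4-byte group.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+poly8x16x4_t patch_lane7(const poly8_t *coeffs, poly8x16x4_t tabs)
+{
+    /* Inserts coeffs[0..3] into lane 7 of tabs.val[0..3]. */
+    return vld4q_lane_p8(coeffs, tabs, 7);
+}
+</pre>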
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_s64" type="checkbox"><label for="vld4_lane_s64"><div>int64x1x4_t <b><b>vld4_lane_s64</b></b> (int64_t const * ptr, int64x1x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.1D <br />
+src.val[2] &rarr; Vt3.1D <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
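+ <h4>Usage Example</h4> <p>Illustrative only (assumes arm_neon.h on AArch64; helper name invented): because each int64x1_t vector has a single lane, the lane argument must be 0, so the call effectively loads one 4-field int64_t record, one field into each of the four one-lane vectors.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int64x1x4_t load_record(const int64_t *rec, int64x1x4_t cur)
+{
+    return vld4_lane_s64(rec, cur, 0);  /* only lane 0 exists */
+}
+</pre>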
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_s64" type="checkbox"><label for="vld4q_lane_s64"><div>int64x2x4_t <b><b>vld4q_lane_s64</b></b> (int64_t const * ptr, int64x2x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.2D <br />
+src.val[2] &rarr; Vt3.2D <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
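+ <h4>Usage Example</h4> <p>A sketch under assumed conditions (arm_neon.h, AArch64; helper name invented), not from the original reference: it fills the upper lane (lane 1) of each 2-lane vector from a group of four int64_t values, keeping lane 0 of every vector intact.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int64x2x4_t fill_upper(const int64_t *group, int64x2x4_t vecs)
+{
+    return vld4q_lane_s64(group, vecs, 1);  /* constant lane: 0 or 1 */
+}
+</pre>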
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_u64" type="checkbox"><label for="vld4_lane_u64"><div>uint64x1x4_t <b><b>vld4_lane_u64</b></b> (uint64_t const * ptr, uint64x1x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.1D <br />
+src.val[2] &rarr; Vt3.1D <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
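+ <h4>Usage Example</h4> <p>Illustrative sketch (assumes arm_neon.h on AArch64; helper name invented): as with the signed variant, the single-lane vectors force lane == 0, so the intrinsic loads four consecutive uint64_t values, one into each vector of the structure.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+uint64x1x4_t load_u64_group(const uint64_t *group, uint64x1x4_t cur)
+{
+    return vld4_lane_u64(group, cur, 0);
+}
+</pre>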
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_u64" type="checkbox"><label for="vld4q_lane_u64"><div>uint64x2x4_t <b><b>vld4q_lane_u64</b></b> (uint64_t const * ptr, uint64x2x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.2D <br />
+src.val[2] &rarr; Vt3.2D <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
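+ <h4>Usage example</h4> <p>Editor's illustrative sketch, not part of the upstream ARM data: a minimal C fragment assuming an AArch64 target with arm_neon.h; the function and buffer names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Refill lane 1 of each of the four vectors from four consecutive
+   uint64_t values; lane 0 of acc.val[0..3] is left untouched. */
+uint64x2x4_t refill_lane1(const uint64_t buf[4], uint64x2x4_t acc)
+{
+    return vld4q_lane_u64(buf, acc, 1);  /* lane must be a constant in [0, 1] */
+}
+</pre>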
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_p64" type="checkbox"><label for="vld4_lane_p64"><div>poly64x1x4_t <b><b>vld4_lane_p64</b></b> (poly64_t const * ptr, poly64x1x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.1D <br />
+src.val[2] &rarr; Vt3.1D <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
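+ <h4>Usage example</h4> <p>Editor's illustrative sketch (hypothetical names, assuming arm_neon.h on AArch64, where the poly64 vector types are available): with 64-bit elements in a .1D vector the only legal lane is 0, so the call effectively reloads one element of all four vectors from memory.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+poly64x1x4_t reload_poly(const poly64_t buf[4], poly64x1x4_t acc)
+{
+    return vld4_lane_p64(buf, acc, 0);  /* lane must be the constant 0 for the .1D form */
+}
+</pre>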
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_p64" type="checkbox"><label for="vld4q_lane_p64"><div>poly64x2x4_t <b><b>vld4q_lane_p64</b></b> (poly64_t const * ptr, poly64x2x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.2D <br />
+src.val[2] &rarr; Vt3.2D <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
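+ <h4>Usage example</h4> <p>Editor's illustrative sketch (hypothetical helper, assuming arm_neon.h on AArch64): loading a 4-element structure into lane 0 while lane 1 keeps its previous contents.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+poly64x2x4_t merge_lane0(const poly64_t buf[4], poly64x2x4_t acc)
+{
+    /* buf[0..3] go to lane 0 of acc.val[0..3]; lane 1 is unchanged */
+    return vld4q_lane_p64(buf, acc, 0);  /* lane is a constant in [0, 1] */
+}
+</pre>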
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4_lane_f64" type="checkbox"><label for="vld4_lane_f64"><div>float64x1x4_t <b><b>vld4_lane_f64</b></b> (float64_t const * ptr, float64x1x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.1D <br />
+src.val[2] &rarr; Vt3.1D <br />
+src.val[1] &rarr; Vt2.1D <br />
+src.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
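+ <h4>Usage example</h4> <p>Editor's illustrative sketch (hypothetical names, assuming arm_neon.h on AArch64): as with the other .1D variants, the only legal lane index is 0.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float64x1x4_t load_coeffs(const float64_t buf[4], float64x1x4_t acc)
+{
+    return vld4_lane_f64(buf, acc, 0);  /* lane must be the constant 0 */
+}
+</pre>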
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld4q_lane_f64" type="checkbox"><label for="vld4q_lane_f64"><div>float64x2x4_t <b><b>vld4q_lane_f64</b></b> (float64_t const * ptr, float64x2x4_t src, const int lane)<span class="right">Load single 4-element structure to one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load single 4-element structure to one lane of four registers. This instruction loads a 4-element structure from memory and writes the result to the corresponding elements of the four SIMD&amp;FP registers without affecting the other bits of the registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld4-single-structure-load-single-4-element-structure-to-one-lane-of-four-registers">LD4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+src.val[3] &rarr; Vt4.2D <br />
+src.val[2] &rarr; Vt3.2D <br />
+src.val[1] &rarr; Vt2.2D <br />
+src.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
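+ <h4>Usage example</h4> <p>Editor's illustrative sketch (hypothetical helper, assuming arm_neon.h on AArch64): refreshing lane 1 of a de-interleaved group of four double-precision vectors.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float64x2x4_t refresh_lane1(const float64_t buf[4], float64x2x4_t acc)
+{
+    /* buf[0..3] replace lane 1 of acc.val[0..3]; lane 0 is preserved */
+    return vld4q_lane_f64(buf, acc, 1);  /* lane is a constant in [0, 1] */
+}
+</pre>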
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_s8" type="checkbox"><label for="vst2_lane_s8"><div>void <b><b>vst2_lane_s8</b></b> (int8_t * ptr, int8x8x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
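+ <h4>Usage example</h4> <p>Editor's illustrative sketch (hypothetical names, assuming arm_neon.h on an AArch64 target): the store writes exactly two bytes, one taken from the chosen lane of each source vector, and returns nothing.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void store_pair(int8_t out[2], int8x8x2_t val)
+{
+    /* writes lane 3 of val.val[0] to out[0], lane 3 of val.val[1] to out[1] */
+    vst2_lane_s8(out, val, 3);  /* lane is a constant in [0, 7] */
+}
+</pre>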
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_u8" type="checkbox"><label for="vst2_lane_u8"><div>void <b><b>vst2_lane_u8</b></b> (uint8_t * ptr, uint8x8x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
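+ <h4>Example</h4> <p>A minimal C sketch, not part of the ARM reference; the lane index and register contents below are illustrative. Compile for an Arm target with Advanced SIMD enabled.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint8_t out[2];
+    uint8x8x2_t val;
+    val.val[0] = vdup_n_u8(3);   /* Vt:  all eight lanes hold 3 */
+    val.val[1] = vdup_n_u8(13);  /* Vt2: all eight lanes hold 13 */
+    vst2_lane_u8(out, val, 3);   /* stores lane 3 of Vt, then lane 3 of Vt2 */
+    printf("%d %d\n", out[0], out[1]);  /* prints "3 13" */
+    return 0;
+}</pre>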
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_p8" type="checkbox"><label for="vst2_lane_p8"><div>void <b>vst2_lane_p8</b> (poly8_t * ptr, poly8x8x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
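+ <h4>Example</h4> <p>A short hypothetical usage, not from the ARM reference: the polynomial values are arbitrary. The store is byte-for-byte the same as the u8 form; only the element type differs.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly8_t out[2];
+    poly8x8x2_t val;
+    val.val[0] = vdup_n_p8(0x1b);  /* Vt:  all lanes hold 0x1b */
+    val.val[1] = vdup_n_p8(0x87);  /* Vt2: all lanes hold 0x87 */
+    vst2_lane_p8(out, val, 5);     /* stores lane 5 of each register */
+    printf("%02x %02x\n", (unsigned)out[0], (unsigned)out[1]);  /* prints "1b 87" */
+    return 0;
+}</pre>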
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_s8" type="checkbox"><label for="vst3_lane_s8"><div>void <b>vst3_lane_s8</b> (int8_t * ptr, int8x8x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
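+ <h4>Example</h4> <p>An illustrative C sketch (values and names are not from the ARM reference). Three bytes are written: lane 2 of each of the three registers, in register order.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t out[3];
+    int8x8x3_t val;
+    val.val[0] = vdup_n_s8(-1);  /* Vt  */
+    val.val[1] = vdup_n_s8(2);   /* Vt2 */
+    val.val[2] = vdup_n_s8(-3);  /* Vt3 */
+    vst3_lane_s8(out, val, 2);
+    printf("%d %d %d\n", out[0], out[1], out[2]);  /* prints "-1 2 -3" */
+    return 0;
+}</pre>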
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_u8" type="checkbox"><label for="vst3_lane_u8"><div>void <b>vst3_lane_u8</b> (uint8_t * ptr, uint8x8x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
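+ <h4>Example</h4> <p>A hypothetical snippet, not taken from the ARM reference, using the highest legal lane index (7) to store one RGB-style triple.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint8_t out[3];
+    uint8x8x3_t val;
+    val.val[0] = vdup_n_u8(1);  /* Vt  */
+    val.val[1] = vdup_n_u8(2);  /* Vt2 */
+    val.val[2] = vdup_n_u8(3);  /* Vt3 */
+    vst3_lane_u8(out, val, 7);  /* lane 7 is the last valid index */
+    printf("%d %d %d\n", out[0], out[1], out[2]);  /* prints "1 2 3" */
+    return 0;
+}</pre>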
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_p8" type="checkbox"><label for="vst3_lane_p8"><div>void <b>vst3_lane_p8</b> (poly8_t * ptr, poly8x8x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
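+ <h4>Example</h4> <p>An illustrative sketch (not from the ARM reference); lane 0 of each polynomial register is stored as three consecutive bytes.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly8_t out[3];
+    poly8x8x3_t val;
+    val.val[0] = vdup_n_p8(0x01);
+    val.val[1] = vdup_n_p8(0x02);
+    val.val[2] = vdup_n_p8(0x04);
+    vst3_lane_p8(out, val, 0);
+    printf("%02x %02x %02x\n", (unsigned)out[0], (unsigned)out[1], (unsigned)out[2]);  /* "01 02 04" */
+    return 0;
+}</pre>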
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_s8" type="checkbox"><label for="vst4_lane_s8"><div>void <b>vst4_lane_s8</b> (int8_t * ptr, int8x8x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8B <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
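+ <h4>Example</h4> <p>A minimal sketch with made-up values (not from the ARM reference): four bytes are written, one from lane 4 of each of the four registers.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t out[4];
+    int8x8x4_t val;
+    val.val[0] = vdup_n_s8(10);  /* Vt  */
+    val.val[1] = vdup_n_s8(20);  /* Vt2 */
+    val.val[2] = vdup_n_s8(30);  /* Vt3 */
+    val.val[3] = vdup_n_s8(40);  /* Vt4 */
+    vst4_lane_s8(out, val, 4);
+    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* "10 20 30 40" */
+    return 0;
+}</pre>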
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_u8" type="checkbox"><label for="vst4_lane_u8"><div>void <b>vst4_lane_u8</b> (uint8_t * ptr, uint8x8x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8B <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_p8" type="checkbox"><label for="vst4_lane_p8"><div>void <b>vst4_lane_p8</b> (poly8_t * ptr, poly8x8x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[3] &rarr; Vt4.8B
+val.val[2] &rarr; Vt3.8B
+val.val[1] &rarr; Vt2.8B
+val.val[0] &rarr; Vt.8B
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
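+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference: a minimal C usage example assuming an AArch64 toolchain with arm_neon.h.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store lane 2 of each of the four 8-byte vectors in val, writing the
+   4 interleaved bytes val.val[0][2] .. val.val[3][2] to out. */
+void store_lane2(poly8_t *out, poly8x8x4_t val) {
+    vst4_lane_p8(out, val, 2);  /* lane must be a constant in 0..7 */
+}</pre>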
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_s16" type="checkbox"><label for="vst2_lane_s16"><div>void <b>vst2_lane_s16</b> (int16_t * ptr, int16x4x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[1] &rarr; Vt2.4H
+val.val[0] &rarr; Vt.4H
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
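+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference: building the two source vectors and storing a single lane pair.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Write the pair {a[1], b[1]} as two consecutive int16_t values at out. */
+void store_pair(int16_t *out, int16x4_t a, int16x4_t b) {
+    int16x4x2_t val;
+    val.val[0] = a;
+    val.val[1] = b;
+    vst2_lane_s16(out, val, 1);  /* writes 4 bytes: a[1] then b[1] */
+}</pre>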
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_s16" type="checkbox"><label for="vst2q_lane_s16"><div>void <b>vst2q_lane_s16</b> (int16_t * ptr, int16x8x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[1] &rarr; Vt2.8H
+val.val[0] &rarr; Vt.8H
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
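+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference: the q-form reads from 128-bit registers, so lane may be as high as 7.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store the last (index 7) element pair of a deinterleaved vector pair. */
+void store_last_lane(int16_t *out, int16x8x2_t val) {
+    vst2q_lane_s16(out, val, 7);  /* writes val.val[0][7] then val.val[1][7] */
+}</pre>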
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_s32" type="checkbox"><label for="vst2_lane_s32"><div>void <b>vst2_lane_s32</b> (int32_t * ptr, int32x2x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[1] &rarr; Vt2.2S
+val.val[0] &rarr; Vt.2S
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
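+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference: with 32-bit elements in 64-bit vectors, lane can only be 0 or 1, and storing both lanes reproduces a full interleaving store.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Write {a[0], b[0], a[1], b[1]} to out; storing every lane this way
+   is equivalent to a single vst2_s32(out, val). */
+void store_interleaved(int32_t *out, int32x2x2_t val) {
+    vst2_lane_s32(out,     val, 0);  /* out[0] = a[0], out[1] = b[0] */
+    vst2_lane_s32(out + 2, val, 1);  /* out[2] = a[1], out[3] = b[1] */
+}</pre>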
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_s32" type="checkbox"><label for="vst2q_lane_s32"><div>void <b>vst2q_lane_s32</b> (int32_t * ptr, int32x4x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[1] &rarr; Vt2.4S
+val.val[0] &rarr; Vt.4S
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
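+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference: because lane must be a compile-time constant, a runtime index has to be dispatched explicitly (a switch is one common workaround).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* pts holds 4 x-coordinates in val[0] and 4 y-coordinates in val[1];
+   write point i (0..3) as a contiguous {x, y} pair. */
+void store_point(int32_t out[2], int32x4x2_t pts, int i) {
+    switch (i) {
+    case 0:  vst2q_lane_s32(out, pts, 0); break;
+    case 1:  vst2q_lane_s32(out, pts, 1); break;
+    case 2:  vst2q_lane_s32(out, pts, 2); break;
+    default: vst2q_lane_s32(out, pts, 3); break;
+    }
+}</pre>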
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_u16" type="checkbox"><label for="vst2_lane_u16"><div>void <b>vst2_lane_u16</b> (uint16_t * ptr, uint16x4x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[1] &rarr; Vt2.4H
+val.val[0] &rarr; Vt.4H
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
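+ <h4>Example</h4> <p>Editor's sketch, not part of the ARM reference: identical in shape to the signed form, shown with unsigned data.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Write the pair {a[3], b[3]} (4 bytes total) at out. */
+void store_lane3(uint16_t *out, uint16x4_t a, uint16x4_t b) {
+    uint16x4x2_t val = { { a, b } };
+    vst2_lane_u16(out, val, 3);
+}</pre>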
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_u16" type="checkbox"><label for="vst2q_lane_u16"><div>void <b>vst2q_lane_u16</b> (uint16_t * ptr, uint16x8x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_u32" type="checkbox"><label for="vst2_lane_u32"><div>void <b><b>vst2_lane_u32</b></b> (uint32_t * ptr, uint32x2x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_u32" type="checkbox"><label for="vst2q_lane_u32"><div>void <b><b>vst2q_lane_u32</b></b> (uint32_t * ptr, uint32x4x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_f16" type="checkbox"><label for="vst2_lane_f16"><div>void <b><b>vst2_lane_f16</b></b> (float16_t * ptr, float16x4x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_f16" type="checkbox"><label for="vst2q_lane_f16"><div>void <b><b>vst2q_lane_f16</b></b> (float16_t * ptr, float16x8x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_f32" type="checkbox"><label for="vst2_lane_f32"><div>void <b><b>vst2_lane_f32</b></b> (float32_t * ptr, float32x2x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_f32" type="checkbox"><label for="vst2q_lane_f32"><div>void <b><b>vst2q_lane_f32</b></b> (float32_t * ptr, float32x4x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.s - Vt2.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
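+ <h4>Example</h4> <p>A minimal usage sketch, assuming a C toolchain that provides &lt;arm_neon.h&gt;; the buffers and the chosen lane are illustrative only, not mandated by the intrinsic.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float32_t a[4] = {0.0f, 1.0f, 2.0f, 3.0f};    /* illustrative data */
+    float32_t b[4] = {10.0f, 11.0f, 12.0f, 13.0f};
+    float32x4x2_t val;
+    val.val[0] = vld1q_f32(a);
+    val.val[1] = vld1q_f32(b);
+    float32_t out[2];
+    /* store lane 2 of each register as one 2-element structure:
+       out = { a[2], b[2] } = { 2.0, 12.0 }; valid lanes are 0..3 */
+    vst2q_lane_f32(out, val, 2);
+    printf("%f %f\n", out[0], out[1]);
+    return 0;
+}</pre>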
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_p16" type="checkbox"><label for="vst2_lane_p16"><div>void <b><b>vst2_lane_p16</b></b> (poly16_t * ptr, poly16x4x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
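+ <h4>Example</h4> <p>As with the f32 variant above, a hedged sketch under the same &lt;arm_neon.h&gt; toolchain assumption; the polynomial bit patterns are arbitrary.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly16_t a[4] = {0x10, 0x11, 0x12, 0x13};   /* arbitrary polynomials */
+    poly16_t b[4] = {0x20, 0x21, 0x22, 0x23};
+    poly16x4x2_t val;
+    val.val[0] = vld1_p16(a);
+    val.val[1] = vld1_p16(b);
+    poly16_t out[2];
+    /* out = { a[1], b[1] } = { 0x11, 0x21 }; valid lanes are 0..3 */
+    vst2_lane_p16(out, val, 1);
+    printf("%#x %#x\n", (unsigned)out[0], (unsigned)out[1]);
+    return 0;
+}</pre>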
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_p16" type="checkbox"><label for="vst2q_lane_p16"><div>void <b><b>vst2q_lane_p16</b></b> (poly16_t * ptr, poly16x8x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.h - Vt2.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
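+ <h4>Example</h4> <p>An illustrative sketch under the same &lt;arm_neon.h&gt; assumption, using the q-form's wider 0..7 lane range; all values are placeholders.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly16_t a[8] = {0x100, 0x101, 0x102, 0x103, 0x104, 0x105, 0x106, 0x107};
+    poly16_t b[8] = {0x200, 0x201, 0x202, 0x203, 0x204, 0x205, 0x206, 0x207};
+    poly16x8x2_t val;
+    val.val[0] = vld1q_p16(a);
+    val.val[1] = vld1q_p16(b);
+    poly16_t out[2];
+    /* out = { a[7], b[7] } = { 0x107, 0x207 }; valid lanes are 0..7 */
+    vst2q_lane_p16(out, val, 7);
+    printf("%#x %#x\n", (unsigned)out[0], (unsigned)out[1]);
+    return 0;
+}</pre>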
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_s8" type="checkbox"><label for="vst2q_lane_s8"><div>void <b><b>vst2q_lane_s8</b></b> (int8_t * ptr, int8x16x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
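+ <h4>Example</h4> <p>A hedged sketch; since this byte-lane q-form is A64-only, it assumes an AArch64 target with &lt;arm_neon.h&gt;. The data and lane choice are illustrative.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t a[16], b[16], out[2];
+    for (int i = 0; i &lt; 16; i++) {   /* arbitrary test data */
+        a[i] = (int8_t)i;
+        b[i] = (int8_t)(i + 100);
+    }
+    int8x16x2_t val;
+    val.val[0] = vld1q_s8(a);
+    val.val[1] = vld1q_s8(b);
+    /* out = { a[15], b[15] } = { 15, 115 }; valid lanes are 0..15 */
+    vst2q_lane_s8(out, val, 15);
+    printf("%d %d\n", out[0], out[1]);
+    return 0;
+}</pre>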
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_u8" type="checkbox"><label for="vst2q_lane_u8"><div>void <b><b>vst2q_lane_u8</b></b> (uint8_t * ptr, uint8x16x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
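+ <h4>Example</h4> <p>As for the s8 form, an illustrative AArch64-only sketch assuming &lt;arm_neon.h&gt;; values are arbitrary.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint8_t a[16], b[16], out[2];
+    for (int i = 0; i &lt; 16; i++) {   /* arbitrary test data */
+        a[i] = (uint8_t)i;
+        b[i] = (uint8_t)(i + 200);
+    }
+    uint8x16x2_t val;
+    val.val[0] = vld1q_u8(a);
+    val.val[1] = vld1q_u8(b);
+    /* out = { a[8], b[8] } = { 8, 208 }; valid lanes are 0..15 */
+    vst2q_lane_u8(out, val, 8);
+    printf("%u %u\n", (unsigned)out[0], (unsigned)out[1]);
+    return 0;
+}</pre>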
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_p8" type="checkbox"><label for="vst2q_lane_p8"><div>void <b><b>vst2q_lane_p8</b></b> (poly8_t * ptr, poly8x16x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.b - Vt2.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
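+ <h4>Example</h4> <p>An illustrative AArch64-only sketch, again assuming &lt;arm_neon.h&gt;; the polynomial bytes are arbitrary.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly8_t a[16], b[16], out[2];
+    for (int i = 0; i &lt; 16; i++) {   /* arbitrary polynomials */
+        a[i] = (poly8_t)i;
+        b[i] = (poly8_t)(0x80 | i);
+    }
+    poly8x16x2_t val;
+    val.val[0] = vld1q_p8(a);
+    val.val[1] = vld1q_p8(b);
+    /* out = { a[3], b[3] } = { 0x3, 0x83 }; valid lanes are 0..15 */
+    vst2q_lane_p8(out, val, 3);
+    printf("%#x %#x\n", (unsigned)out[0], (unsigned)out[1]);
+    return 0;
+}</pre>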
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_s64" type="checkbox"><label for="vst2_lane_s64"><div>void <b><b>vst2_lane_s64</b></b> (int64_t * ptr, int64x1x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
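The Operation pseudocode above (repeated, with only the element size and lane bound changing, for every ST2 lane form in this section) does three things for a store: resolve the base address from Xn or SP, copy the selected esize-bit lane of each of the selem = 2 source registers into consecutive ebytes-sized memory slots, and optionally write the post-incremented address back for the post-index form. A minimal C model of just the MemOp_STORE path; the function name `st2_lane_store_model` is illustrative and not part of ACLE or of this file:

```c
#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Illustrative model of the MemOp_STORE branch of the ST2 (single
 * structure) pseudocode: copy lane `lane` of each of the two 128-bit
 * source registers into consecutive element-sized slots at `addr`. */
static void st2_lane_store_model(uint8_t *addr,
                                 const uint8_t regs[2][16], /* Vt, Vt2 */
                                 int lane, size_t ebytes)
{
    size_t offs = 0;
    for (int s = 0; s < 2; s++) {          /* for s = 0 to selem-1 */
        /* Mem[address+offs, ebytes] = Elem[V[t], lane, esize] */
        memcpy(addr + offs, &regs[s][(size_t)lane * ebytes], ebytes);
        offs += ebytes;                    /* offs = offs + ebytes */
    }
    /* The wback (post-index) form would now set Xn = address + offs;
     * that step is omitted here. */
}
```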
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_s64" type="checkbox"><label for="vst2q_lane_s64"><div>void <b><b>vst2q_lane_s64</b></b> (int64_t * ptr, int64x2x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
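As a usage sketch for vst2q_lane_s64, assuming an AArch64 toolchain that provides `arm_neon.h` (the buffer contents and the `main` harness are illustrative): one lane is taken from each of the two 128-bit registers and the pair is stored to consecutive `int64_t` slots.

```c
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int64x2x2_t val;
    val.val[0] = vdupq_n_s64(10);   /* Vt.2D  = {10, 10} */
    val.val[1] = vdupq_n_s64(20);   /* Vt2.2D = {20, 20} */

    int64_t out[2] = {0, 0};
    vst2q_lane_s64(out, val, 1);    /* lane must be 0 or 1 for .2D */

    /* out now holds {Vt[1], Vt2[1]} = {10, 20} */
    printf("%lld %lld\n", (long long)out[0], (long long)out[1]);
    return 0;
}
```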
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_u64" type="checkbox"><label for="vst2_lane_u64"><div>void <b><b>vst2_lane_u64</b></b> (uint64_t * ptr, uint64x1x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &lt;&lt; lane &lt;&lt; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
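For this 64-bit `.1D` form each source register holds a single element, so lane 0 is the only valid index, matching the `0 << lane << 0` constraint above. A hedged sketch; `store_pair_u64` is an illustrative name:

```c
#include <arm_neon.h>
#include <stdint.h>

void store_pair_u64(uint64_t out[2], uint64_t a, uint64_t b)
{
    uint64x1x2_t val;
    val.val[0] = vdup_n_u64(a);     /* Vt.1D  */
    val.val[1] = vdup_n_u64(b);     /* Vt2.1D */
    /* With one element per register, lane 0 is the only legal choice. */
    vst2_lane_u64(out, val, 0);     /* out = {a, b} */
}
```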
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_u64" type="checkbox"><label for="vst2q_lane_u64"><div>void <b><b>vst2q_lane_u64</b></b> (uint64_t * ptr, uint64x2x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_p64" type="checkbox"><label for="vst2_lane_p64"><div>void <b><b>vst2_lane_p64</b></b> (poly64_t * ptr, poly64x1x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &lt;&lt; lane &lt;&lt; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
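The poly64 variant behaves identically. The sketch below assumes a target and toolchain that expose the poly64 types (on many compilers this is gated on the crypto/AES extension); `store_pair_p64` is an illustrative name:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Assumes the poly64 types are available on this target. */
void store_pair_p64(poly64_t out[2], uint64_t a, uint64_t b)
{
    poly64x1x2_t val;
    val.val[0] = vcreate_p64(a);    /* Vt.1D  */
    val.val[1] = vcreate_p64(b);    /* Vt2.1D */
    vst2_lane_p64(out, val, 0);     /* lane 0 is the only valid index */
}
```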
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_p64" type="checkbox"><label for="vst2q_lane_p64"><div>void <b><b>vst2q_lane_p64</b></b> (poly64_t * ptr, poly64x2x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2_lane_f64" type="checkbox"><label for="vst2_lane_f64"><div>void <b><b>vst2_lane_f64</b></b> (float64_t * ptr, float64x1x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &lt;&lt; lane &lt;&lt; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
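A corresponding sketch for the float64 `.1D` form, storing one `{x, y}` pair with one element taken from each register; `store_pair_f64` is an illustrative name:

```c
#include <arm_neon.h>

void store_pair_f64(float64_t out[2], float64_t x, float64_t y)
{
    float64x1x2_t val;
    val.val[0] = vdup_n_f64(x);     /* Vt.1D  */
    val.val[1] = vdup_n_f64(y);     /* Vt2.1D */
    vst2_lane_f64(out, val, 0);     /* out = {x, y} */
}
```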
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst2q_lane_f64" type="checkbox"><label for="vst2q_lane_f64"><div>void <b><b>vst2q_lane_f64</b></b> (float64_t * ptr, float64x2x2_t val, const int lane)<span class="right">Store single 2-element structure from one lane of two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 2-element structure from one lane of two registers. This instruction stores a 2-element structure to memory from corresponding elements of two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st2-single-structure-store-single-2-element-structure-from-one-lane-of-two-registers">ST2</a> {Vt.d - Vt2.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &lt;&lt; lane &lt;&lt; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
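+ <h4>Usage Example</h4> <p>The following C sketch is illustrative only and is not part of the ARM reference; the function name, buffer, and values are hypothetical. It shows lane 1 of the two registers being stored to memory as an interleaved pair.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void example_vst2q_lane_f64(void) {
+    float64_t buf[2];
+    float64x2x2_t val;
+    val.val[0] = vdupq_n_f64(1.0); // lanes {1.0, 1.0}
+    val.val[1] = vdupq_n_f64(2.0); // lanes {2.0, 2.0}
+    // Store lane 1 of each register, interleaved: buf = {1.0, 2.0}.
+    vst2q_lane_f64(buf, val, 1);
+}
+</pre>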
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_s16" type="checkbox"><label for="vst3_lane_s16"><div>void <b><b>vst3_lane_s16</b></b> (int16_t * ptr, int16x4x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
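+ <h4>Usage Example</h4> <p>An illustrative C sketch, not part of the ARM reference (function name, buffer, and values are hypothetical): lane 2 of three 4-lane registers is stored as one contiguous 3-element structure.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void example_vst3_lane_s16(void) {
+    int16_t buf[3];
+    int16x4x3_t val;
+    val.val[0] = vdup_n_s16(10);
+    val.val[1] = vdup_n_s16(20);
+    val.val[2] = vdup_n_s16(30);
+    // Store lane 2 of each register, interleaved: buf = {10, 20, 30}.
+    vst3_lane_s16(buf, val, 2);
+}
+</pre>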
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_s16" type="checkbox"><label for="vst3q_lane_s16"><div>void <b><b>vst3q_lane_s16</b></b> (int16_t * ptr, int16x8x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
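+ <h4>Usage Example</h4> <p>An illustrative C sketch, not part of the ARM reference (names and values are hypothetical): with 8-lane q registers, any lane 0&ndash;7 may be selected; here lane 5 is stored.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void example_vst3q_lane_s16(void) {
+    int16_t buf[3];
+    int16x8x3_t val;
+    val.val[0] = vdupq_n_s16(1);
+    val.val[1] = vdupq_n_s16(2);
+    val.val[2] = vdupq_n_s16(3);
+    // Store lane 5 of each register, interleaved: buf = {1, 2, 3}.
+    vst3q_lane_s16(buf, val, 5);
+}
+</pre>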
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_s32" type="checkbox"><label for="vst3_lane_s32"><div>void <b><b>vst3_lane_s32</b></b> (int32_t * ptr, int32x2x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
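+ <h4>Usage Example</h4> <p>An illustrative C sketch, not part of the ARM reference (names and values are hypothetical): the 2-lane d registers allow only lane 0 or 1; here lane 1 is stored.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void example_vst3_lane_s32(void) {
+    int32_t buf[3];
+    int32x2x3_t val;
+    val.val[0] = vdup_n_s32(100);
+    val.val[1] = vdup_n_s32(200);
+    val.val[2] = vdup_n_s32(300);
+    // Store lane 1 of each register, interleaved: buf = {100, 200, 300}.
+    vst3_lane_s32(buf, val, 1);
+}
+</pre>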
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_s32" type="checkbox"><label for="vst3q_lane_s32"><div>void <b><b>vst3q_lane_s32</b></b> (int32_t * ptr, int32x4x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
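+ <h4>Usage Example</h4> <p>An illustrative C sketch, not part of the ARM reference (names and values are hypothetical): lane 3, the highest valid lane of the 4-lane q registers, is stored.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void example_vst3q_lane_s32(void) {
+    int32_t buf[3];
+    int32x4x3_t val;
+    val.val[0] = vdupq_n_s32(1);
+    val.val[1] = vdupq_n_s32(2);
+    val.val[2] = vdupq_n_s32(3);
+    // Store lane 3 of each register, interleaved: buf = {1, 2, 3}.
+    vst3q_lane_s32(buf, val, 3);
+}
+</pre>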
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_u16" type="checkbox"><label for="vst3_lane_u16"><div>void <b><b>vst3_lane_u16</b></b> (uint16_t * ptr, uint16x4x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
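+ <h4>Usage Example</h4> <p>An illustrative C sketch, not part of the ARM reference (names and values are hypothetical): the unsigned variant behaves like the signed one, differing only in element type; lane 0 is stored.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void example_vst3_lane_u16(void) {
+    uint16_t buf[3];
+    uint16x4x3_t val;
+    val.val[0] = vdup_n_u16(1);
+    val.val[1] = vdup_n_u16(2);
+    val.val[2] = vdup_n_u16(3);
+    // Store lane 0 of each register, interleaved: buf = {1, 2, 3}.
+    vst3_lane_u16(buf, val, 0);
+}
+</pre>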
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_u16" type="checkbox"><label for="vst3q_lane_u16"><div>void <b><b>vst3q_lane_u16</b></b> (uint16_t * ptr, uint16x8x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst3_lane_u32</b> (uint32_t * ptr, uint32x2x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre>ST3 {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst3q_lane_u32</b> (uint32_t * ptr, uint32x4x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre>ST3 {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst3_lane_f16</b> (float16_t * ptr, float16x4x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre>ST3 {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst3q_lane_f16</b> (float16_t * ptr, float16x8x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre>ST3 {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst3_lane_f32</b> (float32_t * ptr, float32x2x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre>ST3 {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst3q_lane_f32</b> (float32_t * ptr, float32x4x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre>ST3 {Vt.s - Vt3.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_p16" type="checkbox"><label for="vst3_lane_p16"><div>void <b>vst3_lane_p16</b> (poly16_t * ptr, poly16x4x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &le; lane &le; 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
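+ <h4>Usage example</h4> <p>The following C sketch is an editor's addition rather than part of the upstream ARM documentation; it assumes an AArch64 target where &lt;arm_neon.h&gt; is available. The vector contents and the lane index 2 are arbitrary illustrative choices.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* Three 4-lane poly16 vectors, each filled with a distinct pattern. */
+    poly16x4x3_t val;
+    val.val[0] = vdup_n_p16(0x1111);
+    val.val[1] = vdup_n_p16(0x2222);
+    val.val[2] = vdup_n_p16(0x3333);
+    poly16_t out[3] = {0, 0, 0};
+    /* lane must be a compile-time constant in 0..3 for 4-lane vectors. */
+    vst3_lane_p16(out, val, 2);
+    /* Lane 2 of each register lands contiguously: 1111 2222 3333. */
+    printf("%04x %04x %04x\n", (unsigned)out[0], (unsigned)out[1], (unsigned)out[2]);
+    return 0;
+}</pre>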
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_p16" type="checkbox"><label for="vst3q_lane_p16"><div>void <b>vst3q_lane_p16</b> (poly16_t * ptr, poly16x8x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.h - Vt3.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
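+ <h4>Usage example</h4> <p>An editor's sketch (not from the upstream documentation) for the q-form variant, assuming an AArch64 target: the only differences from vst3_lane_p16 are the 8-lane source vectors and the wider lane range 0..7.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly16x8x3_t val;
+    val.val[0] = vdupq_n_p16(0xaaaa);
+    val.val[1] = vdupq_n_p16(0xbbbb);
+    val.val[2] = vdupq_n_p16(0xcccc);
+    poly16_t out[3] = {0, 0, 0};
+    vst3q_lane_p16(out, val, 5);   /* lane 5 of each 8-lane register */
+    printf("%04x %04x %04x\n", (unsigned)out[0], (unsigned)out[1], (unsigned)out[2]);
+    return 0;
+}</pre>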
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_s8" type="checkbox"><label for="vst3q_lane_s8"><div>void <b>vst3q_lane_s8</b> (int8_t * ptr, int8x16x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
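+ <h4>Usage example</h4> <p>An editor's illustrative sketch, not part of the upstream documentation, assuming an AArch64 target: three int8x16_t registers hold distinct values, and lane 7 of each is stored as one contiguous 3-byte structure.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8x16x3_t val;
+    val.val[0] = vdupq_n_s8(10);
+    val.val[1] = vdupq_n_s8(20);
+    val.val[2] = vdupq_n_s8(30);
+    int8_t out[3] = {0, 0, 0};
+    vst3q_lane_s8(out, val, 7);   /* lane must be a constant in 0..15 */
+    printf("%d %d %d\n", out[0], out[1], out[2]);   /* prints: 10 20 30 */
+    return 0;
+}</pre>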
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_u8" type="checkbox"><label for="vst3q_lane_u8"><div>void <b>vst3q_lane_u8</b> (uint8_t * ptr, uint8x16x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
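+ <h4>Usage example</h4> <p>A toy example added by the editor (not upstream material), assuming an AArch64 target: if three uint8x16_t registers hold the R, G and B channels of 16 pixels, vst3q_lane_u8 writes one pixel's channels back as interleaved RGB bytes.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint8x16x3_t channels;
+    channels.val[0] = vdupq_n_u8(200);  /* R plane */
+    channels.val[1] = vdupq_n_u8(100);  /* G plane */
+    channels.val[2] = vdupq_n_u8(50);   /* B plane */
+    uint8_t rgb[3];
+    vst3q_lane_u8(rgb, channels, 0);    /* pixel 0 stored as {R, G, B} */
+    printf("R=%u G=%u B=%u\n", (unsigned)rgb[0], (unsigned)rgb[1], (unsigned)rgb[2]);
+    return 0;
+}</pre>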
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_p8" type="checkbox"><label for="vst3q_lane_p8"><div>void <b>vst3q_lane_p8</b> (poly8_t * ptr, poly8x16x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.b - Vt3.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
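+ <h4>Usage example</h4> <p>An editor's sketch, not part of the upstream documentation, assuming an AArch64 target: the poly8 variant behaves like the u8 one apart from the element type; here the highest lane, 15, is stored.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly8x16x3_t val;
+    val.val[0] = vdupq_n_p8(0x11);
+    val.val[1] = vdupq_n_p8(0x22);
+    val.val[2] = vdupq_n_p8(0x33);
+    poly8_t out[3];
+    vst3q_lane_p8(out, val, 15);   /* highest valid lane for 16-lane vectors */
+    printf("%02x %02x %02x\n", (unsigned)out[0], (unsigned)out[1], (unsigned)out[2]);
+    return 0;
+}</pre>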
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_s64" type="checkbox"><label for="vst3_lane_s64"><div>void <b>vst3_lane_s64</b> (int64_t * ptr, int64x1x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
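+ <h4>Usage example</h4> <p>An editor's sketch, not upstream material: vst3_lane_s64 operates on three single-lane int64x1_t registers, so 0 is the only valid lane. As the article notes, this variant is A64-only.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;inttypes.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64x1x3_t val;
+    val.val[0] = vdup_n_s64(-1);
+    val.val[1] = vdup_n_s64(0);
+    val.val[2] = vdup_n_s64(1);
+    int64_t out[3];
+    vst3_lane_s64(out, val, 0);   /* lane 0 is the only legal value */
+    printf("%" PRId64 " %" PRId64 " %" PRId64 "\n", out[0], out[1], out[2]);
+    return 0;
+}</pre>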
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_s64" type="checkbox"><label for="vst3q_lane_s64"><div>void <b>vst3q_lane_s64</b> (int64_t * ptr, int64x2x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
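+ <h4>Example</h4> <p class="aml">Editorial note, not part of the ARM reference: a minimal C sketch of vst3q_lane_s64, assuming an AArch64 toolchain providing &lt;arm_neon.h&gt;; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store lane 1 of each of the three 128-bit registers as one
+   contiguous 3-element structure in memory. */
+void store_lane1_s64(int64_t out[3]) {
+    int64x2x3_t v;
+    v.val[0] = vdupq_n_s64(10);  /* lanes {10, 10} */
+    v.val[1] = vdupq_n_s64(20);  /* lanes {20, 20} */
+    v.val[2] = vdupq_n_s64(30);  /* lanes {30, 30} */
+    vst3q_lane_s64(out, v, 1);   /* out = {10, 20, 30} */
+}</pre>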
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_u64" type="checkbox"><label for="vst3_lane_u64"><div>void <b>vst3_lane_u64</b> (uint64_t * ptr, uint64x1x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
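+ <h4>Example</h4> <p class="aml">Editorial note, not part of the ARM reference: a minimal C sketch, assuming an AArch64 toolchain with &lt;arm_neon.h&gt;. Each uint64x1_t register holds a single element, so 0 is the only valid lane; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Gather one u64 from each of three 64-bit registers into a
+   contiguous triple. */
+void store_u64_triple(uint64_t out[3]) {
+    uint64x1x3_t v;
+    v.val[0] = vcreate_u64(1);
+    v.val[1] = vcreate_u64(2);
+    v.val[2] = vcreate_u64(3);
+    vst3_lane_u64(out, v, 0);  /* out = {1, 2, 3} */
+}</pre>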
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_u64" type="checkbox"><label for="vst3q_lane_u64"><div>void <b>vst3q_lane_u64</b> (uint64_t * ptr, uint64x2x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
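+ <h4>Example</h4> <p class="aml">Editorial note, not part of the ARM reference: a C sketch showing that each lane of the three registers yields its own interleaved triple (AArch64 with &lt;arm_neon.h&gt; assumed; the helper name is illustrative).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Lane 0 and lane 1 of the three registers hold independent data;
+   each store writes one 3-element structure. */
+void store_both_lanes_u64(uint64_t lo[3], uint64_t hi[3]) {
+    uint64x2x3_t v;
+    v.val[0] = vsetq_lane_u64(100, vdupq_n_u64(0), 1);  /* {0, 100} */
+    v.val[1] = vsetq_lane_u64(200, vdupq_n_u64(0), 1);  /* {0, 200} */
+    v.val[2] = vsetq_lane_u64(300, vdupq_n_u64(0), 1);  /* {0, 300} */
+    vst3q_lane_u64(lo, v, 0);  /* lo = {0, 0, 0} */
+    vst3q_lane_u64(hi, v, 1);  /* hi = {100, 200, 300} */
+}</pre>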
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_p64" type="checkbox"><label for="vst3_lane_p64"><div>void <b>vst3_lane_p64</b> (poly64_t * ptr, poly64x1x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
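+ <h4>Example</h4> <p class="aml">Editorial note, not part of the ARM reference: a C sketch assuming an AArch64 toolchain with &lt;arm_neon.h&gt;; poly64 types may additionally require a crypto-capable target (e.g. -march=armv8-a+crypto), depending on the compiler. The helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store one polynomial element from each of three 64-bit registers;
+   lane must be 0 for the 64-bit variant. */
+void store_p64_triple(poly64_t out[3]) {
+    poly64x1x3_t v;
+    v.val[0] = vdup_n_p64((poly64_t)0x11);
+    v.val[1] = vdup_n_p64((poly64_t)0x22);
+    v.val[2] = vdup_n_p64((poly64_t)0x33);
+    vst3_lane_p64(out, v, 0);  /* out = {0x11, 0x22, 0x33} */
+}</pre>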
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_p64" type="checkbox"><label for="vst3q_lane_p64"><div>void <b>vst3q_lane_p64</b> (poly64_t * ptr, poly64x2x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
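+ <h4>Example</h4> <p class="aml">Editorial note, not part of the ARM reference: a C sketch (AArch64 with &lt;arm_neon.h&gt; and a crypto-capable target assumed) that builds poly64 vectors by reinterpreting uint64 vectors; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Reinterpret u64 vectors as poly64, then store lane 1 of the three
+   registers as one contiguous 3-element structure. */
+void store_p64q_lane1(poly64_t out[3]) {
+    poly64x2x3_t v;
+    v.val[0] = vreinterpretq_p64_u64(vdupq_n_u64(0xAA));
+    v.val[1] = vreinterpretq_p64_u64(vdupq_n_u64(0xBB));
+    v.val[2] = vreinterpretq_p64_u64(vdupq_n_u64(0xCC));
+    vst3q_lane_p64(out, v, 1);  /* out = {0xAA, 0xBB, 0xCC} */
+}</pre>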
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3_lane_f64" type="checkbox"><label for="vst3_lane_f64"><div>void <b>vst3_lane_f64</b> (float64_t * ptr, float64x1x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
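+ <h4>Example</h4> <p class="aml">Editorial note, not part of the ARM reference: a minimal C sketch (AArch64 with &lt;arm_neon.h&gt; assumed). A float64x1_t register holds a single double, so lane must be 0; the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Pack one double from each of three 64-bit registers into a
+   contiguous triple. */
+void store_f64_triple(float64_t out[3]) {
+    float64x1x3_t v;
+    v.val[0] = vdup_n_f64(1.5);
+    v.val[1] = vdup_n_f64(2.5);
+    v.val[2] = vdup_n_f64(3.5);
+    vst3_lane_f64(out, v, 0);  /* out = {1.5, 2.5, 3.5} */
+}</pre>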
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst3q_lane_f64" type="checkbox"><label for="vst3q_lane_f64"><div>void <b>vst3q_lane_f64</b> (float64_t * ptr, float64x2x3_t val, const int lane)<span class="right">Store single 3-element structure from one lane of three registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 3-element structure from one lane of three registers. This instruction stores a 3-element structure to memory from corresponding elements of three SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st3-single-structure-store-single-3-element-structure-from-one-lane-of-three-registers">ST3</a> {Vt.d - Vt3.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
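+<p>The non-replicate store path above can be modelled in plain C. The helper below is an illustrative sketch only (the function name and register layout are hypothetical, not part of the architecture or of arm_neon.h): it copies one element of ebytes bytes from lane index of each of selem consecutive registers to consecutive memory.</p>
+<pre class="codeblock">#include &lt;stdint.h&gt;
+#include &lt;string.h&gt;
+
+/* Sketch of the memop == MemOp_STORE loop: one element per register,
+   written back-to-back starting at `address`. */
+static void sim_st_lane(uint8_t *address,
+                        const uint8_t regs[][16],  /* V[t] .. V[t+selem-1] */
+                        int selem, int ebytes, int index)
+{
+    uint64_t offs = 0;
+    for (int s = 0; s &lt; selem; s++) {
+        /* Mem[address+offs, ebytes] = Elem[V[t+s], index, esize] */
+        memcpy(address + offs, &amp;regs[s][index * ebytes], ebytes);
+        offs += ebytes;
+    }
+}</pre>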
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_s16" type="checkbox"><label for="vst4_lane_s16"><div>void <b>vst4_lane_s16</b> (int16_t * ptr, int16x4x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
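+<p>As a usage illustration (hypothetical values and output buffer; any C compiler with arm_neon.h), the call below stores lane 2 of each of the four registers as one contiguous 4-element structure:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16x4x4_t val;
+    val.val[0] = vdup_n_s16(10);
+    val.val[1] = vdup_n_s16(20);
+    val.val[2] = vdup_n_s16(30);
+    val.val[3] = vdup_n_s16(40);
+
+    int16_t out[4];
+    vst4_lane_s16(out, val, 2);   /* lane must be a constant in 0..3 */
+
+    /* out == {10, 20, 30, 40}: lane 2 of each register, interleaved */
+    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
+    return 0;
+}</pre>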
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_s16" type="checkbox"><label for="vst4q_lane_s16"><div>void <b>vst4q_lane_s16</b> (int16_t * ptr, int16x8x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
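+<p>A short, hypothetical sketch of the q-form: vld4q_s16 deinterleaves eight 4-element structures into four 8H registers, and vst4q_lane_s16 re-emits a single one of them (lane 5 here; the q-form accepts lanes 0..7):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy structure #5 out of an interleaved block of 8 structures. */
+void copy_struct5(const int16_t in[32], int16_t out[4]) {
+    int16x8x4_t v = vld4q_s16(in);  /* deinterleave 8 structures   */
+    vst4q_lane_s16(out, v, 5);      /* store structure #5 verbatim */
+}</pre>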
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_s32" type="checkbox"><label for="vst4_lane_s32"><div>void <b>vst4_lane_s32</b> (int32_t * ptr, int32x2x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
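+<p>The s32 form writes exactly 4 x 4 = 16 bytes and leaves neighbouring memory untouched, which the sketch below (hypothetical buffer and values) checks with plain asserts:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;assert.h&gt;
+#include &lt;string.h&gt;
+
+void demo(void) {
+    int32x2x4_t v;
+    v.val[0] = vdup_n_s32(1);
+    v.val[1] = vdup_n_s32(2);
+    v.val[2] = vdup_n_s32(3);
+    v.val[3] = vdup_n_s32(4);
+
+    int32_t buf[6];
+    memset(buf, 0, sizeof buf);
+    vst4_lane_s32(buf + 1, v, 0);            /* writes exactly 16 bytes */
+
+    assert(buf[0] == 0 &amp;&amp; buf[5] == 0);  /* neighbours untouched    */
+    assert(buf[1] == 1 &amp;&amp; buf[4] == 4);  /* structure laid in order */
+}</pre>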
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_s32" type="checkbox"><label for="vst4q_lane_s32"><div>void <b>vst4q_lane_s32</b> (int32_t * ptr, int32x4x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
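+<p>The lane argument must be an integer constant expression, so a runtime lane has to be dispatched explicitly. A minimal sketch, assuming the lane is known to be in 0..3:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void store_lane(int32_t *out, int32x4x4_t v, int lane) {
+    switch (lane) {              /* each case uses a constant lane */
+    case 0:  vst4q_lane_s32(out, v, 0); break;
+    case 1:  vst4q_lane_s32(out, v, 1); break;
+    case 2:  vst4q_lane_s32(out, v, 2); break;
+    default: vst4q_lane_s32(out, v, 3); break;
+    }
+}</pre>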
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_u16" type="checkbox"><label for="vst4_lane_u16"><div>void <b>vst4_lane_u16</b> (uint16_t * ptr, uint16x4x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
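+<p>The unsigned form emits the same ST4 as its signed counterpart; if the data happens to arrive in signed vectors, it can be bridged with vreinterpret, as in this hypothetical sketch:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void store_as_u16(uint16_t *out, int16x4_t a, int16x4_t b,
+                  int16x4_t c, int16x4_t d) {
+    uint16x4x4_t v;
+    v.val[0] = vreinterpret_u16_s16(a);
+    v.val[1] = vreinterpret_u16_s16(b);
+    v.val[2] = vreinterpret_u16_s16(c);
+    v.val[3] = vreinterpret_u16_s16(d);
+    vst4_lane_u16(out, v, 1);    /* lane 1 of each register */
+}</pre>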
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_u16" type="checkbox"><label for="vst4q_lane_u16"><div>void <b>vst4q_lane_u16</b> (uint16_t * ptr, uint16x8x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_u32" type="checkbox"><label for="vst4_lane_u32"><div>void <b><b>vst4_lane_u32</b></b> (uint32_t * ptr, uint32x2x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_u32" type="checkbox"><label for="vst4q_lane_u32"><div>void <b><b>vst4q_lane_u32</b></b> (uint32_t * ptr, uint32x4x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_f16" type="checkbox"><label for="vst4_lane_f16"><div>void <b><b>vst4_lane_f16</b></b> (float16_t * ptr, float16x4x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_f16" type="checkbox"><label for="vst4q_lane_f16"><div>void <b><b>vst4q_lane_f16</b></b> (float16_t * ptr, float16x8x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_f32" type="checkbox"><label for="vst4_lane_f32"><div>void <b><b>vst4_lane_f32</b></b> (float32_t * ptr, float32x2x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_f32" type="checkbox"><label for="vst4q_lane_f32"><div>void <b><b>vst4q_lane_f32</b></b> (float32_t * ptr, float32x4x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.s - Vt4.s}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_p16" type="checkbox"><label for="vst4_lane_p16"><div>void <b><b>vst4_lane_p16</b></b> (poly16_t * ptr, poly16x4x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_p16" type="checkbox"><label for="vst4q_lane_p16"><div>void <b><b>vst4q_lane_p16</b></b> (poly16_t * ptr, poly16x8x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.h - Vt4.h}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H <br />
+0 &le; lane &le; 7 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_s8" type="checkbox"><label for="vst4q_lane_s8"><div>void <b><b>vst4q_lane_s8</b></b> (int8_t * ptr, int8x16x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_u8" type="checkbox"><label for="vst4q_lane_u8"><div>void <b><b>vst4q_lane_u8</b></b> (uint8_t * ptr, uint8x16x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_p8" type="checkbox"><label for="vst4q_lane_p8"><div>void <b><b>vst4q_lane_p8</b></b> (poly8_t * ptr, poly8x16x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.b - Vt4.b}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B <br />
+0 &le; lane &le; 15 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_s64" type="checkbox"><label for="vst4_lane_s64"><div>void <b><b>vst4_lane_s64</b></b> (int64_t * ptr, int64x1x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_s64" type="checkbox"><label for="vst4q_lane_s64"><div>void <b><b>vst4q_lane_s64</b></b> (int64_t * ptr, int64x2x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_u64" type="checkbox"><label for="vst4_lane_u64"><div>void <b><b>vst4_lane_u64</b></b> (uint64_t * ptr, uint64x1x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
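+<p>For illustration only, a minimal C sketch of this operation (not part of the ARM reference; the buffer name and values are hypothetical, and an AArch64 target with NEON intrinsics support is assumed). With the .1D arrangement the only valid lane is 0, and the store writes that lane of each of the four registers contiguously to ptr.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint64_t buf[4] = {0, 0, 0, 0};  /* destination for the 4-element structure */
+    uint64x1x4_t val;                /* four 1-lane registers */
+    val.val[0] = vdup_n_u64(1);
+    val.val[1] = vdup_n_u64(2);
+    val.val[2] = vdup_n_u64(3);
+    val.val[3] = vdup_n_u64(4);
+    vst4_lane_u64(buf, val, 0);      /* lane must be 0 for a .1D arrangement */
+    /* buf now holds {1, 2, 3, 4}: lane 0 of val.val[0..3], stored in order */
+    printf("%llu %llu %llu %llu\n",
+           (unsigned long long)buf[0], (unsigned long long)buf[1],
+           (unsigned long long)buf[2], (unsigned long long)buf[3]);
+    return 0;
+}</pre>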
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_u64" type="checkbox"><label for="vst4q_lane_u64"><div>void <b><b>vst4q_lane_u64</b></b> (uint64_t * ptr, uint64x2x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_p64" type="checkbox"><label for="vst4_lane_p64"><div>void <b><b>vst4_lane_p64</b></b> (poly64_t * ptr, poly64x1x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_p64" type="checkbox"><label for="vst4q_lane_p64"><div>void <b><b>vst4q_lane_p64</b></b> (poly64_t * ptr, poly64x2x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4_lane_f64" type="checkbox"><label for="vst4_lane_f64"><div>void <b><b>vst4_lane_f64</b></b> (float64_t * ptr, float64x1x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D <br />
+0 &le; lane &le; 0 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst4q_lane_f64" type="checkbox"><label for="vst4q_lane_f64"><div>void <b><b>vst4q_lane_f64</b></b> (float64_t * ptr, float64x2x4_t val, const int lane)<span class="right">Store single 4-element structure from one lane of four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store single 4-element structure from one lane of four registers. This instruction stores a 4-element structure to memory from corresponding elements of four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st4-single-structure-store-single-4-element-structure-from-one-lane-of-four-registers">ST4</a> {Vt.d - Vt4.d}[lane],[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D <br />
+0 &le; lane &le; 1 </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1_s8_x2" type="checkbox"><label for="vst1_s8_x2"><div>void <b><b>vst1_s8_x2</b></b> (int8_t * ptr, int8x8x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores every element of two SIMD&amp;FP registers to consecutive memory locations, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s8_x2" type="checkbox"><label for="vst1q_s8_x2"><div>void <b><b>vst1q_s8_x2</b></b> (int8_t * ptr, int8x16x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores every element of two SIMD&amp;FP registers to consecutive memory locations, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s16_x2" type="checkbox"><label for="vst1_s16_x2"><div>void <b><b>vst1_s16_x2</b></b> (int16_t * ptr, int16x4x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores every element of two SIMD&amp;FP registers to consecutive memory locations, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s16_x2" type="checkbox"><label for="vst1q_s16_x2"><div>void <b><b>vst1q_s16_x2</b></b> (int16_t * ptr, int16x8x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores every element of two SIMD&amp;FP registers to consecutive memory locations, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s32_x2" type="checkbox"><label for="vst1_s32_x2"><div>void <b><b>vst1_s32_x2</b></b> (int32_t * ptr, int32x2x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores every element of two SIMD&amp;FP registers to consecutive memory locations, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s32_x2" type="checkbox"><label for="vst1q_s32_x2"><div>void <b><b>vst1q_s32_x2</b></b> (int32_t * ptr, int32x4x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores every element of two SIMD&amp;FP registers to consecutive memory locations, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u8_x2" type="checkbox"><label for="vst1_u8_x2"><div>void <b><b>vst1_u8_x2</b></b> (uint8_t * ptr, uint8x8x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores every element of two SIMD&amp;FP registers to consecutive memory locations, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
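+ <h4>Example</h4><p class="aml">A minimal C usage sketch, not part of the ARM reference data: it assumes an AArch64 compiler whose &lt;arm_neon.h&gt; declares this intrinsic, and the helper name store_two_u8_vectors is hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Store two 8-byte vectors to 16 contiguous bytes at dst:
+   val.val[0] lands at dst[0..7], val.val[1] at dst[8..15]. */
+void store_two_u8_vectors(uint8_t *dst, uint8x8_t a, uint8x8_t b)
+{
+    uint8x8x2_t pair;
+    pair.val[0] = a;
+    pair.val[1] = b;
+    vst1_u8_x2(dst, pair);
+}</pre>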
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u8_x2" type="checkbox"><label for="vst1q_u8_x2"><div>void <b>vst1q_u8_x2</b> (uint8_t * ptr, uint8x16x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving; every element of each register is stored.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
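+ <h4>Example</h4><p class="aml">A minimal C usage sketch (an assumption, not ARM reference data): fill32 is a hypothetical helper that writes 32 identical bytes with a single two-register store.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Fill dst[0..31] with the byte b using one store of two q-registers. */
+void fill32(uint8_t *dst, uint8_t b)
+{
+    uint8x16x2_t v;
+    v.val[0] = vdupq_n_u8(b);
+    v.val[1] = vdupq_n_u8(b);
+    vst1q_u8_x2(dst, v);
+}</pre>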
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u16_x2" type="checkbox"><label for="vst1_u16_x2"><div>void <b>vst1_u16_x2</b> (uint16_t * ptr, uint16x4x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving; every element of each register is stored.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
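+ <h4>Example</h4><p class="aml">A minimal C round-trip sketch (an assumption, not ARM reference data): copy8_u16 is a hypothetical helper that pairs this store with the matching vld1_u16_x2 load.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Copy 8 contiguous halfwords: one two-register load, one two-register store. */
+void copy8_u16(uint16_t *dst, const uint16_t *src)
+{
+    uint16x4x2_t v = vld1_u16_x2(src);
+    vst1_u16_x2(dst, v);
+}</pre>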
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u16_x2" type="checkbox"><label for="vst1q_u16_x2"><div>void <b>vst1q_u16_x2</b> (uint16_t * ptr, uint16x8x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving; every element of each register is stored.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
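+ <h4>Example</h4><p class="aml">A minimal C usage sketch (an assumption, not ARM reference data): widen_and_store is a hypothetical helper that zero-extends 16 bytes to halfwords and stores the two resulting q-registers contiguously.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Zero-extend 16 bytes to 16 halfwords and store them to dst[0..15]. */
+void widen_and_store(uint16_t *dst, uint8x16_t bytes)
+{
+    uint16x8x2_t v;
+    v.val[0] = vmovl_u8(vget_low_u8(bytes));   /* bytes 0..7  */
+    v.val[1] = vmovl_u8(vget_high_u8(bytes));  /* bytes 8..15 */
+    vst1q_u16_x2(dst, v);
+}</pre>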
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u32_x2" type="checkbox"><label for="vst1_u32_x2"><div>void <b>vst1_u32_x2</b> (uint32_t * ptr, uint32x2x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving; every element of each register is stored.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
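+ <h4>Example</h4><p class="aml">A minimal C usage sketch (an assumption, not ARM reference data): store4_u32 is a hypothetical helper storing two d-register halves to four contiguous words.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* dst[0..1] = lo, dst[2..3] = hi, in one two-register store. */
+void store4_u32(uint32_t *dst, uint32x2_t lo, uint32x2_t hi)
+{
+    uint32x2x2_t v = { { lo, hi } };
+    vst1_u32_x2(dst, v);
+}</pre>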
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u32_x2" type="checkbox"><label for="vst1q_u32_x2"><div>void <b>vst1q_u32_x2</b> (uint32_t * ptr, uint32x4x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving; every element of each register is stored.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
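+ <h4>Example</h4><p class="aml">A minimal C usage sketch (an assumption, not ARM reference data): store_iota8 is a hypothetical helper that writes the words 0..7 with one call.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Write 0,1,...,7 to dst[0..7] via two q-registers. */
+void store_iota8(uint32_t *dst)
+{
+    const uint32_t lo[4] = { 0, 1, 2, 3 };
+    const uint32_t hi[4] = { 4, 5, 6, 7 };
+    uint32x4x2_t v;
+    v.val[0] = vld1q_u32(lo);
+    v.val[1] = vld1q_u32(hi);
+    vst1q_u32_x2(dst, v);
+}</pre>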
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f16_x2" type="checkbox"><label for="vst1_f16_x2"><div>void <b>vst1_f16_x2</b> (float16_t * ptr, float16x4x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving; every element of each register is stored.
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f16_x2" type="checkbox"><label for="vst1q_f16_x2"><div>void <b>vst1q_f16_x2</b> (float16_t * ptr, float16x8x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
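+ <h4>Usage example</h4> <p class="aml">A minimal C sketch of a call to this intrinsic, assuming an AArch64 toolchain whose arm_neon.h supports float16_t (the FP16 extension); the helper name is illustrative.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store val's two 8H vectors contiguously: out[0..15] receive 16 halves,
+   val.val[0] first, then val.val[1]. */
+void store_two_q_f16(float16_t *out, float16x8x2_t val) {
+    vst1q_f16_x2(out, val);
+}
+</pre>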
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f32_x2" type="checkbox"><label for="vst1_f32_x2"><div>void <b>vst1_f32_x2</b> (float32_t * ptr, float32x2x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
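+ <h4>Usage example</h4> <p class="aml">A short, hedged C sketch (helper name ours) of packing two d-registers into the x2 structure and storing them, assuming a standard arm_neon.h.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Write lo then hi to four consecutive floats starting at out. */
+void store_pair_f32(float32_t *out, float32x2_t lo, float32x2_t hi) {
+    float32x2x2_t pair = { { lo, hi } };  /* pair.val[0] = lo, pair.val[1] = hi */
+    vst1_f32_x2(out, pair);
+}
+</pre>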
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f32_x2" type="checkbox"><label for="vst1q_f32_x2"><div>void <b>vst1q_f32_x2</b> (float32_t * ptr, float32x4x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
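+ <h4>Usage example</h4> <p class="aml">One plausible use, sketched in C: copying eight floats with the matching x2 load; this assumes arm_neon.h also declares vld1q_f32_x2, and the function name is illustrative.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy eight floats (two 4S vectors) from src to dst. */
+void copy8_f32(float32_t *dst, const float32_t *src) {
+    float32x4x2_t v = vld1q_f32_x2(src);
+    vst1q_f32_x2(dst, v);
+}
+</pre>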
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p8_x2" type="checkbox"><label for="vst1_p8_x2"><div>void <b>vst1_p8_x2</b> (poly8_t * ptr, poly8x8x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
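+ <h4>Usage example</h4> <p class="aml">A minimal C sketch, with an illustrative helper name, assuming an AArch64 arm_neon.h that provides the polynomial vector types.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Store a's eight poly8 lanes, then b's, at out (16 bytes total). */
+void store_two_d_p8(poly8_t *out, poly8x8_t a, poly8x8_t b) {
+    poly8x8x2_t pair = { { a, b } };
+    vst1_p8_x2(out, pair);
+}
+</pre>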
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p8_x2" type="checkbox"><label for="vst1q_p8_x2"><div>void <b>vst1q_p8_x2</b> (poly8_t * ptr, poly8x16x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
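+ <h4>Usage example</h4> <p class="aml">A hedged C sketch passing the x2 structure straight through; the wrapper name is ours, and a standard arm_neon.h is assumed.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Write all 32 poly8 lanes of val (val.val[0] first) to out. */
+void store_two_q_p8(poly8_t *out, poly8x16x2_t val) {
+    vst1q_p8_x2(out, val);
+}
+</pre>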
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p16_x2" type="checkbox"><label for="vst1_p16_x2"><div>void <b>vst1_p16_x2</b> (poly16_t * ptr, poly16x4x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
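+ <h4>Usage example</h4> <p class="aml">A small C sketch under the same assumptions as the earlier examples (standard arm_neon.h, illustrative helper name).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* out[0..7] receive a's four poly16 lanes followed by b's. */
+void store_two_d_p16(poly16_t *out, poly16x4_t a, poly16x4_t b) {
+    poly16x4x2_t pair = { { a, b } };
+    vst1_p16_x2(out, pair);
+}
+</pre>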
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p16_x2" type="checkbox"><label for="vst1q_p16_x2"><div>void <b>vst1q_p16_x2</b> (poly16_t * ptr, poly16x8x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s64_x2" type="checkbox"><label for="vst1_s64_x2"><div>void <b>vst1_s64_x2</b> (int64_t * ptr, int64x1x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u64_x2" type="checkbox"><label for="vst1_u64_x2"><div>void <b>vst1_u64_x2</b> (uint64_t * ptr, uint64x1x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p64_x2" type="checkbox"><label for="vst1_p64_x2"><div>void <b>vst1_p64_x2</b> (poly64_t * ptr, poly64x1x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s64_x2" type="checkbox"><label for="vst1q_s64_x2"><div>void <b>vst1q_s64_x2</b> (int64_t * ptr, int64x2x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u64_x2" type="checkbox"><label for="vst1q_u64_x2"><div>void <b>vst1q_u64_x2</b> (uint64_t * ptr, uint64x2x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p64_x2" type="checkbox"><label for="vst1q_p64_x2"><div>void <b>vst1q_p64_x2</b> (poly64_t * ptr, poly64x2x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
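+<p class="aml">Editorial note (not part of the ARM reference): the write-back tail above advances the base register past the transferred data. A minimal Rust sketch of just that rule, with hypothetical names, assuming the accumulated byte count and the Xm value are already known:</p>
+<pre class="codeblock">// Post-index write-back: the new base is address + offs, where offs is
+// the number of bytes just transferred, or register Xm when m != 31.
+fn writeback(address: u64, byte_count: u64, m: u32, xm: u64) -> u64 {
+    let offs = if m != 31 { xm } else { byte_count };
+    address.wrapping_add(offs)
+}</pre>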
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f64_x2" type="checkbox"><label for="vst1_f64_x2"><div>void <b><b>vst1_f64_x2</b></b> (float64_t * ptr, float64x1x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
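+<p class="aml">Editorial note (not part of the ARM reference): Rust's standard library exposes this intrinsic as core::arch::aarch64::vst1_f64_x2. A minimal usage sketch, assuming an AArch64 target with NEON enabled (the default for std targets):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn store_two_f64() {
+    use core::arch::aarch64::*;
+    let mut out = [0.0f64; 2];
+    unsafe {
+        // float64x1x2_t is a pair of one-lane f64 vectors.
+        let val = float64x1x2_t(vdup_n_f64(1.0), vdup_n_f64(2.0));
+        // ST1 {Vt.1D - Vt2.1D},[Xn]: val.0 then val.1 land in consecutive memory.
+        vst1_f64_x2(out.as_mut_ptr(), val);
+    }
+    assert_eq!(out, [1.0, 2.0]);
+}</pre>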
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f64_x2" type="checkbox"><label for="vst1q_f64_x2"><div>void <b><b>vst1q_f64_x2</b></b> (float64_t * ptr, float64x2x2_t val)<span class="right">Store multiple single-element structures from two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from two registers. This instruction stores elements to memory from two SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
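+<p class="aml">Editorial note (not part of the ARM reference): the Operation block above is shared decode for the whole single-structure LD1/ST1 family; for these store intrinsics only the memop == MemOp_STORE path with replicate false applies. A Rust model of just that loop, with hypothetical fixed sizes chosen for illustration:</p>
+<pre class="codeblock">// 32 vector registers of 16 bytes each; a 64-byte memory window.
+fn st1_store_model(mut mem: [u8; 64], regs: [[u8; 16]; 32],
+                   mut t: usize, index: usize, ebytes: usize, selem: usize) -> [u8; 64] {
+    let mut offs = 0;
+    for _s in 0..selem {
+        // extract from one lane: ebytes bytes of lane `index` of register t
+        for b in 0..ebytes {
+            mem[offs + b] = regs[t][index * ebytes + b];
+        }
+        offs += ebytes;    // offs = offs + ebytes
+        t = (t + 1) % 32;  // t = (t + 1) MOD 32
+    }
+    mem
+}</pre>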
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1_s8_x3" type="checkbox"><label for="vst1_s8_x3"><div>void <b><b>vst1_s8_x3</b></b> (int8_t * ptr, int8x8x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
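+<p class="aml">Editorial note (not part of the ARM reference): the corresponding Rust intrinsic is core::arch::aarch64::vst1_s8_x3. A minimal usage sketch under the same assumptions as above:</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn store_three_s8() {
+    use core::arch::aarch64::*;
+    let mut out = [0i8; 24];
+    unsafe {
+        // int8x8x3_t is three 8-lane i8 vectors, 24 bytes in total.
+        let val = int8x8x3_t(vdup_n_s8(1), vdup_n_s8(2), vdup_n_s8(3));
+        // ST1 {Vt.8B - Vt3.8B},[Xn]: the three registers land back to back.
+        vst1_s8_x3(out.as_mut_ptr(), val);
+    }
+    assert_eq!(out[0], 1);
+    assert_eq!(out[8], 2);
+    assert_eq!(out[16], 3);
+}</pre>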
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s8_x3" type="checkbox"><label for="vst1q_s8_x3"><div>void <b><b>vst1q_s8_x3</b></b> (int8_t * ptr, int8x16x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s16_x3" type="checkbox"><label for="vst1_s16_x3"><div>void <b><b>vst1_s16_x3</b></b> (int16_t * ptr, int16x4x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s16_x3" type="checkbox"><label for="vst1q_s16_x3"><div>void <b><b>vst1q_s16_x3</b></b> (int16_t * ptr, int16x8x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+            // insert into one lane of 128-bit register
+            Elem[rval, index, esize] = Mem[address+offs, ebytes, AccType_VEC];
+            V[t] = rval;
+        else // memop == MemOp_STORE
+            // extract from one lane of 128-bit register
+            Mem[address+offs, ebytes, AccType_VEC] = Elem[rval, index, esize];
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+
+if wback then
+    if m != 31 then
+        offs = X[m];
+    if n == 31 then
+        SP[] = address + offs;
+    else
+        X[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s32_x3" type="checkbox"><label for="vst1_s32_x3"><div>void <b><b>vst1_s32_x3</b></b> (int32_t * ptr, int32x2x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if HaveMTEExt() then
+    SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+CheckFPAdvSIMDEnabled64();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+    CheckSPAlignment();
+    address = SP[];
+else
+    address = X[n];
+
+offs = Zeros();
+if replicate then
+    // load and replicate to all elements
+    for s = 0 to selem-1
+        element = Mem[address+offs, ebytes, AccType_VEC];
+        // replicate to fill 128- or 64-bit register
+        V[t] = Replicate(element, datasize DIV esize);
+        offs = offs + ebytes;
+        t = (t + 1) MOD 32;
+else
+    // load/store one element per register
+    for s = 0 to selem-1
+        rval = V[t];
+        if memop == MemOp_LOAD then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s32_x3" type="checkbox"><label for="vst1q_s32_x3"><div>void <b><b>vst1q_s32_x3</b></b> (int32_t * ptr, int32x4x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u8_x3" type="checkbox"><label for="vst1_u8_x3"><div>void <b><b>vst1_u8_x3</b></b> (uint8_t * ptr, uint8x8x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u8_x3" type="checkbox"><label for="vst1q_u8_x3"><div>void <b><b>vst1q_u8_x3</b></b> (uint8_t * ptr, uint8x16x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u16_x3" type="checkbox"><label for="vst1_u16_x3"><div>void <b><b>vst1_u16_x3</b></b> (uint16_t * ptr, uint16x4x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u16_x3" type="checkbox"><label for="vst1q_u16_x3"><div>void <b><b>vst1q_u16_x3</b></b> (uint16_t * ptr, uint16x8x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u32_x3" type="checkbox"><label for="vst1_u32_x3"><div>void <b><b>vst1_u32_x3</b></b> (uint32_t * ptr, uint32x2x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst1q_u32_x3</b> (uint32_t * ptr, uint32x4x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[2] &rarr; Vt3.4S
+val.val[1] &rarr; Vt2.4S
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst1_f16_x3</b> (float16_t * ptr, float16x4x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[2] &rarr; Vt3.4H
+val.val[1] &rarr; Vt2.4H
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst1q_f16_x3</b> (float16_t * ptr, float16x8x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[2] &rarr; Vt3.8H
+val.val[1] &rarr; Vt2.8H
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst1_f32_x3</b> (float32_t * ptr, float32x2x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[2] &rarr; Vt3.2S
+val.val[1] &rarr; Vt2.2S
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst1q_f32_x3</b> (float32_t * ptr, float32x4x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[2] &rarr; Vt3.4S
+val.val[1] &rarr; Vt2.4S
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><div>void <b>vst1_p8_x3</b> (poly8_t * ptr, poly8x8x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn
+val.val[2] &rarr; Vt3.8B
+val.val[1] &rarr; Vt2.8B
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p8_x3" type="checkbox"><label for="vst1q_p8_x3"><div>void <b><b>vst1q_p8_x3</b></b> (poly8_t * ptr, poly8x16x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p16_x3" type="checkbox"><label for="vst1_p16_x3"><div>void <b><b>vst1_p16_x3</b></b> (poly16_t * ptr, poly16x4x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p16_x3" type="checkbox"><label for="vst1q_p16_x3"><div>void <b><b>vst1q_p16_x3</b></b> (poly16_t * ptr, poly16x8x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s64_x3" type="checkbox"><label for="vst1_s64_x3"><div>void <b><b>vst1_s64_x3</b></b> (int64_t * ptr, int64x1x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u64_x3" type="checkbox"><label for="vst1_u64_x3"><div>void <b><b>vst1_u64_x3</b></b> (uint64_t * ptr, uint64x1x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p64_x3" type="checkbox"><label for="vst1_p64_x3"><div>void <b><b>vst1_p64_x3</b></b> (poly64_t * ptr, poly64x1x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s64_x3" type="checkbox"><label for="vst1q_s64_x3"><div>void <b><b>vst1q_s64_x3</b></b> (int64_t * ptr, int64x2x3_t val)<span class="right">Store multiple single-element structures from three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from three registers. This instruction stores elements to memory from three SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u64_x3" type="checkbox"><label for="vst1q_u64_x3"><div>void <b><b>vst1q_u64_x3</b></b> (uint64_t * ptr, uint64x2x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p64_x3" type="checkbox"><label for="vst1q_p64_x3"><div>void <b><b>vst1q_p64_x3</b></b> (poly64_t * ptr, poly64x2x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f64_x3" type="checkbox"><label for="vst1_f64_x3"><div>void <b><b>vst1_f64_x3</b></b> (float64_t * ptr, float64x1x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f64_x3" type="checkbox"><label for="vst1q_f64_x3"><div>void <b><b>vst1q_f64_x3</b></b> (float64_t * ptr, float64x2x3_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1_s8_x4" type="checkbox"><label for="vst1_s8_x4"><div>void <b><b>vst1_s8_x4</b></b> (int8_t * ptr, int8x8x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8B <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s8_x4" type="checkbox"><label for="vst1q_s8_x4"><div>void <b><b>vst1q_s8_x4</b></b> (int8_t * ptr, int8x16x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s16_x4" type="checkbox"><label for="vst1_s16_x4"><div>void <b><b>vst1_s16_x4</b></b> (int16_t * ptr, int16x4x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
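+ <h4>Usage sketch</h4><p>A minimal C example, assuming an ACLE-conformant arm_neon.h; the helper name and fill values are illustrative:</p><pre>#include &lt;arm_neon.h&gt;
+
+/* Store 16 int16_t (four 4-lane vectors) contiguously at dst. */
+void fill16_s16(int16_t dst[16]) {
+    int16x4x4_t v;
+    v.val[0] = vdup_n_s16(0);
+    v.val[1] = vdup_n_s16(1);
+    v.val[2] = vdup_n_s16(2);
+    v.val[3] = vdup_n_s16(3);
+    vst1_s16_x4(dst, v);  /* all four registers are stored, per the listing above */
+}</pre>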
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s16_x4" type="checkbox"><label for="vst1q_s16_x4"><div>void <b><b>vst1q_s16_x4</b></b> (int16_t * ptr, int16x8x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
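+ <h4>Usage sketch</h4><p>A short C example under the same assumptions (ACLE arm_neon.h; the matching vld1q_s16_x4 load is assumed available); the function name is illustrative:</p><pre>#include &lt;arm_neon.h&gt;
+
+/* Copy 32 contiguous int16_t with one x4 load and one x4 store. */
+void copy32_s16(int16_t *dst, const int16_t *src) {
+    int16x8x4_t v = vld1q_s16_x4(src);  /* four 128-bit registers */
+    vst1q_s16_x4(dst, v);               /* ST1 {Vt.8H - Vt4.8H},[Xn] */
+}</pre>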
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s32_x4" type="checkbox"><label for="vst1_s32_x4"><div>void <b><b>vst1_s32_x4</b></b> (int32_t * ptr, int32x2x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
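+ <h4>Usage sketch</h4><p>A minimal C example, assuming ACLE arm_neon.h; the helper name and values are illustrative:</p><pre>#include &lt;arm_neon.h&gt;
+
+/* Write {0,0,1,1,2,2,3,3}: each 2-lane vector holds one value. */
+void ramp8_s32(int32_t dst[8]) {
+    int32x2x4_t v;
+    v.val[0] = vdup_n_s32(0);
+    v.val[1] = vdup_n_s32(1);
+    v.val[2] = vdup_n_s32(2);
+    v.val[3] = vdup_n_s32(3);
+    vst1_s32_x4(dst, v);  /* stores all four 64-bit registers back to back */
+}</pre>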
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s32_x4" type="checkbox"><label for="vst1q_s32_x4"><div>void <b><b>vst1q_s32_x4</b></b> (int32_t * ptr, int32x4x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
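+ <h4>Usage sketch</h4><p>A hedged C example (ACLE arm_neon.h assumed; vld1q_s32_x4 is the matching load); the function name is illustrative:</p><pre>#include &lt;arm_neon.h&gt;
+
+/* Copy 16 contiguous int32_t (64 bytes) in one load/store pair. */
+void copy16_s32(int32_t *dst, const int32_t *src) {
+    int32x4x4_t v = vld1q_s32_x4(src);
+    vst1q_s32_x4(dst, v);  /* ST1 {Vt.4S - Vt4.4S},[Xn] */
+}</pre>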
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u8_x4" type="checkbox"><label for="vst1_u8_x4"><div>void <b><b>vst1_u8_x4</b></b> (uint8_t * ptr, uint8x8x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8B <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
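+ <h4>Usage sketch</h4><p>A minimal C example, assuming ACLE arm_neon.h; names and byte values are illustrative:</p><pre>#include &lt;arm_neon.h&gt;
+
+/* Fill 32 bytes: 8 bytes of 0x00, 8 of 0x11, 8 of 0x22, 8 of 0x33. */
+void fill32_u8(uint8_t dst[32]) {
+    uint8x8x4_t v;
+    v.val[0] = vdup_n_u8(0x00);
+    v.val[1] = vdup_n_u8(0x11);
+    v.val[2] = vdup_n_u8(0x22);
+    v.val[3] = vdup_n_u8(0x33);
+    vst1_u8_x4(dst, v);
+}</pre>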
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u8_x4" type="checkbox"><label for="vst1q_u8_x4"><div>void <b><b>vst1q_u8_x4</b></b> (uint8_t * ptr, uint8x16x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
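+ <h4>Usage sketch</h4><p>A short C example under the same assumptions (ACLE arm_neon.h; vld1q_u8_x4 assumed as the matching load); the function name is illustrative:</p><pre>#include &lt;arm_neon.h&gt;
+
+/* Copy one 64-byte block with a single x4 load/store pair. */
+void copy64_u8(uint8_t *dst, const uint8_t *src) {
+    uint8x16x4_t v = vld1q_u8_x4(src);
+    vst1q_u8_x4(dst, v);  /* ST1 {Vt.16B - Vt4.16B},[Xn] */
+}</pre>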
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u16_x4" type="checkbox"><label for="vst1_u16_x4"><div>void <b><b>vst1_u16_x4</b></b> (uint16_t * ptr, uint16x4x4_t val)<span class="right">Store multiple single-element structures from one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from one, two, three, or four registers. This instruction stores elements to memory from one, two, three, or four SIMD&amp;FP registers, without interleaving. Every element of each register is stored.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u16_x4" type="checkbox"><label for="vst1q_u16_x4"><div>void <b>vst1q_u16_x4</b> (uint16_t * ptr, uint16x8x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u32_x4" type="checkbox"><label for="vst1_u32_x4"><div>void <b>vst1_u32_x4</b> (uint32_t * ptr, uint32x2x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u32_x4" type="checkbox"><label for="vst1q_u32_x4"><div>void <b>vst1q_u32_x4</b> (uint32_t * ptr, uint32x4x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f16_x4" type="checkbox"><label for="vst1_f16_x4"><div>void <b>vst1_f16_x4</b> (float16_t * ptr, float16x4x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f16_x4" type="checkbox"><label for="vst1q_f16_x4"><div>void <b>vst1q_f16_x4</b> (float16_t * ptr, float16x8x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f32_x4" type="checkbox"><label for="vst1_f32_x4"><div>void <b>vst1_f32_x4</b> (float32_t * ptr, float32x2x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2S <br />
+val.val[2] &rarr; Vt3.2S <br />
+val.val[1] &rarr; Vt2.2S <br />
+val.val[0] &rarr; Vt.2S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f32_x4" type="checkbox"><label for="vst1q_f32_x4"><div>void <b>vst1q_f32_x4</b> (float32_t * ptr, float32x4x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4S <br />
+val.val[2] &rarr; Vt3.4S <br />
+val.val[1] &rarr; Vt2.4S <br />
+val.val[0] &rarr; Vt.4S </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
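+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch of typical use; the helper name and the 16-float buffer size are illustrative assumptions rather than part of the ARM reference. The matching vld1q_f32_x4 load fills the four-vector structure that the store then writes out contiguously.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy 16 floats with one four-register load and one four-register store. */
+void copy16_f32(float32_t *dst, const float32_t *src)
+{
+    float32x4x4_t v = vld1q_f32_x4(src); /* v.val[0..3] = src[0..15] */
+    vst1q_f32_x4(dst, v);                /* dst[0..15] = v.val[0..3] */
+}</pre>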
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p8_x4" type="checkbox"><label for="vst1_p8_x4"><div>void <b>vst1_p8_x4</b> (poly8_t * ptr, poly8x8x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8B <br />
+val.val[2] &rarr; Vt3.8B <br />
+val.val[1] &rarr; Vt2.8B <br />
+val.val[0] &rarr; Vt.8B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
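+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch; the helper name and the 32-byte buffer size are illustrative assumptions, not part of the ARM reference. Each of the four 64-bit vectors holds eight poly8 elements.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy 32 poly8 values, four 8-byte vectors at a time. */
+void copy32_p8(poly8_t *dst, const poly8_t *src)
+{
+    poly8x8x4_t v = vld1_p8_x4(src); /* v.val[0..3] = src[0..31] */
+    vst1_p8_x4(dst, v);
+}</pre>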
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p8_x4" type="checkbox"><label for="vst1q_p8_x4"><div>void <b>vst1q_p8_x4</b> (poly8_t * ptr, poly8x16x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.16B <br />
+val.val[2] &rarr; Vt3.16B <br />
+val.val[1] &rarr; Vt2.16B <br />
+val.val[0] &rarr; Vt.16B </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
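+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch under the same illustrative assumptions as the examples above: four 128-bit vectors carry 64 poly8 elements in total.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy 64 poly8 values, four 16-byte vectors at a time. */
+void copy64_p8(poly8_t *dst, const poly8_t *src)
+{
+    poly8x16x4_t v = vld1q_p8_x4(src); /* v.val[0..3] = src[0..63] */
+    vst1q_p8_x4(dst, v);
+}</pre>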
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p16_x4" type="checkbox"><label for="vst1_p16_x4"><div>void <b>vst1_p16_x4</b> (poly16_t * ptr, poly16x4x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.4H <br />
+val.val[2] &rarr; Vt3.4H <br />
+val.val[1] &rarr; Vt2.4H <br />
+val.val[0] &rarr; Vt.4H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
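+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch; the helper name and the 16-element buffer size are illustrative assumptions. Each 64-bit vector holds four poly16 elements.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy 16 poly16 values, four 4-element vectors at a time. */
+void copy16_p16(poly16_t *dst, const poly16_t *src)
+{
+    poly16x4x4_t v = vld1_p16_x4(src); /* v.val[0..3] = src[0..15] */
+    vst1_p16_x4(dst, v);
+}</pre>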
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p16_x4" type="checkbox"><label for="vst1q_p16_x4"><div>void <b>vst1q_p16_x4</b> (poly16_t * ptr, poly16x8x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.8H <br />
+val.val[2] &rarr; Vt3.8H <br />
+val.val[1] &rarr; Vt2.8H <br />
+val.val[0] &rarr; Vt.8H </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
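+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch under the same illustrative assumptions: four 128-bit vectors carry 32 poly16 elements in total.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy 32 poly16 values, four 8-element vectors at a time. */
+void copy32_p16(poly16_t *dst, const poly16_t *src)
+{
+    poly16x8x4_t v = vld1q_p16_x4(src); /* v.val[0..3] = src[0..31] */
+    vst1q_p16_x4(dst, v);
+}</pre>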
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_s64_x4" type="checkbox"><label for="vst1_s64_x4"><div>void <b>vst1_s64_x4</b> (int64_t * ptr, int64x1x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
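+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch; the helper name and the 4-element buffer size are illustrative assumptions. Each int64x1_t vector holds a single 64-bit lane, so the x4 form moves four int64_t values.</p>
+ <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy 4 int64_t values, one per 64-bit vector. */
+void copy4_s64(int64_t *dst, const int64_t *src)
+{
+    int64x1x4_t v = vld1_s64_x4(src); /* v.val[0..3] = src[0..3] */
+    vst1_s64_x4(dst, v);
+}</pre>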
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_u64_x4" type="checkbox"><label for="vst1_u64_x4"><div>void <b>vst1_u64_x4</b> (uint64_t * ptr, uint64x1x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
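+ <h4>Example</h4><p>An editorial usage sketch, not taken from the ARM reference: it assumes a C toolchain with NEON support and a caller-supplied buffer dst valid for four uint64_t elements.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Editorial sketch: store four one-lane uint64 vectors to 4 consecutive
+   uint64_t slots (32 bytes) starting at dst (assumed valid). */
+void store_u64_x4(uint64_t *dst) {
+    uint64x1x4_t v;
+    v.val[0] = vdup_n_u64(10);
+    v.val[1] = vdup_n_u64(11);
+    v.val[2] = vdup_n_u64(12);
+    v.val[3] = vdup_n_u64(13);
+    vst1_u64_x4(dst, v);  /* dst[0..3] = 10, 11, 12, 13 */
+}</pre>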
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_p64_x4" type="checkbox"><label for="vst1_p64_x4"><div>void <b><b>vst1_p64_x4</b></b> (poly64_t * ptr, poly64x1x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
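+ <h4>Example</h4><p>An editorial usage sketch, not taken from the ARM reference: it assumes a C toolchain with the polynomial (AES/crypto) extension enabled and a buffer dst valid for four poly64_t elements.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Editorial sketch: store four one-lane poly64 vectors to 4 consecutive
+   poly64_t slots (32 bytes) starting at dst (assumed valid). */
+void store_p64_x4(poly64_t *dst) {
+    poly64x1x4_t v;
+    v.val[0] = vdup_n_p64((poly64_t)0x1);
+    v.val[1] = vdup_n_p64((poly64_t)0x2);
+    v.val[2] = vdup_n_p64((poly64_t)0x4);
+    v.val[3] = vdup_n_p64((poly64_t)0x8);
+    vst1_p64_x4(dst, v);
+}</pre>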
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_s64_x4" type="checkbox"><label for="vst1q_s64_x4"><div>void <b><b>vst1q_s64_x4</b></b> (int64_t * ptr, int64x2x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
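+ <h4>Example</h4><p>Editorial sketch, not part of the ARM reference; assumes NEON support and a buffer dst valid for eight int64_t elements.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Editorial sketch: store four two-lane int64 vectors to 8 consecutive
+   int64_t slots (64 bytes) starting at dst (assumed valid). */
+void store_s64q_x4(int64_t *dst) {
+    int64x2x4_t v;
+    v.val[0] = vdupq_n_s64(-1);
+    v.val[1] = vdupq_n_s64(0);
+    v.val[2] = vdupq_n_s64(1);
+    v.val[3] = vdupq_n_s64(2);
+    vst1q_s64_x4(dst, v);  /* dst = {-1,-1, 0,0, 1,1, 2,2} */
+}</pre>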
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_u64_x4" type="checkbox"><label for="vst1q_u64_x4"><div>void <b><b>vst1q_u64_x4</b></b> (uint64_t * ptr, uint64x2x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
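+ <h4>Example</h4><p>Editorial sketch, not part of the ARM reference; it pairs the store with the matching vld1q_u64_x4 load (assumed available in the same arm_neon.h) to copy 64 bytes. src and dst are assumed valid for eight uint64_t elements each.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Editorial sketch: copy eight uint64_t values (64 bytes) from src to dst
+   with one four-register load followed by one four-register store. */
+void copy_u64x8(uint64_t *dst, const uint64_t *src) {
+    uint64x2x4_t v = vld1q_u64_x4(src);
+    vst1q_u64_x4(dst, v);
+}</pre>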
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1q_p64_x4" type="checkbox"><label for="vst1q_p64_x4"><div>void <b><b>vst1q_p64_x4</b></b> (poly64_t * ptr, poly64x2x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
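+ <h4>Example</h4><p>Editorial sketch, not part of the ARM reference; assumes the polynomial (AES/crypto) extension and a buffer dst valid for eight poly64_t elements.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Editorial sketch: store four two-lane poly64 vectors to 8 consecutive
+   poly64_t slots (64 bytes) starting at dst (assumed valid). */
+void store_p64q_x4(poly64_t *dst) {
+    poly64x2x4_t v;
+    v.val[0] = vdupq_n_p64((poly64_t)0x11);
+    v.val[1] = vdupq_n_p64((poly64_t)0x22);
+    v.val[2] = vdupq_n_p64((poly64_t)0x33);
+    v.val[3] = vdupq_n_p64((poly64_t)0x44);
+    vst1q_p64_x4(dst, v);
+}</pre>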
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vst1_f64_x4" type="checkbox"><label for="vst1_f64_x4"><div>void <b><b>vst1_f64_x4</b></b> (float64_t * ptr, float64x1x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.1D <br />
+val.val[2] &rarr; Vt3.1D <br />
+val.val[1] &rarr; Vt2.1D <br />
+val.val[0] &rarr; Vt.1D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
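+ <h4>Example</h4><p>Editorial sketch, not part of the ARM reference; this intrinsic is A64-only, and dst is assumed valid for four float64_t elements.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Editorial sketch: store four one-lane float64 vectors to 4 consecutive
+   float64_t slots (32 bytes) starting at dst (assumed valid). */
+void store_f64_x4(float64_t *dst) {
+    float64x1x4_t v;
+    v.val[0] = vdup_n_f64(0.5);
+    v.val[1] = vdup_n_f64(1.5);
+    v.val[2] = vdup_n_f64(2.5);
+    v.val[3] = vdup_n_f64(3.5);
+    vst1_f64_x4(dst, v);  /* dst[0..3] = 0.5, 1.5, 2.5, 3.5 */
+}</pre>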
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vst1q_f64_x4" type="checkbox"><label for="vst1q_f64_x4"><div>void <b><b>vst1q_f64_x4</b></b> (float64_t * ptr, float64x2x4_t val)<span class="right">Store multiple single-element structures from four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Store multiple single-element structures from four registers. This instruction stores elements to memory from four SIMD&amp;FP registers, without interleaving.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/st1-single-structure-store-a-single-element-structure-from-one-lane-of-one-register">ST1</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val.val[3] &rarr; Vt4.2D <br />
+val.val[2] &rarr; Vt3.2D <br />
+val.val[1] &rarr; Vt2.2D <br />
+val.val[0] &rarr; Vt.2D </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1_s8_x2" type="checkbox"><label for="vld1_s8_x2"><div>int8x8x2_t <b><b>vld1_s8_x2</b></b> (int8_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
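+ <h4>Usage Example</h4><p>A minimal C sketch, not part of the ARM reference (the helper name and buffer are illustrative), assuming an AArch64 target and &lt;arm_neon.h&gt;; it shows how the two returned vectors map to consecutive memory:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative only: val[0] receives buf[0..7], val[1] receives buf[8..15]. */
+int8_t demo_vld1_s8_x2(const int8_t buf[16]) {
+    int8x8x2_t r = vld1_s8_x2(buf);
+    return vget_lane_s8(r.val[1], 0); /* first element of the second vector: buf[8] */
+}</pre>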
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s8_x2" type="checkbox"><label for="vld1q_s8_x2"><div>int8x16x2_t <b><b>vld1q_s8_x2</b></b> (int8_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
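+ <h4>Usage Example</h4><p>A short C sketch under the same assumptions as above (illustrative helper, &lt;arm_neon.h&gt;, AArch64 target); the q-form fills two 16-byte vectors from 32 consecutive bytes:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative only: val[0] receives buf[0..15], val[1] receives buf[16..31]. */
+int8_t demo_vld1q_s8_x2(const int8_t buf[32]) {
+    int8x16x2_t r = vld1q_s8_x2(buf);
+    return vgetq_lane_s8(r.val[1], 0); /* buf[16] */
+}</pre>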
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s16_x2" type="checkbox"><label for="vld1_s16_x2"><div>int16x4x2_t <b><b>vld1_s16_x2</b></b> (int16_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
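+ <h4>Usage Example</h4><p>An illustrative C sketch (same assumptions as the examples above); the 16-bit form loads eight consecutive halfwords into two 4-element vectors:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative only: val[0] receives buf[0..3], val[1] receives buf[4..7]. */
+int16_t demo_vld1_s16_x2(const int16_t buf[8]) {
+    int16x4x2_t r = vld1_s16_x2(buf);
+    return vget_lane_s16(r.val[1], 0); /* buf[4] */
+}</pre>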
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s16_x2" type="checkbox"><label for="vld1q_s16_x2"><div>int16x8x2_t <b><b>vld1q_s16_x2</b></b> (int16_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
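+ <h4>Usage Example</h4><p>An illustrative C sketch (assumptions as above); the q-form loads sixteen consecutive halfwords into two 8-element vectors:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative only: val[0] receives buf[0..7], val[1] receives buf[8..15]. */
+int16_t demo_vld1q_s16_x2(const int16_t buf[16]) {
+    int16x8x2_t r = vld1q_s16_x2(buf);
+    return vgetq_lane_s16(r.val[1], 0); /* buf[8] */
+}</pre>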
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s32_x2" type="checkbox"><label for="vld1_s32_x2"><div>int32x2x2_t <b><b>vld1_s32_x2</b></b> (int32_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
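+ <h4>Usage Example</h4><p>An illustrative C sketch (assumptions as above); the 32-bit form loads four consecutive words into two 2-element vectors:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Illustrative only: val[0] receives buf[0..1], val[1] receives buf[2..3]. */
+int32_t demo_vld1_s32_x2(const int32_t buf[4]) {
+    int32x2x2_t r = vld1_s32_x2(buf);
+    return vget_lane_s32(r.val[1], 0); /* buf[2] */
+}</pre>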
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s32_x2" type="checkbox"><label for="vld1q_s32_x2"><div>int32x4x2_t <b><b>vld1q_s32_x2</b></b> (int32_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u8_x2" type="checkbox"><label for="vld1_u8_x2"><div>uint8x8x2_t <b><b>vld1_u8_x2</b></b> (uint8_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
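+ <h4>Usage example</h4><p>A minimal C sketch, assuming a toolchain whose arm_neon.h provides this intrinsic (AArch64, or ARMv7 with NEON); the 16-byte buffer and its contents are illustrative only.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    /* 16 consecutive bytes: val[0] receives buf[0..7], val[1] receives
+       buf[8..15] (a contiguous two-register load, not a de-interleave). */
+    uint8_t buf[16];
+    for (int i = 0; i &lt; 16; i++) buf[i] = (uint8_t)i;
+
+    uint8x8x2_t v = vld1_u8_x2(buf);
+
+    printf("%u %u\n", (unsigned)vget_lane_u8(v.val[0], 0),   /* prints 0 */
+                      (unsigned)vget_lane_u8(v.val[1], 0));  /* prints 8 */
+    return 0;
+}</pre>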
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u8_x2" type="checkbox"><label for="vld1q_u8_x2"><div>uint8x16x2_t <b><b>vld1q_u8_x2</b></b> (uint8_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
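+ <h4>Usage example</h4><p>A minimal C sketch: one two-register load feeding two ordinary single-register stores to copy 32 bytes. The function name and the assumption that the buffers do not overlap are illustrative, not part of the reference above.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Copy 32 bytes; dst and src are assumed not to overlap. */
+static void copy32(uint8_t *dst, const uint8_t *src) {
+    uint8x16x2_t v = vld1q_u8_x2(src);  /* one LD1 {Vt.16B - Vt2.16B} */
+    vst1q_u8(dst,      v.val[0]);       /* bytes 0..15  */
+    vst1q_u8(dst + 16, v.val[1]);       /* bytes 16..31 */
+}</pre>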
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u16_x2" type="checkbox"><label for="vld1_u16_x2"><div>uint16x4x2_t <b><b>vld1_u16_x2</b></b> (uint16_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
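+ <h4>Usage example</h4><p>A minimal C sketch adding the two loaded halves lane-wise; the helper name and the eight-element input shape are assumptions made for illustration.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* out[i] = in[i] + in[i + 4] for i = 0..3 (modulo 2^16). */
+static uint16x4_t add_halves(const uint16_t in[8]) {
+    uint16x4x2_t v = vld1_u16_x2(in);   /* val[0] = in[0..3], val[1] = in[4..7] */
+    return vadd_u16(v.val[0], v.val[1]);
+}</pre>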
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u16_x2" type="checkbox"><label for="vld1q_u16_x2"><div>uint16x8x2_t <b><b>vld1q_u16_x2</b></b> (uint16_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
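+ <h4>Usage example</h4><p>A minimal C sketch summing sixteen consecutive values. Note the assumptions: vaddvq_u16 is an A64-only horizontal add, and wraparound on overflow is treated as acceptable to keep the sketch short.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Sum of in[0..15], computed modulo 2^16 (AArch64 only). */
+static uint16_t sum16(const uint16_t in[16]) {
+    uint16x8x2_t v = vld1q_u16_x2(in);  /* one LD1 {Vt.8H - Vt2.8H} */
+    return vaddvq_u16(vaddq_u16(v.val[0], v.val[1]));
+}</pre>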
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u32_x2" type="checkbox"><label for="vld1_u32_x2"><div>uint32x2x2_t <b><b>vld1_u32_x2</b></b> (uint32_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
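+ <h4>Usage example</h4><p>A minimal C sketch taking the lane-wise maximum of the two loaded halves; the helper name and the four-element input shape are assumptions made for illustration.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+/* out[i] = max(in[i], in[i + 2]) for i = 0..1. */
+static uint32x2_t max_halves(const uint32_t in[4]) {
+    uint32x2x2_t v = vld1_u32_x2(in);   /* val[0] = in[0..1], val[1] = in[2..3] */
+    return vmax_u32(v.val[0], v.val[1]);
+}</pre>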
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u32_x2" type="checkbox"><label for="vld1q_u32_x2"><div>uint32x4x2_t <b><b>vld1q_u32_x2</b></b> (uint32_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
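+ <h4>Usage example</h4><p>A minimal C sketch of a reduction loop consuming eight elements per iteration. Assumptions made for brevity: n is a multiple of eight (no scalar tail loop), vaddvq_u32 is A64-only, and the sum is taken modulo 2^32.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+#include &lt;stdint.h&gt;
+
+/* Sum of p[0..n-1]; n must be a multiple of 8 (AArch64 only). */
+static uint32_t sum_u32(const uint32_t *p, size_t n) {
+    uint32x4_t acc = vdupq_n_u32(0);
+    for (size_t i = 0; i &lt; n; i += 8) {
+        uint32x4x2_t v = vld1q_u32_x2(p + i);  /* one LD1 {Vt.4S - Vt2.4S} */
+        acc = vaddq_u32(acc, v.val[0]);
+        acc = vaddq_u32(acc, v.val[1]);
+    }
+    return vaddvq_u32(acc);                    /* A64 horizontal add */
+}</pre>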
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f16_x2" type="checkbox"><label for="vld1_f16_x2"><div>float16x4x2_t <b><b>vld1_f16_x2</b></b> (float16_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
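+ <h4>Usage example (illustrative)</h4> <p class="aml">The following C sketch is not part of the Arm reference. It assumes a toolchain whose arm_neon.h defines float16_t (fp16 storage support); the helper name is invented.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load eight contiguous float16 values as two 4-lane vectors:
+   result.val[0] holds p[0..3], result.val[1] holds p[4..7]. */
+float16x4x2_t load_f16_pair(const float16_t *p) {
+    return vld1_f16_x2(p);
+}
+</pre>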
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f16_x2" type="checkbox"><label for="vld1q_f16_x2"><div>float16x8x2_t <b><b>vld1q_f16_x2</b></b> (float16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
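+ <h4>Usage example (illustrative)</h4> <p class="aml">Not part of the Arm reference: a minimal sketch under the same fp16 toolchain assumption; the identifier is invented.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load sixteen contiguous float16 values as two 8-lane vectors:
+   val[0] holds p[0..7], val[1] holds p[8..15]. */
+float16x8x2_t load_f16_pair_q(const float16_t *p) {
+    return vld1q_f16_x2(p);
+}
+</pre>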
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f32_x2" type="checkbox"><label for="vld1_f32_x2"><div>float32x2x2_t <b><b>vld1_f32_x2</b></b> (float32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt2.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
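+ <h4>Usage example (illustrative)</h4> <p class="aml">Not part of the Arm reference: a small sketch showing how the two-vector result might be consumed; the buffer contents and names are invented.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float32x2_t sum_halves(void) {
+    float32_t buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    /* val[0] = {1,2} from buf[0..1]; val[1] = {3,4} from buf[2..3] */
+    float32x2x2_t v = vld1_f32_x2(buf);
+    return vadd_f32(v.val[0], v.val[1]);   /* lane-wise sum: {4, 6} */
+}
+</pre>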
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f32_x2" type="checkbox"><label for="vld1q_f32_x2"><div>float32x4x2_t <b><b>vld1q_f32_x2</b></b> (float32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt2.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
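+ <h4>Usage example (illustrative)</h4> <p class="aml">Not part of the Arm reference: an assumed-use sketch; the function and its precondition are inventions for illustration.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sum eight contiguous floats with one two-register load.
+   p must point to at least eight readable float32_t values. */
+float32x4_t sum_two_quads(const float32_t *p) {
+    float32x4x2_t v = vld1q_f32_x2(p);   /* val[0] = p[0..3], val[1] = p[4..7] */
+    return vaddq_f32(v.val[0], v.val[1]);
+}
+</pre>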
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p8_x2" type="checkbox"><label for="vld1_p8_x2"><div>poly8x8x2_t <b><b>vld1_p8_x2</b></b> (poly8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt2.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
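+ <h4>Usage example (illustrative)</h4> <p class="aml">Not part of the Arm reference: the same load pattern for the polynomial element type; names are invented.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load sixteen contiguous poly8 values as two 8-lane vectors:
+   val[0] holds p[0..7], val[1] holds p[8..15]. */
+poly8x8x2_t load_p8_pair(const poly8_t *p) {
+    return vld1_p8_x2(p);
+}
+</pre>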
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p8_x2" type="checkbox"><label for="vld1q_p8_x2"><div>poly8x16x2_t <b><b>vld1q_p8_x2</b></b> (poly8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt2.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
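+ <h4>Usage example (illustrative)</h4> <p class="aml">Not part of the Arm reference: an invented sketch of the quad-width polynomial load.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load 32 contiguous poly8 values as two 16-lane vectors:
+   val[0] holds p[0..15], val[1] holds p[16..31]. */
+poly8x16x2_t load_p8_pair_q(const poly8_t *p) {
+    return vld1q_p8_x2(p);
+}
+</pre>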
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p16_x2" type="checkbox"><label for="vld1_p16_x2"><div>poly16x4x2_t <b><b>vld1_p16_x2</b></b> (poly16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt2.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p16_x2" type="checkbox"><label for="vld1q_p16_x2"><div>poly16x8x2_t <b><b>vld1q_p16_x2</b></b> (poly16_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt2.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
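+ <h4>Usage example</h4> <p>The following C fragment is an illustrative sketch, not part of the ARM reference: it assumes a toolchain providing &lt;arm_neon.h&gt;, and the function and buffer names are invented for the example.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* Illustrative sketch: load 16 contiguous poly16 elements into two
+   128-bit registers; val[0] receives buf[0..7], val[1] buf[8..15]. */
+void load_p16_pair(const poly16_t buf[16], poly16x8x2_t *out) {
+    *out = vld1q_p16_x2(buf);
+}
+</pre>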
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s64_x2" type="checkbox"><label for="vld1_s64_x2"><div>int64x1x2_t <b><b>vld1_s64_x2</b></b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
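+ <h4>Usage example</h4> <p>A minimal C sketch (assumed, not from the ARM reference) showing how the two loaded registers are reached through the val array; &lt;arm_neon.h&gt; and the names below are assumptions of the example.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: sum two int64 values loaded as a register pair. */
+int64_t sum_pair(const int64_t src[2]) {
+    int64x1x2_t v = vld1_s64_x2(src);  /* val[0] = src[0], val[1] = src[1] */
+    int64x1_t s = vadd_s64(v.val[0], v.val[1]);
+    return vget_lane_s64(s, 0);
+}
+</pre>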
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u64_x2" type="checkbox"><label for="vld1_u64_x2"><div>uint64x1x2_t <b><b>vld1_u64_x2</b></b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
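+ <h4>Usage example</h4> <p>A hedged C sketch of the intended use, on the assumption that the x2 form matches two consecutive single-register loads from the same base pointer; the helper name is hypothetical.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* Assumed equivalence: one x2 load instead of two vld1_u64 calls. */
+uint64x1x2_t load_both(const uint64_t p[2]) {
+    uint64x1x2_t r = vld1_u64_x2(p);
+    /* r.val[0] == vld1_u64(p), r.val[1] == vld1_u64(p + 1) */
+    return r;
+}
+</pre>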
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p64_x2" type="checkbox"><label for="vld1_p64_x2"><div>poly64x1x2_t <b><b>vld1_p64_x2</b></b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
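+ <h4>Usage example</h4> <p>An illustrative sketch only: poly64 vector types are gated behind the crypto/polynomial extension in ACLE, so the fragment guards on __ARM_FEATURE_CRYPTO; both the guard choice and the names are assumptions of this example.</p> <pre>#include &lt;arm_neon.h&gt;
+
+#ifdef __ARM_FEATURE_CRYPTO  /* assumed guard for poly64 availability */
+/* Load two poly64 elements into a pair of 64-bit registers. */
+poly64x1x2_t load_p64_pair(const poly64_t src[2]) {
+    return vld1_p64_x2(src);
+}
+#endif
+</pre>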
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s64_x2" type="checkbox"><label for="vld1q_s64_x2"><div>int64x2x2_t <b><b>vld1q_s64_x2</b></b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
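+ <h4>Usage example</h4> <p>A short C sketch (invented for illustration, assuming &lt;arm_neon.h&gt;): four contiguous int64 values are loaded into two 2D registers and added lane-wise.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* Hypothetical helper: dst[i] = src[i] + src[i + 2] for i = 0, 1. */
+void add_halves(const int64_t src[4], int64_t dst[2]) {
+    int64x2x2_t v = vld1q_s64_x2(src);  /* val[0] = src[0..1], val[1] = src[2..3] */
+    vst1q_s64(dst, vaddq_s64(v.val[0], v.val[1]));
+}
+</pre>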
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u64_x2" type="checkbox"><label for="vld1q_u64_x2"><div>uint64x2x2_t <b><b>vld1q_u64_x2</b></b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
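+ <h4>Usage example</h4> <p>A minimal sketch, assuming &lt;arm_neon.h&gt; and that the matching vst1q_u64_x2 store is available; src and dst are hypothetical 32-byte buffers.</p> <pre>#include &lt;arm_neon.h&gt;
+
+/* Illustrative 32-byte copy: one x2 load paired with one x2 store. */
+void copy32(const uint64_t src[4], uint64_t dst[4]) {
+    vst1q_u64_x2(dst, vld1q_u64_x2(src));
+}
+</pre>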
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p64_x2" type="checkbox"><label for="vld1q_p64_x2"><div>poly64x2x2_t <b><b>vld1q_p64_x2</b></b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f64_x2" type="checkbox"><label for="vld1_f64_x2"><div>float64x1x2_t <b><b>vld1_f64_x2</b></b> (float64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt2.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f64_x2" type="checkbox"><label for="vld1q_f64_x2"><div>float64x2x2_t <b><b>vld1q_f64_x2</b></b> (float64_t const * ptr)<span class="right">Load multiple single-element structures to two registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to two registers. This instruction loads multiple single-element structures from memory and writes the result to the two SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt2.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1_s8_x3" type="checkbox"><label for="vld1_s8_x3"><div>int8x8x3_t <b><b>vld1_s8_x3</b></b> (int8_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s8_x3" type="checkbox"><label for="vld1q_s8_x3"><div>int8x16x3_t <b><b>vld1q_s8_x3</b></b> (int8_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s16_x3" type="checkbox"><label for="vld1_s16_x3"><div>int16x4x3_t <b><b>vld1_s16_x3</b></b> (int16_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s16_x3" type="checkbox"><label for="vld1q_s16_x3"><div>int16x8x3_t <b><b>vld1q_s16_x3</b></b> (int16_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s32_x3" type="checkbox"><label for="vld1_s32_x3"><div>int32x2x3_t <b><b>vld1_s32_x3</b></b> (int32_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
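+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch, assuming an AArch64 target and &lt;arm_neon.h&gt;; the buffer, its contents, and the wrapper function are illustrative assumptions, not part of the Arm reference. The three result vectors receive consecutive, non-interleaved elements.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Six consecutive int32 values; vld1_s32_x3 reads all of them. */
+static const int32_t buf[6] = {0, 1, 2, 3, 4, 5};
+
+int32_t example_vld1_s32_x3(void) {
+    int32x2x3_t v = vld1_s32_x3(buf);
+    /* v.val[0] = {0,1}, v.val[1] = {2,3}, v.val[2] = {4,5} */
+    return vget_lane_s32(v.val[2], 0); /* yields 4 */
+}
+</pre>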
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s32_x3" type="checkbox"><label for="vld1q_s32_x3"><div>int32x4x3_t <b><b>vld1q_s32_x3</b></b> (int32_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
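+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch, assuming an AArch64 target and &lt;arm_neon.h&gt;; the input values and function name are illustrative. Twelve consecutive int32 values fill the three 128-bit vectors in memory order.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+static const int32_t buf[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+
+int32_t example_vld1q_s32_x3(void) {
+    int32x4x3_t v = vld1q_s32_x3(buf);
+    /* Element-wise sum of the three loaded vectors. */
+    int32x4_t sum = vaddq_s32(vaddq_s32(v.val[0], v.val[1]), v.val[2]);
+    return vgetq_lane_s32(sum, 0); /* 0 + 4 + 8 = 12 */
+}
+</pre>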
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u8_x3" type="checkbox"><label for="vld1_u8_x3"><div>uint8x8x3_t <b><b>vld1_u8_x3</b></b> (uint8_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
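+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch, assuming an AArch64 target and &lt;arm_neon.h&gt;; the buffer and function names are illustrative. Unlike vld3_u8, the _x3 form does not de-interleave: each result vector holds eight bytes exactly as laid out in memory.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+static const uint8_t buf[24] = {
+     0,  1,  2,  3,  4,  5,  6,  7,
+     8,  9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23
+};
+
+uint8_t example_vld1_u8_x3(void) {
+    uint8x8x3_t v = vld1_u8_x3(buf);
+    return vget_lane_u8(v.val[1], 0); /* first byte of the second group: 8 */
+}
+</pre>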
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u8_x3" type="checkbox"><label for="vld1q_u8_x3"><div>uint8x16x3_t <b><b>vld1q_u8_x3</b></b> (uint8_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
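+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch, assuming an AArch64 target and &lt;arm_neon.h&gt;; the pointer names are illustrative. Loading three 16-byte vectors at once and storing them again gives a simple 48-byte block copy.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copies exactly 48 bytes from src to dst. */
+void copy48(uint8_t *dst, const uint8_t *src) {
+    uint8x16x3_t v = vld1q_u8_x3(src);
+    vst1q_u8(dst,      v.val[0]);
+    vst1q_u8(dst + 16, v.val[1]);
+    vst1q_u8(dst + 32, v.val[2]);
+}
+</pre>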
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u16_x3" type="checkbox"><label for="vld1_u16_x3"><div>uint16x4x3_t <b><b>vld1_u16_x3</b></b> (uint16_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
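+ <h4>Usage Example</h4> <p class="aml">A minimal C sketch, assuming an AArch64 target and &lt;arm_neon.h&gt;; the buffer contents and function name are illustrative. Twelve consecutive uint16 values fill the three 64-bit vectors in order.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+static const uint16_t buf[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+
+uint16_t example_vld1_u16_x3(void) {
+    uint16x4x3_t v = vld1_u16_x3(buf);
+    /* v.val[2] = {8, 9, 10, 11} */
+    return vget_lane_u16(v.val[2], 3); /* yields 11 */
+}
+</pre>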
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u16_x3" type="checkbox"><label for="vld1q_u16_x3"><div>uint16x8x3_t <b><b>vld1q_u16_x3</b></b> (uint16_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to three SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u32_x3" type="checkbox"><label for="vld1_u32_x3"><div>uint32x2x3_t <b><b>vld1_u32_x3</b></b> (uint32_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
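+ <h4>Example</h4> <p>The sketch below is illustrative and not part of the Arm reference. It assumes a toolchain whose arm_neon.h provides the _x3 intrinsics (compiler support varies); the buffer name and values are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t buf[6] = {1, 2, 3, 4, 5, 6};
+    // One contiguous load: val[0], val[1], val[2] are filled in memory order.
+    uint32x2x3_t v = vld1_u32_x3(buf);
+    // Sum the three two-lane vectors, then add the two lanes together.
+    uint32x2_t s = vadd_u32(vadd_u32(v.val[0], v.val[1]), v.val[2]);
+    uint32_t total = vget_lane_u32(s, 0) + vget_lane_u32(s, 1);
+    printf("total = %u\n", (unsigned)total); // prints 21
+    return 0;
+}</pre>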
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u32_x3" type="checkbox"><label for="vld1q_u32_x3"><div>uint32x4x3_t <b><b>vld1q_u32_x3</b></b> (uint32_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
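+ <h4>Example</h4> <p>An illustrative sketch (not from the Arm reference), assuming _x3 intrinsic support in arm_neon.h: it shows that vld1q_u32_x3 is a plain contiguous load of twelve elements into three Q registers, with no de-interleaving (contrast vld3q_u32). Array names are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t in[12], out[12];
+    for (int i = 0; i &lt; 12; i++) in[i] = (uint32_t)i;
+    // Twelve consecutive elements land in val[0..2] in memory order.
+    uint32x4x3_t v = vld1q_u32_x3(in);
+    vst1q_u32(out + 0, v.val[0]);
+    vst1q_u32(out + 4, v.val[1]);
+    vst1q_u32(out + 8, v.val[2]);
+    printf("out[11] = %u\n", (unsigned)out[11]); // prints 11
+    return 0;
+}</pre>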
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f16_x3" type="checkbox"><label for="vld1_f16_x3"><div>float16x4x3_t <b><b>vld1_f16_x3</b></b> (float16_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
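+ <h4>Example</h4> <p>An illustrative sketch, not from the Arm reference. It assumes float16_t is available as a storage type (standard on AArch64) and that arm_neon.h provides the _x3 intrinsics; half-precision arithmetic is avoided by widening to float32 before use.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float16_t buf[12];
+    for (int i = 0; i &lt; 12; i++) buf[i] = (float16_t)(float)i;
+    // Twelve consecutive halfwords fill the three D registers in order.
+    float16x4x3_t v = vld1_f16_x3(buf);
+    // Widen one vector to single precision for printing.
+    float32x4_t w = vcvt_f32_f16(v.val[2]);
+    printf("val[2] lane 0 = %f\n", vgetq_lane_f32(w, 0)); // prints 8.000000
+    return 0;
+}</pre>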
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f16_x3" type="checkbox"><label for="vld1q_f16_x3"><div>float16x8x3_t <b><b>vld1q_f16_x3</b></b> (float16_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
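+ <h4>Example</h4> <p>An illustrative sketch, not from the Arm reference, assuming float16_t storage support and _x3 intrinsics in arm_neon.h: a contiguous load of 24 half-precision elements into three Q registers, copied straight back out with vst1q_f16.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int main(void) {
+    float16_t in[24] = {0}, out[24];
+    // One load of 24 consecutive halfwords into three Q registers.
+    float16x8x3_t v = vld1q_f16_x3(in);
+    vst1q_f16(out + 0,  v.val[0]);
+    vst1q_f16(out + 8,  v.val[1]);
+    vst1q_f16(out + 16, v.val[2]);
+    return 0; // out now mirrors in
+}</pre>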
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f32_x3" type="checkbox"><label for="vld1_f32_x3"><div>float32x2x3_t <b><b>vld1_f32_x3</b></b> (float32_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt3.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
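+ <h4>Example</h4> <p>An illustrative sketch, not from the Arm reference, assuming _x3 intrinsic support: six consecutive floats are loaded into three D registers and reduced to a scalar sum. Names and values are hypothetical.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float buf[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+    float32x2x3_t v = vld1_f32_x3(buf);
+    // Sum the three two-lane vectors, then add the two lanes together.
+    float32x2_t s = vadd_f32(vadd_f32(v.val[0], v.val[1]), v.val[2]);
+    float total = vget_lane_f32(s, 0) + vget_lane_f32(s, 1);
+    printf("total = %f\n", total); // prints 21.000000
+    return 0;
+}</pre>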
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f32_x3" type="checkbox"><label for="vld1q_f32_x3"><div>float32x4x3_t <b><b>vld1q_f32_x3</b></b> (float32_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt3.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
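+ <h4>Example</h4> <p>An illustrative sketch, not from the Arm reference, assuming _x3 intrinsic support: a hypothetical in-place scaling routine that uses vld1q_f32_x3 to pull twelve floats per iteration into three Q registers, issuing one load instruction in place of three separate vld1q_f32 calls.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+// Scales n floats in place; n is assumed to be a multiple of 12.
+static void scale(float *p, int n, float k) {
+    for (int i = 0; i &lt; n; i += 12) {
+        float32x4x3_t v = vld1q_f32_x3(p + i);
+        v.val[0] = vmulq_n_f32(v.val[0], k);
+        v.val[1] = vmulq_n_f32(v.val[1], k);
+        v.val[2] = vmulq_n_f32(v.val[2], k);
+        vst1q_f32(p + i + 0, v.val[0]);
+        vst1q_f32(p + i + 4, v.val[1]);
+        vst1q_f32(p + i + 8, v.val[2]);
+    }
+}
+
+int main(void) {
+    float a[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    scale(a, 12, 2.0f);
+    return (int)a[11]; // 22
+}</pre>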
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p8_x3" type="checkbox"><label for="vld1_p8_x3"><div>poly8x8x3_t <b><b>vld1_p8_x3</b></b> (poly8_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt3.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p8_x3" type="checkbox"><label for="vld1q_p8_x3"><div>poly8x16x3_t <b><b>vld1q_p8_x3</b></b> (poly8_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt3.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p16_x3" type="checkbox"><label for="vld1_p16_x3"><div>poly16x4x3_t <b><b>vld1_p16_x3</b></b> (poly16_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt3.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p16_x3" type="checkbox"><label for="vld1q_p16_x3"><div>poly16x8x3_t <b><b>vld1q_p16_x3</b></b> (poly16_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt3.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s64_x3" type="checkbox"><label for="vld1_s64_x3"><div>int64x1x3_t <b><b>vld1_s64_x3</b></b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u64_x3" type="checkbox"><label for="vld1_u64_x3"><div>uint64x1x3_t <b><b>vld1_u64_x3</b></b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p64_x3" type="checkbox"><label for="vld1_p64_x3"><div>poly64x1x3_t <b><b>vld1_p64_x3</b></b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to three registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to three registers. This instruction loads multiple single-element structures from memory and writes the result to the three SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s64_x3" type="checkbox"><label for="vld1q_s64_x3"><div>int64x2x3_t <b><b>vld1q_s64_x3</b></b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u64_x3" type="checkbox"><label for="vld1q_u64_x3"><div>uint64x2x3_t <b><b>vld1q_u64_x3</b></b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p64_x3" type="checkbox"><label for="vld1q_p64_x3"><div>poly64x2x3_t <b><b>vld1q_p64_x3</b></b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f64_x3" type="checkbox"><label for="vld1_f64_x3"><div>float64x1x3_t <b><b>vld1_f64_x3</b></b> (float64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt3.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f64_x3" type="checkbox"><label for="vld1q_f64_x3"><div>float64x2x3_t <b><b>vld1q_f64_x3</b></b> (float64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt3.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1_s8_x4" type="checkbox"><label for="vld1_s8_x4"><div>int8x8x4_t <b><b>vld1_s8_x4</b></b> (int8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s8_x4" type="checkbox"><label for="vld1q_s8_x4"><div>int8x16x4_t <b><b>vld1q_s8_x4</b></b> (int8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
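+ <h4>Usage example</h4> <p>A minimal, non-normative C sketch added for illustration (not part of the ARM reference; assumes a toolchain providing &lt;arm_neon.h&gt; and a NEON-capable target). One call pulls 64 consecutive bytes into the four 16-byte vectors of the result:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t buf[64];                        /* 4 registers x 16 lanes */
+    for (int i = 0; i &lt; 64; ++i)
+        buf[i] = (int8_t)i;
+
+    int8x16x4_t v = vld1q_s8_x4(buf);      /* buf[0..15] -&gt; val[0], ..., buf[48..63] -&gt; val[3] */
+
+    /* Prints "0 63": first lane of val[0], last lane of val[3]. */
+    printf("%d %d\n", vgetq_lane_s8(v.val[0], 0), vgetq_lane_s8(v.val[3], 15));
+    return 0;
+}</pre>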
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s16_x4" type="checkbox"><label for="vld1_s16_x4"><div>int16x4x4_t <b><b>vld1_s16_x4</b></b> (int16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
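+ <h4>Usage example</h4> <p>A non-normative C sketch (editorial addition; assumes &lt;arm_neon.h&gt; and its matching store intrinsic vst1_s16_x4): loading sixteen contiguous int16_t values and storing them back round-trips the buffer unchanged.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+#include &lt;string.h&gt;
+
+int main(void) {
+    int16_t src[16], dst[16];
+    for (int i = 0; i &lt; 16; ++i)
+        src[i] = (int16_t)(100 + i);
+
+    int16x4x4_t v = vld1_s16_x4(src);   /* src[0..3] -&gt; val[0], ..., src[12..15] -&gt; val[3] */
+    vst1_s16_x4(dst, v);                /* store all four vectors back */
+
+    printf("%s\n", memcmp(src, dst, sizeof src) == 0 ? "equal" : "different");
+    return 0;
+}</pre>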
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s16_x4" type="checkbox"><label for="vld1q_s16_x4"><div>int16x8x4_t <b><b>vld1q_s16_x4</b></b> (int16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
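+ <h4>Usage example</h4> <p>A non-normative C sketch (editorial addition; assumes &lt;arm_neon.h&gt; and, for the vaddvq_s16 reduction, an AArch64 target): summing 32 contiguous int16_t values loaded in one call.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int16_t buf[32];
+    for (int i = 0; i &lt; 32; ++i)
+        buf[i] = (int16_t)(i + 1);                    /* 1, 2, ..., 32 */
+
+    int16x8x4_t v = vld1q_s16_x4(buf);
+    int16x8_t sum = vaddq_s16(vaddq_s16(v.val[0], v.val[1]),
+                              vaddq_s16(v.val[2], v.val[3]));
+    printf("%d\n", (int)vaddvq_s16(sum));             /* 1 + 2 + ... + 32 = 528 */
+    return 0;
+}</pre>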
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s32_x4" type="checkbox"><label for="vld1_s32_x4"><div>int32x2x4_t <b><b>vld1_s32_x4</b></b> (int32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
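+ <h4>Usage example</h4> <p>A non-normative C sketch (editorial addition; assumes &lt;arm_neon.h&gt;): the four two-lane vectors can be combined lane-wise after a single load.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32_t buf[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
+
+    int32x2x4_t v = vld1_s32_x4(buf);                 /* two lanes per vector */
+    int32x2_t acc = vadd_s32(vadd_s32(v.val[0], v.val[1]),
+                             vadd_s32(v.val[2], v.val[3]));
+
+    /* lane 0: 10+30+50+70 = 160, lane 1: 20+40+60+80 = 200 */
+    printf("%d %d\n", vget_lane_s32(acc, 0), vget_lane_s32(acc, 1));
+    return 0;
+}</pre>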
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s32_x4" type="checkbox"><label for="vld1q_s32_x4"><div>int32x4x4_t <b><b>vld1q_s32_x4</b></b> (int32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
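+ <h4>Usage example</h4> <p>A non-normative C sketch (editorial addition; assumes &lt;arm_neon.h&gt;): the single call behaves like four back-to-back vld1q_s32 loads at ptr, ptr+4, ptr+8 and ptr+12.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int32_t buf[16];
+    for (int i = 0; i &lt; 16; ++i)
+        buf[i] = i * i;
+
+    int32x4x4_t v = vld1q_s32_x4(buf);
+    printf("%d %d\n",
+           vgetq_lane_s32(v.val[1], 0),               /* buf[4]  == 16  */
+           vgetq_lane_s32(v.val[3], 3));              /* buf[15] == 225 */
+    return 0;
+}</pre>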
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u8_x4" type="checkbox"><label for="vld1_u8_x4"><div>uint8x8x4_t <b><b>vld1_u8_x4</b></b> (uint8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
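+ <h4>Usage example</h4> <p>A non-normative C sketch (editorial addition; assumes &lt;arm_neon.h&gt; and, for the vmaxv_u8 reduction, an AArch64 target): finding the maximum of 32 contiguous bytes loaded in one call.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint8_t buf[32];
+    for (int i = 0; i &lt; 32; ++i)
+        buf[i] = (uint8_t)(i * 3);                    /* 0, 3, ..., 93 */
+
+    uint8x8x4_t v = vld1_u8_x4(buf);
+    uint8x8_t m = vmax_u8(vmax_u8(v.val[0], v.val[1]),
+                          vmax_u8(v.val[2], v.val[3]));
+    printf("%u\n", (unsigned)vmaxv_u8(m));            /* 93 */
+    return 0;
+}</pre>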
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u8_x4" type="checkbox"><label for="vld1q_u8_x4"><div>uint8x16x4_t <b><b>vld1q_u8_x4</b></b> (uint8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
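+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the ARM reference: it assumes an AArch64 toolchain with &lt;arm_neon.h&gt;, and the function and buffer names are illustrative.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sketch: one call pulls 64 contiguous bytes into four Q registers;
+   result.val[0] holds buf[0..15] and result.val[3] holds buf[48..63]. */
+uint8x16x4_t load_block(const uint8_t buf[64]) {
+    return vld1q_u8_x4(buf);
+}</pre>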
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u16_x4" type="checkbox"><label for="vld1_u16_x4"><div>uint16x4x4_t <b><b>vld1_u16_x4</b></b> (uint16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
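+ <h4>Usage example</h4> <p>A minimal C sketch (illustrative names, not from the ARM reference), assuming an AArch64 toolchain with &lt;arm_neon.h&gt;.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sketch: reads 16 contiguous uint16_t values into four D registers;
+   r.val[i] holds src[4*i .. 4*i+3]. */
+uint16x4x4_t load16_halfwords(const uint16_t src[16]) {
+    uint16x4x4_t r = vld1_u16_x4(src);
+    return r;
+}</pre>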
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u16_x4" type="checkbox"><label for="vld1q_u16_x4"><div>uint16x8x4_t <b><b>vld1q_u16_x4</b></b> (uint16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
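+ <h4>Usage example</h4> <p>A minimal C sketch, assuming &lt;arm_neon.h&gt; on AArch64; the helper name is illustrative, not from the reference.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sketch: loads 32 uint16_t and sums the four 8-lane vectors lane-wise. */
+uint16x8_t sum_quads(const uint16_t src[32]) {
+    uint16x8x4_t r = vld1q_u16_x4(src);
+    uint16x8_t s = vaddq_u16(r.val[0], r.val[1]);
+    return vaddq_u16(s, vaddq_u16(r.val[2], r.val[3]));
+}</pre>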
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u32_x4" type="checkbox"><label for="vld1_u32_x4"><div>uint32x2x4_t <b><b>vld1_u32_x4</b></b> (uint32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
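+ <h4>Usage example</h4> <p>A minimal C sketch (illustrative, assuming &lt;arm_neon.h&gt; on an AArch64 toolchain).</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sketch: loads 8 uint32_t into four two-lane vectors and adds them. */
+uint32x2_t sum_pairs(const uint32_t src[8]) {
+    uint32x2x4_t r = vld1_u32_x4(src);
+    uint32x2_t s = vadd_u32(r.val[0], r.val[1]);
+    return vadd_u32(s, vadd_u32(r.val[2], r.val[3]));
+}</pre>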
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u32_x4" type="checkbox"><label for="vld1q_u32_x4"><div>uint32x4x4_t <b><b>vld1q_u32_x4</b></b> (uint32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
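+ <h4>Usage example</h4> <p>A minimal C sketch, not part of the ARM reference: it pairs the load with its vst1q_u32_x4 store counterpart and assumes &lt;arm_neon.h&gt; on AArch64.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sketch: copies 16 uint32_t with one x4 load and one x4 store. */
+void copy16(uint32_t dst[16], const uint32_t src[16]) {
+    vst1q_u32_x4(dst, vld1q_u32_x4(src));
+}</pre>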
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f16_x4" type="checkbox"><label for="vld1_f16_x4"><div>float16x4x4_t <b><b>vld1_f16_x4</b></b> (float16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
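+ <h4>Usage example</h4> <p>A minimal C sketch (illustrative names): it assumes &lt;arm_neon.h&gt; on an AArch64 toolchain whose target supports the float16_t type.</p><pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sketch: loads 16 half-precision values into four D registers;
+   float16_t requires a toolchain with FP16 support. */
+float16x4x4_t load_halves(const float16_t src[16]) {
+    return vld1_f16_x4(src);
+}</pre>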
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f16_x4" type="checkbox"><label for="vld1q_f16_x4"><div>float16x8x4_t <b><b>vld1q_f16_x4</b></b> (float16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f32_x4" type="checkbox"><label for="vld1_f32_x4"><div>float32x2x4_t <b><b>vld1_f32_x4</b></b> (float32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2S - Vt4.2S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2S &rarr; result.val[3]
+Vt3.2S &rarr; result.val[2]
+Vt2.2S &rarr; result.val[1]
+Vt.2S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f32_x4" type="checkbox"><label for="vld1q_f32_x4"><div>float32x4x4_t <b><b>vld1q_f32_x4</b></b> (float32_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4S - Vt4.4S},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4S &rarr; result.val[3]
+Vt3.4S &rarr; result.val[2]
+Vt2.4S &rarr; result.val[1]
+Vt.4S &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p8_x4" type="checkbox"><label for="vld1_p8_x4"><div>poly8x8x4_t <b><b>vld1_p8_x4</b></b> (poly8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8B - Vt4.8B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8B &rarr; result.val[3]
+Vt3.8B &rarr; result.val[2]
+Vt2.8B &rarr; result.val[1]
+Vt.8B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p8_x4" type="checkbox"><label for="vld1q_p8_x4"><div>poly8x16x4_t <b><b>vld1q_p8_x4</b></b> (poly8_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.16B - Vt4.16B},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.16B &rarr; result.val[3]
+Vt3.16B &rarr; result.val[2]
+Vt2.16B &rarr; result.val[1]
+Vt.16B &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p16_x4" type="checkbox"><label for="vld1_p16_x4"><div>poly16x4x4_t <b><b>vld1_p16_x4</b></b> (poly16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.4H - Vt4.4H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.4H &rarr; result.val[3]
+Vt3.4H &rarr; result.val[2]
+Vt2.4H &rarr; result.val[1]
+Vt.4H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p16_x4" type="checkbox"><label for="vld1q_p16_x4"><div>poly16x8x4_t <b><b>vld1q_p16_x4</b></b> (poly16_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.8H - Vt4.8H},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.8H &rarr; result.val[3]
+Vt3.8H &rarr; result.val[2]
+Vt2.8H &rarr; result.val[1]
+Vt.8H &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_s64_x4" type="checkbox"><label for="vld1_s64_x4"><div>int64x1x4_t <b><b>vld1_s64_x4</b></b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
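+ <h4>Example</h4> <p>The following does not come from the ARM reference; it is a minimal C usage sketch, assuming a caller-supplied array named buf. Because the intrinsic performs four consecutive 64-bit loads, result.val[i] receives buf[i].</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int64_t buf[4] = {10, 20, 30, 40};   /* four consecutive elements in memory */
+    int64x1x4_t r = vld1_s64_x4(buf);    /* single LD1 covering Vt..Vt4 */
+    /* each r.val[i] is a one-lane vector holding buf[i] */
+    printf("%lld\n", (long long)vget_lane_s64(r.val[2], 0));  /* prints 30 */
+    return 0;
+}
+</pre>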
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_u64_x4" type="checkbox"><label for="vld1_u64_x4"><div>uint64x1x4_t <b><b>vld1_u64_x4</b></b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
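+ <h4>Example</h4> <p>A hedged sketch, not taken from the ARM reference: the x4 form is equivalent in contents to four independent vld1_u64 loads from consecutive addresses, which the loop below checks. The array name buf is this example's own.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;assert.h&gt;
+
+int main(void) {
+    uint64_t buf[4] = {1, 2, 3, 4};
+    uint64x1x4_t r = vld1_u64_x4(buf);   /* one x4 load */
+    for (int i = 0; i &lt; 4; i++) {
+        /* same contents as four separate single-register loads */
+        assert(vget_lane_u64(r.val[i], 0) == vget_lane_u64(vld1_u64(buf + i), 0));
+    }
+    return 0;
+}
+</pre>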
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_p64_x4" type="checkbox"><label for="vld1_p64_x4"><div>poly64x1x4_t <b><b>vld1_p64_x4</b></b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
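+ <h4>Example</h4> <p>An illustrative sketch only, not from the ARM reference. On most toolchains the poly64 intrinsics are gated on the crypto/AES extension, so this example guards on __ARM_FEATURE_AES; that gating, and the array name buf, are assumptions of the sketch.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+#if defined(__ARM_FEATURE_AES)           /* assumed gate for poly64 support */
+    poly64_t buf[4] = {5, 6, 7, 8};
+    poly64x1x4_t r = vld1_p64_x4(buf);
+    printf("%llu\n", (unsigned long long)vget_lane_p64(r.val[3], 0));  /* prints 8 */
+#endif
+    return 0;
+}
+</pre>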
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_s64_x4" type="checkbox"><label for="vld1q_s64_x4"><div>int64x2x4_t <b><b>vld1q_s64_x4</b></b> (int64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
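+ <h4>Example</h4> <p>Not part of the ARM reference: a minimal C sketch of the q-form, which reads eight consecutive elements into four 128-bit registers, so r.val[i] holds buf[2*i] in lane 0 and buf[2*i + 1] in lane 1 (buf is this example's own name).</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;assert.h&gt;
+
+int main(void) {
+    int64_t buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int64x2x4_t r = vld1q_s64_x4(buf);   /* four 128-bit registers, two lanes each */
+    assert(vgetq_lane_s64(r.val[3], 1) == 7);  /* buf[2*3 + 1] */
+    return 0;
+}
+</pre>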
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_u64_x4" type="checkbox"><label for="vld1q_u64_x4"><div>uint64x2x4_t <b><b>vld1q_u64_x4</b></b> (uint64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
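+ <h4>Example</h4> <p>A sketch that is not part of the ARM reference: pairing this load with its store counterpart vst1q_u64_x4 round-trips eight elements unchanged. The array names src and dst are assumptions of the example.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;string.h&gt;
+#include &lt;assert.h&gt;
+
+int main(void) {
+    uint64_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8}, dst[8] = {0};
+    uint64x2x4_t r = vld1q_u64_x4(src);  /* load eight elements */
+    vst1q_u64_x4(dst, r);                /* store them back out */
+    assert(memcmp(src, dst, sizeof src) == 0);
+    return 0;
+}
+</pre>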
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1q_p64_x4" type="checkbox"><label for="vld1q_p64_x4"><div>poly64x2x4_t <b><b>vld1q_p64_x4</b></b> (poly64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
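+ <h4>Example</h4> <p>An illustrative sketch, not from the ARM reference. As with the other poly64 intrinsics, availability is assumed to be gated on the crypto/AES extension, and the array name buf is this example's own.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+#if defined(__ARM_FEATURE_AES)           /* assumed gate for poly64 support */
+    poly64_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+    poly64x2x4_t r = vld1q_p64_x4(buf);
+    printf("%llu\n", (unsigned long long)vgetq_lane_p64(r.val[0], 1));  /* prints 2 */
+#endif
+    return 0;
+}
+</pre>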
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vld1_f64_x4" type="checkbox"><label for="vld1_f64_x4"><div>float64x1x4_t <b><b>vld1_f64_x4</b></b> (float64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.1D - Vt4.1D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.1D &rarr; result.val[3]
+Vt3.1D &rarr; result.val[2]
+Vt2.1D &rarr; result.val[1]
+Vt.1D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
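+ <h4>Example</h4> <p>A minimal C usage sketch (not part of the ARM reference; the array name and values are illustrative). It shows the four consecutive doubles landing in result.val[0] through result.val[3] in memory order:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+double src[4] = {1.0, 2.0, 3.0, 4.0};   /* illustrative data */
+/* Loads four consecutive doubles into four single-element vectors. */
+float64x1x4_t r = vld1_f64_x4(src);
+/* r.val[0] = {1.0}, r.val[1] = {2.0}, r.val[2] = {3.0}, r.val[3] = {4.0} */
+</pre>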
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vld1q_f64_x4" type="checkbox"><label for="vld1q_f64_x4"><div>float64x2x4_t <b><b>vld1q_f64_x4</b></b> (float64_t const * ptr)<span class="right">Load multiple single-element structures to one, two, three, or four registers</span></div></label><article> <h4>Description</h4><p><p class="aml">Load multiple single-element structures to one, two, three, or four registers. This instruction loads multiple single-element structures from memory and writes the result to one, two, three, or four SIMD&amp;FP registers.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ld1-single-structure-load-one-single-element-structure-to-one-lane-of-one-register">LD1</a> {Vt.2D - Vt4.2D},[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Vt4.2D &rarr; result.val[3]
+Vt3.2D &rarr; result.val[2]
+Vt2.2D &rarr; result.val[1]
+Vt.2D &rarr; result.val[0]
+</pre> <h4>Operation</h4>
+<pre class="codeblock">if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ SetNotTagCheckedInstruction(!wback &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+
+bits(64) address;
+bits(64) offs;
+bits(128) rval;
+bits(esize) element;
+constant integer ebytes = esize DIV 8;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+if replicate then
+ // load and replicate to all elements
+ for s = 0 to selem-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ // replicate to fill 128- or 64-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Replicate.2" title="function: bits(M*N) Replicate(bits(M) x, integer N)">Replicate</a>(element, datasize DIV esize);
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+else
+ // load/store one element per register
+ for s = 0 to selem-1
+ rval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ if memop == <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a> then
+ // insert into one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[rval, index, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = rval;
+ else // memop == MemOp_STORE
+ // extract from one lane of 128-bit register
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address+offs, ebytes, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[rval, index, esize];
+ offs = offs + ebytes;
+ t = (t + 1) MOD 32;
+
+if wback then
+ if m != 31 then
+ offs = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m];
+ if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.write.0" title="accessor: SP[] = bits(width) value">SP</a>[] = address + offs;
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[n] = address + offs;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpadd_s8" type="checkbox"><label for="vpadd_s8"><div>int8x8_t <b><b>vpadd_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
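+ <h4>Example</h4> <p>A minimal C usage sketch (not part of the ARM reference; values are illustrative). Because the concatenation places the first operand in the low half, lanes 0&ndash;3 of the result hold the pairwise sums of a and lanes 4&ndash;7 those of b:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int8_t da[8] = {1, 2, 3, 4, 5, 6, 7, 8};        /* illustrative data */
+int8_t db[8] = {9, 10, 11, 12, 13, 14, 15, 16};
+int8x8_t r = vpadd_s8(vld1_s8(da), vld1_s8(db));
+/* r = {1+2, 3+4, 5+6, 7+8, 9+10, 11+12, 13+14, 15+16}
+     = {3, 7, 11, 15, 19, 23, 27, 31} */
+</pre>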
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadd_s16" type="checkbox"><label for="vpadd_s16"><div>int16x4_t <b><b>vpadd_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadd_s32" type="checkbox"><label for="vpadd_s32"><div>int32x2_t <b><b>vpadd_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadd_u8" type="checkbox"><label for="vpadd_u8"><div>uint8x8_t <b><b>vpadd_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadd_u16" type="checkbox"><label for="vpadd_u16"><div>uint16x4_t <b><b>vpadd_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadd_u32" type="checkbox"><label for="vpadd_u32"><div>uint32x2_t <b><b>vpadd_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadd_f32" type="checkbox"><label for="vpadd_f32"><div>float32x2_t <b><b>vpadd_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
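+ <h4>Example</h4> <p>A minimal C usage sketch (not part of the ARM reference; values are illustrative). With two lanes per input, lane 0 is the sum of a's pair and lane 1 the sum of b's pair:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float fa[2] = {1.5f, 2.5f};     /* illustrative data */
+float fb[2] = {10.0f, 20.0f};
+float32x2_t r = vpadd_f32(vld1_f32(fa), vld1_f32(fb));
+/* r = {1.5f + 2.5f, 10.0f + 20.0f} = {4.0f, 30.0f} */
+</pre>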
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_s8" type="checkbox"><label for="vpaddq_s8"><div>int8x16_t <b><b>vpaddq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
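+ <h4>Example</h4> <p>A minimal C usage sketch (A64 only; not part of the ARM reference, values are illustrative). The 128-bit q forms work the same way as their 64-bit counterparts, here over sixteen lanes, with a's pairwise sums in the low eight lanes and b's in the high eight:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int8_t da[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+int8_t db[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
+int8x16_t r = vpaddq_s8(vld1q_s8(da), vld1q_s8(db));
+/* low eight lanes:  {3, 7, 11, 15, 19, 23, 27, 31}
+   high eight lanes: {0, 2, 4, 6, 8, 10, 12, 14} */
+</pre>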
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_s16" type="checkbox"><label for="vpaddq_s16"><div>int16x8_t <b><b>vpaddq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_s32" type="checkbox"><label for="vpaddq_s32"><div>int32x4_t <b><b>vpaddq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_s64" type="checkbox"><label for="vpaddq_s64"><div>int64x2_t <b><b>vpaddq_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_u8" type="checkbox"><label for="vpaddq_u8"><div>uint8x16_t <b><b>vpaddq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_u16" type="checkbox"><label for="vpaddq_u16"><div>uint16x8_t <b><b>vpaddq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
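+<h4>Usage example</h4><p class="aml">A minimal C sketch (an editorial illustration, not part of the ARM reference; it assumes an AArch64 target and the &lt;arm_neon.h&gt; header). Because concat = operand2:operand1, the low half of the result holds the pairwise sums of a and the high half holds the pairwise sums of b:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo(void) {
+    uint16_t av[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+    uint16_t bv[8] = {10, 20, 30, 40, 50, 60, 70, 80};
+    uint16x8_t a = vld1q_u16(av);
+    uint16x8_t b = vld1q_u16(bv);
+    /* r = {a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7}
+         = {3, 7, 11, 15, 30, 70, 110, 150}; sums wrap modulo 2^16. */
+    uint16x8_t r = vpaddq_u16(a, b);
+    uint16_t out[8];
+    vst1q_u16(out, r);   /* store the result for inspection */
+}
+</pre>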
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_u32" type="checkbox"><label for="vpaddq_u32"><div>uint32x4_t <b><b>vpaddq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_u64" type="checkbox"><label for="vpaddq_u64"><div>uint64x2_t <b><b>vpaddq_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_f32" type="checkbox"><label for="vpaddq_f32"><div>float32x4_t <b><b>vpaddq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
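+<h4>Usage example</h4><p class="aml">In the operation above, pair comes from instruction decode and is true for the FADDP encoding, so the concat branch is the one that executes. A minimal C sketch (an editorial illustration, not part of the ARM reference; it assumes an AArch64 target and &lt;arm_neon.h&gt;):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo(void) {
+    float av[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    float bv[4] = {0.5f, 0.5f, 10.0f, -10.0f};
+    float32x4_t a = vld1q_f32(av);
+    float32x4_t b = vld1q_f32(bv);
+    /* Each lane is an FPAdd of two adjacent source lanes:
+       r = {a0+a1, a2+a3, b0+b1, b2+b3} = {3.0, 7.0, 1.0, 0.0} */
+    float32x4_t r = vpaddq_f32(a, b);
+    (void)r;
+}
+</pre>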
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddq_f64" type="checkbox"><label for="vpaddq_f64"><div>float64x2_t <b><b>vpaddq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddl_s8" type="checkbox"><label for="vpaddl_s8"><div>int16x4_t <b><b>vpaddl_s8</b></b> (int8x8_t a)<span class="right">Signed add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlp-signed-add-long-pairwise">SADDLP</a> Vd.4H,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
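+<h4>Usage example</h4><p class="aml">In the operation above, acc is false for SADDLP (the accumulating SADALP form sets it), so result starts from Zeros(); and because each destination lane is 2*esize bits wide, the pairwise sums cannot overflow. A minimal C sketch (an editorial illustration, not part of the ARM reference; it assumes an AArch64 or Armv7 NEON target and &lt;arm_neon.h&gt;):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void demo(void) {
+    int8_t av[8] = {-128, -128, 127, 1, -1, 2, 50, 50};
+    int8x8_t a = vld1_s8(av);
+    /* Adjacent signed 8-bit lanes are summed into 16-bit lanes:
+       r = {-256, 128, 1, 100}; even -128 + -128 fits in int16_t. */
+    int16x4_t r = vpaddl_s8(a);
+    int16_t out[4];
+    vst1_s16(out, r);   /* store the result for inspection */
+}
+</pre>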
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddlq_s8" type="checkbox"><label for="vpaddlq_s8"><div>int16x8_t <b><b>vpaddlq_s8</b></b> (int8x16_t a)<span class="right">Signed add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlp-signed-add-long-pairwise">SADDLP</a> Vd.8H,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddl_s16" type="checkbox"><label for="vpaddl_s16"><div>int32x2_t <b><b>vpaddl_s16</b></b> (int16x4_t a)<span class="right">Signed add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlp-signed-add-long-pairwise">SADDLP</a> Vd.2S,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddlq_s16" type="checkbox"><label for="vpaddlq_s16"><div>int32x4_t <b><b>vpaddlq_s16</b></b> (int16x8_t a)<span class="right">Signed add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlp-signed-add-long-pairwise">SADDLP</a> Vd.4S,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddl_s32" type="checkbox"><label for="vpaddl_s32"><div>int64x1_t <b><b>vpaddl_s32</b></b> (int32x2_t a)<span class="right">Signed add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlp-signed-add-long-pairwise">SADDLP</a> Vd.1D,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddlq_s32" type="checkbox"><label for="vpaddlq_s32"><div>int64x2_t <b><b>vpaddlq_s32</b></b> (int32x4_t a)<span class="right">Signed add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlp-signed-add-long-pairwise">SADDLP</a> Vd.2D,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddl_u8" type="checkbox"><label for="vpaddl_u8"><div>uint16x4_t <b><b>vpaddl_u8</b></b> (uint8x8_t a)<span class="right">Unsigned add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlp-unsigned-add-long-pairwise">UADDLP</a> Vd.4H,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
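+<h4>Usage example</h4><p class="aml">Because each step doubles the element width, the unsigned long-pairwise adds chain naturally into a horizontal byte sum. A minimal C sketch (an editorial illustration, not part of the ARM reference; sum_bytes is a hypothetical helper name; it assumes an AArch64 or Armv7 NEON target and &lt;arm_neon.h&gt;):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Sum all eight bytes of a vector by widening pairwise:
+   8 x u8 -&gt; 4 x u16 -&gt; 2 x u32 -&gt; 1 x u64. */
+uint64_t sum_bytes(uint8x8_t a) {
+    uint16x4_t s16 = vpaddl_u8(a);
+    uint32x2_t s32 = vpaddl_u16(s16);
+    uint64x1_t s64 = vpaddl_u32(s32);
+    return vget_lane_u64(s64, 0);
+}
+</pre>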
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddlq_u8" type="checkbox"><label for="vpaddlq_u8"><div>uint16x8_t <b><b>vpaddlq_u8</b></b> (uint8x16_t a)<span class="right">Unsigned add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlp-unsigned-add-long-pairwise">UADDLP</a> Vd.8H,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddl_u16" type="checkbox"><label for="vpaddl_u16"><div>uint32x2_t <b><b>vpaddl_u16</b></b> (uint16x4_t a)<span class="right">Unsigned add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlp-unsigned-add-long-pairwise">UADDLP</a> Vd.2S,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddlq_u16" type="checkbox"><label for="vpaddlq_u16"><div>uint32x4_t <b><b>vpaddlq_u16</b></b> (uint16x8_t a)<span class="right">Unsigned add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlp-unsigned-add-long-pairwise">UADDLP</a> Vd.4S,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
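+ <h4>Example</h4> <p>A minimal C usage sketch (values are illustrative and the demo wrapper is ours, not part of the ARM reference; assumes a toolchain providing &lt;arm_neon.h&gt;):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+uint32x4_t demo_vpaddlq_u16(void) {
+    uint16_t in[8] = {1, 2, 3, 4, 5, 6, 7, 65535};
+    /* Pairwise sums widen to 32 bits: 3, 7, 11, 65542 --
+       the last pair exceeds 16 bits but fits the wider lane. */
+    return vpaddlq_u16(vld1q_u16(in));
+}
+</pre>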
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddl_u32" type="checkbox"><label for="vpaddl_u32"><div>uint64x1_t <b>vpaddl_u32</b> (uint32x2_t a)<span class="right">Unsigned add long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Add Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlp-unsigned-add-long-pairwise">UADDLP</a> Vd.1D,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
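+ <h4>Example</h4> <p>A minimal C sketch showing why the widened lane matters here (the demo wrapper and values are illustrative, not from the ARM reference):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+uint64_t demo_vpaddl_u32(void) {
+    uint32_t in[2] = {0xFFFFFFFFu, 1u};
+    uint64x1_t r = vpaddl_u32(vld1_u32(in));
+    /* 0xFFFFFFFF + 1 = 0x100000000: too wide for
+       uint32_t, but exact in the 64-bit result lane. */
+    return vget_lane_u64(r, 0);
+}
+</pre>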
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpaddlq_u32" type="checkbox"><label for="vpaddlq_u32"><div>uint64x2_t <b>vpaddlq_u32</b> (uint32x4_t a)<span class="right">Unsigned add long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Add Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlp-unsigned-add-long-pairwise">UADDLP</a> Vd.2D,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadal_s8" type="checkbox"><label for="vpadal_s8"><div>int16x4_t <b>vpadal_s8</b> (int16x4_t a, int8x8_t b)<span class="right">Signed add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register and accumulates the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sadalp-signed-add-and-accumulate-long-pairwise">SADALP</a> Vd.4H,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
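+ <h4>Example</h4> <p>A minimal C sketch of the accumulate form (values are illustrative and the demo wrapper is ours, not part of the ARM reference):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+int16x4_t demo_vpadal_s8(void) {
+    int16_t acc0[4] = {100, 200, -300, 0};
+    int8_t  b0[8]   = {1, 2, 3, 4, -5, -6, 7, 8};
+    /* Pair sums 3, 7, -11, 15 are added onto the
+       existing lanes: 103, 207, -311, 15. */
+    return vpadal_s8(vld1_s16(acc0), vld1_s8(b0));
+}
+</pre>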
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadalq_s8" type="checkbox"><label for="vpadalq_s8"><div>int16x8_t <b>vpadalq_s8</b> (int16x8_t a, int8x16_t b)<span class="right">Signed add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register and accumulates the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sadalp-signed-add-and-accumulate-long-pairwise">SADALP</a> Vd.8H,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadal_s16" type="checkbox"><label for="vpadal_s16"><div>int32x2_t <b>vpadal_s16</b> (int32x2_t a, int16x4_t b)<span class="right">Signed add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register and accumulates the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sadalp-signed-add-and-accumulate-long-pairwise">SADALP</a> Vd.2S,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadalq_s16" type="checkbox"><label for="vpadalq_s16"><div>int32x4_t <b>vpadalq_s16</b> (int32x4_t a, int16x8_t b)<span class="right">Signed add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register and accumulates the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sadalp-signed-add-and-accumulate-long-pairwise">SADALP</a> Vd.4S,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
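+ <h4>Example</h4> <p>A sketch of the usual reduction idiom: fold many 16-bit values into 32-bit accumulators so intermediate sums cannot wrap. The helper name and the multiple-of-8 length assumption are ours, not from the ARM reference:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stddef.h&gt;
+#include &lt;stdint.h&gt;
+
+int32_t sum_s16(const int16_t *src, size_t n) { /* n % 8 == 0 */
+    int32x4_t acc = vdupq_n_s32(0);
+    for (size_t i = 0; i &lt; n; i += 8)
+        acc = vpadalq_s16(acc, vld1q_s16(src + i));
+    return vaddvq_s32(acc); /* horizontal add, A64 only */
+}
+</pre>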
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadal_s32" type="checkbox"><label for="vpadal_s32"><div>int64x1_t <b>vpadal_s32</b> (int64x1_t a, int32x2_t b)<span class="right">Signed add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register and accumulates the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sadalp-signed-add-and-accumulate-long-pairwise">SADALP</a> Vd.1D,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D <br />
+b &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadalq_s32" type="checkbox"><label for="vpadalq_s32"><div>int64x2_t <b>vpadalq_s32</b> (int64x2_t a, int32x4_t b)<span class="right">Signed add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register and accumulates the results into the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sadalp-signed-add-and-accumulate-long-pairwise">SADALP</a> Vd.2D,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadal_u8" type="checkbox"><label for="vpadal_u8"><div>uint16x4_t <b>vpadal_u8</b> (uint16x4_t a, uint8x8_t b)<span class="right">Unsigned add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uadalp-unsigned-add-and-accumulate-long-pairwise">UADALP</a> Vd.4H,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H <br />
+b &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
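+ <h4>Example</h4> <p>A minimal C sketch (illustrative values; the demo wrapper is ours): pair sums of bytes can exceed 8 bits, so they land in 16-bit lanes:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+uint16x4_t demo_vpadal_u8(void) {
+    uint16_t acc0[4]  = {0, 0, 0, 0};
+    uint8_t  bytes[8] = {250, 250, 10, 20, 1, 2, 3, 4};
+    /* Lane results: 500, 30, 3, 7 -- note that
+       250 + 250 would have wrapped in an 8-bit add. */
+    return vpadal_u8(vld1_u16(acc0), vld1_u8(bytes));
+}
+</pre>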
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadalq_u8" type="checkbox"><label for="vpadalq_u8"><div>uint16x8_t <b>vpadalq_u8</b> (uint16x8_t a, uint8x16_t b)<span class="right">Unsigned add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uadalp-unsigned-add-and-accumulate-long-pairwise">UADALP</a> Vd.8H,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H <br />
+b &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
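+ <h4>Example</h4> <p>A sketch of one common pairing (our choice of idiom, not mandated by the reference): folding per-byte population counts from vcntq_u8 into 16-bit accumulators:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdint.h&gt;
+
+uint16x8_t demo_vpadalq_u8(void) {
+    uint8x16_t data = vdupq_n_u8(0xF0); /* 4 set bits per byte */
+    uint16x8_t acc  = vdupq_n_u16(0);
+    /* Each u16 lane accumulates 4 + 4 = 8. */
+    return vpadalq_u8(acc, vcntq_u8(data));
+}
+</pre>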
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadal_u16" type="checkbox"><label for="vpadal_u16"><div>uint32x2_t <b>vpadal_u16</b> (uint32x2_t a, uint16x4_t b)<span class="right">Unsigned add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uadalp-unsigned-add-and-accumulate-long-pairwise">UADALP</a> Vd.2S,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadalq_u16" type="checkbox"><label for="vpadalq_u16"><div>uint32x4_t <b>vpadalq_u16</b> (uint32x4_t a, uint16x8_t b)<span class="right">Unsigned add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uadalp-unsigned-add-and-accumulate-long-pairwise">UADALP</a> Vd.4S,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadal_u32" type="checkbox"><label for="vpadal_u32"><div>uint64x1_t <b>vpadal_u32</b> (uint64x1_t a, uint32x2_t b)<span class="right">Unsigned add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uadalp-unsigned-add-and-accumulate-long-pairwise">UADALP</a> Vd.1D,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D <br />
+b &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpadalq_u32" type="checkbox"><label for="vpadalq_u32"><div>uint64x2_t <b>vpadalq_u32</b> (uint64x2_t a, uint32x4_t b)<span class="right">Unsigned add and accumulate long pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Add and Accumulate Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register and accumulates the results with the vector elements of the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uadalp-unsigned-add-and-accumulate-long-pairwise">UADALP</a> Vd.2D,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmax_s8" type="checkbox"><label for="vpmax_s8"><div>int8x8_t <b>vpmax_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Signed maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxp-signed-maximum-pairwise">SMAXP</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmax_s16" type="checkbox"><label for="vpmax_s16"><div>int16x4_t <b>vpmax_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Signed maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxp-signed-maximum-pairwise">SMAXP</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmax_s32" type="checkbox"><label for="vpmax_s32"><div>int32x2_t <b>vpmax_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Signed maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxp-signed-maximum-pairwise">SMAXP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmax_u8" type="checkbox"><label for="vpmax_u8"><div>uint8x8_t <b>vpmax_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxp-unsigned-maximum-pairwise">UMAXP</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmax_u16" type="checkbox"><label for="vpmax_u16"><div>uint16x4_t <b>vpmax_u16</b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxp-unsigned-maximum-pairwise">UMAXP</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmax_u32" type="checkbox"><label for="vpmax_u32"><div>uint32x2_t <b>vpmax_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxp-unsigned-maximum-pairwise">UMAXP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmax_f32" type="checkbox"><label for="vpmax_f32"><div>float32x2_t <b>vpmax_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Floating-point Maximum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the larger of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxp-vector-floating-point-maximum-pairwise-vector">FMAXP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmaxq_s8" type="checkbox"><label for="vpmaxq_s8"><div>int8x16_t <b>vpmaxq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Signed maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxp-signed-maximum-pairwise">SMAXP</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxq_s16" type="checkbox"><label for="vpmaxq_s16"><div>int16x8_t <b>vpmaxq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Signed maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxp-signed-maximum-pairwise">SMAXP</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxq_s32" type="checkbox"><label for="vpmaxq_s32"><div>int32x4_t <b>vpmaxq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Signed maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Signed Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxp-signed-maximum-pairwise">SMAXP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxq_u8" type="checkbox"><label for="vpmaxq_u8"><div>uint8x16_t <b>vpmaxq_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxp-unsigned-maximum-pairwise">UMAXP</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxq_u16" type="checkbox"><label for="vpmaxq_u16"><div>uint16x8_t <b>vpmaxq_u16</b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned maximum pairwise</span></div></label><article> <h4>Description</h4><p class="aml">Unsigned Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxp-unsigned-maximum-pairwise">UMAXP</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxq_u32" type="checkbox"><label for="vpmaxq_u32"><div>uint32x4_t <b><b>vpmaxq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxp-unsigned-maximum-pairwise">UMAXP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxq_f32" type="checkbox"><label for="vpmaxq_f32"><div>float32x4_t <b><b>vpmaxq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the larger of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxp-vector-floating-point-maximum-pairwise-vector">FMAXP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxq_f64" type="checkbox"><label for="vpmaxq_f64"><div>float64x2_t <b><b>vpmaxq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the larger of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxp-vector-floating-point-maximum-pairwise-vector">FMAXP</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmin_s8" type="checkbox"><label for="vpmin_s8"><div>int8x8_t <b><b>vpmin_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Signed minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminp-signed-minimum-pairwise">SMINP</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmin_s16" type="checkbox"><label for="vpmin_s16"><div>int16x4_t <b><b>vpmin_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Signed minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminp-signed-minimum-pairwise">SMINP</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmin_s32" type="checkbox"><label for="vpmin_s32"><div>int32x2_t <b><b>vpmin_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Signed minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminp-signed-minimum-pairwise">SMINP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmin_u8" type="checkbox"><label for="vpmin_u8"><div>uint8x8_t <b><b>vpmin_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unsigned minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminp-unsigned-minimum-pairwise">UMINP</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmin_u16" type="checkbox"><label for="vpmin_u16"><div>uint16x4_t <b><b>vpmin_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unsigned minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminp-unsigned-minimum-pairwise">UMINP</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmin_u32" type="checkbox"><label for="vpmin_u32"><div>uint32x2_t <b><b>vpmin_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unsigned minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminp-unsigned-minimum-pairwise">UMINP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpmin_f32" type="checkbox"><label for="vpmin_f32"><div>float32x2_t <b><b>vpmin_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the smaller of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminp-vector-floating-point-minimum-pairwise-vector">FMINP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vpminq_s8" type="checkbox"><label for="vpminq_s8"><div>int8x16_t <b><b>vpminq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Signed minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminp-signed-minimum-pairwise">SMINP</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminq_s16" type="checkbox"><label for="vpminq_s16"><div>int16x8_t <b><b>vpminq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Signed minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminp-signed-minimum-pairwise">SMINP</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminq_s32" type="checkbox"><label for="vpminq_s32"><div>int32x4_t <b><b>vpminq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Signed minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminp-signed-minimum-pairwise">SMINP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminq_u8" type="checkbox"><label for="vpminq_u8"><div>uint8x16_t <b><b>vpminq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unsigned minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminp-unsigned-minimum-pairwise">UMINP</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminq_u16" type="checkbox"><label for="vpminq_u16"><div>uint16x8_t <b><b>vpminq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unsigned minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminp-unsigned-minimum-pairwise">UMINP</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminq_u32" type="checkbox"><label for="vpminq_u32"><div>uint32x4_t <b><b>vpminq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unsigned minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminp-unsigned-minimum-pairwise">UMINP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminq_f32" type="checkbox"><label for="vpminq_f32"><div>float32x4_t <b><b>vpminq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the smaller of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminp-vector-floating-point-minimum-pairwise-vector">FMINP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminq_f64" type="checkbox"><label for="vpminq_f64"><div>float64x2_t <b><b>vpminq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the smaller of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminp-vector-floating-point-minimum-pairwise-vector">FMINP</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxnm_f32" type="checkbox"><label for="vpmaxnm_f32"><div>float32x2_t <b><b>vpmaxnm_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point maximum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnmp-vector-floating-point-maximum-number-pairwise-vector">FMAXNMP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxnmq_f32" type="checkbox"><label for="vpmaxnmq_f32"><div>float32x4_t <b><b>vpmaxnmq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point maximum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnmp-vector-floating-point-maximum-number-pairwise-vector">FMAXNMP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxnmq_f64" type="checkbox"><label for="vpmaxnmq_f64"><div>float64x2_t <b><b>vpmaxnmq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point maximum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnmp-vector-floating-point-maximum-number-pairwise-vector">FMAXNMP</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminnm_f32" type="checkbox"><label for="vpminnm_f32"><div>float32x2_t <b><b>vpminnm_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Floating-point minimum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnmp-vector-floating-point-minimum-number-pairwise-vector">FMINNMP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminnmq_f32" type="checkbox"><label for="vpminnmq_f32"><div>float32x4_t <b><b>vpminnmq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Floating-point minimum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnmp-vector-floating-point-minimum-number-pairwise-vector">FMINNMP</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminnmq_f64" type="checkbox"><label for="vpminnmq_f64"><div>float64x2_t <b><b>vpminnmq_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Floating-point minimum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnmp-vector-floating-point-minimum-number-pairwise-vector">FMINNMP</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddd_s64" type="checkbox"><label for="vpaddd_s64"><div>int64_t <b><b>vpaddd_s64</b></b> (int64x2_t a)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
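+<p class="aml">For this scalar form the pseudocode reduces to adding the two 64-bit lanes of the single source vector; a small sketch, assuming &lt;arm_neon.h&gt; on AArch64 (the helper name and values are illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* ADDP Dd,Vn.2D: the scalar result is lane 0 plus lane 1. */
+int64_t sum_two_lanes_s64(void) {
+    int64_t x[2] = { 40, 2 };
+    return vpaddd_s64(vld1q_s64(x));  /* 42 */
+}</pre>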
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddd_u64" type="checkbox"><label for="vpaddd_u64"><div>uint64_t <b><b>vpaddd_u64</b></b> (uint64x2_t a)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
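+<p class="aml">The unsigned form behaves like ordinary C unsigned addition, wrapping modulo 2^64; an illustrative sketch under the same assumptions as above:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* UINT64_MAX + 2 wraps around to 1. */
+uint64_t sum_two_lanes_u64(void) {
+    uint64_t x[2] = { ~0ULL, 2 };
+    return vpaddd_u64(vld1q_u64(x));  /* 1 */
+}</pre>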
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpadds_f32" type="checkbox"><label for="vpadds_f32"><div>float32_t <b><b>vpadds_f32</b></b> (float32x2_t a)<span class="right">Floating-point add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
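+<p class="aml">A brief sketch of the scalar result, assuming &lt;arm_neon.h&gt; on AArch64 (the helper name and values are illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* FADDP Sd,Vn.2S: one correctly rounded addition of the two lanes. */
+float32_t sum_two_lanes_f32(void) {
+    float32_t x[2] = { 1.5f, 2.25f };
+    return vpadds_f32(vld1_f32(x));   /* 3.75f */
+}</pre>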
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpaddd_f64" type="checkbox"><label for="vpaddd_f64"><div>float64_t <b><b>vpaddd_f64</b></b> (float64x2_t a)<span class="right">Floating-point add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
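+<p class="aml">The double-precision counterpart, sketched under the same assumptions (the rounding happens once, on the single addition):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float64_t sum_two_lanes_f64(void) {
+    float64_t x[2] = { 0.1, 0.2 };
+    return vpaddd_f64(vld1q_f64(x));  /* 0.30000000000000004 */
+}</pre>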
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxs_f32" type="checkbox"><label for="vpmaxs_f32"><div>float32_t <b><b>vpmaxs_f32</b></b> (float32x2_t a)<span class="right">Floating-point maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the larger of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
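+<p class="aml">FMAXP follows FPMax semantics, so a NaN in either lane produces a NaN result; a sketch, assuming &lt;arm_neon.h&gt; and &lt;math.h&gt; on AArch64:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+/* Contrast with vpmaxnms_f32 below, which ignores a single quiet NaN. */
+float32_t max_of_two_lanes_f32(void) {
+    float32_t x[2] = { 3.0f, NAN };
+    return vpmaxs_f32(vld1_f32(x));   /* NaN */
+}</pre>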
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxp-vector-floating-point-maximum-pairwise-vector">FMAXP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxqd_f64" type="checkbox"><label for="vpmaxqd_f64"><div>float64_t <b><b>vpmaxqd_f64</b></b> (float64x2_t a)<span class="right">Floating-point maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the larger of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
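+<p class="aml">The double-precision form, sketched under the same assumptions:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* FMAXP Dd,Vn.2D: the larger of the two lanes. */
+float64_t max_of_two_lanes_f64(void) {
+    float64_t x[2] = { -1.0, -7.0 };
+    return vpmaxqd_f64(vld1q_f64(x)); /* -1.0 */
+}</pre>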
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxp-vector-floating-point-maximum-pairwise-vector">FMAXP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmins_f32" type="checkbox"><label for="vpmins_f32"><div>float32_t <b><b>vpmins_f32</b></b> (float32x2_t a)<span class="right">Floating-point minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the smaller of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
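+<p class="aml">One point worth illustrating for FPMin: negative zero compares smaller than positive zero. A sketch under the same assumptions:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float32_t min_of_two_lanes_f32(void) {
+    float32_t x[2] = { 0.0f, -0.0f };
+    return vpmins_f32(vld1_f32(x));   /* -0.0f */
+}</pre>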
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminp-vector-floating-point-minimum-pairwise-vector">FMINP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminqd_f64" type="checkbox"><label for="vpminqd_f64"><div>float64_t <b><b>vpminqd_f64</b></b> (float64x2_t a)<span class="right">Floating-point minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the smaller of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
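+<p class="aml">The double-precision form, sketched under the same assumptions:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* FMINP Dd,Vn.2D: the smaller of the two lanes. */
+float64_t min_of_two_lanes_f64(void) {
+    float64_t x[2] = { 2.5, -2.5 };
+    return vpminqd_f64(vld1q_f64(x)); /* -2.5 */
+}</pre>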
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminp-vector-floating-point-minimum-pairwise-vector">FMINP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxnms_f32" type="checkbox"><label for="vpmaxnms_f32"><div>float32_t <b><b>vpmaxnms_f32</b></b> (float32x2_t a)<span class="right">Floating-point maximum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
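+<p class="aml">FMAXNMP uses FPMaxNum (IEEE 754 maxNum) semantics: when exactly one lane is a quiet NaN, the numeric lane wins. A sketch, assuming &lt;arm_neon.h&gt; and &lt;math.h&gt; on AArch64:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+/* Compare with vpmaxs_f32 above, which would return NaN here. */
+float32_t maxnm_of_two_lanes_f32(void) {
+    float32_t x[2] = { 3.0f, NAN };
+    return vpmaxnms_f32(vld1_f32(x)); /* 3.0f */
+}</pre>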
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnmp-vector-floating-point-maximum-number-pairwise-vector">FMAXNMP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpmaxnmqd_f64" type="checkbox"><label for="vpmaxnmqd_f64"><div>float64_t <b><b>vpmaxnmqd_f64</b></b> (float64x2_t a)<span class="right">Floating-point maximum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
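+<p class="aml">The double-precision form behaves the same way (only when both lanes are NaN does a NaN come out); sketched under the same assumptions:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+float64_t maxnm_of_two_lanes_f64(void) {
+    float64_t x[2] = { NAN, 4.0 };
+    return vpmaxnmqd_f64(vld1q_f64(x)); /* 4.0 */
+}</pre>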
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnmp-vector-floating-point-maximum-number-pairwise-vector">FMAXNMP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminnms_f32" type="checkbox"><label for="vpminnms_f32"><div>float32_t <b><b>vpminnms_f32</b></b> (float32x2_t a)<span class="right">Floating-point minimum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
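+<p class="aml">The minNum counterpart of the sketch above, under the same assumptions:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+
+/* FMINNMP: the quiet NaN lane is ignored in favour of the numeric lane. */
+float32_t minnm_of_two_lanes_f32(void) {
+    float32_t x[2] = { NAN, -1.0f };
+    return vpminnms_f32(vld1_f32(x)); /* -1.0f */
+}</pre>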
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnmp-vector-floating-point-minimum-number-pairwise-vector">FMINNMP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vpminnmqd_f64" type="checkbox"><label for="vpminnmqd_f64"><div>float64_t <b><b>vpminnmqd_f64</b></b> (float64x2_t a)<span class="right">Floating-point minimum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
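+<p class="aml">And the double-precision form, sketched under the same assumptions:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float64_t minnm_of_two_lanes_f64(void) {
+    float64_t x[2] = { 6.0, -6.0 };
+    return vpminnmqd_f64(vld1q_f64(x)); /* -6.0 */
+}</pre>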
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnmp-vector-floating-point-minimum-number-pairwise-vector">FMINNMP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddv_s8" type="checkbox"><label for="vaddv_s8"><div>int8_t <b><b>vaddv_s8</b></b> (int8x8_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Bd,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
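+<p>An illustrative Rust sketch covering the integer ADDV family (vaddv_s8 through vaddvq_u32), assuming an AArch64 target; the reduction wraps at the element width, like repeated wrapping_add:</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+use std::arch::aarch64::{vaddv_s8, vld1_s8};
+
+#[cfg(target_arch = "aarch64")]
+fn sum8(bytes: [i8; 8]) -> i8 {
+    // SAFETY: NEON is baseline on AArch64; the pointer is valid for 8 i8 reads.
+    unsafe { vaddv_s8(vld1_s8(bytes.as_ptr())) } // ADDV Bd, Vn.8B
+}
+// sum8([100, 100, 0, 0, 0, 0, 0, 0]) == -56: the sum is taken modulo 2^8.</pre>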
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_s8" type="checkbox"><label for="vaddvq_s8"><div>int8_t <b><b>vaddvq_s8</b></b> (int8x16_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Bd,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddv_s16" type="checkbox"><label for="vaddv_s16"><div>int16_t <b><b>vaddv_s16</b></b> (int16x4_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Hd,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_s16" type="checkbox"><label for="vaddvq_s16"><div>int16_t <b><b>vaddvq_s16</b></b> (int16x8_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Hd,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddv_s32" type="checkbox"><label for="vaddv_s32"><div>int32_t <b><b>vaddv_s32</b></b> (int32x2_t a)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+a &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.S[0] &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_s32" type="checkbox"><label for="vaddvq_s32"><div>int32_t <b><b>vaddvq_s32</b></b> (int32x4_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_s64" type="checkbox"><label for="vaddvq_s64"><div>int64_t <b><b>vaddvq_s64</b></b> (int64x2_t a)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddv_u8" type="checkbox"><label for="vaddv_u8"><div>uint8_t <b><b>vaddv_u8</b></b> (uint8x8_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Bd,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_u8" type="checkbox"><label for="vaddvq_u8"><div>uint8_t <b><b>vaddvq_u8</b></b> (uint8x16_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Bd,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddv_u16" type="checkbox"><label for="vaddv_u16"><div>uint16_t <b><b>vaddv_u16</b></b> (uint16x4_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Hd,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_u16" type="checkbox"><label for="vaddvq_u16"><div>uint16_t <b><b>vaddvq_u16</b></b> (uint16x8_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Hd,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddv_u32" type="checkbox"><label for="vaddv_u32"><div>uint32_t <b><b>vaddv_u32</b></b> (uint32x2_t a)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+a &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.S[0] &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_u32" type="checkbox"><label for="vaddvq_u32"><div>uint32_t <b><b>vaddvq_u32</b></b> (uint32x4_t a)<span class="right">Add across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Add across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addv-add-across-vector">ADDV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_ADD" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_ADD</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_u64" type="checkbox"><label for="vaddvq_u64"><div>uint64_t <b><b>vaddvq_u64</b></b> (uint64x2_t a)<span class="right">Add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/addp-vector-add-pairwise-vector">ADDP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element1 + element2;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddv_f32" type="checkbox"><label for="vaddv_f32"><div>float32_t <b><b>vaddv_f32</b></b> (float32x2_t a)<span class="right">Floating-point add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_f32" type="checkbox"><label for="vaddvq_f32"><div>float32_t <b><b>vaddvq_f32</b></b> (float32x4_t a)<span class="right">Floating-point add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Vt.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Sd,Vt.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+a &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddvq_f64" type="checkbox"><label for="vaddvq_f64"><div>float64_t <b><b>vaddvq_f64</b></b> (float64x2_t a)<span class="right">Floating-point add pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Add Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, adds each pair of values together, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/faddp-vector-floating-point-add-pairwise-vector">FADDP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPAdd.3" title="function: bits(N) FPAdd(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPAdd</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlv_s8" type="checkbox"><label for="vaddlv_s8"><div>int16_t <b><b>vaddlv_s8</b></b> (int8x8_t a)<span class="right">Signed add long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlv-signed-add-long-across-vector">SADDLV</a> Hd,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlvq_s8" type="checkbox"><label for="vaddlvq_s8"><div>int16_t <b><b>vaddlvq_s8</b></b> (int8x16_t a)<span class="right">Signed add long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlv-signed-add-long-across-vector">SADDLV</a> Hd,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlv_s16" type="checkbox"><label for="vaddlv_s16"><div>int32_t <b><b>vaddlv_s16</b></b> (int16x4_t a)<span class="right">Signed add long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlv-signed-add-long-across-vector">SADDLV</a> Sd,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlvq_s16" type="checkbox"><label for="vaddlvq_s16"><div>int32_t <b><b>vaddlvq_s16</b></b> (int16x8_t a)<span class="right">Signed add long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlv-signed-add-long-across-vector">SADDLV</a> Sd,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlv_s32" type="checkbox"><label for="vaddlv_s32"><div>int64_t <b><b>vaddlv_s32</b></b> (int32x2_t a)<span class="right">Signed add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long Pairwise. This instruction adds pairs of adjacent signed integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlp-signed-add-long-pairwise">SADDLP</a> Vd.1D,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlvq_s32" type="checkbox"><label for="vaddlvq_s32"><div>int64_t <b><b>vaddlvq_s32</b></b> (int32x4_t a)<span class="right">Signed add long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Add Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/saddlv-signed-add-long-across-vector">SADDLV</a> Dd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlv_u8" type="checkbox"><label for="vaddlv_u8"><div>uint16_t <b><b>vaddlv_u8</b></b> (uint8x8_t a)<span class="right">Unsigned sum long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned sum Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlv-unsigned-sum-long-across-vector">UADDLV</a> Hd,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlvq_u8" type="checkbox"><label for="vaddlvq_u8"><div>uint16_t <b><b>vaddlvq_u8</b></b> (uint8x16_t a)<span class="right">Unsigned sum long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned sum Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlv-unsigned-sum-long-across-vector">UADDLV</a> Hd,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlv_u16" type="checkbox"><label for="vaddlv_u16"><div>uint32_t <b><b>vaddlv_u16</b></b> (uint16x4_t a)<span class="right">Unsigned sum long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned sum Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlv-unsigned-sum-long-across-vector">UADDLV</a> Sd,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlvq_u16" type="checkbox"><label for="vaddlvq_u16"><div>uint32_t <b><b>vaddlvq_u16</b></b> (uint16x8_t a)<span class="right">Unsigned sum long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned sum Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlv-unsigned-sum-long-across-vector">UADDLV</a> Sd,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlv_u32" type="checkbox"><label for="vaddlv_u32"><div>uint64_t <b><b>vaddlv_u32</b></b> (uint32x2_t a)<span class="right">Unsigned add long pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Add Long Pairwise. This instruction adds pairs of adjacent unsigned integer values from the vector in the source SIMD&amp;FP register, places the result into a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the source vector elements.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlp-unsigned-add-long-pairwise">UADDLP</a> Vd.1D,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+
+bits(2*esize) sum;
+integer op1;
+integer op2;
+
+result = if acc then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d] else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+for e = 0 to elements-1
+ op1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+0, esize], unsigned);
+ op2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 2*e+1, esize], unsigned);
+ sum = (op1+op2)&lt;2*esize-1:0&gt;;
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[result, e, 2*esize] + sum;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vaddlvq_u32" type="checkbox"><label for="vaddlvq_u32"><div>uint64_t <b><b>vaddlvq_u32</b></b> (uint32x4_t a)<span class="right">Unsigned sum long across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned sum Long across Vector. This instruction adds every vector element in the source SIMD&amp;FP register together, and writes the scalar result to the destination SIMD&amp;FP register. The destination scalar is twice as long as the source vector elements. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uaddlv-unsigned-sum-long-across-vector">UADDLV</a> Dd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer sum;
+
+sum = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ sum = sum + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = sum&lt;2*esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxv_s8" type="checkbox"><label for="vmaxv_s8"><div>int8_t <b><b>vmaxv_s8</b></b> (int8x8_t a)<span class="right">Signed maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxv-signed-maximum-across-vector">SMAXV</a> Bd,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxvq_s8" type="checkbox"><label for="vmaxvq_s8"><div>int8_t <b><b>vmaxvq_s8</b></b> (int8x16_t a)<span class="right">Signed maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxv-signed-maximum-across-vector">SMAXV</a> Bd,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxv_s16" type="checkbox"><label for="vmaxv_s16"><div>int16_t <b><b>vmaxv_s16</b></b> (int16x4_t a)<span class="right">Signed maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxv-signed-maximum-across-vector">SMAXV</a> Hd,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxvq_s16" type="checkbox"><label for="vmaxvq_s16"><div>int16_t <b><b>vmaxvq_s16</b></b> (int16x8_t a)<span class="right">Signed maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxv-signed-maximum-across-vector">SMAXV</a> Hd,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxv_s32" type="checkbox"><label for="vmaxv_s32"><div>int32_t <b><b>vmaxv_s32</b></b> (int32x2_t a)<span class="right">Signed maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxp-signed-maximum-pairwise">SMAXP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+a &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.S[0] &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxvq_s32" type="checkbox"><label for="vmaxvq_s32"><div>int32_t <b><b>vmaxvq_s32</b></b> (int32x4_t a)<span class="right">Signed maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smaxv-signed-maximum-across-vector">SMAXV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxv_u8" type="checkbox"><label for="vmaxv_u8"><div>uint8_t <b><b>vmaxv_u8</b></b> (uint8x8_t a)<span class="right">Unsigned maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxv-unsigned-maximum-across-vector">UMAXV</a> Bd,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxvq_u8" type="checkbox"><label for="vmaxvq_u8"><div>uint8_t <b><b>vmaxvq_u8</b></b> (uint8x16_t a)<span class="right">Unsigned maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxv-unsigned-maximum-across-vector">UMAXV</a> Bd,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxv_u16" type="checkbox"><label for="vmaxv_u16"><div>uint16_t <b><b>vmaxv_u16</b></b> (uint16x4_t a)<span class="right">Unsigned maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxv-unsigned-maximum-across-vector">UMAXV</a> Hd,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxvq_u16" type="checkbox"><label for="vmaxvq_u16"><div>uint16_t <b><b>vmaxvq_u16</b></b> (uint16x8_t a)<span class="right">Unsigned maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxv-unsigned-maximum-across-vector">UMAXV</a> Hd,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxv_u32" type="checkbox"><label for="vmaxv_u32"><div>uint32_t <b><b>vmaxv_u32</b></b> (uint32x2_t a)<span class="right">Unsigned maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Maximum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxp-unsigned-maximum-pairwise">UMAXP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+a &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.S[0] &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxvq_u32" type="checkbox"><label for="vmaxvq_u32"><div>uint32_t <b><b>vmaxvq_u32</b></b> (uint32x4_t a)<span class="right">Unsigned maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umaxv-unsigned-maximum-across-vector">UMAXV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxv_f32" type="checkbox"><label for="vmaxv_f32"><div>float32_t <b><b>vmaxv_f32</b></b> (float32x2_t a)<span class="right">Floating-point maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the larger of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxp-vector-floating-point-maximum-pairwise-vector">FMAXP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxvq_f32" type="checkbox"><label for="vmaxvq_f32"><div>float32_t <b><b>vmaxvq_f32</b></b> (float32x4_t a)<span class="right">Floating-point maximum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxv-floating-point-maximum-across-vector">FMAXV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_FMAX" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_FMAX</a>, operand, esize);</pre>
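+<p class="aml">Here the pseudocode delegates to the generic Reduce helper with ReduceOp_FMAX, effectively a recursive halving of the four lanes. A minimal usage sketch in C (assuming &lt;arm_neon.h&gt;; the wrapper is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+float max_of_four(float a, float b, float c, float d)
+{
+    float buf[4] = { a, b, c, d };
+    return vmaxvq_f32(vld1q_f32(buf));  /* FMAXV Sd,Vn.4S */
+}</pre>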
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxvq_f64" type="checkbox"><label for="vmaxvq_f64"><div>float64_t <b><b>vmaxvq_f64</b></b> (float64x2_t a)<span class="right">Floating-point maximum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the larger of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxp-vector-floating-point-maximum-pairwise-vector">FMAXP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminv_s8" type="checkbox"><label for="vminv_s8"><div>int8_t <b><b>vminv_s8</b></b> (int8x8_t a)<span class="right">Signed minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminv-signed-minimum-across-vector">SMINV</a> Bd,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminvq_s8" type="checkbox"><label for="vminvq_s8"><div>int8_t <b><b>vminvq_s8</b></b> (int8x16_t a)<span class="right">Signed minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminv-signed-minimum-across-vector">SMINV</a> Bd,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminv_s16" type="checkbox"><label for="vminv_s16"><div>int16_t <b><b>vminv_s16</b></b> (int16x4_t a)<span class="right">Signed minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminv-signed-minimum-across-vector">SMINV</a> Hd,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminvq_s16" type="checkbox"><label for="vminvq_s16"><div>int16_t <b><b>vminvq_s16</b></b> (int16x8_t a)<span class="right">Signed minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminv-signed-minimum-across-vector">SMINV</a> Hd,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminv_s32" type="checkbox"><label for="vminv_s32"><div>int32_t <b><b>vminv_s32</b></b> (int32x2_t a)<span class="right">Signed minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of signed integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminp-signed-minimum-pairwise">SMINP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+a &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.S[0] &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminvq_s32" type="checkbox"><label for="vminvq_s32"><div>int32_t <b><b>vminvq_s32</b></b> (int32x4_t a)<span class="right">Signed minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are signed integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sminv-signed-minimum-across-vector">SMINV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminv_u8" type="checkbox"><label for="vminv_u8"><div>uint8_t <b><b>vminv_u8</b></b> (uint8x8_t a)<span class="right">Unsigned minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminv-unsigned-minimum-across-vector">UMINV</a> Bd,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminvq_u8" type="checkbox"><label for="vminvq_u8"><div>uint8_t <b><b>vminvq_u8</b></b> (uint8x16_t a)<span class="right">Unsigned minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminv-unsigned-minimum-across-vector">UMINV</a> Bd,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Bd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminv_u16" type="checkbox"><label for="vminv_u16"><div>uint16_t <b><b>vminv_u16</b></b> (uint16x4_t a)<span class="right">Unsigned minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminv-unsigned-minimum-across-vector">UMINV</a> Hd,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminvq_u16" type="checkbox"><label for="vminvq_u16"><div>uint16_t <b><b>vminvq_u16</b></b> (uint16x8_t a)<span class="right">Unsigned minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminv-unsigned-minimum-across-vector">UMINV</a> Hd,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminv_u32" type="checkbox"><label for="vminv_u32"><div>uint32_t <b><b>vminv_u32</b></b> (uint32x2_t a)<span class="right">Unsigned minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum Pairwise. This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of unsigned integer values into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminp-unsigned-minimum-pairwise">UMINP</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+a &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.S[0] &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+integer element1;
+integer element2;
+integer maxmin;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize], unsigned);
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize], unsigned);
+ maxmin = if minimum then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(element1, element2) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(element1, element2);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = maxmin&lt;esize-1:0&gt;;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminvq_u32" type="checkbox"><label for="vminvq_u32"><div>uint32_t <b><b>vminvq_u32</b></b> (uint32x4_t a)<span class="right">Unsigned minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are unsigned integer values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uminv-unsigned-minimum-across-vector">UMINV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+integer maxmin;
+integer element;
+
+maxmin = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, 0, esize], unsigned);
+for e = 1 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Int.2" title="function: integer Int(bits(N) x, boolean unsigned)">Int</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize], unsigned);
+ maxmin = if min then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Min.2" title="function: integer Min(integer a, integer b)">Min</a>(maxmin, element) else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Max.2" title="function: integer Max(integer a, integer b)">Max</a>(maxmin, element);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = maxmin&lt;esize-1:0&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminv_f32" type="checkbox"><label for="vminv_f32"><div>float32_t <b><b>vminv_f32</b></b> (float32x2_t a)<span class="right">Floating-point minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the smaller of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminp-vector-floating-point-minimum-pairwise-vector">FMINP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminvq_f32" type="checkbox"><label for="vminvq_f32"><div>float32_t <b><b>vminvq_f32</b></b> (float32x4_t a)<span class="right">Floating-point minimum across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminv-floating-point-minimum-across-vector">FMINV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_FMIN" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_FMIN</a>, operand, esize);</pre>
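+ <h4>Example</h4> <p>Here the reduction runs over four single-precision lanes via FMINV. A non-normative C sketch, assuming arm_neon.h on AArch64:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float data[4] = {4.0f, -1.0f, 2.5f, 0.0f};
+    float32x4_t v = vld1q_f32(data);
+    float m = vminvq_f32(v); /* FMINV Sd, Vn.4S */
+    printf("min = %f\n", m); /* prints: min = -1.000000 */
+    return 0;
+}</pre>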
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminvq_f64" type="checkbox"><label for="vminvq_f64"><div>float64_t <b><b>vminvq_f64</b></b> (float64x2_t a)<span class="right">Floating-point minimum pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements from the concatenated vector, writes the smaller of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminp-vector-floating-point-minimum-pairwise-vector">FMINP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMin.3" title="function: bits(N) FPMin(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMin</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMax.3" title="function: bits(N) FPMax(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMax</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxnmv_f32" type="checkbox"><label for="vmaxnmv_f32"><div>float32_t <b><b>vmaxnmv_f32</b></b> (float32x2_t a)<span class="right">Floating-point maximum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnmp-vector-floating-point-maximum-number-pairwise-vector">FMAXNMP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxnmvq_f32" type="checkbox"><label for="vmaxnmvq_f32"><div>float32_t <b><b>vmaxnmvq_f32</b></b> (float32x4_t a)<span class="right">Floating-point maximum number across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the largest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnmv-floating-point-maximum-number-across-vector">FMAXNMV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_FMAXNUM" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_FMAXNUM</a>, operand, esize);</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vmaxnmvq_f64" type="checkbox"><label for="vmaxnmvq_f64"><div>float64_t <b><b>vmaxnmvq_f64</b></b> (float64x2_t a)<span class="right">Floating-point maximum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Maximum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the largest of each pair of values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmaxnmp-vector-floating-point-maximum-number-pairwise-vector">FMAXNMP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminnmv_f32" type="checkbox"><label for="vminnmv_f32"><div>float32_t <b><b>vminnmv_f32</b></b> (float32x2_t a)<span class="right">Floating-point minimum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnmp-vector-floating-point-minimum-number-pairwise-vector">FMINNMP</a> Sd,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminnmvq_f32" type="checkbox"><label for="vminnmvq_f32"><div>float32_t <b><b>vminnmvq_f32</b></b> (float32x4_t a)<span class="right">Floating-point minimum number across vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number across Vector. This instruction compares all the vector elements in the source SIMD&amp;FP register, and writes the smallest of the values as a scalar to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnmv-floating-point-minimum-number-across-vector">FMINNMV</a> Sd,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.Reduce.3" title="function: bits(esize) Reduce(ReduceOp op, bits(N) input, integer esize)">Reduce</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#ReduceOp_FMINNUM" title="enumeration ReduceOp {ReduceOp_FMINNUM, ReduceOp_FMAXNUM,
+ ReduceOp_FMIN, ReduceOp_FMAX,
+ ReduceOp_FADD, ReduceOp_ADD}">ReduceOp_FMINNUM</a>, operand, esize);</pre>
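+ <h4>Example</h4> <p>The minNum reduction likewise ignores quiet NaN lanes and returns the smallest ordinary number. A non-normative C sketch, assuming arm_neon.h and math.h on AArch64:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;math.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float data[4] = {2.0f, NAN, -7.5f, 0.25f};
+    float32x4_t v = vld1q_f32(data);
+    /* The quiet NaN lane is discarded by the FMINNMV reduction. */
+    float m = vminnmvq_f32(v);
+    printf("min = %f\n", m); /* prints: min = -7.500000 */
+    return 0;
+}</pre>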
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vminnmvq_f64" type="checkbox"><label for="vminnmvq_f64"><div>float64_t <b><b>vminnmvq_f64</b></b> (float64x2_t a)<span class="right">Floating-point minimum number pairwise</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Minimum Number Pairwise (vector). This instruction creates a vector by concatenating the vector elements of the first source SIMD&amp;FP register after the vector elements of the second source SIMD&amp;FP register, reads each pair of adjacent vector elements in the two source SIMD&amp;FP registers, writes the smallest of each pair of floating-point values into a vector, and writes the vector to the destination SIMD&amp;FP register. All the values in this instruction are floating-point values.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fminnmp-vector-floating-point-minimum-number-pairwise-vector">FMINNMP</a> Dd,Vn.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+bits(2*datasize) concat = operand2:operand1;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ if pair then
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, 2*e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[concat, (2*e)+1, esize];
+ else
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+
+ if minimum then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMinNum.3" title="function: bits(N) FPMinNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMinNum</a>(element1, element2, FPCR);
+ else
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMaxNum.3" title="function: bits(N) FPMaxNum(bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMaxNum</a>(element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vext_s8" type="checkbox"><label for="vext_s8"><div>int8x8_t <b><b>vext_s8</b></b> (int8x8_t a, int8x8_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_s8" type="checkbox"><label for="vextq_s8"><div>int8x16_t <b><b>vextq_s8</b></b> (int8x16_t a, int8x16_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_s16" type="checkbox"><label for="vext_s16"><div>int16x4_t <b><b>vext_s16</b></b> (int16x4_t a, int16x4_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;1)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 3 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_s16" type="checkbox"><label for="vextq_s16"><div>int16x8_t <b><b>vextq_s16</b></b> (int16x8_t a, int16x8_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;1)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_s32" type="checkbox"><label for="vext_s32"><div>int32x2_t <b><b>vext_s32</b></b> (int32x2_t a, int32x2_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;2)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 1 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_s32" type="checkbox"><label for="vextq_s32"><div>int32x4_t <b><b>vextq_s32</b></b> (int32x4_t a, int32x4_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;2)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 3 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_s64" type="checkbox"><label for="vext_s64"><div>int64x1_t <b><b>vext_s64</b></b> (int64x1_t a, int64x1_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;3)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 0 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_s64" type="checkbox"><label for="vextq_s64"><div>int64x2_t <b><b>vextq_s64</b></b> (int64x2_t a, int64x2_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;3)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 1 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_u8" type="checkbox"><label for="vext_u8"><div>uint8x8_t <b><b>vext_u8</b></b> (uint8x8_t a, uint8x8_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_u8" type="checkbox"><label for="vextq_u8"><div>uint8x16_t <b><b>vextq_u8</b></b> (uint8x16_t a, uint8x16_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_u16" type="checkbox"><label for="vext_u16"><div>uint16x4_t <b><b>vext_u16</b></b> (uint16x4_t a, uint16x4_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;1)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 3 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_u16" type="checkbox"><label for="vextq_u16"><div>uint16x8_t <b><b>vextq_u16</b></b> (uint16x8_t a, uint16x8_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;1)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_u32" type="checkbox"><label for="vext_u32"><div>uint32x2_t <b><b>vext_u32</b></b> (uint32x2_t a, uint32x2_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;2)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 1 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_u32" type="checkbox"><label for="vextq_u32"><div>uint32x4_t <b><b>vextq_u32</b></b> (uint32x4_t a, uint32x4_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;2)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 3 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_u64" type="checkbox"><label for="vext_u64"><div>uint64x1_t <b><b>vext_u64</b></b> (uint64x1_t a, uint64x1_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;3)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 0 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_u64" type="checkbox"><label for="vextq_u64"><div>uint64x2_t <b><b>vextq_u64</b></b> (uint64x2_t a, uint64x2_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;3)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 1 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_p64" type="checkbox"><label for="vext_p64"><div>poly64x1_t <b><b>vext_p64</b></b> (poly64x1_t a, poly64x1_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;3)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 0 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_p64" type="checkbox"><label for="vextq_p64"><div>poly64x2_t <b><b>vextq_p64</b></b> (poly64x2_t a, poly64x2_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;3)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 1 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_f32" type="checkbox"><label for="vext_f32"><div>float32x2_t <b><b>vext_f32</b></b> (float32x2_t a, float32x2_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;2)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 1 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_f32" type="checkbox"><label for="vextq_f32"><div>float32x4_t <b><b>vextq_f32</b></b> (float32x4_t a, float32x4_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;2)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 3 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_f64" type="checkbox"><label for="vext_f64"><div>float64x1_t <b><b>vext_f64</b></b> (float64x1_t a, float64x1_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;3)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 0 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vextq_f64" type="checkbox"><label for="vextq_f64"><div>float64x2_t <b><b>vextq_f64</b></b> (float64x2_t a, float64x2_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;3)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 1 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vext_p8" type="checkbox"><label for="vext_p8"><div>poly8x8_t <b><b>vext_p8</b></b> (poly8x8_t a, poly8x8_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register vector. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_p8" type="checkbox"><label for="vextq_p8"><div>poly8x16_t <b><b>vextq_p8</b></b> (poly8x16_t a, poly8x16_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#n
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vext_p16" type="checkbox"><label for="vext_p16"><div>poly16x4_t <b><b>vext_p16</b></b> (poly16x4_t a, poly16x4_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.8B,Vn.8B,Vm.8B,#(n&lt;&lt;1)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B <br />
+0 &le; n &le; 3 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vextq_p16" type="checkbox"><label for="vextq_p16"><div>poly16x8_t <b><b>vextq_p16</b></b> (poly16x8_t a, poly16x8_t b, const int n)<span class="right">Extract vector from pair of vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Extract vector from pair of vectors. This instruction extracts the lowest vector elements from the second source SIMD&amp;FP register and the highest vector elements from the first source SIMD&amp;FP register, concatenates the results into a vector, and writes the vector to the destination SIMD&amp;FP register. The index value specifies the lowest vector element to extract from the first source register, and consecutive elements are extracted from the first, then second, source registers until the destination vector is filled.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ext-extract-vector-from-pair-of-vectors">EXT</a> Vd.16B,Vn.16B,Vm.16B,#(n&lt;&lt;1)
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B <br />
+0 &le; n &le; 7 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) hi = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) lo = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize*2) concat = hi:lo;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = concat&lt;position+datasize-1:position&gt;;</pre>
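+ <h4>Example</h4> <p>A minimal C sketch (an editor's illustration, not part of the ARM reference) showing that the index counts 16-bit elements, with the assembler immediate scaled to bytes as <code>n&lt;&lt;1</code>; it assumes an AArch64 compiler providing <code>arm_neon.h</code>:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    poly16_t ad[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    poly16_t bd[8] = {8, 9, 10, 11, 12, 13, 14, 15};
+    poly16_t rd[8];
+    /* n = 3: result = [a[3]..a[7], b[0]..b[2]] */
+    poly16x8_t r = vextq_p16(vld1q_p16(ad), vld1q_p16(bd), 3);
+    vst1q_p16(rd, r);
+    for (int i = 0; i &lt; 8; i++) printf("%u ", (unsigned)rd[i]);
+    printf("\n");                  /* prints: 3 4 5 6 7 8 9 10 */
+    return 0;
+}</pre>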
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_s8" type="checkbox"><label for="vrev64_s8"><div>int8x8_t <b><b>vrev64_s8</b></b> (int8x8_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
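+ <h4>Example</h4> <p>A minimal C sketch (an editor's illustration, not part of the ARM reference): the eight bytes form a single 64-bit doubleword, so the whole vector is reversed; it assumes an AArch64 compiler providing <code>arm_neon.h</code>:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int8_t out[8];
+    /* Byte order is reversed within the one 64-bit container */
+    vst1_s8(out, vrev64_s8(vld1_s8(in)));
+    for (int i = 0; i &lt; 8; i++) printf("%d ", out[i]);
+    printf("\n");                  /* prints: 7 6 5 4 3 2 1 0 */
+    return 0;
+}</pre>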
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_s8" type="checkbox"><label for="vrev64q_s8"><div>int8x16_t <b><b>vrev64q_s8</b></b> (int8x16_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_s16" type="checkbox"><label for="vrev64_s16"><div>int16x4_t <b><b>vrev64_s16</b></b> (int16x4_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_s16" type="checkbox"><label for="vrev64q_s16"><div>int16x8_t <b><b>vrev64q_s16</b></b> (int16x8_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_s32" type="checkbox"><label for="vrev64_s32"><div>int32x2_t <b><b>vrev64_s32</b></b> (int32x2_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_s32" type="checkbox"><label for="vrev64q_s32"><div>int32x4_t <b><b>vrev64q_s32</b></b> (int32x4_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_u8" type="checkbox"><label for="vrev64_u8"><div>uint8x8_t <b><b>vrev64_u8</b></b> (uint8x8_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_u8" type="checkbox"><label for="vrev64q_u8"><div>uint8x16_t <b><b>vrev64q_u8</b></b> (uint8x16_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_u16" type="checkbox"><label for="vrev64_u16"><div>uint16x4_t <b><b>vrev64_u16</b></b> (uint16x4_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_u16" type="checkbox"><label for="vrev64q_u16"><div>uint16x8_t <b><b>vrev64q_u16</b></b> (uint16x8_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_u32" type="checkbox"><label for="vrev64_u32"><div>uint32x2_t <b><b>vrev64_u32</b></b> (uint32x2_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_u32" type="checkbox"><label for="vrev64q_u32"><div>uint32x4_t <b><b>vrev64q_u32</b></b> (uint32x4_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_f32" type="checkbox"><label for="vrev64_f32"><div>float32x2_t <b><b>vrev64_f32</b></b> (float32x2_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.2S,Vn.2S
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_f32" type="checkbox"><label for="vrev64q_f32"><div>float32x4_t <b><b>vrev64q_f32</b></b> (float32x4_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_p8" type="checkbox"><label for="vrev64_p8"><div>poly8x8_t <b><b>vrev64_p8</b></b> (poly8x8_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_p8" type="checkbox"><label for="vrev64q_p8"><div>poly8x16_t <b><b>vrev64q_p8</b></b> (poly8x16_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64_p16" type="checkbox"><label for="vrev64_p16"><div>poly16x4_t <b><b>vrev64_p16</b></b> (poly16x4_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev64q_p16" type="checkbox"><label for="vrev64q_p16"><div>poly16x8_t <b><b>vrev64q_p16</b></b> (poly16x8_t vec)<span class="right">Reverse elements in 64-bit doublewords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 64-bit doublewords (vector). This instruction reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev64-reverse-elements-in-64-bit-doublewords-vector">REV64</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32_s8" type="checkbox"><label for="vrev32_s8"><div>int8x8_t <b><b>vrev32_s8</b></b> (int8x8_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32q_s8" type="checkbox"><label for="vrev32q_s8"><div>int8x16_t <b><b>vrev32q_s8</b></b> (int8x16_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32_s16" type="checkbox"><label for="vrev32_s16"><div>int16x4_t <b><b>vrev32_s16</b></b> (int16x4_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32q_s16" type="checkbox"><label for="vrev32q_s16"><div>int16x8_t <b><b>vrev32q_s16</b></b> (int16x8_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32_u8" type="checkbox"><label for="vrev32_u8"><div>uint8x8_t <b><b>vrev32_u8</b></b> (uint8x8_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32q_u8" type="checkbox"><label for="vrev32q_u8"><div>uint8x16_t <b><b>vrev32q_u8</b></b> (uint8x16_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32_u16" type="checkbox"><label for="vrev32_u16"><div>uint16x4_t <b><b>vrev32_u16</b></b> (uint16x4_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32q_u16" type="checkbox"><label for="vrev32q_u16"><div>uint16x8_t <b><b>vrev32q_u16</b></b> (uint16x8_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32_p8" type="checkbox"><label for="vrev32_p8"><div>poly8x8_t <b><b>vrev32_p8</b></b> (poly8x8_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32q_p8" type="checkbox"><label for="vrev32q_p8"><div>poly8x16_t <b><b>vrev32q_p8</b></b> (poly8x16_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32_p16" type="checkbox"><label for="vrev32_p16"><div>poly16x4_t <b><b>vrev32_p16</b></b> (poly16x4_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.4H,Vn.4H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev32q_p16" type="checkbox"><label for="vrev32q_p16"><div>poly16x8_t <b><b>vrev32q_p16</b></b> (poly16x8_t vec)<span class="right">Reverse elements in 32-bit words</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 32-bit words (vector). This instruction reverses the order of 8-bit or 16-bit elements in each word of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev32-vector-reverse-elements-in-32-bit-words-vector">REV32</a> Vd.8H,Vn.8H
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev16_s8" type="checkbox"><label for="vrev16_s8"><div>int8x8_t <b><b>vrev16_s8</b></b> (int8x8_t vec)<span class="right">Reverse elements in 16-bit halfwords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 16-bit halfwords (vector). This instruction reverses the order of 8-bit elements in each halfword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev16-vector-reverse-elements-in-16-bit-halfwords-vector">REV16</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev16q_s8" type="checkbox"><label for="vrev16q_s8"><div>int8x16_t <b><b>vrev16q_s8</b></b> (int8x16_t vec)<span class="right">Reverse elements in 16-bit halfwords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 16-bit halfwords (vector). This instruction reverses the order of 8-bit elements in each halfword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev16-vector-reverse-elements-in-16-bit-halfwords-vector">REV16</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev16_u8" type="checkbox"><label for="vrev16_u8"><div>uint8x8_t <b><b>vrev16_u8</b></b> (uint8x8_t vec)<span class="right">Reverse elements in 16-bit halfwords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 16-bit halfwords (vector). This instruction reverses the order of 8-bit elements in each halfword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev16-vector-reverse-elements-in-16-bit-halfwords-vector">REV16</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev16q_u8" type="checkbox"><label for="vrev16q_u8"><div>uint8x16_t <b><b>vrev16q_u8</b></b> (uint8x16_t vec)<span class="right">Reverse elements in 16-bit halfwords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 16-bit halfwords (vector). This instruction reverses the order of 8-bit elements in each halfword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev16-vector-reverse-elements-in-16-bit-halfwords-vector">REV16</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev16_p8" type="checkbox"><label for="vrev16_p8"><div>poly8x8_t <b><b>vrev16_p8</b></b> (poly8x8_t vec)<span class="right">Reverse elements in 16-bit halfwords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 16-bit halfwords (vector). This instruction reverses the order of 8-bit elements in each halfword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev16-vector-reverse-elements-in-16-bit-halfwords-vector">REV16</a> Vd.8B,Vn.8B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vrev16q_p8" type="checkbox"><label for="vrev16q_p8"><div>poly8x16_t <b><b>vrev16q_p8</b></b> (poly8x16_t vec)<span class="right">Reverse elements in 16-bit halfwords</span></div></label><article> <h4>Description</h4><p><p class="aml">Reverse elements in 16-bit halfwords (vector). This instruction reverses the order of 8-bit elements in each halfword of the vector in the source SIMD&amp;FP register, places the results into a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/rev16-vector-reverse-elements-in-16-bit-halfwords-vector">REV16</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>vec &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+integer element = 0;
+integer rev_element;
+for c = 0 to containers-1
+ rev_element = element + elements_per_container - 1;
+ for e = 0 to elements_per_container-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, rev_element, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, element, esize];
+ element = element + 1;
+ rev_element = rev_element - 1;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip1_s8" type="checkbox"><label for="vzip1_s8"><div>int8x8_t <b><b>vzip1_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_s8" type="checkbox"><label for="vzip1q_s8"><div>int8x16_t <b><b>vzip1q_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1_s16" type="checkbox"><label for="vzip1_s16"><div>int16x4_t <b><b>vzip1_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_s16" type="checkbox"><label for="vzip1q_s16"><div>int16x8_t <b><b>vzip1q_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1_s32" type="checkbox"><label for="vzip1_s32"><div>int32x2_t <b><b>vzip1_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_s32" type="checkbox"><label for="vzip1q_s32"><div>int32x4_t <b><b>vzip1q_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_s64" type="checkbox"><label for="vzip1q_s64"><div>int64x2_t <b><b>vzip1q_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1_u8" type="checkbox"><label for="vzip1_u8"><div>uint8x8_t <b><b>vzip1_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_u8" type="checkbox"><label for="vzip1q_u8"><div>uint8x16_t <b><b>vzip1q_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1_u16" type="checkbox"><label for="vzip1_u16"><div>uint16x4_t <b><b>vzip1_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_u16" type="checkbox"><label for="vzip1q_u16"><div>uint16x8_t <b><b>vzip1q_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1_u32" type="checkbox"><label for="vzip1_u32"><div>uint32x2_t <b><b>vzip1_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_u32" type="checkbox"><label for="vzip1q_u32"><div>uint32x4_t <b><b>vzip1q_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_u64" type="checkbox"><label for="vzip1q_u64"><div>uint64x2_t <b><b>vzip1q_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_p64" type="checkbox"><label for="vzip1q_p64"><div>poly64x2_t <b><b>vzip1q_p64</b></b> (poly64x2_t a, poly64x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1_f32" type="checkbox"><label for="vzip1_f32"><div>float32x2_t <b><b>vzip1_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_f32" type="checkbox"><label for="vzip1q_f32"><div>float32x4_t <b><b>vzip1q_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_f64" type="checkbox"><label for="vzip1q_f64"><div>float64x2_t <b><b>vzip1q_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1_p8" type="checkbox"><label for="vzip1_p8"><div>poly8x8_t <b><b>vzip1_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
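+<p>A minimal usage sketch (an assumption-laden illustration, not part of the ARM reference): it assumes an AArch64 toolchain with arm_neon.h, and uses the standard ACLE helpers vld1_u8, vst1_u8 and the p8/u8 vreinterpret casts only to move printable bytes in and out of the poly registers:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint8_t lo[8] = {0, 1, 2, 3, 4, 5, 6, 7};         /* a -&gt; Vn.8B */
+    uint8_t hi[8] = {10, 11, 12, 13, 14, 15, 16, 17}; /* b -&gt; Vm.8B */
+    poly8x8_t a = vreinterpret_p8_u8(vld1_u8(lo));
+    poly8x8_t b = vreinterpret_p8_u8(vld1_u8(hi));
+    /* ZIP1 interleaves the lower halves of a and b:
+       result lanes = a0,b0,a1,b1,a2,b2,a3,b3 */
+    poly8x8_t r = vzip1_p8(a, b);
+    uint8_t out[8];
+    vst1_u8(out, vreinterpret_u8_p8(r));
+    for (int i = 0; i &lt; 8; i++)
+        printf("%u ", out[i]);                        /* 0 10 1 11 2 12 3 13 */
+    printf("\n");
+    return 0;
+}</pre>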
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_p8" type="checkbox"><label for="vzip1q_p8"><div>poly8x16_t <b><b>vzip1q_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1_p16" type="checkbox"><label for="vzip1_p16"><div>poly16x4_t <b><b>vzip1_p16</b></b> (poly16x4_t a, poly16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip1q_p16" type="checkbox"><label for="vzip1q_p16"><div>poly16x8_t <b><b>vzip1q_p16</b></b> (poly16x8_t a, poly16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (primary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_s8" type="checkbox"><label for="vzip2_s8"><div>int8x8_t <b><b>vzip2_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
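+<p>A companion sketch for the secondary form (same assumptions as the ZIP1 example above): vzip2_s8 interleaves the upper halves, so with the inputs below the low four lanes of each source do not appear in the result:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t lo[8] = {0, 1, 2, 3, 4, 5, 6, 7};         /* a -&gt; Vn.8B */
+    int8_t hi[8] = {10, 11, 12, 13, 14, 15, 16, 17}; /* b -&gt; Vm.8B */
+    int8x8_t a = vld1_s8(lo);
+    int8x8_t b = vld1_s8(hi);
+    /* ZIP2 interleaves the upper halves of a and b:
+       result lanes = a4,b4,a5,b5,a6,b6,a7,b7 */
+    int8x8_t r = vzip2_s8(a, b);
+    int8_t out[8];
+    vst1_s8(out, r);
+    for (int i = 0; i &lt; 8; i++)
+        printf("%d ", out[i]);                       /* 4 14 5 15 6 16 7 17 */
+    printf("\n");
+    return 0;
+}</pre>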
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_s8" type="checkbox"><label for="vzip2q_s8"><div>int8x16_t <b><b>vzip2q_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_s16" type="checkbox"><label for="vzip2_s16"><div>int16x4_t <b><b>vzip2_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_s16" type="checkbox"><label for="vzip2q_s16"><div>int16x8_t <b><b>vzip2q_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_s32" type="checkbox"><label for="vzip2_s32"><div>int32x2_t <b><b>vzip2_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_s32" type="checkbox"><label for="vzip2q_s32"><div>int32x4_t <b><b>vzip2q_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_s64" type="checkbox"><label for="vzip2q_s64"><div>int64x2_t <b><b>vzip2q_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_u8" type="checkbox"><label for="vzip2_u8"><div>uint8x8_t <b><b>vzip2_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_u8" type="checkbox"><label for="vzip2q_u8"><div>uint8x16_t <b><b>vzip2q_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_u16" type="checkbox"><label for="vzip2_u16"><div>uint16x4_t <b><b>vzip2_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_u16" type="checkbox"><label for="vzip2q_u16"><div>uint16x8_t <b><b>vzip2q_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_u32" type="checkbox"><label for="vzip2_u32"><div>uint32x2_t <b><b>vzip2_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_u32" type="checkbox"><label for="vzip2q_u32"><div>uint32x4_t <b><b>vzip2q_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
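+ <h4>Usage Example</h4> <p>A minimal, self-contained C sketch (an editorial addition, not part of the ARM reference) showing the interleaving behaviour of this intrinsic; it assumes an AArch64 target with arm_neon.h available.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    uint32_t lo[4] = {0, 1, 2, 3}, hi[4] = {4, 5, 6, 7}, out[4];
+    uint32x4_t a = vld1q_u32(lo);
+    uint32x4_t b = vld1q_u32(hi);
+    uint32x4_t r = vzip2q_u32(a, b); /* interleaves the upper halves of a and b */
+    vst1q_u32(out, r);
+    for (int i = 0; i != 4; ++i) printf("%u ", out[i]); /* prints: 2 6 3 7 */
+    return 0;
+}</pre>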
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_u64" type="checkbox"><label for="vzip2q_u64"><div>uint64x2_t <b>vzip2q_u64</b> (uint64x2_t a, uint64x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_p64" type="checkbox"><label for="vzip2q_p64"><div>poly64x2_t <b>vzip2q_p64</b> (poly64x2_t a, poly64x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_f32" type="checkbox"><label for="vzip2_f32"><div>float32x2_t <b>vzip2_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_f32" type="checkbox"><label for="vzip2q_f32"><div>float32x4_t <b>vzip2q_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_f64" type="checkbox"><label for="vzip2q_f64"><div>float64x2_t <b>vzip2q_f64</b> (float64x2_t a, float64x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_p8" type="checkbox"><label for="vzip2_p8"><div>poly8x8_t <b>vzip2_p8</b> (poly8x8_t a, poly8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_p8" type="checkbox"><label for="vzip2q_p8"><div>poly8x16_t <b>vzip2q_p8</b> (poly8x16_t a, poly8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2_p16" type="checkbox"><label for="vzip2_p16"><div>poly16x4_t <b>vzip2_p16</b> (poly16x4_t a, poly16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vzip2q_p16" type="checkbox"><label for="vzip2q_p16"><div>poly16x8_t <b>vzip2q_p16</b> (poly16x8_t a, poly16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the upper half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_s8" type="checkbox"><label for="vuzp1_s8"><div>int8x8_t <b>vuzp1_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
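+ <h4>Usage Example</h4> <p>A minimal C sketch (an editorial addition, not part of the ARM reference) showing that UZP1 gathers the even-numbered elements of both inputs; it assumes an AArch64 target with arm_neon.h available.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t xs[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int8_t ys[8] = {8, 9, 10, 11, 12, 13, 14, 15};
+    int8_t out[8];
+    int8x8_t a = vld1_s8(xs);
+    int8x8_t b = vld1_s8(ys);
+    int8x8_t r = vuzp1_s8(a, b); /* even elements of a, then even elements of b */
+    vst1_s8(out, r);
+    for (int i = 0; i != 8; ++i) printf("%d ", out[i]); /* 0 2 4 6 8 10 12 14 */
+    return 0;
+}</pre>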
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_s8" type="checkbox"><label for="vuzp1q_s8"><div>int8x16_t <b>vuzp1q_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_s16" type="checkbox"><label for="vuzp1_s16"><div>int16x4_t <b>vuzp1_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_s16" type="checkbox"><label for="vuzp1q_s16"><div>int16x8_t <b>vuzp1q_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_s32" type="checkbox"><label for="vuzp1_s32"><div>int32x2_t <b>vuzp1_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_s32" type="checkbox"><label for="vuzp1q_s32"><div>int32x4_t <b>vuzp1q_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_s64" type="checkbox"><label for="vuzp1q_s64"><div>int64x2_t <b>vuzp1q_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_u8" type="checkbox"><label for="vuzp1_u8"><div>uint8x8_t <b>vuzp1_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_u8" type="checkbox"><label for="vuzp1q_u8"><div>uint8x16_t <b><b>vuzp1q_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
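+<p class="aml">As a concrete illustration of the even-element selection described above, here is a minimal Rust sketch using the corresponding stdarch intrinsic from <code>core::arch::aarch64</code>. It is an editorial example rather than part of the ARM reference, and it assumes an AArch64 target (where the NEON feature is enabled by default):</p>
+<pre class="codeblock">// Minimal sketch of UZP1 semantics via the vuzp1_u8 intrinsic.
+// Assumes an aarch64 target; vld1_u8/vst1_u8 are used only to
+// move data between arrays and SIMD registers.
+use core::arch::aarch64::{vld1_u8, vst1_u8, vuzp1_u8};
+
+fn main() {
+    let a: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    let b: [u8; 8] = [8, 9, 10, 11, 12, 13, 14, 15];
+    let mut out = [0u8; 8];
+    unsafe {
+        let r = vuzp1_u8(vld1_u8(a.as_ptr()), vld1_u8(b.as_ptr()));
+        vst1_u8(out.as_mut_ptr(), r);
+    }
+    // Even-numbered lanes of a, then even-numbered lanes of b:
+    assert_eq!(out, [0, 2, 4, 6, 8, 10, 12, 14]);
+}</pre>
+<p class="aml">Applying UZP2 to the same operands yields the odd-numbered lanes, so the UZP1/UZP2 pair together perform a full de-interleave of the two inputs.</p>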
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_u16" type="checkbox"><label for="vuzp1_u16"><div>uint16x4_t <b><b>vuzp1_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_u16" type="checkbox"><label for="vuzp1q_u16"><div>uint16x8_t <b><b>vuzp1q_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_u32" type="checkbox"><label for="vuzp1_u32"><div>uint32x2_t <b><b>vuzp1_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_u32" type="checkbox"><label for="vuzp1q_u32"><div>uint32x4_t <b><b>vuzp1q_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_u64" type="checkbox"><label for="vuzp1q_u64"><div>uint64x2_t <b><b>vuzp1q_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_p64" type="checkbox"><label for="vuzp1q_p64"><div>poly64x2_t <b><b>vuzp1q_p64</b></b> (poly64x2_t a, poly64x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_f32" type="checkbox"><label for="vuzp1_f32"><div>float32x2_t <b><b>vuzp1_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_f32" type="checkbox"><label for="vuzp1q_f32"><div>float32x4_t <b><b>vuzp1q_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_f64" type="checkbox"><label for="vuzp1q_f64"><div>float64x2_t <b><b>vuzp1q_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_p8" type="checkbox"><label for="vuzp1_p8"><div>poly8x8_t <b><b>vuzp1_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_p8" type="checkbox"><label for="vuzp1q_p8"><div>poly8x16_t <b><b>vuzp1q_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1_p16" type="checkbox"><label for="vuzp1_p16"><div>poly16x4_t <b><b>vuzp1_p16</b></b> (poly16x4_t a, poly16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp1q_p16" type="checkbox"><label for="vuzp1q_p16"><div>poly16x8_t <b><b>vuzp1q_p16</b></b> (poly16x8_t a, poly16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_s8" type="checkbox"><label for="vuzp2_s8"><div>int8x8_t <b><b>vuzp2_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_s8" type="checkbox"><label for="vuzp2q_s8"><div>int8x16_t <b><b>vuzp2q_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
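+<p class="aml">A companion Rust sketch for the odd-element selection performed by UZP2, under the same assumptions as the UZP1 example above (an editorial illustration, not part of the ARM reference):</p>
+<pre class="codeblock">// Minimal sketch of UZP2 semantics via the vuzp2_s8 intrinsic.
+// Assumes an aarch64 target.
+use core::arch::aarch64::{vld1_s8, vst1_s8, vuzp2_s8};
+
+fn main() {
+    let a: [i8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    let b: [i8; 8] = [8, 9, 10, 11, 12, 13, 14, 15];
+    let mut out = [0i8; 8];
+    unsafe {
+        let r = vuzp2_s8(vld1_s8(a.as_ptr()), vld1_s8(b.as_ptr()));
+        vst1_s8(out.as_mut_ptr(), r);
+    }
+    // Odd-numbered lanes of a, then odd-numbered lanes of b:
+    assert_eq!(out, [1, 3, 5, 7, 9, 11, 13, 15]);
+}</pre>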
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_s16" type="checkbox"><label for="vuzp2_s16"><div>int16x4_t <b><b>vuzp2_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_s16" type="checkbox"><label for="vuzp2q_s16"><div>int16x8_t <b><b>vuzp2q_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_s32" type="checkbox"><label for="vuzp2_s32"><div>int32x2_t <b><b>vuzp2_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_s32" type="checkbox"><label for="vuzp2q_s32"><div>int32x4_t <b><b>vuzp2q_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_s64" type="checkbox"><label for="vuzp2q_s64"><div>int64x2_t <b><b>vuzp2q_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_u8" type="checkbox"><label for="vuzp2_u8"><div>uint8x8_t <b><b>vuzp2_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_u8" type="checkbox"><label for="vuzp2q_u8"><div>uint8x16_t <b><b>vuzp2q_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_u16" type="checkbox"><label for="vuzp2_u16"><div>uint16x4_t <b><b>vuzp2_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_u16" type="checkbox"><label for="vuzp2q_u16"><div>uint16x8_t <b><b>vuzp2q_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_u32" type="checkbox"><label for="vuzp2_u32"><div>uint32x2_t <b><b>vuzp2_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_u32" type="checkbox"><label for="vuzp2q_u32"><div>uint32x4_t <b><b>vuzp2q_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_u64" type="checkbox"><label for="vuzp2q_u64"><div>uint64x2_t <b><b>vuzp2q_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_p64" type="checkbox"><label for="vuzp2q_p64"><div>poly64x2_t <b><b>vuzp2q_p64</b></b> (poly64x2_t a, poly64x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_f32" type="checkbox"><label for="vuzp2_f32"><div>float32x2_t <b><b>vuzp2_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_f32" type="checkbox"><label for="vuzp2q_f32"><div>float32x4_t <b><b>vuzp2q_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_f64" type="checkbox"><label for="vuzp2q_f64"><div>float64x2_t <b><b>vuzp2q_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_p8" type="checkbox"><label for="vuzp2_p8"><div>poly8x8_t <b><b>vuzp2_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_p8" type="checkbox"><label for="vuzp2q_p8"><div>poly8x16_t <b><b>vuzp2q_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2_p16" type="checkbox"><label for="vuzp2_p16"><div>poly16x4_t <b><b>vuzp2_p16</b></b> (poly16x4_t a, poly16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vuzp2q_p16" type="checkbox"><label for="vuzp2q_p16"><div>poly16x8_t <b><b>vuzp2q_p16</b></b> (poly16x8_t a, poly16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places the result from the first source register into consecutive elements in the lower half of a vector, and the result from the second source register into consecutive elements in the upper half of a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
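+<!-- Editor's note, not part of the vendored ARM reference: the UZP2 entries
+above all share the semantics given in their Description sections, so one
+illustrative sketch covers the family. This is a minimal Rust example using
+the corresponding intrinsics this crate exposes under core::arch::aarch64;
+it assumes an aarch64 target (where NEON is enabled by default), and the
+function name uzp2_demo is invented purely for illustration.
+
+    #[cfg(target_arch = "aarch64")]
+    fn uzp2_demo() {
+        use core::arch::aarch64::*;
+        unsafe {
+            // a = [0, 1, 2, 3], b = [4, 5, 6, 7]
+            let a = vld1q_s32([0i32, 1, 2, 3].as_ptr());
+            let b = vld1q_s32([4i32, 5, 6, 7].as_ptr());
+            // UZP2 gathers the odd-numbered lanes of the concatenation b:a,
+            // i.e. lanes 1 and 3 of each source vector.
+            let r = vuzp2q_s32(a, b);
+            let mut out = [0i32; 4];
+            vst1q_s32(out.as_mut_ptr(), r);
+            assert_eq!(out, [1, 3, 5, 7]);
+        }
+    }
+-->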
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_s8" type="checkbox"><label for="vtrn1_s8"><div>int8x8_t <b><b>vtrn1_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
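+<!-- Editor's note, not part of the vendored ARM reference: a companion sketch
+for the TRN1 family described above, again using the matching intrinsic from
+core::arch::aarch64. The function name trn1_demo is invented for illustration
+and an aarch64 target with NEON is assumed.
+
+    #[cfg(target_arch = "aarch64")]
+    fn trn1_demo() {
+        use core::arch::aarch64::*;
+        unsafe {
+            let a = vld1q_s32([0i32, 1, 2, 3].as_ptr());
+            let b = vld1q_s32([4i32, 5, 6, 7].as_ptr());
+            // TRN1 reads the even-numbered lanes: result lane 2p comes from
+            // a[2p] and lane 2p+1 from b[2p], giving [0, 4, 2, 6] here.
+            let r = vtrn1q_s32(a, b);
+            let mut out = [0i32; 4];
+            vst1q_s32(out.as_mut_ptr(), r);
+            assert_eq!(out, [0, 4, 2, 6]);
+        }
+    }
+-->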
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_s8" type="checkbox"><label for="vtrn1q_s8"><div>int8x16_t <b><b>vtrn1q_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
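+ <h4>Example</h4> <p>A minimal usage sketch, not part of the ARM reference (assumes a C compiler targeting AArch64 and the standard &lt;arm_neon.h&gt; header; array names are illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t ad[16], bd[16], rd[16];
+    for (int i = 0; i &lt; 16; i++) { ad[i] = (int8_t)i; bd[i] = (int8_t)(16 + i); }
+    int8x16_t a = vld1q_s8(ad);
+    int8x16_t b = vld1q_s8(bd);
+    /* Even-numbered lanes of a land in even result lanes;
+       even-numbered lanes of b land in odd result lanes. */
+    int8x16_t r = vtrn1q_s8(a, b);
+    vst1q_s8(rd, r);
+    for (int i = 0; i &lt; 16; i++) printf("%d ", rd[i]); /* prints: 0 16 2 18 ... 14 30 */
+    printf("\n");
+    return 0;
+}</pre>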
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_s16" type="checkbox"><label for="vtrn1_s16"><div>int16x4_t <b>vtrn1_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_s16" type="checkbox"><label for="vtrn1q_s16"><div>int16x8_t <b>vtrn1q_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_s32" type="checkbox"><label for="vtrn1_s32"><div>int32x2_t <b>vtrn1_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_s32" type="checkbox"><label for="vtrn1q_s32"><div>int32x4_t <b>vtrn1q_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_s64" type="checkbox"><label for="vtrn1q_s64"><div>int64x2_t <b>vtrn1q_s64</b> (int64x2_t a, int64x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
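+ <h4>Example</h4> <p>A minimal sketch, not part of the ARM reference: with 64-bit elements the vector holds only one pair, so TRN1 reduces to taking the low lane of each source (the function name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Returns { a[0], b[0] }: pairs == 1 for the 2D arrangement, so only the
+   even-numbered (index 0) lane of each source is selected. */
+int64x2_t low_lanes(int64x2_t a, int64x2_t b) {
+    return vtrn1q_s64(a, b);
+}</pre>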
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_u8" type="checkbox"><label for="vtrn1_u8"><div>uint8x8_t <b>vtrn1_u8</b> (uint8x8_t a, uint8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_u8" type="checkbox"><label for="vtrn1q_u8"><div>uint8x16_t <b>vtrn1q_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_u16" type="checkbox"><label for="vtrn1_u16"><div>uint16x4_t <b>vtrn1_u16</b> (uint16x4_t a, uint16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_u16" type="checkbox"><label for="vtrn1q_u16"><div>uint16x8_t <b>vtrn1q_u16</b> (uint16x8_t a, uint16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_u32" type="checkbox"><label for="vtrn1_u32"><div>uint32x2_t <b>vtrn1_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_u32" type="checkbox"><label for="vtrn1q_u32"><div>uint32x4_t <b>vtrn1q_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_u64" type="checkbox"><label for="vtrn1q_u64"><div>uint64x2_t <b>vtrn1q_u64</b> (uint64x2_t a, uint64x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_p64" type="checkbox"><label for="vtrn1q_p64"><div>poly64x2_t <b>vtrn1q_p64</b> (poly64x2_t a, poly64x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_f32" type="checkbox"><label for="vtrn1_f32"><div>float32x2_t <b>vtrn1_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_f32" type="checkbox"><label for="vtrn1q_f32"><div>float32x4_t <b>vtrn1q_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
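+ <h4>Example</h4> <p>A minimal sketch, not part of the ARM reference: combined with its companion <b>vtrn2q_f32</b> (which selects the odd-numbered elements), this transposes the 2x2 blocks of two rows held in registers (the function name is illustrative):</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* row0 = {a,b,c,d}, row1 = {e,f,g,h}  -&gt;  out0 = {a,e,c,g}, out1 = {b,f,d,h} */
+void trn_2x2_blocks(float32x4_t row0, float32x4_t row1,
+                    float32x4_t *out0, float32x4_t *out1) {
+    *out0 = vtrn1q_f32(row0, row1); /* even-numbered lanes of each source */
+    *out1 = vtrn2q_f32(row0, row1); /* odd-numbered lanes of each source  */
+}</pre>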
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_f64" type="checkbox"><label for="vtrn1q_f64"><div>float64x2_t <b>vtrn1q_f64</b> (float64x2_t a, float64x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
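+ <h4>Example</h4> <p>A minimal usage sketch (an editorial addition, not part of the ARM reference). For the 2D arrangement, TRN1 keeps the even-numbered (index 0) element of each source, so the result is {a[0], b[0]}; pairing it with vtrn2q_f64 on the same inputs yields a full 2x2 matrix transpose.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    double da[2] = {1.0, 2.0}, db[2] = {3.0, 4.0};
+    float64x2_t a = vld1q_f64(da);
+    float64x2_t b = vld1q_f64(db);
+    float64x2_t r = vtrn1q_f64(a, b);   /* even elements: {a[0], b[0]} */
+    double out[2];
+    vst1q_f64(out, r);
+    printf("%f %f\n", out[0], out[1]);  /* prints 1.000000 3.000000 */
+    return 0;
+}</pre>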
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_p8" type="checkbox"><label for="vtrn1_p8"><div>poly8x8_t <b><b>vtrn1_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_p8" type="checkbox"><label for="vtrn1q_p8"><div>poly8x16_t <b><b>vtrn1q_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1_p16" type="checkbox"><label for="vtrn1_p16"><div>poly16x4_t <b><b>vtrn1_p16</b></b> (poly16x4_t a, poly16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn1q_p16" type="checkbox"><label for="vtrn1q_p16"><div>poly16x8_t <b><b>vtrn1q_p16</b></b> (poly16x8_t a, poly16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (primary). This instruction reads corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, starting at zero, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_s8" type="checkbox"><label for="vtrn2_s8"><div>int8x8_t <b><b>vtrn2_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
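+ <h4>Example</h4> <p>A minimal usage sketch (an editorial addition, not part of the ARM reference). TRN2 keeps the odd-numbered element of each source pair, interleaving them as {a[1], b[1], a[3], b[3], ...}.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    int8_t da[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int8_t db[8] = {10, 11, 12, 13, 14, 15, 16, 17};
+    int8x8_t r = vtrn2_s8(vld1_s8(da), vld1_s8(db));
+    int8_t out[8];
+    vst1_s8(out, r);
+    for (int i = 0; i &lt; 8; i++)
+        printf("%d ", out[i]);          /* prints 1 11 3 13 5 15 7 17 */
+    printf("\n");
+    return 0;
+}</pre>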
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_s8" type="checkbox"><label for="vtrn2q_s8"><div>int8x16_t <b><b>vtrn2q_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_s16" type="checkbox"><label for="vtrn2_s16"><div>int16x4_t <b><b>vtrn2_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_s16" type="checkbox"><label for="vtrn2q_s16"><div>int16x8_t <b><b>vtrn2q_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_s32" type="checkbox"><label for="vtrn2_s32"><div>int32x2_t <b><b>vtrn2_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_s32" type="checkbox"><label for="vtrn2q_s32"><div>int32x4_t <b><b>vtrn2q_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_s64" type="checkbox"><label for="vtrn2q_s64"><div>int64x2_t <b><b>vtrn2q_s64</b></b> (int64x2_t a, int64x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_u8" type="checkbox"><label for="vtrn2_u8"><div>uint8x8_t <b><b>vtrn2_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_u8" type="checkbox"><label for="vtrn2q_u8"><div>uint8x16_t <b><b>vtrn2q_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_u16" type="checkbox"><label for="vtrn2_u16"><div>uint16x4_t <b><b>vtrn2_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_u16" type="checkbox"><label for="vtrn2q_u16"><div>uint16x8_t <b><b>vtrn2q_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
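+ <h4>Example</h4> <p>The Operation pseudocode above relies on part, pairs, and esize, which are bound in the surrounding ARM pseudocode rather than in this excerpt. The following scalar C model (an editorial addition, not part of the ARM reference) restates the loop under the assumption that part is 0 for TRN1 and 1 for TRN2, and that pairs is the element count divided by two.</p>
+<pre class="codeblock">/* Scalar model of the shared TRN1/TRN2 loop (assumptions noted above). */
+void trn_model(const int *op1, const int *op2, int *result,
+               int elements, int part) {
+    int pairs = elements / 2;
+    for (int p = 0; p &lt; pairs; p++) {
+        result[2*p + 0] = op1[2*p + part];  /* from first source  */
+        result[2*p + 1] = op2[2*p + part];  /* from second source */
+    }
+}</pre>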
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_u32" type="checkbox"><label for="vtrn2_u32"><div>uint32x2_t <b><b>vtrn2_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
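+ <h4>Example (non-normative)</h4> <p class="aml">A minimal C sketch, added here for illustration only, of the lane shuffle the Operation above describes; the function name demo_vtrn2_u32 is hypothetical, and the intrinsic needs an AArch64 target with arm_neon.h:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical demo: vtrn2_u32 keeps the odd-numbered lane of each source. */
+uint32x2_t demo_vtrn2_u32(void) {
+    const uint32_t av[2] = {10, 11};
+    const uint32_t bv[2] = {20, 21};
+    uint32x2_t a = vld1_u32(av);   /* lanes {10, 11} */
+    uint32x2_t b = vld1_u32(bv);   /* lanes {20, 21} */
+    /* one pair (p = 0): result[0] = a[1] = 11, result[1] = b[1] = 21 */
+    return vtrn2_u32(a, b);        /* lanes {11, 21} */
+}</pre>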
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_u32" type="checkbox"><label for="vtrn2q_u32"><div>uint32x4_t <b><b>vtrn2q_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_u64" type="checkbox"><label for="vtrn2q_u64"><div>uint64x2_t <b><b>vtrn2q_u64</b></b> (uint64x2_t a, uint64x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_p64" type="checkbox"><label for="vtrn2q_p64"><div>poly64x2_t <b><b>vtrn2q_p64</b></b> (poly64x2_t a, poly64x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_f32" type="checkbox"><label for="vtrn2_f32"><div>float32x2_t <b><b>vtrn2_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_f32" type="checkbox"><label for="vtrn2q_f32"><div>float32x4_t <b><b>vtrn2q_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
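+ <h4>Example (non-normative)</h4> <p class="aml">With four lanes the Operation loop above runs over two pairs, so vtrn2q_f32 gathers the odd lane of every pair from both sources. A minimal C sketch, assuming an AArch64 target; the demo function name is hypothetical:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical demo: for pairs p = 0,1, result[2p] = a[2p+1] and result[2p+1] = b[2p+1]. */
+float32x4_t demo_vtrn2q_f32(void) {
+    const float av[4] = {0.0f, 1.0f, 2.0f, 3.0f};
+    const float bv[4] = {10.0f, 11.0f, 12.0f, 13.0f};
+    float32x4_t a = vld1q_f32(av);
+    float32x4_t b = vld1q_f32(bv);
+    return vtrn2q_f32(a, b);   /* lanes {1.0, 11.0, 3.0, 13.0} */
+}</pre>
+<p class="aml">Paired with the matching vtrn1q_f32 call on the same inputs, this can be used to transpose each 2x2 block of elements across the two vectors.</p>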
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_f64" type="checkbox"><label for="vtrn2q_f64"><div>float64x2_t <b><b>vtrn2q_f64</b></b> (float64x2_t a, float64x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.2D,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_p8" type="checkbox"><label for="vtrn2_p8"><div>poly8x8_t <b><b>vtrn2_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_p8" type="checkbox"><label for="vtrn2q_p8"><div>poly8x16_t <b><b>vtrn2q_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2_p16" type="checkbox"><label for="vtrn2_p16"><div>poly16x4_t <b><b>vtrn2_p16</b></b> (poly16x4_t a, poly16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn2q_p16" type="checkbox"><label for="vtrn2q_p16"><div>poly16x8_t <b><b>vtrn2q_p16</b></b> (poly16x8_t a, poly16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtbl1_s8" type="checkbox"><label for="vtbl1_s8"><div>int8x8_t <b><b>vtbl1_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; Zeros(64):a <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
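+ <h4>Example (non-normative)</h4> <p class="aml">A minimal C sketch of the lookup the Operation above describes; the demo function name is hypothetical. Each lane of the index vector selects a byte of the table, and any index outside the 8-byte table yields 0:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical demo: each lane of b indexes into the bytes of a. */
+int8x8_t demo_vtbl1_s8(void) {
+    const int8_t table[8] = {7, 6, 5, 4, 3, 2, 1, 0};
+    const int8_t idx[8]   = {0, 1, 2, 3, 7, 8, 100, -1};
+    int8x8_t a = vld1_s8(table);
+    int8x8_t b = vld1_s8(idx);
+    /* -1 is read as the unsigned byte 255, so it is out of range like 8 and 100. */
+    return vtbl1_s8(a, b);   /* lanes {7, 6, 5, 4, 0, 0, 0, 0} */
+}</pre>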
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl1_u8" type="checkbox"><label for="vtbl1_u8"><div>uint8x8_t <b><b>vtbl1_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; Zeros(64):a <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl1_p8" type="checkbox"><label for="vtbl1_p8"><div>poly8x8_t <b><b>vtbl1_p8</b></b> (poly8x8_t a, uint8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; Zeros(64):a <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx1_s8" type="checkbox"><label for="vtbx1_s8"><div>int8x8_t <b><b>vtbx1_s8</b></b> (int8x8_t a, int8x8_t b, int8x8_t c)<span class="right">Bitwise insert if false</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Insert if False. This instruction inserts each bit from the first source SIMD&amp;FP register into the destination SIMD&amp;FP register if the corresponding bit of the second source SIMD&amp;FP register is 0, otherwise leaves the bit in the destination register unchanged.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/movi-move-immediate-vector">MOVI</a> Vtmp.8B,#8
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vtmp.8B,Vm.8B,Vtmp.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vtmp1.8B,{Vn.16B},Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bif-bitwise-insert-if-false">BIF</a> Vd.8B,Vtmp1.8B,Vtmp.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+Vn &rarr; Zeros(64):b <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+operand3 = NOT(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m]);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
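+ <h4>Example (non-normative)</h4> <p class="aml">The MOVI/CMHS/TBL/BIF sequence listed above implements a table lookup with extension: lanes of c that index into the 8-byte table b take the looked-up byte, while out-of-range lanes keep the corresponding byte of a. A minimal C sketch, with a hypothetical demo function name:</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical demo: in-range indices select from b; out-of-range lanes fall back to a. */
+int8x8_t demo_vtbx1_s8(void) {
+    const int8_t fallback[8] = {90, 91, 92, 93, 94, 95, 96, 97};
+    const int8_t table[8]    = {0, 10, 20, 30, 40, 50, 60, 70};
+    const int8_t idx[8]      = {1, 3, 8, 5, 100, 0, 7, -1};
+    int8x8_t a = vld1_s8(fallback);
+    int8x8_t b = vld1_s8(table);
+    int8x8_t c = vld1_s8(idx);
+    return vtbx1_s8(a, b, c);   /* lanes {10, 30, 92, 50, 94, 0, 70, 97} */
+}</pre>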
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx1_u8" type="checkbox"><label for="vtbx1_u8"><div>uint8x8_t <b><b>vtbx1_u8</b></b> (uint8x8_t a, uint8x8_t b, uint8x8_t c)<span class="right">Bitwise insert if false</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Insert if False. This instruction inserts each bit from the first source SIMD&amp;FP register into the destination SIMD&amp;FP register if the corresponding bit of the second source SIMD&amp;FP register is 0, otherwise leaves the bit in the destination register unchanged.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/movi-move-immediate-vector">MOVI</a> Vtmp.8B,#8
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vtmp.8B,Vm.8B,Vtmp.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vtmp1.8B,{Vn.16B},Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bif-bitwise-insert-if-false">BIF</a> Vd.8B,Vtmp1.8B,Vtmp.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+Vn &rarr; Zeros(64):b <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+operand3 = NOT(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m]);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx1_p8" type="checkbox"><label for="vtbx1_p8"><div>poly8x8_t <b><b>vtbx1_p8</b></b> (poly8x8_t a, poly8x8_t b, uint8x8_t c)<span class="right">Bitwise insert if false</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Insert if False. This instruction inserts each bit from the first source SIMD&amp;FP register into the destination SIMD&amp;FP register if the corresponding bit of the second source SIMD&amp;FP register is 0, otherwise leaves the bit in the destination register unchanged.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/movi-move-immediate-vector">MOVI</a> Vtmp.8B,#8
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vtmp.8B,Vm.8B,Vtmp.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vtmp1.8B,{Vn.16B},Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bif-bitwise-insert-if-false">BIF</a> Vd.8B,Vtmp1.8B, Vtmp.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+Vn &rarr; Zeros(64):b <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+operand3 = NOT(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m]);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl2_s8" type="checkbox"><label for="vtbl2_s8"><div>int8x8_t <b><b>vtbl2_s8</b></b> (int8x8x2_t a, int8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl2_u8" type="checkbox"><label for="vtbl2_u8"><div>uint8x8_t <b><b>vtbl2_u8</b></b> (uint8x8x2_t a, uint8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl2_p8" type="checkbox"><label for="vtbl2_p8"><div>poly8x8_t <b><b>vtbl2_p8</b></b> (poly8x8x2_t a, uint8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl3_s8" type="checkbox"><label for="vtbl3_s8"><div>int8x8_t <b><b>vtbl3_s8</b></b> (int8x8x3_t a, int8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+Vn+1 &rarr; Zeros(64):a.val[2] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl3_u8" type="checkbox"><label for="vtbl3_u8"><div>uint8x8_t <b><b>vtbl3_u8</b></b> (uint8x8x3_t a, uint8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+Vn+1 &rarr; Zeros(64):a.val[2] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl3_p8" type="checkbox"><label for="vtbl3_p8"><div>poly8x8_t <b><b>vtbl3_p8</b></b> (poly8x8x3_t a, uint8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+Vn+1 &rarr; Zeros(64):a.val[2] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl4_s8" type="checkbox"><label for="vtbl4_s8"><div>int8x8_t <b><b>vtbl4_s8</b></b> (int8x8x4_t a, int8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+Vn+1 &rarr; a.val[3]:a.val[2] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl4_u8" type="checkbox"><label for="vtbl4_u8"><div>uint8x8_t <b><b>vtbl4_u8</b></b> (uint8x8x4_t a, uint8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+Vn+1 &rarr; a.val[3]:a.val[2] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbl4_p8" type="checkbox"><label for="vtbl4_p8"><div>poly8x8_t <b><b>vtbl4_p8</b></b> (poly8x8x4_t a, uint8x8_t b)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>Vn &rarr; a.val[1]:a.val[0] <br />
+Vn+1 &rarr; a.val[3]:a.val[2] <br />
+b &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx2_s8" type="checkbox"><label for="vtbx2_s8"><div>int8x8_t <b><b>vtbx2_s8</b></b> (int8x8_t a, int8x8x2_t b, int8x8_t c)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+Vn &rarr; b.val[1]:b.val[0] <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx2_u8" type="checkbox"><label for="vtbx2_u8"><div>uint8x8_t <b><b>vtbx2_u8</b></b> (uint8x8_t a, uint8x8x2_t b, uint8x8_t c)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+Vn &rarr; b.val[1]:b.val[0] <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx2_p8" type="checkbox"><label for="vtbx2_p8"><div>poly8x8_t <b><b>vtbx2_p8</b></b> (poly8x8_t a, poly8x8x2_t b, uint8x8_t c)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+Vn &rarr; b.val[1]:b.val[0] <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx3_s8" type="checkbox"><label for="vtbx3_s8"><div>int8x8_t <b><b>vtbx3_s8</b></b> (int8x8_t a, int8x8x3_t b, int8x8_t c)<span class="right">Bitwise insert if false</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Insert if False. This instruction inserts each bit from the first source SIMD&amp;FP register into the destination SIMD&amp;FP register if the corresponding bit of the second source SIMD&amp;FP register is 0, otherwise leaves the bit in the destination register unchanged.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/movi-move-immediate-vector">MOVI</a> Vtmp.8B,#24
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vtmp.8B,Vm.8B,Vtmp.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vtmp1.8B,{Vn.16B,Vn+1.16B},Vm.8
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bif-bitwise-insert-if-false">BIF</a> Vd.8B,Vtmp1.8B,Vtmp.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+b.val[1]:b.val[0] &rarr; Vn <br />
+Zeros(64):b.val[2] &rarr; Vn+1 <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+operand3 = NOT(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m]);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx3_u8" type="checkbox"><label for="vtbx3_u8"><div>uint8x8_t <b><b>vtbx3_u8</b></b> (uint8x8_t a, uint8x8x3_t b, uint8x8_t c)<span class="right">Bitwise insert if false</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Insert if False. This instruction inserts each bit from the first source SIMD&amp;FP register into the destination SIMD&amp;FP register if the corresponding bit of the second source SIMD&amp;FP register is 0, otherwise leaves the bit in the destination register unchanged.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/movi-move-immediate-vector">MOVI</a> Vtmp.8B,#24
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vtmp.8B,Vm.8B,Vtmp.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vtmp1.8B,{Vn.16B,Vn+1.16B},Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bif-bitwise-insert-if-false">BIF</a> Vd.8B,Vtmp1.8B,Vtmp.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+b.val[1]:b.val[0] &rarr; Vn <br />
+Zeros(64):b.val[2] &rarr; Vn+1 <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+operand3 = NOT(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m]);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx3_p8" type="checkbox"><label for="vtbx3_p8"><div>poly8x8_t <b><b>vtbx3_p8</b></b> (poly8x8_t a, poly8x8x3_t b, uint8x8_t c)<span class="right">Bitwise insert if false</span></div></label><article> <h4>Description</h4><p><p class="aml">Bitwise Insert if False. This instruction inserts each bit from the first source SIMD&amp;FP register into the destination SIMD&amp;FP register if the corresponding bit of the second source SIMD&amp;FP register is 0, otherwise leaves the bit in the destination register unchanged.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/movi-move-immediate-vector">MOVI</a> Vtmp.8B,#24
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/cmhs-register-compare-unsigned-higher-or-same-vector">CMHS</a> Vtmp.8B,Vm.8B,Vtmp.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vtmp1.8B,{Vn.16B,Vn+1.16B},Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/bif-bitwise-insert-if-false">BIF</a> Vd.8B,Vtmp1.8B,Vtmp.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+b.val[1]:b.val[0] &rarr; Vn <br />
+Zeros(64):b.val[2] &rarr; Vn+1 <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1;
+bits(datasize) operand3;
+bits(datasize) operand4 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+operand3 = NOT(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m]);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = operand1 EOR ((operand1 EOR operand4) AND operand3);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx4_s8" type="checkbox"><label for="vtbx4_s8"><div>int8x8_t <b><b>vtbx4_s8</b></b> (int8x8_t a, int8x8x4_t b, int8x8_t c)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+b.val[1]:b.val[0] &rarr; Vn <br />
+b.val[3]:b.val[2] &rarr; Vn+1 <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx4_u8" type="checkbox"><label for="vtbx4_u8"><div>uint8x8_t <b><b>vtbx4_u8</b></b> (uint8x8_t a, uint8x8x4_t b, uint8x8_t c)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+b.val[1]:b.val[0] &rarr; Vn <br />
+b.val[3]:b.val[2] &rarr; Vn+1 <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtbx4_p8" type="checkbox"><label for="vtbx4_p8"><div>poly8x8_t <b><b>vtbx4_p8</b></b> (poly8x8_t a, poly8x8x4_t b, uint8x8_t c)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B,Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd <br />
+b.val[1]:b.val[0] &rarr; Vn <br />
+b.val[3]:b.val[2] &rarr; Vn+1 <br />
+c &rarr; Vm </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vqtbl1_s8" type="checkbox"><label for="vqtbl1_s8"><div>int8x8_t <b><b>vqtbl1_s8</b></b> (int8x16_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t &rarr; Vn.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl1q_s8" type="checkbox"><label for="vqtbl1q_s8"><div>int8x16_t <b><b>vqtbl1q_s8</b></b> (int8x16_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t &rarr; Vn.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl1_u8" type="checkbox"><label for="vqtbl1_u8"><div>uint8x8_t <b><b>vqtbl1_u8</b></b> (uint8x16_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t &rarr; Vn.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl1q_u8" type="checkbox"><label for="vqtbl1q_u8"><div>uint8x16_t <b><b>vqtbl1q_u8</b></b> (uint8x16_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t &rarr; Vn.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl1_p8" type="checkbox"><label for="vqtbl1_p8"><div>poly8x8_t <b><b>vqtbl1_p8</b></b> (poly8x16_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t &rarr; Vn.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl1q_p8" type="checkbox"><label for="vqtbl1q_p8"><div>poly8x16_t <b><b>vqtbl1q_p8</b></b> (poly8x16_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector Lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t &rarr; Vn.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx1_s8" type="checkbox"><label for="vqtbx1_s8"><div>int8x8_t <b><b>vqtbx1_s8</b></b> (int8x8_t a, int8x16_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t &rarr; Vn.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx1q_s8" type="checkbox"><label for="vqtbx1q_s8"><div>int8x16_t <b>vqtbx1q_s8</b> (int8x16_t a, int8x16_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t &rarr; Vn.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx1_u8" type="checkbox"><label for="vqtbx1_u8"><div>uint8x8_t <b>vqtbx1_u8</b> (uint8x8_t a, uint8x16_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t &rarr; Vn.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx1q_u8" type="checkbox"><label for="vqtbx1q_u8"><div>uint8x16_t <b>vqtbx1q_u8</b> (uint8x16_t a, uint8x16_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t &rarr; Vn.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx1_p8" type="checkbox"><label for="vqtbx1_p8"><div>poly8x8_t <b>vqtbx1_p8</b> (poly8x8_t a, poly8x16_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t &rarr; Vn.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx1q_p8" type="checkbox"><label for="vqtbx1q_p8"><div>poly8x16_t <b>vqtbx1q_p8</b> (poly8x16_t a, poly8x16_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t &rarr; Vn.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl2_s8" type="checkbox"><label for="vqtbl2_s8"><div>int8x8_t <b>vqtbl2_s8</b> (int8x16x2_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl2q_s8" type="checkbox"><label for="vqtbl2q_s8"><div>int8x16_t <b>vqtbl2q_s8</b> (int8x16x2_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl2_u8" type="checkbox"><label for="vqtbl2_u8"><div>uint8x8_t <b>vqtbl2_u8</b> (uint8x16x2_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl2q_u8" type="checkbox"><label for="vqtbl2q_u8"><div>uint8x16_t <b>vqtbl2q_u8</b> (uint8x16x2_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl2_p8" type="checkbox"><label for="vqtbl2_p8"><div>poly8x8_t <b>vqtbl2_p8</b> (poly8x16x2_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl2q_p8" type="checkbox"><label for="vqtbl2q_p8"><div>poly8x16_t <b>vqtbl2q_p8</b> (poly8x16x2_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl3_s8" type="checkbox"><label for="vqtbl3_s8"><div>int8x8_t <b>vqtbl3_s8</b> (int8x16x3_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl3q_s8" type="checkbox"><label for="vqtbl3q_s8"><div>int8x16_t <b>vqtbl3q_s8</b> (int8x16x3_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl3_u8" type="checkbox"><label for="vqtbl3_u8"><div>uint8x8_t <b>vqtbl3_u8</b> (uint8x16x3_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl3q_u8" type="checkbox"><label for="vqtbl3q_u8"><div>uint8x16_t <b>vqtbl3q_u8</b> (uint8x16x3_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl3_p8" type="checkbox"><label for="vqtbl3_p8"><div>poly8x8_t <b>vqtbl3_p8</b> (poly8x16x3_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl3q_p8" type="checkbox"><label for="vqtbl3q_p8"><div>poly8x16_t <b>vqtbl3q_p8</b> (poly8x16x3_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl4_s8" type="checkbox"><label for="vqtbl4_s8"><div>int8x8_t <b>vqtbl4_s8</b> (int8x16x4_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl4q_s8" type="checkbox"><label for="vqtbl4q_s8"><div>int8x16_t <b>vqtbl4q_s8</b> (int8x16x4_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl4_u8" type="checkbox"><label for="vqtbl4_u8"><div>uint8x8_t <b>vqtbl4_u8</b> (uint8x16x4_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl4q_u8" type="checkbox"><label for="vqtbl4q_u8"><div>uint8x16_t <b>vqtbl4q_u8</b> (uint8x16x4_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl4_p8" type="checkbox"><label for="vqtbl4_p8"><div>poly8x8_t <b>vqtbl4_p8</b> (poly8x16x4_t t, uint8x8_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbl4q_p8" type="checkbox"><label for="vqtbl4q_p8"><div>poly8x16_t <b>vqtbl4q_p8</b> (poly8x16x4_t t, uint8x16_t idx)<span class="right">Table vector lookup</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the result for that lookup is 0. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbl-table-vector-lookup">TBL</a> Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx2_s8" type="checkbox"><label for="vqtbx2_s8"><div>int8x8_t <b>vqtbx2_s8</b> (int8x8_t a, int8x16x2_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx2q_s8" type="checkbox"><label for="vqtbx2q_s8"><div>int8x16_t <b>vqtbx2q_s8</b> (int8x16_t a, int8x16x2_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx2_u8" type="checkbox"><label for="vqtbx2_u8"><div>uint8x8_t <b>vqtbx2_u8</b> (uint8x8_t a, uint8x16x2_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx2q_u8" type="checkbox"><label for="vqtbx2q_u8"><div>uint8x16_t <b>vqtbx2q_u8</b> (uint8x16_t a, uint8x16x2_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx2_p8" type="checkbox"><label for="vqtbx2_p8"><div>poly8x8_t <b>vqtbx2_p8</b> (poly8x8_t a, poly8x16x2_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx2q_p8" type="checkbox"><label for="vqtbx2q_p8"><div>poly8x16_t <b>vqtbx2q_p8</b> (poly8x16_t a, poly8x16x2_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx3_s8" type="checkbox"><label for="vqtbx3_s8"><div>int8x8_t <b>vqtbx3_s8</b> (int8x8_t a, int8x16x3_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx3q_s8" type="checkbox"><label for="vqtbx3q_s8"><div>int8x16_t <b>vqtbx3q_s8</b> (int8x16_t a, int8x16x3_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx3_u8" type="checkbox"><label for="vqtbx3_u8"><div>uint8x8_t <b>vqtbx3_u8</b> (uint8x8_t a, uint8x16x3_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx3q_u8" type="checkbox"><label for="vqtbx3q_u8"><div>uint8x16_t <b>vqtbx3q_u8</b> (uint8x16_t a, uint8x16x3_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx3_p8" type="checkbox"><label for="vqtbx3_p8"><div>poly8x8_t <b>vqtbx3_p8</b> (poly8x8_t a, poly8x16x3_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx3q_p8" type="checkbox"><label for="vqtbx3q_p8"><div>poly8x16_t <b>vqtbx3q_p8</b> (poly8x16_t a, poly8x16x3_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx4_s8" type="checkbox"><label for="vqtbx4_s8"><div>int8x8_t <b>vqtbx4_s8</b> (int8x8_t a, int8x16x4_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx4q_s8" type="checkbox"><label for="vqtbx4q_s8"><div>int8x16_t <b>vqtbx4q_s8</b> (int8x16_t a, int8x16x4_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+ <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
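+ <h4>Usage Example</h4> <p>A minimal C sketch of a typical call (illustrative only; the function name is hypothetical and not part of the Arm reference):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Four 16-byte table registers give a 64-byte table, so indices
+   0..63 are in range; any larger index leaves the corresponding
+   byte of 'a' unchanged, as described above. */
+int8x16_t lookup_with_fallback(int8x16_t a, int8x16x4_t t, uint8x16_t idx)
+{
+    return vqtbx4q_s8(a, t, idx);
+}</pre>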
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx4_u8" type="checkbox"><label for="vqtbx4_u8"><div>uint8x8_t <b><b>vqtbx4_u8</b></b> (uint8x8_t a, uint8x16x4_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx4q_u8" type="checkbox"><label for="vqtbx4q_u8"><div>uint8x16_t <b><b>vqtbx4q_u8</b></b> (uint8x16_t a, uint8x16x4_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx4_p8" type="checkbox"><label for="vqtbx4_p8"><div>poly8x8_t <b><b>vqtbx4_p8</b></b> (poly8x8_t a, poly8x16x4_t t, uint8x8_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vqtbx4q_p8" type="checkbox"><label for="vqtbx4q_p8"><div>poly8x16_t <b><b>vqtbx4q_p8</b></b> (poly8x16_t a, poly8x16x4_t t, uint8x16_t idx)<span class="right">Table vector lookup extension</span></div></label><article> <h4>Description</h4><p><p class="aml">Table vector lookup extension. This instruction reads each value from the vector elements in the index source SIMD&amp;FP register, uses each result as an index to perform a lookup in a table of bytes that is described by one to four source table SIMD&amp;FP registers, places the lookup result in a vector, and writes the vector to the destination SIMD&amp;FP register. If an index is out of range for the table, the existing value in the vector element of the destination register is left unchanged. If more than one source register is used to describe the table, the first source register describes the lowest bytes of the table.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/tbx-table-vector-lookup-extension">TBX</a> Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B <br />
+t.val[0] &rarr; Vn.16B <br />
+t.val[1] &rarr; Vn+1.16B <br />
+t.val[2] &rarr; Vn+2.16B <br />
+t.val[3] &rarr; Vn+3.16B <br />
+idx &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) indices = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128*regs) table = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>();
+bits(datasize) result;
+integer index;
+
+// Create table from registers
+for i = 0 to regs-1
+ table&lt;128*i+127:128*i&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+ n = (n + 1) MOD 32;
+
+result = if is_tbl then <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.0" title="function: bits(N) Zeros()">Zeros</a>() else <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+for i = 0 to elements-1
+ index = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.UInt.1" title="function: integer UInt(bits(N) x)">UInt</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[indices, i, 8]);
+ if index &lt; 16 * regs then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, i, 8] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[table, index, 8];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_u8" type="checkbox"><label for="vget_lane_u8"><div>uint8_t <b><b>vget_lane_u8</b></b> (uint8x8_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
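+ <h4>Usage Example</h4> <p>A short C sketch, assuming arm_neon.h (the function name is illustrative); note that the lane argument must be a compile-time constant within the range shown under Argument Preparation:</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Read byte 3 of an 8-byte vector; per the description above this
+   is a zero-extending move (UMOV) into a general-purpose register. */
+uint8_t third_byte(uint8x8_t v)
+{
+    return vget_lane_u8(v, 3); /* valid lanes: 0 &lt;= lane &lt;= 7 */
+}</pre>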
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_u16" type="checkbox"><label for="vget_lane_u16"><div>uint16_t <b><b>vget_lane_u16</b></b> (uint16x4_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_u32" type="checkbox"><label for="vget_lane_u32"><div>uint32_t <b><b>vget_lane_u32</b></b> (uint32x2_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_u64" type="checkbox"><label for="vget_lane_u64"><div>uint64_t <b><b>vget_lane_u64</b></b> (uint64x1_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_p64" type="checkbox"><label for="vget_lane_p64"><div>poly64_t <b><b>vget_lane_p64</b></b> (poly64x1_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_s8" type="checkbox"><label for="vget_lane_s8"><div>int8_t <b><b>vget_lane_s8</b></b> (int8x8_t v, const int lane)<span class="right">Signed move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Move vector element to general-purpose register. This instruction reads the signed integer from the source SIMD&amp;FP register, sign-extends it to form a 32-bit or 64-bit value, and writes the result to destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smov-signed-move-vector-element-to-general-purpose-register">SMOV</a> Rd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignExtend.2" title="function: bits(N) SignExtend(bits(M) x, integer N)">SignExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
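+ <h4>Usage Example</h4> <p>An illustrative C sketch (hypothetical function name): because this maps to SMOV, the extracted byte is sign-extended, so negative values widen correctly:</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Widen lane 0 of a signed byte vector to int32_t; sign extension
+   is performed by the SMOV instruction itself. */
+int32_t widen_lane0(int8x8_t v)
+{
+    return (int32_t)vget_lane_s8(v, 0);
+}</pre>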
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_s16" type="checkbox"><label for="vget_lane_s16"><div>int16_t <b><b>vget_lane_s16</b></b> (int16x4_t v, const int lane)<span class="right">Signed move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Move vector element to general-purpose register. This instruction reads the signed integer from the source SIMD&amp;FP register, sign-extends it to form a 32-bit or 64-bit value, and writes the result to destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smov-signed-move-vector-element-to-general-purpose-register">SMOV</a> Rd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignExtend.2" title="function: bits(N) SignExtend(bits(M) x, integer N)">SignExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_s32" type="checkbox"><label for="vget_lane_s32"><div>int32_t <b><b>vget_lane_s32</b></b> (int32x2_t v, const int lane)<span class="right">Signed move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Move vector element to general-purpose register. This instruction reads the signed integer from the source SIMD&amp;FP register, sign-extends it to form a 32-bit or 64-bit value, and writes the result to destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smov-signed-move-vector-element-to-general-purpose-register">SMOV</a> Rd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignExtend.2" title="function: bits(N) SignExtend(bits(M) x, integer N)">SignExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_s64" type="checkbox"><label for="vget_lane_s64"><div>int64_t <b><b>vget_lane_s64</b></b> (int64x1_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
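+ <h4>Usage Example</h4> <p>Although this intrinsic is signed, the UMOV mapping above is expected: a 64-bit element already fills the destination register, so no sign extension is needed and the unsigned move is bit-exact. A minimal C sketch (hypothetical function name):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* A 1-element vector has only lane 0; the 64-bit move copies the
+   value unchanged, so signedness is irrelevant here. */
+int64_t as_scalar(int64x1_t v)
+{
+    return vget_lane_s64(v, 0);
+}</pre>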
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_p8" type="checkbox"><label for="vget_lane_p8"><div>poly8_t <b><b>vget_lane_p8</b></b> (poly8x8_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_p16" type="checkbox"><label for="vget_lane_p16"><div>poly16_t <b><b>vget_lane_p16</b></b> (poly16x4_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_f32" type="checkbox"><label for="vget_lane_f32"><div>float32_t <b><b>vget_lane_f32</b></b> (float32x2_t v, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Sd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
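+ <h4>Usage Example</h4> <p>Note that the listed form, DUP Sd,Vn.S[lane], copies one vector element into a scalar SIMD&amp;FP register rather than reading a general-purpose register. A brief C sketch (hypothetical function name):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Read lane 1 of a 2-lane float vector; after optimization the
+   value typically stays in a SIMD&amp;FP register. */
+float upper_lane(float32x2_t v)
+{
+    return vget_lane_f32(v, 1); /* valid lanes: 0 &lt;= lane &lt;= 1 */
+}</pre>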
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_f64" type="checkbox"><label for="vget_lane_f64"><div>float64_t <b><b>vget_lane_f64</b></b> (float64x1_t v, const int lane)<span class="right">Duplicate general-purpose register to vector</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate general-purpose register to vector. This instruction duplicates the contents of the source general-purpose register into a scalar or each element in a vector, and writes the result to the SIMD&amp;FP destination register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(datasize) result;
+
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_u8" type="checkbox"><label for="vgetq_lane_u8"><div>uint8_t <b><b>vgetq_lane_u8</b></b> (uint8x16_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
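+ <h4>Usage Example</h4> <p>The q-suffixed forms read from a 128-bit vector, doubling the lane range. A short C sketch (hypothetical function name):</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Extract the last byte of a 16-byte vector. */
+uint8_t last_byte(uint8x16_t v)
+{
+    return vgetq_lane_u8(v, 15); /* valid lanes: 0 &lt;= lane &lt;= 15 */
+}</pre>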
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_u16" type="checkbox"><label for="vgetq_lane_u16"><div>uint16_t <b><b>vgetq_lane_u16</b></b> (uint16x8_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_u32" type="checkbox"><label for="vgetq_lane_u32"><div>uint32_t <b><b>vgetq_lane_u32</b></b> (uint32x4_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_u64" type="checkbox"><label for="vgetq_lane_u64"><div>uint64_t <b><b>vgetq_lane_u64</b></b> (uint64x2_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_p64" type="checkbox"><label for="vgetq_lane_p64"><div>poly64_t <b><b>vgetq_lane_p64</b></b> (poly64x2_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_s8" type="checkbox"><label for="vgetq_lane_s8"><div>int8_t <b><b>vgetq_lane_s8</b></b> (int8x16_t v, const int lane)<span class="right">Signed move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Move vector element to general-purpose register. This instruction reads the signed integer from the source SIMD&amp;FP register, sign-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smov-signed-move-vector-element-to-general-purpose-register">SMOV</a> Rd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignExtend.2" title="function: bits(N) SignExtend(bits(M) x, integer N)">SignExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
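+ <h4>Usage sketch</h4> <p>A minimal C sketch of the intrinsic above (hypothetical wrapper). Because SMOV sign-extends, a negative byte lane is returned as a negative value rather than a small positive one.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical wrapper: read lane 0 of a 16-lane int8 vector. */
+int8_t get_lane0_s8(int8x16_t v)
+{
+    /* Expected to lower to SMOV Wd, Vn.B[0]; the byte is sign-extended. */
+    return vgetq_lane_s8(v, 0);
+}</pre>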
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_s16" type="checkbox"><label for="vgetq_lane_s16"><div>int16_t <b><b>vgetq_lane_s16</b></b> (int16x8_t v, const int lane)<span class="right">Signed move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Move vector element to general-purpose register. This instruction reads the signed integer from the source SIMD&amp;FP register, sign-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smov-signed-move-vector-element-to-general-purpose-register">SMOV</a> Rd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignExtend.2" title="function: bits(N) SignExtend(bits(M) x, integer N)">SignExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_s32" type="checkbox"><label for="vgetq_lane_s32"><div>int32_t <b><b>vgetq_lane_s32</b></b> (int32x4_t v, const int lane)<span class="right">Signed move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Signed Move vector element to general-purpose register. This instruction reads the signed integer from the source SIMD&amp;FP register, sign-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/smov-signed-move-vector-element-to-general-purpose-register">SMOV</a> Rd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SignExtend.2" title="function: bits(N) SignExtend(bits(M) x, integer N)">SignExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_s64" type="checkbox"><label for="vgetq_lane_s64"><div>int64_t <b><b>vgetq_lane_s64</b></b> (int64x2_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
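+ <h4>Usage sketch</h4> <p>A minimal C sketch (hypothetical wrapper). Although the element type is signed, the instruction is UMOV: the whole 64-bit element is moved, so no sign or zero extension is observable.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical wrapper: read the low lane of a 2-lane int64 vector. */
+int64_t get_lane0_s64(int64x2_t v)
+{
+    /* Expected to lower to UMOV Xd, Vn.D[0] (also written as MOV Xd, Vn.D[0]). */
+    return vgetq_lane_s64(v, 0);
+}</pre>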
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_p8" type="checkbox"><label for="vgetq_lane_p8"><div>poly8_t <b><b>vgetq_lane_p8</b></b> (poly8x16_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.B[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_p16" type="checkbox"><label for="vgetq_lane_p16"><div>poly16_t <b><b>vgetq_lane_p16</b></b> (poly16x8_t v, const int lane)<span class="right">Unsigned move vector element to general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Unsigned Move vector element to general-purpose register. This instruction reads the unsigned integer from the source SIMD&amp;FP register, zero-extends it to form a 32-bit or 64-bit value, and writes the result to the destination general-purpose register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/umov-unsigned-move-vector-element-to-general-purpose-register">UMOV</a> Rd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Rd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ZeroExtend.2" title="function: bits(N) ZeroExtend(bits(M) x, integer N)">ZeroExtend</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize], datasize);</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vget_lane_f16" type="checkbox"><label for="vget_lane_f16"><div>float16_t <b><b>vget_lane_f16</b></b> (float16x4_t v, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Hd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_f16" type="checkbox"><label for="vgetq_lane_f16"><div>float16_t <b><b>vgetq_lane_f16</b></b> (float16x8_t v, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Hd,Vn.H[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Hd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_f32" type="checkbox"><label for="vgetq_lane_f32"><div>float32_t <b><b>vgetq_lane_f32</b></b> (float32x4_t v, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Sd,Vn.S[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vgetq_lane_f64" type="checkbox"><label for="vgetq_lane_f64"><div>float64_t <b><b>vgetq_lane_f64</b></b> (float64x2_t v, const int lane)<span class="right">Duplicate vector element to scalar</span></div></label><article> <h4>Description</h4><p><p class="aml">Duplicate vector element to scalar. This instruction duplicates the vector element at the specified element index in the source SIMD&amp;FP register into a scalar, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/dup-general-duplicate-general-purpose-register-to-vector">DUP</a> Dd,Vn.D[lane]
+</pre> <h4>Argument Preparation</h4><pre>v &rarr; Vn.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(idxdsize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, index, esize];</pre>
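+ <h4>Usage sketch</h4> <p>A minimal C sketch (hypothetical wrapper). Floating-point lane reads stay in the SIMD&amp;FP register file, so the element form of DUP is used instead of a move to a general-purpose register.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical wrapper: read the high lane of a 2-lane float64 vector. */
+float64_t get_lane1_f64(float64x2_t v)
+{
+    /* Expected to lower to DUP Dd, Vn.D[1] (a scalar copy of element 1). */
+    return vgetq_lane_f64(v, 1);
+}</pre>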
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_u8" type="checkbox"><label for="vset_lane_u8"><div>uint8x8_t <b><b>vset_lane_u8</b></b> (uint8_t a, uint8x8_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
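+ <h4>Usage sketch</h4> <p>A minimal C sketch (hypothetical wrapper). The intrinsic is purely functional: it returns a new vector in which only the selected lane is replaced, as the Operation pseudocode above shows.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Hypothetical wrapper: overwrite lane 0 of an 8-lane uint8 vector with a. */
+uint8x8_t set_lane0_u8(uint8_t a, uint8x8_t v)
+{
+    /* Expected to lower to INS Vd.B[0], Wn; lanes 1..7 are preserved. */
+    return vset_lane_u8(a, v, 0);
+}</pre>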
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_u16" type="checkbox"><label for="vset_lane_u16"><div>uint16x4_t <b><b>vset_lane_u16</b></b> (uint16_t a, uint16x4_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_u32" type="checkbox"><label for="vset_lane_u32"><div>uint32x2_t <b><b>vset_lane_u32</b></b> (uint32_t a, uint32x2_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_u64" type="checkbox"><label for="vset_lane_u64"><div>uint64x1_t <b><b>vset_lane_u64</b></b> (uint64_t a, uint64x1_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_p64" type="checkbox"><label for="vset_lane_p64"><div>poly64x1_t <b><b>vset_lane_p64</b></b> (poly64_t a, poly64x1_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_s8" type="checkbox"><label for="vset_lane_s8"><div>int8x8_t <b><b>vset_lane_s8</b></b> (int8_t a, int8x8_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_s16" type="checkbox"><label for="vset_lane_s16"><div>int16x4_t <b><b>vset_lane_s16</b></b> (int16_t a, int16x4_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_s32" type="checkbox"><label for="vset_lane_s32"><div>int32x2_t <b><b>vset_lane_s32</b></b> (int32_t a, int32x2_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_s64" type="checkbox"><label for="vset_lane_s64"><div>int64x1_t <b><b>vset_lane_s64</b></b> (int64_t a, int64x1_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_p8" type="checkbox"><label for="vset_lane_p8"><div>poly8x8_t <b><b>vset_lane_p8</b></b> (poly8_t a, poly8x8_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.8B <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_p16" type="checkbox"><label for="vset_lane_p16"><div>poly16x4_t <b><b>vset_lane_p16</b></b> (poly16_t a, poly16x4_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_f16" type="checkbox"><label for="vset_lane_f16"><div>float16x4_t <b><b>vset_lane_f16</b></b> (float16_t a, float16x4_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane],Vn.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.H[0] <br />
+v &rarr; Vd.4H <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_f16" type="checkbox"><label for="vsetq_lane_f16"><div>float16x8_t <b><b>vsetq_lane_f16</b></b> (float16_t a, float16x8_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane],Vn.H[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.H[0] <br />
+v &rarr; Vd.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_f32" type="checkbox"><label for="vset_lane_f32"><div>float32x2_t <b><b>vset_lane_f32</b></b> (float32_t a, float32x2_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.2S <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vset_lane_f64" type="checkbox"><label for="vset_lane_f64"><div>float64x1_t <b><b>vset_lane_f64</b></b> (float64_t a, float64x1_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.1D <br />
+0 &lt;= lane &lt;= 0 </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_u8" type="checkbox"><label for="vsetq_lane_u8"><div>uint8x16_t <b><b>vsetq_lane_u8</b></b> (uint8_t a, uint8x16_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_u16" type="checkbox"><label for="vsetq_lane_u16"><div>uint16x8_t <b><b>vsetq_lane_u16</b></b> (uint16_t a, uint16x8_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_u32" type="checkbox"><label for="vsetq_lane_u32"><div>uint32x4_t <b><b>vsetq_lane_u32</b></b> (uint32_t a, uint32x4_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_u64" type="checkbox"><label for="vsetq_lane_u64"><div>uint64x2_t <b><b>vsetq_lane_u64</b></b> (uint64_t a, uint64x2_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_p64" type="checkbox"><label for="vsetq_lane_p64"><div>poly64x2_t <b><b>vsetq_lane_p64</b></b> (poly64_t a, poly64x2_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_s8" type="checkbox"><label for="vsetq_lane_s8"><div>int8x16_t <b><b>vsetq_lane_s8</b></b> (int8_t a, int8x16_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_s16" type="checkbox"><label for="vsetq_lane_s16"><div>int16x8_t <b><b>vsetq_lane_s16</b></b> (int16_t a, int16x8_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_s32" type="checkbox"><label for="vsetq_lane_s32"><div>int32x4_t <b><b>vsetq_lane_s32</b></b> (int32_t a, int32x4_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_s64" type="checkbox"><label for="vsetq_lane_s64"><div>int64x2_t <b><b>vsetq_lane_s64</b></b> (int64_t a, int64x2_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_p8" type="checkbox"><label for="vsetq_lane_p8"><div>poly8x16_t <b><b>vsetq_lane_p8</b></b> (poly8_t a, poly8x16_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.B[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.16B <br />
+0 &lt;= lane &lt;= 15 </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_p16" type="checkbox"><label for="vsetq_lane_p16"><div>poly16x8_t <b><b>vsetq_lane_p16</b></b> (poly16_t a, poly16x8_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.H[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.8H <br />
+0 &lt;= lane &lt;= 7 </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_f32" type="checkbox"><label for="vsetq_lane_f32"><div>float32x4_t <b><b>vsetq_lane_f32</b></b> (float32_t a, float32x4_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.S[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.4S <br />
+0 &lt;= lane &lt;= 3 </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vsetq_lane_f64" type="checkbox"><label for="vsetq_lane_f64"><div>float64x2_t <b><b>vsetq_lane_f64</b></b> (float64_t a, float64x2_t v, const int lane)<span class="right">Insert vector element from general-purpose register</span></div></label><article> <h4>Description</h4><p><p class="aml">Insert vector element from general-purpose register. This instruction copies the contents of the source general-purpose register to the specified vector element in the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ins-general-insert-vector-element-from-general-purpose-register">INS</a> Vd.D[lane],Rn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Rn <br />
+v &rarr; Vd.2D <br />
+0 &lt;= lane &lt;= 1 </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(esize) element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+bits(128) result;
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, index, esize] = element;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecpxs_f32" type="checkbox"><label for="vrecpxs_f32"><div>float32_t <b><b>vrecpxs_f32</b></b> (float32_t a)<span class="right">Floating-point reciprocal exponent</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal exponent (scalar). This instruction finds an approximate reciprocal exponent for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecpx-floating-point-reciprocal-exponent-scalar">FRECPX</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRecpX.2" title="function: bits(N) FPRecpX(bits(N) op, FPCRType fpcr)">FPRecpX</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vrecpxd_f64" type="checkbox"><label for="vrecpxd_f64"><div>float64_t <b><b>vrecpxd_f64</b></b> (float64_t a)<span class="right">Floating-point reciprocal exponent</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Reciprocal exponent (scalar). This instruction finds an approximate reciprocal exponent for each vector element in the source SIMD&amp;FP register, places the result in a vector, and writes the vector to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/frecpx-floating-point-reciprocal-exponent-scalar">FRECPX</a> Dd,Dn
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Dn </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) result;
+bits(esize) element;
+
+for e = 0 to elements-1
+ element = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPRecpX.2" title="function: bits(N) FPRecpX(bits(N) op, FPCRType fpcr)">FPRecpX</a>(element, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfma_n_f32" type="checkbox"><label for="vfma_n_f32"><div>float32x2_t <b><b>vfma_n_f32</b></b> (float32x2_t a, float32x2_t b, float32_t n)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+n &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vfmaq_n_f32" type="checkbox"><label for="vfmaq_n_f32"><div>float32x4_t <b><b>vfmaq_n_f32</b></b> (float32x4_t a, float32x4_t b, float32_t n)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+n &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
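+ <h4>Usage sketch</h4> <p>Editor's sketch, not part of the ARM-generated reference: a minimal C example of vfmaq_n_f32, assuming GCC or Clang with arm_neon.h and NEON enabled (e.g. -march=armv8-a, or -mfpu=neon-vfpv4 on ARMv7). Each result lane is a[i] + b[i]*n, computed with a single rounding.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const float bv[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    float32x4_t a = vdupq_n_f32(1.0f);        /* accumulator: all lanes 1.0 */
+    float32x4_t b = vld1q_f32(bv);
+    float32x4_t r = vfmaq_n_f32(a, b, 0.5f);  /* r[i] = a[i] + b[i]*0.5, fused */
+    float out[4];
+    vst1q_f32(out, r);
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1.5 2 2.5 3 */
+    return 0;
+}</pre>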
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vfms_n_f32" type="checkbox"><label for="vfms_n_f32"><div>float32x2_t <b><b>vfms_n_f32</b></b> (float32x2_t a, float32x2_t b, float32_t n)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.2S,Vn.2S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S <br />
+b &rarr; Vn.2S <br />
+n &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
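+ <h4>Usage sketch</h4> <p>Editor's sketch, not part of the ARM-generated reference: a minimal C example of vfms_n_f32, which per this reference is A64-only, assuming an AArch64 toolchain with arm_neon.h. Each result lane is a[i] - b[i]*n with a single rounding; the four-lane vfmsq_n_f32 below behaves the same way.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const float av[2] = {5.0f, 6.0f};
+    const float bv[2] = {1.0f, 2.0f};
+    /* r[i] = a[i] - b[i]*2.0, fused (product is negated, then added) */
+    float32x2_t r = vfms_n_f32(vld1_f32(av), vld1_f32(bv), 2.0f);
+    float out[2];
+    vst1_f32(out, r);
+    printf("%g %g\n", out[0], out[1]);  /* 3 2 */
+    return 0;
+}</pre>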
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsq_n_f32" type="checkbox"><label for="vfmsq_n_f32"><div>float32x4_t <b><b>vfmsq_n_f32</b></b> (float32x4_t a, float32x4_t b, float32_t n)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.4S,Vn.4S,Vm.S[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S <br />
+b &rarr; Vn.4S <br />
+n &rarr; Vm.S[0] </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfma_n_f64" type="checkbox"><label for="vfma_n_f64"><div>float64x1_t <b><b>vfma_n_f64</b></b> (float64x1_t a, float64x1_t b, float64_t n)<span class="right">Floating-point fused multiply-add</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add (scalar). This instruction multiplies the values of the first two SIMD&amp;FP source registers, adds the product to the value of the third SIMD&amp;FP source register, and writes the result to the SIMD&amp;FP destination register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmadd-floating-point-fused-multiply-add-scalar">FMADD</a> Dd,Dn,Dm,Da
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Da <br />
+b &rarr; Dn <br />
+n &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) result;
+bits(datasize) operanda = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[a];
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(operanda, operand1, operand2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
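+ <h4>Usage sketch</h4> <p>Editor's sketch, not part of the ARM-generated reference: a minimal C example of vfma_n_f64, assuming an AArch64 toolchain with arm_neon.h. The scalar FMADD form computes a + b*n in one rounding step, i.e. the same value as C99 fma(b, n, a).</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float64x1_t a = vdup_n_f64(10.0);
+    float64x1_t b = vdup_n_f64(3.0);
+    float64x1_t r = vfma_n_f64(a, b, 2.0);  /* 10 + 3*2 = 16, single rounding */
+    printf("%g\n", vget_lane_f64(r, 0));    /* matches fma(3.0, 2.0, 10.0) */
+    return 0;
+}</pre>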
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmaq_n_f64" type="checkbox"><label for="vfmaq_n_f64"><div>float64x2_t <b><b>vfmaq_n_f64</b></b> (float64x2_t a, float64x2_t b, float64_t n)<span class="right">Floating-point fused multiply-add to accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Add to accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, adds the product to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmla-vector-floating-point-fused-multiply-add-to-accumulator-vector">FMLA</a> Vd.2D,Vn.2D,Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+n &rarr; Vm.D[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfms_n_f64" type="checkbox"><label for="vfms_n_f64"><div>float64x1_t <b><b>vfms_n_f64</b></b> (float64x1_t a, float64x1_t b, float64_t n)<span class="right">Floating-point fused multiply-subtract</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point Fused Multiply-Subtract (scalar). This instruction multiplies the values of the first two SIMD&amp;FP source registers, negates the product, adds that to the value of the third SIMD&amp;FP source register, and writes the result to the SIMD&amp;FP destination register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmsub-floating-point-fused-multiply-subtract-scalar">FMSUB</a> Dd,Dn,Dm,Da
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Da <br />
+b &rarr; Dn <br />
+n &rarr; Dm </pre> <h4>Results</h4> <pre>Dd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) result;
+bits(datasize) operanda = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[a];
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+
+operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(operand1);
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(operanda, operand1, operand2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
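+ <h4>Usage sketch</h4> <p>Editor's sketch, not part of the ARM-generated reference: a minimal C example of vfms_n_f64, assuming an AArch64 toolchain with arm_neon.h. FMSUB negates the product before the fused add, so the result is a - b*n, i.e. the same value as C99 fma(-b, n, a); the two-lane vfmsq_n_f64 below applies this per lane.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    float64x1_t a = vdup_n_f64(10.0);
+    float64x1_t b = vdup_n_f64(3.0);
+    float64x1_t r = vfms_n_f64(a, b, 2.0);  /* 10 - 3*2 = 4, single rounding */
+    printf("%g\n", vget_lane_f64(r, 0));    /* matches fma(-3.0, 2.0, 10.0) */
+    return 0;
+}</pre>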
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vfmsq_n_f64" type="checkbox"><label for="vfmsq_n_f64"><div>float64x2_t <b><b>vfmsq_n_f64</b></b> (float64x2_t a, float64x2_t b, float64_t n)<span class="right">Floating-point fused multiply-subtract from accumulator</span></div></label><article> <h4>Description</h4><p><p class="aml">Floating-point fused Multiply-Subtract from accumulator (vector). This instruction multiplies corresponding floating-point values in the vectors in the two source SIMD&amp;FP registers, negates the product, adds the result to the corresponding vector element of the destination SIMD&amp;FP register, and writes the result to the destination SIMD&amp;FP register.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/fmls-vector-floating-point-fused-multiply-subtract-from-accumulator-vector">FMLS</a> Vd.2D,Vn.2D,Vm.D[0]
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D <br />
+b &rarr; Vn.2D <br />
+n &rarr; Vm.D[0] </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ if sub_op then element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPNeg.1" title="function: bits(N) FPNeg(bits(N) op)">FPNeg</a>(element1);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.FPMulAdd.4" title="function: bits(N) FPMulAdd(bits(N) addend, bits(N) op1, bits(N) op2, FPCRType fpcr)">FPMulAdd</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand3, e, esize], element1, element2, FPCR);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vtrn_s8" type="checkbox"><label for="vtrn_s8"><div>int8x8x2_t <b><b>vtrn_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
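+ <h4>Usage sketch</h4> <p>Editor's sketch, not part of the ARM-generated reference: a minimal C example of vtrn_s8, assuming a NEON-enabled GCC or Clang with arm_neon.h. Note that the intrinsic maps to both instructions listed above: result.val[0] comes from TRN1 (even-numbered lanes of each source) and result.val[1] from TRN2 (odd-numbered lanes); the quoted description covers only the TRN2 half. The same pattern holds for the vtrn_* variants that follow.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const int8_t av[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    const int8_t bv[8] = {10, 11, 12, 13, 14, 15, 16, 17};
+    int8x8x2_t t = vtrn_s8(vld1_s8(av), vld1_s8(bv));
+    /* t.val[0] (TRN1) interleaves even lanes: {0,10,2,12,4,14,6,16}
+       t.val[1] (TRN2) interleaves odd lanes:  {1,11,3,13,5,15,7,17} */
+    int8_t out[8];
+    vst1_s8(out, t.val[0]);
+    for (int i = 0; i &lt; 8; i++) printf("%d ", out[i]);
+    printf("\n");
+    return 0;
+}</pre>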
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrn_s16" type="checkbox"><label for="vtrn_s16"><div>int16x4x2_t <b><b>vtrn_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrn_u8" type="checkbox"><label for="vtrn_u8"><div>uint8x8x2_t <b><b>vtrn_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrn_u16" type="checkbox"><label for="vtrn_u16"><div>uint16x4x2_t <b><b>vtrn_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrn_p8" type="checkbox"><label for="vtrn_p8"><div>poly8x8x2_t <b><b>vtrn_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrn_p16" type="checkbox"><label for="vtrn_p16"><div>poly16x4x2_t <b><b>vtrn_p16</b></b> (poly16x4_t a, poly16x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrn_s32" type="checkbox"><label for="vtrn_s32"><div>int32x2x2_t <b><b>vtrn_s32</b></b> (int32x2_t a, int32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrn_f32" type="checkbox"><label for="vtrn_f32"><div>float32x2x2_t <b><b>vtrn_f32</b></b> (float32x2_t a, float32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
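+ <h4>Usage sketch</h4> <p>Editor's sketch, not part of the ARM-generated reference: with two-lane vectors the TRN1/TRN2 pair amounts to a full 2x2 transpose, so vtrn_f32 turns two matrix rows into two matrix columns. Assumes a NEON-enabled GCC or Clang with arm_neon.h.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+#include &lt;stdio.h&gt;
+
+int main(void) {
+    const float r0[2] = {1.0f, 2.0f};   /* row 0 of a 2x2 matrix */
+    const float r1[2] = {3.0f, 4.0f};   /* row 1 */
+    float32x2x2_t t = vtrn_f32(vld1_f32(r0), vld1_f32(r1));
+    /* t.val[0] = {1,3} (column 0), t.val[1] = {2,4} (column 1) */
+    float c0[2], c1[2];
+    vst1_f32(c0, t.val[0]);
+    vst1_f32(c1, t.val[1]);
+    printf("%g %g | %g %g\n", c0[0], c0[1], c1[0], c1[1]);  /* 1 3 | 2 4 */
+    return 0;
+}</pre>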
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrn_u32" type="checkbox"><label for="vtrn_u32"><div>uint32x2x2_t <b><b>vtrn_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+<h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_s8" type="checkbox"><label for="vtrnq_s8"><div>int8x16x2_t <b><b>vtrnq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_s16" type="checkbox"><label for="vtrnq_s16"><div>int16x8x2_t <b><b>vtrnq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_s32" type="checkbox"><label for="vtrnq_s32"><div>int32x4x2_t <b><b>vtrnq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_f32" type="checkbox"><label for="vtrnq_f32"><div>float32x4x2_t <b><b>vtrnq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_u8" type="checkbox"><label for="vtrnq_u8"><div>uint8x16x2_t <b><b>vtrnq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_u16" type="checkbox"><label for="vtrnq_u16"><div>uint16x8x2_t <b><b>vtrnq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_u32" type="checkbox"><label for="vtrnq_u32"><div>uint32x4x2_t <b><b>vtrnq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_p8" type="checkbox"><label for="vtrnq_p8"><div>poly8x16x2_t <b><b>vtrnq_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vtrnq_p16" type="checkbox"><label for="vtrnq_p16"><div>poly16x8x2_t <b><b>vtrnq_p16</b></b> (poly16x8_t a, poly16x8_t b)<span class="right">Transpose vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Transpose vectors (secondary). This instruction reads corresponding odd-numbered vector elements from the two source SIMD&amp;FP registers, places each result into consecutive elements of a vector, and writes the vector to the destination SIMD&amp;FP register. Vector elements from the first source register are placed into even-numbered elements of the destination vector, starting at zero, while vector elements from the second source register are placed into odd-numbered elements of the destination vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn1-transpose-vectors-primary">TRN1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/trn2-transpose-vectors-secondary">TRN2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, 2*p+part, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, 2*p+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_s8" type="checkbox"><label for="vzip_s8"><div>int8x8x2_t <b><b>vzip_s8</b></b> (int8x8_t a, int8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_s16" type="checkbox"><label for="vzip_s16"><div>int16x4x2_t <b><b>vzip_s16</b></b> (int16x4_t a, int16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_u8" type="checkbox"><label for="vzip_u8"><div>uint8x8x2_t <b><b>vzip_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_u16" type="checkbox"><label for="vzip_u16"><div>uint16x4x2_t <b><b>vzip_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_p8" type="checkbox"><label for="vzip_p8"><div>poly8x8x2_t <b><b>vzip_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_p16" type="checkbox"><label for="vzip_p16"><div>poly16x4x2_t <b><b>vzip_p16</b></b> (poly16x4_t a, poly16x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors (secondary). This instruction reads adjacent vector elements from the lower half of two source SIMD&amp;FP registers as pairs, interleaves the pairs and places them into a vector, and writes the vector to the destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_s32" type="checkbox"><label for="vzip_s32"><div>int32x2x2_t <b>vzip_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_f32" type="checkbox"><label for="vzip_f32"><div>float32x2x2_t <b>vzip_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzip_u32" type="checkbox"><label for="vzip_u32"><div>uint32x2x2_t <b>vzip_u32</b> (uint32x2_t a, uint32x2_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_s8" type="checkbox"><label for="vzipq_s8"><div>int8x16x2_t <b>vzipq_s8</b> (int8x16_t a, int8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_s16" type="checkbox"><label for="vzipq_s16"><div>int16x8x2_t <b>vzipq_s16</b> (int16x8_t a, int16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_s32" type="checkbox"><label for="vzipq_s32"><div>int32x4x2_t <b>vzipq_s32</b> (int32x4_t a, int32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_f32" type="checkbox"><label for="vzipq_f32"><div>float32x4x2_t <b>vzipq_f32</b> (float32x4_t a, float32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_u8" type="checkbox"><label for="vzipq_u8"><div>uint8x16x2_t <b>vzipq_u8</b> (uint8x16_t a, uint8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_u16" type="checkbox"><label for="vzipq_u16"><div>uint16x8x2_t <b>vzipq_u16</b> (uint16x8_t a, uint16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_u32" type="checkbox"><label for="vzipq_u32"><div>uint32x4x2_t <b>vzipq_u32</b> (uint32x4_t a, uint32x4_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_p8" type="checkbox"><label for="vzipq_p8"><div>poly8x16x2_t <b>vzipq_p8</b> (poly8x16_t a, poly8x16_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vzipq_p16" type="checkbox"><label for="vzipq_p16"><div>poly16x8x2_t <b>vzipq_p16</b> (poly16x8_t a, poly16x8_t b)<span class="right">Zip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Zip vectors. This intrinsic executes both the primary (ZIP1) and secondary (ZIP2) zip instructions: ZIP1 reads adjacent vector elements from the lower half of the two source SIMD&amp;FP registers as pairs, and ZIP2 reads them from the upper half. Each instruction interleaves its pairs, places them into a vector, and writes the vector to its destination SIMD&amp;FP register. The first pair from the first source register is placed into the two lowest vector elements, with subsequent pairs taken alternately from each source register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip1-zip-vectors-primary">ZIP1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/zip2-zip-vectors-secondary">ZIP2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+integer base = part * pairs;
+
+for p = 0 to pairs-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+0, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, base+p, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, 2*p+1, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, base+p, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_s8" type="checkbox"><label for="vuzp_s8"><div>int8x8x2_t <b>vuzp_s8</b> (int8x8_t a, int8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic executes both the primary (UZP1) and secondary (UZP2) unzip instructions: UZP1 reads the corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, and UZP2 reads the corresponding odd-numbered elements. Each instruction places the elements read from the first source register into consecutive elements in the lower half of a vector, places the elements read from the second source register into consecutive elements in the upper half of that vector, and writes the vector to its destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_s16" type="checkbox"><label for="vuzp_s16"><div>int16x4x2_t <b>vuzp_s16</b> (int16x4_t a, int16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic executes both the primary (UZP1) and secondary (UZP2) unzip instructions: UZP1 reads the corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, and UZP2 reads the corresponding odd-numbered elements. Each instruction places the elements read from the first source register into consecutive elements in the lower half of a vector, places the elements read from the second source register into consecutive elements in the upper half of that vector, and writes the vector to its destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_s32" type="checkbox"><label for="vuzp_s32"><div>int32x2x2_t <b>vuzp_s32</b> (int32x2_t a, int32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic executes both the primary (UZP1) and secondary (UZP2) unzip instructions: UZP1 reads the corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, and UZP2 reads the corresponding odd-numbered elements. Each instruction places the elements read from the first source register into consecutive elements in the lower half of a vector, places the elements read from the second source register into consecutive elements in the upper half of that vector, and writes the vector to its destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_f32" type="checkbox"><label for="vuzp_f32"><div>float32x2x2_t <b>vuzp_f32</b> (float32x2_t a, float32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic executes both the primary (UZP1) and secondary (UZP2) unzip instructions: UZP1 reads the corresponding even-numbered vector elements from the two source SIMD&amp;FP registers, and UZP2 reads the corresponding odd-numbered elements. Each instruction places the elements read from the first source register into consecutive elements in the lower half of a vector, places the elements read from the second source register into consecutive elements in the upper half of that vector, and writes the vector to its destination SIMD&amp;FP register.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_u8" type="checkbox"><label for="vuzp_u8"><div>uint8x8x2_t <b><b>vuzp_u8</b></b> (uint8x8_t a, uint8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
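+ <h4>Usage sketch</h4> <p>An illustrative example (editorial addition, not part of the ARM reference): vuzp_u8 separating interleaved (x, y) byte pairs into an x vector and a y vector; assumes &lt;arm_neon.h&gt; and a hypothetical function name.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* a = {x0,y0,x1,y1,x2,y2,x3,y3}, b = {x4,y4,x5,y5,x6,y6,x7,y7} */
+uint8x8x2_t split_pairs(uint8x8_t a, uint8x8_t b) {
+    uint8x8x2_t r = vuzp_u8(a, b);
+    /* r.val[0] = {x0,...,x7}, r.val[1] = {y0,...,y7} */
+    return r;
+}</pre>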
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_u16" type="checkbox"><label for="vuzp_u16"><div>uint16x4x2_t <b><b>vuzp_u16</b></b> (uint16x4_t a, uint16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_u32" type="checkbox"><label for="vuzp_u32"><div>uint32x2x2_t <b><b>vuzp_u32</b></b> (uint32x2_t a, uint32x2_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.2S,Vn.2S,Vm.2S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.2S,Vn.2S,Vm.2S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2S <br />
+b &rarr; Vm.2S </pre> <h4>Results</h4> <pre>Vd1.2S &rarr; result.val[0]
+Vd2.2S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_p8" type="checkbox"><label for="vuzp_p8"><div>poly8x8x2_t <b><b>vuzp_p8</b></b> (poly8x8_t a, poly8x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.8B,Vn.8B,Vm.8B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.8B,Vn.8B,Vm.8B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8B <br />
+b &rarr; Vm.8B </pre> <h4>Results</h4> <pre>Vd1.8B &rarr; result.val[0]
+Vd2.8B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzp_p16" type="checkbox"><label for="vuzp_p16"><div>poly16x4x2_t <b><b>vuzp_p16</b></b> (poly16x4_t a, poly16x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.4H,Vn.4H,Vm.4H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.4H,Vn.4H,Vm.4H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4H <br />
+b &rarr; Vm.4H </pre> <h4>Results</h4> <pre>Vd1.4H &rarr; result.val[0]
+Vd2.4H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_s8" type="checkbox"><label for="vuzpq_s8"><div>int8x16x2_t <b><b>vuzpq_s8</b></b> (int8x16_t a, int8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
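+ <h4>Usage sketch</h4> <p>An illustrative example (editorial addition, not part of the ARM reference): the 128-bit vuzpq_s8 splitting byte vectors into their even- and odd-indexed lanes, e.g. the low and high bytes of packed little-endian 16-bit values; assumes &lt;arm_neon.h&gt; and a hypothetical function name.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+int8x16x2_t split_even_odd(int8x16_t a, int8x16_t b) {
+    int8x16x2_t r = vuzpq_s8(a, b);
+    /* r.val[0]: bytes 0,2,4,... of a, then of b (low bytes)  */
+    /* r.val[1]: bytes 1,3,5,... of a, then of b (high bytes) */
+    return r;
+}</pre>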
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_s16" type="checkbox"><label for="vuzpq_s16"><div>int16x8x2_t <b><b>vuzpq_s16</b></b> (int16x8_t a, int16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_s32" type="checkbox"><label for="vuzpq_s32"><div>int32x4x2_t <b><b>vuzpq_s32</b></b> (int32x4_t a, int32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_f32" type="checkbox"><label for="vuzpq_f32"><div>float32x4x2_t <b><b>vuzpq_f32</b></b> (float32x4_t a, float32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
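+ <h4>Usage sketch</h4> <p>An illustrative example (editorial addition, not part of the ARM reference): vuzpq_f32 de-interleaving stereo samples into separate left/right channel vectors; assumes &lt;arm_neon.h&gt; and a hypothetical function name.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* s01 = {L0,R0,L1,R1}, s23 = {L2,R2,L3,R3} */
+float32x4x2_t split_stereo(float32x4_t s01, float32x4_t s23) {
+    float32x4x2_t r = vuzpq_f32(s01, s23);
+    /* r.val[0] = {L0,L1,L2,L3}, r.val[1] = {R0,R1,R2,R3} */
+    return r;
+}</pre>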
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_u8" type="checkbox"><label for="vuzpq_u8"><div>uint8x16x2_t <b><b>vuzpq_u8</b></b> (uint8x16_t a, uint8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_u16" type="checkbox"><label for="vuzpq_u16"><div>uint16x8x2_t <b><b>vuzpq_u16</b></b> (uint16x8_t a, uint16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_u32" type="checkbox"><label for="vuzpq_u32"><div>uint32x4x2_t <b><b>vuzpq_u32</b></b> (uint32x4_t a, uint32x4_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.4S,Vn.4S,Vm.4S
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.4S <br />
+b &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd1.4S &rarr; result.val[0]
+Vd2.4S &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
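+ <h4>Usage sketch</h4> <p>An illustrative example (editorial addition, not part of the ARM reference): vuzpq_u32 undoes the interleaving performed by vzipq_u32, so an unzip followed by a zip restores the original vectors; assumes &lt;arm_neon.h&gt; and a hypothetical function name.</p> <pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+void roundtrip(uint32x4_t a, uint32x4_t b) {
+    uint32x4x2_t u = vuzpq_u32(a, b);
+    /* u.val[0] = {a0,a2,b0,b2}, u.val[1] = {a1,a3,b1,b3} */
+    uint32x4x2_t z = vzipq_u32(u.val[0], u.val[1]);  /* re-interleave */
+    /* z.val[0] == a, z.val[1] == b */
+    (void)z;
+}</pre>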
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_p8" type="checkbox"><label for="vuzpq_p8"><div>poly8x16x2_t <b><b>vuzpq_p8</b></b> (poly8x16_t a, poly8x16_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.16B,Vn.16B,Vm.16B
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.16B,Vn.16B,Vm.16B
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.16B <br />
+b &rarr; Vm.16B </pre> <h4>Results</h4> <pre>Vd1.16B &rarr; result.val[0]
+Vd2.16B &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vuzpq_p16" type="checkbox"><label for="vuzpq_p16"><div>poly16x8x2_t <b><b>vuzpq_p16</b></b> (poly16x8_t a, poly16x8_t b)<span class="right">Unzip vectors</span></div></label><article> <h4>Description</h4><p><p class="aml">Unzip vectors. This intrinsic performs both the primary (UZP1) and secondary (UZP2) unzip operations: the even-indexed elements of the first source SIMD&amp;FP register, followed by those of the second, are placed in consecutive elements of the first result vector, and the odd-indexed elements, in the same order, in consecutive elements of the second result vector.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp1-unzip-vectors-primary">UZP1</a> Vd1.8H,Vn.8H,Vm.8H
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/uzp2-unzip-vectors-secondary">UZP2</a> Vd2.8H,Vn.8H,Vm.8H
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.8H <br />
+b &rarr; Vm.8H </pre> <h4>Results</h4> <pre>Vd1.8H &rarr; result.val[0]
+Vd2.8H &rarr; result.val[1]
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operandl = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(datasize) operandh = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(datasize) result;
+
+bits(datasize*2) zipped = operandh:operandl;
+for e = 0 to elements-1
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[zipped, 2*e+part, esize];
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_s8" type="checkbox"><label for="vreinterpret_s16_s8"><div>int16x4_t <b><b>vreinterpret_s16_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_s8" type="checkbox"><label for="vreinterpret_s32_s8"><div>int32x2_t <b><b>vreinterpret_s32_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_s8" type="checkbox"><label for="vreinterpret_f32_s8"><div>float32x2_t <b><b>vreinterpret_f32_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_s8" type="checkbox"><label for="vreinterpret_u8_s8"><div>uint8x8_t <b><b>vreinterpret_u8_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_s8" type="checkbox"><label for="vreinterpret_u16_s8"><div>uint16x4_t <b><b>vreinterpret_u16_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_s8" type="checkbox"><label for="vreinterpret_u32_s8"><div>uint32x2_t <b><b>vreinterpret_u32_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_s8" type="checkbox"><label for="vreinterpret_p8_s8"><div>poly8x8_t <b><b>vreinterpret_p8_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_s8" type="checkbox"><label for="vreinterpret_p16_s8"><div>poly16x4_t <b><b>vreinterpret_p16_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_s8" type="checkbox"><label for="vreinterpret_u64_s8"><div>uint64x1_t <b><b>vreinterpret_u64_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_s8" type="checkbox"><label for="vreinterpret_s64_s8"><div>int64x1_t <b><b>vreinterpret_s64_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_s8" type="checkbox"><label for="vreinterpret_f64_s8"><div>float64x1_t <b><b>vreinterpret_f64_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_s8" type="checkbox"><label for="vreinterpret_p64_s8"><div>poly64x1_t <b><b>vreinterpret_p64_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_s8" type="checkbox"><label for="vreinterpret_f16_s8"><div>float16x4_t <b><b>vreinterpret_f16_s8</b></b> (int8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_s16" type="checkbox"><label for="vreinterpret_s8_s16"><div>int8x8_t <b><b>vreinterpret_s8_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_s16" type="checkbox"><label for="vreinterpret_s32_s16"><div>int32x2_t <b><b>vreinterpret_s32_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_s16" type="checkbox"><label for="vreinterpret_f32_s16"><div>float32x2_t <b><b>vreinterpret_f32_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_s16" type="checkbox"><label for="vreinterpret_u8_s16"><div>uint8x8_t <b><b>vreinterpret_u8_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_s16" type="checkbox"><label for="vreinterpret_u16_s16"><div>uint16x4_t <b><b>vreinterpret_u16_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_s16" type="checkbox"><label for="vreinterpret_u32_s16"><div>uint32x2_t <b><b>vreinterpret_u32_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_s16" type="checkbox"><label for="vreinterpret_p8_s16"><div>poly8x8_t <b><b>vreinterpret_p8_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_s16" type="checkbox"><label for="vreinterpret_p16_s16"><div>poly16x4_t <b><b>vreinterpret_p16_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_s16" type="checkbox"><label for="vreinterpret_u64_s16"><div>uint64x1_t <b><b>vreinterpret_u64_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_s16" type="checkbox"><label for="vreinterpret_s64_s16"><div>int64x1_t <b><b>vreinterpret_s64_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_s16" type="checkbox"><label for="vreinterpret_f64_s16"><div>float64x1_t <b><b>vreinterpret_f64_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_s16" type="checkbox"><label for="vreinterpret_p64_s16"><div>poly64x1_t <b><b>vreinterpret_p64_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_s16" type="checkbox"><label for="vreinterpret_f16_s16"><div>float16x4_t <b><b>vreinterpret_f16_s16</b></b> (int16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_s32" type="checkbox"><label for="vreinterpret_s8_s32"><div>int8x8_t <b><b>vreinterpret_s8_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_s32" type="checkbox"><label for="vreinterpret_s16_s32"><div>int16x4_t <b><b>vreinterpret_s16_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_s32" type="checkbox"><label for="vreinterpret_f32_s32"><div>float32x2_t <b><b>vreinterpret_f32_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_s32" type="checkbox"><label for="vreinterpret_u8_s32"><div>uint8x8_t <b><b>vreinterpret_u8_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_s32" type="checkbox"><label for="vreinterpret_u16_s32"><div>uint16x4_t <b><b>vreinterpret_u16_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_s32" type="checkbox"><label for="vreinterpret_u32_s32"><div>uint32x2_t <b><b>vreinterpret_u32_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_s32" type="checkbox"><label for="vreinterpret_p8_s32"><div>poly8x8_t <b><b>vreinterpret_p8_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_s32" type="checkbox"><label for="vreinterpret_p16_s32"><div>poly16x4_t <b><b>vreinterpret_p16_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_s32" type="checkbox"><label for="vreinterpret_u64_s32"><div>uint64x1_t <b><b>vreinterpret_u64_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_s32" type="checkbox"><label for="vreinterpret_s64_s32"><div>int64x1_t <b><b>vreinterpret_s64_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_s32" type="checkbox"><label for="vreinterpret_f64_s32"><div>float64x1_t <b><b>vreinterpret_f64_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_s32" type="checkbox"><label for="vreinterpret_p64_s32"><div>poly64x1_t <b><b>vreinterpret_p64_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_s32" type="checkbox"><label for="vreinterpret_f16_s32"><div>float16x4_t <b><b>vreinterpret_f16_s32</b></b> (int32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_f32" type="checkbox"><label for="vreinterpret_s8_f32"><div>int8x8_t <b><b>vreinterpret_s8_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_f32" type="checkbox"><label for="vreinterpret_s16_f32"><div>int16x4_t <b><b>vreinterpret_s16_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_f32" type="checkbox"><label for="vreinterpret_s32_f32"><div>int32x2_t <b><b>vreinterpret_s32_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_f32" type="checkbox"><label for="vreinterpret_u8_f32"><div>uint8x8_t <b><b>vreinterpret_u8_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_f32" type="checkbox"><label for="vreinterpret_u16_f32"><div>uint16x4_t <b><b>vreinterpret_u16_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_f32" type="checkbox"><label for="vreinterpret_u32_f32"><div>uint32x2_t <b><b>vreinterpret_u32_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_f32" type="checkbox"><label for="vreinterpret_p8_f32"><div>poly8x8_t <b><b>vreinterpret_p8_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_f32" type="checkbox"><label for="vreinterpret_p16_f32"><div>poly16x4_t <b><b>vreinterpret_p16_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_f32" type="checkbox"><label for="vreinterpret_u64_f32"><div>uint64x1_t <b><b>vreinterpret_u64_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_f32" type="checkbox"><label for="vreinterpret_s64_f32"><div>int64x1_t <b><b>vreinterpret_s64_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_f32" type="checkbox"><label for="vreinterpret_f64_f32"><div>float64x1_t <b><b>vreinterpret_f64_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_f32" type="checkbox"><label for="vreinterpret_p64_f32"><div>poly64x1_t <b><b>vreinterpret_p64_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_f64" type="checkbox"><label for="vreinterpret_p64_f64"><div>poly64x1_t <b><b>vreinterpret_p64_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_f32" type="checkbox"><label for="vreinterpret_f16_f32"><div>float16x4_t <b><b>vreinterpret_f16_f32</b></b> (float32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_u8" type="checkbox"><label for="vreinterpret_s8_u8"><div>int8x8_t <b><b>vreinterpret_s8_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_u8" type="checkbox"><label for="vreinterpret_s16_u8"><div>int16x4_t <b><b>vreinterpret_s16_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_u8" type="checkbox"><label for="vreinterpret_s32_u8"><div>int32x2_t <b><b>vreinterpret_s32_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_u8" type="checkbox"><label for="vreinterpret_f32_u8"><div>float32x2_t <b><b>vreinterpret_f32_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_u8" type="checkbox"><label for="vreinterpret_u16_u8"><div>uint16x4_t <b><b>vreinterpret_u16_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_u8" type="checkbox"><label for="vreinterpret_u32_u8"><div>uint32x2_t <b><b>vreinterpret_u32_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_u8" type="checkbox"><label for="vreinterpret_p8_u8"><div>poly8x8_t <b><b>vreinterpret_p8_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_u8" type="checkbox"><label for="vreinterpret_p16_u8"><div>poly16x4_t <b><b>vreinterpret_p16_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_u8" type="checkbox"><label for="vreinterpret_u64_u8"><div>uint64x1_t <b><b>vreinterpret_u64_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_u8" type="checkbox"><label for="vreinterpret_s64_u8"><div>int64x1_t <b><b>vreinterpret_s64_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_u8" type="checkbox"><label for="vreinterpret_f64_u8"><div>float64x1_t <b><b>vreinterpret_f64_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_u8" type="checkbox"><label for="vreinterpret_p64_u8"><div>poly64x1_t <b><b>vreinterpret_p64_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_u8" type="checkbox"><label for="vreinterpret_f16_u8"><div>float16x4_t <b><b>vreinterpret_f16_u8</b></b> (uint8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_u16" type="checkbox"><label for="vreinterpret_s8_u16"><div>int8x8_t <b><b>vreinterpret_s8_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_u16" type="checkbox"><label for="vreinterpret_s16_u16"><div>int16x4_t <b><b>vreinterpret_s16_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_u16" type="checkbox"><label for="vreinterpret_s32_u16"><div>int32x2_t <b><b>vreinterpret_s32_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_u16" type="checkbox"><label for="vreinterpret_f32_u16"><div>float32x2_t <b><b>vreinterpret_f32_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_u16" type="checkbox"><label for="vreinterpret_u8_u16"><div>uint8x8_t <b><b>vreinterpret_u8_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_u16" type="checkbox"><label for="vreinterpret_u32_u16"><div>uint32x2_t <b><b>vreinterpret_u32_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_u16" type="checkbox"><label for="vreinterpret_p8_u16"><div>poly8x8_t <b><b>vreinterpret_p8_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_u16" type="checkbox"><label for="vreinterpret_p16_u16"><div>poly16x4_t <b><b>vreinterpret_p16_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_u16" type="checkbox"><label for="vreinterpret_u64_u16"><div>uint64x1_t <b><b>vreinterpret_u64_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_u16" type="checkbox"><label for="vreinterpret_s64_u16"><div>int64x1_t <b><b>vreinterpret_s64_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_u16" type="checkbox"><label for="vreinterpret_f64_u16"><div>float64x1_t <b><b>vreinterpret_f64_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_u16" type="checkbox"><label for="vreinterpret_p64_u16"><div>poly64x1_t <b><b>vreinterpret_p64_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_u16" type="checkbox"><label for="vreinterpret_f16_u16"><div>float16x4_t <b><b>vreinterpret_f16_u16</b></b> (uint16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_u32" type="checkbox"><label for="vreinterpret_s8_u32"><div>int8x8_t <b><b>vreinterpret_s8_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_u32" type="checkbox"><label for="vreinterpret_s16_u32"><div>int16x4_t <b><b>vreinterpret_s16_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_u32" type="checkbox"><label for="vreinterpret_s32_u32"><div>int32x2_t <b><b>vreinterpret_s32_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_u32" type="checkbox"><label for="vreinterpret_f32_u32"><div>float32x2_t <b><b>vreinterpret_f32_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_u32" type="checkbox"><label for="vreinterpret_u8_u32"><div>uint8x8_t <b><b>vreinterpret_u8_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_u32" type="checkbox"><label for="vreinterpret_u16_u32"><div>uint16x4_t <b><b>vreinterpret_u16_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_u32" type="checkbox"><label for="vreinterpret_p8_u32"><div>poly8x8_t <b><b>vreinterpret_p8_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_u32" type="checkbox"><label for="vreinterpret_p16_u32"><div>poly16x4_t <b><b>vreinterpret_p16_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_u32" type="checkbox"><label for="vreinterpret_u64_u32"><div>uint64x1_t <b><b>vreinterpret_u64_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_u32" type="checkbox"><label for="vreinterpret_s64_u32"><div>int64x1_t <b><b>vreinterpret_s64_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_u32" type="checkbox"><label for="vreinterpret_f64_u32"><div>float64x1_t <b><b>vreinterpret_f64_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_u32" type="checkbox"><label for="vreinterpret_p64_u32"><div>poly64x1_t <b><b>vreinterpret_p64_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_u32" type="checkbox"><label for="vreinterpret_f16_u32"><div>float16x4_t <b><b>vreinterpret_f16_u32</b></b> (uint32x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2S </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_p8" type="checkbox"><label for="vreinterpret_s8_p8"><div>int8x8_t <b><b>vreinterpret_s8_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_p8" type="checkbox"><label for="vreinterpret_s16_p8"><div>int16x4_t <b><b>vreinterpret_s16_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_p8" type="checkbox"><label for="vreinterpret_s32_p8"><div>int32x2_t <b><b>vreinterpret_s32_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_p8" type="checkbox"><label for="vreinterpret_f32_p8"><div>float32x2_t <b><b>vreinterpret_f32_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_p8" type="checkbox"><label for="vreinterpret_u8_p8"><div>uint8x8_t <b><b>vreinterpret_u8_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_p8" type="checkbox"><label for="vreinterpret_u16_p8"><div>uint16x4_t <b><b>vreinterpret_u16_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_p8" type="checkbox"><label for="vreinterpret_u32_p8"><div>uint32x2_t <b><b>vreinterpret_u32_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_p8" type="checkbox"><label for="vreinterpret_p16_p8"><div>poly16x4_t <b><b>vreinterpret_p16_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_p8" type="checkbox"><label for="vreinterpret_u64_p8"><div>uint64x1_t <b><b>vreinterpret_u64_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_p8" type="checkbox"><label for="vreinterpret_s64_p8"><div>int64x1_t <b><b>vreinterpret_s64_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_p8" type="checkbox"><label for="vreinterpret_f64_p8"><div>float64x1_t <b><b>vreinterpret_f64_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_p8" type="checkbox"><label for="vreinterpret_p64_p8"><div>poly64x1_t <b><b>vreinterpret_p64_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_p8" type="checkbox"><label for="vreinterpret_f16_p8"><div>float16x4_t <b><b>vreinterpret_f16_p8</b></b> (poly8x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8B </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_p16" type="checkbox"><label for="vreinterpret_s8_p16"><div>int8x8_t <b><b>vreinterpret_s8_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_p16" type="checkbox"><label for="vreinterpret_s16_p16"><div>int16x4_t <b><b>vreinterpret_s16_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_p16" type="checkbox"><label for="vreinterpret_s32_p16"><div>int32x2_t <b><b>vreinterpret_s32_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_p16" type="checkbox"><label for="vreinterpret_f32_p16"><div>float32x2_t <b><b>vreinterpret_f32_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_p16" type="checkbox"><label for="vreinterpret_u8_p16"><div>uint8x8_t <b><b>vreinterpret_u8_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_p16" type="checkbox"><label for="vreinterpret_u16_p16"><div>uint16x4_t <b><b>vreinterpret_u16_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_p16" type="checkbox"><label for="vreinterpret_u32_p16"><div>uint32x2_t <b><b>vreinterpret_u32_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_p16" type="checkbox"><label for="vreinterpret_p8_p16"><div>poly8x8_t <b><b>vreinterpret_p8_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_p16" type="checkbox"><label for="vreinterpret_u64_p16"><div>uint64x1_t <b><b>vreinterpret_u64_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_p16" type="checkbox"><label for="vreinterpret_s64_p16"><div>int64x1_t <b><b>vreinterpret_s64_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_p16" type="checkbox"><label for="vreinterpret_f64_p16"><div>float64x1_t <b><b>vreinterpret_f64_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_p16" type="checkbox"><label for="vreinterpret_p64_p16"><div>poly64x1_t <b><b>vreinterpret_p64_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_p16" type="checkbox"><label for="vreinterpret_f16_p16"><div>float16x4_t <b><b>vreinterpret_f16_p16</b></b> (poly16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_u64" type="checkbox"><label for="vreinterpret_s8_u64"><div>int8x8_t <b><b>vreinterpret_s8_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_u64" type="checkbox"><label for="vreinterpret_s16_u64"><div>int16x4_t <b><b>vreinterpret_s16_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_u64" type="checkbox"><label for="vreinterpret_s32_u64"><div>int32x2_t <b><b>vreinterpret_s32_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_u64" type="checkbox"><label for="vreinterpret_f32_u64"><div>float32x2_t <b><b>vreinterpret_f32_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_u64" type="checkbox"><label for="vreinterpret_u8_u64"><div>uint8x8_t <b><b>vreinterpret_u8_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_u64" type="checkbox"><label for="vreinterpret_u16_u64"><div>uint16x4_t <b><b>vreinterpret_u16_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_u64" type="checkbox"><label for="vreinterpret_u32_u64"><div>uint32x2_t <b><b>vreinterpret_u32_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_u64" type="checkbox"><label for="vreinterpret_p8_u64"><div>poly8x8_t <b><b>vreinterpret_p8_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_u64" type="checkbox"><label for="vreinterpret_p16_u64"><div>poly16x4_t <b><b>vreinterpret_p16_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_u64" type="checkbox"><label for="vreinterpret_s64_u64"><div>int64x1_t <b><b>vreinterpret_s64_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_u64" type="checkbox"><label for="vreinterpret_f64_u64"><div>float64x1_t <b><b>vreinterpret_f64_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_u64" type="checkbox"><label for="vreinterpret_p64_u64"><div>poly64x1_t <b><b>vreinterpret_p64_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_u64" type="checkbox"><label for="vreinterpret_f16_u64"><div>float16x4_t <b><b>vreinterpret_f16_u64</b></b> (uint64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_s64" type="checkbox"><label for="vreinterpret_s8_s64"><div>int8x8_t <b><b>vreinterpret_s8_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_s64" type="checkbox"><label for="vreinterpret_s16_s64"><div>int16x4_t <b><b>vreinterpret_s16_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_s64" type="checkbox"><label for="vreinterpret_s32_s64"><div>int32x2_t <b><b>vreinterpret_s32_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_s64" type="checkbox"><label for="vreinterpret_f32_s64"><div>float32x2_t <b><b>vreinterpret_f32_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_s64" type="checkbox"><label for="vreinterpret_u8_s64"><div>uint8x8_t <b><b>vreinterpret_u8_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_s64" type="checkbox"><label for="vreinterpret_u16_s64"><div>uint16x4_t <b><b>vreinterpret_u16_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_s64" type="checkbox"><label for="vreinterpret_u32_s64"><div>uint32x2_t <b><b>vreinterpret_u32_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_s64" type="checkbox"><label for="vreinterpret_p8_s64"><div>poly8x8_t <b><b>vreinterpret_p8_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_s64" type="checkbox"><label for="vreinterpret_p16_s64"><div>poly16x4_t <b><b>vreinterpret_p16_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_s64" type="checkbox"><label for="vreinterpret_u64_s64"><div>uint64x1_t <b><b>vreinterpret_u64_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_s64" type="checkbox"><label for="vreinterpret_f64_s64"><div>float64x1_t <b><b>vreinterpret_f64_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_p64" type="checkbox"><label for="vreinterpret_u64_p64"><div>uint64x1_t <b><b>vreinterpret_u64_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_s64" type="checkbox"><label for="vreinterpret_f16_s64"><div>float16x4_t <b><b>vreinterpret_f16_s64</b></b> (int64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_f16" type="checkbox"><label for="vreinterpret_s8_f16"><div>int8x8_t <b><b>vreinterpret_s8_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_f16" type="checkbox"><label for="vreinterpret_s16_f16"><div>int16x4_t <b><b>vreinterpret_s16_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_f16" type="checkbox"><label for="vreinterpret_s32_f16"><div>int32x2_t <b><b>vreinterpret_s32_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_f16" type="checkbox"><label for="vreinterpret_f32_f16"><div>float32x2_t <b><b>vreinterpret_f32_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_f16" type="checkbox"><label for="vreinterpret_u8_f16"><div>uint8x8_t <b><b>vreinterpret_u8_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_f16" type="checkbox"><label for="vreinterpret_u16_f16"><div>uint16x4_t <b><b>vreinterpret_u16_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_f16" type="checkbox"><label for="vreinterpret_u32_f16"><div>uint32x2_t <b><b>vreinterpret_u32_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_f16" type="checkbox"><label for="vreinterpret_p8_f16"><div>poly8x8_t <b><b>vreinterpret_p8_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_f16" type="checkbox"><label for="vreinterpret_p16_f16"><div>poly16x4_t <b><b>vreinterpret_p16_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_f16" type="checkbox"><label for="vreinterpret_u64_f16"><div>uint64x1_t <b><b>vreinterpret_u64_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_f16" type="checkbox"><label for="vreinterpret_s64_f16"><div>int64x1_t <b><b>vreinterpret_s64_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_f16" type="checkbox"><label for="vreinterpret_f64_f16"><div>float64x1_t <b><b>vreinterpret_f64_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p64_f16" type="checkbox"><label for="vreinterpret_p64_f16"><div>poly64x1_t <b><b>vreinterpret_p64_f16</b></b> (float16x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4H </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_s8" type="checkbox"><label for="vreinterpretq_s16_s8"><div>int16x8_t <b><b>vreinterpretq_s16_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_s8" type="checkbox"><label for="vreinterpretq_s32_s8"><div>int32x4_t <b><b>vreinterpretq_s32_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_s8" type="checkbox"><label for="vreinterpretq_f32_s8"><div>float32x4_t <b><b>vreinterpretq_f32_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_s8" type="checkbox"><label for="vreinterpretq_u8_s8"><div>uint8x16_t <b><b>vreinterpretq_u8_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_s8" type="checkbox"><label for="vreinterpretq_u16_s8"><div>uint16x8_t <b><b>vreinterpretq_u16_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_s8" type="checkbox"><label for="vreinterpretq_u32_s8"><div>uint32x4_t <b><b>vreinterpretq_u32_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_s8" type="checkbox"><label for="vreinterpretq_p8_s8"><div>poly8x16_t <b><b>vreinterpretq_p8_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_s8" type="checkbox"><label for="vreinterpretq_p16_s8"><div>poly16x8_t <b><b>vreinterpretq_p16_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_s8" type="checkbox"><label for="vreinterpretq_u64_s8"><div>uint64x2_t <b><b>vreinterpretq_u64_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_s8" type="checkbox"><label for="vreinterpretq_s64_s8"><div>int64x2_t <b><b>vreinterpretq_s64_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_s8" type="checkbox"><label for="vreinterpretq_f64_s8"><div>float64x2_t <b><b>vreinterpretq_f64_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_s8" type="checkbox"><label for="vreinterpretq_p64_s8"><div>poly64x2_t <b><b>vreinterpretq_p64_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_s8" type="checkbox"><label for="vreinterpretq_p128_s8"><div>poly128_t <b><b>vreinterpretq_p128_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_s8" type="checkbox"><label for="vreinterpretq_f16_s8"><div>float16x8_t <b><b>vreinterpretq_f16_s8</b></b> (int8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_s16" type="checkbox"><label for="vreinterpretq_s8_s16"><div>int8x16_t <b><b>vreinterpretq_s8_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_s16" type="checkbox"><label for="vreinterpretq_s32_s16"><div>int32x4_t <b><b>vreinterpretq_s32_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_s16" type="checkbox"><label for="vreinterpretq_f32_s16"><div>float32x4_t <b><b>vreinterpretq_f32_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_s16" type="checkbox"><label for="vreinterpretq_u8_s16"><div>uint8x16_t <b><b>vreinterpretq_u8_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_s16" type="checkbox"><label for="vreinterpretq_u16_s16"><div>uint16x8_t <b><b>vreinterpretq_u16_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_s16" type="checkbox"><label for="vreinterpretq_u32_s16"><div>uint32x4_t <b><b>vreinterpretq_u32_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_s16" type="checkbox"><label for="vreinterpretq_p8_s16"><div>poly8x16_t <b><b>vreinterpretq_p8_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_s16" type="checkbox"><label for="vreinterpretq_p16_s16"><div>poly16x8_t <b><b>vreinterpretq_p16_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_s16" type="checkbox"><label for="vreinterpretq_u64_s16"><div>uint64x2_t <b><b>vreinterpretq_u64_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_s16" type="checkbox"><label for="vreinterpretq_s64_s16"><div>int64x2_t <b><b>vreinterpretq_s64_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_s16" type="checkbox"><label for="vreinterpretq_f64_s16"><div>float64x2_t <b><b>vreinterpretq_f64_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_s16" type="checkbox"><label for="vreinterpretq_p64_s16"><div>poly64x2_t <b><b>vreinterpretq_p64_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_s16" type="checkbox"><label for="vreinterpretq_p128_s16"><div>poly128_t <b><b>vreinterpretq_p128_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_s16" type="checkbox"><label for="vreinterpretq_f16_s16"><div>float16x8_t <b><b>vreinterpretq_f16_s16</b></b> (int16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_s32" type="checkbox"><label for="vreinterpretq_s8_s32"><div>int8x16_t <b><b>vreinterpretq_s8_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_s32" type="checkbox"><label for="vreinterpretq_s16_s32"><div>int16x8_t <b><b>vreinterpretq_s16_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_s32" type="checkbox"><label for="vreinterpretq_f32_s32"><div>float32x4_t <b><b>vreinterpretq_f32_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_s32" type="checkbox"><label for="vreinterpretq_u8_s32"><div>uint8x16_t <b><b>vreinterpretq_u8_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_s32" type="checkbox"><label for="vreinterpretq_u16_s32"><div>uint16x8_t <b><b>vreinterpretq_u16_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_s32" type="checkbox"><label for="vreinterpretq_u32_s32"><div>uint32x4_t <b><b>vreinterpretq_u32_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_s32" type="checkbox"><label for="vreinterpretq_p8_s32"><div>poly8x16_t <b><b>vreinterpretq_p8_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_s32" type="checkbox"><label for="vreinterpretq_p16_s32"><div>poly16x8_t <b><b>vreinterpretq_p16_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_s32" type="checkbox"><label for="vreinterpretq_u64_s32"><div>uint64x2_t <b><b>vreinterpretq_u64_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_s32" type="checkbox"><label for="vreinterpretq_s64_s32"><div>int64x2_t <b><b>vreinterpretq_s64_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_s32" type="checkbox"><label for="vreinterpretq_f64_s32"><div>float64x2_t <b><b>vreinterpretq_f64_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_s32" type="checkbox"><label for="vreinterpretq_p64_s32"><div>poly64x2_t <b><b>vreinterpretq_p64_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_s32" type="checkbox"><label for="vreinterpretq_p128_s32"><div>poly128_t <b><b>vreinterpretq_p128_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_s32" type="checkbox"><label for="vreinterpretq_f16_s32"><div>float16x8_t <b><b>vreinterpretq_f16_s32</b></b> (int32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_f32" type="checkbox"><label for="vreinterpretq_s8_f32"><div>int8x16_t <b><b>vreinterpretq_s8_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_f32" type="checkbox"><label for="vreinterpretq_s16_f32"><div>int16x8_t <b><b>vreinterpretq_s16_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_f32" type="checkbox"><label for="vreinterpretq_s32_f32"><div>int32x4_t <b><b>vreinterpretq_s32_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_f32" type="checkbox"><label for="vreinterpretq_u8_f32"><div>uint8x16_t <b><b>vreinterpretq_u8_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_f32" type="checkbox"><label for="vreinterpretq_u16_f32"><div>uint16x8_t <b><b>vreinterpretq_u16_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_f32" type="checkbox"><label for="vreinterpretq_u32_f32"><div>uint32x4_t <b><b>vreinterpretq_u32_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_f32" type="checkbox"><label for="vreinterpretq_p8_f32"><div>poly8x16_t <b><b>vreinterpretq_p8_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_f32" type="checkbox"><label for="vreinterpretq_p16_f32"><div>poly16x8_t <b><b>vreinterpretq_p16_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_f32" type="checkbox"><label for="vreinterpretq_u64_f32"><div>uint64x2_t <b><b>vreinterpretq_u64_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_f32" type="checkbox"><label for="vreinterpretq_s64_f32"><div>int64x2_t <b><b>vreinterpretq_s64_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_f32" type="checkbox"><label for="vreinterpretq_f64_f32"><div>float64x2_t <b><b>vreinterpretq_f64_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_f32" type="checkbox"><label for="vreinterpretq_p64_f32"><div>poly64x2_t <b><b>vreinterpretq_p64_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_f32" type="checkbox"><label for="vreinterpretq_p128_f32"><div>poly128_t <b><b>vreinterpretq_p128_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_f64" type="checkbox"><label for="vreinterpretq_p64_f64"><div>poly64x2_t <b><b>vreinterpretq_p64_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_f64" type="checkbox"><label for="vreinterpretq_p128_f64"><div>poly128_t <b><b>vreinterpretq_p128_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_f32" type="checkbox"><label for="vreinterpretq_f16_f32"><div>float16x8_t <b><b>vreinterpretq_f16_f32</b></b> (float32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_u8" type="checkbox"><label for="vreinterpretq_s8_u8"><div>int8x16_t <b><b>vreinterpretq_s8_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_u8" type="checkbox"><label for="vreinterpretq_s16_u8"><div>int16x8_t <b><b>vreinterpretq_s16_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_u8" type="checkbox"><label for="vreinterpretq_s32_u8"><div>int32x4_t <b><b>vreinterpretq_s32_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_u8" type="checkbox"><label for="vreinterpretq_f32_u8"><div>float32x4_t <b><b>vreinterpretq_f32_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_u8" type="checkbox"><label for="vreinterpretq_u16_u8"><div>uint16x8_t <b><b>vreinterpretq_u16_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_u8" type="checkbox"><label for="vreinterpretq_u32_u8"><div>uint32x4_t <b><b>vreinterpretq_u32_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_u8" type="checkbox"><label for="vreinterpretq_p8_u8"><div>poly8x16_t <b><b>vreinterpretq_p8_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_u8" type="checkbox"><label for="vreinterpretq_p16_u8"><div>poly16x8_t <b><b>vreinterpretq_p16_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_u8" type="checkbox"><label for="vreinterpretq_u64_u8"><div>uint64x2_t <b><b>vreinterpretq_u64_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_u8" type="checkbox"><label for="vreinterpretq_s64_u8"><div>int64x2_t <b><b>vreinterpretq_s64_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_u8" type="checkbox"><label for="vreinterpretq_f64_u8"><div>float64x2_t <b><b>vreinterpretq_f64_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_u8" type="checkbox"><label for="vreinterpretq_p64_u8"><div>poly64x2_t <b><b>vreinterpretq_p64_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_u8" type="checkbox"><label for="vreinterpretq_p128_u8"><div>poly128_t <b><b>vreinterpretq_p128_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_u8" type="checkbox"><label for="vreinterpretq_f16_u8"><div>float16x8_t <b><b>vreinterpretq_f16_u8</b></b> (uint8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_u16" type="checkbox"><label for="vreinterpretq_s8_u16"><div>int8x16_t <b><b>vreinterpretq_s8_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_u16" type="checkbox"><label for="vreinterpretq_s16_u16"><div>int16x8_t <b><b>vreinterpretq_s16_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_u16" type="checkbox"><label for="vreinterpretq_s32_u16"><div>int32x4_t <b><b>vreinterpretq_s32_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_u16" type="checkbox"><label for="vreinterpretq_f32_u16"><div>float32x4_t <b><b>vreinterpretq_f32_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_u16" type="checkbox"><label for="vreinterpretq_u8_u16"><div>uint8x16_t <b><b>vreinterpretq_u8_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_u16" type="checkbox"><label for="vreinterpretq_u32_u16"><div>uint32x4_t <b><b>vreinterpretq_u32_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_u16" type="checkbox"><label for="vreinterpretq_p8_u16"><div>poly8x16_t <b><b>vreinterpretq_p8_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_u16" type="checkbox"><label for="vreinterpretq_p16_u16"><div>poly16x8_t <b><b>vreinterpretq_p16_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_u16" type="checkbox"><label for="vreinterpretq_u64_u16"><div>uint64x2_t <b><b>vreinterpretq_u64_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_u16" type="checkbox"><label for="vreinterpretq_s64_u16"><div>int64x2_t <b><b>vreinterpretq_s64_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_u16" type="checkbox"><label for="vreinterpretq_f64_u16"><div>float64x2_t <b><b>vreinterpretq_f64_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_u16" type="checkbox"><label for="vreinterpretq_p64_u16"><div>poly64x2_t <b><b>vreinterpretq_p64_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_u16" type="checkbox"><label for="vreinterpretq_p128_u16"><div>poly128_t <b><b>vreinterpretq_p128_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_u16" type="checkbox"><label for="vreinterpretq_f16_u16"><div>float16x8_t <b><b>vreinterpretq_f16_u16</b></b> (uint16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_u32" type="checkbox"><label for="vreinterpretq_s8_u32"><div>int8x16_t <b><b>vreinterpretq_s8_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_u32" type="checkbox"><label for="vreinterpretq_s16_u32"><div>int16x8_t <b><b>vreinterpretq_s16_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_u32" type="checkbox"><label for="vreinterpretq_s32_u32"><div>int32x4_t <b><b>vreinterpretq_s32_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_u32" type="checkbox"><label for="vreinterpretq_f32_u32"><div>float32x4_t <b><b>vreinterpretq_f32_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_u32" type="checkbox"><label for="vreinterpretq_u8_u32"><div>uint8x16_t <b><b>vreinterpretq_u8_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_u32" type="checkbox"><label for="vreinterpretq_u16_u32"><div>uint16x8_t <b><b>vreinterpretq_u16_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_u32" type="checkbox"><label for="vreinterpretq_p8_u32"><div>poly8x16_t <b><b>vreinterpretq_p8_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_u32" type="checkbox"><label for="vreinterpretq_p16_u32"><div>poly16x8_t <b><b>vreinterpretq_p16_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_u32" type="checkbox"><label for="vreinterpretq_u64_u32"><div>uint64x2_t <b><b>vreinterpretq_u64_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_u32" type="checkbox"><label for="vreinterpretq_s64_u32"><div>int64x2_t <b><b>vreinterpretq_s64_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_u32" type="checkbox"><label for="vreinterpretq_f64_u32"><div>float64x2_t <b><b>vreinterpretq_f64_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_u32" type="checkbox"><label for="vreinterpretq_p64_u32"><div>poly64x2_t <b><b>vreinterpretq_p64_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_u32" type="checkbox"><label for="vreinterpretq_p128_u32"><div>poly128_t <b><b>vreinterpretq_p128_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_u32" type="checkbox"><label for="vreinterpretq_f16_u32"><div>float16x8_t <b><b>vreinterpretq_f16_u32</b></b> (uint32x4_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.4S </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_p8" type="checkbox"><label for="vreinterpretq_s8_p8"><div>int8x16_t <b><b>vreinterpretq_s8_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_p8" type="checkbox"><label for="vreinterpretq_s16_p8"><div>int16x8_t <b><b>vreinterpretq_s16_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_p8" type="checkbox"><label for="vreinterpretq_s32_p8"><div>int32x4_t <b><b>vreinterpretq_s32_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_p8" type="checkbox"><label for="vreinterpretq_f32_p8"><div>float32x4_t <b><b>vreinterpretq_f32_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_p8" type="checkbox"><label for="vreinterpretq_u8_p8"><div>uint8x16_t <b><b>vreinterpretq_u8_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_p8" type="checkbox"><label for="vreinterpretq_u16_p8"><div>uint16x8_t <b><b>vreinterpretq_u16_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_p8" type="checkbox"><label for="vreinterpretq_u32_p8"><div>uint32x4_t <b><b>vreinterpretq_u32_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_p8" type="checkbox"><label for="vreinterpretq_p16_p8"><div>poly16x8_t <b><b>vreinterpretq_p16_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_p8" type="checkbox"><label for="vreinterpretq_u64_p8"><div>uint64x2_t <b><b>vreinterpretq_u64_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_p8" type="checkbox"><label for="vreinterpretq_s64_p8"><div>int64x2_t <b><b>vreinterpretq_s64_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_p8" type="checkbox"><label for="vreinterpretq_f64_p8"><div>float64x2_t <b><b>vreinterpretq_f64_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_p8" type="checkbox"><label for="vreinterpretq_p64_p8"><div>poly64x2_t <b><b>vreinterpretq_p64_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_p8" type="checkbox"><label for="vreinterpretq_p128_p8"><div>poly128_t <b><b>vreinterpretq_p128_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_p8" type="checkbox"><label for="vreinterpretq_f16_p8"><div>float16x8_t <b><b>vreinterpretq_f16_p8</b></b> (poly8x16_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.16B </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_p16" type="checkbox"><label for="vreinterpretq_s8_p16"><div>int8x16_t <b><b>vreinterpretq_s8_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_p16" type="checkbox"><label for="vreinterpretq_s16_p16"><div>int16x8_t <b><b>vreinterpretq_s16_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_p16" type="checkbox"><label for="vreinterpretq_s32_p16"><div>int32x4_t <b><b>vreinterpretq_s32_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_p16" type="checkbox"><label for="vreinterpretq_f32_p16"><div>float32x4_t <b><b>vreinterpretq_f32_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_p16" type="checkbox"><label for="vreinterpretq_u8_p16"><div>uint8x16_t <b><b>vreinterpretq_u8_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_p16" type="checkbox"><label for="vreinterpretq_u16_p16"><div>uint16x8_t <b><b>vreinterpretq_u16_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_p16" type="checkbox"><label for="vreinterpretq_u32_p16"><div>uint32x4_t <b><b>vreinterpretq_u32_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_p16" type="checkbox"><label for="vreinterpretq_p8_p16"><div>poly8x16_t <b><b>vreinterpretq_p8_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_p16" type="checkbox"><label for="vreinterpretq_u64_p16"><div>uint64x2_t <b><b>vreinterpretq_u64_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_p16" type="checkbox"><label for="vreinterpretq_s64_p16"><div>int64x2_t <b><b>vreinterpretq_s64_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_p16" type="checkbox"><label for="vreinterpretq_f64_p16"><div>float64x2_t <b><b>vreinterpretq_f64_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_p16" type="checkbox"><label for="vreinterpretq_p64_p16"><div>poly64x2_t <b><b>vreinterpretq_p64_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_p16" type="checkbox"><label for="vreinterpretq_p128_p16"><div>poly128_t <b><b>vreinterpretq_p128_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_p16" type="checkbox"><label for="vreinterpretq_f16_p16"><div>float16x8_t <b><b>vreinterpretq_f16_p16</b></b> (poly16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_u64" type="checkbox"><label for="vreinterpretq_s8_u64"><div>int8x16_t <b><b>vreinterpretq_s8_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_u64" type="checkbox"><label for="vreinterpretq_s16_u64"><div>int16x8_t <b><b>vreinterpretq_s16_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_u64" type="checkbox"><label for="vreinterpretq_s32_u64"><div>int32x4_t <b><b>vreinterpretq_s32_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_u64" type="checkbox"><label for="vreinterpretq_f32_u64"><div>float32x4_t <b><b>vreinterpretq_f32_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_u64" type="checkbox"><label for="vreinterpretq_u8_u64"><div>uint8x16_t <b><b>vreinterpretq_u8_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_u64" type="checkbox"><label for="vreinterpretq_u16_u64"><div>uint16x8_t <b><b>vreinterpretq_u16_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_u64" type="checkbox"><label for="vreinterpretq_u32_u64"><div>uint32x4_t <b><b>vreinterpretq_u32_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_u64" type="checkbox"><label for="vreinterpretq_p8_u64"><div>poly8x16_t <b><b>vreinterpretq_p8_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_u64" type="checkbox"><label for="vreinterpretq_p16_u64"><div>poly16x8_t <b><b>vreinterpretq_p16_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_u64" type="checkbox"><label for="vreinterpretq_s64_u64"><div>int64x2_t <b><b>vreinterpretq_s64_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_u64" type="checkbox"><label for="vreinterpretq_f64_u64"><div>float64x2_t <b><b>vreinterpretq_f64_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_s64" type="checkbox"><label for="vreinterpretq_f64_s64"><div>float64x2_t <b><b>vreinterpretq_f64_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_s64" type="checkbox"><label for="vreinterpretq_p64_s64"><div>poly64x2_t <b><b>vreinterpretq_p64_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_s64" type="checkbox"><label for="vreinterpretq_p128_s64"><div>poly128_t <b><b>vreinterpretq_p128_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_u64" type="checkbox"><label for="vreinterpretq_p64_u64"><div>poly64x2_t <b><b>vreinterpretq_p64_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_u64" type="checkbox"><label for="vreinterpretq_p128_u64"><div>poly128_t <b><b>vreinterpretq_p128_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_u64" type="checkbox"><label for="vreinterpretq_f16_u64"><div>float16x8_t <b><b>vreinterpretq_f16_u64</b></b> (uint64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_s64" type="checkbox"><label for="vreinterpretq_s8_s64"><div>int8x16_t <b><b>vreinterpretq_s8_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_s64" type="checkbox"><label for="vreinterpretq_s16_s64"><div>int16x8_t <b><b>vreinterpretq_s16_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_s64" type="checkbox"><label for="vreinterpretq_s32_s64"><div>int32x4_t <b><b>vreinterpretq_s32_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_s64" type="checkbox"><label for="vreinterpretq_f32_s64"><div>float32x4_t <b><b>vreinterpretq_f32_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_s64" type="checkbox"><label for="vreinterpretq_u8_s64"><div>uint8x16_t <b><b>vreinterpretq_u8_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_s64" type="checkbox"><label for="vreinterpretq_u16_s64"><div>uint16x8_t <b><b>vreinterpretq_u16_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_s64" type="checkbox"><label for="vreinterpretq_u32_s64"><div>uint32x4_t <b><b>vreinterpretq_u32_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_s64" type="checkbox"><label for="vreinterpretq_p8_s64"><div>poly8x16_t <b><b>vreinterpretq_p8_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_s64" type="checkbox"><label for="vreinterpretq_p16_s64"><div>poly16x8_t <b><b>vreinterpretq_p16_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_s64" type="checkbox"><label for="vreinterpretq_u64_s64"><div>uint64x2_t <b><b>vreinterpretq_u64_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_s64" type="checkbox"><label for="vreinterpretq_f16_s64"><div>float16x8_t <b><b>vreinterpretq_f16_s64</b></b> (int64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_f16" type="checkbox"><label for="vreinterpretq_s8_f16"><div>int8x16_t <b><b>vreinterpretq_s8_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_f16" type="checkbox"><label for="vreinterpretq_s16_f16"><div>int16x8_t <b><b>vreinterpretq_s16_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_f16" type="checkbox"><label for="vreinterpretq_s32_f16"><div>int32x4_t <b><b>vreinterpretq_s32_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_f16" type="checkbox"><label for="vreinterpretq_f32_f16"><div>float32x4_t <b><b>vreinterpretq_f32_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_f16" type="checkbox"><label for="vreinterpretq_u8_f16"><div>uint8x16_t <b><b>vreinterpretq_u8_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_f16" type="checkbox"><label for="vreinterpretq_u16_f16"><div>uint16x8_t <b><b>vreinterpretq_u16_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_f16" type="checkbox"><label for="vreinterpretq_u32_f16"><div>uint32x4_t <b><b>vreinterpretq_u32_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_f16" type="checkbox"><label for="vreinterpretq_p8_f16"><div>poly8x16_t <b><b>vreinterpretq_p8_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_f16" type="checkbox"><label for="vreinterpretq_p16_f16"><div>poly16x8_t <b><b>vreinterpretq_p16_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_f16" type="checkbox"><label for="vreinterpretq_u64_f16"><div>uint64x2_t <b><b>vreinterpretq_u64_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_f16" type="checkbox"><label for="vreinterpretq_s64_f16"><div>int64x2_t <b><b>vreinterpretq_s64_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>v7/A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_f16" type="checkbox"><label for="vreinterpretq_f64_f16"><div>float64x2_t <b><b>vreinterpretq_f64_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p64_f16" type="checkbox"><label for="vreinterpretq_p64_f16"><div>poly64x2_t <b><b>vreinterpretq_p64_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p128_f16" type="checkbox"><label for="vreinterpretq_p128_f16"><div>poly128_t <b><b>vreinterpretq_p128_f16</b></b> (float16x8_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.8H </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_f64" type="checkbox"><label for="vreinterpret_s8_f64"><div>int8x8_t <b><b>vreinterpret_s8_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_f64" type="checkbox"><label for="vreinterpret_s16_f64"><div>int16x4_t <b><b>vreinterpret_s16_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_f64" type="checkbox"><label for="vreinterpret_s32_f64"><div>int32x2_t <b><b>vreinterpret_s32_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_f64" type="checkbox"><label for="vreinterpret_u8_f64"><div>uint8x8_t <b><b>vreinterpret_u8_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_f64" type="checkbox"><label for="vreinterpret_u16_f64"><div>uint16x4_t <b><b>vreinterpret_u16_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_f64" type="checkbox"><label for="vreinterpret_u32_f64"><div>uint32x2_t <b><b>vreinterpret_u32_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_f64" type="checkbox"><label for="vreinterpret_p8_f64"><div>poly8x8_t <b><b>vreinterpret_p8_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_f64" type="checkbox"><label for="vreinterpret_p16_f64"><div>poly16x4_t <b><b>vreinterpret_p16_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_f64" type="checkbox"><label for="vreinterpret_u64_f64"><div>uint64x1_t <b><b>vreinterpret_u64_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_f64" type="checkbox"><label for="vreinterpret_s64_f64"><div>int64x1_t <b><b>vreinterpret_s64_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_f64" type="checkbox"><label for="vreinterpret_f16_f64"><div>float16x4_t <b><b>vreinterpret_f16_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f32_f64" type="checkbox"><label for="vreinterpret_f32_f64"><div>float32x2_t <b><b>vreinterpret_f32_f64</b></b> (float64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_f64" type="checkbox"><label for="vreinterpretq_s8_f64"><div>int8x16_t <b><b>vreinterpretq_s8_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_f64" type="checkbox"><label for="vreinterpretq_s16_f64"><div>int16x8_t <b><b>vreinterpretq_s16_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_f64" type="checkbox"><label for="vreinterpretq_s32_f64"><div>int32x4_t <b><b>vreinterpretq_s32_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_f64" type="checkbox"><label for="vreinterpretq_u8_f64"><div>uint8x16_t <b><b>vreinterpretq_u8_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_f64" type="checkbox"><label for="vreinterpretq_u16_f64"><div>uint16x8_t <b><b>vreinterpretq_u16_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_f64" type="checkbox"><label for="vreinterpretq_u32_f64"><div>uint32x4_t <b><b>vreinterpretq_u32_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_f64" type="checkbox"><label for="vreinterpretq_p8_f64"><div>poly8x16_t <b><b>vreinterpretq_p8_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_f64" type="checkbox"><label for="vreinterpretq_p16_f64"><div>poly16x8_t <b><b>vreinterpretq_p16_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_f64" type="checkbox"><label for="vreinterpretq_u64_f64"><div>uint64x2_t <b><b>vreinterpretq_u64_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_f64" type="checkbox"><label for="vreinterpretq_s64_f64"><div>int64x2_t <b><b>vreinterpretq_s64_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_f64" type="checkbox"><label for="vreinterpretq_f16_f64"><div>float16x8_t <b><b>vreinterpretq_f16_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f32_f64" type="checkbox"><label for="vreinterpretq_f32_f64"><div>float32x4_t <b><b>vreinterpretq_f32_f64</b></b> (float64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s8_p64" type="checkbox"><label for="vreinterpret_s8_p64"><div>int8x8_t <b><b>vreinterpret_s8_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s16_p64" type="checkbox"><label for="vreinterpret_s16_p64"><div>int16x4_t <b><b>vreinterpret_s16_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s32_p64" type="checkbox"><label for="vreinterpret_s32_p64"><div>int32x2_t <b><b>vreinterpret_s32_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u8_p64" type="checkbox"><label for="vreinterpret_u8_p64"><div>uint8x8_t <b><b>vreinterpret_u8_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u16_p64" type="checkbox"><label for="vreinterpret_u16_p64"><div>uint16x4_t <b><b>vreinterpret_u16_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u32_p64" type="checkbox"><label for="vreinterpret_u32_p64"><div>uint32x2_t <b><b>vreinterpret_u32_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.2S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p8_p64" type="checkbox"><label for="vreinterpret_p8_p64"><div>poly8x8_t <b><b>vreinterpret_p8_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.8B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_p16_p64" type="checkbox"><label for="vreinterpret_p16_p64"><div>poly16x4_t <b><b>vreinterpret_p16_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_u64_p64" type="checkbox"><label for="vreinterpret_u64_p64"><div>uint64x1_t <b><b>vreinterpret_u64_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_s64_p64" type="checkbox"><label for="vreinterpret_s64_p64"><div>int64x1_t <b><b>vreinterpret_s64_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f64_p64" type="checkbox"><label for="vreinterpret_f64_p64"><div>float64x1_t <b><b>vreinterpret_f64_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.1D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpret_f16_p64" type="checkbox"><label for="vreinterpret_f16_p64"><div>float16x4_t <b><b>vreinterpret_f16_p64</b></b> (poly64x1_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1D </pre> <h4>Results</h4> <pre>Vd.4H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_p64" type="checkbox"><label for="vreinterpretq_s8_p64"><div>int8x16_t <b><b>vreinterpretq_s8_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_p64" type="checkbox"><label for="vreinterpretq_s16_p64"><div>int16x8_t <b><b>vreinterpretq_s16_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_p64" type="checkbox"><label for="vreinterpretq_s32_p64"><div>int32x4_t <b><b>vreinterpretq_s32_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_p64" type="checkbox"><label for="vreinterpretq_u8_p64"><div>uint8x16_t <b><b>vreinterpretq_u8_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_p64" type="checkbox"><label for="vreinterpretq_u16_p64"><div>uint16x8_t <b><b>vreinterpretq_u16_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_p64" type="checkbox"><label for="vreinterpretq_u32_p64"><div>uint32x4_t <b><b>vreinterpretq_u32_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_p64" type="checkbox"><label for="vreinterpretq_p8_p64"><div>poly8x16_t <b><b>vreinterpretq_p8_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_p64" type="checkbox"><label for="vreinterpretq_p16_p64"><div>poly16x8_t <b><b>vreinterpretq_p16_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_p64" type="checkbox"><label for="vreinterpretq_u64_p64"><div>uint64x2_t <b><b>vreinterpretq_u64_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_p64" type="checkbox"><label for="vreinterpretq_s64_p64"><div>int64x2_t <b><b>vreinterpretq_s64_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_p64" type="checkbox"><label for="vreinterpretq_f64_p64"><div>float64x2_t <b><b>vreinterpretq_f64_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_p64" type="checkbox"><label for="vreinterpretq_f16_p64"><div>float16x8_t <b><b>vreinterpretq_f16_p64</b></b> (poly64x2_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.2D </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s8_p128" type="checkbox"><label for="vreinterpretq_s8_p128"><div>int8x16_t <b><b>vreinterpretq_s8_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s16_p128" type="checkbox"><label for="vreinterpretq_s16_p128"><div>int16x8_t <b><b>vreinterpretq_s16_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s32_p128" type="checkbox"><label for="vreinterpretq_s32_p128"><div>int32x4_t <b><b>vreinterpretq_s32_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u8_p128" type="checkbox"><label for="vreinterpretq_u8_p128"><div>uint8x16_t <b><b>vreinterpretq_u8_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u16_p128" type="checkbox"><label for="vreinterpretq_u16_p128"><div>uint16x8_t <b><b>vreinterpretq_u16_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u32_p128" type="checkbox"><label for="vreinterpretq_u32_p128"><div>uint32x4_t <b><b>vreinterpretq_u32_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p8_p128" type="checkbox"><label for="vreinterpretq_p8_p128"><div>poly8x16_t <b><b>vreinterpretq_p8_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_p16_p128" type="checkbox"><label for="vreinterpretq_p16_p128"><div>poly16x8_t <b><b>vreinterpretq_p16_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_u64_p128" type="checkbox"><label for="vreinterpretq_u64_p128"><div>uint64x2_t <b><b>vreinterpretq_u64_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_s64_p128" type="checkbox"><label for="vreinterpretq_s64_p128"><div>int64x2_t <b><b>vreinterpretq_s64_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f64_p128" type="checkbox"><label for="vreinterpretq_f64_p128"><div>float64x2_t <b><b>vreinterpretq_f64_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.2D &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A64</p> </article> </div><div class="intrinsic"><input id="vreinterpretq_f16_p128" type="checkbox"><label for="vreinterpretq_f16_p128"><div>float16x8_t <b><b>vreinterpretq_f16_p128</b></b> (poly128_t a)<span class="right">Vector reinterpret cast operation</span></div></label><article> <h4>Description</h4><p></p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/nop-no-operation">NOP</a>
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vd.1Q </pre> <h4>Results</h4> <pre>Vd.8H &rarr; result
+</pre> <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vldrq_p128" type="checkbox"><label for="vldrq_p128"><div>poly128_t <b><b>vldrq_p128</b></b> (poly128_t const * ptr)<span class="right">Load SIMD&amp;FP register</span></div></label><article> <h4>Description</h4><p><p class="aml">Load SIMD&amp;FP Register (register offset). This instruction loads a SIMD&amp;FP register from memory. The address that is used for the load is calculated from a base register value and an offset register value. The offset can be optionally shifted and extended.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/ldr-register-simdfp-load-simdfp-register-register-offset">LDR</a> Qd,[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn </pre> <h4>Results</h4> <pre>Qd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(64) offset = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.ExtendReg.3" title="function: bits(N) ExtendReg(integer reg, ExtendType type, integer shift)">ExtendReg</a>(m, extend_type, shift);
+if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ boolean is_load_store = memop IN {<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_STORE" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_STORE</a>, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a>};
+ SetNotTagCheckedInstruction(is_load_store &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(64) address;
+bits(datasize) data;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+address = address + offset;
+
+case memop of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_STORE" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_STORE</a>
+ data = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address, datasize DIV 8, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = data;
+
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a>
+ data = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address, datasize DIV 8, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = data;</pre>
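+ <h4>Example</h4> <p>A minimal, non-normative C sketch of typical usage (not part of the Arm reference). It assumes the &lt;arm_neon.h&gt; header and a toolchain targeting AArch64 with the crypto extension enabled (for example a flag such as -march=armv8-a+crypto); the helper name is illustrative.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Load one 128-bit polynomial value from memory. */
+poly128_t load_p128(poly128_t const *ptr) {
+    return vldrq_p128(ptr);
+}</pre>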
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vstrq_p128" type="checkbox"><label for="vstrq_p128"><div>void <b><b>vstrq_p128</b></b> (poly128_t * ptr, poly128_t val)<span class="right">Store SIMD&amp;FP register</span></div></label><article> <h4>Description</h4><p><p class="aml">Store SIMD&amp;FP register (register offset). This instruction stores a single SIMD&amp;FP register to memory. The address that is used for the store is calculated from a base register value and an offset register value. The offset can be optionally shifted and extended.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/str-register-simdfp-store-simdfp-register-register-offset">STR</a> Qt,[Xn]
+</pre> <h4>Argument Preparation</h4><pre>ptr &rarr; Xn <br />
+val &rarr; Qt </pre> <h4>Results</h4> <pre>void &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(64) offset = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#impl-aarch64.ExtendReg.3" title="function: bits(N) ExtendReg(integer reg, ExtendType type, integer shift)">ExtendReg</a>(m, extend_type, shift);
+if <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.HaveMTEExt.0" title="function: boolean HaveMTEExt()">HaveMTEExt</a>() then
+ boolean is_load_store = memop IN {<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_STORE" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_STORE</a>, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a>};
+ SetNotTagCheckedInstruction(is_load_store &amp;&amp; n == 31);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(64) address;
+bits(datasize) data;
+
+if n == 31 then
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.CheckSPAlignment.0" title="function: CheckSPAlignment()">CheckSPAlignment</a>();
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.SP.read.0" title="accessor: bits(width) SP[]">SP</a>[];
+else
+ address = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n];
+
+address = address + offset;
+
+case memop of
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_STORE" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_STORE</a>
+ data = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[t];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.write.3" title="accessor: Mem[bits(64) address, integer size, AccType acctype] = bits(size*8) value">Mem</a>[address, datasize DIV 8, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>] = data;
+
+ when <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-instrs-pseudocode#MemOp_LOAD" title="enumeration MemOp {MemOp_LOAD, MemOp_STORE, MemOp_PREFETCH}">MemOp_LOAD</a>
+ data = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Mem.read.3" title="accessor: bits(size*8) Mem[bits(64) address, integer size, AccType acctype]">Mem</a>[address, datasize DIV 8, <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#AccType_VEC" title="enumeration AccType {AccType_NORMAL, AccType_VEC, AccType_STREAM, AccType_VECSTREAM, AccType_ATOMIC, AccType_ATOMICRW, AccType_ORDERED, AccType_ORDEREDRW, AccType_ORDEREDATOMIC, AccType_ORDEREDATOMICRW,
+ AccType_LIMITEDORDERED, AccType_UNPRIV, AccType_IFETCH, AccType_PTW, AccType_NONFAULT, AccType_CNOTFIRST, AccType_NV2REGISTER, AccType_DC, AccType_DC_UNPRIV, AccType_IC, AccType_DCZVA, AccType_AT}">AccType_VEC</a>];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[t] = data;</pre>
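+ <h4>Example</h4> <p>A non-normative C sketch with the same header and target-feature assumptions as the example above; it pairs the store with the load intrinsic to copy a value.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Copy one 128-bit polynomial value between buffers. */
+void copy_p128(poly128_t *dst, poly128_t const *src) {
+    vstrq_p128(dst, vldrq_p128(src));
+}</pre>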
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vaeseq_u8" type="checkbox"><label for="vaeseq_u8"><div>uint8x16_t <b><b>vaeseq_u8</b></b> (uint8x16_t data, uint8x16_t key)<span class="right">AES single round encryption</span></div></label><article> <h4>Description</h4><p><p class="aml">AES single round encryption.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/aese-aes-single-round-encryption">AESE</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>data &rarr; Vd.16B <br />
+key &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(128) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(128) result;
+result = operand1 EOR operand2;
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.AESSubBytes.1" title="function: bits(128) AESSubBytes(bits(128) op)">AESSubBytes</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.AESShiftRows.1" title="function: bits(128) AESShiftRows(bits(128) op)">AESShiftRows</a>(result));
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
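+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as the earlier examples) of one non-final AES encryption round, the usual pairing of this intrinsic with vaesmcq_u8.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* One non-final AES encryption round: AESE performs AddRoundKey (the XOR
+   with round_key) followed by SubBytes and ShiftRows; AESMC then applies
+   MixColumns.  The final AES round omits the AESMC step. */
+uint8x16_t aes_enc_round(uint8x16_t state, uint8x16_t round_key) {
+    return vaesmcq_u8(vaeseq_u8(state, round_key));
+}</pre>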
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vaesdq_u8" type="checkbox"><label for="vaesdq_u8"><div>uint8x16_t <b><b>vaesdq_u8</b></b> (uint8x16_t data, uint8x16_t key)<span class="right">AES single round decryption</span></div></label><article> <h4>Description</h4><p><p class="aml">AES single round decryption.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/aesd-aes-single-round-decryption">AESD</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>data &rarr; Vd.16B <br />
+key &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(128) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(128) result;
+result = operand1 EOR operand2;
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.AESInvSubBytes.1" title="function: bits(128) AESInvSubBytes(bits(128) op)">AESInvSubBytes</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.AESInvShiftRows.1" title="function: bits(128) AESInvShiftRows(bits(128) op)">AESInvShiftRows</a>(result));
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
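+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above) of one non-final round of the AES equivalent inverse cipher.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* AESD performs AddRoundKey followed by InvShiftRows and InvSubBytes;
+   AESIMC then applies InvMixColumns.  dec_round_key is assumed to be an
+   encryption round key already passed through vaesimcq_u8 (see below). */
+uint8x16_t aes_dec_round(uint8x16_t state, uint8x16_t dec_round_key) {
+    return vaesimcq_u8(vaesdq_u8(state, dec_round_key));
+}</pre>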
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vaesmcq_u8" type="checkbox"><label for="vaesmcq_u8"><div>uint8x16_t <b><b>vaesmcq_u8</b></b> (uint8x16_t data)<span class="right">AES mix columns</span></div></label><article> <h4>Description</h4><p><p class="aml">AES mix columns.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/aesmc-aes-mix-columns">AESMC</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>data &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(128) result;
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.AESMixColumns.1" title="function: bits(128) AESMixColumns(bits (128) op)">AESMixColumns</a>(operand);
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
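+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above). MixColumns is rarely used on its own; many implementations fuse an adjacent AESE/AESMC pair, so keeping the two calls together, as in the vaeseq_u8 example, is the common pattern.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* MixColumns in isolation, e.g. when recombining AES steps manually. */
+uint8x16_t mix_columns(uint8x16_t state) {
+    return vaesmcq_u8(state);
+}</pre>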
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vaesimcq_u8" type="checkbox"><label for="vaesimcq_u8"><div>uint8x16_t <b><b>vaesimcq_u8</b></b> (uint8x16_t data)<span class="right">AES inverse mix columns</span></div></label><article> <h4>Description</h4><p><p class="aml">AES inverse mix columns.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/aesimc-aes-inverse-mix-columns">AESIMC</a> Vd.16B,Vn.16B
+</pre> <h4>Argument Preparation</h4><pre>data &rarr; Vn.16B </pre> <h4>Results</h4> <pre>Vd.16B &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(128) result;
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.AESInvMixColumns.1" title="function: bits(128) AESInvMixColumns(bits (128) op)">AESInvMixColumns</a>(operand);
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
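+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above) of the usual role of this intrinsic: transforming encryption round keys for the equivalent inverse cipher.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* All round keys except the first and last are passed through
+   InvMixColumns before being used with vaesdq_u8. */
+uint8x16_t make_dec_round_key(uint8x16_t enc_round_key) {
+    return vaesimcq_u8(enc_round_key);
+}</pre>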
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha1cq_u32" type="checkbox"><label for="vsha1cq_u32"><div>uint32x4_t <b><b>vsha1cq_u32</b></b> (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)<span class="right">SHA1 hash update (choose)</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA1 hash update (choose).</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha1c-sha1-hash-update-choose">SHA1C</a> Qd,Sn,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>hash_abcd &rarr; Qd <br />
+hash_e &rarr; Sn <br />
+wk &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Qd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) X = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(32) Y = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n]; // Note: 32 not 128 bits wide
+bits(128) W = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(32) t;
+
+for e = 0 to 3
+ t = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SHAchoose.3" title="function: bits(32) SHAchoose(bits(32) x, bits(32) y, bits(32) z)">SHAchoose</a>(X&lt;63:32&gt;, X&lt;95:64&gt;, X&lt;127:96&gt;);
+ Y = Y + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(X&lt;31:0&gt;, 5) + t + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[W, e, 32];
+ X&lt;63:32&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(X&lt;63:32&gt;, 30);
+ &lt;Y, X&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(Y:X, 32);
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = X;</pre>
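+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above) of four SHA-1 "choose" rounds; the e1_out parameter is an illustrative way to return the e value for the following four rounds.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Four SHA-1 choose rounds (used for rounds 0-19).  abcd holds the working
+   variables a..d, e0 the current e, and wk four schedule words with the
+   round constant already added.  vsha1h_u32 must read abcd before it is
+   overwritten. */
+uint32x4_t sha1_c_rounds(uint32x4_t abcd, uint32_t e0, uint32x4_t wk,
+                         uint32_t *e1_out) {
+    *e1_out = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    return vsha1cq_u32(abcd, e0, wk);
+}</pre>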
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha1pq_u32" type="checkbox"><label for="vsha1pq_u32"><div>uint32x4_t <b><b>vsha1pq_u32</b></b> (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)<span class="right">SHA1 hash update (parity)</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA1 hash update (parity).</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha1p-sha1-hash-update-parity">SHA1P</a> Qd,Sn,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>hash_abcd &rarr; Qd <br />
+hash_e &rarr; Sn <br />
+wk &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Qd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) X = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(32) Y = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n]; // Note: 32 not 128 bits wide
+bits(128) W = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(32) t;
+
+for e = 0 to 3
+ t = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SHAparity.3" title="function: bits(32) SHAparity(bits(32) x, bits(32) y, bits(32) z)">SHAparity</a>(X&lt;63:32&gt;, X&lt;95:64&gt;, X&lt;127:96&gt;);
+ Y = Y + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(X&lt;31:0&gt;, 5) + t + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[W, e, 32];
+ X&lt;63:32&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(X&lt;63:32&gt;, 30);
+ &lt;Y, X&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(Y:X, 32);
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = X;</pre>
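+ <h4>Example</h4> <p>A non-normative C sketch with the same assumptions and shape as the vsha1cq_u32 example above.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Four SHA-1 parity rounds (used for rounds 20-39 and 60-79). */
+uint32x4_t sha1_p_rounds(uint32x4_t abcd, uint32_t e0, uint32x4_t wk,
+                         uint32_t *e1_out) {
+    *e1_out = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    return vsha1pq_u32(abcd, e0, wk);
+}</pre>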
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha1mq_u32" type="checkbox"><label for="vsha1mq_u32"><div>uint32x4_t <b><b>vsha1mq_u32</b></b> (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)<span class="right">SHA1 hash update (majority)</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA1 hash update (majority).</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha1m-sha1-hash-update-majority">SHA1M</a> Qd,Sn,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>hash_abcd &rarr; Qd <br />
+hash_e &rarr; Sn <br />
+wk &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Qd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) X = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(32) Y = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n]; // Note: 32 not 128 bits wide
+bits(128) W = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(32) t;
+
+for e = 0 to 3
+ t = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SHAmajority.3" title="function: bits(32) SHAmajority(bits(32) x, bits(32) y, bits(32) z)">SHAmajority</a>(X&lt;63:32&gt;, X&lt;95:64&gt;, X&lt;127:96&gt;);
+ Y = Y + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(X&lt;31:0&gt;, 5) + t + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[W, e, 32];
+ X&lt;63:32&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(X&lt;63:32&gt;, 30);
+ &lt;Y, X&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(Y:X, 32);
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = X;</pre>
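+ <h4>Example</h4> <p>A non-normative C sketch with the same assumptions and shape as the vsha1cq_u32 example above.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Four SHA-1 majority rounds (used for rounds 40-59). */
+uint32x4_t sha1_m_rounds(uint32x4_t abcd, uint32_t e0, uint32x4_t wk,
+                         uint32_t *e1_out) {
+    *e1_out = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    return vsha1mq_u32(abcd, e0, wk);
+}</pre>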
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha1h_u32" type="checkbox"><label for="vsha1h_u32"><div>uint32_t <b><b>vsha1h_u32</b></b> (uint32_t hash_e)<span class="right">SHA1 fixed rotate</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA1 fixed rotate.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha1h-sha1-fixed-rotate">SHA1H</a> Sd,Sn
+</pre> <h4>Argument Preparation</h4><pre>hash_e &rarr; Sn </pre> <h4>Results</h4> <pre>Sd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(32) operand = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n]; // read element [0] only, [1-3] zeroed
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(operand, 30);</pre>
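+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above) showing how the fixed rotate feeds the SHA-1 round intrinsics.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Rotate the current "a" working variable left by 30 bits; the result is
+   the e input of the next group of four SHA-1 rounds. */
+uint32_t sha1_next_e(uint32x4_t abcd) {
+    return vsha1h_u32(vgetq_lane_u32(abcd, 0));
+}</pre>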
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha1su0q_u32" type="checkbox"><label for="vsha1su0q_u32"><div>uint32x4_t <b><b>vsha1su0q_u32</b></b> (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11)<span class="right">SHA1 schedule update 0</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA1 schedule update 0.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha1su0-sha1-schedule-update-0">SHA1SU0</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>w0_3 &rarr; Vd.4S <br />
+w4_7 &rarr; Vn.4S <br />
+w8_11 &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(128) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(128) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128) result;
+
+result = operand2&lt;63:0&gt;:operand1&lt;127:64&gt;;
+result = result EOR operand1 EOR operand3;
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
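+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above); the result is only a partial update and is normally completed by vsha1su1q_u32, as shown under the next intrinsic.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* First half of a SHA-1 message-schedule update, combining three groups
+   of four earlier message words. */
+uint32x4_t sha1_schedule_part1(uint32x4_t w0_3, uint32x4_t w4_7,
+                               uint32x4_t w8_11) {
+    return vsha1su0q_u32(w0_3, w4_7, w8_11);
+}</pre>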
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha1su1q_u32" type="checkbox"><label for="vsha1su1q_u32"><div>uint32x4_t <b><b>vsha1su1q_u32</b></b> (uint32x4_t tw0_3, uint32x4_t w12_15)<span class="right">SHA1 schedule update 1</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA1 schedule update 1.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha1su1-sha1-schedule-update-1">SHA1SU1</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>tw0_3 &rarr; Vd.4S <br />
+w12_15 &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(128) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(128) result;
+bits(128) T = operand1 EOR <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(operand2, 32);
+result&lt;31:0&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(T&lt;31:0&gt;, 1);
+result&lt;63:32&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(T&lt;63:32&gt;, 1);
+result&lt;95:64&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(T&lt;95:64&gt;, 1);
+result&lt;127:96&gt; = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(T&lt;127:96&gt;, 1) EOR <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROL.2" title="function: bits(N) ROL(bits(N) x, integer shift)">ROL</a>(T&lt;31:0&gt;, 2);
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
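+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above) of the full two-step schedule update.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Derive the next four SHA-1 message words from the previous sixteen. */
+uint32x4_t sha1_next_w(uint32x4_t w0_3, uint32x4_t w4_7,
+                       uint32x4_t w8_11, uint32x4_t w12_15) {
+    return vsha1su1q_u32(vsha1su0q_u32(w0_3, w4_7, w8_11), w12_15);
+}</pre>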
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha256hq_u32" type="checkbox"><label for="vsha256hq_u32"><div>uint32x4_t <b><b>vsha256hq_u32</b></b> (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk)<span class="right">SHA256 hash update (part 1)</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA256 hash update (part 1).</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha256h-sha256-hash-update-part-1">SHA256H</a> Qd,Qn,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>hash_abcd &rarr; Qd <br />
+hash_efgh &rarr; Qn <br />
+wk &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Qd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) result;
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SHA256hash.4" title="function: bits(128) SHA256hash(bits (128) X, bits(128) Y, bits(128) W, boolean part1)">SHA256hash</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d], <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m], TRUE);
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
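+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above). This intrinsic is always paired with vsha256h2q_u32, and the pre-update abcd value must be kept for that second call.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Four SHA-256 rounds.  wk holds four schedule words with the round
+   constants already added (see the next example). */
+void sha256_rounds(uint32x4_t *abcd, uint32x4_t *efgh, uint32x4_t wk) {
+    uint32x4_t abcd_prev = *abcd;
+    *abcd = vsha256hq_u32(*abcd, *efgh, wk);
+    *efgh = vsha256h2q_u32(*efgh, abcd_prev, wk);
+}</pre>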
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha256h2q_u32" type="checkbox"><label for="vsha256h2q_u32"><div>uint32x4_t <b><b>vsha256h2q_u32</b></b> (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk)<span class="right">SHA256 hash update (part 2)</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA256 hash update (part 2).</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha256h2-sha256-hash-update-part-2">SHA256H2</a> Qd,Qn,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>hash_efgh &rarr; Qd <br />
+hash_abcd &rarr; Qn <br />
+wk &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Qd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) result;
+result = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.SHA256hash.4" title="function: bits(128) SHA256hash(bits (128) X, bits(128) Y, bits(128) W, boolean part1)">SHA256hash</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n], <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d], <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m], FALSE);
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
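+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above) of preparing the wk operand that vsha256hq_u32 and vsha256h2q_u32 share; K is assumed to point into the standard SHA-256 round-constant table.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Add four round constants to four message-schedule words. */
+uint32x4_t sha256_wk(uint32x4_t w, const uint32_t K[4]) {
+    return vaddq_u32(w, vld1q_u32(K));
+}</pre>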
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha256su0q_u32" type="checkbox"><label for="vsha256su0q_u32"><div>uint32x4_t <b><b>vsha256su0q_u32</b></b> (uint32x4_t w0_3, uint32x4_t w4_7)<span class="right">SHA256 schedule update 0</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA256 schedule update 0.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha256su0-sha256-schedule-update-0">SHA256SU0</a> Vd.4S,Vn.4S
+</pre> <h4>Argument Preparation</h4><pre>w0_3 &rarr; Vd.4S <br />
+w4_7 &rarr; Vn.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(128) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(128) result;
+bits(128) T = operand2&lt;31:0&gt;:operand1&lt;127:32&gt;;
+bits(32) elt;
+
+for e = 0 to 3
+ elt = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[T, e, 32];
+ elt = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROR.2" title="function: bits(N) ROR(bits(N) x, integer shift)">ROR</a>(elt, 7) EOR <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROR.2" title="function: bits(N) ROR(bits(N) x, integer shift)">ROR</a>(elt, 18) EOR <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(elt, 3);
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 32] = elt + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 32];
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
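+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above); as with SHA-1, this is the first half of a two-step update, completed by vsha256su1q_u32 below.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* First half of a SHA-256 message-schedule update. */
+uint32x4_t sha256_schedule_part1(uint32x4_t w0_3, uint32x4_t w4_7) {
+    return vsha256su0q_u32(w0_3, w4_7);
+}</pre>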
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vsha256su1q_u32" type="checkbox"><label for="vsha256su1q_u32"><div>uint32x4_t <b><b>vsha256su1q_u32</b></b> (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15)<span class="right">SHA256 schedule update 1</span></div></label><article> <h4>Description</h4><p><p class="aml">SHA256 schedule update 1.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/sha256su1-sha256-schedule-update-1">SHA256SU1</a> Vd.4S,Vn.4S,Vm.4S
+</pre> <h4>Argument Preparation</h4><pre>tw0_3 &rarr; Vd.4S <br />
+w8_11 &rarr; Vn.4S <br />
+w12_15 &rarr; Vm.4S </pre> <h4>Results</h4> <pre>Vd.4S &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#AArch64.CheckFPAdvSIMDEnabled.0" title="function: AArch64.CheckFPAdvSIMDEnabled()">AArch64.CheckFPAdvSIMDEnabled</a>();
+
+bits(128) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[d];
+bits(128) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[n];
+bits(128) operand3 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.read.1" title="accessor: bits(width) V[integer n]">V</a>[m];
+bits(128) result;
+bits(128) T0 = operand3&lt;31:0&gt;:operand2&lt;127:32&gt;;
+bits(64) T1;
+bits(32) elt;
+
+T1 = operand3&lt;127:64&gt;;
+for e = 0 to 1
+ elt = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[T1, e, 32];
+ elt = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROR.2" title="function: bits(N) ROR(bits(N) x, integer shift)">ROR</a>(elt, 17) EOR <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROR.2" title="function: bits(N) ROR(bits(N) x, integer shift)">ROR</a>(elt, 19) EOR <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(elt, 10);
+ elt = elt + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 32] + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[T0, e, 32];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 32] = elt;
+
+T1 = result&lt;63:0&gt;;
+for e = 2 to 3
+ elt = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[T1, e-2, 32];
+ elt = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROR.2" title="function: bits(N) ROR(bits(N) x, integer shift)">ROR</a>(elt, 17) EOR <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.ROR.2" title="function: bits(N) ROR(bits(N) x, integer shift)">ROR</a>(elt, 19) EOR <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.LSR.2" title="function: bits(N) LSR(bits(N) x, integer shift)">LSR</a>(elt, 10);
+ elt = elt + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, 32] + <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[T0, e, 32];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 32] = elt;
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
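+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above) of the full schedule update.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Derive the next four SHA-256 message words from the previous sixteen. */
+uint32x4_t sha256_next_w(uint32x4_t w0_3, uint32x4_t w4_7,
+                         uint32x4_t w8_11, uint32x4_t w12_15) {
+    return vsha256su1q_u32(vsha256su0q_u32(w0_3, w4_7), w8_11, w12_15);
+}</pre>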
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_p64" type="checkbox"><label for="vmull_p64"><div>poly128_t <b><b>vmull_p64</b></b> (poly64_t a, poly64_t b)<span class="right">Polynomial multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Polynomial Multiply Long. This instruction multiplies corresponding elements in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/pmull-pmull2-polynomial-multiply-long">PMULL</a> Vd.1Q,Vn.1D,Vm.1D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.1D <br />
+b &rarr; Vm.1D </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
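+ <h4>Example</h4> <p>A non-normative C sketch (same assumptions as above). This carry-less multiply is a building block of GHASH/GCM and of CRC folding; the casts are assumed to match the toolchain's definition of poly64_t.</p>
+<pre class="codeblock">#include &lt;arm_neon.h&gt;
+
+/* Carry-less 64x64-to-128-bit multiply of two scalar values. */
+poly128_t clmul(uint64_t a, uint64_t b) {
+    return vmull_p64((poly64_t)a, (poly64_t)b);
+}</pre>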
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="vmull_high_p64" type="checkbox"><label for="vmull_high_p64"><div>poly128_t <b><b>vmull_high_p64</b></b> (poly64x2_t a, poly64x2_t b)<span class="right">Polynomial multiply long</span></div></label><article> <h4>Description</h4><p><p class="aml">Polynomial Multiply Long. This instruction multiplies corresponding elements in the lower or upper half of the vectors of the two source SIMD&amp;FP registers, places the results in a vector, and writes the vector to the destination SIMD&amp;FP register. The destination vector elements are twice as long as the elements that are multiplied.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/pmull-pmull2-polynomial-multiply-long">PMULL2</a> Vd.1Q,Vn.2D,Vm.2D
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Vn.2D <br />
+b &rarr; Vm.2D </pre> <h4>Results</h4> <pre>Vd.1Q &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock"><a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-exceptions-pseudocode#impl-aarch64.CheckFPAdvSIMDEnabled64.0" title="function: CheckFPAdvSIMDEnabled64()">CheckFPAdvSIMDEnabled64</a>();
+bits(datasize) operand1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[n, part];
+bits(datasize) operand2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.Vpart.read.2" title="accessor: bits(width) Vpart[integer n, integer part]">Vpart</a>[m, part];
+bits(2*datasize) result;
+bits(esize) element1;
+bits(esize) element2;
+
+for e = 0 to elements-1
+ element1 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand1, e, esize];
+ element2 = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.read.3" title="accessor: bits(size) Elem[bits(N) vector, integer e, integer size]">Elem</a>[operand2, e, esize];
+ <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Elem.write.3" title="accessor: Elem[bits(N) &amp;vector, integer e, integer size] = bits(size) value">Elem</a>[result, e, 2*esize] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.PolynomialMult.2" title="function: bits(M+N) PolynomialMult(bits(M) op1, bits(N) op2)">PolynomialMult</a>(element1, element2);
+
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-simd-and-floating-point-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.V.write.1" title="accessor: V[integer n] = bits(width) value">V</a>[d] = result;</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="__crc32b" type="checkbox"><label for="__crc32b"><div>uint32_t <b><b>__crc32b</b></b> (uint32_t a, uint8_t b)<span class="right"><span class="asm-code">CRC32 checksum</span></div></label><article> <h4>Description</h4><p><p class="aml"><span class="asm-code">CRC32</span> checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x04C11DB7 is used for the CRC calculation.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/crc32b-crc32h-crc32w-crc32x-crc32-checksum">CRC32B</a> Wd,Wn,Wm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Wn <br />
+b &rarr; Wm </pre> <h4>Results</h4> <pre>Wd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(32) acc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n]; // accumulator
+bits(size) val = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m]; // input value
+bits(32) poly = 0x04C11DB7&lt;31:0&gt;;
+
+bits(32+size) tempacc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(acc):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(size);
+bits(size+32) tempval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(val):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(32);
+
+// Poly32Mod2 on a bitstring does a polynomial Modulus over {0,1} operation
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Poly32Mod2.2" title="function: bits(32) Poly32Mod2(bits(N) data, bits(32) poly)">Poly32Mod2</a>(tempacc EOR tempval, poly));</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="__crc32h" type="checkbox"><label for="__crc32h"><div>uint32_t <b><b>__crc32h</b></b> (uint32_t a, uint16_t b)<span class="right"><span class="asm-code">CRC32 checksum</span></div></label><article> <h4>Description</h4><p><p class="aml"><span class="asm-code">CRC32</span> checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x04C11DB7 is used for the CRC calculation.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/crc32b-crc32h-crc32w-crc32x-crc32-checksum">CRC32H</a> Wd,Wn,Wm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Wn <br />
+b &rarr; Wm </pre> <h4>Results</h4> <pre>Wd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(32) acc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n]; // accumulator
+bits(size) val = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m]; // input value
+bits(32) poly = 0x04C11DB7&lt;31:0&gt;;
+
+bits(32+size) tempacc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(acc):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(size);
+bits(size+32) tempval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(val):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(32);
+
+// Poly32Mod2 on a bitstring does a polynomial Modulus over {0,1} operation
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Poly32Mod2.2" title="function: bits(32) Poly32Mod2(bits(N) data, bits(32) poly)">Poly32Mod2</a>(tempacc EOR tempval, poly));</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="__crc32w" type="checkbox"><label for="__crc32w"><div>uint32_t <b><b>__crc32w</b></b> (uint32_t a, uint32_t b)<span class="right"><span class="asm-code">CRC32 checksum</span></div></label><article> <h4>Description</h4><p><p class="aml"><span class="asm-code">CRC32</span> checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x04C11DB7 is used for the CRC calculation.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/crc32b-crc32h-crc32w-crc32x-crc32-checksum">CRC32W</a> Wd,Wn,Wm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Wn <br />
+b &rarr; Wm </pre> <h4>Results</h4> <pre>Wd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(32) acc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n]; // accumulator
+bits(size) val = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m]; // input value
+bits(32) poly = 0x04C11DB7&lt;31:0&gt;;
+
+bits(32+size) tempacc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(acc):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(size);
+bits(size+32) tempval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(val):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(32);
+
+// Poly32Mod2 on a bitstring does a polynomial Modulus over {0,1} operation
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Poly32Mod2.2" title="function: bits(32) Poly32Mod2(bits(N) data, bits(32) poly)">Poly32Mod2</a>(tempacc EOR tempval, poly));</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="__crc32d" type="checkbox"><label for="__crc32d"><div>uint32_t <b><b>__crc32d</b></b> (uint32_t a, uint64_t b)<span class="right"><span class="asm-code">CRC32 checksum</span></div></label><article> <h4>Description</h4><p><p class="aml"><span class="asm-code">CRC32</span> checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x04C11DB7 is used for the CRC calculation.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/crc32b-crc32h-crc32w-crc32x-crc32-checksum">CRC32X</a> Wd,Wn,Xm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Wn <br />
+b &rarr; Xm </pre> <h4>Results</h4> <pre>Wd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(32) acc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n]; // accumulator
+bits(size) val = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m]; // input value
+bits(32) poly = 0x04C11DB7&lt;31:0&gt;;
+
+bits(32+size) tempacc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(acc):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(size);
+bits(size+32) tempval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(val):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(32);
+
+// Poly32Mod2 on a bitstring does a polynomial Modulus over {0,1} operation
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Poly32Mod2.2" title="function: bits(32) Poly32Mod2(bits(N) data, bits(32) poly)">Poly32Mod2</a>(tempacc EOR tempval, poly));</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="__crc32cb" type="checkbox"><label for="__crc32cb"><div>uint32_t <b><b>__crc32cb</b></b> (uint32_t a, uint8_t b)<span class="right"><span class="asm-code">CRC32 checksum</span></div></label><article> <h4>Description</h4><p><p class="aml"><span class="asm-code">CRC32</span> checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x1EDC6F41 is used for the CRC calculation.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/crc32cb-crc32ch-crc32cw-crc32cx-crc32c-checksum">CRC32CB</a> Wd,Wn,Wm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Wn <br />
+b &rarr; Wm </pre> <h4>Results</h4> <pre>Wd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(32) acc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n]; // accumulator
+bits(size) val = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m]; // input value
+bits(32) poly = 0x1EDC6F41&lt;31:0&gt;;
+
+bits(32+size) tempacc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(acc):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(size);
+bits(size+32) tempval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(val):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(32);
+
+// Poly32Mod2 on a bitstring does a polynomial Modulus over {0,1} operation
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Poly32Mod2.2" title="function: bits(32) Poly32Mod2(bits(N) data, bits(32) poly)">Poly32Mod2</a>(tempacc EOR tempval, poly));</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="__crc32ch" type="checkbox"><label for="__crc32ch"><div>uint32_t <b><b>__crc32ch</b></b> (uint32_t a, uint16_t b)<span class="right"><span class="asm-code">CRC32 checksum</span></div></label><article> <h4>Description</h4><p><p class="aml"><span class="asm-code">CRC32</span> checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x1EDC6F41 is used for the CRC calculation.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/crc32cb-crc32ch-crc32cw-crc32cx-crc32c-checksum">CRC32CH</a> Wd,Wn,Wm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Wn <br />
+b &rarr; Wm </pre> <h4>Results</h4> <pre>Wd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(32) acc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n]; // accumulator
+bits(size) val = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m]; // input value
+bits(32) poly = 0x1EDC6F41&lt;31:0&gt;;
+
+bits(32+size) tempacc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(acc):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(size);
+bits(size+32) tempval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(val):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(32);
+
+// Poly32Mod2 on a bitstring does a polynomial Modulus over {0,1} operation
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Poly32Mod2.2" title="function: bits(32) Poly32Mod2(bits(N) data, bits(32) poly)">Poly32Mod2</a>(tempacc EOR tempval, poly));</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="__crc32cw" type="checkbox"><label for="__crc32cw"><div>uint32_t <b><b>__crc32cw</b></b> (uint32_t a, uint32_t b)<span class="right"><span class="asm-code">CRC32 checksum</span></div></label><article> <h4>Description</h4><p><p class="aml"><span class="asm-code">CRC32</span> checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x1EDC6F41 is used for the CRC calculation.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/crc32cb-crc32ch-crc32cw-crc32cx-crc32c-checksum">CRC32CW</a> Wd,Wn,Wm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Wn <br />
+b &rarr; Wm </pre> <h4>Results</h4> <pre>Wd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(32) acc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n]; // accumulator
+bits(size) val = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m]; // input value
+bits(32) poly = 0x1EDC6F41&lt;31:0&gt;;
+
+bits(32+size) tempacc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(acc):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(size);
+bits(size+32) tempval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(val):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(32);
+
+// Poly32Mod2 on a bitstring does a polynomial Modulus over {0,1} operation
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Poly32Mod2.2" title="function: bits(32) Poly32Mod2(bits(N) data, bits(32) poly)">Poly32Mod2</a>(tempacc EOR tempval, poly));</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div><div class="intrinsic"><input id="__crc32cd" type="checkbox"><label for="__crc32cd"><div>uint32_t <b><b>__crc32cd</b></b> (uint32_t a, uint64_t b)<span class="right"><span class="asm-code">CRC32 checksum</span></div></label><article> <h4>Description</h4><p><p class="aml"><span class="asm-code">CRC32</span> checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x1EDC6F41 is used for the CRC calculation.</p>
+</p> <h4>A64 Instruction</h4><pre><a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/crc32cb-crc32ch-crc32cw-crc32cx-crc32c-checksum">CRC32CX</a> Wd,Wn,Xm
+</pre> <h4>Argument Preparation</h4><pre>a &rarr; Wn <br />
+b &rarr; Xm </pre> <h4>Results</h4> <pre>Wd &rarr; result
+</pre> <h4>Operation</h4>
+<pre class="codeblock">bits(32) acc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[n]; // accumulator
+bits(size) val = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.read.1" title="accessor: bits(width) X[integer n]">X</a>[m]; // input value
+bits(32) poly = 0x1EDC6F41&lt;31:0&gt;;
+
+bits(32+size) tempacc = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(acc):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(size);
+bits(size+32) tempval = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(val):<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Zeros.1" title="function: bits(N) Zeros(integer N)">Zeros</a>(32);
+
+// Poly32Mod2 on a bitstring does a polynomial Modulus over {0,1} operation
+<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/aarch64-functions-pseudocode#impl-aarch64.X.write.1" title="accessor: X[integer n] = bits(width) value">X</a>[d] = <a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.BitReverse.1" title="function: bits(N) BitReverse(bits(N) data)">BitReverse</a>(<a href="https://developer.arm.com/docs/ddi0596/a/a64-base-instructions-alphabetic-order/../a64-shared-pseudocode-functions/shared-functions-pseudocode#impl-shared.Poly32Mod2.2" title="function: bits(32) Poly32Mod2(bits(N) data, bits(32) poly)">Poly32Mod2</a>(tempacc EOR tempval, poly));</pre>
+ <h4>Supported architectures</h4> <p>A32/A64</p> </article> </div>
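+<p class="aml">In Rust's stdarch these intrinsics are exposed under core::arch::aarch64. A hedged usage sketch, assuming a target where runtime detection of the crc feature succeeds and the intrinsic is available on the toolchain in use (the software fallback is omitted here):</p>
+<pre class="codeblock">#[cfg(target_arch = "aarch64")]
+fn crc32_update(mut acc: u32, data: &amp;[u8]) -&gt; u32 {
+    use std::arch::aarch64::__crc32b;
+    // Real code would fall back to a software CRC when the `crc`
+    // feature is absent; this sketch simply requires it.
+    assert!(std::arch::is_aarch64_feature_detected!("crc"));
+    for &amp;b in data {
+        // Safety: guarded by the runtime feature check above.
+        acc = unsafe { __crc32b(acc, b) };
+    }
+    acc
+}</pre>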
+</section>
+
+ </div>
+ </div>
+</div>
+
+</div>
+<!-- END ProductItemContent -->
+
+</main>
+</body>
+</html>
diff --git a/library/stdarch/crates/stdarch-verify/build.rs b/library/stdarch/crates/stdarch-verify/build.rs
new file mode 100644
index 000000000..c0dc81b6a
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/build.rs
@@ -0,0 +1,32 @@
+use std::path::Path;
+
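+// Tell Cargo to re-run this build script whenever any Rust source in
+// the architecture modules that stdarch-verify parses has changed.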
+fn main() {
+ let dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+ let root = dir.parent().unwrap();
+ eprintln!("root: {}", root.display());
+ walk(&root.join("core_arch/src/x86"));
+ walk(&root.join("core_arch/src/x86_64"));
+ walk(&root.join("core_arch/src/arm"));
+ walk(&root.join("core_arch/src/aarch64"));
+}
+
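+// Recursively visit `root`, registering every `.rs` file with Cargo.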
+fn walk(root: &Path) {
+ for file in root.read_dir().unwrap() {
+ eprintln!("root: {}", root.display());
+ let file = file.unwrap();
+ if file.file_type().unwrap().is_dir() {
+ walk(&file.path());
+ continue;
+ }
+ let path = file.path();
+ if path.extension().and_then(|s| s.to_str()) != Some("rs") {
+ continue;
+ }
+
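+        // Re-run the build script whenever this source file changes.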
+ println!("cargo:rerun-if-changed={}", path.display());
+ }
+}
diff --git a/library/stdarch/crates/stdarch-verify/mips-msa.h b/library/stdarch/crates/stdarch-verify/mips-msa.h
new file mode 100644
index 000000000..881f1918f
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/mips-msa.h
@@ -0,0 +1,711 @@
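+/* GCC MSA builtin signatures, as consumed by stdarch-verify.  Vector
+   types follow GCC's v<lanes><elem> convention (v16i8 = sixteen i8
+   lanes, v2u64 = two u64 lanes, ...); imm0_31, imm_n16_15, etc. mark
+   immediate operands restricted to the given inclusive range. */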
+v16i8 __builtin_msa_add_a_b (v16i8, v16i8);
+v8i16 __builtin_msa_add_a_h (v8i16, v8i16);
+v4i32 __builtin_msa_add_a_w (v4i32, v4i32);
+v2i64 __builtin_msa_add_a_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_adds_a_b (v16i8, v16i8);
+v8i16 __builtin_msa_adds_a_h (v8i16, v8i16);
+v4i32 __builtin_msa_adds_a_w (v4i32, v4i32);
+v2i64 __builtin_msa_adds_a_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_adds_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_adds_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_adds_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_adds_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_adds_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_adds_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_adds_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_adds_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_addv_b (v16i8, v16i8);
+v8i16 __builtin_msa_addv_h (v8i16, v8i16);
+v4i32 __builtin_msa_addv_w (v4i32, v4i32);
+v2i64 __builtin_msa_addv_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_addvi_b (v16i8, imm0_31);
+v8i16 __builtin_msa_addvi_h (v8i16, imm0_31);
+v4i32 __builtin_msa_addvi_w (v4i32, imm0_31);
+v2i64 __builtin_msa_addvi_d (v2i64, imm0_31);
+
+v16u8 __builtin_msa_and_v (v16u8, v16u8);
+
+v16u8 __builtin_msa_andi_b (v16u8, imm0_255);
+
+v16i8 __builtin_msa_asub_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_asub_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_asub_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_asub_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_asub_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_asub_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_asub_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_asub_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_ave_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_ave_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_ave_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_ave_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_ave_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_ave_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_ave_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_ave_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_aver_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_aver_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_aver_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_aver_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_aver_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_aver_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_aver_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_aver_u_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_bclr_b (v16u8, v16u8);
+v8u16 __builtin_msa_bclr_h (v8u16, v8u16);
+v4u32 __builtin_msa_bclr_w (v4u32, v4u32);
+v2u64 __builtin_msa_bclr_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_bclri_b (v16u8, imm0_7);
+v8u16 __builtin_msa_bclri_h (v8u16, imm0_15);
+v4u32 __builtin_msa_bclri_w (v4u32, imm0_31);
+v2u64 __builtin_msa_bclri_d (v2u64, imm0_63);
+
+v16u8 __builtin_msa_binsl_b (v16u8, v16u8, v16u8);
+v8u16 __builtin_msa_binsl_h (v8u16, v8u16, v8u16);
+v4u32 __builtin_msa_binsl_w (v4u32, v4u32, v4u32);
+v2u64 __builtin_msa_binsl_d (v2u64, v2u64, v2u64);
+
+v16u8 __builtin_msa_binsli_b (v16u8, v16u8, imm0_7);
+v8u16 __builtin_msa_binsli_h (v8u16, v8u16, imm0_15);
+v4u32 __builtin_msa_binsli_w (v4u32, v4u32, imm0_31);
+v2u64 __builtin_msa_binsli_d (v2u64, v2u64, imm0_63);
+
+v16u8 __builtin_msa_binsr_b (v16u8, v16u8, v16u8);
+v8u16 __builtin_msa_binsr_h (v8u16, v8u16, v8u16);
+v4u32 __builtin_msa_binsr_w (v4u32, v4u32, v4u32);
+v2u64 __builtin_msa_binsr_d (v2u64, v2u64, v2u64);
+
+v16u8 __builtin_msa_binsri_b (v16u8, v16u8, imm0_7);
+v8u16 __builtin_msa_binsri_h (v8u16, v8u16, imm0_15);
+v4u32 __builtin_msa_binsri_w (v4u32, v4u32, imm0_31);
+v2u64 __builtin_msa_binsri_d (v2u64, v2u64, imm0_63);
+
+v16u8 __builtin_msa_bmnz_v (v16u8, v16u8, v16u8);
+
+v16u8 __builtin_msa_bmnzi_b (v16u8, v16u8, imm0_255);
+
+v16u8 __builtin_msa_bmz_v (v16u8, v16u8, v16u8);
+
+v16u8 __builtin_msa_bmzi_b (v16u8, v16u8, imm0_255);
+
+v16u8 __builtin_msa_bneg_b (v16u8, v16u8);
+v8u16 __builtin_msa_bneg_h (v8u16, v8u16);
+v4u32 __builtin_msa_bneg_w (v4u32, v4u32);
+v2u64 __builtin_msa_bneg_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_bnegi_b (v16u8, imm0_7);
+v8u16 __builtin_msa_bnegi_h (v8u16, imm0_15);
+v4u32 __builtin_msa_bnegi_w (v4u32, imm0_31);
+v2u64 __builtin_msa_bnegi_d (v2u64, imm0_63);
+
+i32 __builtin_msa_bnz_b (v16u8);
+i32 __builtin_msa_bnz_h (v8u16);
+i32 __builtin_msa_bnz_w (v4u32);
+i32 __builtin_msa_bnz_d (v2u64);
+
+i32 __builtin_msa_bnz_v (v16u8);
+
+v16u8 __builtin_msa_bsel_v (v16u8, v16u8, v16u8);
+
+v16u8 __builtin_msa_bseli_b (v16u8, v16u8, imm0_255);
+
+v16u8 __builtin_msa_bset_b (v16u8, v16u8);
+v8u16 __builtin_msa_bset_h (v8u16, v8u16);
+v4u32 __builtin_msa_bset_w (v4u32, v4u32);
+v2u64 __builtin_msa_bset_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_bseti_b (v16u8, imm0_7);
+v8u16 __builtin_msa_bseti_h (v8u16, imm0_15);
+v4u32 __builtin_msa_bseti_w (v4u32, imm0_31);
+v2u64 __builtin_msa_bseti_d (v2u64, imm0_63);
+
+i32 __builtin_msa_bz_b (v16u8);
+i32 __builtin_msa_bz_h (v8u16);
+i32 __builtin_msa_bz_w (v4u32);
+i32 __builtin_msa_bz_d (v2u64);
+
+i32 __builtin_msa_bz_v (v16u8);
+
+v16i8 __builtin_msa_ceq_b (v16i8, v16i8);
+v8i16 __builtin_msa_ceq_h (v8i16, v8i16);
+v4i32 __builtin_msa_ceq_w (v4i32, v4i32);
+v2i64 __builtin_msa_ceq_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_ceqi_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_ceqi_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_ceqi_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_ceqi_d (v2i64, imm_n16_15);
+
+i32 __builtin_msa_cfcmsa (imm0_31);
+
+v16i8 __builtin_msa_cle_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_cle_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_cle_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_cle_s_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_cle_u_b (v16u8, v16u8);
+v8i16 __builtin_msa_cle_u_h (v8u16, v8u16);
+v4i32 __builtin_msa_cle_u_w (v4u32, v4u32);
+v2i64 __builtin_msa_cle_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_clei_s_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_clei_s_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_clei_s_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_clei_s_d (v2i64, imm_n16_15);
+
+v16i8 __builtin_msa_clei_u_b (v16u8, imm0_31);
+v8i16 __builtin_msa_clei_u_h (v8u16, imm0_31);
+v4i32 __builtin_msa_clei_u_w (v4u32, imm0_31);
+v2i64 __builtin_msa_clei_u_d (v2u64, imm0_31);
+
+v16i8 __builtin_msa_clt_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_clt_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_clt_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_clt_s_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_clt_u_b (v16u8, v16u8);
+v8i16 __builtin_msa_clt_u_h (v8u16, v8u16);
+v4i32 __builtin_msa_clt_u_w (v4u32, v4u32);
+v2i64 __builtin_msa_clt_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_clti_s_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_clti_s_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_clti_s_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_clti_s_d (v2i64, imm_n16_15);
+
+v16i8 __builtin_msa_clti_u_b (v16u8, imm0_31);
+v8i16 __builtin_msa_clti_u_h (v8u16, imm0_31);
+v4i32 __builtin_msa_clti_u_w (v4u32, imm0_31);
+v2i64 __builtin_msa_clti_u_d (v2u64, imm0_31);
+
+i32 __builtin_msa_copy_s_b (v16i8, imm0_15);
+i32 __builtin_msa_copy_s_h (v8i16, imm0_7);
+i32 __builtin_msa_copy_s_w (v4i32, imm0_3);
+i64 __builtin_msa_copy_s_d (v2i64, imm0_1);
+
+u32 __builtin_msa_copy_u_b (v16i8, imm0_15);
+u32 __builtin_msa_copy_u_h (v8i16, imm0_7);
+u32 __builtin_msa_copy_u_w (v4i32, imm0_3);
+u64 __builtin_msa_copy_u_d (v2i64, imm0_1);
+
+void __builtin_msa_ctcmsa (imm0_31, i32);
+
+v16i8 __builtin_msa_div_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_div_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_div_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_div_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_div_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_div_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_div_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_div_u_d (v2u64, v2u64);
+
+v8i16 __builtin_msa_dotp_s_h (v16i8, v16i8);
+v4i32 __builtin_msa_dotp_s_w (v8i16, v8i16);
+v2i64 __builtin_msa_dotp_s_d (v4i32, v4i32);
+
+v8u16 __builtin_msa_dotp_u_h (v16u8, v16u8);
+v4u32 __builtin_msa_dotp_u_w (v8u16, v8u16);
+v2u64 __builtin_msa_dotp_u_d (v4u32, v4u32);
+
+v8i16 __builtin_msa_dpadd_s_h (v8i16, v16i8, v16i8);
+v4i32 __builtin_msa_dpadd_s_w (v4i32, v8i16, v8i16);
+v2i64 __builtin_msa_dpadd_s_d (v2i64, v4i32, v4i32);
+
+v8u16 __builtin_msa_dpadd_u_h (v8u16, v16u8, v16u8);
+v4u32 __builtin_msa_dpadd_u_w (v4u32, v8u16, v8u16);
+v2u64 __builtin_msa_dpadd_u_d (v2u64, v4u32, v4u32);
+
+v8i16 __builtin_msa_dpsub_s_h (v8i16, v16i8, v16i8);
+v4i32 __builtin_msa_dpsub_s_w (v4i32, v8i16, v8i16);
+v2i64 __builtin_msa_dpsub_s_d (v2i64, v4i32, v4i32);
+
+v8i16 __builtin_msa_dpsub_u_h (v8i16, v16u8, v16u8);
+v4i32 __builtin_msa_dpsub_u_w (v4i32, v8u16, v8u16);
+v2i64 __builtin_msa_dpsub_u_d (v2i64, v4u32, v4u32);
+
+v4f32 __builtin_msa_fadd_w (v4f32, v4f32);
+v2f64 __builtin_msa_fadd_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcaf_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcaf_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fceq_w (v4f32, v4f32);
+v2i64 __builtin_msa_fceq_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fclass_w (v4f32);
+v2i64 __builtin_msa_fclass_d (v2f64);
+
+v4i32 __builtin_msa_fcle_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcle_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fclt_w (v4f32, v4f32);
+v2i64 __builtin_msa_fclt_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcne_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcne_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcor_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcor_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcueq_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcueq_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcule_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcule_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcult_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcult_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcun_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcun_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcune_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcune_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fdiv_w (v4f32, v4f32);
+v2f64 __builtin_msa_fdiv_d (v2f64, v2f64);
+
+v8i16 __builtin_msa_fexdo_h (v4f32, v4f32);
+v4f32 __builtin_msa_fexdo_w (v2f64, v2f64);
+
+v4f32 __builtin_msa_fexp2_w (v4f32, v4i32);
+v2f64 __builtin_msa_fexp2_d (v2f64, v2i64);
+
+v4f32 __builtin_msa_fexupl_w (v8i16);
+v2f64 __builtin_msa_fexupl_d (v4f32);
+
+v4f32 __builtin_msa_fexupr_w (v8i16);
+v2f64 __builtin_msa_fexupr_d (v4f32);
+
+v4f32 __builtin_msa_ffint_s_w (v4i32);
+v2f64 __builtin_msa_ffint_s_d (v2i64);
+
+v4f32 __builtin_msa_ffint_u_w (v4u32);
+v2f64 __builtin_msa_ffint_u_d (v2u64);
+
+v4f32 __builtin_msa_ffql_w (v8i16);
+v2f64 __builtin_msa_ffql_d (v4i32);
+
+v4f32 __builtin_msa_ffqr_w (v8i16);
+v2f64 __builtin_msa_ffqr_d (v4i32);
+
+v16i8 __builtin_msa_fill_b (i32);
+v8i16 __builtin_msa_fill_h (i32);
+v4i32 __builtin_msa_fill_w (i32);
+v2i64 __builtin_msa_fill_d (i64);
+
+v4f32 __builtin_msa_flog2_w (v4f32);
+v2f64 __builtin_msa_flog2_d (v2f64);
+
+v4f32 __builtin_msa_fmadd_w (v4f32, v4f32, v4f32);
+v2f64 __builtin_msa_fmadd_d (v2f64, v2f64, v2f64);
+
+v4f32 __builtin_msa_fmax_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmax_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fmax_a_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmax_a_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fmin_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmin_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fmin_a_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmin_a_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fmsub_w (v4f32, v4f32, v4f32);
+v2f64 __builtin_msa_fmsub_d (v2f64, v2f64, v2f64);
+
+v4f32 __builtin_msa_fmul_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmul_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_frint_w (v4f32);
+v2f64 __builtin_msa_frint_d (v2f64);
+
+v4f32 __builtin_msa_frcp_w (v4f32);
+v2f64 __builtin_msa_frcp_d (v2f64);
+
+v4f32 __builtin_msa_frsqrt_w (v4f32);
+v2f64 __builtin_msa_frsqrt_d (v2f64);
+
+v4i32 __builtin_msa_fsaf_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsaf_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fseq_w (v4f32, v4f32);
+v2i64 __builtin_msa_fseq_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsle_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsle_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fslt_w (v4f32, v4f32);
+v2i64 __builtin_msa_fslt_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsne_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsne_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsor_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsor_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fsqrt_w (v4f32);
+v2f64 __builtin_msa_fsqrt_d (v2f64);
+
+v4f32 __builtin_msa_fsub_w (v4f32, v4f32);
+v2f64 __builtin_msa_fsub_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsueq_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsueq_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsule_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsule_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsult_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsult_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsun_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsun_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsune_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsune_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_ftint_s_w (v4f32);
+v2i64 __builtin_msa_ftint_s_d (v2f64);
+
+v4u32 __builtin_msa_ftint_u_w (v4f32);
+v2u64 __builtin_msa_ftint_u_d (v2f64);
+
+v8i16 __builtin_msa_ftq_h (v4f32, v4f32);
+v4i32 __builtin_msa_ftq_w (v2f64, v2f64);
+
+v4i32 __builtin_msa_ftrunc_s_w (v4f32);
+v2i64 __builtin_msa_ftrunc_s_d (v2f64);
+
+v4u32 __builtin_msa_ftrunc_u_w (v4f32);
+v2u64 __builtin_msa_ftrunc_u_d (v2f64);
+
+v8i16 __builtin_msa_hadd_s_h (v16i8, v16i8);
+v4i32 __builtin_msa_hadd_s_w (v8i16, v8i16);
+v2i64 __builtin_msa_hadd_s_d (v4i32, v4i32);
+
+v8u16 __builtin_msa_hadd_u_h (v16u8, v16u8);
+v4u32 __builtin_msa_hadd_u_w (v8u16, v8u16);
+v2u64 __builtin_msa_hadd_u_d (v4u32, v4u32);
+
+v8i16 __builtin_msa_hsub_s_h (v16i8, v16i8);
+v4i32 __builtin_msa_hsub_s_w (v8i16, v8i16);
+v2i64 __builtin_msa_hsub_s_d (v4i32, v4i32);
+
+v8i16 __builtin_msa_hsub_u_h (v16u8, v16u8);
+v4i32 __builtin_msa_hsub_u_w (v8u16, v8u16);
+v2i64 __builtin_msa_hsub_u_d (v4u32, v4u32);
+
+v16i8 __builtin_msa_ilvev_b (v16i8, v16i8);
+v8i16 __builtin_msa_ilvev_h (v8i16, v8i16);
+v4i32 __builtin_msa_ilvev_w (v4i32, v4i32);
+v2i64 __builtin_msa_ilvev_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_ilvl_b (v16i8, v16i8);
+v8i16 __builtin_msa_ilvl_h (v8i16, v8i16);
+v4i32 __builtin_msa_ilvl_w (v4i32, v4i32);
+v2i64 __builtin_msa_ilvl_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_ilvod_b (v16i8, v16i8);
+v8i16 __builtin_msa_ilvod_h (v8i16, v8i16);
+v4i32 __builtin_msa_ilvod_w (v4i32, v4i32);
+v2i64 __builtin_msa_ilvod_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_ilvr_b (v16i8, v16i8);
+v8i16 __builtin_msa_ilvr_h (v8i16, v8i16);
+v4i32 __builtin_msa_ilvr_w (v4i32, v4i32);
+v2i64 __builtin_msa_ilvr_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_insert_b (v16i8, imm0_15, i32);
+v8i16 __builtin_msa_insert_h (v8i16, imm0_7, i32);
+v4i32 __builtin_msa_insert_w (v4i32, imm0_3, i32);
+v2i64 __builtin_msa_insert_d (v2i64, imm0_1, i64);
+
+v16i8 __builtin_msa_insve_b (v16i8, imm0_15, v16i8);
+v8i16 __builtin_msa_insve_h (v8i16, imm0_7, v8i16);
+v4i32 __builtin_msa_insve_w (v4i32, imm0_3, v4i32);
+v2i64 __builtin_msa_insve_d (v2i64, imm0_1, v2i64);
+
+v16i8 __builtin_msa_ld_b (void *, imm_n512_511);
+v8i16 __builtin_msa_ld_h (void *, imm_n1024_1022);
+v4i32 __builtin_msa_ld_w (void *, imm_n2048_2044);
+v2i64 __builtin_msa_ld_d (void *, imm_n4096_4088);
+
+v16i8 __builtin_msa_ldi_b (imm_n512_511);
+v8i16 __builtin_msa_ldi_h (imm_n512_511);
+v4i32 __builtin_msa_ldi_w (imm_n512_511);
+v2i64 __builtin_msa_ldi_d (imm_n512_511);
+
+v8i16 __builtin_msa_madd_q_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_madd_q_w (v4i32, v4i32, v4i32);
+
+v8i16 __builtin_msa_maddr_q_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_maddr_q_w (v4i32, v4i32, v4i32);
+
+v16i8 __builtin_msa_maddv_b (v16i8, v16i8, v16i8);
+v8i16 __builtin_msa_maddv_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_maddv_w (v4i32, v4i32, v4i32);
+v2i64 __builtin_msa_maddv_d (v2i64, v2i64, v2i64);
+
+v16i8 __builtin_msa_max_a_b (v16i8, v16i8);
+v8i16 __builtin_msa_max_a_h (v8i16, v8i16);
+v4i32 __builtin_msa_max_a_w (v4i32, v4i32);
+v2i64 __builtin_msa_max_a_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_max_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_max_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_max_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_max_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_max_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_max_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_max_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_max_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_maxi_s_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_maxi_s_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_maxi_s_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_maxi_s_d (v2i64, imm_n16_15);
+
+v16u8 __builtin_msa_maxi_u_b (v16u8, imm0_31);
+v8u16 __builtin_msa_maxi_u_h (v8u16, imm0_31);
+v4u32 __builtin_msa_maxi_u_w (v4u32, imm0_31);
+v2u64 __builtin_msa_maxi_u_d (v2u64, imm0_31);
+
+v16i8 __builtin_msa_min_a_b (v16i8, v16i8);
+v8i16 __builtin_msa_min_a_h (v8i16, v8i16);
+v4i32 __builtin_msa_min_a_w (v4i32, v4i32);
+v2i64 __builtin_msa_min_a_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_min_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_min_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_min_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_min_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_min_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_min_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_min_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_min_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_mini_s_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_mini_s_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_mini_s_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_mini_s_d (v2i64, imm_n16_15);
+
+v16u8 __builtin_msa_mini_u_b (v16u8, imm0_31);
+v8u16 __builtin_msa_mini_u_h (v8u16, imm0_31);
+v4u32 __builtin_msa_mini_u_w (v4u32, imm0_31);
+v2u64 __builtin_msa_mini_u_d (v2u64, imm0_31);
+
+v16i8 __builtin_msa_mod_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_mod_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_mod_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_mod_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_mod_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_mod_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_mod_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_mod_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_move_v (v16i8);
+
+v8i16 __builtin_msa_msub_q_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_msub_q_w (v4i32, v4i32, v4i32);
+
+v8i16 __builtin_msa_msubr_q_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_msubr_q_w (v4i32, v4i32, v4i32);
+
+v16i8 __builtin_msa_msubv_b (v16i8, v16i8, v16i8);
+v8i16 __builtin_msa_msubv_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_msubv_w (v4i32, v4i32, v4i32);
+v2i64 __builtin_msa_msubv_d (v2i64, v2i64, v2i64);
+
+v8i16 __builtin_msa_mul_q_h (v8i16, v8i16);
+v4i32 __builtin_msa_mul_q_w (v4i32, v4i32);
+
+v8i16 __builtin_msa_mulr_q_h (v8i16, v8i16);
+v4i32 __builtin_msa_mulr_q_w (v4i32, v4i32);
+
+v16i8 __builtin_msa_mulv_b (v16i8, v16i8);
+v8i16 __builtin_msa_mulv_h (v8i16, v8i16);
+v4i32 __builtin_msa_mulv_w (v4i32, v4i32);
+v2i64 __builtin_msa_mulv_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_nloc_b (v16i8);
+v8i16 __builtin_msa_nloc_h (v8i16);
+v4i32 __builtin_msa_nloc_w (v4i32);
+v2i64 __builtin_msa_nloc_d (v2i64);
+
+v16i8 __builtin_msa_nlzc_b (v16i8);
+v8i16 __builtin_msa_nlzc_h (v8i16);
+v4i32 __builtin_msa_nlzc_w (v4i32);
+v2i64 __builtin_msa_nlzc_d (v2i64);
+
+v16u8 __builtin_msa_nor_v (v16u8, v16u8);
+
+v16u8 __builtin_msa_nori_b (v16u8, imm0_255);
+
+v16u8 __builtin_msa_or_v (v16u8, v16u8);
+
+v16u8 __builtin_msa_ori_b (v16u8, imm0_255);
+
+v16i8 __builtin_msa_pckev_b (v16i8, v16i8);
+v8i16 __builtin_msa_pckev_h (v8i16, v8i16);
+v4i32 __builtin_msa_pckev_w (v4i32, v4i32);
+v2i64 __builtin_msa_pckev_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_pckod_b (v16i8, v16i8);
+v8i16 __builtin_msa_pckod_h (v8i16, v8i16);
+v4i32 __builtin_msa_pckod_w (v4i32, v4i32);
+v2i64 __builtin_msa_pckod_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_pcnt_b (v16i8);
+v8i16 __builtin_msa_pcnt_h (v8i16);
+v4i32 __builtin_msa_pcnt_w (v4i32);
+v2i64 __builtin_msa_pcnt_d (v2i64);
+
+v16i8 __builtin_msa_sat_s_b (v16i8, imm0_7);
+v8i16 __builtin_msa_sat_s_h (v8i16, imm0_15);
+v4i32 __builtin_msa_sat_s_w (v4i32, imm0_31);
+v2i64 __builtin_msa_sat_s_d (v2i64, imm0_63);
+
+v16u8 __builtin_msa_sat_u_b (v16u8, imm0_7);
+v8u16 __builtin_msa_sat_u_h (v8u16, imm0_15);
+v4u32 __builtin_msa_sat_u_w (v4u32, imm0_31);
+v2u64 __builtin_msa_sat_u_d (v2u64, imm0_63);
+
+v16i8 __builtin_msa_shf_b (v16i8, imm0_255);
+v8i16 __builtin_msa_shf_h (v8i16, imm0_255);
+v4i32 __builtin_msa_shf_w (v4i32, imm0_255);
+
+v16i8 __builtin_msa_sld_b (v16i8, v16i8, i32);
+v8i16 __builtin_msa_sld_h (v8i16, v8i16, i32);
+v4i32 __builtin_msa_sld_w (v4i32, v4i32, i32);
+v2i64 __builtin_msa_sld_d (v2i64, v2i64, i32);
+
+v16i8 __builtin_msa_sldi_b (v16i8, v16i8, imm0_15);
+v8i16 __builtin_msa_sldi_h (v8i16, v8i16, imm0_7);
+v4i32 __builtin_msa_sldi_w (v4i32, v4i32, imm0_3);
+v2i64 __builtin_msa_sldi_d (v2i64, v2i64, imm0_1);
+
+v16i8 __builtin_msa_sll_b (v16i8, v16i8);
+v8i16 __builtin_msa_sll_h (v8i16, v8i16);
+v4i32 __builtin_msa_sll_w (v4i32, v4i32);
+v2i64 __builtin_msa_sll_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_slli_b (v16i8, imm0_7);
+v8i16 __builtin_msa_slli_h (v8i16, imm0_15);
+v4i32 __builtin_msa_slli_w (v4i32, imm0_31);
+v2i64 __builtin_msa_slli_d (v2i64, imm0_63);
+
+v16i8 __builtin_msa_splat_b (v16i8, i32);
+v8i16 __builtin_msa_splat_h (v8i16, i32);
+v4i32 __builtin_msa_splat_w (v4i32, i32);
+v2i64 __builtin_msa_splat_d (v2i64, i32);
+
+v16i8 __builtin_msa_splati_b (v16i8, imm0_15);
+v8i16 __builtin_msa_splati_h (v8i16, imm0_7);
+v4i32 __builtin_msa_splati_w (v4i32, imm0_3);
+v2i64 __builtin_msa_splati_d (v2i64, imm0_1);
+
+v16i8 __builtin_msa_sra_b (v16i8, v16i8);
+v8i16 __builtin_msa_sra_h (v8i16, v8i16);
+v4i32 __builtin_msa_sra_w (v4i32, v4i32);
+v2i64 __builtin_msa_sra_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_srai_b (v16i8, imm0_7);
+v8i16 __builtin_msa_srai_h (v8i16, imm0_15);
+v4i32 __builtin_msa_srai_w (v4i32, imm0_31);
+v2i64 __builtin_msa_srai_d (v2i64, imm0_63);
+
+v16i8 __builtin_msa_srar_b (v16i8, v16i8);
+v8i16 __builtin_msa_srar_h (v8i16, v8i16);
+v4i32 __builtin_msa_srar_w (v4i32, v4i32);
+v2i64 __builtin_msa_srar_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_srari_b (v16i8, imm0_7);
+v8i16 __builtin_msa_srari_h (v8i16, imm0_15);
+v4i32 __builtin_msa_srari_w (v4i32, imm0_31);
+v2i64 __builtin_msa_srari_d (v2i64, imm0_63);
+
+v16i8 __builtin_msa_srl_b (v16i8, v16i8);
+v8i16 __builtin_msa_srl_h (v8i16, v8i16);
+v4i32 __builtin_msa_srl_w (v4i32, v4i32);
+v2i64 __builtin_msa_srl_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_srli_b (v16i8, imm0_7);
+v8i16 __builtin_msa_srli_h (v8i16, imm0_15);
+v4i32 __builtin_msa_srli_w (v4i32, imm0_31);
+v2i64 __builtin_msa_srli_d (v2i64, imm0_63);
+
+v16i8 __builtin_msa_srlr_b (v16i8, v16i8);
+v8i16 __builtin_msa_srlr_h (v8i16, v8i16);
+v4i32 __builtin_msa_srlr_w (v4i32, v4i32);
+v2i64 __builtin_msa_srlr_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_srlri_b (v16i8, imm0_7);
+v8i16 __builtin_msa_srlri_h (v8i16, imm0_15);
+v4i32 __builtin_msa_srlri_w (v4i32, imm0_31);
+v2i64 __builtin_msa_srlri_d (v2i64, imm0_63);
+
+void __builtin_msa_st_b (v16i8, void *, imm_n512_511);
+void __builtin_msa_st_h (v8i16, void *, imm_n1024_1022);
+void __builtin_msa_st_w (v4i32, void *, imm_n2048_2044);
+void __builtin_msa_st_d (v2i64, void *, imm_n4096_4088);
+
+v16i8 __builtin_msa_subs_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_subs_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_subs_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_subs_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_subs_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_subs_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_subs_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_subs_u_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_subsus_u_b (v16u8, v16i8);
+v8u16 __builtin_msa_subsus_u_h (v8u16, v8i16);
+v4u32 __builtin_msa_subsus_u_w (v4u32, v4i32);
+v2u64 __builtin_msa_subsus_u_d (v2u64, v2i64);
+
+v16i8 __builtin_msa_subsuu_s_b (v16u8, v16u8);
+v8i16 __builtin_msa_subsuu_s_h (v8u16, v8u16);
+v4i32 __builtin_msa_subsuu_s_w (v4u32, v4u32);
+v2i64 __builtin_msa_subsuu_s_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_subv_b (v16i8, v16i8);
+v8i16 __builtin_msa_subv_h (v8i16, v8i16);
+v4i32 __builtin_msa_subv_w (v4i32, v4i32);
+v2i64 __builtin_msa_subv_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_subvi_b (v16i8, imm0_31);
+v8i16 __builtin_msa_subvi_h (v8i16, imm0_31);
+v4i32 __builtin_msa_subvi_w (v4i32, imm0_31);
+v2i64 __builtin_msa_subvi_d (v2i64, imm0_31);
+
+v16i8 __builtin_msa_vshf_b (v16i8, v16i8, v16i8);
+v8i16 __builtin_msa_vshf_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_vshf_w (v4i32, v4i32, v4i32);
+v2i64 __builtin_msa_vshf_d (v2i64, v2i64, v2i64);
+
+v16u8 __builtin_msa_xor_v (v16u8, v16u8);
+
+v16u8 __builtin_msa_xori_b (v16u8, imm0_255);
diff --git a/library/stdarch/crates/stdarch-verify/src/lib.rs b/library/stdarch/crates/stdarch-verify/src/lib.rs
new file mode 100644
index 000000000..22108d26a
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/src/lib.rs
@@ -0,0 +1,525 @@
+#![deny(rust_2018_idioms)]
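+//! Procedural macros that scan stdarch's source tree for public unsafe
+//! intrinsics and expand into a static table of `Function` records, used
+//! like `stdarch_verify::arm_functions!(static FUNCTIONS);`.
+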
+#[macro_use]
+extern crate quote;
+#[macro_use]
+extern crate syn;
+
+use proc_macro::TokenStream;
+use std::{fs::File, io::Read, path::Path};
+use syn::ext::IdentExt;
+
+#[proc_macro]
+pub fn x86_functions(input: TokenStream) -> TokenStream {
+ functions(input, &["core_arch/src/x86", "core_arch/src/x86_64"])
+}
+
+#[proc_macro]
+pub fn arm_functions(input: TokenStream) -> TokenStream {
+ functions(
+ input,
+ &[
+ "core_arch/src/arm",
+ "core_arch/src/aarch64",
+ "core_arch/src/arm_shared/neon",
+ ],
+ )
+}
+
+#[proc_macro]
+pub fn mips_functions(input: TokenStream) -> TokenStream {
+ functions(input, &["core_arch/src/mips"])
+}
+
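+// Parses every `.rs` file under `dirs` (relative to the parent of this
+// crate's manifest dir), collects all `pub unsafe fn` items, and expands
+// to `static <input>: &[Function] = &[...]` describing each intrinsic.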
+fn functions(input: TokenStream, dirs: &[&str]) -> TokenStream {
+ let dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+ let root = dir.parent().expect("root-dir not found");
+
+ let mut files = Vec::new();
+ for dir in dirs {
+ walk(&root.join(dir), &mut files);
+ }
+ assert!(!files.is_empty());
+
+ let mut functions = Vec::new();
+ for &mut (ref mut file, ref path) in &mut files {
+ for mut item in file.items.drain(..) {
+ match item {
+ syn::Item::Fn(f) => functions.push((f, path)),
+ syn::Item::Mod(ref mut m) => {
+ if let Some(ref mut m) = m.content {
+ for i in m.1.drain(..) {
+ if let syn::Item::Fn(f) = i {
+ functions.push((f, path))
+ }
+ }
+ }
+ }
+ _ => (),
+ }
+ }
+ }
+ assert!(!functions.is_empty());
+
+ let mut tests = std::collections::HashSet::<String>::new();
+ for f in &functions {
+ let id = format!("{}", f.0.sig.ident);
+ if id.starts_with("test_") {
+ tests.insert(id);
+ }
+ }
+ assert!(!tests.is_empty());
+
+ functions.retain(|&(ref f, _)| {
+ if let syn::Visibility::Public(_) = f.vis {
+ if f.sig.unsafety.is_some() {
+ return true;
+ }
+ }
+ false
+ });
+ assert!(!functions.is_empty());
+
+ let input = proc_macro2::TokenStream::from(input);
+
+ let functions = functions
+ .iter()
+ .map(|&(ref f, path)| {
+ let name = &f.sig.ident;
+ // println!("{}", name);
+ let mut arguments = Vec::new();
+ let mut const_arguments = Vec::new();
+ for input in f.sig.inputs.iter() {
+ let ty = match *input {
+ syn::FnArg::Typed(ref c) => &c.ty,
+ _ => panic!("invalid argument on {}", name),
+ };
+ arguments.push(to_type(ty));
+ }
+ for generic in f.sig.generics.params.iter() {
+ let ty = match *generic {
+ syn::GenericParam::Const(ref c) => &c.ty,
+ _ => panic!("invalid generic argument on {}", name),
+ };
+ const_arguments.push(to_type(ty));
+ }
+ let ret = match f.sig.output {
+ syn::ReturnType::Default => quote! { None },
+ syn::ReturnType::Type(_, ref t) => {
+ let ty = to_type(t);
+ quote! { Some(#ty) }
+ }
+ };
+ let instrs = find_instrs(&f.attrs);
+ let target_feature = if let Some(i) = find_target_feature(&f.attrs) {
+ quote! { Some(#i) }
+ } else {
+ quote! { None }
+ };
+
+ let required_const = find_required_const("rustc_args_required_const", &f.attrs);
+ let mut legacy_const_generics =
+ find_required_const("rustc_legacy_const_generics", &f.attrs);
+ if !required_const.is_empty() && !legacy_const_generics.is_empty() {
+ panic!(
+ "Can't have both #[rustc_args_required_const] and \
+ #[rustc_legacy_const_generics]"
+ );
+ }
+
+ // The list of required consts, used to verify the arguments, comes from either the
+ // `rustc_args_required_const` or the `rustc_legacy_const_generics` attribute.
+ let required_const = if required_const.is_empty() {
+ legacy_const_generics.clone()
+ } else {
+ required_const
+ };
+
+ legacy_const_generics.sort();
+ for (idx, ty) in legacy_const_generics
+ .into_iter()
+ .zip(const_arguments.into_iter())
+ {
+ arguments.insert(idx, ty);
+ }
+
+ // Strip leading underscores from the fn name when looking up its test:
+ // `_mm_foo` becomes `mm_foo`, so the expected test is named `test_mm_foo`.
+ let test_name_string = format!("{}", name);
+ let mut test_name_id = test_name_string.as_str();
+ while test_name_id.starts_with('_') {
+ test_name_id = &test_name_id[1..];
+ }
+ let has_test = tests.contains(&format!("test_{}", test_name_id));
+
+ quote! {
+ Function {
+ name: stringify!(#name),
+ arguments: &[#(#arguments),*],
+ ret: #ret,
+ target_feature: #target_feature,
+ instrs: &[#(#instrs),*],
+ file: stringify!(#path),
+ required_const: &[#(#required_const),*],
+ has_test: #has_test,
+ }
+ }
+ })
+ .collect::<Vec<_>>();
+
+ let ret = quote! { #input: &[Function] = &[#(#functions),*]; };
+ // println!("{}", ret);
+ ret.into()
+}
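+
+// For reference, the macro expansion has roughly this shape (the values
+// below are illustrative only, not taken from any real intrinsic):
+//
+//     static FUNCTIONS: &[Function] = &[
+//         Function {
+//             name: stringify!(vadd_u8),
+//             arguments: &[&U8X8, &U8X8],
+//             ret: Some(&U8X8),
+//             target_feature: Some("neon"),
+//             instrs: &["vadd"],
+//             file: stringify!("..."),
+//             required_const: &[],
+//             has_test: true,
+//         },
+//         // ...
+//     ];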
+
+fn to_type(t: &syn::Type) -> proc_macro2::TokenStream {
+ match *t {
+ syn::Type::Path(ref p) => match extract_path_ident(&p.path).to_string().as_ref() {
+ // x86 ...
+ "__m128" => quote! { &M128 },
+ "__m128bh" => quote! { &M128BH },
+ "__m128d" => quote! { &M128D },
+ "__m128i" => quote! { &M128I },
+ "__m256" => quote! { &M256 },
+ "__m256bh" => quote! { &M256BH },
+ "__m256d" => quote! { &M256D },
+ "__m256i" => quote! { &M256I },
+ "__m512" => quote! { &M512 },
+ "__m512bh" => quote! { &M512BH },
+ "__m512d" => quote! { &M512D },
+ "__m512i" => quote! { &M512I },
+ "__mmask8" => quote! { &MMASK8 },
+ "__mmask16" => quote! { &MMASK16 },
+ "__mmask32" => quote! { &MMASK32 },
+ "__mmask64" => quote! { &MMASK64 },
+ "_MM_CMPINT_ENUM" => quote! { &MM_CMPINT_ENUM },
+ "_MM_MANTISSA_NORM_ENUM" => quote! { &MM_MANTISSA_NORM_ENUM },
+ "_MM_MANTISSA_SIGN_ENUM" => quote! { &MM_MANTISSA_SIGN_ENUM },
+ "_MM_PERM_ENUM" => quote! { &MM_PERM_ENUM },
+ "__m64" => quote! { &M64 },
+ "bool" => quote! { &BOOL },
+ "f32" => quote! { &F32 },
+ "f64" => quote! { &F64 },
+ "i16" => quote! { &I16 },
+ "i32" => quote! { &I32 },
+ "i64" => quote! { &I64 },
+ "i8" => quote! { &I8 },
+ "u16" => quote! { &U16 },
+ "u32" => quote! { &U32 },
+ "u64" => quote! { &U64 },
+ "u128" => quote! { &U128 },
+ "u8" => quote! { &U8 },
+ "p8" => quote! { &P8 },
+ "p16" => quote! { &P16 },
+ "Ordering" => quote! { &ORDERING },
+ "CpuidResult" => quote! { &CPUID },
+
+ // arm ...
+ "int8x4_t" => quote! { &I8X4 },
+ "int8x8_t" => quote! { &I8X8 },
+ "int8x8x2_t" => quote! { &I8X8X2 },
+ "int8x8x3_t" => quote! { &I8X8X3 },
+ "int8x8x4_t" => quote! { &I8X8X4 },
+ "int8x16x2_t" => quote! { &I8X16X2 },
+ "int8x16x3_t" => quote! { &I8X16X3 },
+ "int8x16x4_t" => quote! { &I8X16X4 },
+ "int8x16_t" => quote! { &I8X16 },
+ "int16x2_t" => quote! { &I16X2 },
+ "int16x4_t" => quote! { &I16X4 },
+ "int16x4x2_t" => quote! { &I16X4X2 },
+ "int16x4x3_t" => quote! { &I16X4X3 },
+ "int16x4x4_t" => quote! { &I16X4X4 },
+ "int16x8_t" => quote! { &I16X8 },
+ "int16x8x2_t" => quote! { &I16X8X2 },
+ "int16x8x3_t" => quote! { &I16X8X3 },
+ "int16x8x4_t" => quote! { &I16X8X4 },
+ "int32x2_t" => quote! { &I32X2 },
+ "int32x2x2_t" => quote! { &I32X2X2 },
+ "int32x2x3_t" => quote! { &I32X2X3 },
+ "int32x2x4_t" => quote! { &I32X2X4 },
+ "int32x4_t" => quote! { &I32X4 },
+ "int32x4x2_t" => quote! { &I32X4X2 },
+ "int32x4x3_t" => quote! { &I32X4X3 },
+ "int32x4x4_t" => quote! { &I32X4X4 },
+ "int64x1_t" => quote! { &I64X1 },
+ "int64x1x2_t" => quote! { &I64X1X2 },
+ "int64x1x3_t" => quote! { &I64X1X3 },
+ "int64x1x4_t" => quote! { &I64X1X4 },
+ "int64x2_t" => quote! { &I64X2 },
+ "int64x2x2_t" => quote! { &I64X2X2 },
+ "int64x2x3_t" => quote! { &I64X2X3 },
+ "int64x2x4_t" => quote! { &I64X2X4 },
+ "uint8x8_t" => quote! { &U8X8 },
+ "uint8x4_t" => quote! { &U8X4 },
+ "uint8x8x2_t" => quote! { &U8X8X2 },
+ "uint8x16x2_t" => quote! { &U8X16X2 },
+ "uint8x16x3_t" => quote! { &U8X16X3 },
+ "uint8x16x4_t" => quote! { &U8X16X4 },
+ "uint8x8x3_t" => quote! { &U8X8X3 },
+ "uint8x8x4_t" => quote! { &U8X8X4 },
+ "uint8x16_t" => quote! { &U8X16 },
+ "uint16x4_t" => quote! { &U16X4 },
+ "uint16x4x2_t" => quote! { &U16X4X2 },
+ "uint16x4x3_t" => quote! { &U16X4X3 },
+ "uint16x4x4_t" => quote! { &U16X4X4 },
+ "uint16x8_t" => quote! { &U16X8 },
+ "uint16x8x2_t" => quote! { &U16X8X2 },
+ "uint16x8x3_t" => quote! { &U16X8X3 },
+ "uint16x8x4_t" => quote! { &U16X8X4 },
+ "uint32x2_t" => quote! { &U32X2 },
+ "uint32x2x2_t" => quote! { &U32X2X2 },
+ "uint32x2x3_t" => quote! { &U32X2X3 },
+ "uint32x2x4_t" => quote! { &U32X2X4 },
+ "uint32x4_t" => quote! { &U32X4 },
+ "uint32x4x2_t" => quote! { &U32X4X2 },
+ "uint32x4x3_t" => quote! { &U32X4X3 },
+ "uint32x4x4_t" => quote! { &U32X4X4 },
+ "uint64x1_t" => quote! { &U64X1 },
+ "uint64x1x2_t" => quote! { &U64X1X2 },
+ "uint64x1x3_t" => quote! { &U64X1X3 },
+ "uint64x1x4_t" => quote! { &U64X1X4 },
+ "uint64x2_t" => quote! { &U64X2 },
+ "uint64x2x2_t" => quote! { &U64X2X2 },
+ "uint64x2x3_t" => quote! { &U64X2X3 },
+ "uint64x2x4_t" => quote! { &U64X2X4 },
+ "float32x2_t" => quote! { &F32X2 },
+ "float32x2x2_t" => quote! { &F32X2X2 },
+ "float32x2x3_t" => quote! { &F32X2X3 },
+ "float32x2x4_t" => quote! { &F32X2X4 },
+ "float32x4_t" => quote! { &F32X4 },
+ "float32x4x2_t" => quote! { &F32X4X2 },
+ "float32x4x3_t" => quote! { &F32X4X3 },
+ "float32x4x4_t" => quote! { &F32X4X4 },
+ "float64x1_t" => quote! { &F64X1 },
+ "float64x1x2_t" => quote! { &F64X1X2 },
+ "float64x1x3_t" => quote! { &F64X1X3 },
+ "float64x1x4_t" => quote! { &F64X1X4 },
+ "float64x2_t" => quote! { &F64X2 },
+ "float64x2x2_t" => quote! { &F64X2X2 },
+ "float64x2x3_t" => quote! { &F64X2X3 },
+ "float64x2x4_t" => quote! { &F64X2X4 },
+ "poly8x8_t" => quote! { &POLY8X8 },
+ "poly8x8x2_t" => quote! { &POLY8X8X2 },
+ "poly8x8x3_t" => quote! { &POLY8X8X3 },
+ "poly8x8x4_t" => quote! { &POLY8X8X4 },
+ "poly8x16x2_t" => quote! { &POLY8X16X2 },
+ "poly8x16x3_t" => quote! { &POLY8X16X3 },
+ "poly8x16x4_t" => quote! { &POLY8X16X4 },
+ "p64" => quote! { &P64 },
+ "poly64x1_t" => quote! { &POLY64X1 },
+ "poly64x2_t" => quote! { &POLY64X2 },
+ "poly8x16_t" => quote! { &POLY8X16 },
+ "poly16x4_t" => quote! { &POLY16X4 },
+ "poly16x4x2_t" => quote! { &P16X4X2 },
+ "poly16x4x3_t" => quote! { &P16X4X3 },
+ "poly16x4x4_t" => quote! { &P16X4X4 },
+ "poly16x8_t" => quote! { &POLY16X8 },
+ "poly16x8x2_t" => quote! { &P16X8X2 },
+ "poly16x8x3_t" => quote! { &P16X8X3 },
+ "poly16x8x4_t" => quote! { &P16X8X4 },
+ "poly64x1x2_t" => quote! { &P64X1X2 },
+ "poly64x1x3_t" => quote! { &P64X1X3 },
+ "poly64x1x4_t" => quote! { &P64X1X4 },
+ "poly64x2x2_t" => quote! { &P64X2X2 },
+ "poly64x2x3_t" => quote! { &P64X2X3 },
+ "poly64x2x4_t" => quote! { &P64X2X4 },
+ "p128" => quote! { &P128 },
+
+ "v16i8" => quote! { &v16i8 },
+ "v8i16" => quote! { &v8i16 },
+ "v4i32" => quote! { &v4i32 },
+ "v2i64" => quote! { &v2i64 },
+ "v16u8" => quote! { &v16u8 },
+ "v8u16" => quote! { &v8u16 },
+ "v4u32" => quote! { &v4u32 },
+ "v2u64" => quote! { &v2u64 },
+ "v8f16" => quote! { &v8f16 },
+ "v4f32" => quote! { &v4f32 },
+ "v2f64" => quote! { &v2f64 },
+
+ s => panic!("unsupported type: \"{}\"", s),
+ },
+ syn::Type::Ptr(syn::TypePtr {
+ ref elem,
+ ref mutability,
+ ..
+ })
+ | syn::Type::Reference(syn::TypeReference {
+ ref elem,
+ ref mutability,
+ ..
+ }) => {
+ // Both pointers and references can have a mut token (*mut and &mut)
+ if mutability.is_some() {
+ let tokens = to_type(&elem);
+ quote! { &Type::MutPtr(#tokens) }
+ } else {
+ // If they don't (*const or &) then they are "const"
+ let tokens = to_type(&elem);
+ quote! { &Type::ConstPtr(#tokens) }
+ }
+ }
+
+ syn::Type::Slice(_) => panic!("unsupported slice"),
+ syn::Type::Array(_) => panic!("unsupported array"),
+ syn::Type::Tuple(_) => quote! { &TUPLE },
+ syn::Type::Never(_) => quote! { &NEVER },
+ _ => panic!("unsupported type"),
+ }
+}
+
+fn extract_path_ident(path: &syn::Path) -> syn::Ident {
+ if path.leading_colon.is_some() {
+ panic!("unsupported leading colon in path")
+ }
+ if path.segments.len() != 1 {
+ panic!("unsupported path that needs name resolution")
+ }
+ match path.segments.first().expect("segment not found").arguments {
+ syn::PathArguments::None => {}
+ _ => panic!("unsupported path that has path arguments"),
+ }
+ path.segments
+ .first()
+ .expect("segment not found")
+ .ident
+ .clone()
+}
+
+fn walk(root: &Path, files: &mut Vec<(syn::File, String)>) {
+ for file in root.read_dir().unwrap() {
+ let file = file.unwrap();
+ if file.file_type().unwrap().is_dir() {
+ walk(&file.path(), files);
+ continue;
+ }
+ let path = file.path();
+ if path.extension().and_then(std::ffi::OsStr::to_str) != Some("rs") {
+ continue;
+ }
+
+ if path.file_name().and_then(std::ffi::OsStr::to_str) == Some("test.rs") {
+ continue;
+ }
+
+ let mut contents = String::new();
+ File::open(&path)
+ .unwrap_or_else(|_| panic!("can't open file at path: {}", path.display()))
+ .read_to_string(&mut contents)
+ .expect("failed to read file to string");
+
+ files.push((
+ syn::parse_str::<syn::File>(&contents).expect("failed to parse"),
+ path.display().to_string(),
+ ));
+ }
+}
+
+fn find_instrs(attrs: &[syn::Attribute]) -> Vec<String> {
+ struct AssertInstr {
+ instr: String,
+ }
+
+ // A small custom parser to parse out the instruction in `assert_instr`.
+ //
+ // TODO: should probably just reuse `Invoc` from the `assert-instr-macro`
+ // crate.
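+ //
+ // For example `#[cfg_attr(test, assert_instr(vadd.i32))]` parses to the
+ // instruction string "vadd.i32".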
+ impl syn::parse::Parse for AssertInstr {
+ fn parse(content: syn::parse::ParseStream<'_>) -> syn::Result<Self> {
+ let input;
+ parenthesized!(input in content);
+ let _ = input.parse::<syn::Meta>()?;
+ let _ = input.parse::<Token![,]>()?;
+ let ident = input.parse::<syn::Ident>()?;
+ if ident != "assert_instr" {
+ return Err(input.error("expected `assert_instr`"));
+ }
+ let instrs;
+ parenthesized!(instrs in input);
+
+ let mut instr = String::new();
+ while !instrs.is_empty() {
+ if let Ok(lit) = instrs.parse::<syn::LitStr>() {
+ instr.push_str(&lit.value());
+ } else if let Ok(ident) = instrs.call(syn::Ident::parse_any) {
+ instr.push_str(&ident.to_string());
+ } else if instrs.parse::<Token![.]>().is_ok() {
+ instr.push('.');
+ } else if instrs.parse::<Token![,]>().is_ok() {
+ // consume everything remaining
+ drop(instrs.parse::<proc_macro2::TokenStream>());
+ break;
+ } else {
+ return Err(input.error("failed to parse instruction"));
+ }
+ }
+ Ok(Self { instr })
+ }
+ }
+
+ attrs
+ .iter()
+ .filter(|a| a.path.is_ident("cfg_attr"))
+ .filter_map(|a| {
+ syn::parse2::<AssertInstr>(a.tokens.clone())
+ .ok()
+ .map(|a| a.instr)
+ })
+ .collect()
+}
+
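+// Extracts the `enable` value of a `#[target_feature(enable = "...")]`
+// attribute, e.g. `#[target_feature(enable = "neon")]` yields the literal
+// `"neon"`.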
+fn find_target_feature(attrs: &[syn::Attribute]) -> Option<syn::Lit> {
+ attrs
+ .iter()
+ .flat_map(|a| {
+ if let Ok(a) = a.parse_meta() {
+ if let syn::Meta::List(i) = a {
+ if i.path.is_ident("target_feature") {
+ return i.nested;
+ }
+ }
+ }
+ syn::punctuated::Punctuated::new()
+ })
+ .filter_map(|nested| match nested {
+ syn::NestedMeta::Meta(m) => Some(m),
+ syn::NestedMeta::Lit(_) => None,
+ })
+ .find_map(|m| match m {
+ syn::Meta::NameValue(ref i) if i.path.is_ident("enable") => Some(i.clone().lit),
+ _ => None,
+ })
+}
+
+fn find_required_const(name: &str, attrs: &[syn::Attribute]) -> Vec<usize> {
+ attrs
+ .iter()
+ .flat_map(|a| {
+ if a.path.segments[0].ident == name {
+ syn::parse::<RustcArgsRequiredConst>(a.tokens.clone().into())
+ .unwrap()
+ .args
+ } else {
+ Vec::new()
+ }
+ })
+ .collect()
+}
+
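+// Parsed argument list of `#[rustc_args_required_const(...)]` or
+// `#[rustc_legacy_const_generics(...)]`, e.g. `(1, 2)` parses to
+// `args: vec![1, 2]`.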
+struct RustcArgsRequiredConst {
+ args: Vec<usize>,
+}
+
+impl syn::parse::Parse for RustcArgsRequiredConst {
+ fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result<Self> {
+ let content;
+ parenthesized!(content in input);
+ let list =
+ syn::punctuated::Punctuated::<syn::LitInt, Token![,]>::parse_terminated(&content)?;
+ Ok(Self {
+ args: list
+ .into_iter()
+ .map(|a| a.base10_parse::<usize>())
+ .collect::<syn::Result<_>>()?,
+ })
+ }
+}
diff --git a/library/stdarch/crates/stdarch-verify/tests/arm.rs b/library/stdarch/crates/stdarch-verify/tests/arm.rs
new file mode 100644
index 000000000..6ce5ce05f
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/tests/arm.rs
@@ -0,0 +1,988 @@
+#![allow(bad_style)]
+#![allow(unused)]
+use std::{collections::HashMap, rc::Rc};
+
+use html5ever::{
+ driver::ParseOpts,
+ parse_document,
+ rcdom::{Node, NodeData, RcDom},
+ tendril::TendrilSink,
+ tree_builder::TreeBuilderOpts,
+};
+
+struct Function {
+ name: &'static str,
+ arguments: &'static [&'static Type],
+ ret: Option<&'static Type>,
+ target_feature: Option<&'static str>,
+ instrs: &'static [&'static str],
+ file: &'static str,
+ required_const: &'static [usize],
+ has_test: bool,
+}
+
+static F16: Type = Type::PrimFloat(16);
+static F32: Type = Type::PrimFloat(32);
+static F64: Type = Type::PrimFloat(64);
+static I16: Type = Type::PrimSigned(16);
+static I32: Type = Type::PrimSigned(32);
+static I64: Type = Type::PrimSigned(64);
+static I8: Type = Type::PrimSigned(8);
+static U16: Type = Type::PrimUnsigned(16);
+static U32: Type = Type::PrimUnsigned(32);
+static U64: Type = Type::PrimUnsigned(64);
+static U8: Type = Type::PrimUnsigned(8);
+static NEVER: Type = Type::Never;
+
+static F16X4: Type = Type::F(16, 4, 1);
+static F16X4X2: Type = Type::F(16, 4, 2);
+static F16X4X3: Type = Type::F(16, 4, 3);
+static F16X4X4: Type = Type::F(16, 4, 4);
+static F16X8: Type = Type::F(16, 8, 1);
+static F16X8X2: Type = Type::F(16, 8, 2);
+static F16X8X3: Type = Type::F(16, 8, 3);
+static F16X8X4: Type = Type::F(16, 8, 4);
+static F32X2: Type = Type::F(32, 2, 1);
+static F32X2X2: Type = Type::F(32, 2, 2);
+static F32X2X3: Type = Type::F(32, 2, 3);
+static F32X2X4: Type = Type::F(32, 2, 4);
+static F32X4: Type = Type::F(32, 4, 1);
+static F32X4X2: Type = Type::F(32, 4, 2);
+static F32X4X3: Type = Type::F(32, 4, 3);
+static F32X4X4: Type = Type::F(32, 4, 4);
+static F64X1: Type = Type::F(64, 1, 1);
+static F64X1X2: Type = Type::F(64, 1, 2);
+static F64X1X3: Type = Type::F(64, 1, 3);
+static F64X1X4: Type = Type::F(64, 1, 4);
+static F64X2: Type = Type::F(64, 2, 1);
+static F64X2X2: Type = Type::F(64, 2, 2);
+static F64X2X3: Type = Type::F(64, 2, 3);
+static F64X2X4: Type = Type::F(64, 2, 4);
+static I16X2: Type = Type::I(16, 2, 1);
+static I16X4: Type = Type::I(16, 4, 1);
+static I16X4X2: Type = Type::I(16, 4, 2);
+static I16X4X3: Type = Type::I(16, 4, 3);
+static I16X4X4: Type = Type::I(16, 4, 4);
+static I16X8: Type = Type::I(16, 8, 1);
+static I16X8X2: Type = Type::I(16, 8, 2);
+static I16X8X3: Type = Type::I(16, 8, 3);
+static I16X8X4: Type = Type::I(16, 8, 4);
+static I32X2: Type = Type::I(32, 2, 1);
+static I32X2X2: Type = Type::I(32, 2, 2);
+static I32X2X3: Type = Type::I(32, 2, 3);
+static I32X2X4: Type = Type::I(32, 2, 4);
+static I32X4: Type = Type::I(32, 4, 1);
+static I32X4X2: Type = Type::I(32, 4, 2);
+static I32X4X3: Type = Type::I(32, 4, 3);
+static I32X4X4: Type = Type::I(32, 4, 4);
+static I64X1: Type = Type::I(64, 1, 1);
+static I64X1X2: Type = Type::I(64, 1, 2);
+static I64X1X3: Type = Type::I(64, 1, 3);
+static I64X1X4: Type = Type::I(64, 1, 4);
+static I64X2: Type = Type::I(64, 2, 1);
+static I64X2X2: Type = Type::I(64, 2, 2);
+static I64X2X3: Type = Type::I(64, 2, 3);
+static I64X2X4: Type = Type::I(64, 2, 4);
+static I8X16: Type = Type::I(8, 16, 1);
+static I8X16X2: Type = Type::I(8, 16, 2);
+static I8X16X3: Type = Type::I(8, 16, 3);
+static I8X16X4: Type = Type::I(8, 16, 4);
+static I8X4: Type = Type::I(8, 4, 1);
+static I8X8: Type = Type::I(8, 8, 1);
+static I8X8X2: Type = Type::I(8, 8, 2);
+static I8X8X3: Type = Type::I(8, 8, 3);
+static I8X8X4: Type = Type::I(8, 8, 4);
+static P128: Type = Type::PrimPoly(128);
+static P16: Type = Type::PrimPoly(16);
+static P16X4X2: Type = Type::P(16, 4, 2);
+static P16X4X3: Type = Type::P(16, 4, 3);
+static P16X4X4: Type = Type::P(16, 4, 4);
+static P16X8X2: Type = Type::P(16, 8, 2);
+static P16X8X3: Type = Type::P(16, 8, 3);
+static P16X8X4: Type = Type::P(16, 8, 4);
+static P64: Type = Type::PrimPoly(64);
+static P64X1X2: Type = Type::P(64, 1, 2);
+static P64X1X3: Type = Type::P(64, 1, 3);
+static P64X1X4: Type = Type::P(64, 1, 4);
+static P64X2X2: Type = Type::P(64, 2, 2);
+static P64X2X3: Type = Type::P(64, 2, 3);
+static P64X2X4: Type = Type::P(64, 2, 4);
+static P8: Type = Type::PrimPoly(8);
+static POLY16X4: Type = Type::P(16, 4, 1);
+static POLY16X8: Type = Type::P(16, 8, 1);
+static POLY64X1: Type = Type::P(64, 1, 1);
+static POLY64X2: Type = Type::P(64, 2, 1);
+static POLY8X16: Type = Type::P(8, 16, 1);
+static POLY8X16X2: Type = Type::P(8, 16, 2);
+static POLY8X16X3: Type = Type::P(8, 16, 3);
+static POLY8X16X4: Type = Type::P(8, 16, 4);
+static POLY8X8: Type = Type::P(8, 8, 1);
+static POLY8X8X2: Type = Type::P(8, 8, 2);
+static POLY8X8X3: Type = Type::P(8, 8, 3);
+static POLY8X8X4: Type = Type::P(8, 8, 4);
+static U16X4: Type = Type::U(16, 4, 1);
+static U16X4X2: Type = Type::U(16, 4, 2);
+static U16X4X3: Type = Type::U(16, 4, 3);
+static U16X4X4: Type = Type::U(16, 4, 4);
+static U16X8: Type = Type::U(16, 8, 1);
+static U16X8X2: Type = Type::U(16, 8, 2);
+static U16X8X3: Type = Type::U(16, 8, 3);
+static U16X8X4: Type = Type::U(16, 8, 4);
+static U32X2: Type = Type::U(32, 2, 1);
+static U32X2X2: Type = Type::U(32, 2, 2);
+static U32X2X3: Type = Type::U(32, 2, 3);
+static U32X2X4: Type = Type::U(32, 2, 4);
+static U32X4: Type = Type::U(32, 4, 1);
+static U32X4X2: Type = Type::U(32, 4, 2);
+static U32X4X3: Type = Type::U(32, 4, 3);
+static U32X4X4: Type = Type::U(32, 4, 4);
+static U64X1: Type = Type::U(64, 1, 1);
+static U64X1X2: Type = Type::U(64, 1, 2);
+static U64X1X3: Type = Type::U(64, 1, 3);
+static U64X1X4: Type = Type::U(64, 1, 4);
+static U64X2: Type = Type::U(64, 2, 1);
+static U64X2X2: Type = Type::U(64, 2, 2);
+static U64X2X3: Type = Type::U(64, 2, 3);
+static U64X2X4: Type = Type::U(64, 2, 4);
+static U8X16: Type = Type::U(8, 16, 1);
+static U8X16X2: Type = Type::U(8, 16, 2);
+static U8X16X3: Type = Type::U(8, 16, 3);
+static U8X16X4: Type = Type::U(8, 16, 4);
+static U8X8: Type = Type::U(8, 8, 1);
+static U8X4: Type = Type::U(8, 4, 1);
+static U8X8X2: Type = Type::U(8, 8, 2);
+static U8X8X3: Type = Type::U(8, 8, 3);
+static U8X8X4: Type = Type::U(8, 8, 4);
+
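+// The `I`/`U`/`P`/`F` variants encode (element bits, lanes, registers):
+// e.g. `Type::I(8, 16, 3)` describes ARM's `int8x16x3_t`.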
+#[derive(Debug, Copy, Clone, PartialEq)]
+enum Type {
+ PrimFloat(u8),
+ PrimSigned(u8),
+ PrimUnsigned(u8),
+ PrimPoly(u8),
+ MutPtr(&'static Type),
+ ConstPtr(&'static Type),
+ I(u8, u8, u8),
+ U(u8, u8, u8),
+ P(u8, u8, u8),
+ F(u8, u8, u8),
+ Never,
+}
+
+stdarch_verify::arm_functions!(static FUNCTIONS);
+
+macro_rules! bail {
+ ($($t:tt)*) => (return Err(format!($($t)*)))
+}
+
+#[test]
+fn verify_all_signatures() {
+ // This is a giant HTML blob downloaded from
+ // https://developer.arm.com/technologies/neon/intrinsics which contains,
+ // at a minimum, all NEON intrinsics. We parse the HTML manually below.
+ let html = include_bytes!("../arm-intrinsics.html");
+ let mut html = &html[..];
+ let opts = ParseOpts {
+ tree_builder: TreeBuilderOpts {
+ drop_doctype: true,
+ ..Default::default()
+ },
+ ..Default::default()
+ };
+ let dom = parse_document(RcDom::default(), opts)
+ .from_utf8()
+ .read_from(&mut html)
+ .unwrap();
+
+ let accordion = find_accordion(&dom.document).unwrap();
+ let map = parse_intrinsics(&accordion);
+
+ let mut all_valid = true;
+ for rust in FUNCTIONS {
+ if !rust.has_test {
+ let skip = [
+ "vaddq_s64",
+ "vaddq_u64",
+ "vrsqrte_f32",
+ "vtbl1_s8",
+ "vtbl1_u8",
+ "vtbl1_p8",
+ "vtbl2_s8",
+ "vtbl2_u8",
+ "vtbl2_p8",
+ "vtbl3_s8",
+ "vtbl3_u8",
+ "vtbl3_p8",
+ "vtbl4_s8",
+ "vtbl4_u8",
+ "vtbl4_p8",
+ "vtbx1_s8",
+ "vtbx1_u8",
+ "vtbx1_p8",
+ "vtbx2_s8",
+ "vtbx2_u8",
+ "vtbx2_p8",
+ "vtbx3_s8",
+ "vtbx3_u8",
+ "vtbx3_p8",
+ "vtbx4_s8",
+ "vtbx4_u8",
+ "vtbx4_p8",
+ "udf",
+ "_clz_u8",
+ "_clz_u16",
+ "_clz_u32",
+ "_rbit_u32",
+ "_rev_u16",
+ "_rev_u32",
+ "__breakpoint",
+ "vpminq_f32",
+ "vpminq_f64",
+ "vpmaxq_f32",
+ "vpmaxq_f64",
+ "vcombine_s8",
+ "vcombine_s16",
+ "vcombine_s32",
+ "vcombine_s64",
+ "vcombine_u8",
+ "vcombine_u16",
+ "vcombine_u32",
+ "vcombine_u64",
+ "vcombine_p64",
+ "vcombine_f32",
+ "vcombine_p8",
+ "vcombine_p16",
+ "vcombine_f64",
+ "vtbl1_s8",
+ "vtbl1_u8",
+ "vtbl1_p8",
+ "vtbl2_s8",
+ "vtbl2_u8",
+ "vtbl2_p8",
+ "vtbl3_s8",
+ "vtbl3_u8",
+ "vtbl3_p8",
+ "vtbl4_s8",
+ "vtbl4_u8",
+ "vtbl4_p8",
+ "vtbx1_s8",
+ "vtbx1_u8",
+ "vtbx1_p8",
+ "vtbx2_s8",
+ "vtbx2_u8",
+ "vtbx2_p8",
+ "vtbx3_s8",
+ "vtbx3_u8",
+ "vtbx3_p8",
+ "vtbx4_s8",
+ "vtbx4_u8",
+ "vtbx4_p8",
+ "vqtbl1_s8",
+ "vqtbl1q_s8",
+ "vqtbl1_u8",
+ "vqtbl1q_u8",
+ "vqtbl1_p8",
+ "vqtbl1q_p8",
+ "vqtbx1_s8",
+ "vqtbx1q_s8",
+ "vqtbx1_u8",
+ "vqtbx1q_u8",
+ "vqtbx1_p8",
+ "vqtbx1q_p8",
+ "vqtbl2_s8",
+ "vqtbl2q_s8",
+ "vqtbl2_u8",
+ "vqtbl2q_u8",
+ "vqtbl2_p8",
+ "vqtbl2q_p8",
+ "vqtbx2_s8",
+ "vqtbx2q_s8",
+ "vqtbx2_u8",
+ "vqtbx2q_u8",
+ "vqtbx2_p8",
+ "vqtbx2q_p8",
+ "vqtbl3_s8",
+ "vqtbl3q_s8",
+ "vqtbl3_u8",
+ "vqtbl3q_u8",
+ "vqtbl3_p8",
+ "vqtbl3q_p8",
+ "vqtbx3_s8",
+ "vqtbx3q_s8",
+ "vqtbx3_u8",
+ "vqtbx3q_u8",
+ "vqtbx3_p8",
+ "vqtbx3q_p8",
+ "vqtbl4_s8",
+ "vqtbl4q_s8",
+ "vqtbl4_u8",
+ "vqtbl4q_u8",
+ "vqtbl4_p8",
+ "vqtbl4q_p8",
+ "vqtbx4_s8",
+ "vqtbx4q_s8",
+ "vqtbx4_u8",
+ "vqtbx4q_u8",
+ "vqtbx4_p8",
+ "vqtbx4q_p8",
+ "brk",
+ "_rev_u64",
+ "_clz_u64",
+ "_rbit_u64",
+ "_cls_u32",
+ "_cls_u64",
+ "_prefetch",
+ "vsli_n_s8",
+ "vsliq_n_s8",
+ "vsli_n_s16",
+ "vsliq_n_s16",
+ "vsli_n_s32",
+ "vsliq_n_s32",
+ "vsli_n_s64",
+ "vsliq_n_s64",
+ "vsli_n_u8",
+ "vsliq_n_u8",
+ "vsli_n_u16",
+ "vsliq_n_u16",
+ "vsli_n_u32",
+ "vsliq_n_u32",
+ "vsli_n_u64",
+ "vsliq_n_u64",
+ "vsli_n_p8",
+ "vsliq_n_p8",
+ "vsli_n_p16",
+ "vsliq_n_p16",
+ "vsli_n_p64",
+ "vsliq_n_p64",
+ "vsri_n_s8",
+ "vsriq_n_s8",
+ "vsri_n_s16",
+ "vsriq_n_s16",
+ "vsri_n_s32",
+ "vsriq_n_s32",
+ "vsri_n_s64",
+ "vsriq_n_s64",
+ "vsri_n_u8",
+ "vsriq_n_u8",
+ "vsri_n_u16",
+ "vsriq_n_u16",
+ "vsri_n_u32",
+ "vsriq_n_u32",
+ "vsri_n_u64",
+ "vsriq_n_u64",
+ "vsri_n_p8",
+ "vsriq_n_p8",
+ "vsri_n_p16",
+ "vsriq_n_p16",
+ "vsri_n_p64",
+ "vsriq_n_p64",
+ "__smulbb",
+ "__smultb",
+ "__smulbt",
+ "__smultt",
+ "__smulwb",
+ "__smulwt",
+ "__qadd",
+ "__qsub",
+ "__qdbl",
+ "__smlabb",
+ "__smlabt",
+ "__smlatb",
+ "__smlatt",
+ "__smlawb",
+ "__smlawt",
+ "__qadd8",
+ "__qsub8",
+ "__qsub16",
+ "__qadd16",
+ "__qasx",
+ "__qsax",
+ "__sadd16",
+ "__sadd8",
+ "__smlad",
+ "__smlsd",
+ "__sasx",
+ "__sel",
+ "__shadd8",
+ "__shadd16",
+ "__shsub8",
+ "__usub8",
+ "__ssub8",
+ "__shsub16",
+ "__smuad",
+ "__smuadx",
+ "__smusd",
+ "__smusdx",
+ "__usad8",
+ "__usada8",
+ "__ldrex",
+ "__strex",
+ "__ldrexb",
+ "__strexb",
+ "__ldrexh",
+ "__strexh",
+ "__clrex",
+ "__dbg",
+ ];
+ if !skip.contains(&rust.name) {
+ println!(
+ "missing run-time test named `test_{}` for `{}`",
+ {
+ let mut id = rust.name;
+ while id.starts_with('_') {
+ id = &id[1..];
+ }
+ id
+ },
+ rust.name
+ );
+ all_valid = false;
+ }
+ }
+
+ // Skip some intrinsics that aren't NEON; they are documented in
+ // different places than the whitelists below cover.
+ match rust.name {
+ "brk" | "__breakpoint" | "udf" | "_prefetch" => continue,
+ _ => {}
+ }
+ // Skip some intrinsics that are present in GCC and Clang but
+ // are missing from the official documentation.
+ let skip_intrinsic_verify = [
+ "vmov_n_p64",
+ "vmovq_n_p64",
+ "vreinterpret_p64_s64",
+ "vreinterpret_f32_p64",
+ "vreinterpretq_f32_p64",
+ "vreinterpretq_p64_p128",
+ "vreinterpretq_p128_p64",
+ "vreinterpretq_f32_p128",
+ "vqrdmlahh_s16",
+ "vqrdmlahs_s32",
+ "vqrdmlahh_lane_s16",
+ "vqrdmlahh_laneq_s16",
+ "vqrdmlahs_lane_s32",
+ "vqrdmlahs_laneq_s32",
+ "vqrdmlah_s16",
+ "vqrdmlah_s32",
+ "vqrdmlahq_s16",
+ "vqrdmlahq_s32",
+ "vqrdmlah_lane_s16",
+ "vqrdmlah_laneq_s16",
+ "vqrdmlahq_lane_s16",
+ "vqrdmlahq_laneq_s16",
+ "vqrdmlah_lane_s32",
+ "vqrdmlah_laneq_s32",
+ "vqrdmlahq_lane_s32",
+ "vqrdmlahq_laneq_s32",
+ "vqrdmlshh_s16",
+ "vqrdmlshs_s32",
+ "vqrdmlshh_lane_s16",
+ "vqrdmlshh_laneq_s16",
+ "vqrdmlshs_lane_s32",
+ "vqrdmlshs_laneq_s32",
+ "vqrdmlsh_s16",
+ "vqrdmlshq_s16",
+ "vqrdmlsh_s32",
+ "vqrdmlshq_s32",
+ "vqrdmlsh_lane_s16",
+ "vqrdmlsh_laneq_s16",
+ "vqrdmlshq_lane_s16",
+ "vqrdmlshq_laneq_s16",
+ "vqrdmlsh_lane_s32",
+ "vqrdmlsh_laneq_s32",
+ "vqrdmlshq_lane_s32",
+ "vqrdmlshq_laneq_s32",
+ "vcadd_rot270_f32",
+ "vcadd_rot90_f32",
+ "vcaddq_rot270_f32",
+ "vcaddq_rot270_f64",
+ "vcaddq_rot90_f32",
+ "vcaddq_rot90_f64",
+ "vcmla_f32",
+ "vcmlaq_f32",
+ "vcmlaq_f64",
+ "vcmla_rot90_f32",
+ "vcmlaq_rot90_f32",
+ "vcmlaq_rot90_f64",
+ "vcmla_rot180_f32",
+ "vcmlaq_rot180_f32",
+ "vcmlaq_rot180_f64",
+ "vcmla_rot270_f32",
+ "vcmlaq_rot270_f32",
+ "vcmlaq_rot270_f64",
+ "vcmla_lane_f32",
+ "vcmla_laneq_f32",
+ "vcmlaq_lane_f32",
+ "vcmlaq_laneq_f32",
+ "vcmla_rot90_lane_f32",
+ "vcmla_rot90_laneq_f32",
+ "vcmlaq_rot90_lane_f32",
+ "vcmlaq_rot90_laneq_f32",
+ "vcmla_rot180_lane_f32",
+ "vcmla_rot180_laneq_f32",
+ "vcmlaq_rot180_lane_f32",
+ "vcmlaq_rot180_laneq_f32",
+ "vcmla_rot270_lane_f32",
+ "vcmla_rot270_laneq_f32",
+ "vcmlaq_rot270_lane_f32",
+ "vcmlaq_rot270_laneq_f32",
+ "vdot_s32",
+ "vdot_u32",
+ "vdotq_s32",
+ "vdotq_u32",
+ "vdot_lane_s32",
+ "vdot_laneq_s32",
+ "vdotq_lane_s32",
+ "vdotq_laneq_s32",
+ "vdot_lane_u32",
+ "vdot_laneq_u32",
+ "vdotq_lane_u32",
+ "vdotq_laneq_u32",
+ "vbcaxq_s8",
+ "vbcaxq_s16",
+ "vbcaxq_s32",
+ "vbcaxq_s64",
+ "vbcaxq_u8",
+ "vbcaxq_u16",
+ "vbcaxq_u32",
+ "vbcaxq_u64",
+ "veor3q_s8",
+ "veor3q_s16",
+ "veor3q_s32",
+ "veor3q_s64",
+ "veor3q_u8",
+ "veor3q_u16",
+ "veor3q_u32",
+ "veor3q_u64",
+ "vadd_p8",
+ "vadd_p16",
+ "vadd_p64",
+ "vaddq_p8",
+ "vaddq_p16",
+ "vaddq_p64",
+ "vaddq_p128",
+ "vsm4ekeyq_u32",
+ "vsm4eq_u32",
+ "vmmlaq_s32",
+ "vmmlaq_u32",
+ "vusmmlaq_s32",
+ "vsm3partw1q_u32",
+ "vsm3partw2q_u32",
+ "vsm3ss1q_u32",
+ "vsm3tt1aq_u32",
+ "vsm3tt1bq_u32",
+ "vsm3tt2aq_u32",
+ "vsm3tt2bq_u32",
+ "vrax1q_u64",
+ "vxarq_u64",
+ "vsha512hq_u64",
+ "vsha512h2q_u64",
+ "vsha512su0q_u64",
+ "vsha512su1q_u64",
+ "vrnd32x_f32",
+ "vrnd32xq_f32",
+ "vrnd32z_f32",
+ "vrnd32zq_f32",
+ "vrnd64x_f32",
+ "vrnd64xq_f32",
+ "vrnd64z_f32",
+ "vrnd64zq_f32",
+ "vcls_u8",
+ "vcls_u16",
+ "vcls_u32",
+ "vclsq_u8",
+ "vclsq_u16",
+ "vclsq_u32",
+ "vtst_p16",
+ "vtstq_p16",
+ "__dbg",
+ ];
+ let arm = match map.get(rust.name) {
+ Some(i) => i,
+ None => {
+ // Skip all these intrinsics as they're not listed in the NEON
+ // descriptions online.
+ //
+ // TODO: these intrinsics still need to be verified against some
+ // reference; it's not yet clear where to find one.
+ if !rust.file.ends_with("dsp.rs\"")
+ && !rust.file.ends_with("simd32.rs\"")
+ && !rust.file.ends_with("cmsis.rs\"")
+ && !rust.file.ends_with("v6.rs\"")
+ && !rust.file.ends_with("v7.rs\"")
+ && !rust.file.ends_with("v8.rs\"")
+ && !rust.file.ends_with("tme.rs\"")
+ && !rust.file.ends_with("ex.rs\"")
+ && !skip_intrinsic_verify.contains(&rust.name)
+ {
+ println!(
+ "missing arm definition for {:?} in {}",
+ rust.name, rust.file
+ );
+ all_valid = false;
+ }
+ continue;
+ }
+ };
+
+ if let Err(e) = matches(rust, arm) {
+ println!("failed to verify `{}`", rust.name);
+ println!(" * {}", e);
+ all_valid = false;
+ }
+ }
+ assert!(all_valid);
+}
+
+fn matches(rust: &Function, arm: &Intrinsic) -> Result<(), String> {
+ if rust.ret != arm.ret.as_ref() {
+ bail!("mismatched return value")
+ }
+ if rust.arguments.len() != arm.arguments.len() {
+ bail!("mismatched argument lengths");
+ }
+
+ let mut nconst = 0;
+ let iter = rust.arguments.iter().zip(&arm.arguments).enumerate();
+ for (i, (rust_ty, (arm, arm_const))) in iter {
+ if *rust_ty != arm {
+ bail!("mismatched arguments")
+ }
+ if *arm_const {
+ nconst += 1;
+ if !rust.required_const.contains(&i) {
+ bail!("argument const mismatch");
+ }
+ }
+ }
+ if nconst != rust.required_const.len() {
+ bail!("wrong number of const arguments");
+ }
+
+ if rust.instrs.is_empty() {
+ bail!(
+ "instruction not listed for `{}`, but arm lists {:?}",
+ rust.name,
+ arm.instruction
+ );
+ } else if false {
+ // The instruction check below is not fully reliable, but it can be
+ // enabled by hand (flip `false` to `true`) for manual spot checks.
+ for instr in rust.instrs {
+ if arm.instruction.starts_with(instr) {
+ continue;
+ }
+ // sometimes arm says `foo` and disassemblers say `vfoo`, or
+ // sometimes disassemblers say `vfoo` and arm says `sfoo` or `ffoo`
+ if instr.starts_with('v')
+ && (arm.instruction.starts_with(&instr[1..])
+ || arm.instruction[1..].starts_with(&instr[1..]))
+ {
+ continue;
+ }
+ bail!(
+ "arm failed to list `{}` as an instruction for `{}` in {:?}",
+ instr,
+ rust.name,
+ arm.instruction,
+ );
+ }
+ }
+
+ // TODO: verify `target_feature`.
+
+ Ok(())
+}
+
+fn find_accordion(node: &Rc<Node>) -> Option<Rc<Node>> {
+ if let NodeData::Element { attrs, .. } = &node.data {
+ for attr in attrs.borrow().iter() {
+ if attr.name.local.eq_str_ignore_ascii_case("class")
+ && attr.value.to_string() == "intrinsic-accordion"
+ {
+ return Some(node.clone());
+ }
+ }
+ }
+
+ node.children
+ .borrow()
+ .iter()
+ .filter_map(|node| find_accordion(node))
+ .next()
+}
+
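+// One intrinsic signature as scraped from the ARM documentation; the
+// `bool` in `arguments` is `true` for parameters documented as `const`.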
+#[derive(PartialEq)]
+struct Intrinsic {
+ name: String,
+ ret: Option<Type>,
+ arguments: Vec<(Type, bool)>,
+ instruction: String,
+}
+
+fn parse_intrinsics(node: &Rc<Node>) -> HashMap<String, Intrinsic> {
+ let mut ret = HashMap::new();
+ for child in node.children.borrow().iter() {
+ if let NodeData::Element { .. } = child.data {
+ let f = parse_intrinsic(child);
+ ret.insert(f.name.clone(), f);
+ }
+ }
+ ret
+}
+
+fn parse_intrinsic(node: &Rc<Node>) -> Intrinsic {
+ // <div class='intrinsic'>
+ // <input>...</input>
+ // <label for=$name>
+ // <div>
+ // $signature...
+ // <article>
+ // ...
+
+ let children = node.children.borrow();
+ let mut children = children
+ .iter()
+ .filter(|node| matches!(node.data, NodeData::Element { .. }));
+ let _input = children.next().expect("no <input>");
+ let label = children.next().expect("no <label>");
+ let article = children.next().expect("no <article>");
+ assert!(children.next().is_none());
+
+ // Find `for="..."` in `<label>`
+ let name = match &label.data {
+ NodeData::Element { attrs, .. } => attrs
+ .borrow()
+ .iter()
+ .filter(|attr| attr.name.local.eq_str_ignore_ascii_case("for"))
+ .map(|attr| attr.value.to_string())
+ .next()
+ .expect("no `for` attribute"),
+ _ => panic!(),
+ };
+
+ // Find contents of inner `<div>` in `<label>`
+ let label_children = label.children.borrow();
+ let mut label_children = label_children
+ .iter()
+ .filter(|node| matches!(node.data, NodeData::Element { .. }));
+ let label_div = label_children.next().expect("no <div> in <label>");
+ assert!(label_children.next().is_none());
+ let text = label_div.children.borrow();
+ let mut text = text.iter().filter_map(|node| match &node.data {
+ NodeData::Text { contents } => Some(contents.borrow().to_string()),
+ _ => None,
+ });
+ let ret = text.next().unwrap();
+ let ret = ret.trim();
+ let args = text.next().unwrap();
+ let args = args.trim();
+ assert!(text.next().is_none());
+
+ // Find the instruction within the article
+ let article_children = article.children.borrow();
+ let mut article_children = article_children
+ .iter()
+ .filter(|node| matches!(node.data, NodeData::Element { .. }));
+ let mut instruction = None;
+ while let Some(child) = article_children.next() {
+ let mut header = String::new();
+ collect_text(&mut header, child);
+ if !header.ends_with(" Instruction") {
+ continue;
+ }
+ let next = article_children.next().expect("no next child");
+ assert!(instruction.is_none());
+ let mut instr = String::new();
+ collect_text(&mut instr, &next);
+ instruction = Some(instr);
+ }
+
+ let instruction = match instruction {
+ Some(s) => s.trim().to_lowercase(),
+ None => panic!("can't find instruction for `{}`", name),
+ };
+
+ Intrinsic {
+ name,
+ ret: if ret == "void" {
+ None
+ } else {
+ Some(parse_ty(ret))
+ },
+ instruction,
+ arguments: args // "(...)"
+ .trim_start_matches('(') // "...)"
+ .trim_end_matches(')') // "..."
+ .split(',') // " Type name ", ".."
+ .map(|s| s.trim()) // "Type name"
+ .map(|s| s.rsplitn(2, ' ').nth(1).unwrap()) // "Type"
+ .map(|s| {
+ let const_ = "const ";
+ if s.starts_with(const_) {
+ (parse_ty(&s[const_.len()..]), true)
+ } else {
+ (parse_ty(s), false)
+ }
+ })
+ .collect(),
+ }
+}
+
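+// Peels pointer qualifiers off a documented type string: e.g.
+// "int8_t const *" parses to `ConstPtr(&I8)` and "uint32_t *" to
+// `MutPtr(&U32)`; bare names are resolved by `parse_ty_base` below.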
+fn parse_ty(s: &str) -> Type {
+ let suffix = " const *";
+ if s.ends_with(suffix) {
+ Type::ConstPtr(parse_ty_base(&s[..s.len() - suffix.len()]))
+ } else if s.ends_with(" *") {
+ Type::MutPtr(parse_ty_base(&s[..s.len() - 2]))
+ } else {
+ *parse_ty_base(s)
+ }
+}
+
+fn parse_ty_base(s: &str) -> &'static Type {
+ match s {
+ "float16_t" => &F16,
+ "float16x4_t" => &F16X4,
+ "float16x4x2_t" => &F16X4X2,
+ "float16x4x3_t" => &F16X4X3,
+ "float16x4x4_t" => &F16X4X4,
+ "float16x8_t" => &F16X8,
+ "float16x8x2_t" => &F16X8X2,
+ "float16x8x3_t" => &F16X8X3,
+ "float16x8x4_t" => &F16X8X4,
+ "float32_t" => &F32,
+ "float32x2_t" => &F32X2,
+ "float32x2x2_t" => &F32X2X2,
+ "float32x2x3_t" => &F32X2X3,
+ "float32x2x4_t" => &F32X2X4,
+ "float32x4_t" => &F32X4,
+ "float32x4x2_t" => &F32X4X2,
+ "float32x4x3_t" => &F32X4X3,
+ "float32x4x4_t" => &F32X4X4,
+ "float64_t" => &F64,
+ "float64x1_t" => &F64X1,
+ "float64x1x2_t" => &F64X1X2,
+ "float64x1x3_t" => &F64X1X3,
+ "float64x1x4_t" => &F64X1X4,
+ "float64x2_t" => &F64X2,
+ "float64x2x2_t" => &F64X2X2,
+ "float64x2x3_t" => &F64X2X3,
+ "float64x2x4_t" => &F64X2X4,
+ "int16_t" => &I16,
+ "int16x2_t" => &I16X2,
+ "int16x4_t" => &I16X4,
+ "int16x4x2_t" => &I16X4X2,
+ "int16x4x3_t" => &I16X4X3,
+ "int16x4x4_t" => &I16X4X4,
+ "int16x8_t" => &I16X8,
+ "int16x8x2_t" => &I16X8X2,
+ "int16x8x3_t" => &I16X8X3,
+ "int16x8x4_t" => &I16X8X4,
+ "int32_t" | "int" => &I32,
+ "int32x2_t" => &I32X2,
+ "int32x2x2_t" => &I32X2X2,
+ "int32x2x3_t" => &I32X2X3,
+ "int32x2x4_t" => &I32X2X4,
+ "int32x4_t" => &I32X4,
+ "int32x4x2_t" => &I32X4X2,
+ "int32x4x3_t" => &I32X4X3,
+ "int32x4x4_t" => &I32X4X4,
+ "int64_t" => &I64,
+ "int64x1_t" => &I64X1,
+ "int64x1x2_t" => &I64X1X2,
+ "int64x1x3_t" => &I64X1X3,
+ "int64x1x4_t" => &I64X1X4,
+ "int64x2_t" => &I64X2,
+ "int64x2x2_t" => &I64X2X2,
+ "int64x2x3_t" => &I64X2X3,
+ "int64x2x4_t" => &I64X2X4,
+ "int8_t" => &I8,
+ "int8x16_t" => &I8X16,
+ "int8x16x2_t" => &I8X16X2,
+ "int8x16x3_t" => &I8X16X3,
+ "int8x16x4_t" => &I8X16X4,
+ "int8x4_t" => &I8X4,
+ "int8x8_t" => &I8X8,
+ "int8x8x2_t" => &I8X8X2,
+ "int8x8x3_t" => &I8X8X3,
+ "int8x8x4_t" => &I8X8X4,
+ "poly128_t" => &P128,
+ "poly16_t" => &P16,
+ "poly16x4_t" => &POLY16X4,
+ "poly16x4x2_t" => &P16X4X2,
+ "poly16x4x3_t" => &P16X4X3,
+ "poly16x4x4_t" => &P16X4X4,
+ "poly16x8_t" => &POLY16X8,
+ "poly16x8x2_t" => &P16X8X2,
+ "poly16x8x3_t" => &P16X8X3,
+ "poly16x8x4_t" => &P16X8X4,
+ "poly64_t" => &P64,
+ "poly64x1_t" => &POLY64X1,
+ "poly64x1x2_t" => &P64X1X2,
+ "poly64x1x3_t" => &P64X1X3,
+ "poly64x1x4_t" => &P64X1X4,
+ "poly64x2_t" => &POLY64X2,
+ "poly64x2x2_t" => &P64X2X2,
+ "poly64x2x3_t" => &P64X2X3,
+ "poly64x2x4_t" => &P64X2X4,
+ "poly8_t" => &P8,
+ "poly8x16_t" => &POLY8X16,
+ "poly8x16x2_t" => &POLY8X16X2,
+ "poly8x16x3_t" => &POLY8X16X3,
+ "poly8x16x4_t" => &POLY8X16X4,
+ "poly8x8_t" => &POLY8X8,
+ "poly8x8x2_t" => &POLY8X8X2,
+ "poly8x8x3_t" => &POLY8X8X3,
+ "poly8x8x4_t" => &POLY8X8X4,
+ "uint16_t" => &U16,
+ "uint16x4_t" => &U16X4,
+ "uint16x4x2_t" => &U16X4X2,
+ "uint16x4x3_t" => &U16X4X3,
+ "uint16x4x4_t" => &U16X4X4,
+ "uint16x8_t" => &U16X8,
+ "uint16x8x2_t" => &U16X8X2,
+ "uint16x8x3_t" => &U16X8X3,
+ "uint16x8x4_t" => &U16X8X4,
+ "uint32_t" => &U32,
+ "uint32x2_t" => &U32X2,
+ "uint32x2x2_t" => &U32X2X2,
+ "uint32x2x3_t" => &U32X2X3,
+ "uint32x2x4_t" => &U32X2X4,
+ "uint32x4_t" => &U32X4,
+ "uint32x4x2_t" => &U32X4X2,
+ "uint32x4x3_t" => &U32X4X3,
+ "uint32x4x4_t" => &U32X4X4,
+ "uint64_t" => &U64,
+ "uint64x1_t" => &U64X1,
+ "uint64x1x2_t" => &U64X1X2,
+ "uint64x1x3_t" => &U64X1X3,
+ "uint64x1x4_t" => &U64X1X4,
+ "uint64x2_t" => &U64X2,
+ "uint64x2x2_t" => &U64X2X2,
+ "uint64x2x3_t" => &U64X2X3,
+ "uint64x2x4_t" => &U64X2X4,
+ "uint8_t" => &U8,
+ "uint8x16_t" => &U8X16,
+ "uint8x16x2_t" => &U8X16X2,
+ "uint8x16x3_t" => &U8X16X3,
+ "uint8x16x4_t" => &U8X16X4,
+ "uint8x8_t" => &U8X8,
+ "uint8x8x2_t" => &U8X8X2,
+ "uint8x8x3_t" => &U8X8X3,
+ "uint8x8x4_t" => &U8X8X4,
+
+ _ => panic!("failed to parse html type {:?}", s),
+ }
+}
+
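+/// Recursively collects the text contents of `node` and all of its
+/// descendants into `s`, prefixing each text node with a space.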
+fn collect_text(s: &mut String, node: &Node) {
+ if let NodeData::Text { contents } = &node.data {
+ s.push(' ');
+ s.push_str(&contents.borrow().to_string());
+ }
+ for child in node.children.borrow().iter() {
+ collect_text(s, child);
+ }
+}
diff --git a/library/stdarch/crates/stdarch-verify/tests/mips.rs b/library/stdarch/crates/stdarch-verify/tests/mips.rs
new file mode 100644
index 000000000..1eb86dc29
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/tests/mips.rs
@@ -0,0 +1,366 @@
+//! Verification of MIPS MSA intrinsics
+#![allow(bad_style, unused)]
+
+// This file is obtained from
+// https://gcc.gnu.org/onlinedocs//gcc/MIPS-SIMD-Architecture-Built-in-Functions.html
+static HEADER: &str = include_str!("../mips-msa.h");
+
+stdarch_verify::mips_functions!(static FUNCTIONS);
+
+struct Function {
+ name: &'static str,
+ arguments: &'static [&'static Type],
+ ret: Option<&'static Type>,
+ target_feature: Option<&'static str>,
+ instrs: &'static [&'static str],
+ file: &'static str,
+ required_const: &'static [usize],
+ has_test: bool,
+}
+
+static F16: Type = Type::PrimFloat(16);
+static F32: Type = Type::PrimFloat(32);
+static F64: Type = Type::PrimFloat(64);
+static I8: Type = Type::PrimSigned(8);
+static I16: Type = Type::PrimSigned(16);
+static I32: Type = Type::PrimSigned(32);
+static I64: Type = Type::PrimSigned(64);
+static U8: Type = Type::PrimUnsigned(8);
+static U16: Type = Type::PrimUnsigned(16);
+static U32: Type = Type::PrimUnsigned(32);
+static U64: Type = Type::PrimUnsigned(64);
+static NEVER: Type = Type::Never;
+static TUPLE: Type = Type::Tuple;
+static v16i8: Type = Type::I(8, 16, 1);
+static v8i16: Type = Type::I(16, 8, 1);
+static v4i32: Type = Type::I(32, 4, 1);
+static v2i64: Type = Type::I(64, 2, 1);
+static v16u8: Type = Type::U(8, 16, 1);
+static v8u16: Type = Type::U(16, 8, 1);
+static v4u32: Type = Type::U(32, 4, 1);
+static v2u64: Type = Type::U(64, 2, 1);
+static v8f16: Type = Type::F(16, 8, 1);
+static v4f32: Type = Type::F(32, 4, 1);
+static v2f64: Type = Type::F(64, 2, 1);
+
+#[derive(Debug, Copy, Clone, PartialEq)]
+enum Type {
+ PrimFloat(u8),
+ PrimSigned(u8),
+ PrimUnsigned(u8),
+ PrimPoly(u8),
+ MutPtr(&'static Type),
+ ConstPtr(&'static Type),
+ Tuple,
+ I(u8, u8, u8),
+ U(u8, u8, u8),
+ P(u8, u8, u8),
+ F(u8, u8, u8),
+ Never,
+}
+
+#[derive(Copy, Clone, Debug, PartialEq)]
+#[allow(non_camel_case_types)]
+enum MsaTy {
+ v16i8,
+ v8i16,
+ v4i32,
+ v2i64,
+ v16u8,
+ v8u16,
+ v4u32,
+ v2u64,
+ v8f16,
+ v4f32,
+ v2f64,
+ imm0_1,
+ imm0_3,
+ imm0_7,
+ imm0_15,
+ imm0_31,
+ imm0_63,
+ imm0_255,
+ imm_n16_15,
+ imm_n512_511,
+ imm_n1024_1022,
+ imm_n2048_2044,
+ imm_n4096_4088,
+ i32,
+ u32,
+ i64,
+ u64,
+ Void,
+ MutVoidPtr,
+}
+
+impl<'a> From<&'a str> for MsaTy {
+ fn from(s: &'a str) -> MsaTy {
+ match s {
+ "v16i8" => MsaTy::v16i8,
+ "v8i16" => MsaTy::v8i16,
+ "v4i32" => MsaTy::v4i32,
+ "v2i64" => MsaTy::v2i64,
+ "v16u8" => MsaTy::v16u8,
+ "v8u16" => MsaTy::v8u16,
+ "v4u32" => MsaTy::v4u32,
+ "v2u64" => MsaTy::v2u64,
+ "v8f16" => MsaTy::v8f16,
+ "v4f32" => MsaTy::v4f32,
+ "v2f64" => MsaTy::v2f64,
+ "imm0_1" => MsaTy::imm0_1,
+ "imm0_3" => MsaTy::imm0_3,
+ "imm0_7" => MsaTy::imm0_7,
+ "imm0_15" => MsaTy::imm0_15,
+ "imm0_31" => MsaTy::imm0_31,
+ "imm0_63" => MsaTy::imm0_63,
+ "imm0_255" => MsaTy::imm0_255,
+ "imm_n16_15" => MsaTy::imm_n16_15,
+ "imm_n512_511" => MsaTy::imm_n512_511,
+ "imm_n1024_1022" => MsaTy::imm_n1024_1022,
+ "imm_n2048_2044" => MsaTy::imm_n2048_2044,
+ "imm_n4096_4088" => MsaTy::imm_n4096_4088,
+ "i32" => MsaTy::i32,
+ "u32" => MsaTy::u32,
+ "i64" => MsaTy::i64,
+ "u64" => MsaTy::u64,
+ "void" => MsaTy::Void,
+ "void *" => MsaTy::MutVoidPtr,
+ v => panic!("unknown ty: \"{}\"", v),
+ }
+ }
+}
+
+#[derive(Debug, Clone)]
+struct MsaIntrinsic {
+ id: String,
+ arg_tys: Vec<MsaTy>,
+ ret_ty: MsaTy,
+ instruction: String,
+}
+
+struct NoneError;
+
+impl std::convert::TryFrom<&'static str> for MsaIntrinsic {
+ // The intrinsics are just C function declarations of the form:
+ // $ret_ty __builtin_${fn_id}($($arg_ty),*);
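+ // e.g. `v16i8 __builtin_msa_add_a_b (v16i8, v16i8);`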
+ type Error = NoneError;
+ fn try_from(line: &'static str) -> Result<Self, Self::Error> {
+ return inner(line).ok_or(NoneError);
+
+ fn inner(line: &'static str) -> Option<MsaIntrinsic> {
+ let first_whitespace = line.find(char::is_whitespace)?;
+ let ret_ty = &line[0..first_whitespace];
+ let ret_ty = MsaTy::from(ret_ty);
+
+ let first_parentheses = line.find('(')?;
+ assert!(first_parentheses > first_whitespace);
+ let id = line[first_whitespace + 1..first_parentheses].trim();
+ assert!(id.starts_with("__builtin"));
+ let mut id_str = "_".to_string();
+ id_str += &id[9..];
+ let id = id_str;
+
+ let mut arg_tys = Vec::new();
+
+ let last_parentheses = line.find(')')?;
+ for arg in (&line[first_parentheses + 1..last_parentheses]).split(',') {
+ let arg = arg.trim();
+ arg_tys.push(MsaTy::from(arg));
+ }
+
+ // The instruction is the intrinsic name without the __msa_ prefix.
+ let instruction = &id[6..];
+ let mut instruction = instruction.to_string();
+ // With all underscores but the first one replaced with a `.`
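+ // e.g. "add_a_b" becomes "add_a.b".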
+ if let Some(first_underscore) = instruction.find('_') {
+ let postfix = instruction[first_underscore + 1..].replace('_', ".");
+ instruction = instruction[0..=first_underscore].to_string();
+ instruction += &postfix;
+ }
+
+ Some(MsaIntrinsic {
+ id,
+ ret_ty,
+ arg_tys,
+ instruction,
+ })
+ }
+ }
+}
+
+#[test]
+fn verify_all_signatures() {
+ // Parse the C intrinsic header file:
+ let mut intrinsics = std::collections::HashMap::<String, MsaIntrinsic>::new();
+ for line in HEADER.lines() {
+ if line.is_empty() {
+ continue;
+ }
+
+ use std::convert::TryFrom;
+ let intrinsic: MsaIntrinsic = TryFrom::try_from(line)
+ .unwrap_or_else(|_| panic!("failed to parse line: \"{}\"", line));
+ assert!(!intrinsics.contains_key(&intrinsic.id));
+ intrinsics.insert(intrinsic.id.clone(), intrinsic);
+ }
+
+ let mut all_valid = true;
+ for rust in FUNCTIONS {
+ if !rust.has_test {
+ let skip = [
+ "__msa_ceqi_d",
+ "__msa_cfcmsa",
+ "__msa_clei_s_d",
+ "__msa_clti_s_d",
+ "__msa_ctcmsa",
+ "__msa_ldi_d",
+ "__msa_maxi_s_d",
+ "__msa_mini_s_d",
+ "break_",
+ ];
+ if !skip.contains(&rust.name) {
+ println!(
+ "missing run-time test named `test_{}` for `{}`",
+ {
+ let mut id = rust.name;
+ while id.starts_with('_') {
+ id = &id[1..];
+ }
+ id
+ },
+ rust.name
+ );
+ all_valid = false;
+ }
+ }
+
+ // Skip some intrinsics that aren't part of MSA
+ match rust.name {
+ "break_" => continue,
+ _ => {}
+ }
+ let mips = match intrinsics.get(rust.name) {
+ Some(i) => i,
+ None => {
+ eprintln!(
+ "missing mips definition for {:?} in {}",
+ rust.name, rust.file
+ );
+ all_valid = false;
+ continue;
+ }
+ };
+
+ if let Err(e) = matches(rust, mips) {
+ println!("failed to verify `{}`", rust.name);
+ println!(" * {}", e);
+ all_valid = false;
+ }
+ }
+ assert!(all_valid);
+}
+
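+/// Checks that the Rust intrinsic `rust` matches the corresponding MSA
+/// intrinsic `mips`: argument and return types, const arguments, the
+/// `target_feature` annotation, and the asserted instruction.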
+fn matches(rust: &Function, mips: &MsaIntrinsic) -> Result<(), String> {
+ macro_rules! bail {
+ ($($t:tt)*) => (return Err(format!($($t)*)))
+ }
+
+ if rust.ret.is_none() && mips.ret_ty != MsaTy::Void {
+ bail!("mismatched return value")
+ }
+
+ if rust.arguments.len() != mips.arg_tys.len() {
+ bail!("mismatched argument lengths");
+ }
+
+ let mut nconst = 0;
+ for (i, (rust_arg, mips_arg)) in rust.arguments.iter().zip(mips.arg_tys.iter()).enumerate() {
+ match mips_arg {
+ MsaTy::v16i8 if **rust_arg == v16i8 => (),
+ MsaTy::v8i16 if **rust_arg == v8i16 => (),
+ MsaTy::v4i32 if **rust_arg == v4i32 => (),
+ MsaTy::v2i64 if **rust_arg == v2i64 => (),
+ MsaTy::v16u8 if **rust_arg == v16u8 => (),
+ MsaTy::v8u16 if **rust_arg == v8u16 => (),
+ MsaTy::v4u32 if **rust_arg == v4u32 => (),
+ MsaTy::v2u64 if **rust_arg == v2u64 => (),
+ MsaTy::v4f32 if **rust_arg == v4f32 => (),
+ MsaTy::v2f64 if **rust_arg == v2f64 => (),
+ MsaTy::imm0_1
+ | MsaTy::imm0_3
+ | MsaTy::imm0_7
+ | MsaTy::imm0_15
+ | MsaTy::imm0_31
+ | MsaTy::imm0_63
+ | MsaTy::imm0_255
+ | MsaTy::imm_n16_15
+ | MsaTy::imm_n512_511
+ | MsaTy::imm_n1024_1022
+ | MsaTy::imm_n2048_2044
+ | MsaTy::imm_n4096_4088
+ if **rust_arg == I32 => {}
+ MsaTy::i32 if **rust_arg == I32 => (),
+ MsaTy::i64 if **rust_arg == I64 => (),
+ MsaTy::u32 if **rust_arg == U32 => (),
+ MsaTy::u64 if **rust_arg == U64 => (),
+ MsaTy::MutVoidPtr if **rust_arg == Type::MutPtr(&U8) => (),
+ m => bail!(
+ "mismatched argument \"{}\"= \"{:?}\" != \"{:?}\"",
+ i,
+ m,
+ *rust_arg
+ ),
+ }
+
+ let is_const = matches!(
+ mips_arg,
+ MsaTy::imm0_1
+ | MsaTy::imm0_3
+ | MsaTy::imm0_7
+ | MsaTy::imm0_15
+ | MsaTy::imm0_31
+ | MsaTy::imm0_63
+ | MsaTy::imm0_255
+ | MsaTy::imm_n16_15
+ | MsaTy::imm_n512_511
+ | MsaTy::imm_n1024_1022
+ | MsaTy::imm_n2048_2044
+ | MsaTy::imm_n4096_4088
+ );
+ if is_const {
+ nconst += 1;
+ if !rust.required_const.contains(&i) {
+ bail!("argument const mismatch");
+ }
+ }
+ }
+
+ if nconst != rust.required_const.len() {
+ bail!("wrong number of const arguments");
+ }
+
+ if rust.target_feature != Some("msa") {
+ bail!("wrong target_feature");
+ }
+
+ if !rust.instrs.is_empty() {
+ // Normalize slightly to get rid of assembler differences
+ let actual = rust.instrs[0].replace(".", "_");
+ let expected = mips.instruction.replace(".", "_");
+ if actual != expected {
+ bail!(
+ "wrong instruction: \"{}\" != \"{}\"",
+ rust.instrs[0],
+ mips.instruction
+ );
+ }
+ } else {
+ bail!(
+ "missing assert_instr for \"{}\" (should be \"{}\")",
+ mips.id,
+ mips.instruction
+ );
+ }
+
+ Ok(())
+}
diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
new file mode 100644
index 000000000..89494bfd2
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
@@ -0,0 +1,841 @@
+#![allow(bad_style)]
+#![allow(unused)]
+#![allow(
+ clippy::shadow_reuse,
+ clippy::cast_lossless,
+ clippy::match_same_arms,
+ clippy::nonminimal_bool,
+ clippy::print_stdout,
+ clippy::use_debug,
+ clippy::eq_op,
+ clippy::useless_format
+)]
+
+use std::collections::{BTreeMap, HashMap};
+
+use serde::Deserialize;
+
+const PRINT_INSTRUCTION_VIOLATIONS: bool = false;
+const PRINT_MISSING_LISTS: bool = false;
+const PRINT_MISSING_LISTS_MARKDOWN: bool = false;
+
+struct Function {
+ name: &'static str,
+ arguments: &'static [&'static Type],
+ ret: Option<&'static Type>,
+ target_feature: Option<&'static str>,
+ instrs: &'static [&'static str],
+ file: &'static str,
+ required_const: &'static [usize],
+ has_test: bool,
+}
+
+static F32: Type = Type::PrimFloat(32);
+static F64: Type = Type::PrimFloat(64);
+static I8: Type = Type::PrimSigned(8);
+static I16: Type = Type::PrimSigned(16);
+static I32: Type = Type::PrimSigned(32);
+static I64: Type = Type::PrimSigned(64);
+static U8: Type = Type::PrimUnsigned(8);
+static U16: Type = Type::PrimUnsigned(16);
+static U32: Type = Type::PrimUnsigned(32);
+static U64: Type = Type::PrimUnsigned(64);
+static U128: Type = Type::PrimUnsigned(128);
+static ORDERING: Type = Type::Ordering;
+
+static M64: Type = Type::M64;
+static M128: Type = Type::M128;
+static M128BH: Type = Type::M128BH;
+static M128I: Type = Type::M128I;
+static M128D: Type = Type::M128D;
+static M256: Type = Type::M256;
+static M256BH: Type = Type::M256BH;
+static M256I: Type = Type::M256I;
+static M256D: Type = Type::M256D;
+static M512: Type = Type::M512;
+static M512BH: Type = Type::M512BH;
+static M512I: Type = Type::M512I;
+static M512D: Type = Type::M512D;
+static MMASK8: Type = Type::MMASK8;
+static MMASK16: Type = Type::MMASK16;
+static MMASK32: Type = Type::MMASK32;
+static MMASK64: Type = Type::MMASK64;
+static MM_CMPINT_ENUM: Type = Type::MM_CMPINT_ENUM;
+static MM_MANTISSA_NORM_ENUM: Type = Type::MM_MANTISSA_NORM_ENUM;
+static MM_MANTISSA_SIGN_ENUM: Type = Type::MM_MANTISSA_SIGN_ENUM;
+static MM_PERM_ENUM: Type = Type::MM_PERM_ENUM;
+
+static TUPLE: Type = Type::Tuple;
+static CPUID: Type = Type::CpuidResult;
+static NEVER: Type = Type::Never;
+
+#[derive(Debug)]
+enum Type {
+ PrimFloat(u8),
+ PrimSigned(u8),
+ PrimUnsigned(u8),
+ MutPtr(&'static Type),
+ ConstPtr(&'static Type),
+ M64,
+ M128,
+ M128BH,
+ M128D,
+ M128I,
+ M256,
+ M256BH,
+ M256D,
+ M256I,
+ M512,
+ M512BH,
+ M512D,
+ M512I,
+ MMASK8,
+ MMASK16,
+ MMASK32,
+ MMASK64,
+ MM_CMPINT_ENUM,
+ MM_MANTISSA_NORM_ENUM,
+ MM_MANTISSA_SIGN_ENUM,
+ MM_PERM_ENUM,
+ Tuple,
+ CpuidResult,
+ Never,
+ Ordering,
+}
+
+stdarch_verify::x86_functions!(static FUNCTIONS);
+
+#[derive(Deserialize)]
+struct Data {
+ #[serde(rename = "intrinsic", default)]
+ intrinsics: Vec<Intrinsic>,
+}
+
+#[derive(Deserialize)]
+struct Intrinsic {
+ #[serde(rename = "return")]
+ return_: Return,
+ name: String,
+ #[serde(rename = "CPUID", default)]
+ cpuid: Vec<String>,
+ #[serde(rename = "parameter", default)]
+ parameters: Vec<Parameter>,
+ #[serde(default)]
+ instruction: Vec<Instruction>,
+}
+
+#[derive(Deserialize)]
+struct Parameter {
+ #[serde(rename = "type")]
+ type_: String,
+ #[serde(default)]
+ etype: String,
+}
+
+#[derive(Deserialize)]
+struct Return {
+ #[serde(rename = "type")]
+ type_: String,
+}
+
+#[derive(Deserialize, Debug)]
+struct Instruction {
+ name: String,
+}
+
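+// Returns early from the enclosing function with a formatted `Err(String)`.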
+macro_rules! bail {
+ ($($t:tt)*) => (return Err(format!($($t)*)))
+}
+
+#[test]
+fn verify_all_signatures() {
+ // This XML document was downloaded from Intel's site. To update this you
+ // can visit intel's intrinsics guide online documentation:
+ //
+ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
+ //
+ // Open up the network console and you'll see an xml file was downloaded
+ // (currently called data-3.4.xml). That's the file we downloaded
+ // here.
+ let xml = include_bytes!("../x86-intel.xml");
+
+ let xml = &xml[..];
+ let data: Data = serde_xml_rs::from_reader(xml).expect("failed to deserialize xml");
+ let mut map = HashMap::new();
+ for intrinsic in &data.intrinsics {
+ map.entry(&intrinsic.name[..])
+ .or_insert_with(Vec::new)
+ .push(intrinsic);
+ }
+
+ let mut all_valid = true;
+ 'outer: for rust in FUNCTIONS {
+ if !rust.has_test {
+ // FIXME: this list should be almost empty
+ let skip = [
+ "__readeflags",
+ "__readeflags",
+ "__writeeflags",
+ "__writeeflags",
+ "_mm_comige_ss",
+ "_mm_cvt_ss2si",
+ "_mm_cvtt_ss2si",
+ "_mm_cvt_si2ss",
+ "_mm_set_ps1",
+ "_mm_load_ps1",
+ "_mm_store_ps1",
+ "_mm_getcsr",
+ "_mm_setcsr",
+ "_MM_GET_EXCEPTION_MASK",
+ "_MM_GET_EXCEPTION_STATE",
+ "_MM_GET_FLUSH_ZERO_MODE",
+ "_MM_GET_ROUNDING_MODE",
+ "_MM_SET_EXCEPTION_MASK",
+ "_MM_SET_EXCEPTION_STATE",
+ "_MM_SET_FLUSH_ZERO_MODE",
+ "_MM_SET_ROUNDING_MODE",
+ "_mm_prefetch",
+ "_mm_undefined_ps",
+ "_m_pmaxsw",
+ "_m_pmaxub",
+ "_m_pminsw",
+ "_m_pminub",
+ "_m_pavgb",
+ "_m_pavgw",
+ "_m_psadbw",
+ "_mm_cvt_pi2ps",
+ "_m_maskmovq",
+ "_m_pextrw",
+ "_m_pinsrw",
+ "_m_pmovmskb",
+ "_m_pshufw",
+ "_mm_cvtt_ps2pi",
+ "_mm_cvt_ps2pi",
+ "__cpuid_count",
+ "__cpuid",
+ "__get_cpuid_max",
+ "_xsave",
+ "_xrstor",
+ "_xsetbv",
+ "_xgetbv",
+ "_xsaveopt",
+ "_xsavec",
+ "_xsaves",
+ "_xrstors",
+ "_mm_bslli_si128",
+ "_mm_bsrli_si128",
+ "_mm_undefined_pd",
+ "_mm_undefined_si128",
+ "_mm_cvtps_ph",
+ "_mm256_cvtps_ph",
+ "_rdtsc",
+ "__rdtscp",
+ "_mm256_castps128_ps256",
+ "_mm256_castpd128_pd256",
+ "_mm256_castsi128_si256",
+ "_mm256_undefined_ps",
+ "_mm256_undefined_pd",
+ "_mm256_undefined_si256",
+ "_bextr2_u32",
+ "_mm_tzcnt_32",
+ "_m_paddb",
+ "_m_paddw",
+ "_m_paddd",
+ "_m_paddsb",
+ "_m_paddsw",
+ "_m_paddusb",
+ "_m_paddusw",
+ "_m_psubb",
+ "_m_psubw",
+ "_m_psubd",
+ "_m_psubsb",
+ "_m_psubsw",
+ "_m_psubusb",
+ "_m_psubusw",
+ "_mm_set_pi16",
+ "_mm_set_pi32",
+ "_mm_set_pi8",
+ "_mm_set1_pi16",
+ "_mm_set1_pi32",
+ "_mm_set1_pi8",
+ "_mm_setr_pi16",
+ "_mm_setr_pi32",
+ "_mm_setr_pi8",
+ "ud2",
+ "_mm_min_epi8",
+ "_mm_min_epi32",
+ "_xbegin",
+ "_xend",
+ "_rdrand16_step",
+ "_rdrand32_step",
+ "_rdseed16_step",
+ "_rdseed32_step",
+ "_fxsave",
+ "_fxrstor",
+ "_t1mskc_u64",
+ "_mm256_shuffle_epi32",
+ "_mm256_bslli_epi128",
+ "_mm256_bsrli_epi128",
+ "_mm256_unpackhi_epi8",
+ "_mm256_unpacklo_epi8",
+ "_mm256_unpackhi_epi16",
+ "_mm256_unpacklo_epi16",
+ "_mm256_unpackhi_epi32",
+ "_mm256_unpacklo_epi32",
+ "_mm256_unpackhi_epi64",
+ "_mm256_unpacklo_epi64",
+ "_xsave64",
+ "_xrstor64",
+ "_xsaveopt64",
+ "_xsavec64",
+ "_xsaves64",
+ "_xrstors64",
+ "_mm_cvtsi64x_si128",
+ "_mm_cvtsi128_si64x",
+ "_mm_cvtsi64x_sd",
+ "cmpxchg16b",
+ "_rdrand64_step",
+ "_rdseed64_step",
+ "_bextr2_u64",
+ "_mm_tzcnt_64",
+ "_fxsave64",
+ "_fxrstor64",
+ "_mm512_undefined_ps",
+ "_mm512_undefined_pd",
+ "_mm512_undefined_epi32",
+ "_mm512_undefined",
+ ];
+ if !skip.contains(&rust.name) {
+ println!(
+ "missing run-time test named `test_{}` for `{}`",
+ {
+ let mut id = rust.name;
+ while id.starts_with('_') {
+ id = &id[1..];
+ }
+ id
+ },
+ rust.name
+ );
+ all_valid = false;
+ }
+ }
+
+ match rust.name {
+ // These aren't defined by Intel but they're defined by what appears
+ // to be all other compilers. For more information see
+ // rust-lang/stdarch#307, and otherwise these signatures
+ // have all been manually verified.
+ "__readeflags" |
+ "__writeeflags" |
+ "__cpuid_count" |
+ "__cpuid" |
+ "__get_cpuid_max" |
+ // Not listed with intel, but manually verified
+ "cmpxchg16b" |
+ // The UD2 intrinsic is not defined by Intel, but it was agreed on
+ // in the RFC Issue 2512:
+ // https://github.com/rust-lang/rfcs/issues/2512
+ "ud2"
+ => continue,
+ // Intel requires the mask argument for _mm_shuffle_ps to be an
+ // unsigned integer, but all other _mm_shuffle_.. intrinsics
+ // take a signed integer. This breaks `_MM_SHUFFLE` for
+ // `_mm_shuffle_ps`:
+ "_mm_shuffle_ps" => continue,
+ _ => {}
+ }
+
+ // these are all AMD-specific intrinsics
+ if let Some(feature) = rust.target_feature {
+ if feature.contains("sse4a") || feature.contains("tbm") {
+ continue;
+ }
+ }
+
+ let intel = match map.remove(rust.name) {
+ Some(i) => i,
+ None => panic!("missing intel definition for {}", rust.name),
+ };
+
+ let mut errors = Vec::new();
+ for intel in intel {
+ match matches(rust, intel) {
+ Ok(()) => continue 'outer,
+ Err(e) => errors.push(e),
+ }
+ }
+ println!("failed to verify `{}`", rust.name);
+ for error in errors {
+ println!(" * {}", error);
+ }
+ all_valid = false;
+ }
+ assert!(all_valid);
+
+ let mut missing = BTreeMap::new();
+ for (name, intel) in &map {
+ // currently focused mainly on missing SIMD intrinsics, but there are
+ // definitely other assorted ones that we're missing.
+ if !name.starts_with("_mm") {
+ continue;
+ }
+
+ // we'll get to avx-512 later
+ // let avx512 = intel.iter().any(|i| {
+ // i.name.starts_with("_mm512") || i.cpuid.iter().any(|c| {
+ // c.contains("512")
+ // })
+ // });
+ // if avx512 {
+ // continue
+ // }
+
+ for intel in intel {
+ missing
+ .entry(&intel.cpuid)
+ .or_insert_with(Vec::new)
+ .push(intel);
+ }
+ }
+
+ // generate a bulleted list of missing intrinsics
+ if PRINT_MISSING_LISTS || PRINT_MISSING_LISTS_MARKDOWN {
+ for (k, v) in missing {
+ if PRINT_MISSING_LISTS_MARKDOWN {
+ println!("\n<details><summary>{:?}</summary><p>\n", k);
+ for intel in v {
+ let url = format!(
+ "https://software.intel.com/sites/landingpage\
+ /IntrinsicsGuide/#text={}&expand=5236",
+ intel.name
+ );
+ println!(" * [ ] [`{}`]({})", intel.name, url);
+ }
+ println!("</p></details>\n");
+ } else {
+ println!("\n{:?}\n", k);
+ for intel in v {
+ println!("\t{}", intel.name);
+ }
+ }
+ }
+ }
+}
+
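+/// Checks a Rust intrinsic against one Intel XML definition, returning a
+/// description of the first mismatch found, if any.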
+fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
+ // Verify that all `#[target_feature]` annotations are correct,
+ // ensuring that we've actually enabled the right instruction
+ // set for this intrinsic.
+ match rust.name {
+ "_bswap" | "_bswap64" => {}
+
+ // These don't actually have a target feature unlike their brethren with
+ // the `x` inside the name which requires adx
+ "_addcarry_u32" | "_addcarry_u64" | "_subborrow_u32" | "_subborrow_u64" => {}
+
+ "_bittest"
+ | "_bittestandset"
+ | "_bittestandreset"
+ | "_bittestandcomplement"
+ | "_bittest64"
+ | "_bittestandset64"
+ | "_bittestandreset64"
+ | "_bittestandcomplement64" => {}
+
+ _ => {
+ if intel.cpuid.is_empty() {
+ bail!("missing cpuid for {}", rust.name);
+ }
+ }
+ }
+
+ for cpuid in &intel.cpuid {
+ // The pause intrinsic is in the SSE2 module, but it is backwards
+ // compatible with CPUs without SSE2, and it therefore does not need the
+ // target-feature attribute.
+ if rust.name == "_mm_pause" {
+ continue;
+ }
+ // this is needed by _xsave and probably some related intrinsics,
+ // but let's just skip it for now.
+ if *cpuid == "XSS" {
+ continue;
+ }
+
+ // we don't test for these flags on the rdtsc/rdtscp intrinsics right
+ // now, but we may wish to add them one day!
+ //
+ // For more info see #308
+ if *cpuid == "TSC" || *cpuid == "RDTSCP" {
+ continue;
+ }
+
+ let cpuid = cpuid
+ .chars()
+ .flat_map(|c| c.to_lowercase())
+ .collect::<String>();
+
+ // Fix mismatching feature names:
+ let fixup_cpuid = |cpuid: String| match cpuid.as_ref() {
+ // The XML file names IFMA as "avx512ifma52", while Rust calls
+ // it "avx512ifma".
+ "avx512ifma52" => String::from("avx512ifma"),
+ // The XML file names BITALG as "avx512_bitalg", while Rust calls
+ // it "avx512bitalg".
+ "avx512_bitalg" => String::from("avx512bitalg"),
+ // The XML file names VBMI as "avx512_vbmi", while Rust calls
+ // it "avx512vbmi".
+ "avx512_vbmi" => String::from("avx512vbmi"),
+ // The XML file names VBMI2 as "avx512_vbmi2", while Rust calls
+ // it "avx512vbmi2".
+ "avx512_vbmi2" => String::from("avx512vbmi2"),
+ // The XML file names VNNI as "avx512_vnni", while Rust calls
+ // it "avx512vnni".
+ "avx512_vnni" => String::from("avx512vnni"),
+ // Some AVX512f intrinsics are also supported by Knight's Corner.
+ // The XML lists them as avx512f/kncni, but we are solely gating
+ // them behind avx512f since we don't have a KNC feature yet.
+ "avx512f/kncni" => String::from("avx512f"),
+ // See: https://github.com/rust-lang/stdarch/issues/738
+ // The intrinsics guide calls `f16c` `fp16c` in disagreement with
+ // Intel's architecture manuals.
+ "fp16c" => String::from("f16c"),
+ "avx512_bf16" => String::from("avx512bf16"),
+ // The XML file names VNNI as "avx512_bf16", while Rust calls
+ // it "avx512bf16".
+ _ => cpuid,
+ };
+ let fixed_cpuid = fixup_cpuid(cpuid);
+
+ let rust_feature = rust
+ .target_feature
+ .unwrap_or_else(|| panic!("no target feature listed for {}", rust.name));
+
+ if rust_feature.contains(&fixed_cpuid) {
+ continue;
+ }
+ bail!(
+ "intel cpuid `{}` not in `{}` for {}",
+ fixed_cpuid,
+ rust_feature,
+ rust.name
+ )
+ }
+
+ if PRINT_INSTRUCTION_VIOLATIONS {
+ if rust.instrs.is_empty() {
+ if !intel.instruction.is_empty() {
+ println!(
+ "instruction not listed for `{}`, but intel lists {:?}",
+ rust.name, intel.instruction
+ );
+ }
+
+ // If intel doesn't list any instructions and we do, then don't
+ // bother trying to look for instructions in intel; we've just got
+ // some extra assertions on our end.
+ } else if !intel.instruction.is_empty() {
+ for instr in rust.instrs {
+ let asserting = intel.instruction.iter().any(|a| a.name.starts_with(instr));
+ if !asserting {
+ println!(
+ "intel failed to list `{}` as an instruction for `{}`",
+ instr, rust.name
+ );
+ }
+ }
+ }
+ }
+
+ // Make sure we've got the right return type.
+ if let Some(t) = rust.ret {
+ equate(t, &intel.return_.type_, "", rust.name, false)?;
+ } else if intel.return_.type_ != "" && intel.return_.type_ != "void" {
+ bail!(
+ "{} returns `{}` with intel, void in rust",
+ rust.name,
+ intel.return_.type_
+ )
+ }
+
+ // If there are no arguments on Rust's side, intel may list one "void"
+ // argument, so handle that here.
+ if rust.arguments.is_empty() && intel.parameters.len() == 1 {
+ if intel.parameters[0].type_ != "void" {
+ bail!("rust has 0 arguments, intel has one for")
+ }
+ } else {
+ // Otherwise we want all parameters to be exactly the same
+ if rust.arguments.len() != intel.parameters.len() {
+ bail!("wrong number of arguments on {}", rust.name)
+ }
+ for (i, (a, b)) in intel.parameters.iter().zip(rust.arguments).enumerate() {
+ let is_const = rust.required_const.contains(&i);
+ equate(b, &a.type_, &a.etype, &intel.name, is_const)?;
+ }
+ }
+
+ let any_i64 = rust
+ .arguments
+ .iter()
+ .cloned()
+ .chain(rust.ret)
+ .any(|arg| matches!(*arg, Type::PrimSigned(64) | Type::PrimUnsigned(64)));
+ let any_i64_exempt = match rust.name {
+ // These intrinsics have all been manually verified against Clang's
+ // headers to be available on x86, and the u64 arguments seem
+ // spurious I guess?
+ "_xsave" | "_xrstor" | "_xsetbv" | "_xgetbv" | "_xsaveopt" | "_xsavec" | "_xsaves"
+ | "_xrstors" => true,
+
+ // Apparently all of clang/msvc/gcc accept these intrinsics on
+ // 32-bit, so let's do the same
+ "_mm_set_epi64x"
+ | "_mm_set1_epi64x"
+ | "_mm256_set_epi64x"
+ | "_mm256_setr_epi64x"
+ | "_mm256_set1_epi64x"
+ | "_mm512_set1_epi64"
+ | "_mm256_mask_set1_epi64"
+ | "_mm256_maskz_set1_epi64"
+ | "_mm_mask_set1_epi64"
+ | "_mm_maskz_set1_epi64"
+ | "_mm512_set4_epi64"
+ | "_mm512_setr4_epi64"
+ | "_mm512_set_epi64"
+ | "_mm512_setr_epi64"
+ | "_mm512_reduce_add_epi64"
+ | "_mm512_mask_reduce_add_epi64"
+ | "_mm512_reduce_mul_epi64"
+ | "_mm512_mask_reduce_mul_epi64"
+ | "_mm512_reduce_max_epi64"
+ | "_mm512_mask_reduce_max_epi64"
+ | "_mm512_reduce_max_epu64"
+ | "_mm512_mask_reduce_max_epu64"
+ | "_mm512_reduce_min_epi64"
+ | "_mm512_mask_reduce_min_epi64"
+ | "_mm512_reduce_min_epu64"
+ | "_mm512_mask_reduce_min_epu64"
+ | "_mm512_reduce_and_epi64"
+ | "_mm512_mask_reduce_and_epi64"
+ | "_mm512_reduce_or_epi64"
+ | "_mm512_mask_reduce_or_epi64"
+ | "_mm512_mask_set1_epi64"
+ | "_mm512_maskz_set1_epi64"
+ | "_mm_cvt_roundss_si64"
+ | "_mm_cvt_roundss_i64"
+ | "_mm_cvt_roundss_u64"
+ | "_mm_cvtss_i64"
+ | "_mm_cvtss_u64"
+ | "_mm_cvt_roundsd_si64"
+ | "_mm_cvt_roundsd_i64"
+ | "_mm_cvt_roundsd_u64"
+ | "_mm_cvtsd_i64"
+ | "_mm_cvtsd_u64"
+ | "_mm_cvt_roundi64_ss"
+ | "_mm_cvt_roundi64_sd"
+ | "_mm_cvt_roundsi64_ss"
+ | "_mm_cvt_roundsi64_sd"
+ | "_mm_cvt_roundu64_ss"
+ | "_mm_cvt_roundu64_sd"
+ | "_mm_cvti64_ss"
+ | "_mm_cvti64_sd"
+ | "_mm_cvtt_roundss_si64"
+ | "_mm_cvtt_roundss_i64"
+ | "_mm_cvtt_roundss_u64"
+ | "_mm_cvttss_i64"
+ | "_mm_cvttss_u64"
+ | "_mm_cvtt_roundsd_si64"
+ | "_mm_cvtt_roundsd_i64"
+ | "_mm_cvtt_roundsd_u64"
+ | "_mm_cvttsd_i64"
+ | "_mm_cvttsd_u64"
+ | "_mm_cvtu64_ss"
+ | "_mm_cvtu64_sd" => true,
+
+ // These return a 64-bit argument but they're assembled from other
+ // 32-bit registers, so these work on 32-bit just fine. See #308 for
+ // more info.
+ "_rdtsc" | "__rdtscp" => true,
+
+ _ => false,
+ };
+ if any_i64 && !any_i64_exempt && !rust.file.contains("x86_64") {
+ bail!(
+ "intrinsic `{}` uses a 64-bit bare type but may be \
+ available on 32-bit platforms",
+ rust.name
+ )
+ }
+ Ok(())
+}
+
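+/// Checks that the Rust type `t` is equivalent to the Intel type string
+/// `intel`, taking into account the parameter's `etype` annotation
+/// (e.g. "IMM") and whether Rust requires the argument to be const.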
+fn equate(
+ t: &Type,
+ intel: &str,
+ etype: &str,
+ intrinsic: &str,
+ is_const: bool,
+) -> Result<(), String> {
+ // Make pointer adjacent to the type: float * foo => float* foo
+ let mut intel = intel.replace(" *", "*");
+ // Make mutability modifier adjacent to the pointer:
+ // float const * foo => float const* foo
+ intel = intel.replace("const *", "const*");
+ // Normalize mutability modifier to after the type:
+ // const float* foo => float const*
+ if intel.starts_with("const") && intel.ends_with('*') {
+ intel = intel.replace("const ", "");
+ intel = intel.replace("*", " const*");
+ }
+ if etype == "IMM" {
+ // The _bittest intrinsics claim to only accept immediates but actually
+ // accept run-time values as well.
+ if !is_const && !intrinsic.starts_with("_bittest") {
+ return bail!("argument required to be const but isn't");
+ }
+ } else {
+ // const int must be an IMM
+ assert_ne!(intel, "const int");
+ if is_const {
+ return bail!("argument is const but shouldn't be");
+ }
+ }
+ match (t, &intel[..]) {
+ (&Type::PrimFloat(32), "float") => {}
+ (&Type::PrimFloat(64), "double") => {}
+ (&Type::PrimSigned(16), "__int16") => {}
+ (&Type::PrimSigned(16), "short") => {}
+ (&Type::PrimSigned(32), "__int32") => {}
+ (&Type::PrimSigned(32), "const int") => {}
+ (&Type::PrimSigned(32), "int") => {}
+ (&Type::PrimSigned(64), "__int64") => {}
+ (&Type::PrimSigned(64), "long long") => {}
+ (&Type::PrimSigned(8), "__int8") => {}
+ (&Type::PrimSigned(8), "char") => {}
+ (&Type::PrimUnsigned(16), "unsigned short") => {}
+ (&Type::PrimUnsigned(32), "unsigned int") => {}
+ (&Type::PrimUnsigned(32), "const unsigned int") => {}
+ (&Type::PrimUnsigned(64), "unsigned __int64") => {}
+ (&Type::PrimUnsigned(8), "unsigned char") => {}
+ (&Type::M64, "__m64") => {}
+ (&Type::M128, "__m128") => {}
+ (&Type::M128BH, "__m128bh") => {}
+ (&Type::M128I, "__m128i") => {}
+ (&Type::M128D, "__m128d") => {}
+ (&Type::M256, "__m256") => {}
+ (&Type::M256BH, "__m256bh") => {}
+ (&Type::M256I, "__m256i") => {}
+ (&Type::M256D, "__m256d") => {}
+ (&Type::M512, "__m512") => {}
+ (&Type::M512BH, "__m512bh") => {}
+ (&Type::M512I, "__m512i") => {}
+ (&Type::M512D, "__m512d") => {}
+ (&Type::MMASK64, "__mmask64") => {}
+ (&Type::MMASK32, "__mmask32") => {}
+ (&Type::MMASK16, "__mmask16") => {}
+ (&Type::MMASK8, "__mmask8") => {}
+
+ (&Type::MutPtr(&Type::PrimFloat(32)), "float*") => {}
+ (&Type::MutPtr(&Type::PrimFloat(64)), "double*") => {}
+ (&Type::MutPtr(&Type::PrimFloat(32)), "void*") => {}
+ (&Type::MutPtr(&Type::PrimFloat(64)), "void*") => {}
+ (&Type::MutPtr(&Type::PrimSigned(32)), "void*") => {}
+ (&Type::MutPtr(&Type::PrimSigned(16)), "void*") => {}
+ (&Type::MutPtr(&Type::PrimSigned(8)), "void*") => {}
+ (&Type::MutPtr(&Type::PrimSigned(32)), "int*") => {}
+ (&Type::MutPtr(&Type::PrimSigned(32)), "__int32*") => {}
+ (&Type::MutPtr(&Type::PrimSigned(64)), "void*") => {}
+ (&Type::MutPtr(&Type::PrimSigned(64)), "__int64*") => {}
+ (&Type::MutPtr(&Type::PrimSigned(8)), "char*") => {}
+ (&Type::MutPtr(&Type::PrimUnsigned(16)), "unsigned short*") => {}
+ (&Type::MutPtr(&Type::PrimUnsigned(32)), "unsigned int*") => {}
+ (&Type::MutPtr(&Type::PrimUnsigned(64)), "unsigned __int64*") => {}
+ (&Type::MutPtr(&Type::PrimUnsigned(8)), "void*") => {}
+ (&Type::MutPtr(&Type::PrimUnsigned(32)), "__mmask32*") => {}
+ (&Type::MutPtr(&Type::PrimUnsigned(64)), "__mmask64*") => {}
+ (&Type::MutPtr(&Type::M64), "__m64*") => {}
+ (&Type::MutPtr(&Type::M128), "__m128*") => {}
+ (&Type::MutPtr(&Type::M128BH), "__m128bh*") => {}
+ (&Type::MutPtr(&Type::M128I), "__m128i*") => {}
+ (&Type::MutPtr(&Type::M128D), "__m128d*") => {}
+ (&Type::MutPtr(&Type::M256), "__m256*") => {}
+ (&Type::MutPtr(&Type::M256BH), "__m256bh*") => {}
+ (&Type::MutPtr(&Type::M256I), "__m256i*") => {}
+ (&Type::MutPtr(&Type::M256D), "__m256d*") => {}
+ (&Type::MutPtr(&Type::M512), "__m512*") => {}
+ (&Type::MutPtr(&Type::M512BH), "__m512bh*") => {}
+ (&Type::MutPtr(&Type::M512I), "__m512i*") => {}
+ (&Type::MutPtr(&Type::M512D), "__m512d*") => {}
+
+ (&Type::ConstPtr(&Type::PrimFloat(32)), "float const*") => {}
+ (&Type::ConstPtr(&Type::PrimFloat(64)), "double const*") => {}
+ (&Type::ConstPtr(&Type::PrimFloat(32)), "void const*") => {}
+ (&Type::ConstPtr(&Type::PrimFloat(64)), "void const*") => {}
+ (&Type::ConstPtr(&Type::PrimSigned(32)), "int const*") => {}
+ (&Type::ConstPtr(&Type::PrimSigned(32)), "__int32 const*") => {}
+ (&Type::ConstPtr(&Type::PrimSigned(8)), "void const*") => {}
+ (&Type::ConstPtr(&Type::PrimSigned(16)), "void const*") => {}
+ (&Type::ConstPtr(&Type::PrimSigned(32)), "void const*") => {}
+ (&Type::ConstPtr(&Type::PrimSigned(64)), "void const*") => {}
+ (&Type::ConstPtr(&Type::PrimSigned(64)), "__int64 const*") => {}
+ (&Type::ConstPtr(&Type::PrimSigned(8)), "char const*") => {}
+ (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {}
+ (&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {}
+ (&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {}
+ (&Type::ConstPtr(&Type::PrimUnsigned(8)), "void const*") => {}
+ (&Type::ConstPtr(&Type::PrimUnsigned(32)), "void const*") => {}
+ (&Type::ConstPtr(&Type::M64), "__m64 const*") => {}
+ (&Type::ConstPtr(&Type::M128), "__m128 const*") => {}
+ (&Type::ConstPtr(&Type::M128BH), "__m128bh const*") => {}
+ (&Type::ConstPtr(&Type::M128I), "__m128i const*") => {}
+ (&Type::ConstPtr(&Type::M128D), "__m128d const*") => {}
+ (&Type::ConstPtr(&Type::M256), "__m256 const*") => {}
+ (&Type::ConstPtr(&Type::M256BH), "__m256bh const*") => {}
+ (&Type::ConstPtr(&Type::M256I), "__m256i const*") => {}
+ (&Type::ConstPtr(&Type::M256D), "__m256d const*") => {}
+ (&Type::ConstPtr(&Type::M512), "__m512 const*") => {}
+ (&Type::ConstPtr(&Type::M512BH), "__m512bh const*") => {}
+ (&Type::ConstPtr(&Type::M512I), "__m512i const*") => {}
+ (&Type::ConstPtr(&Type::M512D), "__m512d const*") => {}
+ (&Type::ConstPtr(&Type::PrimUnsigned(32)), "__mmask32*") => {}
+ (&Type::ConstPtr(&Type::PrimUnsigned(64)), "__mmask64*") => {}
+
+ (&Type::MM_CMPINT_ENUM, "_MM_CMPINT_ENUM") => {}
+ (&Type::MM_MANTISSA_NORM_ENUM, "_MM_MANTISSA_NORM_ENUM") => {}
+ (&Type::MM_MANTISSA_SIGN_ENUM, "_MM_MANTISSA_SIGN_ENUM") => {}
+ (&Type::MM_PERM_ENUM, "_MM_PERM_ENUM") => {}
+
+ // In C this appears to be a macro that mutates its arguments, which
+ // means that on the Rust side we take pointers to the arguments, as
+ // we're not exposing it as a macro.
+ (&Type::MutPtr(&Type::M128), "__m128") if intrinsic == "_MM_TRANSPOSE4_PS" => {}
+
+ // The _rdtsc intrinsic uses a __int64 return type, but this is a bug in
+ // the intrinsics guide: https://github.com/rust-lang/stdarch/issues/559
+ // We have manually fixed the bug by changing the return type to `u64`.
+ (&Type::PrimUnsigned(64), "__int64") if intrinsic == "_rdtsc" => {}
+
+ // The _bittest and _bittest64 intrinsics take a mutable pointer in the
+ // intrinsics guide even though they never write through the pointer:
+ (&Type::ConstPtr(&Type::PrimSigned(32)), "__int32*") if intrinsic == "_bittest" => {}
+ (&Type::ConstPtr(&Type::PrimSigned(64)), "__int64*") if intrinsic == "_bittest64" => {}
+ // The _xrstor, _fxrstor, _xrstor64, _fxrstor64 intrinsics take a
+ // mutable pointer in the intrinsics guide even though they never write
+ // through the pointer:
+ (&Type::ConstPtr(&Type::PrimUnsigned(8)), "void*")
+ if intrinsic == "_xrstor"
+ || intrinsic == "_xrstor64"
+ || intrinsic == "_fxrstor"
+ || intrinsic == "_fxrstor64" => {}
+
+ _ => bail!(
+ "failed to equate: `{}` and {:?} for {}",
+ intel,
+ t,
+ intrinsic
+ ),
+ }
+ Ok(())
+}
diff --git a/library/stdarch/crates/stdarch-verify/x86-intel.xml b/library/stdarch/crates/stdarch-verify/x86-intel.xml
new file mode 100644
index 000000000..264ecee0e
--- /dev/null
+++ b/library/stdarch/crates/stdarch-verify/x86-intel.xml
@@ -0,0 +1,148137 @@
+<intrinsics_list version="3.5.3" date="06/30/2020">
+<intrinsic tech="Other" name="_addcarryx_u32">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>ADX</CPUID>
+ <category>Arithmetic</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned char" varname="c_in" etype="UI8"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="b" etype="UI32"/>
+ <parameter type="unsigned int *" varname="out" etype="UI32" memwidth="32"/>
+ <description>Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+ <operation>
+tmp[32:0] := a[31:0] + b[31:0] + (c_in &gt; 0 ? 1 : 0)
+MEM[out+31:out] := tmp[31:0]
+dst[0] := tmp[32]
+dst[7:1] := 0
+ </operation>
+ <instruction name="ADCX" form="r32, r32" xed="ADCX_GPR32d_GPR32d"/>
+ <instruction name="ADOX" form="r32, r32" xed="ADOX_GPR32d_GPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_addcarryx_u64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>ADX</CPUID>
+ <category>Arithmetic</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned char" varname="c_in" etype="UI8"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+ <parameter type="unsigned __int64 *" varname="out" etype="UI64" memwidth="64"/>
+ <description>Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+ <operation>
+tmp[64:0] := a[63:0] + b[63:0] + (c_in &gt; 0 ? 1 : 0)
+MEM[out+63:out] := tmp[63:0]
+dst[0] := tmp[64]
+dst[7:1] := 0
+ </operation>
+ <instruction name="ADCX" form="r64, r64" xed="ADCX_GPR64q_GPR64q"/>
+ <instruction name="ADOX" form="r64, r64" xed="ADOX_GPR64q_GPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" vexEq="TRUE" name="_mm_aesenc_si128">
+ <type>Integer</type>
+ <CPUID>AES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="RoundKey" etype="M128"/>
+ <description>Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
+ <operation>a[127:0] := ShiftRows(a[127:0])
+a[127:0] := SubBytes(a[127:0])
+a[127:0] := MixColumns(a[127:0])
+dst[127:0] := a[127:0] XOR RoundKey[127:0]
+ </operation>
+ <instruction name="AESENC" form="xmm, xmm" xed="AESENC_XMMdq_XMMdq"/>
+ <header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" vexEq="TRUE" name="_mm_aesenclast_si128">
+ <type>Integer</type>
+ <CPUID>AES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="RoundKey" etype="M128"/>
+ <description>Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
+ <operation>a[127:0] := ShiftRows(a[127:0])
+a[127:0] := SubBytes(a[127:0])
+dst[127:0] := a[127:0] XOR RoundKey[127:0]
+ </operation>
+ <instruction name="AESENCLAST" form="xmm, xmm" xed="AESENCLAST_XMMdq_XMMdq"/>
+ <header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" vexEq="TRUE" name="_mm_aesdec_si128">
+ <type>Integer</type>
+ <CPUID>AES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="RoundKey" etype="M128"/>
+ <description>Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
+ <operation>a[127:0] := InvShiftRows(a[127:0])
+a[127:0] := InvSubBytes(a[127:0])
+a[127:0] := InvMixColumns(a[127:0])
+dst[127:0] := a[127:0] XOR RoundKey[127:0]
+ </operation>
+ <instruction name="AESDEC" form="xmm, xmm" xed="AESDEC_XMMdq_XMMdq"/>
+ <header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" vexEq="TRUE" name="_mm_aesdeclast_si128">
+ <type>Integer</type>
+ <CPUID>AES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="RoundKey" etype="M128"/>
+ <description>Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
+ <operation>a[127:0] := InvShiftRows(a[127:0])
+a[127:0] := InvSubBytes(a[127:0])
+dst[127:0] := a[127:0] XOR RoundKey[127:0]
+ </operation>
+ <instruction name="AESDECLAST" form="xmm, xmm" xed="AESDECLAST_XMMdq_XMMdq"/>
+ <header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" vexEq="TRUE" name="_mm_aesimc_si128">
+ <type>Integer</type>
+ <CPUID>AES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <description>Perform the InvMixColumns transformation on "a" and store the result in "dst".</description>
+ <operation>dst[127:0] := InvMixColumns(a[127:0])
+ </operation>
+ <instruction name="AESIMC" form="xmm, xmm" xed="AESIMC_XMMdq_XMMdq"/>
+ <header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" vexEq="TRUE" name="_mm_aeskeygenassist_si128">
+ <type>Integer</type>
+ <CPUID>AES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Assist in expanding the AES cipher key by computing steps towards generating a round key for the encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst".</description>
+ <operation>X3[31:0] := a[127:96]
+X2[31:0] := a[95:64]
+X1[31:0] := a[63:32]
+X0[31:0] := a[31:0]
+RCON[31:0] := ZeroExtend32(imm8[7:0])
+dst[31:0] := SubWord(X1)
+dst[63:32] := RotWord(SubWord(X1)) XOR RCON
+dst[95:64] := SubWord(X3)
+dst[127:96] := RotWord(SubWord(X3)) XOR RCON
+ </operation>
+ <instruction name="AESKEYGENASSIST" form="xmm, xmm, imm8" xed="AESKEYGENASSIST_XMMdq_XMMdq_IMMb"/>
+ <header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_dpbf16ps">
+ <type>Tile</type>
+ <type>Floating Point</type>
+ <CPUID>AMXBF16</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="dst"/>
+ <parameter type="__tile" varname="a"/>
+ <parameter type="__tile" varname="b"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in tiles "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst".</description>
+ <operation>FOR m := 0 TO dst.rows - 1
+ tmp := dst.row[m]
+ FOR k := 0 TO (a.colsb / 4) - 1
+ FOR n := 0 TO (dst.colsb / 4) - 1
+ tmp.fp32[n] += FP32(a.row[m].bf16[2*k+0]) * FP32(b.row[k].bf16[2*n+0])
+ tmp.fp32[n] += FP32(a.row[m].bf16[2*k+1]) * FP32(b.row[k].bf16[2*n+1])
+ ENDFOR
+ ENDFOR
+ write_row_and_zero(dst, m, tmp, dst.colsb)
+ENDFOR
+zero_upper_rows(dst, dst.rows)
+zero_tileconfig_start()
+ </operation>
+ <instruction name="TDPBF16PS" form="tmm, tmm, tmm" xed="TDPBF16PS_TMMf32_TMMu32_TMMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_dpbsud">
+ <type>Tile</type>
+ <CPUID>AMXINT8</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="dst"/>
+ <parameter type="__tile" varname="a"/>
+ <parameter type="__tile" varname="b"/>
+ <description>Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "a" with corresponding unsigned 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst".</description>
+ <operation>DEFINE DPBD(c, x, y) {
+ tmp1 := SignExtend32(x.byte[0]) * ZeroExtend32(y.byte[0])
+ tmp2 := SignExtend32(x.byte[1]) * ZeroExtend32(y.byte[1])
+ tmp3 := SignExtend32(x.byte[2]) * ZeroExtend32(y.byte[2])
+ tmp4 := SignExtend32(x.byte[3]) * ZeroExtend32(y.byte[3])
+
+ RETURN c + tmp1 + tmp2 + tmp3 + tmp4
+}
+FOR m := 0 TO dst.rows - 1
+ tmp := dst.row[m]
+ FOR k := 0 TO (a.colsb / 4) - 1
+ FOR n := 0 TO (dst.colsb / 4) - 1
+ tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n])
+ ENDFOR
+ ENDFOR
+ write_row_and_zero(dst, m, tmp, dst.colsb)
+ENDFOR
+zero_upper_rows(dst, dst.rows)
+zero_tileconfig_start()
+ </operation>
+ <instruction name="TDPBSUD" form="tmm, tmm, tmm" xed="TDPBSUD_TMMi32_TMMu32_TMMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_dpbusd">
+ <type>Tile</type>
+ <CPUID>AMXINT8</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="dst"/>
+ <parameter type="__tile" varname="a"/>
+ <parameter type="__tile" varname="b"/>
+ <description>Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst".</description>
+ <operation>DEFINE DPBD(c, x, y) {
+ tmp1 := ZeroExtend32(x.byte[0]) * SignExtend32(y.byte[0])
+ tmp2 := ZeroExtend32(x.byte[1]) * SignExtend32(y.byte[1])
+ tmp3 := ZeroExtend32(x.byte[2]) * SignExtend32(y.byte[2])
+ tmp4 := ZeroExtend32(x.byte[3]) * SignExtend32(y.byte[3])
+
+ RETURN c + tmp1 + tmp2 + tmp3 + tmp4
+}
+FOR m := 0 TO dst.rows - 1
+ tmp := dst.row[m]
+ FOR k := 0 TO (a.colsb / 4) - 1
+ FOR n := 0 TO (dst.colsb / 4) - 1
+ tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n])
+ ENDFOR
+ ENDFOR
+ write_row_and_zero(dst, m, tmp, dst.colsb)
+ENDFOR
+zero_upper_rows(dst, dst.rows)
+zero_tileconfig_start()
+ </operation>
+ <instruction name="TDPBUSD" form="tmm, tmm, tmm" xed="TDPBUSD_TMMi32_TMMu32_TMMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_dpbuud">
+ <type>Tile</type>
+ <CPUID>AMXINT8</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="dst"/>
+ <parameter type="__tile" varname="a"/>
+ <parameter type="__tile" varname="b"/>
+ <description>Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding unsigned 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst".</description>
+ <operation>DEFINE DPBD(c, x, y) {
+ tmp1 := ZeroExtend32(x.byte[0]) * ZeroExtend32(y.byte[0])
+ tmp2 := ZeroExtend32(x.byte[1]) * ZeroExtend32(y.byte[1])
+ tmp3 := ZeroExtend32(x.byte[2]) * ZeroExtend32(y.byte[2])
+ tmp4 := ZeroExtend32(x.byte[3]) * ZeroExtend32(y.byte[3])
+
+ RETURN c + tmp1 + tmp2 + tmp3 + tmp4
+}
+FOR m := 0 TO dst.rows - 1
+ tmp := dst.row[m]
+ FOR k := 0 TO (a.colsb / 4) - 1
+ FOR n := 0 TO (dst.colsb / 4) - 1
+ tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n])
+ ENDFOR
+ ENDFOR
+ write_row_and_zero(dst, m, tmp, dst.colsb)
+ENDFOR
+zero_upper_rows(dst, dst.rows)
+zero_tileconfig_start()
+ </operation>
+ <instruction name="TDPBUUD" form="tmm, tmm, tmm" xed="TDPBUUD_TMMu32_TMMu32_TMMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_dpbssd">
+ <type>Tile</type>
+ <CPUID>AMXINT8</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="dst"/>
+ <parameter type="__tile" varname="a"/>
+ <parameter type="__tile" varname="b"/>
+ <description>Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst".</description>
+ <operation>DEFINE DPBD(c, x, y) {
+ tmp1 := SignExtend32(x.byte[0]) * SignExtend32(y.byte[0])
+ tmp2 := SignExtend32(x.byte[1]) * SignExtend32(y.byte[1])
+ tmp3 := SignExtend32(x.byte[2]) * SignExtend32(y.byte[2])
+ tmp4 := SignExtend32(x.byte[3]) * SignExtend32(y.byte[3])
+
+ RETURN c + tmp1 + tmp2 + tmp3 + tmp4
+}
+FOR m := 0 TO dst.rows - 1
+ tmp := dst.row[m]
+ FOR k := 0 TO (a.colsb / 4) - 1
+ FOR n := 0 TO (dst.colsb / 4) - 1
+ tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n])
+ ENDFOR
+ ENDFOR
+ write_row_and_zero(dst, m, tmp, dst.colsb)
+ENDFOR
+zero_upper_rows(dst, dst.rows)
+zero_tileconfig_start()
+ </operation>
+ <instruction name="TDPBSSD" form="tmm, tmm, tmm" xed="TDPBSSD_TMMi32_TMMu32_TMMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_loadconfig">
+ <type>Tile</type>
+ <CPUID>AMXTILE</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="const void *" varname="mem_addr" memwidth="512"/>
+ <description>Load tile configuration from a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type palette, the number of bytes per row, and the number of rows. If the specified palette_id is zero, that signifies the init state for both the tile config and the tile data, and the tiles are zeroed. Any invalid configurations will result in a #GP fault.</description>
+ <operation>
+// format of memory payload. each field is a byte.
+// 0: palette_id
+// 1: startRow (8b)
+// 2-15: reserved (must be zero)
+// 16-17: tile0.colsb -- bytes_per_row
+// 18-19: tile1.colsb
+// 20-21: tile2.colsb
+// ...
+// 46-47: tile15.colsb
+// 48: tile0.rows
+// 49: tile1.rows
+// 50: tile2.rows
+// ...
+// 63: tile15.rows
+ </operation>
+ <instruction name="LDTILECFG" form="m512" xed="LDTILECFG_MEM"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_storeconfig">
+ <type>Tile</type>
+ <CPUID>AMXTILE</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr" memwidth="512"/>
+ <description>Stores the current tile configuration to a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type palette, the number of bytes per row, and the number of rows. If tiles are not configured, all zeroes will be stored to memory.</description>
+ <operation>
+// format of memory payload. each field is a byte.
+// 0: palette_id
+// 1: startRow (8b)
+// 2-15: reserved (must be zero)
+// 16-17: tile0.colsb -- bytes_per_row
+// 18-19: tile1.colsb
+// 20-21: tile2.colsb
+// ...
+// 46-47: tile15.colsb
+// 48: tile0.rows
+// 49: tile1.rows
+// 50: tile2.rows
+// ...
+// 63: tile15.rows
+ </operation>
+ <instruction name="STTILECFG" form="m512" xed="STTILECFG_MEM"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_loadd">
+ <type>Tile</type>
+ <CPUID>AMXTILE</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="dst"/>
+ <parameter type="const void *" varname="base"/>
+ <parameter type="int" varname="stride" etype="UI32"/>
+ <description>Load tile rows from memory specified by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig".</description>
+ <operation>start := tileconfig.startRow
+IF start == 0 // not restarting, zero incoming state
+ tilezero(dst)
+FI
+nbytes := dst.colsb
+DO WHILE start &lt; dst.rows
+ memptr := base + start * stride
+ write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes)
+ start := start + 1
+OD
+zero_upper_rows(dst, dst.rows)
+zero_tileconfig_start()
+ </operation>
+ <instruction name="TILELOADD" form="tmm, sibmem" xed="TILELOADD_TMMu32_MEMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_stream_loadd">
+ <type>Tile</type>
+ <CPUID>AMXTILE</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="dst"/>
+ <parameter type="const void *" varname="base"/>
+ <parameter type="int" varname="stride" etype="UI32"/>
+ <description>Load tile rows from memory specified by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". This intrinsic provides a hint to the implementation that the data will likely not be reused in the near future and the data caching can be optimized accordingly.</description>
+ <operation>start := tileconfig.startRow
+IF start == 0 // not restarting, zero incoming state
+ tilezero(dst)
+FI
+nbytes := dst.colsb
+DO WHILE start &lt; dst.rows
+ memptr := base + start * stride
+ write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes)
+ start := start + 1
+OD
+zero_upper_rows(dst, dst.rows)
+zero_tileconfig_start()
+ </operation>
+ <instruction name="TILELOADDT1" form="tmm, sibmem" xed="TILELOADDT1_TMMu32_MEMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_release">
+ <type>Tile</type>
+ <CPUID>AMXTILE</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <description>Release the tile configuration to return to the init state, which releases all storage it currently holds.</description>
+ <instruction name="TILERELEASE" xed="TILERELEASE"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_stored">
+ <type>Tile</type>
+ <CPUID>AMXTILE</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="src" />
+ <parameter type="void *" varname="base"/>
+ <parameter type="int" varname="stride" etype="UI32"/>
+ <description>Store the tile specified by "src" to memory specified by "base" address and "stride" using the tile configuration previously configured via "_tile_loadconfig".</description>
+ <operation>start := tileconfig.startRow
+DO WHILE start &lt; src.rows
+ memptr := base + start * stride
+ write_memory(memptr, src.colsb, src.row[start])
+ start := start + 1
+OD
+zero_tileconfig_start()
+ </operation>
+ <instruction name="TILESTORED" form="sibmem, tmm" xed="TILESTORED_MEMu32_TMMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AMX" name="_tile_zero">
+ <type>Tile</type>
+ <CPUID>AMXTILE</CPUID>
+ <category>Application-Targeted</category>
+ <return type="void"/>
+ <parameter type="__tile" varname="tdest"/>
+ <description>Zero the tile specified by "tdest".</description>
+ <operation>nbytes := palette_table[tileconfig.palette_id].bytes_per_row
+FOR i := 0 TO palette_table[tileconfig.palette_id].max_rows-1
+ FOR j := 0 TO nbytes-1
+ tdest.row[i].byte[j] := 0
+ ENDFOR
+ENDFOR
+ </operation>
+ <instruction name="TILEZERO" form="tmm" xed="TILEZERO_TMMu32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VADDPD" form="ymm, ymm, ymm" xed="VADDPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
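+<!-- Illustrative C sketch, not part of the upstream Intel data: minimal
+     usage of _mm256_add_pd on constant inputs.
+#include <immintrin.h>
+__m256d a   = _mm256_set1_pd(1.5);     /* {1.5, 1.5, 1.5, 1.5} */
+__m256d b   = _mm256_set1_pd(2.0);     /* {2.0, 2.0, 2.0, 2.0} */
+__m256d sum = _mm256_add_pd(a, b);     /* every lane is 3.5    */
+-->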
+<intrinsic tech="AVX" name="_mm256_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VADDPS" form="ymm, ymm, ymm" xed="VADDPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_addsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+	<description>Alternately add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VADDSUBPD" form="ymm, ymm, ymm" xed="VADDSUBPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_addsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+	<description>Alternately add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VADDSUBPS" form="ymm, ymm, ymm" xed="VADDSUBPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
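+<!-- Illustrative C sketch, not part of the upstream Intel data: the
+     even/odd behavior of _mm256_addsub_ps (even lanes subtract, odd
+     lanes add), which is what makes it useful for complex arithmetic.
+#include <immintrin.h>
+__m256 a = _mm256_set1_ps(4.0f);
+__m256 b = _mm256_set1_ps(1.0f);
+__m256 r = _mm256_addsub_ps(a, b);     /* {3, 5, 3, 5, 3, 5, 3, 5} */
+-->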
+<intrinsic tech="AVX" name="_mm256_and_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDPD" form="ymm, ymm, ymm" xed="VANDPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_and_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDPS" form="ymm, ymm, ymm" xed="VANDPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDNPD" form="ymm, ymm, ymm" xed="VANDNPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDNPS" form="ymm, ymm, ymm" xed="VANDNPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
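+<!-- Illustrative C sketch, not part of the upstream Intel data: a common
+     ANDNOT idiom, clearing the sign bit to compute absolute values.
+#include <immintrin.h>
+__m256 x    = _mm256_set1_ps(-3.25f);
+__m256 sign = _mm256_set1_ps(-0.0f);         /* only the sign bit set     */
+__m256 absx = _mm256_andnot_ps(sign, x);     /* (NOT sign) AND x => 3.25f */
+-->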
+<intrinsic tech="AVX" name="_mm256_blend_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF imm8[j]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBLENDPD" form="ymm, ymm, ymm, imm8" xed="VBLENDPD_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_blend_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF imm8[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBLENDPS" form="ymm, ymm, ymm, imm8" xed="VBLENDPS_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
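+<!-- Illustrative C sketch, not part of the upstream Intel data: each bit
+     of the immediate picks a lane from "b" (bit set) or "a" (bit clear).
+#include <immintrin.h>
+__m256 a = _mm256_set1_ps(0.0f);
+__m256 b = _mm256_set1_ps(1.0f);
+__m256 r = _mm256_blend_ps(a, b, 0xF0);      /* {0,0,0,0, 1,1,1,1} */
+-->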
+<intrinsic tech="AVX" name="_mm256_blendv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="mask" etype="MASK"/>
+ <description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF mask[i+63]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBLENDVPD" form="ymm, ymm, ymm, ymm" xed="VBLENDVPD_YMMqq_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_blendv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="mask" etype="MASK"/>
+ <description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF mask[i+31]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBLENDVPS" form="ymm, ymm, ymm, ymm" xed="VBLENDVPS_YMMqq_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
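+<!-- Illustrative C sketch, not part of the upstream Intel data: the
+     classic compare-then-select pattern; only the mask's per-lane sign
+     bit matters, so a full comparison result works directly.
+#include <immintrin.h>
+__m256 x    = _mm256_set_ps(4, -3, 2, -1, 4, -3, 2, -1);
+__m256 zero = _mm256_setzero_ps();
+__m256 m    = _mm256_cmp_ps(x, zero, _CMP_LT_OQ);  /* all-ones where x < 0    */
+__m256 r    = _mm256_blendv_ps(x, zero, m);        /* negative lanes become 0 */
+-->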
+<intrinsic tech="AVX" name="_mm256_div_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDIVPD" form="ymm, ymm, ymm" xed="VDIVPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_div_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDIVPS" form="ymm, ymm, ymm" xed="VDIVPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_dp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
+ <operation>
+DEFINE DP(a[127:0], b[127:0], imm8[7:0]) {
+ FOR j := 0 to 3
+ i := j*32
+ IF imm8[(4+j)%8]
+ temp[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ temp[i+31:i] := FP32(0.0)
+ FI
+ ENDFOR
+
+ sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])
+
+ FOR j := 0 to 3
+ i := j*32
+ IF imm8[j%8]
+ tmpdst[i+31:i] := sum[31:0]
+ ELSE
+ tmpdst[i+31:i] := FP32(0.0)
+ FI
+ ENDFOR
+ RETURN tmpdst[127:0]
+}
+dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
+dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDPPS" form="ymm, ymm, ymm, imm8" xed="VDPPS_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
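+<!-- Illustrative C sketch, not part of the upstream Intel data: with
+     imm8 = 0xFF the high nibble enables all four products and the low
+     nibble broadcasts the sum, independently in each 128-bit half.
+#include <immintrin.h>
+__m256 a = _mm256_set1_ps(1.0f);
+__m256 b = _mm256_set1_ps(2.0f);
+__m256 d = _mm256_dp_ps(a, b, 0xFF);   /* every lane is 8.0f (four 1*2 products) */
+-->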
+<intrinsic tech="AVX" name="_mm256_hadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+ <operation>
+dst[63:0] := a[127:64] + a[63:0]
+dst[127:64] := b[127:64] + b[63:0]
+dst[191:128] := a[255:192] + a[191:128]
+dst[255:192] := b[255:192] + b[191:128]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VHADDPD" form="ymm, ymm, ymm" xed="VHADDPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_hadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := a[127:96] + a[95:64]
+dst[95:64] := b[63:32] + b[31:0]
+dst[127:96] := b[127:96] + b[95:64]
+dst[159:128] := a[191:160] + a[159:128]
+dst[191:160] := a[255:224] + a[223:192]
+dst[223:192] := b[191:160] + b[159:128]
+dst[255:224] := b[255:224] + b[223:192]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VHADDPS" form="ymm, ymm, ymm" xed="VHADDPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
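+<!-- Illustrative C sketch, not part of the upstream Intel data: note the
+     interleaving above; within each 128-bit lane the low two sums come
+     from "a" and the high two from "b".
+#include <immintrin.h>
+__m256 v = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);   /* lanes 1..8, low to high */
+__m256 h = _mm256_hadd_ps(v, v);                    /* {3,7,3,7, 11,15,11,15}  */
+-->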
+<intrinsic tech="AVX" name="_mm256_hsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+ <operation>
+dst[63:0] := a[63:0] - a[127:64]
+dst[127:64] := b[63:0] - b[127:64]
+dst[191:128] := a[191:128] - a[255:192]
+dst[255:192] := b[191:128] - b[255:192]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VHSUBPD" form="ymm, ymm, ymm" xed="VHSUBPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_hsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+	<description>Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := a[95:64] - a[127:96]
+dst[95:64] := b[31:0] - b[63:32]
+dst[127:96] := b[95:64] - b[127:96]
+dst[159:128] := a[159:128] - a[191:160]
+dst[191:160] := a[223:192] - a[255:224]
+dst[223:192] := b[159:128] - b[191:160]
+dst[255:224] := b[223:192] - b[255:224]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VHSUBPS" form="ymm, ymm, ymm" xed="VHSUBPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMAXPD" form="ymm, ymm, ymm" xed="VMAXPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMAXPS" form="ymm, ymm, ymm" xed="VMAXPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMINPD" form="ymm, ymm, ymm" xed="VMINPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMINPS" form="ymm, ymm, ymm" xed="VMINPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMULPD" form="ymm, ymm, ymm" xed="VMULPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMULPS" form="ymm, ymm, ymm" xed="VMULPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_or_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VORPD" form="ymm, ymm, ymm" xed="VORPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_or_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VORPS" form="ymm, ymm, ymm" xed="VORPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFPD" form="ymm, ymm, ymm, imm8" xed="VSHUFPD_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFPS" form="ymm, ymm, ymm, imm8" xed="VSHUFPS_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
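+<!-- Illustrative C sketch, not part of the upstream Intel data: the
+     _MM_SHUFFLE macro packs the four 2-bit selectors; the control is
+     applied identically to both 128-bit lanes.
+#include <immintrin.h>
+__m256 a = _mm256_set1_ps(1.0f);
+__m256 b = _mm256_set1_ps(2.0f);
+__m256 r = _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
+/* per 128-bit lane: r = { a0, a1, b2, b3 } */
+-->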
+<intrinsic tech="AVX" name="_mm256_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSUBPD" form="ymm, ymm, ymm" xed="VSUBPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSUBPS" form="ymm, ymm, ymm" xed="VSUBPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VXORPD" form="ymm, ymm, ymm" xed="VXORPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VXORPS" form="ymm, ymm, ymm" xed="VXORPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_cmp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCMPPD" form="xmm, xmm, xmm, imm8" xed="VCMPPD_XMMdq_XMMdq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cmp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Compare</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCMPPD" form="ymm, ymm, ymm, imm8" xed="VCMPPD_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_cmp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCMPPS" form="xmm, xmm, xmm, imm8" xed="VCMPPS_XMMdq_XMMdq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cmp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Compare</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCMPPS" form="ymm, ymm, ymm, imm8" xed="VCMPPS_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
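+<!-- Illustrative C sketch, not part of the upstream Intel data: the
+     predicate constants above (from immintrin.h) are passed as imm8,
+     and the all-ones/all-zeros lane results convert cheaply to a bitmask.
+#include <immintrin.h>
+__m256 x  = _mm256_set1_ps(1.0f);
+__m256 y  = _mm256_set1_ps(2.0f);
+__m256 m  = _mm256_cmp_ps(x, y, _CMP_LT_OQ);   /* all-ones lanes where x < y */
+int  bits = _mm256_movemask_ps(m);             /* 0xFF: all eight lanes true */
+-->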
+<intrinsic tech="AVX" name="_mm_cmp_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCMPSD" form="xmm, xmm, xmm, imm8" xed="VCMPSD_XMMdq_XMMdq_XMMq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_cmp_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCMPSS" form="xmm, xmm, xmm, imm8" xed="VCMPSS_XMMdq_XMMdq_XMMd_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="ymm, xmm" xed="VCVTDQ2PD_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="ymm, ymm" xed="VCVTDQ2PS_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="xmm, ymm" xed="VCVTPD2PS_XMMdq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="ymm, ymm" xed="VCVTPS2DQ_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cvtps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="ymm, xmm" xed="VCVTPS2PD_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="xmm, ymm" xed="VCVTTPD2DQ_XMMdq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="xmm, ymm" xed="VCVTPD2DQ_XMMdq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="ymm, ymm" xed="VCVTTPS2DQ_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
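+<!-- Illustrative C sketch, not part of the upstream Intel data: the
+     contrast between rounding (_mm256_cvtps_epi32, per MXCSR, default
+     round-to-nearest) and truncation (_mm256_cvttps_epi32).
+#include <immintrin.h>
+__m256  f = _mm256_set1_ps(2.7f);
+__m256i r = _mm256_cvtps_epi32(f);     /* rounds to nearest => every lane 3     */
+__m256i t = _mm256_cvttps_epi32(f);    /* truncates toward zero => every lane 2 */
+-->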
+<intrinsic tech="AVX" name="_mm256_extractf128_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF128" form="xmm, ymm, imm8" xed="VEXTRACTF128_XMMdq_YMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_extractf128_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF128" form="xmm, ymm, imm8" xed="VEXTRACTF128_XMMdq_YMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_extractf128_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF128" form="xmm, ymm, imm8" xed="VEXTRACTF128_XMMdq_YMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
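+<!-- Illustrative C sketch, not part of the upstream Intel data: imm8
+     selects which 128-bit half of the 256-bit source is extracted.
+#include <immintrin.h>
+__m256d v  = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);   /* lanes {1,2,3,4}, low to high */
+__m128d lo = _mm256_extractf128_pd(v, 0);         /* {1.0, 2.0} */
+__m128d hi = _mm256_extractf128_pd(v, 1);         /* {3.0, 4.0} */
+-->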
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_extract_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__int32" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="index" etype="IMM" immwidth="3"/>
+ <description>Extract a 32-bit integer from "a", selected with "index", and store the result in "dst".</description>
+ <operation>
+dst[31:0] := (a[255:0] &gt;&gt; (index[2:0] * 32))[31:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_extract_epi64">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="index" etype="IMM" immwidth="2"/>
+ <description>Extract a 64-bit integer from "a", selected with "index", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := (a[255:0] &gt;&gt; (index[1:0] * 64))[63:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_zeroall">
+ <CPUID>AVX</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Zero the contents of all XMM or YMM registers.</description>
+ <operation>YMM0[MAX:0] := 0
+YMM1[MAX:0] := 0
+YMM2[MAX:0] := 0
+YMM3[MAX:0] := 0
+YMM4[MAX:0] := 0
+YMM5[MAX:0] := 0
+YMM6[MAX:0] := 0
+YMM7[MAX:0] := 0
+IF _64_BIT_MODE
+ YMM8[MAX:0] := 0
+ YMM9[MAX:0] := 0
+ YMM10[MAX:0] := 0
+ YMM11[MAX:0] := 0
+ YMM12[MAX:0] := 0
+ YMM13[MAX:0] := 0
+ YMM14[MAX:0] := 0
+ YMM15[MAX:0] := 0
+FI
+ </operation>
+ <instruction name="VZEROALL" xed="VZEROALL"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_zeroupper">
+ <CPUID>AVX</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+	<description>Zero the upper 128 bits of all YMM registers; the lower 128 bits of the registers are unmodified.</description>
+ <operation>YMM0[MAX:128] := 0
+YMM1[MAX:128] := 0
+YMM2[MAX:128] := 0
+YMM3[MAX:128] := 0
+YMM4[MAX:128] := 0
+YMM5[MAX:128] := 0
+YMM6[MAX:128] := 0
+YMM7[MAX:128] := 0
+IF _64_BIT_MODE
+ YMM8[MAX:128] := 0
+ YMM9[MAX:128] := 0
+ YMM10[MAX:128] := 0
+ YMM11[MAX:128] := 0
+ YMM12[MAX:128] := 0
+ YMM13[MAX:128] := 0
+ YMM14[MAX:128] := 0
+ YMM15[MAX:128] := 0
+FI
+ </operation>
+ <instruction name="VZEROUPPER" xed="VZEROUPPER"/>
+ <header>immintrin.h</header>
+</intrinsic>
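+<!-- Illustrative C sketch, not part of the upstream Intel data: the
+     conventional reason to call _mm256_zeroupper is to avoid AVX-to-SSE
+     transition penalties before executing legacy SSE encodings.
+#include <immintrin.h>
+void avx_kernel(void) {
+    /* ... 256-bit AVX work ... */
+    _mm256_zeroupper();   /* dirty upper state cleared before any SSE code runs */
+}
+-->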
+<intrinsic tech="AVX" name="_mm256_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], b[1:0])
+dst[63:32] := SELECT4(a[127:0], b[33:32])
+dst[95:64] := SELECT4(a[127:0], b[65:64])
+dst[127:96] := SELECT4(a[127:0], b[97:96])
+dst[159:128] := SELECT4(a[255:128], b[129:128])
+dst[191:160] := SELECT4(a[255:128], b[161:160])
+dst[223:192] := SELECT4(a[255:128], b[193:192])
+dst[255:224] := SELECT4(a[255:128], b[225:224])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="ymm, ymm, ymm" xed="VPERMILPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], b[1:0])
+dst[63:32] := SELECT4(a[127:0], b[33:32])
+dst[95:64] := SELECT4(a[127:0], b[65:64])
+dst[127:96] := SELECT4(a[127:0], b[97:96])
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="xmm, xmm, xmm" xed="VPERMILPS_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="ymm, ymm, imm8" xed="VPERMILPS_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="xmm, xmm, imm8" xed="VPERMILPS_XMMdq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
+ <operation>
+IF (b[1] == 0) dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) dst[127:64] := a[127:64]; FI
+IF (b[129] == 0) dst[191:128] := a[191:128]; FI
+IF (b[129] == 1) dst[191:128] := a[255:192]; FI
+IF (b[193] == 0) dst[255:192] := a[191:128]; FI
+IF (b[193] == 1) dst[255:192] := a[255:192]; FI
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="ymm, ymm, ymm" xed="VPERMILPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst".</description>
+ <operation>
+IF (b[1] == 0) dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) dst[127:64] := a[127:64]; FI
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="xmm, xmm, xmm" xed="VPERMILPD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI
+IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI
+IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI
+IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI
+IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="ymm, ymm, imm8" xed="VPERMILPD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="xmm, xmm, imm8" xed="VPERMILPD_XMMdq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_permute2f128_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src1, src2, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src1[127:0]
+ 1: tmp[127:0] := src1[255:128]
+ 2: tmp[127:0] := src2[127:0]
+ 3: tmp[127:0] := src2[255:128]
+ ESAC
+ IF control[3]
+ tmp[127:0] := 0
+ FI
+ RETURN tmp[127:0]
+}
+dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
+dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERM2F128" form="ymm, ymm, ymm, imm8" xed="VPERM2F128_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_permute2f128_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src1, src2, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src1[127:0]
+ 1: tmp[127:0] := src1[255:128]
+ 2: tmp[127:0] := src2[127:0]
+ 3: tmp[127:0] := src2[255:128]
+ ESAC
+ IF control[3]
+ tmp[127:0] := 0
+ FI
+ RETURN tmp[127:0]
+}
+dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
+dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERM2F128" form="ymm, ymm, ymm, imm8" xed="VPERM2F128_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_permute2f128_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src1, src2, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src1[127:0]
+ 1: tmp[127:0] := src1[255:128]
+ 2: tmp[127:0] := src2[127:0]
+ 3: tmp[127:0] := src2[255:128]
+ ESAC
+ IF control[3]
+ tmp[127:0] := 0
+ FI
+ RETURN tmp[127:0]
+}
+dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
+dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERM2F128" form="ymm, ymm, ymm, imm8" xed="VPERM2F128_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
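+<!-- Editor's note: illustrative sketch, not upstream data. Per SELECT4 above,
+     each imm8 nibble selects either 128-bit half of either source, and nibble
+     bit 3 zeroes the lane outright:
+
+     __m256d swapped = _mm256_permute2f128_pd(a, a, 0x01);    /* {a.high, a.low} */
+     __m256i hi_zero = _mm256_permute2f128_si256(v, v, 0x81); /* {v.high, zero} */
+-->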
+<intrinsic tech="AVX" name="_mm256_broadcast_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float const *" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <description>Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst".</description>
+ <operation>
+tmp[31:0] := MEM[mem_addr+31:mem_addr]
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := tmp[31:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="ymm, m32" xed="VBROADCASTSS_YMMqq_MEMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_broadcast_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const *" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <description>Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst".</description>
+ <operation>
+tmp[31:0] := MEM[mem_addr+31:mem_addr]
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := tmp[31:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="xmm, m32" xed="VBROADCASTSS_XMMdq_MEMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_broadcast_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double const *" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst".</description>
+ <operation>
+tmp[63:0] := MEM[mem_addr+63:mem_addr]
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := tmp[63:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTSD" form="ymm, m64" xed="VBROADCASTSD_YMMqq_MEMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_broadcast_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128 const *" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst".</description>
+ <operation>
+tmp[127:0] := MEM[mem_addr+127:mem_addr]
+dst[127:0] := tmp[127:0]
+dst[255:128] := tmp[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF128" form="ymm, m128" xed="VBROADCASTF128_YMMqq_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_broadcast_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d const *" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst".</description>
+ <operation>
+tmp[127:0] := MEM[mem_addr+127:mem_addr]
+dst[127:0] := tmp[127:0]
+dst[255:128] := tmp[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF128" form="ymm, m128" xed="VBROADCASTF128_YMMqq_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
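+<!-- Editor's note: illustrative sketch, not upstream data. The broadcast loads
+     replicate one memory operand across every lane, a common way to splat a
+     scalar coefficient:
+
+     double c = 3.5;
+     __m256d k4 = _mm256_broadcast_sd(&c);       /* {3.5, 3.5, 3.5, 3.5} */
+     __m128d pair = _mm_set_pd(2.0, 1.0);        /* {1.0 (low), 2.0 (high)} */
+     __m256d both = _mm256_broadcast_pd(&pair);  /* {1.0, 2.0, 1.0, 2.0} */
+-->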
+<intrinsic tech="AVX" name="_mm256_insertf128_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_insertf128_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[255:0] := a[255:0]
+CASE imm8[0] OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_insertf128_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
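+<!-- Editor's note: illustrative sketch, not upstream data. A common idiom builds
+     a 256-bit vector from two 128-bit halves: the cast leaves the upper lane
+     undefined, and the insert then overwrites it:
+
+     __m128 lo = _mm_set1_ps(1.0f);
+     __m128 hi = _mm_set1_ps(2.0f);
+     __m256 v  = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
+-->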
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_insert_epi8">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__int8" varname="i" etype="UI8"/>
+ <parameter type="const int" varname="index" etype="IMM" immwidth="5"/>
+ <description>Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the location specified by "index".</description>
+ <operation>
+dst[255:0] := a[255:0]
+sel := index[4:0]*8
+dst[sel+7:sel] := i[7:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_insert_epi16">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__int16" varname="i" etype="UI16"/>
+ <parameter type="const int" varname="index" etype="IMM" immwidth="4"/>
+ <description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "index".</description>
+ <operation>
+dst[255:0] := a[255:0]
+sel := index[3:0]*16
+dst[sel+15:sel] := i[15:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_insert_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__int32" varname="i" etype="UI32"/>
+ <parameter type="const int" varname="index" etype="IMM" immwidth="3"/>
+ <description>Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "index".</description>
+ <operation>
+dst[255:0] := a[255:0]
+sel := index[2:0]*32
+dst[sel+31:sel] := i[31:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_insert_epi64">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__int64" varname="i" etype="UI64"/>
+ <parameter type="const int" varname="index" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "index".</description>
+ <operation>
+dst[255:0] := a[255:0]
+sel := index[1:0]*64
+dst[sel+63:sel] := i[63:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
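+<!-- Editor's note: illustrative sketch, not upstream data. The insert_epi*
+     forms carry sequence="TRUE": they have no single-instruction encoding and
+     compile to a short instruction sequence. For example:
+
+     __m256i v = _mm256_setzero_si256();
+     v = _mm256_insert_epi32(v, 42, 5);  /* element 5 (bits 191:160) := 42 */
+-->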
+<intrinsic tech="AVX" name="_mm256_load_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double const *" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <description>Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="ymm, m256" xed="VMOVAPD_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_store_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double *" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVAPD" form="m256, ymm" xed="VMOVAPD_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_load_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float const *" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <description>Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="ymm, m256" xed="VMOVAPS_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_store_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float *" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVAPS" form="m256, ymm" xed="VMOVAPS_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double const *" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <description>Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVUPD" form="ymm, m256" xed="VMOVUPD_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_storeu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double *" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVUPD" form="m256, ymm" xed="VMOVUPD_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float const *" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <description>Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVUPS" form="ymm, m256" xed="VMOVUPS_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_storeu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float *" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVUPS" form="m256, ymm" xed="VMOVUPS_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_load_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i const *" varname="mem_addr" etype="M256" memwidth="256"/>
+ <description>Load 256-bits of integer data from memory into "dst".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA" form="ymm, m256" xed="VMOVDQA_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_store_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m256i *" varname="mem_addr" etype="M256" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <description>Store 256-bits of integer data from "a" into memory.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVDQA" form="m256, ymm" xed="VMOVDQA_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_loadu_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i const *" varname="mem_addr" etype="M256" memwidth="256"/>
+ <description>Load 256-bits of integer data from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU" form="ymm, m256" xed="VMOVDQU_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_storeu_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m256i *" varname="mem_addr" etype="M256" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <description>Store 256-bits of integer data from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVDQU" form="m256, ymm" xed="VMOVDQU_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
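+<!-- Editor's note: illustrative sketch, not upstream data, assuming a C11
+     compiler for alignas (stdalign.h). The aligned forms (VMOVAPD/VMOVAPS/
+     VMOVDQA) require a 32-byte boundary; the *u forms accept any address:
+
+     alignas(32) double buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+     __m256d v = _mm256_load_pd(buf);       /* OK: buf is 32-byte aligned */
+     __m256d w = _mm256_loadu_pd(buf + 1);  /* misaligned by 8 bytes, still valid */
+     _mm256_storeu_pd(buf, v);
+-->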
+<intrinsic tech="AVX" name="_mm256_maskload_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double const *" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF mask[i+63]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMASKMOVPD" form="ymm, ymm, m256" xed="VMASKMOVPD_YMMqq_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_maskstore_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double *" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF mask[i+63]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMASKMOVPD" form="m256, ymm, ymm" xed="VMASKMOVPD_MEMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_maskload_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const *" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF mask[i+63]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMASKMOVPD" form="xmm, xmm, m128" xed="VMASKMOVPD_XMMdq_XMMdq_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_maskstore_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double *" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF mask[i+63]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMASKMOVPD" form="m128, xmm, xmm" xed="VMASKMOVPD_MEMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_maskload_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float const *" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF mask[i+31]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMASKMOVPS" form="ymm, ymm, m256" xed="VMASKMOVPS_YMMqq_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_maskstore_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float *" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF mask[i+31]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMASKMOVPS" form="m256, ymm, ymm" xed="VMASKMOVPS_MEMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_maskload_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const *" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF mask[i+31]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMASKMOVPS" form="xmm, xmm, m128" xed="VMASKMOVPS_XMMdq_XMMdq_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_maskstore_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float *" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF mask[i+31]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMASKMOVPS" form="m128, xmm, xmm" xed="VMASKMOVPS_MEMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
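+<!-- Editor's note: illustrative sketch, not upstream data. Only lanes whose
+     mask element has its high bit set are loaded or stored, so a tail of three
+     doubles can be handled without touching a fourth element:
+
+     double tail[3] = {1.0, 2.0, 3.0};
+     __m256i m = _mm256_set_epi64x(0, -1, -1, -1); /* enable lanes 0..2 */
+     __m256d v = _mm256_maskload_pd(tail, m);      /* lane 3 becomes 0.0 */
+     _mm256_maskstore_pd(tail, m, v);              /* lane 3 is not stored */
+-->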
+<intrinsic tech="AVX" name="_mm256_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Move</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32]
+dst[63:32] := a[63:32]
+dst[95:64] := a[127:96]
+dst[127:96] := a[127:96]
+dst[159:128] := a[191:160]
+dst[191:160] := a[191:160]
+dst[223:192] := a[255:224]
+dst[255:224] := a[255:224]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVSHDUP" form="ymm, ymm" xed="VMOVSHDUP_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Move</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := a[31:0]
+dst[95:64] := a[95:64]
+dst[127:96] := a[95:64]
+dst[159:128] := a[159:128]
+dst[191:160] := a[159:128]
+dst[223:192] := a[223:192]
+dst[255:224] := a[223:192]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVSLDUP" form="ymm, ymm" xed="VMOVSLDUP_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Move</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := a[63:0]
+dst[191:128] := a[191:128]
+dst[255:192] := a[191:128]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDDUP" form="ymm, ymm" xed="VMOVDDUP_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
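+<!-- Editor's note: illustrative sketch, not upstream data. The *dup moves
+     duplicate alternating elements in place, which is handy for interleaved
+     real/imaginary data:
+
+     __m256d re  = _mm256_movedup_pd(z);   /* {z0, z0, z2, z2} */
+     __m256 odd  = _mm256_movehdup_ps(w);  /* {w1,w1,w3,w3,w5,w5,w7,w7} */
+-->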
+<intrinsic tech="AVX" name="_mm256_lddqu_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i const *" varname="mem_addr" etype="M256" memwidth="256"/>
+ <description>Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VLDDQU" form="ymm, m256" xed="VLDDQU_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_stream_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m256i *" varname="mem_addr" etype="M256" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <description>Store 256-bits of integer data from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVNTDQ" form="m256, ymm" xed="VMOVNTDQ_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_stream_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double *" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVNTPD" form="m256, ymm" xed="VMOVNTPD_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_stream_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float *" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVNTPS" form="m256, ymm" xed="VMOVNTPS_MEMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
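+<!-- Editor's note: illustrative sketch, not upstream data, assuming src and dst
+     are 32-byte aligned and n is a multiple of 4. Non-temporal stores bypass
+     the cache hierarchy and are weakly ordered, so a store fence is the usual
+     way to publish the results to other threads:
+
+     for (size_t i = 0; i < n; i += 4)
+         _mm256_stream_pd(dst + i, _mm256_load_pd(src + i));
+     _mm_sfence();  /* order the streaming stores before later flag writes */
+-->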
+<intrinsic tech="AVX" name="_mm256_rcp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+  dst[i+31:i] := APPROXIMATE(1.0 / a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRCPPS" form="ymm, ymm" xed="VRCPPS_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_rsqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+  dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRSQRTPS" form="ymm, ymm" xed="VRSQRTPS_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
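+<!-- Editor's note: illustrative sketch, not upstream data. The ~12-bit rcp
+     estimate is commonly refined with one Newton-Raphson step,
+     x1 = x0*(2 - a*x0), which roughly doubles the number of accurate bits:
+
+     __m256 x0 = _mm256_rcp_ps(a);
+     __m256 x1 = _mm256_mul_ps(x0,
+                 _mm256_sub_ps(_mm256_set1_ps(2.0f), _mm256_mul_ps(a, x0)));
+-->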
+<intrinsic tech="AVX" name="_mm256_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="ymm, ymm" xed="VSQRTPD_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="ymm, ymm" xed="VSQRTPS_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" immtype="_MM_FROUND"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ROUND(a[i+63:i], rounding)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VROUNDPD" form="ymm, ymm, imm8" xed="VROUNDPD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" immtype="_MM_FROUND"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ROUND(a[i+31:i], rounding)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VROUNDPS" form="ymm, ymm, imm8" xed="VROUNDPS_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
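+<!-- Editor's note: illustrative sketch, not upstream data. The rounding
+     parameter is built from _MM_FROUND_* flags; suppressing the precision
+     exception is typical:
+
+     __m256d near = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+     __m256d down = _mm256_round_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); /* floor */
+-->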
+<intrinsic tech="AVX" name="_mm256_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKHPD" form="ymm, ymm, ymm" xed="VUNPCKHPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKHPS" form="ymm, ymm, ymm" xed="VUNPCKHPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKLPD" form="ymm, ymm, ymm" xed="VUNPCKLPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKLPS" form="ymm, ymm, ymm" xed="VUNPCKLPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
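+<!-- Editor's note: illustrative sketch, not upstream data. Note the interleave
+     is per 128-bit lane, not across the whole register:
+
+     /* a = {a0,a1,a2,a3}, b = {b0,b1,b2,b3} */
+     __m256d lo = _mm256_unpacklo_pd(a, b);  /* {a0, b0, a2, b2} */
+     __m256d hi = _mm256_unpackhi_pd(a, b);  /* {a1, b1, a3, b3} */
+-->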
+<intrinsic tech="AVX" name="_mm256_testz_si256">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+ <operation>
+IF ((a[255:0] AND b[255:0]) == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+IF (((NOT a[255:0]) AND b[255:0]) == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+RETURN ZF
+ </operation>
+ <instruction name="VPTEST" form="ymm, ymm" xed="VPTEST_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_testc_si256">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+ <operation>
+IF ((a[255:0] AND b[255:0]) == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+IF (((NOT a[255:0]) AND b[255:0]) == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+RETURN CF
+ </operation>
+ <instruction name="VPTEST" form="ymm, ymm" xed="VPTEST_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_testnzc_si256">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+ <operation>
+IF ((a[255:0] AND b[255:0]) == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+IF (((NOT a[255:0]) AND b[255:0]) == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="VPTEST" form="ymm, ymm" xed="VPTEST_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
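+<!-- Editor's note: illustrative sketch, not upstream data. The ZF/CF pair
+     gives cheap whole-vector predicates:
+
+     if (_mm256_testz_si256(v, v))       { /* v is all-zero */ }
+     if (_mm256_testc_si256(mask, bits)) { /* every set bit of bits is also set in mask */ }
+-->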
+<intrinsic tech="AVX" name="_mm256_testz_pd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+ <operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := ZF
+ </operation>
+ <instruction name="VTESTPD" form="ymm, ymm" xed="VTESTPD_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_testc_pd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+ <operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := CF
+ </operation>
+ <instruction name="VTESTPD" form="ymm, ymm" xed="VTESTPD_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_testnzc_pd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+ <operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="VTESTPD" form="ymm, ymm" xed="VTESTPD_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_testz_pd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+ <operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := ZF
+ </operation>
+ <instruction name="VTESTPD" form="xmm, xmm" xed="VTESTPD_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_testc_pd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+ <operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := CF
+ </operation>
+ <instruction name="VTESTPD" form="xmm, xmm" xed="VTESTPD_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_testnzc_pd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+ <operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="VTESTPD" form="xmm, xmm" xed="VTESTPD_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_testz_ps">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+ <operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; \
+ tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; \
+ tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := ZF
+ </operation>
+ <instruction name="VTESTPS" form="ymm, ymm" xed="VTESTPS_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_testc_ps">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+ <operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; \
+ tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; \
+ tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := CF
+ </operation>
+ <instruction name="VTESTPS" form="ymm, ymm" xed="VTESTPS_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_testnzc_ps">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+ <operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; \
+ tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; \
+ tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="VTESTPS" form="ymm, ymm" xed="VTESTPS_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_testz_ps">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+ <operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := ZF
+ </operation>
+ <instruction name="VTESTPS" form="xmm, xmm" xed="VTESTPS_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_testc_ps">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+ <operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := CF
+ </operation>
+ <instruction name="VTESTPS" form="xmm, xmm" xed="VTESTPS_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm_testnzc_ps">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+ <operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="VTESTPS" form="xmm, xmm" xed="VTESTPS_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
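+<!-- Usage sketch (illustrative, not part of the generated data): the three VTESTPS
+     intrinsics above test only the four sign bits. Assuming a C compiler with AVX
+     enabled (e.g. -mavx) and immintrin.h included:
+
+       __m128 neg = _mm_set1_ps(-1.0f);     /* all sign bits set   */
+       __m128 pos = _mm_set1_ps(+1.0f);     /* all sign bits clear */
+       int z   = _mm_testz_ps(pos, pos);    /* 1: pos AND pos has no sign bits set      */
+       int c   = _mm_testc_ps(neg, pos);    /* 1: NOT(neg) AND pos has no sign bits set */
+       int nzc = _mm_testnzc_ps(neg, neg);  /* 0: ZF is 0 here, but CF is 1             */
+-->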
+<intrinsic tech="AVX" name="_mm256_movemask_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF a[i+63]
+ dst[j] := 1
+ ELSE
+ dst[j] := 0
+ FI
+ENDFOR
+dst[MAX:4] := 0
+ </operation>
+ <instruction name="VMOVMSKPD" form="r32, ymm" xed="VMOVMSKPD_GPR32d_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_movemask_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF a[i+31]
+ dst[j] := 1
+ ELSE
+ dst[j] := 0
+ FI
+ENDFOR
+dst[MAX:8] := 0
+ </operation>
+ <instruction name="VMOVMSKPS" form="r32, ymm" xed="VMOVMSKPS_GPR32d_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
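+<!-- Usage sketch (illustrative): movemask packs one sign bit per element into the
+     low bits of the result, element 0 in bit 0. Assuming AVX and immintrin.h:
+
+       __m256 v = _mm256_set_ps(-1, 1, 1, 1, 1, 1, 1, -1);  /* arguments are e7..e0 */
+       int m = _mm256_movemask_ps(v);          /* 0x81: elements 0 and 7 are negative */
+-->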
+<intrinsic tech="AVX" name="_mm256_setzero_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m256d with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="VXORPD" form="ymm, ymm, ymm" xed="VXORPD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_setzero_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m256 with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="VXORPS" form="ymm, ymm, ymm" xed="VXORPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_setzero_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m256i with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="VPXOR" form="ymm, ymm, ymm" xed="VPXOR_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
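+<!-- Usage sketch (illustrative): the setzero intrinsics typically lower to the
+     dependency-breaking self-XOR forms listed above rather than a load:
+
+       __m256d zd = _mm256_setzero_pd();     /* VXORPD ymm, ymm, ymm */
+       __m256i zi = _mm256_setzero_si256();  /* VPXOR ymm, ymm, ymm  */
+-->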
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="e3" etype="FP64"/>
+ <parameter type="double" varname="e2" etype="FP64"/>
+ <parameter type="double" varname="e1" etype="FP64"/>
+ <parameter type="double" varname="e0" etype="FP64"/>
+ <description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.</description>
+ <operation>
+dst[63:0] := e0
+dst[127:64] := e1
+dst[191:128] := e2
+dst[255:192] := e3
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="e7" etype="FP32"/>
+ <parameter type="float" varname="e6" etype="FP32"/>
+ <parameter type="float" varname="e5" etype="FP32"/>
+ <parameter type="float" varname="e4" etype="FP32"/>
+ <parameter type="float" varname="e3" etype="FP32"/>
+ <parameter type="float" varname="e2" etype="FP32"/>
+ <parameter type="float" varname="e1" etype="FP32"/>
+ <parameter type="float" varname="e0" etype="FP32"/>
+ <description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values.</description>
+ <operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+dst[159:128] := e4
+dst[191:160] := e5
+dst[223:192] := e6
+dst[255:224] := e7
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set_epi8">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="char" varname="e31" etype="UI8"/>
+ <parameter type="char" varname="e30" etype="UI8"/>
+ <parameter type="char" varname="e29" etype="UI8"/>
+ <parameter type="char" varname="e28" etype="UI8"/>
+ <parameter type="char" varname="e27" etype="UI8"/>
+ <parameter type="char" varname="e26" etype="UI8"/>
+ <parameter type="char" varname="e25" etype="UI8"/>
+ <parameter type="char" varname="e24" etype="UI8"/>
+ <parameter type="char" varname="e23" etype="UI8"/>
+ <parameter type="char" varname="e22" etype="UI8"/>
+ <parameter type="char" varname="e21" etype="UI8"/>
+ <parameter type="char" varname="e20" etype="UI8"/>
+ <parameter type="char" varname="e19" etype="UI8"/>
+ <parameter type="char" varname="e18" etype="UI8"/>
+ <parameter type="char" varname="e17" etype="UI8"/>
+ <parameter type="char" varname="e16" etype="UI8"/>
+ <parameter type="char" varname="e15" etype="UI8"/>
+ <parameter type="char" varname="e14" etype="UI8"/>
+ <parameter type="char" varname="e13" etype="UI8"/>
+ <parameter type="char" varname="e12" etype="UI8"/>
+ <parameter type="char" varname="e11" etype="UI8"/>
+ <parameter type="char" varname="e10" etype="UI8"/>
+ <parameter type="char" varname="e9" etype="UI8"/>
+ <parameter type="char" varname="e8" etype="UI8"/>
+ <parameter type="char" varname="e7" etype="UI8"/>
+ <parameter type="char" varname="e6" etype="UI8"/>
+ <parameter type="char" varname="e5" etype="UI8"/>
+ <parameter type="char" varname="e4" etype="UI8"/>
+ <parameter type="char" varname="e3" etype="UI8"/>
+ <parameter type="char" varname="e2" etype="UI8"/>
+ <parameter type="char" varname="e1" etype="UI8"/>
+ <parameter type="char" varname="e0" etype="UI8"/>
+ <description>Set packed 8-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[7:0] := e0
+dst[15:8] := e1
+dst[23:16] := e2
+dst[31:24] := e3
+dst[39:32] := e4
+dst[47:40] := e5
+dst[55:48] := e6
+dst[63:56] := e7
+dst[71:64] := e8
+dst[79:72] := e9
+dst[87:80] := e10
+dst[95:88] := e11
+dst[103:96] := e12
+dst[111:104] := e13
+dst[119:112] := e14
+dst[127:120] := e15
+dst[135:128] := e16
+dst[143:136] := e17
+dst[151:144] := e18
+dst[159:152] := e19
+dst[167:160] := e20
+dst[175:168] := e21
+dst[183:176] := e22
+dst[191:184] := e23
+dst[199:192] := e24
+dst[207:200] := e25
+dst[215:208] := e26
+dst[223:216] := e27
+dst[231:224] := e28
+dst[239:232] := e29
+dst[247:240] := e30
+dst[255:248] := e31
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set_epi16">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="short" varname="e15" etype="UI16"/>
+ <parameter type="short" varname="e14" etype="UI16"/>
+ <parameter type="short" varname="e13" etype="UI16"/>
+ <parameter type="short" varname="e12" etype="UI16"/>
+ <parameter type="short" varname="e11" etype="UI16"/>
+ <parameter type="short" varname="e10" etype="UI16"/>
+ <parameter type="short" varname="e9" etype="UI16"/>
+ <parameter type="short" varname="e8" etype="UI16"/>
+ <parameter type="short" varname="e7" etype="UI16"/>
+ <parameter type="short" varname="e6" etype="UI16"/>
+ <parameter type="short" varname="e5" etype="UI16"/>
+ <parameter type="short" varname="e4" etype="UI16"/>
+ <parameter type="short" varname="e3" etype="UI16"/>
+ <parameter type="short" varname="e2" etype="UI16"/>
+ <parameter type="short" varname="e1" etype="UI16"/>
+ <parameter type="short" varname="e0" etype="UI16"/>
+ <description>Set packed 16-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[15:0] := e0
+dst[31:16] := e1
+dst[47:32] := e2
+dst[63:48] := e3
+dst[79:64] := e4
+dst[95:80] := e5
+dst[111:96] := e6
+dst[127:112] := e7
+dst[143:128] := e8
+dst[159:144] := e9
+dst[175:160] := e10
+dst[191:176] := e11
+dst[207:192] := e12
+dst[223:208] := e13
+dst[239:224] := e14
+dst[255:240] := e15
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="e7" etype="UI32"/>
+ <parameter type="int" varname="e6" etype="UI32"/>
+ <parameter type="int" varname="e5" etype="UI32"/>
+ <parameter type="int" varname="e4" etype="UI32"/>
+ <parameter type="int" varname="e3" etype="UI32"/>
+ <parameter type="int" varname="e2" etype="UI32"/>
+ <parameter type="int" varname="e1" etype="UI32"/>
+ <parameter type="int" varname="e0" etype="UI32"/>
+ <description>Set packed 32-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+dst[159:128] := e4
+dst[191:160] := e5
+dst[223:192] := e6
+dst[255:224] := e7
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set_epi64x">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="e3" etype="UI64"/>
+ <parameter type="__int64" varname="e2" etype="UI64"/>
+ <parameter type="__int64" varname="e1" etype="UI64"/>
+ <parameter type="__int64" varname="e0" etype="UI64"/>
+ <description>Set packed 64-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[63:0] := e0
+dst[127:64] := e1
+dst[191:128] := e2
+dst[255:192] := e3
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
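+<!-- Usage sketch (illustrative): the set family takes arguments from the highest
+     element down to e0, and e0 lands in the lowest lane, as a store makes visible:
+
+       int out[8];
+       __m256i v = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+       _mm256_storeu_si256((__m256i*)out, v);   /* out[0] == 0 ... out[7] == 7 */
+-->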
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_setr_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="e3" etype="FP64"/>
+ <parameter type="double" varname="e2" etype="FP64"/>
+ <parameter type="double" varname="e1" etype="FP64"/>
+ <parameter type="double" varname="e0" etype="FP64"/>
+ <description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[63:0] := e3
+dst[127:64] := e2
+dst[191:128] := e1
+dst[255:192] := e0
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_setr_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="e7" etype="FP32"/>
+ <parameter type="float" varname="e6" etype="FP32"/>
+ <parameter type="float" varname="e5" etype="FP32"/>
+ <parameter type="float" varname="e4" etype="FP32"/>
+ <parameter type="float" varname="e3" etype="FP32"/>
+ <parameter type="float" varname="e2" etype="FP32"/>
+ <parameter type="float" varname="e1" etype="FP32"/>
+ <parameter type="float" varname="e0" etype="FP32"/>
+ <description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[31:0] := e7
+dst[63:32] := e6
+dst[95:64] := e5
+dst[127:96] := e4
+dst[159:128] := e3
+dst[191:160] := e2
+dst[223:192] := e1
+dst[255:224] := e0
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_setr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="char" varname="e31" etype="UI8"/>
+ <parameter type="char" varname="e30" etype="UI8"/>
+ <parameter type="char" varname="e29" etype="UI8"/>
+ <parameter type="char" varname="e28" etype="UI8"/>
+ <parameter type="char" varname="e27" etype="UI8"/>
+ <parameter type="char" varname="e26" etype="UI8"/>
+ <parameter type="char" varname="e25" etype="UI8"/>
+ <parameter type="char" varname="e24" etype="UI8"/>
+ <parameter type="char" varname="e23" etype="UI8"/>
+ <parameter type="char" varname="e22" etype="UI8"/>
+ <parameter type="char" varname="e21" etype="UI8"/>
+ <parameter type="char" varname="e20" etype="UI8"/>
+ <parameter type="char" varname="e19" etype="UI8"/>
+ <parameter type="char" varname="e18" etype="UI8"/>
+ <parameter type="char" varname="e17" etype="UI8"/>
+ <parameter type="char" varname="e16" etype="UI8"/>
+ <parameter type="char" varname="e15" etype="UI8"/>
+ <parameter type="char" varname="e14" etype="UI8"/>
+ <parameter type="char" varname="e13" etype="UI8"/>
+ <parameter type="char" varname="e12" etype="UI8"/>
+ <parameter type="char" varname="e11" etype="UI8"/>
+ <parameter type="char" varname="e10" etype="UI8"/>
+ <parameter type="char" varname="e9" etype="UI8"/>
+ <parameter type="char" varname="e8" etype="UI8"/>
+ <parameter type="char" varname="e7" etype="UI8"/>
+ <parameter type="char" varname="e6" etype="UI8"/>
+ <parameter type="char" varname="e5" etype="UI8"/>
+ <parameter type="char" varname="e4" etype="UI8"/>
+ <parameter type="char" varname="e3" etype="UI8"/>
+ <parameter type="char" varname="e2" etype="UI8"/>
+ <parameter type="char" varname="e1" etype="UI8"/>
+ <parameter type="char" varname="e0" etype="UI8"/>
+ <description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[7:0] := e31
+dst[15:8] := e30
+dst[23:16] := e29
+dst[31:24] := e28
+dst[39:32] := e27
+dst[47:40] := e26
+dst[55:48] := e25
+dst[63:56] := e24
+dst[71:64] := e23
+dst[79:72] := e22
+dst[87:80] := e21
+dst[95:88] := e20
+dst[103:96] := e19
+dst[111:104] := e18
+dst[119:112] := e17
+dst[127:120] := e16
+dst[135:128] := e15
+dst[143:136] := e14
+dst[151:144] := e13
+dst[159:152] := e12
+dst[167:160] := e11
+dst[175:168] := e10
+dst[183:176] := e9
+dst[191:184] := e8
+dst[199:192] := e7
+dst[207:200] := e6
+dst[215:208] := e5
+dst[223:216] := e4
+dst[231:224] := e3
+dst[239:232] := e2
+dst[247:240] := e1
+dst[255:248] := e0
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_setr_epi16">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="short" varname="e15" etype="UI16"/>
+ <parameter type="short" varname="e14" etype="UI16"/>
+ <parameter type="short" varname="e13" etype="UI16"/>
+ <parameter type="short" varname="e12" etype="UI16"/>
+ <parameter type="short" varname="e11" etype="UI16"/>
+ <parameter type="short" varname="e10" etype="UI16"/>
+ <parameter type="short" varname="e9" etype="UI16"/>
+ <parameter type="short" varname="e8" etype="UI16"/>
+ <parameter type="short" varname="e7" etype="UI16"/>
+ <parameter type="short" varname="e6" etype="UI16"/>
+ <parameter type="short" varname="e5" etype="UI16"/>
+ <parameter type="short" varname="e4" etype="UI16"/>
+ <parameter type="short" varname="e3" etype="UI16"/>
+ <parameter type="short" varname="e2" etype="UI16"/>
+ <parameter type="short" varname="e1" etype="UI16"/>
+ <parameter type="short" varname="e0" etype="UI16"/>
+ <description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[15:0] := e15
+dst[31:16] := e14
+dst[47:32] := e13
+dst[63:48] := e12
+dst[79:64] := e11
+dst[95:80] := e10
+dst[111:96] := e9
+dst[127:112] := e8
+dst[143:128] := e7
+dst[159:144] := e6
+dst[175:160] := e5
+dst[191:176] := e4
+dst[207:192] := e3
+dst[223:208] := e2
+dst[239:224] := e1
+dst[255:240] := e0
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_setr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="e7" etype="UI32"/>
+ <parameter type="int" varname="e6" etype="UI32"/>
+ <parameter type="int" varname="e5" etype="UI32"/>
+ <parameter type="int" varname="e4" etype="UI32"/>
+ <parameter type="int" varname="e3" etype="UI32"/>
+ <parameter type="int" varname="e2" etype="UI32"/>
+ <parameter type="int" varname="e1" etype="UI32"/>
+ <parameter type="int" varname="e0" etype="UI32"/>
+ <description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[31:0] := e7
+dst[63:32] := e6
+dst[95:64] := e5
+dst[127:96] := e4
+dst[159:128] := e3
+dst[191:160] := e2
+dst[223:192] := e1
+dst[255:224] := e0
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_setr_epi64x">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="e3" etype="UI64"/>
+ <parameter type="__int64" varname="e2" etype="UI64"/>
+ <parameter type="__int64" varname="e1" etype="UI64"/>
+ <parameter type="__int64" varname="e0" etype="UI64"/>
+ <description>Set packed 64-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[63:0] := e3
+dst[127:64] := e2
+dst[191:128] := e1
+dst[255:192] := e0
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
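+<!-- Usage sketch (illustrative): setr takes the same arguments in memory order, so
+     the first argument lands in the lowest lane; these two calls produce equal values:
+
+       __m256i a = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
+       __m256i b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+-->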
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set1_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="a" etype="FP64"/>
+ <description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set1_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+ <description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set1_epi8">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb" instruction.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set1_epi16">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastw" instruction.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set1_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd" instruction.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_set1_epi64x">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="long long" varname="a" etype="UI64"/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq" instruction.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
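+<!-- Usage sketch (illustrative): set1 broadcasts one scalar to every element; with
+     plain AVX the compiler emits a shuffle sequence, and where AVX2 is available it
+     may use the vpbroadcast* instructions noted above:
+
+       __m256i ones = _mm256_set1_epi8(1);         /* 32 copies of 0x01 */
+       __m256  pi   = _mm256_set1_ps(3.14159265f); /* 8 copies of pi    */
+-->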
+<intrinsic tech="AVX" name="_mm256_castpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Cast vector of type __m256d to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Cast vector of type __m256 to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castps_si256">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castpd_si256">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castsi256_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Cast vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castsi256_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Cast vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castps256_ps128">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castpd256_pd128">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castsi256_si128">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <description>Cast vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castps128_ps256">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castpd128_pd256">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_castsi128_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m128i" varname="a" etype="M256"/>
+ <description>Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_zextps128_ps256">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m128 to type __m256; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_zextpd128_pd256">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_zextsi128_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Cast</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m128i" varname="a" etype="M256"/>
+ <description>Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
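+<!-- Usage sketch (illustrative): both cast and zext reinterpret bits for free, but
+     they treat bits [255:128] differently, which matters if the full 256-bit value
+     is read later:
+
+       __m128 lo = _mm_set1_ps(1.0f);
+       __m256 u  = _mm256_castps128_ps256(lo);   /* upper 128 bits undefined */
+       __m256 z  = _mm256_zextps128_ps256(lo);   /* upper 128 bits zeroed    */
+-->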
+<intrinsic tech="AVX" name="_mm256_floor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VROUNDPS" form="ymm, ymm, imm8" xed="VROUNDPS_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_ceil_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VROUNDPS" form="ymm, ymm, imm8" xed="VROUNDPS_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_floor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VROUNDPD" form="ymm, ymm, imm8" xed="VROUNDPD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_ceil_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VROUNDPD" form="ymm, ymm, imm8" xed="VROUNDPD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
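+<!-- Usage sketch (illustrative): floor and ceil are fixed-rounding-mode forms of the
+     same VROUNDPS/VROUNDPD instruction:
+
+       __m256 v = _mm256_set1_ps(1.5f);
+       __m256 f = _mm256_floor_ps(v);   /* 1.0f in every element */
+       __m256 c = _mm256_ceil_ps(v);    /* 2.0f in every element */
+-->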
+<intrinsic tech="AVX" name="_mm256_undefined_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>General Support</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m256 with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_undefined_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>General Support</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m256d with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_undefined_si256">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>General Support</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m256i with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_set_m128">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="hi" etype="FP32"/>
+ <parameter type="__m128" varname="lo" etype="FP32"/>
+ <description>Set packed __m256 vector "dst" with the supplied values.</description>
+ <operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_set_m128d">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="hi" etype="FP64"/>
+ <parameter type="__m128d" varname="lo" etype="FP64"/>
+ <description>Set packed __m256d vector "dst" with the supplied values.</description>
+ <operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_set_m128i">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="hi" etype="M128"/>
+ <parameter type="__m128i" varname="lo" etype="M128"/>
+ <description>Set packed __m256i vector "dst" with the supplied values.</description>
+ <operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_setr_m128">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="lo" etype="FP32"/>
+ <parameter type="__m128" varname="hi" etype="FP32"/>
+ <description>Set packed __m256 vector "dst" with the supplied values.</description>
+ <operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_setr_m128d">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="lo" etype="FP64"/>
+ <parameter type="__m128d" varname="hi" etype="FP64"/>
+ <description>Set packed __m256d vector "dst" with the supplied values.</description>
+ <operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" name="_mm256_setr_m128i">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="lo" etype="M128"/>
+ <parameter type="__m128i" varname="hi" etype="M128"/>
+ <description>Set packed __m256i vector "dst" with the supplied values.</description>
+ <operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF128" form="ymm, ymm, xmm, imm8" xed="VINSERTF128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
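+<!-- Usage sketch (illustrative): set_m128* and setr_m128* both concatenate two
+     128-bit halves; they differ only in argument order (hi, lo versus lo, hi):
+
+       __m128 lo = _mm_set1_ps(1.0f), hi = _mm_set1_ps(2.0f);
+       __m256 a  = _mm256_set_m128 (hi, lo);
+       __m256 b  = _mm256_setr_m128(lo, hi);   /* same value as a */
+-->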
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_loadu2_m128">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="hiaddr" etype="FP32" memwidth="128"/>
+ <parameter type="float const*" varname="loaddr" etype="FP32" memwidth="128"/>
+ <description>Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst".
+ "hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[loaddr+127:loaddr]
+dst[255:128] := MEM[hiaddr+127:hiaddr]
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_loadu2_m128d">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="hiaddr" etype="FP64" memwidth="128"/>
+ <parameter type="double const*" varname="loaddr" etype="FP64" memwidth="128"/>
+ <description>Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst".
+ "hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[loaddr+127:loaddr]
+dst[255:128] := MEM[hiaddr+127:hiaddr]
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_loadu2_m128i">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m128i const*" varname="hiaddr" etype="M128" memwidth="128"/>
+ <parameter type="__m128i const*" varname="loaddr" etype="M128" memwidth="128"/>
+ <description>Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in "dst".
+ "hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[loaddr+127:loaddr]
+dst[255:128] := MEM[hiaddr+127:hiaddr]
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_storeu2_m128">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="hiaddr" etype="FP32" memwidth="128"/>
+ <parameter type="float*" varname="loaddr" etype="FP32" memwidth="128"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory at two different 128-bit locations.
+ "hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[loaddr+127:loaddr] := a[127:0]
+MEM[hiaddr+127:hiaddr] := a[255:128]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_storeu2_m128d">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="hiaddr" etype="FP64" memwidth="128"/>
+ <parameter type="double*" varname="loaddr" etype="FP64" memwidth="128"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory at two different 128-bit locations.
+ "hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[loaddr+127:loaddr] := a[127:0]
+MEM[hiaddr+127:hiaddr] := a[255:128]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" sequence="TRUE" name="_mm256_storeu2_m128i">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m128i*" varname="hiaddr" etype="M128" memwidth="128"/>
+ <parameter type="__m128i*" varname="loaddr" etype="M128" memwidth="128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+	<description>Store the high and low 128-bit halves (each composed of integer data) from "a" into memory at two different 128-bit locations.
+ "hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[loaddr+127:loaddr] := a[127:0]
+MEM[hiaddr+127:hiaddr] := a[255:128]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
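+<!-- Usage sketch (illustrative): the loadu2/storeu2 sequences move the two unaligned
+     128-bit halves independently, e.g. to split and rebuild a 256-bit value:
+
+       float lo[4], hi[4];
+       __m256 v = _mm256_set1_ps(1.0f);
+       _mm256_storeu2_m128(hi, lo, v);          /* hi gets v[255:128], lo gets v[127:0] */
+       __m256 w = _mm256_loadu2_m128(hi, lo);   /* recombines the halves */
+-->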
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_acos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ACOS(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_acos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ACOS(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
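+<!-- Note: SVML intrinsics such as the two above are emitted as call sequences into a
+     vector math library, not as single instructions; they are provided by the Intel
+     compilers (and a few other toolchains), so availability is toolchain-specific.
+     A scalar C reference for the ACOS operation above, using math.h:
+
+       #include <math.h>
+       void acos_pd_ref(double dst[4], const double a[4]) {
+           for (int j = 0; j < 4; j++)
+               dst[j] = acos(a[j]);   /* result in radians */
+       }
+-->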
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_acosh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ACOSH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_acosh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ACOSH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_asin_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ASIN(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_asin_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ASIN(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_asinh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ASINH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_asinh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ASINH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_atan_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ATAN(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_atan_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ATAN(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_atan2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_atan2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_atanh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ATANH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_atanh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ATANH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cbrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := CubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cbrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := CubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cdfnorm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := CDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cdfnorm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := CDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cdfnorminv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := InverseCDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cdfnorminv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := InverseCDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]".</description>
+ <operation>
+DEFINE CEXP(a[31:0], b[31:0]) {
+ result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0])
+ result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0])
+ RETURN result
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
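+<!-- Usage sketch (illustrative): each __m256 here holds four complex numbers stored
+     as interleaved (re, im) float pairs. A scalar C reference for the CEXP operation
+     above, using C99 complex.h:
+
+       #include <complex.h>
+       void cexp_ps_ref(float dst[8], const float a[8]) {
+           for (int j = 0; j < 4; j++) {
+               float complex z = cexpf(a[2*j] + a[2*j + 1] * I);
+               dst[2*j]     = crealf(z);   /* e^re * cos(im) */
+               dst[2*j + 1] = cimagf(z);   /* e^re * sin(im) */
+           }
+       }
+-->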
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_clog_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]".</description>
+ <operation>
+DEFINE CLOG(a[31:0], b[31:0]) {
+ result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0)))
+ result[63:32] := ATAN2(b, a)
+ RETURN result
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cosd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := COSD(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cosd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := COSD(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cosh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := COSH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_cosh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := COSH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_csqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]".</description>
+ <operation>
+DEFINE CSQRT(a[31:0], b[31:0]) {
+ sign[31:0] := (b &lt; 0.0) ? -FP32(1.0) : FP32(1.0)
+ result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0)
+ result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0)
+ RETURN result
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
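+<!-- Scalar C99 model of CSQRT (illustrative sketch; helper name ours): the
+     principal square root, with the imaginary part taking the sign of im,
+     as in the DEFINE above.
+#include <math.h>
+static void csqrt_lane(const float in[2], float out[2]) {
+    float m = hypotf(in[0], in[1]);                        /* |z|                  */
+    out[0] = sqrtf((m + in[0]) * 0.5f);                    /* sqrt((|z| + re) / 2) */
+    out[1] = copysignf(sqrtf((m - in[0]) * 0.5f), in[1]);  /* sign follows im      */
+}
+-->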
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_div_epi8">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 8*j
+ IF b[i+7:i] == 0
+ #DE
+ FI
+ dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
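+<!-- Scalar model of one element (illustrative; helper name ours): C's integer
+     "/" already truncates toward zero, matching Truncate8, and the #DE above
+     is the divide-error fault a zero divisor raises in the emitted sequence.
+static signed char div_lane(signed char a, signed char b) {
+    return (signed char)(a / b);  /* caller guarantees b != 0 */
+}
+-->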
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_div_epi16">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ IF b[i+15:i] == 0
+ #DE
+ FI
+ dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_div_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF b[i+31:i] == 0
+ #DE
+ FI
+ dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_div_epi64">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ IF b[i+63:i] == 0
+ #DE
+ FI
+ dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_div_epu8">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 8*j
+ IF b[i+7:i] == 0
+ #DE
+ FI
+ dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_div_epu16">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ IF b[i+15:i] == 0
+ #DE
+ FI
+ dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_div_epu32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF b[i+31:i] == 0
+ #DE
+ FI
+ dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_div_epu64">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ IF b[i+63:i] == 0
+ #DE
+ FI
+ dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_erf_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ERF(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_erf_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ERF(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_erfc_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := 1.0 - ERF(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_erfc_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
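+<!-- Hedged usage sketch (assumes a compiler shipping the SVML intrinsics,
+     e.g. Intel's; the helper name and constant are ours): the upper-tail
+     probability of a standard normal variable is erfc(x / sqrt(2)) / 2.
+#include <immintrin.h>
+static __m256 normal_upper_tail(__m256 x) {
+    __m256 scaled = _mm256_mul_ps(x, _mm256_set1_ps(0.70710678f)); /* x/sqrt(2) */
+    return _mm256_mul_ps(_mm256_erfc_ps(scaled), _mm256_set1_ps(0.5f));
+}
+-->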
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_erfcinv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+	dst[i+63:i] := InverseERFC(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_erfcinv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+	dst[i+31:i] := InverseERFC(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_erfinv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+	dst[i+63:i] := InverseERF(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_erfinv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+	dst[i+31:i] := InverseERF(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_exp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := POW(e, a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_exp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := POW(FP32(e), a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_exp10_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := POW(10.0, a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_exp10_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := POW(FP32(10.0), a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_exp2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_exp2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_expm1_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := POW(e, a[i+63:i]) - 1.0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_expm1_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
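+<!-- Why "expm1" exists (illustrative note; helper name and constant ours):
+     for small x, exp(x) - 1.0 cancels nearly all significant bits, while
+     expm1 keeps them. A scalar C99 model:
+#include <math.h>
+static float expm1_demo(void) {
+    float x = 1.0e-8f;
+    float naive = expf(x) - 1.0f;  /* expf(x) rounds to 1.0f, so this is 0.0f */
+    float exact = expm1f(x);       /* ~1.0e-8f, cancellation avoided          */
+    return exact - naive;
+}
+-->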
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_hypot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0))
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_hypot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0))
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
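+<!-- Hedged usage sketch (assumes an SVML-providing compiler; helper name ours):
+     per-lane magnitudes of eight 2-D points stored as separate x/y arrays.
+#include <immintrin.h>
+static __m256 magnitudes(const float *x, const float *y) {
+    __m256 vx = _mm256_loadu_ps(x);   /* eight x coordinates          */
+    __m256 vy = _mm256_loadu_ps(y);   /* eight y coordinates          */
+    return _mm256_hypot_ps(vx, vy);   /* sqrt(x*x + y*y) in each lane */
+}
+-->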
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_idiv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_idivrem_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i *" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr".</description>
+ <operation>FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
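+<!-- Hedged usage sketch (SVML-providing compiler assumed; helper name ours):
+     one call returns the quotients and writes the remainders through the
+     pointer, e.g. turning flat indices into (row, column) pairs.
+#include <immintrin.h>
+static void split_indices(const int idx[8], int row[8], int col[8], int width) {
+    __m256i v = _mm256_loadu_si256((const __m256i *)idx);
+    __m256i rem;                                        /* remainders land here */
+    __m256i quo = _mm256_idivrem_epi32(&rem, v, _mm256_set1_epi32(width));
+    _mm256_storeu_si256((__m256i *)row, quo);           /* idx / width */
+    _mm256_storeu_si256((__m256i *)col, rem);           /* idx % width */
+}
+-->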
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_invcbrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := InvCubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_invcbrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := InvCubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_invsqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := InvSQRT(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_invsqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := InvSQRT(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_irem_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_log_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_log_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_log10_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_log10_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_log1p_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := LOG(1.0 + a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_log1p_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := LOG(1.0 + a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_log2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_log2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_logb_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_logb_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
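+<!-- Scalar C99 model of ConvertExpFP32 (illustrative; helper name ours): the
+     unbiased binary exponent, i.e. floor(log2(|x|)), which C99 spells logbf.
+#include <math.h>
+static float logb_lane(float x) {
+    return logbf(x);  /* logbf(10.0f) == 3.0f, since 2^3 <= 10 < 2^4 */
+}
+-->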
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_pow_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := POW(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_pow_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := POW(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_rem_epi8">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 31
+ i := 8*j
+ dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_rem_epi16">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := 16*j
+ dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_rem_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_rem_epi64">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := 64*j
+ dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_rem_epu8">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 31
+ i := 8*j
+ dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_rem_epu16">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := 16*j
+ dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_rem_epu32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_rem_epu64">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := 64*j
+ dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_sin_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SIN(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_sin_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := SIN(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_sincos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d *" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SIN(a[i+63:i])
+ MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_sincos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256 *" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := SIN(a[i+31:i])
+ MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
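+<!-- Hedged usage sketch (SVML-providing compiler assumed; helper name ours):
+     sine comes back in the return value, cosine through the pointer, so one
+     call fills both halves of a rotation table.
+#include <immintrin.h>
+static void sin_cos_table(const float angles[8], float s[8], float c[8]) {
+    __m256 cosv;                                    /* cosines land here */
+    __m256 sinv = _mm256_sincos_ps(&cosv, _mm256_loadu_ps(angles));
+    _mm256_storeu_ps(s, sinv);
+    _mm256_storeu_ps(c, cosv);
+}
+-->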
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_sind_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SIND(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_sind_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := SIND(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_sinh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SINH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_sinh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := SINH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_svml_ceil_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_svml_ceil_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_svml_floor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_svml_floor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_svml_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ROUND(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_svml_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ROUND(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_svml_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm256_sqrt_pd".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_svml_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm256_sqrt_ps".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_tan_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := TAN(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_tan_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := TAN(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_tand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := TAND(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_tand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := TAND(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_tanh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := TANH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_tanh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := TANH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_trunc_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := TRUNCATE(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_trunc_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := TRUNCATE(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_udiv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_udivrem_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i *" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr".</description>
+ <operation>FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm256_urem_epi32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" vexEq="TRUE" name="_mm256_cvtss_f32">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Copy the lower single-precision (32-bit) floating-point element of "a" to "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+ </operation>
+ <instruction name="VMOVSS" form="m32, xmm" xed="VMOVSS_MEMd_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" vexEq="TRUE" name="_mm256_cvtsd_f64">
+ <type>Floating Point</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Copy the lower double-precision (64-bit) floating-point element of "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="VMOVSD" form="m64, xmm" xed="VMOVSD_MEMq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX" vexEq="TRUE" name="_mm256_cvtsi256_si32">
+ <type>Integer</type>
+ <CPUID>AVX</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Copy the lower 32-bit integer in "a" to "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+ </operation>
+ <instruction name="VMOVD" form="r32, xmm" xed="VMOVD_GPR32d_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" sequence="TRUE" name="_mm256_extract_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="int" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="const int" varname="index" etype="IMM" immwidth="5"/>
+ <description>Extract an 8-bit integer from "a", selected with "index", and store the result in "dst".</description>
+ <operation>
+dst[7:0] := (a[255:0] &gt;&gt; (index[4:0] * 8))[7:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" sequence="TRUE" name="_mm256_extract_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="int" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="const int" varname="index" etype="IMM" immwidth="4"/>
+ <description>Extract a 16-bit integer from "a", selected with "index", and store the result in "dst".</description>
+ <operation>
+dst[15:0] := (a[255:0] &gt;&gt; (index[3:0] * 16))[15:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
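+<!-- Editor's note: a minimal C sketch, not part of the Intel data, for the two
+     extract entries above. "index" is an immediate, so it must be a compile-time
+     constant; assumes an AVX2-enabled C compiler (e.g. gcc with -mavx2):
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m256i v = _mm256_setr_epi16(100, 101, 102, 103, 104, 105, 106, 107,
+                                    108, 109, 110, 111, 112, 113, 114, 115);
+      printf("%d\n", _mm256_extract_epi16(v, 9));  /* element 9: prints 109 */
+      return 0;
+  }
+-->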
+<intrinsic tech="AVX2" name="_mm256_abs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := ABS(a[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSB" form="ymm, ymm" xed="VPABSB_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_abs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := ABS(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSW" form="ymm, ymm" xed="VPABSW_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_abs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ABS(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSD" form="ymm, ymm" xed="VPABSD_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
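+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data, for the
+     abs entries above. The results are unsigned, so abs(-128) is 128 (0x80),
+     a value with no signed 8-bit representation:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      uint8_t out[32];
+      _mm256_storeu_si256((__m256i *)out,
+                          _mm256_abs_epi8(_mm256_set1_epi8(-128)));
+      printf("0x%02X\n", out[0]);  /* prints 0x80, i.e. 128 */
+      return 0;
+  }
+-->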
+<intrinsic tech="AVX2" name="_mm256_add_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDB" form="ymm, ymm, ymm" xed="VPADDB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_add_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDW" form="ymm, ymm, ymm" xed="VPADDW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDD" form="ymm, ymm, ymm" xed="VPADDD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDQ" form="ymm, ymm, ymm" xed="VPADDQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_adds_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDSB" form="ymm, ymm, ymm" xed="VPADDSB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_adds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDSW" form="ymm, ymm, ymm" xed="VPADDSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_adds_epu8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDUSB" form="ymm, ymm, ymm" xed="VPADDUSB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_adds_epu16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDUSW" form="ymm, ymm, ymm" xed="VPADDUSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
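+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data,
+     contrasting the wrapping adds further above with the saturating adds just
+     above:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      __m256i a = _mm256_set1_epi8((char)250);
+      __m256i b = _mm256_set1_epi8(10);
+      uint8_t wrap[32], sat[32];
+      _mm256_storeu_si256((__m256i *)wrap, _mm256_add_epi8(a, b));  /* 250+10 wraps to 4 */
+      _mm256_storeu_si256((__m256i *)sat, _mm256_adds_epu8(a, b));  /* clamps to 255 */
+      printf("wrap=%u sat=%u\n", wrap[0], sat[0]);
+      return 0;
+  }
+-->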
+<intrinsic tech="AVX2" name="_mm256_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*128
+ tmp[255:0] := ((a[i+127:i] &lt;&lt; 128)[255:0] OR b[i+127:i]) &gt;&gt; (imm8*8)
+ dst[i+127:i] := tmp[127:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPALIGNR" form="ymm, ymm, ymm, imm8" xed="VPALIGNR_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
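+<!-- Editor's note: the per-lane behavior of _mm256_alignr_epi8 above differs from
+     the single 256-bit shift one might expect; each 128-bit half is aligned
+     independently. A hypothetical C sketch, not part of the Intel data:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      uint8_t lo[32], hi[32], out[32];
+      for (int i = 0; i < 32; i++) { lo[i] = (uint8_t)i; hi[i] = (uint8_t)(i + 100); }
+      __m256i a = _mm256_loadu_si256((const __m256i *)hi);
+      __m256i b = _mm256_loadu_si256((const __m256i *)lo);
+      __m256i r = _mm256_alignr_epi8(a, b, 4);
+      _mm256_storeu_si256((__m256i *)out, r);
+      /* Low lane: bytes 4..15 of lo, then bytes 0..3 of hi. */
+      printf("%u %u\n", out[0], out[12]);  /* prints 4 100 */
+      return 0;
+  }
+-->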
+<intrinsic tech="AVX2" name="_mm256_and_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[255:0] := (a[255:0] AND b[255:0])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPAND" form="ymm, ymm, ymm" xed="VPAND_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_andnot_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <description>Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+ <operation>
+dst[255:0] := ((NOT a[255:0]) AND b[255:0])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDN" form="ymm, ymm, ymm" xed="VPANDN_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
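+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data. The
+     operand order of _mm256_andnot_si256 trips people up: the FIRST argument is
+     the one complemented:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      __m256i mask = _mm256_set1_epi32(0x000000FF);
+      __m256i data = _mm256_set1_epi32(0x12345678);
+      __m256i r = _mm256_andnot_si256(mask, data);  /* (~mask) & data */
+      uint32_t out[8];
+      _mm256_storeu_si256((__m256i *)out, r);
+      printf("0x%08X\n", out[0]);  /* prints 0x12345600 */
+      return 0;
+  }
+-->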
+<intrinsic tech="AVX2" name="_mm256_avg_epu8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPAVGB" form="ymm, ymm, ymm" xed="VPAVGB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_avg_epu16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPAVGW" form="ymm, ymm, ymm" xed="VPAVGW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_blend_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Blend packed 16-bit integers from "a" and "b" within 128-bit lanes using control mask "imm8", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF imm8[j%8]
+ dst[i+15:i] := b[i+15:i]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBLENDW" form="ymm, ymm, ymm, imm8" xed="VPBLENDW_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_blend_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF imm8[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBLENDD" form="xmm, xmm, xmm, imm8" xed="VPBLENDD_XMMdq_XMMdq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_blend_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF imm8[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBLENDD" form="ymm, ymm, ymm, imm8" xed="VPBLENDD_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
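+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data, for the
+     immediate blends above: bit j of imm8 set picks element j from "b", clear
+     picks it from "a":
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      __m256i a = _mm256_set1_epi32(0);
+      __m256i b = _mm256_set1_epi32(1);
+      __m256i r = _mm256_blend_epi32(a, b, 0xA5);  /* 0xA5 = 10100101b */
+      int32_t out[8];
+      _mm256_storeu_si256((__m256i *)out, r);
+      for (int j = 0; j < 8; j++) printf("%d ", out[j]);  /* 1 0 1 0 0 1 0 1 */
+      printf("\n");
+      return 0;
+  }
+-->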
+<intrinsic tech="AVX2" name="_mm256_blendv_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <description>Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF mask[i+7]
+ dst[i+7:i] := b[i+7:i]
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBLENDVB" form="ymm, ymm, ymm, ymm" xed="VPBLENDVB_YMMqq_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="xmm, xmm" xed="VPBROADCASTB_XMMdq_XMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="ymm, xmm" xed="VPBROADCASTB_YMMqq_XMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="xmm, xmm" xed="VPBROADCASTD_XMMdq_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="ymm, xmm" xed="VPBROADCASTD_YMMqq_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="xmm, xmm" xed="VPBROADCASTQ_XMMdq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="ymm, xmm" xed="VPBROADCASTQ_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" vexEq="TRUE" name="_mm_broadcastsd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="MOVDDUP" form="xmm, xmm" xed="MOVDDUP_XMMdq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_broadcastsd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTSD" form="ymm, xmm" xed="VBROADCASTSD_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_broadcastsi128_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <description>Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst".</description>
+ <operation>
+dst[127:0] := a[127:0]
+dst[255:128] := a[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI128" form="ymm, m128" xed="VBROADCASTI128_YMMqq_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_broadcastsi128_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <description>Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst".</description>
+ <operation>
+dst[127:0] := a[127:0]
+dst[255:128] := a[127:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI128" form="ymm, m128" xed="VBROADCASTI128_YMMqq_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="xmm, xmm" xed="VBROADCASTSS_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="ymm, xmm" xed="VBROADCASTSS_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="xmm, xmm" xed="VPBROADCASTW_XMMdq_XMMw"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="ymm, xmm" xed="VPBROADCASTW_YMMqq_XMMw"/>
+ <header>immintrin.h</header>
+</intrinsic>
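+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data, for the
+     broadcast family above; _mm_cvtsi32_si128 seeds the low lane of the source:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      __m256i r = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(42));
+      int32_t out[8];
+      _mm256_storeu_si256((__m256i *)out, r);
+      printf("%d %d\n", out[0], out[7]);  /* prints 42 42 */
+      return 0;
+  }
+-->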
+<intrinsic tech="AVX2" name="_mm256_cmpeq_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCMPEQB" form="ymm, ymm, ymm" xed="VPCMPEQB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cmpeq_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCMPEQW" form="ymm, ymm, ymm" xed="VPCMPEQW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cmpeq_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCMPEQD" form="ymm, ymm, ymm" xed="VPCMPEQD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cmpeq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCMPEQQ" form="ymm, ymm, ymm" xed="VPCMPEQQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cmpgt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCMPGTB" form="ymm, ymm, ymm" xed="VPCMPGTB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cmpgt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCMPGTW" form="ymm, ymm, ymm" xed="VPCMPGTW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cmpgt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCMPGTD" form="ymm, ymm, ymm" xed="VPCMPGTD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cmpgt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ( a[i+63:i] &gt; b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCMPGTQ" form="ymm, ymm, ymm" xed="VPCMPGTQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
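+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data, combining
+     the compares above with _mm256_blendv_epi8 (earlier in this file) to select a
+     per-lane signed maximum (illustrative only; _mm256_max_epi32 does this in one
+     instruction):
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      __m256i a = _mm256_setr_epi32(1, 9, 3, 7, 5, 5, 8, 0);
+      __m256i b = _mm256_setr_epi32(4, 2, 6, 7, 1, 9, 2, 3);
+      __m256i gt = _mm256_cmpgt_epi32(a, b);       /* all-ones where a > b */
+      __m256i mx = _mm256_blendv_epi8(b, a, gt);   /* pick a where gt, else b */
+      int32_t out[8];
+      _mm256_storeu_si256((__m256i *)out, mx);
+      for (int j = 0; j < 8; j++) printf("%d ", out[j]);  /* 4 9 6 7 5 9 8 3 */
+      printf("\n");
+      return 0;
+  }
+-->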
+<intrinsic tech="AVX2" name="_mm256_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 16*j
+ dst[i+31:i] := SignExtend32(a[k+15:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXWD" form="ymm, xmm" xed="VPMOVSXWD_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 16*j
+ dst[i+63:i] := SignExtend64(a[k+15:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXWQ" form="ymm, xmm" xed="VPMOVSXWQ_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := SignExtend64(a[k+31:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXDQ" form="ymm, xmm" xed="VPMOVSXDQ_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ l := j*16
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBW" form="ymm, xmm" xed="VPMOVSXBW_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 8*j
+ dst[i+31:i] := SignExtend32(a[k+7:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBD" form="ymm, xmm" xed="VPMOVSXBD_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 8*j
+ dst[i+63:i] := SignExtend64(a[k+7:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBQ" form="ymm, xmm" xed="VPMOVSXBQ_YMMqq_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 16*j
+ dst[i+31:i] := ZeroExtend32(a[k+15:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXWD" form="ymm, xmm" xed="VPMOVZXWD_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 16*j
+ dst[i+63:i] := ZeroExtend64(a[k+15:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXWQ" form="ymm, xmm" xed="VPMOVZXWQ_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := ZeroExtend64(a[k+31:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXDQ" form="ymm, xmm" xed="VPMOVZXDQ_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ l := j*16
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBW" form="ymm, xmm" xed="VPMOVZXBW_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 8*j
+ dst[i+31:i] := ZeroExtend32(a[k+7:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBD" form="ymm, xmm" xed="VPMOVZXBD_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 8*j
+ dst[i+63:i] := ZeroExtend64(a[k+7:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBQ" form="ymm, xmm" xed="VPMOVZXBQ_YMMqq_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
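+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data, for the
+     zero-extending conversions above; bytes >= 128 stay positive, unlike the
+     sign-extending cvtepi8 forms:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      uint8_t bytes[16] = {0, 1, 2, 253, 254, 255};     /* rest zero */
+      __m128i v = _mm_loadu_si128((const __m128i *)bytes);
+      __m256i w = _mm256_cvtepu8_epi32(v);  /* low 8 bytes widened to 32 bits */
+      int32_t out[8];
+      _mm256_storeu_si256((__m256i *)out, w);
+      printf("%d %d\n", out[3], out[5]);  /* prints 253 255 */
+      return 0;
+  }
+-->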
+<intrinsic tech="AVX2" name="_mm256_extracti128_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI128" form="xmm, ymm, imm8" xed="VEXTRACTI128_XMMdq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
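+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data, for
+     _mm256_extracti128_si256 above; imm8 = 1 selects the upper 128-bit lane:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      __m256i v = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+      __m128i hi = _mm256_extracti128_si256(v, 1);
+      int32_t out[4];
+      _mm_storeu_si128((__m128i *)out, hi);
+      printf("%d %d\n", out[0], out[3]);  /* prints 4 7 */
+      return 0;
+  }
+-->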
+<intrinsic tech="AVX2" name="_mm256_hadd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := a[31:16] + a[15:0]
+dst[31:16] := a[63:48] + a[47:32]
+dst[47:32] := a[95:80] + a[79:64]
+dst[63:48] := a[127:112] + a[111:96]
+dst[79:64] := b[31:16] + b[15:0]
+dst[95:80] := b[63:48] + b[47:32]
+dst[111:96] := b[95:80] + b[79:64]
+dst[127:112] := b[127:112] + b[111:96]
+dst[143:128] := a[159:144] + a[143:128]
+dst[159:144] := a[191:176] + a[175:160]
+dst[175:160] := a[223:208] + a[207:192]
+dst[191:176] := a[255:240] + a[239:224]
+dst[207:192] := b[159:144] + b[143:128]
+dst[223:208] := b[191:176] + b[175:160]
+dst[239:224] := b[223:208] + b[207:192]
+dst[255:240] := b[255:240] + b[239:224]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPHADDW" form="ymm, ymm, ymm" xed="VPHADDW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_hadd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := a[127:96] + a[95:64]
+dst[95:64] := b[63:32] + b[31:0]
+dst[127:96] := b[127:96] + b[95:64]
+dst[159:128] := a[191:160] + a[159:128]
+dst[191:160] := a[255:224] + a[223:192]
+dst[223:192] := b[191:160] + b[159:128]
+dst[255:224] := b[255:224] + b[223:192]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPHADDD" form="ymm, ymm, ymm" xed="VPHADDD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_hadds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[31:16] + a[15:0])
+dst[31:16] := Saturate16(a[63:48] + a[47:32])
+dst[47:32] := Saturate16(a[95:80] + a[79:64])
+dst[63:48] := Saturate16(a[127:112] + a[111:96])
+dst[79:64] := Saturate16(b[31:16] + b[15:0])
+dst[95:80] := Saturate16(b[63:48] + b[47:32])
+dst[111:96] := Saturate16(b[95:80] + b[79:64])
+dst[127:112] := Saturate16(b[127:112] + b[111:96])
+dst[143:128] := Saturate16(a[159:144] + a[143:128])
+dst[159:144] := Saturate16(a[191:176] + a[175:160])
+dst[175:160] := Saturate16(a[223:208] + a[207:192])
+dst[191:176] := Saturate16(a[255:240] + a[239:224])
+dst[207:192] := Saturate16(b[159:144] + b[143:128])
+dst[223:208] := Saturate16(b[191:176] + b[175:160])
+dst[239:224] := Saturate16(b[223:208] + b[207:192])
+dst[255:240] := Saturate16(b[255:240] + b[239:224])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPHADDSW" form="ymm, ymm, ymm" xed="VPHADDSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_hsub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := a[15:0] - a[31:16]
+dst[31:16] := a[47:32] - a[63:48]
+dst[47:32] := a[79:64] - a[95:80]
+dst[63:48] := a[111:96] - a[127:112]
+dst[79:64] := b[15:0] - b[31:16]
+dst[95:80] := b[47:32] - b[63:48]
+dst[111:96] := b[79:64] - b[95:80]
+dst[127:112] := b[111:96] - b[127:112]
+dst[143:128] := a[143:128] - a[159:144]
+dst[159:144] := a[175:160] - a[191:176]
+dst[175:160] := a[207:192] - a[223:208]
+dst[191:176] := a[239:224] - a[255:240]
+dst[207:192] := b[143:128] - b[159:144]
+dst[223:208] := b[175:160] - b[191:176]
+dst[239:224] := b[207:192] - b[223:208]
+dst[255:240] := b[239:224] - b[255:240]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPHSUBW" form="ymm, ymm, ymm" xed="VPHSUBW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_hsub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := a[95:64] - a[127:96]
+dst[95:64] := b[31:0] - b[63:32]
+dst[127:96] := b[95:64] - b[127:96]
+dst[159:128] := a[159:128] - a[191:160]
+dst[191:160] := a[223:192] - a[255:224]
+dst[223:192] := b[159:128] - b[191:160]
+dst[255:224] := b[223:192] - b[255:224]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPHSUBD" form="ymm, ymm, ymm" xed="VPHSUBD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_hsubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[15:0] - a[31:16])
+dst[31:16] := Saturate16(a[47:32] - a[63:48])
+dst[47:32] := Saturate16(a[79:64] - a[95:80])
+dst[63:48] := Saturate16(a[111:96] - a[127:112])
+dst[79:64] := Saturate16(b[15:0] - b[31:16])
+dst[95:80] := Saturate16(b[47:32] - b[63:48])
+dst[111:96] := Saturate16(b[79:64] - b[95:80])
+dst[127:112] := Saturate16(b[111:96] - b[127:112])
+dst[143:128] := Saturate16(a[143:128] - a[159:144])
+dst[159:144] := Saturate16(a[175:160] - a[191:176])
+dst[175:160] := Saturate16(a[207:192] - a[223:208])
+dst[191:176] := Saturate16(a[239:224] - a[255:240])
+dst[207:192] := Saturate16(b[143:128] - b[159:144])
+dst[223:208] := Saturate16(b[175:160] - b[191:176])
+dst[239:224] := Saturate16(b[207:192] - b[223:208])
+dst[255:240] := Saturate16(b[239:224] - b[255:240])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPHSUBSW" form="ymm, ymm, ymm" xed="VPHSUBSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
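+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data. The
+     horizontal ops above interleave "a" and "b" within each 128-bit lane rather
+     than across the full register, which the output below makes visible:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void) {
+      __m256i a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+      __m256i b = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+      __m256i r = _mm256_hadd_epi32(a, b);
+      int32_t out[8];
+      _mm256_storeu_si256((__m256i *)out, r);
+      for (int j = 0; j < 8; j++) printf("%d ", out[j]);
+      printf("\n");  /* prints 3 7 30 70 11 15 110 150 */
+      return 0;
+  }
+-->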
+<intrinsic tech="AVX2" name="_mm_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="xmm, vm32x, xmm" xed="VGATHERDPD_XMMf64_MEMf64_XMMi64_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="ymm, vm32x, ymm" xed="VGATHERDPD_YMMf64_MEMf64_YMMi64_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="xmm, vm32x, xmm" xed="VGATHERDPS_XMMf32_MEMf32_XMMi32_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="ymm, vm32x, ymm" xed="VGATHERDPS_YMMf32_MEMf32_YMMi32_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_i32gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="int const*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="xmm, vm32x, xmm" xed="VPGATHERDD_XMMu32_MEMd_XMMi32_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_i32gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="int const*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="ymm, vm32x, ymm" xed="VPGATHERDD_YMMu32_MEMd_YMMi32_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_i32gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__int64 const*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="xmm, vm32x, xmm" xed="VPGATHERDQ_XMMu64_MEMq_XMMi64_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_i32gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__int64 const*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="ymm, vm32x, ymm" xed="VPGATHERDQ_YMMu64_MEMq_YMMi64_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
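+<!-- Editor's note: a hypothetical C sketch, not part of the Intel data, for the
+     32-bit-index gathers above. "scale" is an immediate; 4 = sizeof(float), so
+     lane j loads table[vindex[j]]:
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      float table[16];
+      for (int j = 0; j < 16; j++) table[j] = (float)(j * 10);
+      __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+      __m256 g = _mm256_i32gather_ps(table, idx, 4);
+      float out[8];
+      _mm256_storeu_ps(out, g);
+      printf("%.0f %.0f\n", out[1], out[7]);  /* prints 20 140 */
+      return 0;
+  }
+-->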
+<intrinsic tech="AVX2" name="_mm_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERQPD" form="xmm, vm64x, xmm" xed="VGATHERQPD_XMMf64_MEMf64_XMMi64_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERQPD" form="ymm, vm64x, ymm" xed="VGATHERQPD_YMMf64_MEMf64_YMMi64_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VGATHERQPS" form="xmm, vm64x, xmm" xed="VGATHERQPS_XMMf32_MEMf32_XMMi32_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERQPS" form="xmm, vm64y, xmm" xed="VGATHERQPS_XMMf32_MEMf32_XMMi32_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_i64gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="int const*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPGATHERQD" form="xmm, vm64x, xmm" xed="VPGATHERQD_XMMu32_MEMd_XMMi32_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_i64gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="int const*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERQD" form="xmm, vm64y, xmm" xed="VPGATHERQD_XMMu32_MEMd_XMMi32_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_i64gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__int64 const*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERQQ" form="xmm, vm64x, xmm" xed="VPGATHERQQ_XMMu64_MEMq_XMMi64_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_i64gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__int64 const*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERQQ" form="ymm, vm64x, ymm" xed="VPGATHERQQ_YMMu64_MEMq_YMMi64_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
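+<!--
+A minimal C usage sketch for the unmasked gathers above (illustrative; "table"
+and "idx" are assumed caller-provided buffers). The indices address doubles, so
+scale is sizeof(double) == 8, and scale must be a compile-time constant:
+
+#include <immintrin.h>
+#include <stdint.h>
+
+__m256d gather4(const double *table, const int64_t idx[4]) {
+    __m256i vidx = _mm256_loadu_si256((const __m256i *)idx); // four 64-bit indices
+    return _mm256_i64gather_pd(table, vidx, 8);              // table[idx[j]] per lane
+}
+-->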
+<intrinsic tech="AVX2" name="_mm256_inserti128_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTI128" form="ymm, ymm, xmm, imm8" xed="VINSERTI128_YMMqq_YMMqq_XMMdq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
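+<!--
+A sketch of the common way to build a 256-bit vector from two 128-bit halves
+with _mm256_inserti128_si256; the cast leaves the upper half undefined, which
+the insert then overwrites:
+
+#include <immintrin.h>
+
+__m256i join128(__m128i lo, __m128i hi) {
+    return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+-->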
+<intrinsic tech="AVX2" name="_mm256_madd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADDWD" form="ymm, ymm, ymm" xed="VPMADDWD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADDUBSW" form="ymm, ymm, ymm" xed="VPMADDUBSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
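+<!--
+A sketch of the classic use of VPMADDWD: a 16-bit integer dot product. It
+assumes, for brevity, that n is a multiple of 16 and that the partial sums fit
+a 32-bit accumulator:
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <stddef.h>
+
+int32_t dot_i16(const int16_t *a, const int16_t *b, size_t n) {
+    __m256i acc = _mm256_setzero_si256();
+    for (size_t i = 0; i + 16 <= n; i += 16) {
+        __m256i va = _mm256_loadu_si256((const __m256i *)(a + i));
+        __m256i vb = _mm256_loadu_si256((const __m256i *)(b + i));
+        // madd: 16 products widened to 32 bits, adjacent pairs summed into 8 lanes
+        acc = _mm256_add_epi32(acc, _mm256_madd_epi16(va, vb));
+    }
+    // horizontal sum of the eight 32-bit lanes
+    __m128i s = _mm_add_epi32(_mm256_castsi256_si128(acc),
+                              _mm256_extracti128_si256(acc, 1));
+    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0x4E)); // swap 64-bit halves
+    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0xB1)); // swap 32-bit pairs
+    return _mm_cvtsi128_si32(s);
+}
+-->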
+<intrinsic tech="AVX2" name="_mm_mask_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="double const*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128d" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ IF mask[i+63]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="xmm, vm32x, xmm" xed="VGATHERDPD_XMMf64_MEMf64_XMMi64_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mask_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="double const*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256d" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ IF mask[i+63]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="ymm, vm32x, ymm" xed="VGATHERDPD_YMMf64_MEMf64_YMMi64_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_mask_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="float const*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ IF mask[i+31]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="xmm, vm32x, xmm" xed="VGATHERDPS_XMMf32_MEMf32_XMMi32_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mask_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="float const*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ IF mask[i+31]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="ymm, vm32x, ymm" xed="VGATHERDPS_YMMf32_MEMf32_YMMi32_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_mask_i32gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="int const*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ IF mask[i+31]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="xmm, vm32x, xmm" xed="VPGATHERDD_XMMu32_MEMd_XMMi32_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mask_i32gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="int const*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ IF mask[i+31]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="ymm, vm32x, ymm" xed="VPGATHERDD_YMMu32_MEMd_YMMi32_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_mask_i32gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__int64 const*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ IF mask[i+63]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="xmm, vm32x, xmm" xed="VPGATHERDQ_XMMu64_MEMq_XMMi64_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mask_i32gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__int64 const*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ IF mask[i+63]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="ymm, vm32x, ymm" xed="VPGATHERDQ_YMMu64_MEMq_YMMi64_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_mask_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="double const*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128d" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ IF mask[i+63]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERQPD" form="xmm, vm64x, xmm" xed="VGATHERQPD_XMMf64_MEMf64_XMMi64_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mask_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="double const*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256d" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ IF mask[i+63]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERQPD" form="ymm, vm64x, ymm" xed="VGATHERQPD_YMMf64_MEMf64_YMMi64_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_mask_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="float const*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ IF mask[i+31]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+mask[MAX:64] := 0
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VGATHERQPS" form="xmm, vm64x, xmm" xed="VGATHERQPS_XMMf32_MEMf32_XMMi32_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mask_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="float const*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ IF mask[i+31]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERQPS" form="xmm, vm64y, xmm" xed="VGATHERQPS_XMMf32_MEMf32_XMMi32_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_mask_i64gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="int const*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ IF mask[i+31]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+mask[MAX:64] := 0
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPGATHERQD" form="xmm, vm64x, xmm" xed="VPGATHERQD_XMMu32_MEMd_XMMi32_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mask_i64gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="int const*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ IF mask[i+31]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERQD" form="xmm, vm64y, xmm" xed="VPGATHERQD_XMMu32_MEMd_XMMi32_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_mask_i64gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__int64 const*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ IF mask[i+63]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERQQ" form="xmm, vm64x, xmm" xed="VPGATHERQQ_XMMu64_MEMq_XMMi64_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mask_i64gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__int64 const*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ IF mask[i+63]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERQQ" form="ymm, vm64x, ymm" xed="VPGATHERQQ_YMMu64_MEMq_YMMi64_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
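+<!--
+A sketch of a masked gather: lanes whose mask element has its high bit clear
+keep the value from "src" and do not touch memory, which is the usual way to
+make the tail of an indexed loop safe ("fallback", "base" and "vindex" are
+illustrative names):
+
+#include <immintrin.h>
+
+__m256i gather_where(__m256i fallback, const int *base,
+                     __m256i vindex, __m256i mask) {
+    // scale 4: the 32-bit indices count in int units
+    return _mm256_mask_i32gather_epi32(fallback, base, vindex, mask, 4);
+}
+-->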
+<intrinsic tech="AVX2" name="_mm_maskload_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="int const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <description>Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF mask[i+31]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMASKMOVD" form="xmm, xmm, m128" xed="VPMASKMOVD_XMMdq_XMMdq_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_maskload_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="int const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <description>Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF mask[i+31]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMASKMOVD" form="ymm, ymm, m256" xed="VPMASKMOVD_YMMqq_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_maskload_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__int64 const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <description>Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF mask[i+63]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMASKMOVQ" form="xmm, xmm, m128" xed="VPMASKMOVQ_XMMdq_XMMdq_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_maskload_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__int64 const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <description>Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF mask[i+63]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMASKMOVQ" form="ymm, ymm, m256" xed="VPMASKMOVQ_YMMqq_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_maskstore_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="int*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF mask[i+31]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMASKMOVD" form="m128, xmm, xmm" xed="VPMASKMOVD_MEMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_maskstore_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="int*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF mask[i+31]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMASKMOVD" form="m256, ymm, ymm" xed="VPMASKMOVD_MEMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_maskstore_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__int64*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <parameter type="__m128i" varname="mask" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF mask[i+63]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMASKMOVQ" form="m128, xmm, xmm" xed="VPMASKMOVQ_MEMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_maskstore_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__int64*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <parameter type="__m256i" varname="mask" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF mask[i+63]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMASKMOVQ" form="m256, ymm, ymm" xed="VPMASKMOVQ_MEMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
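+<!--
+A sketch combining the masked loads above with the masked stores: copying the
+final n < 8 elements of an int array without reading or writing past the end.
+Lanes whose index is below n compare to all-ones, so their high bit is set:
+
+#include <immintrin.h>
+
+void copy_tail(int *dst, const int *src, int n) { /* assumes 0 <= n < 8 */
+    const __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), lane);
+    __m256i v = _mm256_maskload_epi32(src, mask);  // masked-off lanes read as 0
+    _mm256_maskstore_epi32(dst, mask, v);          // masked-off lanes untouched
+}
+-->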
+<intrinsic tech="AVX2" name="_mm256_max_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSB" form="ymm, ymm, ymm" xed="VPMAXSB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_max_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSW" form="ymm, ymm, ymm" xed="VPMAXSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSD" form="ymm, ymm, ymm" xed="VPMAXSD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_max_epu8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUB" form="ymm, ymm, ymm" xed="VPMAXUB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_max_epu16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUW" form="ymm, ymm, ymm" xed="VPMAXUW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUD" form="ymm, ymm, ymm" xed="VPMAXUD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_min_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSB" form="ymm, ymm, ymm" xed="VPMINSB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_min_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSW" form="ymm, ymm, ymm" xed="VPMINSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSD" form="ymm, ymm, ymm" xed="VPMINSD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_min_epu8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUB" form="ymm, ymm, ymm" xed="VPMINUB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_min_epu16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUW" form="ymm, ymm, ymm" xed="VPMINUW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUD" form="ymm, ymm, ymm" xed="VPMINUD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
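+<!--
+A sketch of the idiomatic min/max pairing: clamping unsigned bytes to a range
+(the bounds "lo" and "hi" are illustrative, broadcast by the caller):
+
+#include <immintrin.h>
+
+__m256i clamp_u8(__m256i v, __m256i lo, __m256i hi) {
+    return _mm256_min_epu8(_mm256_max_epu8(v, lo), hi);
+}
+-->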
+<intrinsic tech="AVX2" name="_mm256_movemask_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Create a mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[j] := a[i+7]
+ENDFOR
+ </operation>
+ <instruction name="VPMOVMSKB" form="r32, ymm" xed="VPMOVMSKB_GPR32d_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
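+<!--
+A sketch of the compare-plus-VPMOVMSKB idiom: locating the first occurrence of
+a byte in a 32-byte block. It assumes at least 32 readable bytes at "p", and
+__builtin_ctz is a GCC/Clang builtin (a toolchain assumption):
+
+#include <immintrin.h>
+#include <stdint.h>
+
+int find_byte32(const uint8_t *p, uint8_t needle) {
+    __m256i eq = _mm256_cmpeq_epi8(_mm256_loadu_si256((const __m256i *)p),
+                                   _mm256_set1_epi8((char)needle));
+    int m = _mm256_movemask_epi8(eq);              // bit j = MSB of byte j
+    return m ? __builtin_ctz((unsigned)m) : -1;    // first match, or -1
+}
+-->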
+<intrinsic tech="AVX2" name="_mm256_mpsadbw_epu8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+ Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at on the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8".</description>
+ <operation>
+DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
+ a_offset := imm8[2]*32
+ b_offset := imm8[1:0]*32
+ FOR j := 0 to 7
+ i := j*8
+ k := a_offset+i
+ l := b_offset
+ tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \
+ ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24]))
+ ENDFOR
+ RETURN tmp[127:0]
+}
+dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
+dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMPSADBW" form="ymm, ymm, ymm, imm8" xed="VMPSADBW_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
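+<!--
+A minimal sketch for VMPSADBW: with imm8 = 0, each 128-bit lane compares its
+eight sliding 4-byte windows of "a" against the 4-byte group at offset 0 of
+"b", yielding eight 16-bit SADs per lane (the building block of block-matching
+searches):
+
+#include <immintrin.h>
+
+__m256i sad8x2(__m256i a, __m256i b) {
+    return _mm256_mpsadbw_epu8(a, b, 0);
+}
+-->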
+<intrinsic tech="AVX2" name="_mm256_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULDQ" form="ymm, ymm, ymm" xed="VPMULDQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mul_epu32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULUDQ" form="ymm, ymm, ymm" xed="VPMULUDQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
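+<!--
+VPMULUDQ only multiplies the low 32 bits of each 64-bit element, i.e. the
+even 32-bit lanes, so a full 8-lane 32x32 -> 64-bit multiply is conventionally
+split in two; a sketch:
+
+#include <immintrin.h>
+
+void widen_mul_u32(__m256i a, __m256i b, __m256i *even, __m256i *odd) {
+    *even = _mm256_mul_epu32(a, b);                     // lanes 0,2,4,6
+    *odd  = _mm256_mul_epu32(_mm256_srli_epi64(a, 32),  // lanes 1,3,5,7
+                             _mm256_srli_epi64(b, 32));
+}
+-->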
+<intrinsic tech="AVX2" name="_mm256_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHW" form="ymm, ymm, ymm" xed="VPMULHW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHUW" form="ymm, ymm, ymm" xed="VPMULHUW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHRSW" form="ymm, ymm, ymm" xed="VPMULHRSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
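+<!--
+VPMULHRSW is the standard one-instruction Q15 fixed-point multiply: per the
+operation above it computes round((a*b) >> 15) for each 16-bit lane. A sketch,
+with both operands assumed to already be in Q15 form:
+
+#include <immintrin.h>
+
+__m256i q15_mul(__m256i a, __m256i b) {
+    return _mm256_mulhrs_epi16(a, b);  // round((a*b) >> 15) per lane
+}
+-->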
+<intrinsic tech="AVX2" name="_mm256_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLW" form="ymm, ymm, ymm" xed="VPMULLW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Multiply the packed signed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLD" form="ymm, ymm, ymm" xed="VPMULLD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_or_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <description>Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[255:0] := (a[255:0] OR b[255:0])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOR" form="ymm, ymm, ymm" xed="VPOR_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_packs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="SI8"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := Saturate8(a[15:0])
+dst[15:8] := Saturate8(a[31:16])
+dst[23:16] := Saturate8(a[47:32])
+dst[31:24] := Saturate8(a[63:48])
+dst[39:32] := Saturate8(a[79:64])
+dst[47:40] := Saturate8(a[95:80])
+dst[55:48] := Saturate8(a[111:96])
+dst[63:56] := Saturate8(a[127:112])
+dst[71:64] := Saturate8(b[15:0])
+dst[79:72] := Saturate8(b[31:16])
+dst[87:80] := Saturate8(b[47:32])
+dst[95:88] := Saturate8(b[63:48])
+dst[103:96] := Saturate8(b[79:64])
+dst[111:104] := Saturate8(b[95:80])
+dst[119:112] := Saturate8(b[111:96])
+dst[127:120] := Saturate8(b[127:112])
+dst[135:128] := Saturate8(a[143:128])
+dst[143:136] := Saturate8(a[159:144])
+dst[151:144] := Saturate8(a[175:160])
+dst[159:152] := Saturate8(a[191:176])
+dst[167:160] := Saturate8(a[207:192])
+dst[175:168] := Saturate8(a[223:208])
+dst[183:176] := Saturate8(a[239:224])
+dst[191:184] := Saturate8(a[255:240])
+dst[199:192] := Saturate8(b[143:128])
+dst[207:200] := Saturate8(b[159:144])
+dst[215:208] := Saturate8(b[175:160])
+dst[223:216] := Saturate8(b[191:176])
+dst[231:224] := Saturate8(b[207:192])
+dst[239:232] := Saturate8(b[223:208])
+dst[247:240] := Saturate8(b[239:224])
+dst[255:248] := Saturate8(b[255:240])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKSSWB" form="ymm, ymm, ymm" xed="VPACKSSWB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_packs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[31:0])
+dst[31:16] := Saturate16(a[63:32])
+dst[47:32] := Saturate16(a[95:64])
+dst[63:48] := Saturate16(a[127:96])
+dst[79:64] := Saturate16(b[31:0])
+dst[95:80] := Saturate16(b[63:32])
+dst[111:96] := Saturate16(b[95:64])
+dst[127:112] := Saturate16(b[127:96])
+dst[143:128] := Saturate16(a[159:128])
+dst[159:144] := Saturate16(a[191:160])
+dst[175:160] := Saturate16(a[223:192])
+dst[191:176] := Saturate16(a[255:224])
+dst[207:192] := Saturate16(b[159:128])
+dst[223:208] := Saturate16(b[191:160])
+dst[239:224] := Saturate16(b[223:192])
+dst[255:240] := Saturate16(b[255:224])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKSSDW" form="ymm, ymm, ymm" xed="VPACKSSDW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_packus_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := SaturateU8(a[15:0])
+dst[15:8] := SaturateU8(a[31:16])
+dst[23:16] := SaturateU8(a[47:32])
+dst[31:24] := SaturateU8(a[63:48])
+dst[39:32] := SaturateU8(a[79:64])
+dst[47:40] := SaturateU8(a[95:80])
+dst[55:48] := SaturateU8(a[111:96])
+dst[63:56] := SaturateU8(a[127:112])
+dst[71:64] := SaturateU8(b[15:0])
+dst[79:72] := SaturateU8(b[31:16])
+dst[87:80] := SaturateU8(b[47:32])
+dst[95:88] := SaturateU8(b[63:48])
+dst[103:96] := SaturateU8(b[79:64])
+dst[111:104] := SaturateU8(b[95:80])
+dst[119:112] := SaturateU8(b[111:96])
+dst[127:120] := SaturateU8(b[127:112])
+dst[135:128] := SaturateU8(a[143:128])
+dst[143:136] := SaturateU8(a[159:144])
+dst[151:144] := SaturateU8(a[175:160])
+dst[159:152] := SaturateU8(a[191:176])
+dst[167:160] := SaturateU8(a[207:192])
+dst[175:168] := SaturateU8(a[223:208])
+dst[183:176] := SaturateU8(a[239:224])
+dst[191:184] := SaturateU8(a[255:240])
+dst[199:192] := SaturateU8(b[143:128])
+dst[207:200] := SaturateU8(b[159:144])
+dst[215:208] := SaturateU8(b[175:160])
+dst[223:216] := SaturateU8(b[191:176])
+dst[231:224] := SaturateU8(b[207:192])
+dst[239:232] := SaturateU8(b[223:208])
+dst[247:240] := SaturateU8(b[239:224])
+dst[255:248] := SaturateU8(b[255:240])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKUSWB" form="ymm, ymm, ymm" xed="VPACKUSWB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_packus_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst".</description>
+ <operation>
+dst[15:0] := SaturateU16(a[31:0])
+dst[31:16] := SaturateU16(a[63:32])
+dst[47:32] := SaturateU16(a[95:64])
+dst[63:48] := SaturateU16(a[127:96])
+dst[79:64] := SaturateU16(b[31:0])
+dst[95:80] := SaturateU16(b[63:32])
+dst[111:96] := SaturateU16(b[95:64])
+dst[127:112] := SaturateU16(b[127:96])
+dst[143:128] := SaturateU16(a[159:128])
+dst[159:144] := SaturateU16(a[191:160])
+dst[175:160] := SaturateU16(a[223:192])
+dst[191:176] := SaturateU16(a[255:224])
+dst[207:192] := SaturateU16(b[159:128])
+dst[223:208] := SaturateU16(b[191:160])
+dst[239:224] := SaturateU16(b[223:192])
+dst[255:240] := SaturateU16(b[255:224])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKUSDW" form="ymm, ymm, ymm" xed="VPACKUSDW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_permute2x128_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src1, src2, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src1[127:0]
+ 1: tmp[127:0] := src1[255:128]
+ 2: tmp[127:0] := src2[127:0]
+ 3: tmp[127:0] := src2[255:128]
+ ESAC
+ IF control[3]
+ tmp[127:0] := 0
+ FI
+ RETURN tmp[127:0]
+}
+dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
+dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERM2I128" form="ymm, ymm, ymm, imm8" xed="VPERM2I128_YMMqq_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
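+<!-- A short C sketch (ours; assumes AVX2): in _mm256_permute2x128_si256 the two
+     imm8 nibbles each select a whole 128-bit lane, and setting bit 3 of a nibble
+     zeroes that lane instead.
+
+#include <immintrin.h>
+
+void permute2x128_demo(long long out[4], long long outz[4]) {
+    __m256i a = _mm256_setr_epi64x(0, 1, 2, 3);        /* lanes {0,1} {2,3} */
+    __m256i b = _mm256_setr_epi64x(4, 5, 6, 7);        /* lanes {4,5} {6,7} */
+    __m256i r = _mm256_permute2x128_si256(a, b, 0x21); /* {2,3,4,5} */
+    __m256i z = _mm256_permute2x128_si256(a, b, 0x81); /* {2,3,0,0}, high lane zeroed */
+    _mm256_storeu_si256((__m256i *)out, r);
+    _mm256_storeu_si256((__m256i *)outz, z);
+}
+-->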
+<intrinsic tech="AVX2" name="_mm256_permute4x64_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMQ" form="ymm, ymm, imm8" xed="VPERMQ_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_permute4x64_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPD" form="ymm, ymm, imm8" xed="VPERMPD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_permutevar8x32_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ id := idx[i+2:i]*32
+ dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMD" form="ymm, ymm, ymm" xed="VPERMD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
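+<!-- A C sketch (ours; assumes AVX2): unlike VPSHUFD, VPERMD takes its indices
+     from a vector and crosses the 128-bit lane boundary, so reversing all eight
+     dwords is a single instruction.
+
+#include <immintrin.h>
+
+__m256i reverse8_epi32(__m256i a) {
+    const __m256i idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm256_permutevar8x32_epi32(a, idx);
+}
+-->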
+<intrinsic tech="AVX2" name="_mm256_permutevar8x32_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ id := idx[i+2:i]*32
+ dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPS" form="ymm, ymm, ymm" xed="VPERMPS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+FOR j := 0 to 3
+ i := j*64
+ dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \
+ tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
+ dst[i+63:i+16] := 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSADBW" form="ymm, ymm, ymm" xed="VPSADBW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
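+<!-- A C sketch (ours; assumes AVX2) of _mm256_sad_epu8: each 64-bit element
+     receives the sum of eight absolute byte differences in its low 16 bits,
+     the building block of block-matching (SAD) loops.
+
+#include <immintrin.h>
+
+void sad_demo(long long out[4]) {
+    __m256i a = _mm256_set1_epi8(9);
+    __m256i b = _mm256_set1_epi8(2);
+    __m256i s = _mm256_sad_epu8(a, b);       /* 8 * |9 - 2| = 56 per qword */
+    _mm256_storeu_si256((__m256i *)out, s);  /* out = {56, 56, 56, 56} */
+}
+-->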
+<intrinsic tech="AVX2" name="_mm256_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFD" form="ymm, ymm, imm8" xed="VPSHUFD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[3:0] := b[i+3:i]
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ IF b[128+i+7] == 1
+ dst[128+i+7:128+i] := 0
+ ELSE
+ index[3:0] := b[128+i+3:128+i]
+ dst[128+i+7:128+i] := a[128+index*8+7:128+index*8]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFB" form="ymm, ymm, ymm" xed="VPSHUFB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
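+<!-- A C sketch (ours; assumes AVX2): VPSHUFB is a per-lane byte table lookup.
+     Each index addresses only its own 128-bit lane of "a", and a control byte
+     with bit 7 set writes zero, a common way to mask bytes out.
+
+#include <immintrin.h>
+
+void pshufb_demo(__m256i *pick, __m256i *zeros) {
+    __m256i a = _mm256_setr_epi8(
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+    *pick = _mm256_shuffle_epi8(a, _mm256_set1_epi8(1));
+    /* lane 0 bytes all become a[1] = 1; lane 1 bytes all become a[17] = 17 */
+    *zeros = _mm256_shuffle_epi8(a, _mm256_set1_epi8((char)0x80));
+    /* bit 7 set in every control byte: result is all zeros */
+}
+-->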
+<intrinsic tech="AVX2" name="_mm256_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+dst[191:128] := a[191:128]
+dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFHW" form="ymm, ymm, imm8" xed="VPSHUFHW_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
+ <operation>
+dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+dst[127:64] := a[127:64]
+dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+dst[255:192] := a[255:192]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFLW" form="ymm, ymm, imm8" xed="VPSHUFLW_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sign_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+	<description>Negate packed signed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF b[i+7:i] &lt; 0
+ dst[i+7:i] := -(a[i+7:i])
+ ELSE IF b[i+7:i] == 0
+ dst[i+7:i] := 0
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSIGNB" form="ymm, ymm, ymm" xed="VPSIGNB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sign_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+	<description>Negate packed signed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF b[i+15:i] &lt; 0
+ dst[i+15:i] := -(a[i+15:i])
+ ELSE IF b[i+15:i] == 0
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSIGNW" form="ymm, ymm, ymm" xed="VPSIGNW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sign_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+	<description>Negate packed signed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF b[i+31:i] &lt; 0
+ dst[i+31:i] := -(a[i+31:i])
+ ELSE IF b[i+31:i] == 0
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSIGND" form="ymm, ymm, ymm" xed="VPSIGND_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
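+<!-- A C sketch (ours; assumes AVX2): the VPSIGN family applies the sign of "b"
+     to "a" elementwise, with zero in "b" forcing zero, so it conditionally
+     negates without a branch.
+
+#include <immintrin.h>
+
+void sign_demo(int out[8]) {
+    __m256i a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    __m256i b = _mm256_setr_epi32(-1, 0, 1, -5, 0, 9, -9, 2);
+    _mm256_storeu_si256((__m256i *)out, _mm256_sign_epi32(a, b));
+    /* out = {-1, 0, 3, -4, 0, 6, -7, 8} */
+}
+-->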
+<intrinsic tech="AVX2" name="_mm256_slli_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+dst[255:128] := a[255:128] &lt;&lt; (tmp*8)
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLDQ" form="ymm, ymm, imm8" xed="VPSLLDQ_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_bslli_epi128">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+dst[255:128] := a[255:128] &lt;&lt; (tmp*8)
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLDQ" form="ymm, ymm, imm8" xed="VPSLLDQ_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
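+<!-- A C sketch (ours; assumes AVX2): _mm256_bslli_epi128 and _mm256_slli_si256
+     are the same VPSLLDQ operation, and the byte shift stays within each
+     128-bit lane, so nothing carries from the low lane into the high lane.
+
+#include <immintrin.h>
+
+void byteshift_demo(unsigned char out[32]) {
+    __m256i a = _mm256_setr_epi8(
+        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+    _mm256_storeu_si256((__m256i *)out, _mm256_bslli_epi128(a, 4));
+    /* out = {0,0,0,0,1..12, 0,0,0,0,17..28}: each lane shifted independently */
+}
+-->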
+<intrinsic tech="AVX2" name="_mm256_sll_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLW" form="ymm, ymm, xmm" xed="VPSLLW_YMMqq_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_slli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLW" form="ymm, ymm, imm8" xed="VPSLLW_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sll_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLD" form="ymm, ymm, xmm" xed="VPSLLD_YMMqq_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_slli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLD" form="ymm, ymm, imm8" xed="VPSLLD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sll_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="ymm, ymm, xmm" xed="VPSLLQ_YMMqq_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_slli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="ymm, ymm, imm8" xed="VPSLLQ_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="xmm, xmm, xmm" xed="VPSLLVD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="ymm, ymm, ymm" xed="VPSLLVD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="xmm, xmm, xmm" xed="VPSLLVQ_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="ymm, ymm, ymm" xed="VPSLLVQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
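+<!-- A C sketch (ours; assumes AVX2) of the variable-shift semantics: counts are
+     taken per element, and a count at or above the element width yields 0, in
+     contrast to an oversized scalar C shift, which is undefined behavior.
+
+#include <immintrin.h>
+
+void sllv_demo(long long out[4]) {
+    __m256i a   = _mm256_set1_epi64x(1);
+    __m256i cnt = _mm256_setr_epi64x(0, 1, 63, 64);
+    _mm256_storeu_si256((__m256i *)out, _mm256_sllv_epi64(a, cnt));
+    /* out = {1, 2, top bit only, 0}: count 64 gives 0 rather than wrapping */
+}
+-->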
+<intrinsic tech="AVX2" name="_mm256_sra_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAW" form="ymm, ymm, xmm" xed="VPSRAW_YMMqq_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srai_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAW" form="ymm, ymm, imm8" xed="VPSRAW_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sra_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAD" form="ymm, ymm, xmm" xed="VPSRAD_YMMqq_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srai_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAD" form="ymm, ymm, imm8" xed="VPSRAD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="xmm, xmm, xmm" xed="VPSRAVD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="ymm, ymm, ymm" xed="VPSRAVD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
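+<!-- A C sketch (ours; assumes AVX2) contrasting arithmetic and logical right
+     shifts: VPSRAVD replicates the sign bit, and counts of 32 or more fill the
+     element with copies of it, whereas VPSRLVD (below) shifts in zeros.
+
+#include <immintrin.h>
+
+void srav_demo(int out[8]) {
+    __m256i a   = _mm256_set1_epi32(-16);
+    __m256i cnt = _mm256_setr_epi32(0, 1, 2, 3, 4, 31, 32, 99);
+    _mm256_storeu_si256((__m256i *)out, _mm256_srav_epi32(a, cnt));
+    /* out = {-16, -8, -4, -2, -1, -1, -1, -1} */
+}
+-->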
+<intrinsic tech="AVX2" name="_mm256_srli_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+dst[255:128] := a[255:128] &gt;&gt; (tmp*8)
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLDQ" form="ymm, ymm, imm8" xed="VPSRLDQ_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_bsrli_epi128">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+dst[255:128] := a[255:128] &gt;&gt; (tmp*8)
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLDQ" form="ymm, ymm, imm8" xed="VPSRLDQ_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srl_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLW" form="ymm, ymm, xmm" xed="VPSRLW_YMMqq_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLW" form="ymm, ymm, imm8" xed="VPSRLW_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srl_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLD" form="ymm, ymm, xmm" xed="VPSRLD_YMMqq_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLD" form="ymm, ymm, imm8" xed="VPSRLD_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srl_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="ymm, ymm, xmm" xed="VPSRLQ_YMMqq_YMMqq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="ymm, ymm, imm8" xed="VPSRLQ_YMMqq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="xmm, xmm, xmm" xed="VPSRLVD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="ymm, ymm, ymm" xed="VPSRLVD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="xmm, xmm, xmm" xed="VPSRLVQ_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="ymm, ymm, ymm" xed="VPSRLVQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_stream_load_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i const*" varname="mem_addr" etype="M256" memwidth="256"/>
+	<description>Load 256 bits of integer data from memory into "dst" using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVNTDQA" form="ymm, m256" xed="VMOVNTDQA_YMMqq_MEMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
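+<!-- A C sketch (ours; assumes AVX2 and C11 alignas): VMOVNTDQA requires a
+     32-byte-aligned address, so an unaligned pointer may fault. The
+     non-temporal hint chiefly matters on write-combining memory; on ordinary
+     write-back memory this behaves like a normal aligned load.
+
+#include <immintrin.h>
+#include <stdalign.h>
+
+__m256i stream_load_demo(void) {
+    alignas(32) static const int buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+    return _mm256_stream_load_si256((const __m256i *)buf);
+}
+-->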
+<intrinsic tech="AVX2" name="_mm256_sub_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBB" form="ymm, ymm, ymm" xed="VPSUBB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBW" form="ymm, ymm, ymm" xed="VPSUBW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBD" form="ymm, ymm, ymm" xed="VPSUBD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_sub_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBQ" form="ymm, ymm, ymm" xed="VPSUBQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_subs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBSB" form="ymm, ymm, ymm" xed="VPSUBSB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_subs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBSW" form="ymm, ymm, ymm" xed="VPSUBSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_subs_epu8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBUSB" form="ymm, ymm, ymm" xed="VPSUBUSB_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_subs_epu16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBUSW" form="ymm, ymm, ymm" xed="VPSUBUSW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
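+<!-- A C sketch (ours; assumes AVX2) of saturating vs. wrapping subtraction:
+     the _subs_ forms clamp at the type bounds, so unsigned 10 minus 30 becomes
+     0 instead of wrapping around.
+
+#include <immintrin.h>
+
+void subs_demo(unsigned char out_sat[32], unsigned char out_wrap[32]) {
+    __m256i a = _mm256_set1_epi8(10);
+    __m256i b = _mm256_set1_epi8(30);
+    _mm256_storeu_si256((__m256i *)out_sat, _mm256_subs_epu8(a, b)); /* all 0 */
+    _mm256_storeu_si256((__m256i *)out_wrap, _mm256_sub_epi8(a, b)); /* all 0xEC */
+}
+-->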
+<intrinsic tech="AVX2" name="_mm256_xor_si256">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m256i" varname="a" etype="M256"/>
+ <parameter type="__m256i" varname="b" etype="M256"/>
+ <description>Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[255:0] := (a[255:0] XOR b[255:0])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPXOR" form="ymm, ymm, ymm" xed="VPXOR_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHBW" form="ymm, ymm, ymm" xed="VPUNPCKHBW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHWD" form="ymm, ymm, ymm" xed="VPUNPCKHWD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHDQ" form="ymm, ymm, ymm" xed="VPUNPCKHDQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHQDQ" form="ymm, ymm, ymm" xed="VPUNPCKHQDQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLBW" form="ymm, ymm, ymm" xed="VPUNPCKLBW_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLWD" form="ymm, ymm, ymm" xed="VPUNPCKLWD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLDQ" form="ymm, ymm, ymm" xed="VPUNPCKLDQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX2" name="_mm256_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLQDQ" form="ymm, ymm, ymm" xed="VPUNPCKLQDQ_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
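+<!-- A C sketch (ours; assumes AVX2): the 256-bit unpacks interleave within each
+     128-bit lane, so unpacklo/unpackhi are not simply the low and high halves
+     of the two inputs.
+
+#include <immintrin.h>
+
+void unpack_demo(long long lo[4], long long hi[4]) {
+    __m256i a = _mm256_setr_epi64x(0, 1, 2, 3);
+    __m256i b = _mm256_setr_epi64x(10, 11, 12, 13);
+    _mm256_storeu_si256((__m256i *)lo, _mm256_unpacklo_epi64(a, b)); /* {0,10,2,12} */
+    _mm256_storeu_si256((__m256i *)hi, _mm256_unpackhi_epi64(a, b)); /* {1,11,3,13} */
+}
+-->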
+<intrinsic tech="AVX-512" name="_mm512_kunpackd">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask64" varname="dst" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Unpack and interleave 32 bits from masks "a" and "b", and store the 64-bit result in "dst".</description>
+ <operation>
+dst[31:0] := b[31:0]
+dst[63:32] := a[31:0]
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="KUNPCKDQ" form="k, k, k" xed="KUNPCKDQ_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kunpackw">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask32" varname="dst" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Unpack and interleave 16 bits from masks "a" and "b", and store the 32-bit result in "dst".</description>
+ <operation>
+dst[15:0] := b[15:0]
+dst[31:16] := a[15:0]
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="KUNPCKWD" form="k, k, k" xed="KUNPCKWD_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
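+<!-- A C sketch (ours; assumes AVX512BW, e.g. -mavx512bw): KUNPCKWD concatenates
+     the low 16 bits of each mask, with "b" supplying the low half of the result.
+
+#include <immintrin.h>
+
+__mmask32 kunpackw_demo(void) {
+    __mmask32 a = 0x1234;
+    __mmask32 b = 0x5678;
+    return _mm512_kunpackw(a, b);   /* 0x12345678: a[15:0] above b[15:0] */
+}
+-->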
+<intrinsic tech="AVX-512" name="_mm256_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+FOR i := 0 to 1
+ tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ]
+ tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ]
+ tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ]
+ tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ]
+ENDFOR
+FOR j := 0 to 3
+ i := j*64
+ dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="ymm, ymm, ymm, imm8" xed="VDBPSADBW_YMMu16_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+FOR i := 0 to 1
+ tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ]
+ tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ]
+ tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ]
+ tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ]
+ENDFOR
+FOR j := 0 to 3
+ i := j*64
+ tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="ymm {k}, ymm, ymm, imm8" xed="VDBPSADBW_YMMu16_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+FOR i := 0 to 1
+ tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ]
+ tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ]
+ tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ]
+ tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ]
+ENDFOR
+FOR j := 0 to 3
+ i := j*64
+ tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="ymm {z}, ymm, ymm, imm8" xed="VDBPSADBW_YMMu16_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+FOR i := 0 to 3
+ tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ]
+ tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ]
+ tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ]
+ tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ]
+ENDFOR
+FOR j := 0 to 7
+ i := j*64
+ dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="zmm, zmm, zmm, imm8" xed="VDBPSADBW_ZMMu16_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+FOR i := 0 to 3
+ tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ]
+ tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ]
+ tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ]
+ tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ]
+ENDFOR
+FOR j := 0 to 7
+ i := j*64
+ tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="zmm {k}, zmm, zmm, imm8" xed="VDBPSADBW_ZMMu16_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+FOR i := 0 to 3
+ tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ]
+ tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ]
+ tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ]
+ tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ]
+ENDFOR
+FOR j := 0 to 7
+ i := j*64
+ tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="zmm {z}, zmm, zmm, imm8" xed="VDBPSADBW_ZMMu16_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+tmp.dword[0] := b.dword[ imm8[1:0] ]
+tmp.dword[1] := b.dword[ imm8[3:2] ]
+tmp.dword[2] := b.dword[ imm8[5:4] ]
+tmp.dword[3] := b.dword[ imm8[7:6] ]
+FOR j := 0 to 1
+ i := j*64
+ dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="xmm, xmm, xmm, imm8" xed="VDBPSADBW_XMMu16_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+tmp.dword[0] := b.dword[ imm8[1:0] ]
+tmp.dword[1] := b.dword[ imm8[3:2] ]
+tmp.dword[2] := b.dword[ imm8[5:4] ]
+tmp.dword[3] := b.dword[ imm8[7:6] ]
+FOR j := 0 to 1
+ i := j*64
+ tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="xmm {k}, xmm, xmm, imm8" xed="VDBPSADBW_XMMu16_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_dbsad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.</description>
+ <operation>
+tmp.dword[0] := b.dword[ imm8[1:0] ]
+tmp.dword[1] := b.dword[ imm8[3:2] ]
+tmp.dword[2] := b.dword[ imm8[5:4] ]
+tmp.dword[3] := b.dword[ imm8[7:6] ]
+FOR j := 0 to 1
+ i := j*64
+ tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+
+ tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+
+ tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+
+ tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDBPSADBW" form="xmm {z}, xmm, xmm, imm8" xed="VDBPSADBW_XMMu16_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
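+<!-- Illustrative sketch (not part of the intrinsic data): the 128-bit form of
+     VDBPSADBW above. With "b" all zero, tmp is zero and each 16-bit result is
+     simply the sum of one "a" quadruplet, which makes the offset pattern easy
+     to verify. Assumes an AVX512BW+AVX512VL target:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <stdio.h>
+
+     int main(void) {
+         uint8_t av[16], bv[16] = {0};
+         for (int i = 0; i < 16; i++) av[i] = (uint8_t)i;
+         __m128i a = _mm_loadu_si128((const __m128i *)av);
+         __m128i b = _mm_loadu_si128((const __m128i *)bv);
+         __m128i d = _mm_dbsad_epu8(a, b, 0);   /* imm8 must be a constant */
+         uint16_t out[8];
+         _mm_storeu_si128((__m128i *)out, d);
+         for (int j = 0; j < 8; j++)
+             printf("%u ", out[j]);             /* 6 6 22 22 38 38 54 54 */
+         printf("\n");
+         return 0;
+     }
+-->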
+<intrinsic tech="AVX-512" name="_mm256_mask_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="256"/>
+ <description>Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="ymm {k}, m256" xed="VMOVDQU16_YMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_mov_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="ymm {k}, ymm" xed="VMOVDQU16_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI16" memwidth="256"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Store packed 16-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU16" form="m256 {k}, ymm" xed="VMOVDQU16_MEMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="256"/>
+ <description>Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="ymm {z}, m256" xed="VMOVDQU16_YMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mov_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="ymm {z}, ymm" xed="VMOVDQU16_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="512"/>
+ <description>Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="zmm {k}, m512" xed="VMOVDQU16_ZMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_mov_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="zmm {k}, zmm" xed="VMOVDQU16_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI16" memwidth="512"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Store packed 16-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU16" form="m512 {k}, zmm" xed="VMOVDQU16_MEMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="512"/>
+ <description>Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="zmm {z}, m512" xed="VMOVDQU16_ZMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mov_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="zmm {z}, zmm" xed="VMOVDQU16_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="128"/>
+ <description>Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="xmm {k}, m128" xed="VMOVDQU16_XMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mov_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="xmm {k}, xmm" xed="VMOVDQU16_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI16" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Store packed 16-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU16" form="m128 {k}, xmm" xed="VMOVDQU16_MEMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="128"/>
+ <description>Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="xmm {z}, m128" xed="VMOVDQU16_XMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mov_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="xmm {z}, xmm" xed="VMOVDQU16_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
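+<!-- Illustrative sketch (not part of the intrinsic data): the masked 16-bit
+     load/store pair above is the usual way to handle a loop tail of n < 16
+     elements without reading or writing past the end of an array; lanes whose
+     mask bit is clear are not accessed. Assumes AVX512BW+AVX512VL:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <stdio.h>
+
+     int main(void) {
+         int16_t src[10] = {1,2,3,4,5,6,7,8,9,10}, dst[10] = {0};
+         unsigned n = 10;
+         __mmask16 k = (__mmask16)((1u << n) - 1);     /* low n bits set */
+         __m256i v = _mm256_maskz_loadu_epi16(k, src); /* lanes >= n are 0 */
+         v = _mm256_add_epi16(v, _mm256_set1_epi16(100));
+         _mm256_mask_storeu_epi16(dst, k, v);          /* writes n lanes only */
+         for (unsigned i = 0; i < n; i++)
+             printf("%d ", dst[i]);                    /* 101 ... 110 */
+         printf("\n");
+         return 0;
+     }
+-->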
+<intrinsic tech="AVX-512" name="_mm256_mask_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="256"/>
+ <description>Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="ymm {k}, m256" xed="VMOVDQU8_YMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_mov_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="ymm {k}, ymm" xed="VMOVDQU8_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI8" memwidth="256"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Store packed 8-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU8" form="m256 {k}, ymm" xed="VMOVDQU8_MEMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="256"/>
+ <description>Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="ymm {z}, m256" xed="VMOVDQU8_YMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mov_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="ymm {z}, ymm" xed="VMOVDQU8_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="512"/>
+ <description>Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="zmm {k}, m512" xed="VMOVDQU8_ZMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_mov_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="zmm {k}, zmm" xed="VMOVDQU8_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI8" memwidth="512"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Store packed 8-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU8" form="m512 {k}, zmm" xed="VMOVDQU8_MEMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="512"/>
+ <description>Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="zmm {z}, m512" xed="VMOVDQU8_ZMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mov_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="zmm {z}, zmm" xed="VMOVDQU8_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="128"/>
+ <description>Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="xmm {k}, m128" xed="VMOVDQU8_XMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mov_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="xmm {k}, xmm" xed="VMOVDQU8_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI8" memwidth="128"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Store packed 8-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU8" form="m128 {k}, xmm" xed="VMOVDQU8_MEMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="128"/>
+ <description>Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="xmm {z}, m128" xed="VMOVDQU8_XMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mov_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="xmm {z}, xmm" xed="VMOVDQU8_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
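+<!-- Illustrative sketch (not part of the intrinsic data): the masked byte move
+     above acts as a per-byte select between two vectors under a 64-bit mask.
+     Assumes an AVX512BW target:
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512i src = _mm512_set1_epi8('.');
+         __m512i a   = _mm512_set1_epi8('X');
+         __mmask64 k = 0xF0F0F0F0F0F0F0F0ULL;  /* alternate 4-byte groups */
+         __m512i r = _mm512_mask_mov_epi8(src, k, a);
+         char buf[65];
+         _mm512_storeu_si512(buf, r);
+         buf[64] = '\0';
+         printf("%s\n", buf);                  /* "....XXXX" repeated 8x */
+         return 0;
+     }
+-->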
+<intrinsic tech="AVX-512" name="_mm256_mask_abs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := ABS(a[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSB" form="ymm {k}, ymm" xed="VPABSB_YMMi8_MASKmskw_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_abs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := ABS(a[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSB" form="ymm {z}, ymm" xed="VPABSB_YMMi8_MASKmskw_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_abs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := ABS(a[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSB" form="zmm, zmm" xed="VPABSB_ZMMi8_MASKmskw_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_abs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := ABS(a[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSB" form="zmm {k}, zmm" xed="VPABSB_ZMMi8_MASKmskw_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_abs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := ABS(a[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSB" form="zmm {z}, zmm" xed="VPABSB_ZMMi8_MASKmskw_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_abs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := ABS(a[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSB" form="xmm {k}, xmm" xed="VPABSB_XMMi8_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_abs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := ABS(a[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSB" form="xmm {z}, xmm" xed="VPABSB_XMMi8_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
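+<!-- Illustrative sketch (not part of the intrinsic data): masked byte absolute
+     value; lanes whose mask bit is clear keep the value from "src". Assumes
+     AVX512BW+AVX512VL:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <stdio.h>
+
+     int main(void) {
+         int8_t in[16] = {-1,-2,-3,-4,5,6,7,8,-9,10,-11,12,-13,14,-15,16};
+         __m128i a = _mm_loadu_si128((const __m128i *)in);
+         __mmask16 k = 0x00FF;                 /* low 8 bytes only */
+         __m128i r = _mm_mask_abs_epi8(_mm_setzero_si128(), k, a);
+         uint8_t out[16];
+         _mm_storeu_si128((__m128i *)out, r);
+         for (int i = 0; i < 16; i++)
+             printf("%u ", out[i]);            /* 1..8, then eight zeros */
+         printf("\n");
+         return 0;
+     }
+-->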
+<intrinsic tech="AVX-512" name="_mm256_mask_abs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ABS(a[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSW" form="ymm {k}, ymm" xed="VPABSW_YMMi16_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_abs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ABS(a[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSW" form="ymm {z}, ymm" xed="VPABSW_YMMi16_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_abs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := ABS(a[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSW" form="zmm, zmm" xed="VPABSW_ZMMi16_MASKmskw_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_abs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ABS(a[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSW" form="zmm {k}, zmm" xed="VPABSW_ZMMi16_MASKmskw_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_abs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ABS(a[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSW" form="zmm {z}, zmm" xed="VPABSW_ZMMi16_MASKmskw_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_abs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ABS(a[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSW" form="xmm {k}, xmm" xed="VPABSW_XMMi16_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_abs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ABS(a[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSW" form="xmm {z}, xmm" xed="VPABSW_XMMi16_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
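+<!-- Illustrative sketch (not part of the intrinsic data): the zero-masked
+     16-bit form. Note that ABS(-32768) is 32768 (0x8000), which only fits
+     when the result is read as unsigned, matching the UI16 element type
+     declared above. Assumes an AVX512BW target:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512i a = _mm512_set1_epi16(-7);
+         __mmask32 k = 0x0000FFFFu;            /* keep the low 16 lanes */
+         __m512i r = _mm512_maskz_abs_epi16(k, a);
+         int16_t out[32];
+         _mm512_storeu_si512(out, r);
+         printf("%d %d\n", out[0], out[31]);   /* 7 0 */
+         return 0;
+     }
+-->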
+<intrinsic tech="AVX-512" name="_mm256_mask_packs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="src" etype="SI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := Saturate16(a[31:0])
+tmp_dst[31:16] := Saturate16(a[63:32])
+tmp_dst[47:32] := Saturate16(a[95:64])
+tmp_dst[63:48] := Saturate16(a[127:96])
+tmp_dst[79:64] := Saturate16(b[31:0])
+tmp_dst[95:80] := Saturate16(b[63:32])
+tmp_dst[111:96] := Saturate16(b[95:64])
+tmp_dst[127:112] := Saturate16(b[127:96])
+tmp_dst[143:128] := Saturate16(a[159:128])
+tmp_dst[159:144] := Saturate16(a[191:160])
+tmp_dst[175:160] := Saturate16(a[223:192])
+tmp_dst[191:176] := Saturate16(a[255:224])
+tmp_dst[207:192] := Saturate16(b[159:128])
+tmp_dst[223:208] := Saturate16(b[191:160])
+tmp_dst[239:224] := Saturate16(b[223:192])
+tmp_dst[255:240] := Saturate16(b[255:224])
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKSSDW" form="ymm {k}, ymm, ymm" xed="VPACKSSDW_YMMi16_MASKmskw_YMMi32_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_packs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := Saturate16(a[31:0])
+tmp_dst[31:16] := Saturate16(a[63:32])
+tmp_dst[47:32] := Saturate16(a[95:64])
+tmp_dst[63:48] := Saturate16(a[127:96])
+tmp_dst[79:64] := Saturate16(b[31:0])
+tmp_dst[95:80] := Saturate16(b[63:32])
+tmp_dst[111:96] := Saturate16(b[95:64])
+tmp_dst[127:112] := Saturate16(b[127:96])
+tmp_dst[143:128] := Saturate16(a[159:128])
+tmp_dst[159:144] := Saturate16(a[191:160])
+tmp_dst[175:160] := Saturate16(a[223:192])
+tmp_dst[191:176] := Saturate16(a[255:224])
+tmp_dst[207:192] := Saturate16(b[159:128])
+tmp_dst[223:208] := Saturate16(b[191:160])
+tmp_dst[239:224] := Saturate16(b[223:192])
+tmp_dst[255:240] := Saturate16(b[255:224])
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKSSDW" form="ymm {z}, ymm, ymm" xed="VPACKSSDW_YMMi16_MASKmskw_YMMi32_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_packs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__m512i" varname="src" etype="SI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := Saturate16(a[31:0])
+tmp_dst[31:16] := Saturate16(a[63:32])
+tmp_dst[47:32] := Saturate16(a[95:64])
+tmp_dst[63:48] := Saturate16(a[127:96])
+tmp_dst[79:64] := Saturate16(b[31:0])
+tmp_dst[95:80] := Saturate16(b[63:32])
+tmp_dst[111:96] := Saturate16(b[95:64])
+tmp_dst[127:112] := Saturate16(b[127:96])
+tmp_dst[143:128] := Saturate16(a[159:128])
+tmp_dst[159:144] := Saturate16(a[191:160])
+tmp_dst[175:160] := Saturate16(a[223:192])
+tmp_dst[191:176] := Saturate16(a[255:224])
+tmp_dst[207:192] := Saturate16(b[159:128])
+tmp_dst[223:208] := Saturate16(b[191:160])
+tmp_dst[239:224] := Saturate16(b[223:192])
+tmp_dst[255:240] := Saturate16(b[255:224])
+tmp_dst[271:256] := Saturate16(a[287:256])
+tmp_dst[287:272] := Saturate16(a[319:288])
+tmp_dst[303:288] := Saturate16(a[351:320])
+tmp_dst[319:304] := Saturate16(a[383:352])
+tmp_dst[335:320] := Saturate16(b[287:256])
+tmp_dst[351:336] := Saturate16(b[319:288])
+tmp_dst[367:352] := Saturate16(b[351:320])
+tmp_dst[383:368] := Saturate16(b[383:352])
+tmp_dst[399:384] := Saturate16(a[415:384])
+tmp_dst[415:400] := Saturate16(a[447:416])
+tmp_dst[431:416] := Saturate16(a[479:448])
+tmp_dst[447:432] := Saturate16(a[511:480])
+tmp_dst[463:448] := Saturate16(b[415:384])
+tmp_dst[479:464] := Saturate16(b[447:416])
+tmp_dst[495:480] := Saturate16(b[479:448])
+tmp_dst[511:496] := Saturate16(b[511:480])
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKSSDW" form="zmm {k}, zmm, zmm" xed="VPACKSSDW_ZMMi16_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_packs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := Saturate16(a[31:0])
+tmp_dst[31:16] := Saturate16(a[63:32])
+tmp_dst[47:32] := Saturate16(a[95:64])
+tmp_dst[63:48] := Saturate16(a[127:96])
+tmp_dst[79:64] := Saturate16(b[31:0])
+tmp_dst[95:80] := Saturate16(b[63:32])
+tmp_dst[111:96] := Saturate16(b[95:64])
+tmp_dst[127:112] := Saturate16(b[127:96])
+tmp_dst[143:128] := Saturate16(a[159:128])
+tmp_dst[159:144] := Saturate16(a[191:160])
+tmp_dst[175:160] := Saturate16(a[223:192])
+tmp_dst[191:176] := Saturate16(a[255:224])
+tmp_dst[207:192] := Saturate16(b[159:128])
+tmp_dst[223:208] := Saturate16(b[191:160])
+tmp_dst[239:224] := Saturate16(b[223:192])
+tmp_dst[255:240] := Saturate16(b[255:224])
+tmp_dst[271:256] := Saturate16(a[287:256])
+tmp_dst[287:272] := Saturate16(a[319:288])
+tmp_dst[303:288] := Saturate16(a[351:320])
+tmp_dst[319:304] := Saturate16(a[383:352])
+tmp_dst[335:320] := Saturate16(b[287:256])
+tmp_dst[351:336] := Saturate16(b[319:288])
+tmp_dst[367:352] := Saturate16(b[351:320])
+tmp_dst[383:368] := Saturate16(b[383:352])
+tmp_dst[399:384] := Saturate16(a[415:384])
+tmp_dst[415:400] := Saturate16(a[447:416])
+tmp_dst[431:416] := Saturate16(a[479:448])
+tmp_dst[447:432] := Saturate16(a[511:480])
+tmp_dst[463:448] := Saturate16(b[415:384])
+tmp_dst[479:464] := Saturate16(b[447:416])
+tmp_dst[495:480] := Saturate16(b[479:448])
+tmp_dst[511:496] := Saturate16(b[511:480])
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKSSDW" form="zmm {z}, zmm, zmm" xed="VPACKSSDW_ZMMi16_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_packs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[31:0])
+dst[31:16] := Saturate16(a[63:32])
+dst[47:32] := Saturate16(a[95:64])
+dst[63:48] := Saturate16(a[127:96])
+dst[79:64] := Saturate16(b[31:0])
+dst[95:80] := Saturate16(b[63:32])
+dst[111:96] := Saturate16(b[95:64])
+dst[127:112] := Saturate16(b[127:96])
+dst[143:128] := Saturate16(a[159:128])
+dst[159:144] := Saturate16(a[191:160])
+dst[175:160] := Saturate16(a[223:192])
+dst[191:176] := Saturate16(a[255:224])
+dst[207:192] := Saturate16(b[159:128])
+dst[223:208] := Saturate16(b[191:160])
+dst[239:224] := Saturate16(b[223:192])
+dst[255:240] := Saturate16(b[255:224])
+dst[271:256] := Saturate16(a[287:256])
+dst[287:272] := Saturate16(a[319:288])
+dst[303:288] := Saturate16(a[351:320])
+dst[319:304] := Saturate16(a[383:352])
+dst[335:320] := Saturate16(b[287:256])
+dst[351:336] := Saturate16(b[319:288])
+dst[367:352] := Saturate16(b[351:320])
+dst[383:368] := Saturate16(b[383:352])
+dst[399:384] := Saturate16(a[415:384])
+dst[415:400] := Saturate16(a[447:416])
+dst[431:416] := Saturate16(a[479:448])
+dst[447:432] := Saturate16(a[511:480])
+dst[463:448] := Saturate16(b[415:384])
+dst[479:464] := Saturate16(b[447:416])
+dst[495:480] := Saturate16(b[479:448])
+dst[511:496] := Saturate16(b[511:480])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKSSDW" form="zmm, zmm, zmm" xed="VPACKSSDW_ZMMi16_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
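+<!-- Illustrative sketch, not part of the canonical Intel data: the operation
+     above shows that VPACKSSDW interleaves "a" and "b" per 128-bit lane
+     rather than concatenating them end to end, which this hypothetical
+     program makes visible. Assumes -mavx512bw.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    int av[16], bv[16];
+    for (int i = 0; i < 16; i++) { av[i] = i; bv[i] = 100 + i; }
+    av[0] = 1 << 20;                    /* out of range: saturates to 32767 */
+
+    __m512i p = _mm512_packs_epi32(_mm512_loadu_si512(av),
+                                   _mm512_loadu_si512(bv));
+    short out[32];
+    _mm512_storeu_si512(out, p);
+    for (int i = 0; i < 32; i++) printf("%d ", out[i]);
+    printf("\n");
+    /* prints 32767 1 2 3 100 101 102 103 4 5 6 7 104 ... : four elements
+       of a, then four of b, repeating per 128-bit lane */
+    return 0;
+}
+-->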
+<intrinsic tech="AVX-512" name="_mm_mask_packs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="src" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := Saturate16(a[31:0])
+tmp_dst[31:16] := Saturate16(a[63:32])
+tmp_dst[47:32] := Saturate16(a[95:64])
+tmp_dst[63:48] := Saturate16(a[127:96])
+tmp_dst[79:64] := Saturate16(b[31:0])
+tmp_dst[95:80] := Saturate16(b[63:32])
+tmp_dst[111:96] := Saturate16(b[95:64])
+tmp_dst[127:112] := Saturate16(b[127:96])
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPACKSSDW" form="xmm {k}, xmm, xmm" xed="VPACKSSDW_XMMi16_MASKmskw_XMMi32_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_packs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := Saturate16(a[31:0])
+tmp_dst[31:16] := Saturate16(a[63:32])
+tmp_dst[47:32] := Saturate16(a[95:64])
+tmp_dst[63:48] := Saturate16(a[127:96])
+tmp_dst[79:64] := Saturate16(b[31:0])
+tmp_dst[95:80] := Saturate16(b[63:32])
+tmp_dst[111:96] := Saturate16(b[95:64])
+tmp_dst[127:112] := Saturate16(b[127:96])
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPACKSSDW" form="xmm {z}, xmm, xmm" xed="VPACKSSDW_XMMi16_MASKmskw_XMMi32_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
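+<!-- Illustrative sketch, not part of the canonical Intel data: in all the
+     masked pack forms above, "k" indexes the output 16-bit elements (hence
+     __mmask8 for the xmm form), not the 32-bit inputs. The helper name is
+     hypothetical; assumes -mavx512bw -mavx512vl.
+
+#include <immintrin.h>
+
+/* Keep only the four packed elements that came from "a"; the four that
+   came from "b" are zeroed. */
+__m128i packs_keep_a_half(__m128i a, __m128i b) {
+    return _mm_maskz_packs_epi32(0x0F, a, b);
+}
+-->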
+<intrinsic tech="AVX-512" name="_mm256_mask_packs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="SI8"/>
+ <parameter type="__m256i" varname="src" etype="SI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := Saturate8(a[15:0])
+tmp_dst[15:8] := Saturate8(a[31:16])
+tmp_dst[23:16] := Saturate8(a[47:32])
+tmp_dst[31:24] := Saturate8(a[63:48])
+tmp_dst[39:32] := Saturate8(a[79:64])
+tmp_dst[47:40] := Saturate8(a[95:80])
+tmp_dst[55:48] := Saturate8(a[111:96])
+tmp_dst[63:56] := Saturate8(a[127:112])
+tmp_dst[71:64] := Saturate8(b[15:0])
+tmp_dst[79:72] := Saturate8(b[31:16])
+tmp_dst[87:80] := Saturate8(b[47:32])
+tmp_dst[95:88] := Saturate8(b[63:48])
+tmp_dst[103:96] := Saturate8(b[79:64])
+tmp_dst[111:104] := Saturate8(b[95:80])
+tmp_dst[119:112] := Saturate8(b[111:96])
+tmp_dst[127:120] := Saturate8(b[127:112])
+tmp_dst[135:128] := Saturate8(a[143:128])
+tmp_dst[143:136] := Saturate8(a[159:144])
+tmp_dst[151:144] := Saturate8(a[175:160])
+tmp_dst[159:152] := Saturate8(a[191:176])
+tmp_dst[167:160] := Saturate8(a[207:192])
+tmp_dst[175:168] := Saturate8(a[223:208])
+tmp_dst[183:176] := Saturate8(a[239:224])
+tmp_dst[191:184] := Saturate8(a[255:240])
+tmp_dst[199:192] := Saturate8(b[143:128])
+tmp_dst[207:200] := Saturate8(b[159:144])
+tmp_dst[215:208] := Saturate8(b[175:160])
+tmp_dst[223:216] := Saturate8(b[191:176])
+tmp_dst[231:224] := Saturate8(b[207:192])
+tmp_dst[239:232] := Saturate8(b[223:208])
+tmp_dst[247:240] := Saturate8(b[239:224])
+tmp_dst[255:248] := Saturate8(b[255:240])
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKSSWB" form="ymm {k}, ymm, ymm" xed="VPACKSSWB_YMMi8_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_packs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := Saturate8(a[15:0])
+tmp_dst[15:8] := Saturate8(a[31:16])
+tmp_dst[23:16] := Saturate8(a[47:32])
+tmp_dst[31:24] := Saturate8(a[63:48])
+tmp_dst[39:32] := Saturate8(a[79:64])
+tmp_dst[47:40] := Saturate8(a[95:80])
+tmp_dst[55:48] := Saturate8(a[111:96])
+tmp_dst[63:56] := Saturate8(a[127:112])
+tmp_dst[71:64] := Saturate8(b[15:0])
+tmp_dst[79:72] := Saturate8(b[31:16])
+tmp_dst[87:80] := Saturate8(b[47:32])
+tmp_dst[95:88] := Saturate8(b[63:48])
+tmp_dst[103:96] := Saturate8(b[79:64])
+tmp_dst[111:104] := Saturate8(b[95:80])
+tmp_dst[119:112] := Saturate8(b[111:96])
+tmp_dst[127:120] := Saturate8(b[127:112])
+tmp_dst[135:128] := Saturate8(a[143:128])
+tmp_dst[143:136] := Saturate8(a[159:144])
+tmp_dst[151:144] := Saturate8(a[175:160])
+tmp_dst[159:152] := Saturate8(a[191:176])
+tmp_dst[167:160] := Saturate8(a[207:192])
+tmp_dst[175:168] := Saturate8(a[223:208])
+tmp_dst[183:176] := Saturate8(a[239:224])
+tmp_dst[191:184] := Saturate8(a[255:240])
+tmp_dst[199:192] := Saturate8(b[143:128])
+tmp_dst[207:200] := Saturate8(b[159:144])
+tmp_dst[215:208] := Saturate8(b[175:160])
+tmp_dst[223:216] := Saturate8(b[191:176])
+tmp_dst[231:224] := Saturate8(b[207:192])
+tmp_dst[239:232] := Saturate8(b[223:208])
+tmp_dst[247:240] := Saturate8(b[239:224])
+tmp_dst[255:248] := Saturate8(b[255:240])
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKSSWB" form="ymm {z}, ymm, ymm" xed="VPACKSSWB_YMMi8_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_packs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="SI8"/>
+ <parameter type="__m512i" varname="src" etype="SI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := Saturate8(a[15:0])
+tmp_dst[15:8] := Saturate8(a[31:16])
+tmp_dst[23:16] := Saturate8(a[47:32])
+tmp_dst[31:24] := Saturate8(a[63:48])
+tmp_dst[39:32] := Saturate8(a[79:64])
+tmp_dst[47:40] := Saturate8(a[95:80])
+tmp_dst[55:48] := Saturate8(a[111:96])
+tmp_dst[63:56] := Saturate8(a[127:112])
+tmp_dst[71:64] := Saturate8(b[15:0])
+tmp_dst[79:72] := Saturate8(b[31:16])
+tmp_dst[87:80] := Saturate8(b[47:32])
+tmp_dst[95:88] := Saturate8(b[63:48])
+tmp_dst[103:96] := Saturate8(b[79:64])
+tmp_dst[111:104] := Saturate8(b[95:80])
+tmp_dst[119:112] := Saturate8(b[111:96])
+tmp_dst[127:120] := Saturate8(b[127:112])
+tmp_dst[135:128] := Saturate8(a[143:128])
+tmp_dst[143:136] := Saturate8(a[159:144])
+tmp_dst[151:144] := Saturate8(a[175:160])
+tmp_dst[159:152] := Saturate8(a[191:176])
+tmp_dst[167:160] := Saturate8(a[207:192])
+tmp_dst[175:168] := Saturate8(a[223:208])
+tmp_dst[183:176] := Saturate8(a[239:224])
+tmp_dst[191:184] := Saturate8(a[255:240])
+tmp_dst[199:192] := Saturate8(b[143:128])
+tmp_dst[207:200] := Saturate8(b[159:144])
+tmp_dst[215:208] := Saturate8(b[175:160])
+tmp_dst[223:216] := Saturate8(b[191:176])
+tmp_dst[231:224] := Saturate8(b[207:192])
+tmp_dst[239:232] := Saturate8(b[223:208])
+tmp_dst[247:240] := Saturate8(b[239:224])
+tmp_dst[255:248] := Saturate8(b[255:240])
+tmp_dst[263:256] := Saturate8(a[271:256])
+tmp_dst[271:264] := Saturate8(a[287:272])
+tmp_dst[279:272] := Saturate8(a[303:288])
+tmp_dst[287:280] := Saturate8(a[319:304])
+tmp_dst[295:288] := Saturate8(a[335:320])
+tmp_dst[303:296] := Saturate8(a[351:336])
+tmp_dst[311:304] := Saturate8(a[367:352])
+tmp_dst[319:312] := Saturate8(a[383:368])
+tmp_dst[327:320] := Saturate8(b[271:256])
+tmp_dst[335:328] := Saturate8(b[287:272])
+tmp_dst[343:336] := Saturate8(b[303:288])
+tmp_dst[351:344] := Saturate8(b[319:304])
+tmp_dst[359:352] := Saturate8(b[335:320])
+tmp_dst[367:360] := Saturate8(b[351:336])
+tmp_dst[375:368] := Saturate8(b[367:352])
+tmp_dst[383:376] := Saturate8(b[383:368])
+tmp_dst[391:384] := Saturate8(a[399:384])
+tmp_dst[399:392] := Saturate8(a[415:400])
+tmp_dst[407:400] := Saturate8(a[431:416])
+tmp_dst[415:408] := Saturate8(a[447:432])
+tmp_dst[423:416] := Saturate8(a[463:448])
+tmp_dst[431:424] := Saturate8(a[479:464])
+tmp_dst[439:432] := Saturate8(a[495:480])
+tmp_dst[447:440] := Saturate8(a[511:496])
+tmp_dst[455:448] := Saturate8(b[399:384])
+tmp_dst[463:456] := Saturate8(b[415:400])
+tmp_dst[471:464] := Saturate8(b[431:416])
+tmp_dst[479:472] := Saturate8(b[447:432])
+tmp_dst[487:480] := Saturate8(b[463:448])
+tmp_dst[495:488] := Saturate8(b[479:464])
+tmp_dst[503:496] := Saturate8(b[495:480])
+tmp_dst[511:504] := Saturate8(b[511:496])
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKSSWB" form="zmm {k}, zmm, zmm" xed="VPACKSSWB_ZMMi8_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_packs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := Saturate8(a[15:0])
+tmp_dst[15:8] := Saturate8(a[31:16])
+tmp_dst[23:16] := Saturate8(a[47:32])
+tmp_dst[31:24] := Saturate8(a[63:48])
+tmp_dst[39:32] := Saturate8(a[79:64])
+tmp_dst[47:40] := Saturate8(a[95:80])
+tmp_dst[55:48] := Saturate8(a[111:96])
+tmp_dst[63:56] := Saturate8(a[127:112])
+tmp_dst[71:64] := Saturate8(b[15:0])
+tmp_dst[79:72] := Saturate8(b[31:16])
+tmp_dst[87:80] := Saturate8(b[47:32])
+tmp_dst[95:88] := Saturate8(b[63:48])
+tmp_dst[103:96] := Saturate8(b[79:64])
+tmp_dst[111:104] := Saturate8(b[95:80])
+tmp_dst[119:112] := Saturate8(b[111:96])
+tmp_dst[127:120] := Saturate8(b[127:112])
+tmp_dst[135:128] := Saturate8(a[143:128])
+tmp_dst[143:136] := Saturate8(a[159:144])
+tmp_dst[151:144] := Saturate8(a[175:160])
+tmp_dst[159:152] := Saturate8(a[191:176])
+tmp_dst[167:160] := Saturate8(a[207:192])
+tmp_dst[175:168] := Saturate8(a[223:208])
+tmp_dst[183:176] := Saturate8(a[239:224])
+tmp_dst[191:184] := Saturate8(a[255:240])
+tmp_dst[199:192] := Saturate8(b[143:128])
+tmp_dst[207:200] := Saturate8(b[159:144])
+tmp_dst[215:208] := Saturate8(b[175:160])
+tmp_dst[223:216] := Saturate8(b[191:176])
+tmp_dst[231:224] := Saturate8(b[207:192])
+tmp_dst[239:232] := Saturate8(b[223:208])
+tmp_dst[247:240] := Saturate8(b[239:224])
+tmp_dst[255:248] := Saturate8(b[255:240])
+tmp_dst[263:256] := Saturate8(a[271:256])
+tmp_dst[271:264] := Saturate8(a[287:272])
+tmp_dst[279:272] := Saturate8(a[303:288])
+tmp_dst[287:280] := Saturate8(a[319:304])
+tmp_dst[295:288] := Saturate8(a[335:320])
+tmp_dst[303:296] := Saturate8(a[351:336])
+tmp_dst[311:304] := Saturate8(a[367:352])
+tmp_dst[319:312] := Saturate8(a[383:368])
+tmp_dst[327:320] := Saturate8(b[271:256])
+tmp_dst[335:328] := Saturate8(b[287:272])
+tmp_dst[343:336] := Saturate8(b[303:288])
+tmp_dst[351:344] := Saturate8(b[319:304])
+tmp_dst[359:352] := Saturate8(b[335:320])
+tmp_dst[367:360] := Saturate8(b[351:336])
+tmp_dst[375:368] := Saturate8(b[367:352])
+tmp_dst[383:376] := Saturate8(b[383:368])
+tmp_dst[391:384] := Saturate8(a[399:384])
+tmp_dst[399:392] := Saturate8(a[415:400])
+tmp_dst[407:400] := Saturate8(a[431:416])
+tmp_dst[415:408] := Saturate8(a[447:432])
+tmp_dst[423:416] := Saturate8(a[463:448])
+tmp_dst[431:424] := Saturate8(a[479:464])
+tmp_dst[439:432] := Saturate8(a[495:480])
+tmp_dst[447:440] := Saturate8(a[511:496])
+tmp_dst[455:448] := Saturate8(b[399:384])
+tmp_dst[463:456] := Saturate8(b[415:400])
+tmp_dst[471:464] := Saturate8(b[431:416])
+tmp_dst[479:472] := Saturate8(b[447:432])
+tmp_dst[487:480] := Saturate8(b[463:448])
+tmp_dst[495:488] := Saturate8(b[479:464])
+tmp_dst[503:496] := Saturate8(b[495:480])
+tmp_dst[511:504] := Saturate8(b[511:496])
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKSSWB" form="zmm {z}, zmm, zmm" xed="VPACKSSWB_ZMMi8_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_packs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="SI8"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := Saturate8(a[15:0])
+dst[15:8] := Saturate8(a[31:16])
+dst[23:16] := Saturate8(a[47:32])
+dst[31:24] := Saturate8(a[63:48])
+dst[39:32] := Saturate8(a[79:64])
+dst[47:40] := Saturate8(a[95:80])
+dst[55:48] := Saturate8(a[111:96])
+dst[63:56] := Saturate8(a[127:112])
+dst[71:64] := Saturate8(b[15:0])
+dst[79:72] := Saturate8(b[31:16])
+dst[87:80] := Saturate8(b[47:32])
+dst[95:88] := Saturate8(b[63:48])
+dst[103:96] := Saturate8(b[79:64])
+dst[111:104] := Saturate8(b[95:80])
+dst[119:112] := Saturate8(b[111:96])
+dst[127:120] := Saturate8(b[127:112])
+dst[135:128] := Saturate8(a[143:128])
+dst[143:136] := Saturate8(a[159:144])
+dst[151:144] := Saturate8(a[175:160])
+dst[159:152] := Saturate8(a[191:176])
+dst[167:160] := Saturate8(a[207:192])
+dst[175:168] := Saturate8(a[223:208])
+dst[183:176] := Saturate8(a[239:224])
+dst[191:184] := Saturate8(a[255:240])
+dst[199:192] := Saturate8(b[143:128])
+dst[207:200] := Saturate8(b[159:144])
+dst[215:208] := Saturate8(b[175:160])
+dst[223:216] := Saturate8(b[191:176])
+dst[231:224] := Saturate8(b[207:192])
+dst[239:232] := Saturate8(b[223:208])
+dst[247:240] := Saturate8(b[239:224])
+dst[255:248] := Saturate8(b[255:240])
+dst[263:256] := Saturate8(a[271:256])
+dst[271:264] := Saturate8(a[287:272])
+dst[279:272] := Saturate8(a[303:288])
+dst[287:280] := Saturate8(a[319:304])
+dst[295:288] := Saturate8(a[335:320])
+dst[303:296] := Saturate8(a[351:336])
+dst[311:304] := Saturate8(a[367:352])
+dst[319:312] := Saturate8(a[383:368])
+dst[327:320] := Saturate8(b[271:256])
+dst[335:328] := Saturate8(b[287:272])
+dst[343:336] := Saturate8(b[303:288])
+dst[351:344] := Saturate8(b[319:304])
+dst[359:352] := Saturate8(b[335:320])
+dst[367:360] := Saturate8(b[351:336])
+dst[375:368] := Saturate8(b[367:352])
+dst[383:376] := Saturate8(b[383:368])
+dst[391:384] := Saturate8(a[399:384])
+dst[399:392] := Saturate8(a[415:400])
+dst[407:400] := Saturate8(a[431:416])
+dst[415:408] := Saturate8(a[447:432])
+dst[423:416] := Saturate8(a[463:448])
+dst[431:424] := Saturate8(a[479:464])
+dst[439:432] := Saturate8(a[495:480])
+dst[447:440] := Saturate8(a[511:496])
+dst[455:448] := Saturate8(b[399:384])
+dst[463:456] := Saturate8(b[415:400])
+dst[471:464] := Saturate8(b[431:416])
+dst[479:472] := Saturate8(b[447:432])
+dst[487:480] := Saturate8(b[463:448])
+dst[495:488] := Saturate8(b[479:464])
+dst[503:496] := Saturate8(b[495:480])
+dst[511:504] := Saturate8(b[511:496])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKSSWB" form="zmm, zmm, zmm" xed="VPACKSSWB_ZMMi8_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
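+<!-- Illustrative sketch, not part of the canonical Intel data: the same
+     lane-interleave pattern at byte granularity; each 128-bit lane of the
+     result holds 8 saturated bytes from "a" followed by 8 from "b". The
+     helper name is hypothetical; assumes -mavx512bw.
+
+#include <immintrin.h>
+
+/* Values outside [-128, 127] clamp to the nearest bound. */
+__m512i clamp_i16_to_i8(__m512i a, __m512i b) {
+    return _mm512_packs_epi16(a, b);
+}
+-->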
+<intrinsic tech="AVX-512" name="_mm_mask_packs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="SI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := Saturate8(a[15:0])
+tmp_dst[15:8] := Saturate8(a[31:16])
+tmp_dst[23:16] := Saturate8(a[47:32])
+tmp_dst[31:24] := Saturate8(a[63:48])
+tmp_dst[39:32] := Saturate8(a[79:64])
+tmp_dst[47:40] := Saturate8(a[95:80])
+tmp_dst[55:48] := Saturate8(a[111:96])
+tmp_dst[63:56] := Saturate8(a[127:112])
+tmp_dst[71:64] := Saturate8(b[15:0])
+tmp_dst[79:72] := Saturate8(b[31:16])
+tmp_dst[87:80] := Saturate8(b[47:32])
+tmp_dst[95:88] := Saturate8(b[63:48])
+tmp_dst[103:96] := Saturate8(b[79:64])
+tmp_dst[111:104] := Saturate8(b[95:80])
+tmp_dst[119:112] := Saturate8(b[111:96])
+tmp_dst[127:120] := Saturate8(b[127:112])
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPACKSSWB" form="xmm {k}, xmm, xmm" xed="VPACKSSWB_XMMi8_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_packs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := Saturate8(a[15:0])
+tmp_dst[15:8] := Saturate8(a[31:16])
+tmp_dst[23:16] := Saturate8(a[47:32])
+tmp_dst[31:24] := Saturate8(a[63:48])
+tmp_dst[39:32] := Saturate8(a[79:64])
+tmp_dst[47:40] := Saturate8(a[95:80])
+tmp_dst[55:48] := Saturate8(a[111:96])
+tmp_dst[63:56] := Saturate8(a[127:112])
+tmp_dst[71:64] := Saturate8(b[15:0])
+tmp_dst[79:72] := Saturate8(b[31:16])
+tmp_dst[87:80] := Saturate8(b[47:32])
+tmp_dst[95:88] := Saturate8(b[63:48])
+tmp_dst[103:96] := Saturate8(b[79:64])
+tmp_dst[111:104] := Saturate8(b[95:80])
+tmp_dst[119:112] := Saturate8(b[111:96])
+tmp_dst[127:120] := Saturate8(b[127:112])
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPACKSSWB" form="xmm {z}, xmm, xmm" xed="VPACKSSWB_XMMi8_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
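+<!-- Illustrative sketch, not part of the canonical Intel data: the xmm
+     byte-pack forms take a 16-bit mask, one bit per output byte. The helper
+     name is hypothetical; assumes -mavx512bw -mavx512vl.
+
+#include <immintrin.h>
+
+/* Even output bytes come from the pack; odd bytes are kept from "src". */
+__m128i packs_merge_even(__m128i src, __m128i a, __m128i b) {
+    return _mm_mask_packs_epi16(src, 0x5555, a, b);
+}
+-->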
+<intrinsic tech="AVX-512" name="_mm256_mask_packus_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := SaturateU16(a[31:0])
+tmp_dst[31:16] := SaturateU16(a[63:32])
+tmp_dst[47:32] := SaturateU16(a[95:64])
+tmp_dst[63:48] := SaturateU16(a[127:96])
+tmp_dst[79:64] := SaturateU16(b[31:0])
+tmp_dst[95:80] := SaturateU16(b[63:32])
+tmp_dst[111:96] := SaturateU16(b[95:64])
+tmp_dst[127:112] := SaturateU16(b[127:96])
+tmp_dst[143:128] := SaturateU16(a[159:128])
+tmp_dst[159:144] := SaturateU16(a[191:160])
+tmp_dst[175:160] := SaturateU16(a[223:192])
+tmp_dst[191:176] := SaturateU16(a[255:224])
+tmp_dst[207:192] := SaturateU16(b[159:128])
+tmp_dst[223:208] := SaturateU16(b[191:160])
+tmp_dst[239:224] := SaturateU16(b[223:192])
+tmp_dst[255:240] := SaturateU16(b[255:224])
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKUSDW" form="ymm {k}, ymm, ymm" xed="VPACKUSDW_YMMu16_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_packus_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := SaturateU16(a[31:0])
+tmp_dst[31:16] := SaturateU16(a[63:32])
+tmp_dst[47:32] := SaturateU16(a[95:64])
+tmp_dst[63:48] := SaturateU16(a[127:96])
+tmp_dst[79:64] := SaturateU16(b[31:0])
+tmp_dst[95:80] := SaturateU16(b[63:32])
+tmp_dst[111:96] := SaturateU16(b[95:64])
+tmp_dst[127:112] := SaturateU16(b[127:96])
+tmp_dst[143:128] := SaturateU16(a[159:128])
+tmp_dst[159:144] := SaturateU16(a[191:160])
+tmp_dst[175:160] := SaturateU16(a[223:192])
+tmp_dst[191:176] := SaturateU16(a[255:224])
+tmp_dst[207:192] := SaturateU16(b[159:128])
+tmp_dst[223:208] := SaturateU16(b[191:160])
+tmp_dst[239:224] := SaturateU16(b[223:192])
+tmp_dst[255:240] := SaturateU16(b[255:224])
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKUSDW" form="ymm {z}, ymm, ymm" xed="VPACKUSDW_YMMu16_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_packus_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := SaturateU16(a[31:0])
+tmp_dst[31:16] := SaturateU16(a[63:32])
+tmp_dst[47:32] := SaturateU16(a[95:64])
+tmp_dst[63:48] := SaturateU16(a[127:96])
+tmp_dst[79:64] := SaturateU16(b[31:0])
+tmp_dst[95:80] := SaturateU16(b[63:32])
+tmp_dst[111:96] := SaturateU16(b[95:64])
+tmp_dst[127:112] := SaturateU16(b[127:96])
+tmp_dst[143:128] := SaturateU16(a[159:128])
+tmp_dst[159:144] := SaturateU16(a[191:160])
+tmp_dst[175:160] := SaturateU16(a[223:192])
+tmp_dst[191:176] := SaturateU16(a[255:224])
+tmp_dst[207:192] := SaturateU16(b[159:128])
+tmp_dst[223:208] := SaturateU16(b[191:160])
+tmp_dst[239:224] := SaturateU16(b[223:192])
+tmp_dst[255:240] := SaturateU16(b[255:224])
+tmp_dst[271:256] := SaturateU16(a[287:256])
+tmp_dst[287:272] := SaturateU16(a[319:288])
+tmp_dst[303:288] := SaturateU16(a[351:320])
+tmp_dst[319:304] := SaturateU16(a[383:352])
+tmp_dst[335:320] := SaturateU16(b[287:256])
+tmp_dst[351:336] := SaturateU16(b[319:288])
+tmp_dst[367:352] := SaturateU16(b[351:320])
+tmp_dst[383:368] := SaturateU16(b[383:352])
+tmp_dst[399:384] := SaturateU16(a[415:384])
+tmp_dst[415:400] := SaturateU16(a[447:416])
+tmp_dst[431:416] := SaturateU16(a[479:448])
+tmp_dst[447:432] := SaturateU16(a[511:480])
+tmp_dst[463:448] := SaturateU16(b[415:384])
+tmp_dst[479:464] := SaturateU16(b[447:416])
+tmp_dst[495:480] := SaturateU16(b[479:448])
+tmp_dst[511:496] := SaturateU16(b[511:480])
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKUSDW" form="zmm {k}, zmm, zmm" xed="VPACKUSDW_ZMMu16_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_packus_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := SaturateU16(a[31:0])
+tmp_dst[31:16] := SaturateU16(a[63:32])
+tmp_dst[47:32] := SaturateU16(a[95:64])
+tmp_dst[63:48] := SaturateU16(a[127:96])
+tmp_dst[79:64] := SaturateU16(b[31:0])
+tmp_dst[95:80] := SaturateU16(b[63:32])
+tmp_dst[111:96] := SaturateU16(b[95:64])
+tmp_dst[127:112] := SaturateU16(b[127:96])
+tmp_dst[143:128] := SaturateU16(a[159:128])
+tmp_dst[159:144] := SaturateU16(a[191:160])
+tmp_dst[175:160] := SaturateU16(a[223:192])
+tmp_dst[191:176] := SaturateU16(a[255:224])
+tmp_dst[207:192] := SaturateU16(b[159:128])
+tmp_dst[223:208] := SaturateU16(b[191:160])
+tmp_dst[239:224] := SaturateU16(b[223:192])
+tmp_dst[255:240] := SaturateU16(b[255:224])
+tmp_dst[271:256] := SaturateU16(a[287:256])
+tmp_dst[287:272] := SaturateU16(a[319:288])
+tmp_dst[303:288] := SaturateU16(a[351:320])
+tmp_dst[319:304] := SaturateU16(a[383:352])
+tmp_dst[335:320] := SaturateU16(b[287:256])
+tmp_dst[351:336] := SaturateU16(b[319:288])
+tmp_dst[367:352] := SaturateU16(b[351:320])
+tmp_dst[383:368] := SaturateU16(b[383:352])
+tmp_dst[399:384] := SaturateU16(a[415:384])
+tmp_dst[415:400] := SaturateU16(a[447:416])
+tmp_dst[431:416] := SaturateU16(a[479:448])
+tmp_dst[447:432] := SaturateU16(a[511:480])
+tmp_dst[463:448] := SaturateU16(b[415:384])
+tmp_dst[479:464] := SaturateU16(b[447:416])
+tmp_dst[495:480] := SaturateU16(b[479:448])
+tmp_dst[511:496] := SaturateU16(b[511:480])
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKUSDW" form="zmm {z}, zmm, zmm" xed="VPACKUSDW_ZMMu16_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_packus_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst".</description>
+ <operation>
+dst[15:0] := SaturateU16(a[31:0])
+dst[31:16] := SaturateU16(a[63:32])
+dst[47:32] := SaturateU16(a[95:64])
+dst[63:48] := SaturateU16(a[127:96])
+dst[79:64] := SaturateU16(b[31:0])
+dst[95:80] := SaturateU16(b[63:32])
+dst[111:96] := SaturateU16(b[95:64])
+dst[127:112] := SaturateU16(b[127:96])
+dst[143:128] := SaturateU16(a[159:128])
+dst[159:144] := SaturateU16(a[191:160])
+dst[175:160] := SaturateU16(a[223:192])
+dst[191:176] := SaturateU16(a[255:224])
+dst[207:192] := SaturateU16(b[159:128])
+dst[223:208] := SaturateU16(b[191:160])
+dst[239:224] := SaturateU16(b[223:192])
+dst[255:240] := SaturateU16(b[255:224])
+dst[271:256] := SaturateU16(a[287:256])
+dst[287:272] := SaturateU16(a[319:288])
+dst[303:288] := SaturateU16(a[351:320])
+dst[319:304] := SaturateU16(a[383:352])
+dst[335:320] := SaturateU16(b[287:256])
+dst[351:336] := SaturateU16(b[319:288])
+dst[367:352] := SaturateU16(b[351:320])
+dst[383:368] := SaturateU16(b[383:352])
+dst[399:384] := SaturateU16(a[415:384])
+dst[415:400] := SaturateU16(a[447:416])
+dst[431:416] := SaturateU16(a[479:448])
+dst[447:432] := SaturateU16(a[511:480])
+dst[463:448] := SaturateU16(b[415:384])
+dst[479:464] := SaturateU16(b[447:416])
+dst[495:480] := SaturateU16(b[479:448])
+dst[511:496] := SaturateU16(b[511:480])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKUSDW" form="zmm, zmm, zmm" xed="VPACKUSDW_ZMMu16_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
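+<!-- Illustrative sketch, not part of the canonical Intel data: packus applies
+     unsigned saturation to signed inputs, so negatives clamp to 0 and values
+     above 65535 clamp to 65535, exactly as SaturateU16 above spells out. The
+     helper name is hypothetical; assumes -mavx512bw.
+
+#include <immintrin.h>
+
+__m512i pack_to_u16(__m512i a, __m512i b) {
+    /* e.g. an input of -5 becomes 0, and 70000 becomes 65535 */
+    return _mm512_packus_epi32(a, b);
+}
+-->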
+<intrinsic tech="AVX-512" name="_mm_mask_packus_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := SaturateU16(a[31:0])
+tmp_dst[31:16] := SaturateU16(a[63:32])
+tmp_dst[47:32] := SaturateU16(a[95:64])
+tmp_dst[63:48] := SaturateU16(a[127:96])
+tmp_dst[79:64] := SaturateU16(b[31:0])
+tmp_dst[95:80] := SaturateU16(b[63:32])
+tmp_dst[111:96] := SaturateU16(b[95:64])
+tmp_dst[127:112] := SaturateU16(b[127:96])
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPACKUSDW" form="xmm {k}, xmm, xmm" xed="VPACKUSDW_XMMu16_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_packus_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := SaturateU16(a[31:0])
+tmp_dst[31:16] := SaturateU16(a[63:32])
+tmp_dst[47:32] := SaturateU16(a[95:64])
+tmp_dst[63:48] := SaturateU16(a[127:96])
+tmp_dst[79:64] := SaturateU16(b[31:0])
+tmp_dst[95:80] := SaturateU16(b[63:32])
+tmp_dst[111:96] := SaturateU16(b[95:64])
+tmp_dst[127:112] := SaturateU16(b[127:96])
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPACKUSDW" form="xmm {z}, xmm, xmm" xed="VPACKUSDW_XMMu16_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_packus_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := SaturateU8(a[15:0])
+tmp_dst[15:8] := SaturateU8(a[31:16])
+tmp_dst[23:16] := SaturateU8(a[47:32])
+tmp_dst[31:24] := SaturateU8(a[63:48])
+tmp_dst[39:32] := SaturateU8(a[79:64])
+tmp_dst[47:40] := SaturateU8(a[95:80])
+tmp_dst[55:48] := SaturateU8(a[111:96])
+tmp_dst[63:56] := SaturateU8(a[127:112])
+tmp_dst[71:64] := SaturateU8(b[15:0])
+tmp_dst[79:72] := SaturateU8(b[31:16])
+tmp_dst[87:80] := SaturateU8(b[47:32])
+tmp_dst[95:88] := SaturateU8(b[63:48])
+tmp_dst[103:96] := SaturateU8(b[79:64])
+tmp_dst[111:104] := SaturateU8(b[95:80])
+tmp_dst[119:112] := SaturateU8(b[111:96])
+tmp_dst[127:120] := SaturateU8(b[127:112])
+tmp_dst[135:128] := SaturateU8(a[143:128])
+tmp_dst[143:136] := SaturateU8(a[159:144])
+tmp_dst[151:144] := SaturateU8(a[175:160])
+tmp_dst[159:152] := SaturateU8(a[191:176])
+tmp_dst[167:160] := SaturateU8(a[207:192])
+tmp_dst[175:168] := SaturateU8(a[223:208])
+tmp_dst[183:176] := SaturateU8(a[239:224])
+tmp_dst[191:184] := SaturateU8(a[255:240])
+tmp_dst[199:192] := SaturateU8(b[143:128])
+tmp_dst[207:200] := SaturateU8(b[159:144])
+tmp_dst[215:208] := SaturateU8(b[175:160])
+tmp_dst[223:216] := SaturateU8(b[191:176])
+tmp_dst[231:224] := SaturateU8(b[207:192])
+tmp_dst[239:232] := SaturateU8(b[223:208])
+tmp_dst[247:240] := SaturateU8(b[239:224])
+tmp_dst[255:248] := SaturateU8(b[255:240])
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKUSWB" form="ymm {k}, ymm, ymm" xed="VPACKUSWB_YMMu8_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_packus_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := SaturateU8(a[15:0])
+tmp_dst[15:8] := SaturateU8(a[31:16])
+tmp_dst[23:16] := SaturateU8(a[47:32])
+tmp_dst[31:24] := SaturateU8(a[63:48])
+tmp_dst[39:32] := SaturateU8(a[79:64])
+tmp_dst[47:40] := SaturateU8(a[95:80])
+tmp_dst[55:48] := SaturateU8(a[111:96])
+tmp_dst[63:56] := SaturateU8(a[127:112])
+tmp_dst[71:64] := SaturateU8(b[15:0])
+tmp_dst[79:72] := SaturateU8(b[31:16])
+tmp_dst[87:80] := SaturateU8(b[47:32])
+tmp_dst[95:88] := SaturateU8(b[63:48])
+tmp_dst[103:96] := SaturateU8(b[79:64])
+tmp_dst[111:104] := SaturateU8(b[95:80])
+tmp_dst[119:112] := SaturateU8(b[111:96])
+tmp_dst[127:120] := SaturateU8(b[127:112])
+tmp_dst[135:128] := SaturateU8(a[143:128])
+tmp_dst[143:136] := SaturateU8(a[159:144])
+tmp_dst[151:144] := SaturateU8(a[175:160])
+tmp_dst[159:152] := SaturateU8(a[191:176])
+tmp_dst[167:160] := SaturateU8(a[207:192])
+tmp_dst[175:168] := SaturateU8(a[223:208])
+tmp_dst[183:176] := SaturateU8(a[239:224])
+tmp_dst[191:184] := SaturateU8(a[255:240])
+tmp_dst[199:192] := SaturateU8(b[143:128])
+tmp_dst[207:200] := SaturateU8(b[159:144])
+tmp_dst[215:208] := SaturateU8(b[175:160])
+tmp_dst[223:216] := SaturateU8(b[191:176])
+tmp_dst[231:224] := SaturateU8(b[207:192])
+tmp_dst[239:232] := SaturateU8(b[223:208])
+tmp_dst[247:240] := SaturateU8(b[239:224])
+tmp_dst[255:248] := SaturateU8(b[255:240])
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPACKUSWB" form="ymm {z}, ymm, ymm" xed="VPACKUSWB_YMMu8_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_packus_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := SaturateU8(a[15:0])
+tmp_dst[15:8] := SaturateU8(a[31:16])
+tmp_dst[23:16] := SaturateU8(a[47:32])
+tmp_dst[31:24] := SaturateU8(a[63:48])
+tmp_dst[39:32] := SaturateU8(a[79:64])
+tmp_dst[47:40] := SaturateU8(a[95:80])
+tmp_dst[55:48] := SaturateU8(a[111:96])
+tmp_dst[63:56] := SaturateU8(a[127:112])
+tmp_dst[71:64] := SaturateU8(b[15:0])
+tmp_dst[79:72] := SaturateU8(b[31:16])
+tmp_dst[87:80] := SaturateU8(b[47:32])
+tmp_dst[95:88] := SaturateU8(b[63:48])
+tmp_dst[103:96] := SaturateU8(b[79:64])
+tmp_dst[111:104] := SaturateU8(b[95:80])
+tmp_dst[119:112] := SaturateU8(b[111:96])
+tmp_dst[127:120] := SaturateU8(b[127:112])
+tmp_dst[135:128] := SaturateU8(a[143:128])
+tmp_dst[143:136] := SaturateU8(a[159:144])
+tmp_dst[151:144] := SaturateU8(a[175:160])
+tmp_dst[159:152] := SaturateU8(a[191:176])
+tmp_dst[167:160] := SaturateU8(a[207:192])
+tmp_dst[175:168] := SaturateU8(a[223:208])
+tmp_dst[183:176] := SaturateU8(a[239:224])
+tmp_dst[191:184] := SaturateU8(a[255:240])
+tmp_dst[199:192] := SaturateU8(b[143:128])
+tmp_dst[207:200] := SaturateU8(b[159:144])
+tmp_dst[215:208] := SaturateU8(b[175:160])
+tmp_dst[223:216] := SaturateU8(b[191:176])
+tmp_dst[231:224] := SaturateU8(b[207:192])
+tmp_dst[239:232] := SaturateU8(b[223:208])
+tmp_dst[247:240] := SaturateU8(b[239:224])
+tmp_dst[255:248] := SaturateU8(b[255:240])
+tmp_dst[263:256] := SaturateU8(a[271:256])
+tmp_dst[271:264] := SaturateU8(a[287:272])
+tmp_dst[279:272] := SaturateU8(a[303:288])
+tmp_dst[287:280] := SaturateU8(a[319:304])
+tmp_dst[295:288] := SaturateU8(a[335:320])
+tmp_dst[303:296] := SaturateU8(a[351:336])
+tmp_dst[311:304] := SaturateU8(a[367:352])
+tmp_dst[319:312] := SaturateU8(a[383:368])
+tmp_dst[327:320] := SaturateU8(b[271:256])
+tmp_dst[335:328] := SaturateU8(b[287:272])
+tmp_dst[343:336] := SaturateU8(b[303:288])
+tmp_dst[351:344] := SaturateU8(b[319:304])
+tmp_dst[359:352] := SaturateU8(b[335:320])
+tmp_dst[367:360] := SaturateU8(b[351:336])
+tmp_dst[375:368] := SaturateU8(b[367:352])
+tmp_dst[383:376] := SaturateU8(b[383:368])
+tmp_dst[391:384] := SaturateU8(a[399:384])
+tmp_dst[399:392] := SaturateU8(a[415:400])
+tmp_dst[407:400] := SaturateU8(a[431:416])
+tmp_dst[415:408] := SaturateU8(a[447:432])
+tmp_dst[423:416] := SaturateU8(a[463:448])
+tmp_dst[431:424] := SaturateU8(a[479:464])
+tmp_dst[439:432] := SaturateU8(a[495:480])
+tmp_dst[447:440] := SaturateU8(a[511:496])
+tmp_dst[455:448] := SaturateU8(b[399:384])
+tmp_dst[463:456] := SaturateU8(b[415:400])
+tmp_dst[471:464] := SaturateU8(b[431:416])
+tmp_dst[479:472] := SaturateU8(b[447:432])
+tmp_dst[487:480] := SaturateU8(b[463:448])
+tmp_dst[495:488] := SaturateU8(b[479:464])
+tmp_dst[503:496] := SaturateU8(b[495:480])
+tmp_dst[511:504] := SaturateU8(b[511:496])
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKUSWB" form="zmm {k}, zmm, zmm" xed="VPACKUSWB_ZMMu8_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_packus_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := SaturateU8(a[15:0])
+tmp_dst[15:8] := SaturateU8(a[31:16])
+tmp_dst[23:16] := SaturateU8(a[47:32])
+tmp_dst[31:24] := SaturateU8(a[63:48])
+tmp_dst[39:32] := SaturateU8(a[79:64])
+tmp_dst[47:40] := SaturateU8(a[95:80])
+tmp_dst[55:48] := SaturateU8(a[111:96])
+tmp_dst[63:56] := SaturateU8(a[127:112])
+tmp_dst[71:64] := SaturateU8(b[15:0])
+tmp_dst[79:72] := SaturateU8(b[31:16])
+tmp_dst[87:80] := SaturateU8(b[47:32])
+tmp_dst[95:88] := SaturateU8(b[63:48])
+tmp_dst[103:96] := SaturateU8(b[79:64])
+tmp_dst[111:104] := SaturateU8(b[95:80])
+tmp_dst[119:112] := SaturateU8(b[111:96])
+tmp_dst[127:120] := SaturateU8(b[127:112])
+tmp_dst[135:128] := SaturateU8(a[143:128])
+tmp_dst[143:136] := SaturateU8(a[159:144])
+tmp_dst[151:144] := SaturateU8(a[175:160])
+tmp_dst[159:152] := SaturateU8(a[191:176])
+tmp_dst[167:160] := SaturateU8(a[207:192])
+tmp_dst[175:168] := SaturateU8(a[223:208])
+tmp_dst[183:176] := SaturateU8(a[239:224])
+tmp_dst[191:184] := SaturateU8(a[255:240])
+tmp_dst[199:192] := SaturateU8(b[143:128])
+tmp_dst[207:200] := SaturateU8(b[159:144])
+tmp_dst[215:208] := SaturateU8(b[175:160])
+tmp_dst[223:216] := SaturateU8(b[191:176])
+tmp_dst[231:224] := SaturateU8(b[207:192])
+tmp_dst[239:232] := SaturateU8(b[223:208])
+tmp_dst[247:240] := SaturateU8(b[239:224])
+tmp_dst[255:248] := SaturateU8(b[255:240])
+tmp_dst[263:256] := SaturateU8(a[271:256])
+tmp_dst[271:264] := SaturateU8(a[287:272])
+tmp_dst[279:272] := SaturateU8(a[303:288])
+tmp_dst[287:280] := SaturateU8(a[319:304])
+tmp_dst[295:288] := SaturateU8(a[335:320])
+tmp_dst[303:296] := SaturateU8(a[351:336])
+tmp_dst[311:304] := SaturateU8(a[367:352])
+tmp_dst[319:312] := SaturateU8(a[383:368])
+tmp_dst[327:320] := SaturateU8(b[271:256])
+tmp_dst[335:328] := SaturateU8(b[287:272])
+tmp_dst[343:336] := SaturateU8(b[303:288])
+tmp_dst[351:344] := SaturateU8(b[319:304])
+tmp_dst[359:352] := SaturateU8(b[335:320])
+tmp_dst[367:360] := SaturateU8(b[351:336])
+tmp_dst[375:368] := SaturateU8(b[367:352])
+tmp_dst[383:376] := SaturateU8(b[383:368])
+tmp_dst[391:384] := SaturateU8(a[399:384])
+tmp_dst[399:392] := SaturateU8(a[415:400])
+tmp_dst[407:400] := SaturateU8(a[431:416])
+tmp_dst[415:408] := SaturateU8(a[447:432])
+tmp_dst[423:416] := SaturateU8(a[463:448])
+tmp_dst[431:424] := SaturateU8(a[479:464])
+tmp_dst[439:432] := SaturateU8(a[495:480])
+tmp_dst[447:440] := SaturateU8(a[511:496])
+tmp_dst[455:448] := SaturateU8(b[399:384])
+tmp_dst[463:456] := SaturateU8(b[415:400])
+tmp_dst[471:464] := SaturateU8(b[431:416])
+tmp_dst[479:472] := SaturateU8(b[447:432])
+tmp_dst[487:480] := SaturateU8(b[463:448])
+tmp_dst[495:488] := SaturateU8(b[479:464])
+tmp_dst[503:496] := SaturateU8(b[495:480])
+tmp_dst[511:504] := SaturateU8(b[511:496])
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKUSWB" form="zmm {z}, zmm, zmm" xed="VPACKUSWB_ZMMu8_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_packus_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := SaturateU8(a[15:0])
+dst[15:8] := SaturateU8(a[31:16])
+dst[23:16] := SaturateU8(a[47:32])
+dst[31:24] := SaturateU8(a[63:48])
+dst[39:32] := SaturateU8(a[79:64])
+dst[47:40] := SaturateU8(a[95:80])
+dst[55:48] := SaturateU8(a[111:96])
+dst[63:56] := SaturateU8(a[127:112])
+dst[71:64] := SaturateU8(b[15:0])
+dst[79:72] := SaturateU8(b[31:16])
+dst[87:80] := SaturateU8(b[47:32])
+dst[95:88] := SaturateU8(b[63:48])
+dst[103:96] := SaturateU8(b[79:64])
+dst[111:104] := SaturateU8(b[95:80])
+dst[119:112] := SaturateU8(b[111:96])
+dst[127:120] := SaturateU8(b[127:112])
+dst[135:128] := SaturateU8(a[143:128])
+dst[143:136] := SaturateU8(a[159:144])
+dst[151:144] := SaturateU8(a[175:160])
+dst[159:152] := SaturateU8(a[191:176])
+dst[167:160] := SaturateU8(a[207:192])
+dst[175:168] := SaturateU8(a[223:208])
+dst[183:176] := SaturateU8(a[239:224])
+dst[191:184] := SaturateU8(a[255:240])
+dst[199:192] := SaturateU8(b[143:128])
+dst[207:200] := SaturateU8(b[159:144])
+dst[215:208] := SaturateU8(b[175:160])
+dst[223:216] := SaturateU8(b[191:176])
+dst[231:224] := SaturateU8(b[207:192])
+dst[239:232] := SaturateU8(b[223:208])
+dst[247:240] := SaturateU8(b[239:224])
+dst[255:248] := SaturateU8(b[255:240])
+dst[263:256] := SaturateU8(a[271:256])
+dst[271:264] := SaturateU8(a[287:272])
+dst[279:272] := SaturateU8(a[303:288])
+dst[287:280] := SaturateU8(a[319:304])
+dst[295:288] := SaturateU8(a[335:320])
+dst[303:296] := SaturateU8(a[351:336])
+dst[311:304] := SaturateU8(a[367:352])
+dst[319:312] := SaturateU8(a[383:368])
+dst[327:320] := SaturateU8(b[271:256])
+dst[335:328] := SaturateU8(b[287:272])
+dst[343:336] := SaturateU8(b[303:288])
+dst[351:344] := SaturateU8(b[319:304])
+dst[359:352] := SaturateU8(b[335:320])
+dst[367:360] := SaturateU8(b[351:336])
+dst[375:368] := SaturateU8(b[367:352])
+dst[383:376] := SaturateU8(b[383:368])
+dst[391:384] := SaturateU8(a[399:384])
+dst[399:392] := SaturateU8(a[415:400])
+dst[407:400] := SaturateU8(a[431:416])
+dst[415:408] := SaturateU8(a[447:432])
+dst[423:416] := SaturateU8(a[463:448])
+dst[431:424] := SaturateU8(a[479:464])
+dst[439:432] := SaturateU8(a[495:480])
+dst[447:440] := SaturateU8(a[511:496])
+dst[455:448] := SaturateU8(b[399:384])
+dst[463:456] := SaturateU8(b[415:400])
+dst[471:464] := SaturateU8(b[431:416])
+dst[479:472] := SaturateU8(b[447:432])
+dst[487:480] := SaturateU8(b[463:448])
+dst[495:488] := SaturateU8(b[479:464])
+dst[503:496] := SaturateU8(b[495:480])
+dst[511:504] := SaturateU8(b[511:496])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPACKUSWB" form="zmm, zmm, zmm" xed="VPACKUSWB_ZMMu8_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_packus_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := SaturateU8(a[15:0])
+tmp_dst[15:8] := SaturateU8(a[31:16])
+tmp_dst[23:16] := SaturateU8(a[47:32])
+tmp_dst[31:24] := SaturateU8(a[63:48])
+tmp_dst[39:32] := SaturateU8(a[79:64])
+tmp_dst[47:40] := SaturateU8(a[95:80])
+tmp_dst[55:48] := SaturateU8(a[111:96])
+tmp_dst[63:56] := SaturateU8(a[127:112])
+tmp_dst[71:64] := SaturateU8(b[15:0])
+tmp_dst[79:72] := SaturateU8(b[31:16])
+tmp_dst[87:80] := SaturateU8(b[47:32])
+tmp_dst[95:88] := SaturateU8(b[63:48])
+tmp_dst[103:96] := SaturateU8(b[79:64])
+tmp_dst[111:104] := SaturateU8(b[95:80])
+tmp_dst[119:112] := SaturateU8(b[111:96])
+tmp_dst[127:120] := SaturateU8(b[127:112])
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPACKUSWB" form="xmm {k}, xmm, xmm" xed="VPACKUSWB_XMMu8_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_packus_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[7:0] := SaturateU8(a[15:0])
+tmp_dst[15:8] := SaturateU8(a[31:16])
+tmp_dst[23:16] := SaturateU8(a[47:32])
+tmp_dst[31:24] := SaturateU8(a[63:48])
+tmp_dst[39:32] := SaturateU8(a[79:64])
+tmp_dst[47:40] := SaturateU8(a[95:80])
+tmp_dst[55:48] := SaturateU8(a[111:96])
+tmp_dst[63:56] := SaturateU8(a[127:112])
+tmp_dst[71:64] := SaturateU8(b[15:0])
+tmp_dst[79:72] := SaturateU8(b[31:16])
+tmp_dst[87:80] := SaturateU8(b[47:32])
+tmp_dst[95:88] := SaturateU8(b[63:48])
+tmp_dst[103:96] := SaturateU8(b[79:64])
+tmp_dst[111:104] := SaturateU8(b[95:80])
+tmp_dst[119:112] := SaturateU8(b[111:96])
+tmp_dst[127:120] := SaturateU8(b[127:112])
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPACKUSWB" form="xmm {z}, xmm, xmm" xed="VPACKUSWB_XMMu8_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
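+<!-- Illustrative example, not from the Intel data set: per the operation
+     pseudocode above, VPACKUSWB clamps each signed 16-bit source to [0, 255]
+     and interleaves 8 bytes from "a" with 8 bytes from "b" within each
+     128-bit lane. A minimal C sketch with assumed input values, for a
+     compiler supporting AVX512BW (e.g. gcc/clang with -mavx512bw):
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m512i a = _mm512_set1_epi16(300);   // 300 saturates to 255
+    __m512i b = _mm512_set1_epi16(-5);    // -5 saturates to 0
+    __m512i packed = _mm512_packus_epi16(a, b);
+    unsigned char out[64];
+    _mm512_storeu_si512(out, packed);
+    printf("%u %u\n", out[0], out[8]);    // prints "255 0": a-bytes then b-bytes per lane
+    return 0;
+}
+-->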
+<intrinsic tech="AVX-512" name="_mm256_mask_add_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDB" form="ymm {k}, ymm, ymm" xed="VPADDB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_add_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDB" form="ymm {z}, ymm, ymm" xed="VPADDB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_add_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDB" form="zmm, zmm, zmm" xed="VPADDB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_add_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDB" form="zmm {k}, zmm, zmm" xed="VPADDB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_add_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDB" form="zmm {z}, zmm, zmm" xed="VPADDB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDB" form="xmm {k}, xmm, xmm" xed="VPADDB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDB" form="xmm {z}, xmm, xmm" xed="VPADDB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
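+<!-- Illustrative example, not from the Intel data set: a minimal C sketch of
+     the writemask semantics described above, with assumed values. Lanes whose
+     mask bit is 0 take the corresponding byte of "src". Requires AVX512VL and
+     AVX512BW (e.g. -mavx512vl -mavx512bw):
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i src = _mm_set1_epi8(9);
+    __m128i a   = _mm_set1_epi8(1);
+    __m128i b   = _mm_set1_epi8(2);
+    __mmask16 k = 0x00FF;                // low 8 lanes computed, high 8 from src
+    __m128i r = _mm_mask_add_epi8(src, k, a, b);
+    unsigned char out[16];
+    _mm_storeu_si128((__m128i *)out, r);
+    printf("%u %u\n", out[0], out[15]);  // prints "3 9"
+    return 0;
+}
+-->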
+<intrinsic tech="AVX-512" name="_mm256_mask_adds_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDSB" form="ymm {k}, ymm, ymm" xed="VPADDSB_YMMi8_MASKmskw_YMMi8_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_adds_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDSB" form="ymm {z}, ymm, ymm" xed="VPADDSB_YMMi8_MASKmskw_YMMi8_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_adds_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSB" form="zmm, zmm, zmm" xed="VPADDSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_adds_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSB" form="zmm {k}, zmm, zmm" xed="VPADDSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_adds_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSB" form="zmm {z}, zmm, zmm" xed="VPADDSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_adds_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDSB" form="xmm {k}, xmm, xmm" xed="VPADDSB_XMMi8_MASKmskw_XMMi8_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_adds_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDSB" form="xmm {z}, xmm, xmm" xed="VPADDSB_XMMi8_MASKmskw_XMMi8_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
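+<!-- Illustrative example, not from the Intel data set: Saturate8 clamps to
+     the signed 8-bit range [-128, 127] instead of wrapping. A minimal C
+     sketch with assumed values (requires AVX512BW):
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m512i a = _mm512_set1_epi8(100);
+    __m512i r = _mm512_adds_epi8(a, a);  // 100 + 100 = 200 clamps to 127
+    signed char out[64];
+    _mm512_storeu_si512(out, r);
+    printf("%d\n", out[0]);              // prints "127"
+    return 0;
+}
+-->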
+<intrinsic tech="AVX-512" name="_mm256_mask_adds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDSW" form="ymm {k}, ymm, ymm" xed="VPADDSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_adds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDSW" form="ymm {z}, ymm, ymm" xed="VPADDSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_adds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSW" form="zmm, zmm, zmm" xed="VPADDSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_adds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSW" form="zmm {k}, zmm, zmm" xed="VPADDSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_adds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSW" form="zmm {z}, zmm, zmm" xed="VPADDSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_adds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDSW" form="xmm {k}, xmm, xmm" xed="VPADDSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_adds_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDSW" form="xmm {z}, xmm, xmm" xed="VPADDSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
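+<!-- Illustrative example, not from the Intel data set: the maskz form zeroes
+     lanes whose mask bit is clear instead of blending from "src". A minimal
+     C sketch with assumed values (requires AVX512VL and AVX512BW):
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i a = _mm_set1_epi16(30000);
+    // 30000 + 30000 saturates to 32767; mask 0x55 keeps even lanes only.
+    __m128i r = _mm_maskz_adds_epi16((__mmask8)0x55, a, a);
+    short out[8];
+    _mm_storeu_si128((__m128i *)out, r);
+    printf("%d %d\n", out[0], out[1]);   // prints "32767 0"
+    return 0;
+}
+-->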
+<intrinsic tech="AVX-512" name="_mm256_mask_adds_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDUSB" form="ymm {k}, ymm, ymm" xed="VPADDUSB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_adds_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDUSB" form="ymm {z}, ymm, ymm" xed="VPADDUSB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_adds_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDUSB" form="zmm, zmm, zmm" xed="VPADDUSB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_adds_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDUSB" form="zmm {k}, zmm, zmm" xed="VPADDUSB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_adds_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDUSB" form="zmm {z}, zmm, zmm" xed="VPADDUSB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_adds_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDUSB" form="xmm {k}, xmm, xmm" xed="VPADDUSB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_adds_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDUSB" form="xmm {z}, xmm, xmm" xed="VPADDUSB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
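+<!-- Illustrative example, not from the Intel data set: SaturateU8 treats the
+     operands as unsigned and clamps the sum to 255. A minimal C sketch with
+     assumed values (requires AVX512BW):
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m512i a = _mm512_set1_epi8((char)200);
+    __m512i r = _mm512_adds_epu8(a, a);  // 200 + 200 = 400 clamps to 255
+    unsigned char out[64];
+    _mm512_storeu_si512(out, r);
+    printf("%u\n", out[0]);              // prints "255"
+    return 0;
+}
+-->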
+<intrinsic tech="AVX-512" name="_mm256_mask_adds_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDUSW" form="ymm {k}, ymm, ymm" xed="VPADDUSW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_adds_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDUSW" form="ymm {z}, ymm, ymm" xed="VPADDUSW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_adds_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDUSW" form="zmm, zmm, zmm" xed="VPADDUSW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_adds_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDUSW" form="zmm {k}, zmm, zmm" xed="VPADDUSW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_adds_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDUSW" form="zmm {z}, zmm, zmm" xed="VPADDUSW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_adds_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDUSW" form="xmm {k}, xmm, xmm" xed="VPADDUSW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_adds_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDUSW" form="xmm {z}, xmm, xmm" xed="VPADDUSW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
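+<!-- Illustrative example, not from the Intel data set: the same unsigned
+     saturation at the 16-bit ceiling of 65535, combined with a writemask.
+     A minimal C sketch with assumed values (requires AVX512VL and AVX512BW):
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256i src = _mm256_set1_epi16(7);
+    __m256i a   = _mm256_set1_epi16((short)60000);
+    // Lane 0 computes 60000 + 60000, clamped to 65535; other lanes keep src.
+    __m256i r = _mm256_mask_adds_epu16(src, (__mmask16)0x0001, a, a);
+    unsigned short out[16];
+    _mm256_storeu_si256((__m256i *)out, r);
+    printf("%u %u\n", out[0], out[1]);   // prints "65535 7"
+    return 0;
+}
+-->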
+<intrinsic tech="AVX-512" name="_mm256_mask_add_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDW" form="ymm {k}, ymm, ymm" xed="VPADDW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_add_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDW" form="ymm {z}, ymm, ymm" xed="VPADDW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_add_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDW" form="zmm, zmm, zmm" xed="VPADDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_add_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDW" form="zmm {k}, zmm, zmm" xed="VPADDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_add_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDW" form="zmm {z}, zmm, zmm" xed="VPADDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDW" form="xmm {k}, xmm, xmm" xed="VPADDW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDW" form="xmm {z}, xmm, xmm" xed="VPADDW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
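+<!-- Illustrative example, not from the Intel data set: unlike the saturating
+     VPADDSW/VPADDUSW forms above, plain VPADDW wraps modulo 2^16. A minimal
+     C sketch with assumed values (requires AVX512BW):
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m512i a = _mm512_set1_epi16(30000);
+    __m512i r = _mm512_add_epi16(a, a);  // 60000 wraps when read as signed
+    short out[32];
+    _mm512_storeu_si512(out, r);
+    printf("%d\n", out[0]);              // prints "-5536" (60000 minus 65536)
+    return 0;
+}
+-->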
+<intrinsic tech="AVX-512" name="_mm256_mask_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*128
+ tmp[255:0] := ((a[i+127:i] &lt;&lt; 128)[255:0] OR b[i+127:i]) &gt;&gt; (imm8*8)
+ tmp_dst[i+127:i] := tmp[127:0]
+ENDFOR
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPALIGNR" form="ymm {k}, ymm, ymm, imm8" xed="VPALIGNR_YMMu8_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*128
+ tmp[255:0] := ((a[i+127:i] &lt;&lt; 128)[255:0] OR b[i+127:i]) &gt;&gt; (imm8*8)
+ tmp_dst[i+127:i] := tmp[127:0]
+ENDFOR
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPALIGNR" form="ymm {z}, ymm, ymm, imm8" xed="VPALIGNR_YMMu8_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*128
+ tmp[255:0] := ((a[i+127:i] &lt;&lt; 128)[255:0] OR b[i+127:i]) &gt;&gt; (imm8*8)
+ dst[i+127:i] := tmp[127:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPALIGNR" form="zmm, zmm, zmm, imm8" xed="VPALIGNR_ZMMu8_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*128
+ tmp[255:0] := ((a[i+127:i] &lt;&lt; 128)[255:0] OR b[i+127:i]) &gt;&gt; (imm8*8)
+ tmp_dst[i+127:i] := tmp[127:0]
+ENDFOR
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPALIGNR" form="zmm {k}, zmm, zmm, imm8" xed="VPALIGNR_ZMMu8_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*128
+ tmp[255:0] := ((a[i+127:i] &lt;&lt; 128)[255:0] OR b[i+127:i]) &gt;&gt; (imm8*8)
+ tmp_dst[i+127:i] := tmp[127:0]
+ENDFOR
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPALIGNR" form="zmm {z}, zmm, zmm, imm8" xed="VPALIGNR_ZMMu8_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[255:0] := ((a[127:0] &lt;&lt; 128)[255:0] OR b[127:0]) &gt;&gt; (imm8*8)
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPALIGNR" form="xmm {k}, xmm, xmm, imm8" xed="VPALIGNR_XMMu8_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[255:0] := ((a[127:0] &lt;&lt; 128)[255:0] OR b[127:0]) &gt;&gt; (imm8*8)
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPALIGNR" form="xmm {z}, xmm, xmm, imm8" xed="VPALIGNR_XMMu8_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
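+<!-- Illustrative example, not from the Intel data set: the pseudocode above
+     shifts independently inside each 128-bit lane, so the result is not one
+     contiguous 32-byte shift. A minimal C sketch with assumed values
+     (requires AVX512VL and AVX512BW; "imm8" must be a compile-time constant):
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    unsigned char bytes[32];
+    for (int i = 0; i < 32; i++) bytes[i] = (unsigned char)i;
+    __m256i a = _mm256_loadu_si256((const __m256i *)bytes);
+    __m256i b = _mm256_set1_epi8((char)0xEE);
+    __m256i r = _mm256_maskz_alignr_epi8((__mmask32)0xFFFFFFFF, a, b, 4);
+    unsigned char out[32];
+    _mm256_storeu_si256((__m256i *)out, r);
+    // Byte 12 is a[0] (lane 0), byte 28 is a[16] (lane 1): prints "0 16".
+    printf("%u %u\n", out[12], out[28]);
+    return 0;
+}
+-->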
+<intrinsic tech="AVX-512" name="_mm256_mask_avg_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPAVGB" form="ymm {k}, ymm, ymm" xed="VPAVGB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_avg_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPAVGB" form="ymm {z}, ymm, ymm" xed="VPAVGB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_avg_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPAVGB" form="zmm, zmm, zmm" xed="VPAVGB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_avg_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPAVGB" form="zmm {k}, zmm, zmm" xed="VPAVGB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_avg_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPAVGB" form="zmm {z}, zmm, zmm" xed="VPAVGB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_avg_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPAVGB" form="xmm {k}, xmm, xmm" xed="VPAVGB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_avg_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPAVGB" form="xmm {z}, xmm, xmm" xed="VPAVGB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
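+<!-- Usage sketch for the VPAVGB family above: a minimal, hedged C example
+     (assumes an AVX512BW/AVX512VL-enabled build; the demo function and
+     variable names are hypothetical, not part of this data set).
+#include <immintrin.h>
+void avg_epu8_demo(void) {
+    __m512i a   = _mm512_set1_epi8(10);
+    __m512i b   = _mm512_set1_epi8(13);
+    __m512i avg = _mm512_avg_epu8(a, b);             /* every byte: (10+13+1)>>1 = 12 */
+    __mmask64 k = 0x00000000FFFFFFFFULL;             /* only the low 32 byte lanes active */
+    __m512i m   = _mm512_mask_avg_epu8(a, k, a, b);  /* inactive lanes copied from "a" */
+    __m512i z   = _mm512_maskz_avg_epu8(k, a, b);    /* inactive lanes zeroed */
+    (void)avg; (void)m; (void)z;
+}
+-->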
+<intrinsic tech="AVX-512" name="_mm256_mask_avg_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPAVGW" form="ymm {k}, ymm, ymm" xed="VPAVGW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_avg_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPAVGW" form="ymm {z}, ymm, ymm" xed="VPAVGW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_avg_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPAVGW" form="zmm, zmm, zmm" xed="VPAVGW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_avg_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPAVGW" form="zmm {k}, zmm, zmm" xed="VPAVGW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_avg_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPAVGW" form="zmm {z}, zmm, zmm" xed="VPAVGW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_avg_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPAVGW" form="xmm {k}, xmm, xmm" xed="VPAVGW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_avg_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPAVGW" form="xmm {z}, xmm, xmm" xed="VPAVGW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
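+<!-- Usage sketch for the VPAVGW family above, which mirrors VPAVGB at 16-bit
+     granularity (hypothetical names; assumes AVX512BW support):
+#include <immintrin.h>
+void avg_epu16_demo(void) {
+    __m512i a = _mm512_set1_epi16(100);
+    __m512i b = _mm512_set1_epi16(101);
+    __m512i r = _mm512_avg_epu16(a, b);          /* every word: (100+101+1)>>1 = 101 */
+    __mmask32 k = 0x0000FFFFu;                   /* low 16 of the 32 word lanes */
+    __m512i z = _mm512_maskz_avg_epu16(k, a, b); /* upper 16 word lanes zeroed */
+    (void)r; (void)z;
+}
+-->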
+<intrinsic tech="AVX-512" name="_mm256_mask_blend_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := b[i+7:i]
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBLENDMB" form="ymm {k}, ymm, ymm" xed="VPBLENDMB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_blend_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := b[i+7:i]
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBLENDMB" form="zmm {k}, zmm, zmm" xed="VPBLENDMB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_blend_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := b[i+7:i]
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBLENDMB" form="xmm {k}, xmm, xmm" xed="VPBLENDMB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_blend_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := b[i+15:i]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBLENDMW" form="ymm {k}, ymm, ymm" xed="VPBLENDMW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_blend_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := b[i+15:i]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBLENDMW" form="zmm {k}, zmm, zmm" xed="VPBLENDMW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_blend_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := b[i+15:i]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBLENDMW" form="xmm {k}, xmm, xmm" xed="VPBLENDMW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
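+<!-- Usage sketch for the VPBLENDMB/VPBLENDMW entries above: each lane takes
+     "b" where the mask bit is set and "a" otherwise (hypothetical names;
+     assumes AVX512BW support):
+#include <immintrin.h>
+void blend_demo(void) {
+    __m512i a = _mm512_set1_epi8(1);
+    __m512i b = _mm512_set1_epi8(2);
+    __mmask64 k = 0xAAAAAAAAAAAAAAAAULL;         /* odd byte lanes select "b" */
+    __m512i r = _mm512_mask_blend_epi8(k, a, b); /* bytes: 1,2,1,2,... */
+    (void)r;
+}
+-->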
+<intrinsic tech="AVX-512" name="_mm256_mask_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="ymm {k}, xmm" xed="VPBROADCASTB_YMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_set1_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="ymm {k}, r8" xed="VPBROADCASTB_YMMu8_MASKmskw_GPR32u8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="ymm {z}, xmm" xed="VPBROADCASTB_YMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_set1_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="ymm {z}, r8" xed="VPBROADCASTB_YMMu8_MASKmskw_GPR32u8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="zmm, xmm" xed="VPBROADCASTB_ZMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="zmm {k}, xmm" xed="VPBROADCASTB_ZMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_set1_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="zmm {k}, r8" xed="VPBROADCASTB_ZMMu8_MASKmskw_GPR32u8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="zmm {z}, xmm" xed="VPBROADCASTB_ZMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_set1_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="zmm {z}, r8" xed="VPBROADCASTB_ZMMu8_MASKmskw_GPR32u8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="xmm {k}, xmm" xed="VPBROADCASTB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_set1_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="xmm {k}, r8" xed="VPBROADCASTB_XMMu8_MASKmskw_GPR32u8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_broadcastb_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="xmm {z}, xmm" xed="VPBROADCASTB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_set1_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[7:0]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="xmm {z}, r8" xed="VPBROADCASTB_XMMu8_MASKmskw_GPR32u8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
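+<!-- Usage sketch contrasting the broadcastb and set1 entries above: broadcastb
+     replicates the low byte of a vector, set1 replicates a scalar (hypothetical
+     names; assumes AVX512BW support):
+#include <immintrin.h>
+void broadcastb_demo(void) {
+    __m128i lo  = _mm_set1_epi8(7);
+    __m512i v   = _mm512_broadcastb_epi8(lo);    /* all 64 bytes become 7 */
+    __mmask64 k = 0xFFFFFFFF00000000ULL;
+    __m512i z   = _mm512_maskz_set1_epi8(k, 9);  /* upper 32 bytes 9, lower 32 zeroed */
+    (void)v; (void)z;
+}
+-->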
+<intrinsic tech="AVX-512" name="_mm256_mask_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="ymm {k}, xmm" xed="VPBROADCASTW_YMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_set1_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="ymm {k}, r16" xed="VPBROADCASTW_YMMu16_MASKmskw_GPR32u16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="ymm {z}, xmm" xed="VPBROADCASTW_YMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_set1_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+ <description>Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="ymm {z}, r16" xed="VPBROADCASTW_YMMu16_MASKmskw_GPR32u16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="zmm, xmm" xed="VPBROADCASTW_ZMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="zmm {k}, xmm" xed="VPBROADCASTW_ZMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_set1_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+ <description>Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="zmm {k}, r16" xed="VPBROADCASTW_ZMMu16_MASKmskw_GPR32u16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="zmm {z}, xmm" xed="VPBROADCASTW_ZMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_set1_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="zmm {z}, r16" xed="VPBROADCASTW_ZMMu16_MASKmskw_GPR32u16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="xmm {k}, xmm" xed="VPBROADCASTW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_set1_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="xmm {k}, r16" xed="VPBROADCASTW_XMMu16_MASKmskw_GPR32u16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_broadcastw_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="xmm {z}, xmm" xed="VPBROADCASTW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_set1_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="xmm {z}, r16" xed="VPBROADCASTW_XMMu16_MASKmskw_GPR32u16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
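+<!-- Usage sketch for the word-sized broadcastw/set1 entries above (hypothetical
+     names; assumes AVX512BW/AVX512VL support):
+#include <immintrin.h>
+void broadcastw_demo(void) {
+    __m128i lo = _mm_set1_epi16(3);
+    __m512i v  = _mm512_broadcastw_epi16(lo);         /* all 32 words become 3 */
+    __m256i m  = _mm256_maskz_set1_epi16(0x00FFu, 5); /* low 8 words 5, rest zeroed */
+    (void)v; (void)m;
+}
+-->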
+<intrinsic tech="AVX-512" name="_mm256_cmp_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, ymm, ymm, imm8" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpeq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpge_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpgt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmple_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmplt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpneq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, ymm, ymm, imm8" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpeq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpge_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpgt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmple_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmplt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpneq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, ymm, ymm" xed="VPCMPB_MASKmskw_MASKmskw_YMMi8_YMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
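+<!-- Usage sketch for the 256-bit compares above: the named forms (cmpeq, cmplt,
+     ...) are shorthand for _mm256_cmp_epi8_mask with the matching _MM_CMPINT
+     predicate (hypothetical names; assumes AVX512BW/AVX512VL support):
+#include <immintrin.h>
+void cmp256_demo(void) {
+    __m256i a = _mm256_set1_epi8(5);
+    __m256i b = _mm256_set1_epi8(6);
+    __mmask32 lt  = _mm256_cmplt_epi8_mask(a, b);              /* all 32 bits set */
+    __mmask32 lt2 = _mm256_cmp_epi8_mask(a, b, _MM_CMPINT_LT); /* identical mask */
+    (void)lt; (void)lt2;
+}
+-->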
+<intrinsic tech="AVX-512" name="_mm512_cmp_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, zmm, zmm, imm8" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpeq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpge_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpgt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmple_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmplt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpneq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmp_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, zmm, zmm, imm8" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpeq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpge_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpgt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmple_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmplt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpneq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, zmm, zmm" xed="VPCMPB_MASKmskw_MASKmskw_ZMMi8_ZMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
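
The six masked signed-byte comparisons above all follow the same zeromask pattern: lanes where k1 is clear produce 0 in the result mask. A minimal C sketch of that behaviour, assuming a compiler with AVX512BW support (compile with -mavx512bw); the alternating mask value is just an illustration:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m512i a = _mm512_set1_epi8(5);
        __m512i b = _mm512_set1_epi8(3);
        /* k1 keeps only the even lanes; odd lanes are zeroed in the result. */
        __mmask64 k1 = 0x5555555555555555ULL;
        /* a > b holds in every lane, so the result equals k1 itself. */
        __mmask64 k = _mm512_mask_cmpgt_epi8_mask(k1, a, b);
        printf("%016llx\n", (unsigned long long)k); /* prints 5555555555555555 */
        return 0;
    }
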
+<intrinsic tech="AVX-512" name="_mm_cmp_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, xmm, xmm, imm8" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
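
The imm8 operand of _mm_cmp_epi8_mask selects one of the eight _MM_CMPINT_* predicates enumerated in the operation above, so the named convenience intrinsics that follow are fixed-predicate spellings of the same instruction. A sketch, assuming AVX512VL plus AVX512BW (-mavx512vl -mavx512bw):

    #include <immintrin.h>
    #include <assert.h>

    int main(void) {
        __m128i a = _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
        __m128i b = _mm_set1_epi8(7);
        /* _MM_CMPINT_NLE ("not less-or-equal") is the greater-than predicate. */
        __mmask16 via_imm8 = _mm_cmp_epi8_mask(a, b, _MM_CMPINT_NLE);
        __mmask16 via_name = _mm_cmpgt_epi8_mask(a, b);
        assert(via_imm8 == via_name);
        return 0;
    }
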
+<intrinsic tech="AVX-512" name="_mm_cmpeq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpge_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpgt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmple_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmplt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpneq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
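
Because these intrinsics return an ordinary integer mask rather than a vector of results, common reductions fall out of scalar bit tricks; counting equal bytes is a single population count. A sketch, assuming GCC/Clang for __builtin_popcount (the builtin is not part of the entries above):

    #include <immintrin.h>

    /* Number of byte positions where x and y agree, 16 bytes at a time. */
    static int equal_bytes16(__m128i x, __m128i y) {
        __mmask16 k = _mm_cmpeq_epi8_mask(x, y);
        return __builtin_popcount((unsigned)k); /* GCC/Clang builtin */
    }
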
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, xmm, xmm, imm8" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpeq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpge_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpgt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmple_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmplt_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpneq_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPB" form="k {k}, xmm, xmm" xed="VPCMPB_MASKmskw_MASKmskw_XMMi8_XMMi8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
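
The zeromask forms compose naturally: feeding the mask produced by one comparison into the k1 of the next ANDs the two conditions without a separate mask instruction. A sketch of a signed byte range test lo <= v <= hi, under the same AVX512VL+BW assumption:

    #include <immintrin.h>

    /* Mask of lanes with lo <= v[i] <= hi (signed). */
    static __mmask16 in_range_epi8(__m128i v, char lo, char hi) {
        __mmask16 ge_lo = _mm_cmpge_epi8_mask(v, _mm_set1_epi8(lo));
        /* Lanes failing v >= lo are zeroed before the <= hi test runs. */
        return _mm_mask_cmple_epi8_mask(ge_lo, v, _mm_set1_epi8(hi));
    }
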
+<intrinsic tech="AVX-512" name="_mm256_cmp_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, ymm, ymm, imm8" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpeq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpge_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpgt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmple_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmplt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpneq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
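
The epu8 entries above differ from their epi8 counterparts only in how each byte is interpreted: 0x80..0xFF compare as 128..255 unsigned but as -128..-1 signed, so the two families can return opposite masks on the same data. A sketch (the signed _mm256_cmpgt_epi8_mask counterpart is documented elsewhere in this file):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m256i a = _mm256_set1_epi8((char)0x80); /* 128 unsigned, -128 signed */
        __m256i b = _mm256_set1_epi8(1);
        __mmask32 u = _mm256_cmpgt_epu8_mask(a, b); /* all ones: 128 > 1   */
        __mmask32 s = _mm256_cmpgt_epi8_mask(a, b); /* all zero: -128 < 1  */
        printf("%08x %08x\n", u, s);
        return 0;
    }
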
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, ymm, ymm, imm8" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpeq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpge_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpgt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmple_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmplt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpneq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, ymm, ymm" xed="VPCMPUB_MASKmskw_MASKmskw_YMMu8_YMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
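
One common use of the k1 operand in the masked forms above is tail handling: building k1 from a byte count restricts the comparison to valid lanes, so bytes beyond the logical end of a buffer can never set a mask bit. A sketch with a hypothetical buffer/length pair (n <= 32); note the load itself is unmasked:

    #include <immintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Mask of positions i < n where buf[i] == c; lanes >= n are forced to 0. */
    static __mmask32 match_head(const uint8_t *buf, size_t n, uint8_t c) {
        __mmask32 k1 = (n >= 32) ? 0xFFFFFFFFu : ((1u << n) - 1);
        /* The unmasked load still reads all 32 bytes; buf must be readable. */
        __m256i v = _mm256_loadu_si256((const __m256i *)buf);
        return _mm256_mask_cmpeq_epu8_mask(k1, v, _mm256_set1_epi8((char)c));
    }
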
+<intrinsic tech="AVX-512" name="_mm512_cmp_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, zmm, zmm, imm8" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpeq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpge_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpgt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmple_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmplt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpneq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
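
A typical consumer of the plain 512-bit unsigned compares is a memchr-style scan: compare 64 bytes against the target value, then locate the first set bit of the resulting mask. A sketch assuming GCC/Clang for __builtin_ctzll and a 64-byte-readable block:

    #include <immintrin.h>
    #include <stdint.h>

    /* Index of the first byte equal to c in a 64-byte block, or -1. */
    static int find_byte64(const uint8_t *p, uint8_t c) {
        __m512i v = _mm512_loadu_si512((const void *)p);
        __mmask64 k = _mm512_cmpeq_epu8_mask(v, _mm512_set1_epi8((char)c));
        return k ? __builtin_ctzll(k) : -1; /* GCC/Clang builtin */
    }
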
+<intrinsic tech="AVX-512" name="_mm512_mask_cmp_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, zmm, zmm, imm8" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpeq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpge_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpgt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmple_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmplt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpneq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, zmm, zmm" xed="VPCMPUB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
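
As the pseudocode above spells out, every masked compare is exactly the unmasked result ANDed with k1; nothing else about the per-lane comparison changes. That identity can be checked directly, since __mmask64 is an ordinary integer type:

    #include <immintrin.h>
    #include <assert.h>

    static void check_mask_identity(__m512i a, __m512i b, __mmask64 k1) {
        __mmask64 masked = _mm512_mask_cmplt_epu8_mask(k1, a, b);
        __mmask64 plain  = _mm512_cmplt_epu8_mask(a, b);
        assert(masked == (plain & k1));
    }
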
+<intrinsic tech="AVX-512" name="_mm_cmp_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, xmm, xmm, imm8" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpeq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpge_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpgt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmple_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmplt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpneq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, xmm, xmm, imm8" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpeq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpge_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpgt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmple_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmplt_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpneq_epu8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUB" form="k {k}, xmm, xmm" xed="VPCMPUB_MASKmskw_MASKmskw_XMMu8_XMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
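
Character classification is a natural fit for the 128-bit unsigned compares: ASCII ranges are unsigned byte ranges, and the ge/le pair chains through k1 exactly as in the signed case. A sketch that flags decimal digits in 16 bytes:

    #include <immintrin.h>

    /* Mask of lanes holding an ASCII digit '0'..'9'. */
    static __mmask16 digit_mask(__m128i v) {
        __mmask16 ge0 = _mm_cmpge_epu8_mask(v, _mm_set1_epi8('0'));
        return _mm_mask_cmple_epu8_mask(ge0, v, _mm_set1_epi8('9'));
    }
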
+<intrinsic tech="AVX-512" name="_mm256_cmp_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, ymm, ymm, imm8" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpeq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpge_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpgt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmple_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmplt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpneq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, ymm, ymm, imm8" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
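+<!-- Illustrative sketch (an assumption, not upstream content): per the
+     operation above, the zeromask form equals the unmasked result ANDed
+     with k1, since lanes with k1[j] = 0 are forced to 0. masked_ne_u16x16
+     is a hypothetical helper name.
+
+       #include <immintrin.h>
+       __mmask16 masked_ne_u16x16(__mmask16 k1, __m256i a, __m256i b) {
+           /* Same bits as k1 & _mm256_cmp_epu16_mask(a, b, _MM_CMPINT_NE). */
+           return _mm256_mask_cmp_epu16_mask(k1, a, b, _MM_CMPINT_NE);
+       }
+-->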
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpeq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpge_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpgt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmple_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmplt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpneq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, ymm, ymm" xed="VPCMPUW_MASKmskw_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmp_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, zmm, zmm, imm8" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
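+<!-- Illustrative sketch (hedged, not upstream content): the 512-bit form
+     returns a 32-bit mask, so counting matching lanes is a popcount on the
+     __mmask32; this assumes the POPCNT intrinsic _mm_popcnt_u32 is
+     available. count_lt_u16 is a hypothetical helper name.
+
+       #include <immintrin.h>
+       int count_lt_u16(__m512i a, __m512i b) {
+           __mmask32 k = _mm512_cmp_epu16_mask(a, b, _MM_CMPINT_LT);
+           return _mm_popcnt_u32((unsigned)k);
+       }
+-->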
+<intrinsic tech="AVX-512" name="_mm512_cmpeq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpge_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpgt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmple_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmplt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpneq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmp_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, zmm, zmm, imm8" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
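+<!-- Illustrative sketch (an assumption): feeding one compare's mask into
+     the next call's k1 effectively ANDs the two conditions, here a
+     lo <= x < hi range test on unsigned 16-bit lanes. in_range_u16 is a
+     hypothetical helper name.
+
+       #include <immintrin.h>
+       __mmask32 in_range_u16(__m512i x, __m512i lo, __m512i hi) {
+           __mmask32 ge = _mm512_cmp_epu16_mask(x, lo, _MM_CMPINT_NLT);
+           return _mm512_mask_cmp_epu16_mask(ge, x, hi, _MM_CMPINT_LT);
+       }
+-->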
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpeq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpge_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpgt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmple_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmplt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpneq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, zmm, zmm" xed="VPCMPUW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, xmm, xmm, imm8" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
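+<!-- Illustrative sketch (hedged): at 128 bits there are 8 word lanes, so
+     the result is a __mmask8; per the CASE table, _MM_CMPINT_TRUE sets all
+     8 low bits regardless of the inputs. all_ones_mask is a hypothetical
+     helper name.
+
+       #include <immintrin.h>
+       __mmask8 all_ones_mask(__m128i a, __m128i b) {
+           return _mm_cmp_epu16_mask(a, b, _MM_CMPINT_TRUE);  /* 0xFF */
+       }
+-->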
+<intrinsic tech="AVX-512" name="_mm_cmpeq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpge_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpgt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmple_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmplt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpneq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, xmm, xmm, imm8" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
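+<!-- Illustrative sketch (an assumption): a typical use of the returned mask
+     is to drive a masked blend, here keeping "a" where the predicate held
+     under k1 and "b" elsewhere; this assumes _mm_mask_blend_epi16
+     (AVX512BW + AVX512VL) is available. pick_le is a hypothetical helper
+     name.
+
+       #include <immintrin.h>
+       __m128i pick_le(__mmask8 k1, __m128i a, __m128i b) {
+           __mmask8 k = _mm_mask_cmp_epu16_mask(k1, a, b, _MM_CMPINT_LE);
+           return _mm_mask_blend_epi16(k, b, a);  /* a where k set, else b */
+       }
+-->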
+<intrinsic tech="AVX-512" name="_mm_mask_cmpeq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpge_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpgt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmple_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmplt_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpneq_epu16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUW" form="k {k}, xmm, xmm" xed="VPCMPUW_MASKmskw_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmp_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, ymm, ymm, imm8" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
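+<!-- Illustrative sketch (hedged): the epi16 forms compare lanes as signed,
+     so a bit pattern like 0x8000 (32768 unsigned, -32768 signed) orders
+     differently than in the epu16 forms above. signed_lt is a hypothetical
+     helper name.
+
+       #include <immintrin.h>
+       __mmask16 signed_lt(__m256i a, __m256i b) {
+           /* For a lane holding 0x8000 vs 0x0001: signed LT is true,
+              while unsigned LT would be false. */
+           return _mm256_cmp_epi16_mask(a, b, _MM_CMPINT_LT);
+       }
+-->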
+<intrinsic tech="AVX-512" name="_mm256_cmpeq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpge_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpgt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmple_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmplt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpneq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, ymm, ymm, imm8" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
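+<!-- Illustrative sketch (an assumption): imm8 must be a compile-time
+     constant in practice, so a thin wrapper that fixes the predicate at the
+     call site is a common pattern. neq_under_mask is a hypothetical helper
+     name.
+
+       #include <immintrin.h>
+       __mmask16 neq_under_mask(__mmask16 k1, __m256i a, __m256i b) {
+           return _mm256_mask_cmp_epi16_mask(k1, a, b, _MM_CMPINT_NE);
+       }
+-->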
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpeq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpge_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpgt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmple_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmplt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpneq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, ymm, ymm" xed="VPCMPW_MASKmskw_MASKmskw_YMMi16_YMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmp_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, zmm, zmm, imm8" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
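+<!-- Illustrative sketch (hedged): _MM_CMPINT_NLT and _MM_CMPINT_NLE are the
+     encodings behind the cmpge/cmpgt convenience forms, so the two calls
+     below should produce identical masks. ge_agrees is a hypothetical
+     helper name.
+
+       #include <immintrin.h>
+       int ge_agrees(__m512i a, __m512i b) {
+           __mmask32 k1 = _mm512_cmp_epi16_mask(a, b, _MM_CMPINT_NLT);
+           __mmask32 k2 = _mm512_cmpge_epi16_mask(a, b);
+           return k1 == k2;  /* expected: true */
+       }
+-->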
+<intrinsic tech="AVX-512" name="_mm512_cmpeq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpge_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpgt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmple_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmplt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpneq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmp_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, zmm, zmm, imm8" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
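+<!-- Illustrative sketch (an assumption): masked compares compose with the
+     mask-register intrinsics, e.g. OR-ing the "equal" and "greater" lanes
+     computed under the same k1 to get masked GE; this assumes _kor_mask32
+     (AVX512BW) is available. ge_via_or is a hypothetical helper name.
+
+       #include <immintrin.h>
+       __mmask32 ge_via_or(__mmask32 k1, __m512i a, __m512i b) {
+           __mmask32 eq = _mm512_mask_cmp_epi16_mask(k1, a, b, _MM_CMPINT_EQ);
+           __mmask32 gt = _mm512_mask_cmp_epi16_mask(k1, a, b, _MM_CMPINT_NLE);
+           return _kor_mask32(eq, gt);  /* same as masked GE */
+       }
+-->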
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpeq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpge_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpgt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmple_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmplt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpneq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, zmm, zmm" xed="VPCMPW_MASKmskw_MASKmskw_ZMMi16_ZMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
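+<!-- Usage sketch (illustrative, not part of the Intel data): combining the
+     zeromask compare forms above with a precomputed mask. Assumes a C
+     compiler with AVX512BW enabled; names are hypothetical.
+
+     #include <immintrin.h>
+
+     __mmask32 equal_in_even_lanes(__m512i a, __m512i b) {
+         __mmask32 even = 0x55555555;                /* lanes 0,2,4,... */
+         /* Bit j of the result is 1 only when even[j] is set AND the
+            16-bit lanes are equal; other bits are zeroed, per the
+            pseudocode above. */
+         return _mm512_mask_cmpeq_epi16_mask(even, a, b);
+     }
+-->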
+<intrinsic tech="AVX-512" name="_mm_cmp_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, xmm, xmm, imm8" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
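+<!-- Usage sketch (illustrative, not part of the Intel data): the imm8 form
+     is the generic compare; the named variants that follow are fixed
+     predicate shorthands. Assumes AVX512BW and AVX512VL; the helper name
+     is hypothetical.
+
+     #include <immintrin.h>
+
+     __mmask8 le_two_ways(__m128i a, __m128i b) {
+         __mmask8 k1 = _mm_cmp_epi16_mask(a, b, _MM_CMPINT_LE);
+         __mmask8 k2 = _mm_cmple_epi16_mask(a, b);   /* same result */
+         return (__mmask8)(k1 ^ k2);                 /* always 0 */
+     }
+-->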
+<intrinsic tech="AVX-512" name="_mm_cmpeq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpge_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpgt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmple_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmplt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpneq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
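+<!-- Worked example (illustrative): bit j of the mask corresponds to 16-bit
+     lane j. With a = {1,2,3,4,5,6,7,8} and b = {8,7,6,5,4,3,2,1},
+     _mm_cmpgt_epi16_mask sets bits 4..7 (the lanes where a exceeds b),
+     i.e. k equals 0xF0.
+
+     #include <immintrin.h>
+
+     int cmpgt_demo(void) {
+         __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+         __m128i b = _mm_setr_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+         return _mm_cmpgt_epi16_mask(a, b) == 0xF0;  /* 1 (true) */
+     }
+-->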
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, xmm, xmm, imm8" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpeq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpge_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpgt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmple_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmplt_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpneq_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPW" form="k {k}, xmm, xmm" xed="VPCMPW_MASKmskw_MASKmskw_XMMi16_XMMi16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
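+<!-- Usage sketch (illustrative): per the pseudocode, each masked compare is
+     equivalent to ANDing the unmasked compare with "k1", which the masked
+     forms fuse into a single instruction. Hypothetical helper, AVX512BW
+     with AVX512VL.
+
+     #include <immintrin.h>
+
+     __mmask8 neq_under_mask(__mmask8 k1, __m128i a, __m128i b) {
+         __mmask8 fused  = _mm_mask_cmpneq_epi16_mask(k1, a, b);
+         __mmask8 manual = (__mmask8)(k1 & _mm_cmpneq_epi16_mask(a, b));
+         return (__mmask8)(fused ^ manual);          /* always 0 */
+     }
+-->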
+<intrinsic tech="AVX-512" name="_mm256_mask2_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="idx" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+3:i]
+ dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := idx[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="ymm {k}, ymm, ymm" xed="VPERMI2W_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+3:i]
+ dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMT2W" form="ymm {k}, ymm, ymm" xed="VPERMT2W_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="idx" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+3:i]
+ dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="ymm {z}, ymm, ymm" xed="VPERMI2W_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <instruction name="VPERMT2W" form="ymm {z}, ymm, ymm" xed="VPERMT2W_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="idx" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ off := 16*idx[i+3:i]
+ dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="ymm, ymm, ymm" xed="VPERMI2W_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <instruction name="VPERMT2W" form="ymm, ymm, ymm" xed="VPERMT2W_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask2_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="idx" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+4:i]
+ dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := idx[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="zmm {k}, zmm, zmm" xed="VPERMI2W_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+4:i]
+ dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMT2W" form="zmm {k}, zmm, zmm" xed="VPERMT2W_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="idx" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+4:i]
+ dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="zmm {z}, zmm, zmm" xed="VPERMI2W_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <instruction name="VPERMT2W" form="zmm {z}, zmm, zmm" xed="VPERMT2W_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="idx" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ off := 16*idx[i+4:i]
+ dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="zmm, zmm, zmm" xed="VPERMI2W_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <instruction name="VPERMT2W" form="zmm, zmm, zmm" xed="VPERMT2W_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask2_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="idx" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+2:i]
+ dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := idx[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="xmm {k}, xmm, xmm" xed="VPERMI2W_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+2:i]
+ dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMT2W" form="xmm {k}, xmm, xmm" xed="VPERMT2W_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="idx" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ off := 16*idx[i+2:i]
+ dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="xmm {z}, xmm, xmm" xed="VPERMI2W_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <instruction name="VPERMT2W" form="xmm {z}, xmm, xmm" xed="VPERMT2W_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_permutex2var_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="idx" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ off := 16*idx[i+2:i]
+ dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2W" form="xmm, xmm, xmm" xed="VPERMI2W_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <instruction name="VPERMT2W" form="xmm, xmm, xmm" xed="VPERMT2W_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
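+<!-- Usage sketch (illustrative): permutex2var treats a:b as one 16-entry
+     table of 16-bit lanes; in the 128-bit form, index values 0..7 select
+     from "a" and 8..15 from "b" (bit 3 of each index picks the source).
+     Hypothetical helper.
+
+     #include <immintrin.h>
+
+     __m128i interleave_low_epi16(__m128i a, __m128i b) {
+         /* dst = { a0, b0, a1, b1, a2, b2, a3, b3 } */
+         __m128i idx = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
+         return _mm_permutex2var_epi16(a, idx, b);
+     }
+-->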
+<intrinsic tech="AVX-512" name="_mm256_mask_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ id := idx[i+3:i]*16
+ IF k[j]
+ dst[i+15:i] := a[id+15:id]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMW" form="ymm {k}, ymm, ymm" xed="VPERMW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ id := idx[i+3:i]*16
+ IF k[j]
+ dst[i+15:i] := a[id+15:id]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMW" form="ymm {z}, ymm, ymm" xed="VPERMW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="idx" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ id := idx[i+3:i]*16
+ dst[i+15:i] := a[id+15:id]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMW" form="ymm, ymm, ymm" xed="VPERMW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ id := idx[i+4:i]*16
+ IF k[j]
+ dst[i+15:i] := a[id+15:id]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMW" form="zmm {k}, zmm, zmm" xed="VPERMW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ id := idx[i+4:i]*16
+ IF k[j]
+ dst[i+15:i] := a[id+15:id]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMW" form="zmm {z}, zmm, zmm" xed="VPERMW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="idx" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ id := idx[i+4:i]*16
+ dst[i+15:i] := a[id+15:id]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMW" form="zmm, zmm, zmm" xed="VPERMW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ id := idx[i+2:i]*16
+ IF k[j]
+ dst[i+15:i] := a[id+15:id]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMW" form="xmm {k}, xmm, xmm" xed="VPERMW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ id := idx[i+2:i]*16
+ IF k[j]
+ dst[i+15:i] := a[id+15:id]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMW" form="xmm {z}, xmm, xmm" xed="VPERMW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_permutexvar_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="idx" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ id := idx[i+2:i]*16
+ dst[i+15:i] := a[id+15:id]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMW" form="xmm, xmm, xmm" xed="VPERMW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
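+<!-- Usage sketch (illustrative): permutexvar is a full variable shuffle of
+     a single source, so lane reversal is just a descending index vector.
+     Hypothetical helper, AVX512BW with AVX512VL.
+
+     #include <immintrin.h>
+
+     __m128i reverse_epi16(__m128i a) {
+         __m128i idx = _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+         return _mm_permutexvar_epi16(idx, a);       /* dst[j] = a[7 - j] */
+     }
+-->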
+<intrinsic tech="AVX-512" name="_mm256_mask_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="src" etype="SI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADDUBSW" form="ymm {k}, ymm, ymm" xed="VPMADDUBSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADDUBSW" form="ymm {z}, ymm, ymm" xed="VPMADDUBSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADDUBSW" form="zmm, zmm, zmm" xed="VPMADDUBSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__m512i" varname="src" etype="SI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADDUBSW" form="zmm {k}, zmm, zmm" xed="VPMADDUBSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADDUBSW" form="zmm {z}, zmm, zmm" xed="VPMADDUBSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="src" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADDUBSW" form="xmm {k}, xmm, xmm" xed="VPMADDUBSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADDUBSW" form="xmm {z}, xmm, xmm" xed="VPMADDUBSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
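+<!-- Worked example (illustrative): each 16-bit result is the sum of two
+     byte products, unsigned byte from "a" times signed byte from "b",
+     saturated to 16 bits. With every byte of "a" equal to 255 and every
+     byte of "b" equal to 127, each pair gives 255*127 + 255*127 = 64770,
+     which saturates to 32767. Uses the zeromask form above with a full
+     mask; hypothetical helper.
+
+     #include <immintrin.h>
+
+     __m128i maddubs_saturation_demo(void) {
+         __m128i a = _mm_set1_epi8((char)255);  /* read as unsigned 255 */
+         __m128i b = _mm_set1_epi8(127);        /* read as signed 127   */
+         return _mm_maskz_maddubs_epi16((__mmask8)0xFF, a, b);
+     }
+-->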
+<intrinsic tech="AVX-512" name="_mm256_mask_madd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADDWD" form="ymm {k}, ymm, ymm" xed="VPMADDWD_YMMi32_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_madd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADDWD" form="ymm {z}, ymm, ymm" xed="VPMADDWD_YMMi32_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_madd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADDWD" form="zmm, zmm, zmm" xed="VPMADDWD_ZMMi32_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_madd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADDWD" form="zmm {k}, zmm, zmm" xed="VPMADDWD_ZMMi32_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_madd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADDWD" form="zmm {z}, zmm, zmm" xed="VPMADDWD_ZMMi32_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_madd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADDWD" form="xmm {k}, xmm, xmm" xed="VPMADDWD_XMMi32_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_madd_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADDWD" form="xmm {z}, xmm, xmm" xed="VPMADDWD_XMMi32_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
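+<!-- Worked example (illustrative): madd is the classic pairwise 16x16
+     multiply-accumulate into 32-bit lanes, the inner step of an integer
+     dot product. A full 4-bit mask makes the zeromask form behave like a
+     plain madd; hypothetical helper.
+
+     #include <immintrin.h>
+
+     __m128i madd_demo(void) {
+         __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+         __m128i b = _mm_set1_epi16(1);
+         /* 32-bit lanes = {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15} */
+         return _mm_maskz_madd_epi16((__mmask8)0xF, a, b);
+     }
+-->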
+<intrinsic tech="AVX-512" name="_mm256_mask_max_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSB" form="ymm {k}, ymm, ymm" xed="VPMAXSB_YMMi8_MASKmskw_YMMi8_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSB" form="ymm {z}, ymm, ymm" xed="VPMAXSB_YMMi8_MASKmskw_YMMi8_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_max_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSB" form="zmm {k}, zmm, zmm" xed="VPMAXSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSB" form="zmm {z}, zmm, zmm" xed="VPMAXSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSB" form="zmm, zmm, zmm" xed="VPMAXSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSB" form="xmm {k}, xmm, xmm" xed="VPMAXSB_XMMi8_MASKmskw_XMMi8_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSB" form="xmm {z}, xmm, xmm" xed="VPMAXSB_XMMi8_MASKmskw_XMMi8_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
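+<!-- A hypothetical usage sketch (not part of the Intel data): the mask/maskz
+     pairs above differ only in how inactive lanes are filled. Assumes a C
+     compiler with AVX512BW support (e.g. built with -mavx512bw):
+
+     #include <immintrin.h>
+
+     __m512i demo_mask_vs_maskz(__m512i src, __mmask64 k,
+                                __m512i a, __m512i b) {
+         // Writemask form: inactive lanes keep the value from "src".
+         __m512i merged = _mm512_mask_max_epi8(src, k, a, b);
+         // Zeromask form: inactive lanes become zero instead.
+         __m512i zeroed = _mm512_maskz_max_epi8(k, a, b);
+         // Combine arbitrarily so both results are used.
+         return _mm512_xor_si512(merged, zeroed);
+     }
+-->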
+<intrinsic tech="AVX-512" name="_mm256_mask_max_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSW" form="ymm {k}, ymm, ymm" xed="VPMAXSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSW" form="ymm {z}, ymm, ymm" xed="VPMAXSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_max_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSW" form="zmm {k}, zmm, zmm" xed="VPMAXSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSW" form="zmm {z}, zmm, zmm" xed="VPMAXSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSW" form="zmm, zmm, zmm" xed="VPMAXSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSW" form="xmm {k}, xmm, xmm" xed="VPMAXSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSW" form="xmm {z}, xmm, xmm" xed="VPMAXSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
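+<!-- A hypothetical cross-check of _mm512_max_epi16 against the scalar
+     pseudocode above (assumes AVX512BW and x86 little-endian lane order):
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <assert.h>
+
+     void check_max_epi16(const int16_t a[32], const int16_t b[32]) {
+         __m512i va = _mm512_loadu_si512((const void *)a);
+         __m512i vb = _mm512_loadu_si512((const void *)b);
+         int16_t out[32];
+         _mm512_storeu_si512((void *)out, _mm512_max_epi16(va, vb));
+         for (int j = 0; j < 32; j++) {
+             int16_t expect = a[j] > b[j] ? a[j] : b[j]; // MAX per lane
+             assert(out[j] == expect);
+         }
+     }
+-->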
+<intrinsic tech="AVX-512" name="_mm256_mask_max_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUB" form="ymm {k}, ymm, ymm" xed="VPMAXUB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUB" form="ymm {z}, ymm, ymm" xed="VPMAXUB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_max_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUB" form="zmm {k}, zmm, zmm" xed="VPMAXUB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUB" form="zmm {z}, zmm, zmm" xed="VPMAXUB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUB" form="zmm, zmm, zmm" xed="VPMAXUB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUB" form="xmm {k}, xmm, xmm" xed="VPMAXUB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUB" form="xmm {z}, xmm, xmm" xed="VPMAXUB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
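+<!-- The epu8 forms reinterpret the same bytes as unsigned, which can invert
+     the outcome relative to epi8. A hypothetical sketch (assumes AVX512BW):
+
+     #include <immintrin.h>
+
+     void signed_vs_unsigned(void) {
+         __m512i x = _mm512_set1_epi8((char)0x80); // -128 signed, 128 unsigned
+         __m512i y = _mm512_set1_epi8(0x7F);       //  127 signed, 127 unsigned
+         __m512i s = _mm512_max_epi8(x, y); // every byte is 0x7F: 127 wins
+         __m512i u = _mm512_max_epu8(x, y); // every byte is 0x80: 128 wins
+         (void)s; (void)u;
+     }
+-->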
+<intrinsic tech="AVX-512" name="_mm256_mask_max_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUW" form="ymm {k}, ymm, ymm" xed="VPMAXUW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUW" form="ymm {z}, ymm, ymm" xed="VPMAXUW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_max_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUW" form="zmm {k}, zmm, zmm" xed="VPMAXUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUW" form="zmm {z}, zmm, zmm" xed="VPMAXUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUW" form="zmm, zmm, zmm" xed="VPMAXUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUW" form="xmm {k}, xmm, xmm" xed="VPMAXUW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUW" form="xmm {z}, xmm, xmm" xed="VPMAXUW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
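+<!-- The VL forms above operate on shorter vectors with correspondingly
+     narrower masks, one bit per lane. A hypothetical sketch (assumes both
+     AVX512BW and AVX512VL, e.g. -mavx512bw -mavx512vl):
+
+     #include <immintrin.h>
+
+     __m128i masked_max_u16(__m128i src, __m128i a, __m128i b) {
+         __mmask8 k = 0x0F; // update only the low four 16-bit lanes
+         return _mm_mask_max_epu16(src, k, a, b);
+     }
+-->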
+<intrinsic tech="AVX-512" name="_mm256_mask_min_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSB" form="ymm {k}, ymm, ymm" xed="VPMINSB_YMMi8_MASKmskw_YMMi8_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSB" form="ymm {z}, ymm, ymm" xed="VPMINSB_YMMi8_MASKmskw_YMMi8_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_min_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSB" form="zmm {k}, zmm, zmm" xed="VPMINSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSB" form="zmm {z}, zmm, zmm" xed="VPMINSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSB" form="zmm, zmm, zmm" xed="VPMINSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSB" form="xmm {k}, xmm, xmm" xed="VPMINSB_XMMi8_MASKmskw_XMMi8_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSB" form="xmm {z}, xmm, xmm" xed="VPMINSB_XMMi8_MASKmskw_XMMi8_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
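+<!-- min and max compose into a clamp. A hypothetical sketch clamping signed
+     bytes to a range (assumes AVX512BW):
+
+     #include <immintrin.h>
+
+     __m512i clamp_i8(__m512i x, char lo, char hi) {
+         __m512i vlo = _mm512_set1_epi8(lo);
+         __m512i vhi = _mm512_set1_epi8(hi);
+         // max raises values below lo; min lowers values above hi.
+         return _mm512_min_epi8(_mm512_max_epi8(x, vlo), vhi);
+     }
+-->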
+<intrinsic tech="AVX-512" name="_mm256_mask_min_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSW" form="ymm {k}, ymm, ymm" xed="VPMINSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSW" form="ymm {z}, ymm, ymm" xed="VPMINSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_min_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSW" form="zmm {k}, zmm, zmm" xed="VPMINSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSW" form="zmm {z}, zmm, zmm" xed="VPMINSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSW" form="zmm, zmm, zmm" xed="VPMINSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSW" form="xmm {k}, xmm, xmm" xed="VPMINSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSW" form="xmm {z}, xmm, xmm" xed="VPMINSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
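+<!-- The writemask form supports conditional accumulation: lanes whose mask
+     bit is clear keep their previous value. A hypothetical running minimum
+     over valid lanes only (assumes AVX512BW):
+
+     #include <immintrin.h>
+
+     __m512i running_min_i16(__m512i acc, __m512i x, __mmask32 valid) {
+         // Lanes not selected by "valid" carry acc through unchanged.
+         return _mm512_mask_min_epi16(acc, valid, acc, x);
+     }
+-->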
+<intrinsic tech="AVX-512" name="_mm256_mask_min_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUB" form="ymm {k}, ymm, ymm" xed="VPMINUB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUB" form="ymm {z}, ymm, ymm" xed="VPMINUB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_min_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUB" form="zmm {k}, zmm, zmm" xed="VPMINUB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUB" form="zmm {z}, zmm, zmm" xed="VPMINUB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUB" form="zmm, zmm, zmm" xed="VPMINUB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUB" form="xmm {k}, xmm, xmm" xed="VPMINUB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUB" form="xmm {z}, xmm, xmm" xed="VPMINUB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
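+<!-- An unsigned "a <= b" lane mask can be phrased through min: a equals the
+     minimum exactly when a <= b. A hypothetical sketch (assumes AVX512BW;
+     _mm512_cmple_epu8_mask expresses the same thing directly):
+
+     #include <immintrin.h>
+
+     __mmask64 le_u8(__m512i a, __m512i b) {
+         return _mm512_cmpeq_epi8_mask(_mm512_min_epu8(a, b), a);
+     }
+-->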
+<intrinsic tech="AVX-512" name="_mm256_mask_min_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUW" form="ymm {k}, ymm, ymm" xed="VPMINUW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUW" form="ymm {z}, ymm, ymm" xed="VPMINUW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_min_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUW" form="zmm {k}, zmm, zmm" xed="VPMINUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUW" form="zmm {z}, zmm, zmm" xed="VPMINUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUW" form="zmm, zmm, zmm" xed="VPMINUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUW" form="xmm {k}, xmm, xmm" xed="VPMINUW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUW" form="xmm {z}, xmm, xmm" xed="VPMINUW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
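+<!-- Together, min and max form the compare-exchange step of a sorting
+     network: each lane pair ends up ordered. A hypothetical sketch
+     (assumes AVX512BW):
+
+     #include <immintrin.h>
+
+     void cswap_u16(__m512i *x, __m512i *y) {
+         __m512i lo = _mm512_min_epu16(*x, *y);
+         __m512i hi = _mm512_max_epu16(*x, *y);
+         *x = lo; // per lane: smaller of the two
+         *y = hi; // per lane: larger of the two
+     }
+-->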
+<intrinsic tech="AVX-512" name="_mm256_movepi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF a[i+7]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVB2M" form="k, ymm" xed="VPMOVB2M_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movepi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF a[i+7]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVB2M" form="k, zmm" xed="VPMOVB2M_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_movepi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF a[i+7]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVB2M" form="k, xmm" xed="VPMOVB2M_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
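+<!-- VPMOVB2M packs sign bits into a mask register, which then behaves like
+     an ordinary integer. A hypothetical count of negative bytes (assumes
+     AVX512BW; __builtin_popcountll is a GCC/Clang builtin):
+
+     #include <immintrin.h>
+
+     int count_negative_bytes(__m512i v) {
+         __mmask64 k = _mm512_movepi8_mask(v); // bit j = sign of byte j
+         return __builtin_popcountll((unsigned long long)k);
+     }
+-->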
+<intrinsic tech="AVX-512" name="_mm256_movm_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <description>Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := 0xFF
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVM2B" form="ymm" xed="VPMOVM2B_YMMu8_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movm_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <description>Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := 0xFF
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVM2B" form="zmm" xed="VPMOVM2B_ZMMu8_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_movm_epi8">
+ <type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := 0xFF
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVM2B" form="xmm" xed="VPMOVM2B_XMMu8_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_movm_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := 0xFFFF
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVM2W" form="ymm" xed="VPMOVM2W_YMMu16_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movm_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <description>Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := 0xFFFF
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVM2W" form="zmm" xed="VPMOVM2W_ZMMu16_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_movm_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := 0xFFFF
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVM2W" form="xmm" xed="VPMOVM2W_XMMu16_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
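+<!-- VPMOVM2B/VPMOVM2W go the other way, expanding each mask bit into an
+     all-ones or all-zeros lane, so movepi8_mask and movm_epi8 are inverses.
+     A hypothetical roundtrip (assumes AVX512BW):
+
+     #include <immintrin.h>
+
+     int roundtrip_holds(__mmask64 k) {
+         __m512i v = _mm512_movm_epi8(k);         // bit -> 0xFF / 0x00 byte
+         __mmask64 back = _mm512_movepi8_mask(v); // byte sign -> bit
+         return back == k;
+     }
+-->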
+<intrinsic tech="AVX-512" name="_mm256_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="xmm, ymm" xed="VPMOVSWB_XMMi8_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="SI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="xmm {k}, ymm" xed="VPMOVSWB_XMMi8_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="128"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSWB" form="m128 {k}, ymm" xed="VPMOVSWB_MEMi8_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="xmm {z}, ymm" xed="VPMOVSWB_XMMi8_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI8"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="ymm, zmm" xed="VPMOVSWB_YMMi8_MASKmskw_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI8"/>
+ <parameter type="__m256i" varname="src" etype="SI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="ymm {k}, zmm" xed="VPMOVSWB_YMMi8_MASKmskw_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="256"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSWB" form="m256 {k}, zmm" xed="VPMOVSWB_MEMi8_MASKmskw_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="ymm {z}, zmm" xed="VPMOVSWB_YMMi8_MASKmskw_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="xmm, xmm" xed="VPMOVSWB_XMMi8_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="SI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="xmm {k}, xmm" xed="VPMOVSWB_XMMi8_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSWB" form="m64 {k}, xmm" xed="VPMOVSWB_MEMi8_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtsepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSWB" form="xmm {z}, xmm" xed="VPMOVSWB_XMMi8_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
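+<!-- Illustrative usage sketch (hypothetical, not part of the Intel data file):
+     the VPMOVSWB entries above narrow signed 16-bit lanes to 8-bit lanes with
+     signed saturation, so out-of-range words clamp to -128/127 instead of
+     wrapping. A minimal C example, assuming a toolchain and CPU with AVX512BW
+     and AVX512VL (e.g. compiled with -mavx512bw -mavx512vl); the input values
+     and buffer names are invented for the demo.
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int main(void) {
+    /* 16 signed words; 300 and -300 fall outside the int8 range. */
+    __m256i a = _mm256_setr_epi16(300, -300, 42, -1, 0, 127, -128, 5,
+                                  1, 2, 3, 4, 5, 6, 7, 8);
+
+    /* Full-width narrowing: every lane saturates into the xmm result. */
+    __m128i all = _mm256_cvtsepi16_epi8(a);
+
+    /* Masked store form: only lanes whose mask bit is set touch memory. */
+    int8_t buf[16] = {0};
+    _mm256_mask_cvtsepi16_storeu_epi8(buf, (__mmask16)0x0003, a);
+
+    int8_t v[16];
+    _mm_storeu_si128((__m128i *)v, all);
+    printf("lane0=%d lane1=%d | buf: %d %d %d\n",
+           v[0], v[1], buf[0], buf[1], buf[2]);
+    /* Expected: lane0=127 lane1=-128 | buf: 127 -128 0 */
+    return 0;
+}
+-->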
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="src" etype="SI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBW" form="ymm {k}, xmm" xed="VPMOVSXBW_YMMi16_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBW" form="ymm {z}, xmm" xed="VPMOVSXBW_YMMi16_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ l := j*16
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBW" form="zmm, ymm" xed="VPMOVSXBW_ZMMi16_MASKmskw_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__m512i" varname="src" etype="SI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBW" form="zmm {k}, ymm" xed="VPMOVSXBW_ZMMi16_MASKmskw_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBW" form="zmm {z}, ymm" xed="VPMOVSXBW_ZMMi16_MASKmskw_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="src" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXBW" form="xmm {k}, xmm" xed="VPMOVSXBW_XMMi16_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXBW" form="xmm {z}, xmm" xed="VPMOVSXBW_XMMi16_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
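+<!-- Illustrative usage sketch (hypothetical, not part of the Intel data file):
+     VPMOVSXBW widens each signed byte to a word by replicating the sign bit;
+     the maskz form zeroes lanes whose mask bit is clear. Same AVX512BW and
+     AVX512VL build assumptions as the sketch above; values are invented.
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int main(void) {
+    /* 0x80 is -128 as a signed byte; 0x7F is +127. */
+    __m128i a = _mm_setr_epi8(-128, 127, -1, 0, 1, 2, 3, 4,
+                              5, 6, 7, 8, 9, 10, 11, 12);
+
+    /* Keep only the low 8 lanes; the upper 8 words become zero. */
+    __m256i w = _mm256_maskz_cvtepi8_epi16((__mmask16)0x00FF, a);
+
+    int16_t v[16];
+    _mm256_storeu_si256((__m256i *)v, w);
+    printf("v0=%d v1=%d v2=%d v8=%d\n", v[0], v[1], v[2], v[8]);
+    /* Expected: v0=-128 v1=127 v2=-1 v8=0 (masked off) */
+    return 0;
+}
+-->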
+<intrinsic tech="AVX-512" name="_mm256_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="xmm, ymm" xed="VPMOVUSWB_XMMu8_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="xmm {k}, ymm" xed="VPMOVUSWB_XMMu8_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="128"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSWB" form="m128 {k}, ymm" xed="VPMOVUSWB_MEMu8_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="xmm {z}, ymm" xed="VPMOVUSWB_XMMu8_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="ymm, zmm" xed="VPMOVUSWB_YMMu8_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="ymm {k}, zmm" xed="VPMOVUSWB_YMMu8_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="256"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSWB" form="m256 {k}, zmm" xed="VPMOVUSWB_MEMu8_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="ymm {z}, zmm" xed="VPMOVUSWB_YMMu8_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="xmm, xmm" xed="VPMOVUSWB_XMMu8_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="xmm {k}, xmm" xed="VPMOVUSWB_XMMu8_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSWB" form="m64 {k}, xmm" xed="VPMOVUSWB_MEMu8_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtusepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSWB" form="xmm {z}, xmm" xed="VPMOVUSWB_XMMu8_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
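+<!-- Illustrative usage sketch (hypothetical, not part of the Intel data file):
+     VPMOVUSWB narrows with unsigned saturation, clamping anything above 255
+     to 255 rather than wrapping. AVX512BW/AVX512VL assumed; values invented.
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int main(void) {
+    /* Unsigned words: 512 and 0xFFFF exceed the uint8 range. */
+    __m128i a = _mm_setr_epi16(512, (short)0xFFFF, 200, 255, 0, 1, 2, 3);
+    __m128i n = _mm_cvtusepi16_epi8(a);
+
+    uint8_t v[16];
+    _mm_storeu_si128((__m128i *)v, n);
+    printf("%u %u %u %u %u\n", v[0], v[1], v[2], v[3], v[8]);
+    /* Expected: 255 255 200 255 0 (dst[MAX:64] is zeroed per the pseudocode) */
+    return 0;
+}
+-->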
+<intrinsic tech="AVX-512" name="_mm256_movepi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF a[i+15]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVW2M" form="k, ymm" xed="VPMOVW2M_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movepi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF a[i+15]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVW2M" form="k, zmm" xed="VPMOVW2M_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_movepi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF a[i+15]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPMOVW2M" form="k, xmm" xed="VPMOVW2M_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
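+<!-- Illustrative usage sketch (hypothetical, not part of the Intel data file):
+     VPMOVW2M packs the sign bit of every 16-bit lane into a mask register,
+     turning a per-lane sign test into a __mmask value. AVX512BW/AVX512VL
+     assumed; values invented.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    /* Negative words contribute a 1 bit to the mask. */
+    __m256i a = _mm256_setr_epi16(-1, 2, -3, 4, -5, 6, -7, 8,
+                                  9, -10, 11, -12, 13, -14, 15, -16);
+    __mmask16 k = _mm256_movepi16_mask(a);
+    printf("k = 0x%04x\n", (unsigned)k);
+    /* Expected: bits 0,2,4,6 and 9,11,13,15 set, i.e. k = 0xaa55 */
+    return 0;
+}
+-->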
+<intrinsic tech="AVX-512" name="_mm256_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="xmm, ymm" xed="VPMOVWB_XMMu8_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="xmm {k}, ymm" xed="VPMOVWB_XMMu8_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="128"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVWB" form="m128 {k}, ymm" xed="VPMOVWB_MEMu8_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="xmm {z}, ymm" xed="VPMOVWB_XMMu8_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="ymm, zmm" xed="VPMOVWB_YMMu8_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="ymm {k}, zmm" xed="VPMOVWB_YMMu8_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="256"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVWB" form="m256 {k}, zmm" xed="VPMOVWB_MEMu8_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="ymm {z}, zmm" xed="VPMOVWB_YMMu8_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="xmm, xmm" xed="VPMOVWB_XMMu8_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="xmm {k}, xmm" xed="VPMOVWB_XMMu8_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi16_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVWB" form="m64 {k}, xmm" xed="VPMOVWB_MEMu8_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi16_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+15:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVWB" form="xmm {z}, xmm" xed="VPMOVWB_XMMu8_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
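+<!-- Illustrative usage sketch (hypothetical, not part of the Intel data file):
+     VPMOVWB narrows by plain truncation (low byte only), unlike the
+     saturating VPMOVSWB/VPMOVUSWB families earlier in this file.
+     AVX512BW/AVX512VL assumed; values invented.
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int main(void) {
+    /* 0x0180 = 384: truncation keeps the low byte 0x80, while the signed
+       saturating form clamps the same word to 127 (0x7f). */
+    __m256i a = _mm256_set1_epi16(0x0180);
+    __m128i t = _mm256_cvtepi16_epi8(a);   /* truncate */
+    __m128i s = _mm256_cvtsepi16_epi8(a);  /* saturate */
+
+    uint8_t tv[16], sv[16];
+    _mm_storeu_si128((__m128i *)tv, t);
+    _mm_storeu_si128((__m128i *)sv, s);
+    printf("truncated=0x%02x saturated=0x%02x\n", tv[0], sv[0]);
+    /* Expected: truncated=0x80 saturated=0x7f */
+    return 0;
+}
+-->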
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBW" form="ymm {k}, xmm" xed="VPMOVZXBW_YMMi16_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBW" form="ymm {z}, xmm" xed="VPMOVZXBW_YMMi16_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ l := j*16
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBW" form="zmm, ymm" xed="VPMOVZXBW_ZMMi16_MASKmskw_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBW" form="zmm {k}, ymm" xed="VPMOVZXBW_ZMMi16_MASKmskw_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBW" form="zmm {z}, ymm" xed="VPMOVZXBW_ZMMi16_MASKmskw_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXBW" form="xmm {k}, xmm" xed="VPMOVZXBW_XMMi16_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ l := j*16
+ IF k[j]
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXBW" form="xmm {z}, xmm" xed="VPMOVZXBW_XMMi16_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
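+<!-- Illustrative usage sketch (hypothetical, not part of the Intel data file):
+     VPMOVZXBW zero-extends, so byte 0x80 becomes 128, whereas the VPMOVSXBW
+     entries earlier sign-extend it to -128. AVX512BW/AVX512VL assumed.
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i a = _mm_set1_epi8((char)0x80);
+    __m256i z = _mm256_maskz_cvtepu8_epi16((__mmask16)0xFFFF, a); /* zero ext */
+    __m256i s = _mm256_maskz_cvtepi8_epi16((__mmask16)0xFFFF, a); /* sign ext */
+
+    int16_t zv[16], sv[16];
+    _mm256_storeu_si256((__m256i *)zv, z);
+    _mm256_storeu_si256((__m256i *)sv, s);
+    printf("zero-extended=%d sign-extended=%d\n", zv[0], sv[0]);
+    /* Expected: zero-extended=128 sign-extended=-128 */
+    return 0;
+}
+-->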
+<intrinsic tech="AVX-512" name="_mm256_mask_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHRSW" form="ymm {k}, ymm, ymm" xed="VPMULHRSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHRSW" form="ymm {z}, ymm, ymm" xed="VPMULHRSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHRSW" form="zmm {k}, zmm, zmm" xed="VPMULHRSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHRSW" form="zmm {z}, zmm, zmm" xed="VPMULHRSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHRSW" form="zmm, zmm, zmm" xed="VPMULHRSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULHRSW" form="xmm {k}, xmm, xmm" xed="VPMULHRSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULHRSW" form="xmm {z}, xmm, xmm" xed="VPMULHRSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
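+<!-- Illustrative usage sketch (hypothetical, not part of the Intel data file):
+     the "truncate to 18 bits, add 1, keep bits [16:1]" wording of VPMULHRSW
+     amounts to a rounded Q15 fixed-point multiply: round((a*b) >> 15).
+     AVX512BW assumed for the 512-bit form; values invented.
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int main(void) {
+    /* Q15: 0.5 is 16384. 0.5 * 0.5 should round to 0.25, i.e. 8192. */
+    __m512i a = _mm512_set1_epi16(16384);
+    __m512i r = _mm512_mulhrs_epi16(a, a);
+
+    int16_t v[32];
+    _mm512_storeu_si512(v, r);
+
+    /* Scalar model of one lane, following the pseudocode above. */
+    int32_t tmp = ((16384 * 16384) >> 14) + 1;
+    printf("simd=%d scalar=%d\n", v[0], (int)(tmp >> 1));
+    /* Expected: simd=8192 scalar=8192 */
+    return 0;
+}
+-->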
+<intrinsic tech="AVX-512" name="_mm256_mask_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHUW" form="ymm {k}, ymm, ymm" xed="VPMULHUW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHUW" form="ymm {z}, ymm, ymm" xed="VPMULHUW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHUW" form="zmm {k}, zmm, zmm" xed="VPMULHUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHUW" form="zmm {z}, zmm, zmm" xed="VPMULHUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHUW" form="zmm, zmm, zmm" xed="VPMULHUW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULHUW" form="xmm {k}, xmm, xmm" xed="VPMULHUW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULHUW" form="xmm {z}, xmm, xmm" xed="VPMULHUW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
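+<!-- Illustrative usage sketch (hypothetical, not part of the Intel data file):
+     VPMULHUW keeps the high 16 bits of a full 32-bit unsigned product, the
+     building block for the usual multiply-and-shift division-by-constant
+     trick. AVX512BW assumed; values invented.
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int main(void) {
+    /* 50000 * 50000 = 2500000000 = 0x9502F900, so the high word is 0x9502. */
+    __m512i a = _mm512_set1_epi16((short)50000);
+    __m512i r = _mm512_mulhi_epu16(a, a);
+
+    uint16_t v[32];
+    _mm512_storeu_si512(v, r);
+    printf("hi=0x%04x reference=0x%04x\n",
+           v[0], (unsigned)((50000u * 50000u) >> 16));
+    /* Expected: hi=0x9502 reference=0x9502 */
+    return 0;
+}
+-->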
+<intrinsic tech="AVX-512" name="_mm256_mask_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHW" form="ymm {k}, ymm, ymm" xed="VPMULHW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULHW" form="ymm {z}, ymm, ymm" xed="VPMULHW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHW" form="zmm {k}, zmm, zmm" xed="VPMULHW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHW" form="zmm {z}, zmm, zmm" xed="VPMULHW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHW" form="zmm, zmm, zmm" xed="VPMULHW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULHW" form="xmm {k}, xmm, xmm" xed="VPMULHW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULHW" form="xmm {z}, xmm, xmm" xed="VPMULHW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
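+<!--
+  A minimal C sketch for the signed-mulhi family above: scaling by a
+  fractional constant through the high product halves (hypothetical helper,
+  assuming an AVX512BW target; names are illustrative):
+
+    #include <immintrin.h>
+
+    __m512i scale_quarter(__m512i x) {
+        // mulhi returns (x * c) >> 16 per element; with c = 16384 this is
+        // an elementwise signed multiply by 0.25.
+        __m512i c = _mm512_set1_epi16(16384);
+        return _mm512_mulhi_epi16(x, c);
+    }
+-->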
+<intrinsic tech="AVX-512" name="_mm256_mask_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLW" form="ymm {k}, ymm, ymm" xed="VPMULLW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLW" form="ymm {z}, ymm, ymm" xed="VPMULLW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLW" form="zmm {k}, zmm, zmm" xed="VPMULLW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLW" form="zmm {z}, zmm, zmm" xed="VPMULLW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLW" form="zmm, zmm, zmm" xed="VPMULLW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULLW" form="xmm {k}, xmm, xmm" xed="VPMULLW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULLW" form="xmm {z}, xmm, xmm" xed="VPMULLW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
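+<!--
+  A minimal C sketch for the mullo family above (hypothetical helper,
+  assuming an AVX512BW target; names are illustrative):
+
+    #include <immintrin.h>
+
+    __m512i wrap_mul16(__m512i a, __m512i b) {
+        // Low 16 bits of each 32-bit product: a wrapping 16-bit multiply,
+        // identical for signed and unsigned interpretations.
+        return _mm512_mullo_epi16(a, b);
+    }
+-->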
+<intrinsic tech="AVX-512" name="_mm512_sad_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+FOR j := 0 to 7
+ i := j*64
+ dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \
+ tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
+ dst[i+63:i+16] := 0
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSADBW" form="zmm, zmm, zmm" xed="VPSADBW_ZMMu16_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
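+<!--
+  A minimal C sketch building a whole-vector sum of absolute differences on
+  the intrinsic above (hypothetical helper, assuming an AVX512BW target and
+  the _mm512_reduce_add_epi64 helper shipped by recent compilers):
+
+    #include <immintrin.h>
+
+    long long sad512(__m512i a, __m512i b) {
+        // Eight partial SADs land in the low 16 bits of the 64-bit elements,
+        // so one 64-bit horizontal add gives the total over all 64 bytes.
+        __m512i partial = _mm512_sad_epu8(a, b);
+        return _mm512_reduce_add_epi64(partial);
+    }
+-->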
+<intrinsic tech="AVX-512" name="_mm256_mask_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[4:0] := b[i+3:i] + (j &amp; 0x10)
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFB" form="ymm {k}, ymm, ymm" xed="VPSHUFB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[4:0] := b[i+3:i] + (j &amp; 0x10)
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFB" form="ymm {z}, ymm, ymm" xed="VPSHUFB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" within 128-bit lanes using the control in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[5:0] := b[i+3:i] + (j &amp; 0x30)
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFB" form="zmm {k}, zmm, zmm" xed="VPSHUFB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[5:0] := b[i+3:i] + (j &amp; 0x30)
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFB" form="zmm {z}, zmm, zmm" xed="VPSHUFB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[5:0] := b[i+3:i] + (j &amp; 0x30)
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFB" form="zmm, zmm, zmm" xed="VPSHUFB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[3:0] := b[i+3:i]
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHUFB" form="xmm {k}, xmm, xmm" xed="VPSHUFB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[3:0] := b[i+3:i]
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHUFB" form="xmm {z}, xmm, xmm" xed="VPSHUFB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
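+<!--
+  A minimal C sketch of a per-lane byte table lookup with the shuffle family
+  above (hypothetical helper, assuming an AVX512BW target; names are
+  illustrative). Index bytes with the top bit set come back as zero:
+
+    #include <immintrin.h>
+
+    __m512i lut16(const unsigned char table[16], __m512i idx) {
+        // Replicate one 16-entry table into all four 128-bit lanes, then
+        // look up the low 4 bits of each index byte within its own lane.
+        __m128i t = _mm_loadu_si128((const __m128i *)table);
+        __m512i t4 = _mm512_broadcast_i32x4(t);
+        return _mm512_shuffle_epi8(t4, idx);
+    }
+-->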
+<intrinsic tech="AVX-512" name="_mm256_mask_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+tmp_dst[191:128] := a[191:128]
+tmp_dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+tmp_dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+tmp_dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+tmp_dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFHW" form="ymm {k}, ymm, imm8" xed="VPSHUFHW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+tmp_dst[191:128] := a[191:128]
+tmp_dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+tmp_dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+tmp_dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+tmp_dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFHW" form="ymm {z}, ymm, imm8" xed="VPSHUFHW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+tmp_dst[191:128] := a[191:128]
+tmp_dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+tmp_dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+tmp_dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+tmp_dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+tmp_dst[319:256] := a[319:256]
+tmp_dst[335:320] := (a &gt;&gt; (imm8[1:0] * 16))[335:320]
+tmp_dst[351:336] := (a &gt;&gt; (imm8[3:2] * 16))[335:320]
+tmp_dst[367:352] := (a &gt;&gt; (imm8[5:4] * 16))[335:320]
+tmp_dst[383:368] := (a &gt;&gt; (imm8[7:6] * 16))[335:320]
+tmp_dst[447:384] := a[447:384]
+tmp_dst[463:448] := (a &gt;&gt; (imm8[1:0] * 16))[463:448]
+tmp_dst[479:464] := (a &gt;&gt; (imm8[3:2] * 16))[463:448]
+tmp_dst[495:480] := (a &gt;&gt; (imm8[5:4] * 16))[463:448]
+tmp_dst[511:496] := (a &gt;&gt; (imm8[7:6] * 16))[463:448]
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFHW" form="zmm {k}, zmm, imm8" xed="VPSHUFHW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+tmp_dst[191:128] := a[191:128]
+tmp_dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+tmp_dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+tmp_dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+tmp_dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+tmp_dst[319:256] := a[319:256]
+tmp_dst[335:320] := (a &gt;&gt; (imm8[1:0] * 16))[335:320]
+tmp_dst[351:336] := (a &gt;&gt; (imm8[3:2] * 16))[335:320]
+tmp_dst[367:352] := (a &gt;&gt; (imm8[5:4] * 16))[335:320]
+tmp_dst[383:368] := (a &gt;&gt; (imm8[7:6] * 16))[335:320]
+tmp_dst[447:384] := a[447:384]
+tmp_dst[463:448] := (a &gt;&gt; (imm8[1:0] * 16))[463:448]
+tmp_dst[479:464] := (a &gt;&gt; (imm8[3:2] * 16))[463:448]
+tmp_dst[495:480] := (a &gt;&gt; (imm8[5:4] * 16))[463:448]
+tmp_dst[511:496] := (a &gt;&gt; (imm8[7:6] * 16))[463:448]
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFHW" form="zmm {z}, zmm, imm8" xed="VPSHUFHW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+dst[191:128] := a[191:128]
+dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+dst[319:256] := a[319:256]
+dst[335:320] := (a &gt;&gt; (imm8[1:0] * 16))[335:320]
+dst[351:336] := (a &gt;&gt; (imm8[3:2] * 16))[335:320]
+dst[367:352] := (a &gt;&gt; (imm8[5:4] * 16))[335:320]
+dst[383:368] := (a &gt;&gt; (imm8[7:6] * 16))[335:320]
+dst[447:384] := a[447:384]
+dst[463:448] := (a &gt;&gt; (imm8[1:0] * 16))[463:448]
+dst[479:464] := (a &gt;&gt; (imm8[3:2] * 16))[463:448]
+dst[495:480] := (a &gt;&gt; (imm8[5:4] * 16))[463:448]
+dst[511:496] := (a &gt;&gt; (imm8[7:6] * 16))[463:448]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFHW" form="zmm, zmm, imm8" xed="VPSHUFHW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHUFHW" form="xmm {k}, xmm, imm8" xed="VPSHUFHW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHUFHW" form="xmm {z}, xmm, imm8" xed="VPSHUFHW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
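+<!--
+  A minimal C sketch for the shufflehi family above (hypothetical helper,
+  assuming an AVX512BW target; imm8 must be a compile-time constant):
+
+    #include <immintrin.h>
+
+    __m512i reverse_high_words(__m512i a) {
+        // _MM_SHUFFLE(0,1,2,3) selects words 3,2,1,0, reversing the four
+        // high words of every 128-bit lane; the low 64 bits pass through.
+        return _mm512_shufflehi_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
+    }
+-->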
+<intrinsic tech="AVX-512" name="_mm256_mask_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+tmp_dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+tmp_dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+tmp_dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+tmp_dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+tmp_dst[255:192] := a[255:192]
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFLW" form="ymm {k}, ymm, imm8" xed="VPSHUFLW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+tmp_dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+tmp_dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+tmp_dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+tmp_dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+tmp_dst[255:192] := a[255:192]
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFLW" form="ymm {z}, ymm, imm8" xed="VPSHUFLW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+tmp_dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+tmp_dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+tmp_dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+tmp_dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+tmp_dst[255:192] := a[255:192]
+tmp_dst[271:256] := (a &gt;&gt; (imm8[1:0] * 16))[271:256]
+tmp_dst[287:272] := (a &gt;&gt; (imm8[3:2] * 16))[271:256]
+tmp_dst[303:288] := (a &gt;&gt; (imm8[5:4] * 16))[271:256]
+tmp_dst[319:304] := (a &gt;&gt; (imm8[7:6] * 16))[271:256]
+tmp_dst[383:320] := a[383:320]
+tmp_dst[399:384] := (a &gt;&gt; (imm8[1:0] * 16))[399:384]
+tmp_dst[415:400] := (a &gt;&gt; (imm8[3:2] * 16))[399:384]
+tmp_dst[431:416] := (a &gt;&gt; (imm8[5:4] * 16))[399:384]
+tmp_dst[447:432] := (a &gt;&gt; (imm8[7:6] * 16))[399:384]
+tmp_dst[511:448] := a[511:448]
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFLW" form="zmm {k}, zmm, imm8" xed="VPSHUFLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+tmp_dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+tmp_dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+tmp_dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+tmp_dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+tmp_dst[255:192] := a[255:192]
+tmp_dst[271:256] := (a &gt;&gt; (imm8[1:0] * 16))[271:256]
+tmp_dst[287:272] := (a &gt;&gt; (imm8[3:2] * 16))[271:256]
+tmp_dst[303:288] := (a &gt;&gt; (imm8[5:4] * 16))[271:256]
+tmp_dst[319:304] := (a &gt;&gt; (imm8[7:6] * 16))[271:256]
+tmp_dst[383:320] := a[383:320]
+tmp_dst[399:384] := (a &gt;&gt; (imm8[1:0] * 16))[399:384]
+tmp_dst[415:400] := (a &gt;&gt; (imm8[3:2] * 16))[399:384]
+tmp_dst[431:416] := (a &gt;&gt; (imm8[5:4] * 16))[399:384]
+tmp_dst[447:432] := (a &gt;&gt; (imm8[7:6] * 16))[399:384]
+tmp_dst[511:448] := a[511:448]
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFLW" form="zmm {z}, zmm, imm8" xed="VPSHUFLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
+ <operation>
+dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+dst[127:64] := a[127:64]
+dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+dst[255:192] := a[255:192]
+dst[271:256] := (a &gt;&gt; (imm8[1:0] * 16))[271:256]
+dst[287:272] := (a &gt;&gt; (imm8[3:2] * 16))[271:256]
+dst[303:288] := (a &gt;&gt; (imm8[5:4] * 16))[271:256]
+dst[319:304] := (a &gt;&gt; (imm8[7:6] * 16))[271:256]
+dst[383:320] := a[383:320]
+dst[399:384] := (a &gt;&gt; (imm8[1:0] * 16))[399:384]
+dst[415:400] := (a &gt;&gt; (imm8[3:2] * 16))[399:384]
+dst[431:416] := (a &gt;&gt; (imm8[5:4] * 16))[399:384]
+dst[447:432] := (a &gt;&gt; (imm8[7:6] * 16))[399:384]
+dst[511:448] := a[511:448]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFLW" form="zmm, zmm, imm8" xed="VPSHUFLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHUFLW" form="xmm {k}, xmm, imm8" xed="VPSHUFLW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHUFLW" form="xmm {z}, xmm, imm8" xed="VPSHUFLW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
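+<!--
+  A minimal C sketch for the shufflelo family above (hypothetical helper,
+  assuming an AVX512BW target; imm8 must be a compile-time constant):
+
+    #include <immintrin.h>
+
+    __m512i splat_low_word(__m512i a) {
+        // _MM_SHUFFLE(0,0,0,0) copies word 0 into all four low word slots
+        // of every 128-bit lane; the high 64 bits pass through unchanged.
+        return _mm512_shufflelo_epi16(a, _MM_SHUFFLE(0, 0, 0, 0));
+    }
+-->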
+<intrinsic tech="AVX-512" name="_mm512_bslli_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="M128"/>
+ <parameter type="__m512i" varname="a" etype="M128"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+dst[255:128] := a[255:128] &lt;&lt; (tmp*8)
+dst[383:256] := a[383:256] &lt;&lt; (tmp*8)
+dst[511:384] := a[511:384] &lt;&lt; (tmp*8)
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLDQ" form="zmm, zmm, imm8" xed="VPSLLDQ_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
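+<!--
+  A minimal C sketch for the byte shift above (hypothetical helper, assuming
+  an AVX512BW target; imm8 must be a compile-time constant):
+
+    #include <immintrin.h>
+
+    __m512i lane_shift_left2(__m512i a) {
+        // Each 128-bit lane moves left by 2 bytes with zeros shifted in;
+        // any count above 15 clears the lanes entirely.
+        return _mm512_bslli_epi128(a, 2);
+    }
+-->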
+<intrinsic tech="AVX-512" name="_mm256_mask_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="ymm {k}, ymm, ymm" xed="VPSLLVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="ymm {z}, ymm, ymm" xed="VPSLLVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="ymm, ymm, ymm" xed="VPSLLVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="zmm {k}, zmm, zmm" xed="VPSLLVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="zmm {z}, zmm, zmm" xed="VPSLLVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="zmm, zmm, zmm" xed="VPSLLVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="xmm {k}, xmm, xmm" xed="VPSLLVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="xmm {z}, xmm, xmm" xed="VPSLLVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_sllv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVW" form="xmm, xmm, xmm" xed="VPSLLVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
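+<!--
+  A minimal C sketch for the variable-shift family above (hypothetical
+  helper, assuming an AVX512BW target; names are illustrative):
+
+    #include <immintrin.h>
+
+    __m512i varshift(__m512i a) {
+        // Word counts cycle 4,5,6,7 across each 64-bit group; any count of
+        // 16 or more zeroes its element rather than wrapping.
+        const __m512i counts = _mm512_set1_epi64(0x0007000600050004LL);
+        return _mm512_sllv_epi16(a, counts);
+    }
+-->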
+<intrinsic tech="AVX-512" name="_mm256_mask_sll_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLW" form="ymm {k}, ymm, xmm" xed="VPSLLW_YMMu16_MASKmskw_YMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_slli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLW" form="ymm {k}, ymm, imm8" xed="VPSLLW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sll_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLW" form="ymm {z}, ymm, xmm" xed="VPSLLW_YMMu16_MASKmskw_YMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_slli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLW" form="ymm {z}, ymm, imm8" xed="VPSLLW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
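+<!-- A short sketch contrasting the writemask and zeromask forms above, assuming
+     AVX512BW+AVX512VL support; values are illustrative:
+
+__m256i a   = _mm256_set1_epi16(3);
+__m256i src = _mm256_set1_epi16(-1);
+__mmask16 k = 0x00FF;                             /* select the low 8 lanes */
+__m256i m = _mm256_mask_slli_epi16(src, k, a, 4); /* unselected lanes copy src (-1) */
+__m256i z = _mm256_maskz_slli_epi16(k, a, 4);     /* unselected lanes become 0 */
+/* selected lanes hold 3 << 4 = 48 in both results */
+-->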
+<intrinsic tech="AVX-512" name="_mm512_mask_sll_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLW" form="zmm {k}, zmm, xmm" xed="VPSLLW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_slli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLW" form="zmm {k}, zmm, imm8" xed="VPSLLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sll_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLW" form="zmm {z}, zmm, xmm" xed="VPSLLW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_slli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLW" form="zmm {z}, zmm, imm8" xed="VPSLLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sll_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLW" form="zmm, zmm, xmm" xed="VPSLLW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_slli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLW" form="zmm, zmm, imm8" xed="VPSLLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
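+<!-- A sketch of the count-register form: unlike the sllv variants, sll applies
+     the single count in count[63:0] to every lane, and any count above 15
+     zeroes the whole vector; there is no modulo-16 wrap. Assumes AVX512BW;
+     values are illustrative:
+
+__m128i count = _mm_cvtsi32_si128(3);  /* the count lives in the low 64 bits */
+__m512i a = _mm512_set1_epi16(5);
+__m512i r = _mm512_sll_epi16(a, count);                 /* every lane: 5 << 3 = 40 */
+__m512i z = _mm512_sll_epi16(a, _mm_cvtsi32_si128(16)); /* every lane: 0 */
+-->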
+<intrinsic tech="AVX-512" name="_mm_mask_sll_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLW" form="xmm {k}, xmm, xmm" xed="VPSLLW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_slli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLW" form="xmm {k}, xmm, imm8" xed="VPSLLW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sll_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLW" form="xmm {z}, xmm, xmm" xed="VPSLLW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_slli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLW" form="xmm {z}, xmm, imm8" xed="VPSLLW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
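+<!-- A sketch combining a compare-generated mask with the masked immediate
+     shift above (AVX512BW+AVX512VL assumed; values illustrative):
+
+__m128i a  = _mm_setr_epi16(-4, -3, -2, -1, 1, 2, 3, 4);
+__mmask8 k = _mm_cmpgt_epi16_mask(a, _mm_setzero_si128()); /* k = 0xF0 */
+__m128i r  = _mm_mask_slli_epi16(a, k, a, 1); /* double positive lanes, keep the rest */
+-->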
+<intrinsic tech="AVX-512" name="_mm256_mask_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+			dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="ymm {k}, ymm, ymm" xed="VPSRAVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+			dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="ymm {z}, ymm, ymm" xed="VPSRAVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="ymm, ymm, ymm" xed="VPSRAVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+			dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="zmm {k}, zmm, zmm" xed="VPSRAVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+			dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="zmm {z}, zmm, zmm" xed="VPSRAVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="zmm, zmm, zmm" xed="VPSRAVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+			dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="xmm {k}, xmm, xmm" xed="VPSRAVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+			dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="xmm {z}, xmm, xmm" xed="VPSRAVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_srav_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVW" form="xmm, xmm, xmm" xed="VPSRAVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
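+<!-- A sketch of the arithmetic variable shift: sign bits fill from the left,
+     so a negative input saturates to 0xFFFF once the count reaches 16
+     (AVX512BW+AVX512VL assumed; values illustrative):
+
+__m128i a     = _mm_set1_epi16(-32768);  /* 0x8000 in every lane */
+__m128i count = _mm_setr_epi16(0, 1, 4, 8, 15, 16, 31, 100);
+__m128i r     = _mm_srav_epi16(a, count);
+/* lanes: 0x8000 0xC000 0xF800 0xFF80 0xFFFF 0xFFFF 0xFFFF 0xFFFF */
+-->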
+<intrinsic tech="AVX-512" name="_mm256_mask_sra_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+	<parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAW" form="ymm {k}, ymm, xmm" xed="VPSRAW_YMMu16_MASKmskw_YMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_srai_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAW" form="ymm {k}, ymm, imm8" xed="VPSRAW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sra_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+	<parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAW" form="ymm {z}, ymm, xmm" xed="VPSRAW_YMMu16_MASKmskw_YMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srai_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAW" form="ymm {z}, ymm, imm8" xed="VPSRAW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_sra_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+	<parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAW" form="zmm {k}, zmm, xmm" xed="VPSRAW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srai_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAW" form="zmm {k}, zmm, imm8" xed="VPSRAW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sra_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+	<parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAW" form="zmm {z}, zmm, xmm" xed="VPSRAW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srai_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAW" form="zmm {z}, zmm, imm8" xed="VPSRAW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sra_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+	<parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAW" form="zmm, zmm, xmm" xed="VPSRAW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srai_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAW" form="zmm, zmm, imm8" xed="VPSRAW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
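+<!-- A sketch of the immediate arithmetic shift; imm8 must be an integer
+     constant, and any value above 15 leaves only the sign in every lane
+     (AVX512BW assumed; values illustrative):
+
+__m512i a = _mm512_set1_epi16(-100);
+__m512i r = _mm512_srai_epi16(a, 2);  /* each lane: -100 >> 2 = -25 */
+__m512i s = _mm512_srai_epi16(a, 20); /* each lane: 0xFFFF (-1), the sign fill */
+-->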
+<intrinsic tech="AVX-512" name="_mm_mask_sra_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+	<parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAW" form="xmm {k}, xmm, xmm" xed="VPSRAW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srai_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAW" form="xmm {k}, xmm, imm8" xed="VPSRAW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sra_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+	<parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAW" form="xmm {z}, xmm, xmm" xed="VPSRAW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srai_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAW" form="xmm {z}, xmm, imm8" xed="VPSRAW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
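+<!-- Because the arithmetic shift rounds toward negative infinity, a common
+     fixed-point idiom adds half of 2^n before shifting to get round-to-nearest;
+     a sketch using the SSE2-level _mm_srai_epi16/_mm_add_epi16 with n = 4
+     (values illustrative):
+
+__m128i a    = _mm_set1_epi16(-7);
+__m128i half = _mm_set1_epi16(1 << 3); /* 2^(4-1) */
+__m128i r    = _mm_srai_epi16(_mm_add_epi16(a, half), 4);
+/* plain srai gives -7 >> 4 = -1 (floor); the rounded form gives 0 */
+-->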
+<intrinsic tech="AVX-512" name="_mm512_bsrli_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="M128"/>
+ <parameter type="__m512i" varname="a" etype="M128"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+dst[255:128] := a[255:128] &gt;&gt; (tmp*8)
+dst[383:256] := a[383:256] &gt;&gt; (tmp*8)
+dst[511:384] := a[511:384] &gt;&gt; (tmp*8)
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLDQ" form="zmm, zmm, imm8" xed="VPSRLDQ_ZMMu8_ZMMu8_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
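+<!-- A sketch of the byte-granularity shift: each of the four 128-bit lanes is
+     shifted independently, imm8 must be a compile-time constant, and any value
+     above 15 clears all lanes (AVX512BW assumed; values illustrative):
+
+__m512i a = _mm512_set1_epi32(0x11223344);
+__m512i r = _mm512_bsrli_epi128(a, 2); /* every 128-bit lane shifted right 2 bytes */
+-->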
+<intrinsic tech="AVX-512" name="_mm256_mask_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="ymm {k}, ymm, ymm" xed="VPSRLVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="ymm {z}, ymm, ymm" xed="VPSRLVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="ymm, ymm, ymm" xed="VPSRLVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="zmm {k}, zmm, zmm" xed="VPSRLVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="zmm {z}, zmm, zmm" xed="VPSRLVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="zmm, zmm, zmm" xed="VPSRLVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="xmm {k}, xmm, xmm" xed="VPSRLVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="xmm {z}, xmm, xmm" xed="VPSRLVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_srlv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF count[i+15:i] &lt; 16
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVW" form="xmm, xmm, xmm" xed="VPSRLVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
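+<!-- A sketch contrasting the logical (srlv) and arithmetic (srav) right shifts
+     on a negative input (AVX512BW+AVX512VL assumed; values illustrative):
+
+__m128i a     = _mm_set1_epi16(-2); /* 0xFFFE */
+__m128i count = _mm_set1_epi16(1);
+__m128i lo = _mm_srlv_epi16(a, count); /* 0x7FFF: zeros shift in */
+__m128i ar = _mm_srav_epi16(a, count); /* 0xFFFF: sign bits shift in */
+-->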
+<intrinsic tech="AVX-512" name="_mm256_mask_srl_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLW" form="ymm {k}, ymm, xmm" xed="VPSRLW_YMMu16_MASKmskw_YMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_srli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLW" form="ymm {k}, ymm, imm8" xed="VPSRLW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srl_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLW" form="ymm {z}, ymm, xmm" xed="VPSRLW_YMMu16_MASKmskw_YMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLW" form="ymm {z}, ymm, imm8" xed="VPSRLW_YMMu16_MASKmskw_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srl_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLW" form="zmm {k}, zmm, xmm" xed="VPSRLW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLW" form="zmm {k}, zmm, imm8" xed="VPSRLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srl_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLW" form="zmm {z}, zmm, xmm" xed="VPSRLW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLW" form="zmm {z}, zmm, imm8" xed="VPSRLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srl_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLW" form="zmm, zmm, xmm" xed="VPSRLW_ZMMu16_MASKmskw_ZMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLW" form="zmm, zmm, imm8" xed="VPSRLW_ZMMu16_MASKmskw_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
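+<!-- A sketch of the usual application: dividing unsigned 16-bit lanes by a
+     power of two (AVX512BW assumed; the bit pattern, not the signed value,
+     is what shifts):
+
+__m512i a = _mm512_set1_epi16((short)50000); /* unsigned 50000 per lane */
+__m512i r = _mm512_srli_epi16(a, 3);         /* 50000 / 8 = 6250 per lane */
+-->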
+<intrinsic tech="AVX-512" name="_mm_mask_srl_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLW" form="xmm {k}, xmm, xmm" xed="VPSRLW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLW" form="xmm {k}, xmm, imm8" xed="VPSRLW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srl_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLW" form="xmm {z}, xmm, xmm" xed="VPSRLW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srli_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLW" form="xmm {z}, xmm, imm8" xed="VPSRLW_XMMu16_MASKmskw_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
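+<!-- Illustrative C sketch (not part of the Intel data): the mask form
+     merges unselected lanes from "src", while the maskz form zeroes them.
+     Assumes an AVX512VL plus AVX512BW target.
+
+#include <immintrin.h>
+
+void masked_shift_demo(void) {
+    __m128i a    = _mm_set1_epi16(0x0F00);
+    __m128i src  = _mm_set1_epi16(-1);
+    __mmask8 k   = 0xA5;                              /* lanes 0,2,5,7 */
+    __m128i kept = _mm_mask_srli_epi16(src, k, a, 4); /* others = src  */
+    __m128i zero = _mm_maskz_srli_epi16(k, a, 4);     /* others = 0    */
+    (void)kept; (void)zero;
+}
+-->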
+<intrinsic tech="AVX-512" name="_mm256_mask_sub_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBB" form="ymm {k}, ymm, ymm" xed="VPSUBB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sub_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBB" form="ymm {z}, ymm, ymm" xed="VPSUBB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_sub_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBB" form="zmm {k}, zmm, zmm" xed="VPSUBB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sub_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBB" form="zmm {z}, zmm, zmm" xed="VPSUBB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sub_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBB" form="zmm, zmm, zmm" xed="VPSUBB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
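+<!-- Illustrative C sketch (not part of the Intel data): the unmasked
+     VPSUBB wraps modulo 256 rather than saturating.
+
+#include <immintrin.h>
+
+__m512i wrap_demo(void) {
+    __m512i a = _mm512_setzero_si512();
+    __m512i b = _mm512_set1_epi8(1);
+    return _mm512_sub_epi8(a, b);  /* 0x00 - 0x01 wraps to 0xFF per byte */
+}
+-->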
+<intrinsic tech="AVX-512" name="_mm_mask_sub_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBB" form="xmm {k}, xmm, xmm" xed="VPSUBB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBB" form="xmm {z}, xmm, xmm" xed="VPSUBB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_subs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBSB" form="ymm {k}, ymm, ymm" xed="VPSUBSB_YMMi8_MASKmskw_YMMi8_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_subs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBSB" form="ymm {z}, ymm, ymm" xed="VPSUBSB_YMMi8_MASKmskw_YMMi8_YMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_subs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBSB" form="zmm {k}, zmm, zmm" xed="VPSUBSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_subs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBSB" form="zmm {z}, zmm, zmm" xed="VPSUBSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_subs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBSB" form="zmm, zmm, zmm" xed="VPSUBSB_ZMMi8_MASKmskw_ZMMi8_ZMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
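+<!-- Illustrative C sketch (not part of the Intel data): Saturate8 clamps
+     results to [-128, 127] instead of wrapping.
+
+#include <immintrin.h>
+
+__m512i sat_signed_demo(void) {
+    __m512i lo  = _mm512_set1_epi8(-128);
+    __m512i one = _mm512_set1_epi8(1);
+    return _mm512_subs_epi8(lo, one);  /* every lane stays at -128 */
+}
+-->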
+<intrinsic tech="AVX-512" name="_mm_mask_subs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBSB" form="xmm {k}, xmm, xmm" xed="VPSUBSB_XMMi8_MASKmskw_XMMi8_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_subs_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBSB" form="xmm {z}, xmm, xmm" xed="VPSUBSB_XMMi8_MASKmskw_XMMi8_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_subs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBSW" form="ymm {k}, ymm, ymm" xed="VPSUBSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_subs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBSW" form="ymm {z}, ymm, ymm" xed="VPSUBSW_YMMi16_MASKmskw_YMMi16_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_subs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBSW" form="zmm {k}, zmm, zmm" xed="VPSUBSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_subs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBSW" form="zmm {z}, zmm, zmm" xed="VPSUBSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_subs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBSW" form="zmm, zmm, zmm" xed="VPSUBSW_ZMMi16_MASKmskw_ZMMi16_ZMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
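+<!-- Illustrative C sketch (not part of the Intel data): Saturate16 keeps
+     int16 results in [-32768, 32767], the behavior usually wanted when
+     mixing fixed-point audio samples. The buffer length is assumed to be
+     a multiple of 32 purely to keep the sketch short.
+
+#include <immintrin.h>
+#include <stddef.h>
+
+void subtract_samples(short *dst, const short *a, const short *b, size_t n) {
+    for (size_t i = 0; i < n; i += 32) {
+        __m512i va = _mm512_loadu_si512(a + i);
+        __m512i vb = _mm512_loadu_si512(b + i);
+        _mm512_storeu_si512(dst + i, _mm512_subs_epi16(va, vb));
+    }
+}
+-->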
+<intrinsic tech="AVX-512" name="_mm_mask_subs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBSW" form="xmm {k}, xmm, xmm" xed="VPSUBSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_subs_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBSW" form="xmm {z}, xmm, xmm" xed="VPSUBSW_XMMi16_MASKmskw_XMMi16_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_subs_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBUSB" form="ymm {k}, ymm, ymm" xed="VPSUBUSB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_subs_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBUSB" form="ymm {z}, ymm, ymm" xed="VPSUBUSB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_subs_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBUSB" form="zmm {k}, zmm, zmm" xed="VPSUBUSB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_subs_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBUSB" form="zmm {z}, zmm, zmm" xed="VPSUBUSB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_subs_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBUSB" form="zmm, zmm, zmm" xed="VPSUBUSB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
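+<!-- Illustrative C sketch (not part of the Intel data): SaturateU8 clamps
+     negative results to 0, so subs_epu8(a, b) is max(a - b, 0) and OR-ing
+     both orders yields the absolute difference per byte.
+
+#include <immintrin.h>
+
+__m512i absdiff_u8(__m512i a, __m512i b) {
+    return _mm512_or_si512(_mm512_subs_epu8(a, b),
+                           _mm512_subs_epu8(b, a));
+}
+-->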
+<intrinsic tech="AVX-512" name="_mm_mask_subs_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBUSB" form="xmm {k}, xmm, xmm" xed="VPSUBUSB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_subs_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBUSB" form="xmm {z}, xmm, xmm" xed="VPSUBUSB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_subs_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBUSW" form="ymm {k}, ymm, ymm" xed="VPSUBUSW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_subs_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBUSW" form="ymm {z}, ymm, ymm" xed="VPSUBUSW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_subs_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBUSW" form="zmm {k}, zmm, zmm" xed="VPSUBUSW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_subs_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBUSW" form="zmm {z}, zmm, zmm" xed="VPSUBUSW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_subs_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBUSW" form="zmm, zmm, zmm" xed="VPSUBUSW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
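+<!-- Illustrative C sketch (not part of the Intel data): because
+     subs_epu16(a, b) is a - b when a >= b and 0 otherwise, adding "b"
+     back yields max(a, b) without a compare. (AVX512BW also provides
+     _mm512_max_epu16 directly; the point here is the saturation identity.)
+
+#include <immintrin.h>
+
+__m512i max_u16(__m512i a, __m512i b) {
+    return _mm512_add_epi16(_mm512_subs_epu16(a, b), b);
+}
+-->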
+<intrinsic tech="AVX-512" name="_mm_mask_subs_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBUSW" form="xmm {k}, xmm, xmm" xed="VPSUBUSW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_subs_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBUSW" form="xmm {z}, xmm, xmm" xed="VPSUBUSW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_sub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBW" form="ymm {k}, ymm, ymm" xed="VPSUBW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBW" form="ymm {z}, ymm, ymm" xed="VPSUBW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_sub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBW" form="zmm {k}, zmm, zmm" xed="VPSUBW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBW" form="zmm {z}, zmm, zmm" xed="VPSUBW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBW" form="zmm, zmm, zmm" xed="VPSUBW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
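+<!-- Illustrative C sketch (not part of the Intel data): the writemask
+     turns VPSUBW into branch-free predication. Passing the destination as
+     "src" subtracts only in the selected lanes and copies the rest through.
+
+#include <immintrin.h>
+
+__m512i sub_where(__m512i v, __m512i delta, __mmask32 k) {
+    return _mm512_mask_sub_epi16(v, k, v, delta);
+}
+-->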
+<intrinsic tech="AVX-512" name="_mm_mask_sub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBW" form="xmm {k}, xmm, xmm" xed="VPSUBW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBW" form="xmm {z}, xmm, xmm" xed="VPSUBW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_test_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPTESTMB" form="k {k}, ymm, ymm" xed="VPTESTMB_MASKmskw_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_test_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPTESTMB" form="k, ymm, ymm" xed="VPTESTMB_MASKmskw_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_test_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPTESTMB" form="k {k}, zmm, zmm" xed="VPTESTMB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_test_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPTESTMB" form="k, zmm, zmm" xed="VPTESTMB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_test_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTMB" form="k {k}, xmm, xmm" xed="VPTESTMB_MASKmskw_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_test_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTMB" form="k, xmm, xmm" xed="VPTESTMB_MASKmskw_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
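+<!-- Illustrative C sketch (not part of the Intel data): VPTESTMB sets mask
+     bit j when (a[j] AND b[j]) is non-zero, so testing against a broadcast
+     flag byte finds every byte with that flag set in one instruction.
+
+#include <immintrin.h>
+
+__mmask64 bytes_with_flag(__m512i bytes, char flag) {
+    return _mm512_test_epi8_mask(bytes, _mm512_set1_epi8(flag));
+}
+-->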
+<intrinsic tech="AVX-512" name="_mm256_mask_test_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTMW" form="k {k}, ymm, ymm" xed="VPTESTMW_MASKmskw_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_test_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTMW" form="k, ymm, ymm" xed="VPTESTMW_MASKmskw_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_test_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPTESTMW" form="k {k}, zmm, zmm" xed="VPTESTMW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_test_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPTESTMW" form="k, zmm, zmm" xed="VPTESTMW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_test_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTMW" form="k {k}, xmm, xmm" xed="VPTESTMW_MASKmskw_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_test_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTMW" form="k, xmm, xmm" xed="VPTESTMW_MASKmskw_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_testn_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k1[j]
+ k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPTESTNMB" form="k {k}, ymm, ymm" xed="VPTESTNMB_MASKmskw_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_testn_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPTESTNMB" form="k, ymm, ymm" xed="VPTESTNMB_MASKmskw_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_testn_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k1[j]
+ k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPTESTNMB" form="k {k}, zmm, zmm" xed="VPTESTNMB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_testn_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+ </operation>
+ <instruction name="VPTESTNMB" form="k, zmm, zmm" xed="VPTESTNMB_MASKmskw_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_testn_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k1[j]
+ k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTNMB" form="k {k}, xmm, xmm" xed="VPTESTNMB_MASKmskw_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_testn_epi8_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTNMB" form="k, xmm, xmm" xed="VPTESTNMB_MASKmskw_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_testn_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k1[j]
+ k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTNMW" form="k {k}, ymm, ymm" xed="VPTESTNMW_MASKmskw_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_testn_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTNMW" form="k, ymm, ymm" xed="VPTESTNMW_MASKmskw_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_testn_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k1[j]
+ k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPTESTNMW" form="k {k}, zmm, zmm" xed="VPTESTNMW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_testn_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+ </operation>
+ <instruction name="VPTESTNMW" form="k, zmm, zmm" xed="VPTESTNMW_MASKmskw_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_testn_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k1[j]
+ k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTNMW" form="k {k}, xmm, xmm" xed="VPTESTNMW_MASKmskw_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_testn_epi16_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTNMW" form="k, xmm, xmm" xed="VPTESTNMW_MASKmskw_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
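+<!--
+  Usage sketch (illustrative, not part of the upstream data): testn of a
+  vector with itself flags the zero elements, since (x AND x) == 0 holds
+  exactly when x == 0. A minimal C example for the 8-bit form, assuming a
+  compiler targeting AVX512BW (e.g. gcc/clang with -mavx512bw) and the
+  GCC/Clang builtin __builtin_ctzll; the helper names are made up here.
+
+  #include <immintrin.h>
+  #include <stdint.h>
+
+  /* Bit j of the result is set exactly when byte j of v is zero. */
+  static inline uint64_t zero_byte_mask(__m512i v) {
+      return _mm512_testn_epi8_mask(v, v);
+  }
+
+  /* Index of the first zero byte in a 64-byte block, or 64 if none. */
+  static inline int first_zero_byte(const void *p) {
+      uint64_t k = zero_byte_mask(_mm512_loadu_si512(p));
+      return k ? (int)__builtin_ctzll(k) : 64;
+  }
+-->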
+<intrinsic tech="AVX-512" name="_mm256_mask_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHBW" form="ymm {k}, ymm, ymm" xed="VPUNPCKHBW_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHBW" form="ymm {z}, ymm, ymm" xed="VPUNPCKHBW_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHBW" form="zmm {k}, zmm, zmm" xed="VPUNPCKHBW_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHBW" form="zmm {z}, zmm, zmm" xed="VPUNPCKHBW_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHBW" form="zmm, zmm, zmm" xed="VPUNPCKHBW_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
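+<!--
+  Worked example (illustrative): VPUNPCKHBW interleaves within each 128-bit
+  lane, not across the whole 512-bit register, which is the usual surprise
+  with the 512-bit unpack intrinsics. A small C demo, assuming AVX512BW
+  support (e.g. gcc/clang with -mavx512bw):
+
+  #include <immintrin.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  int main(void) {
+      uint8_t a[64], b[64], out[64];
+      for (int i = 0; i < 64; i++) { a[i] = (uint8_t)i; b[i] = (uint8_t)(100 + i); }
+      __m512i r = _mm512_unpackhi_epi8(_mm512_loadu_si512(a), _mm512_loadu_si512(b));
+      _mm512_storeu_si512(out, r);
+      /* Lane 0 of r holds a[8..15] interleaved with b[8..15]:
+         out[0]=a[8]=8, out[1]=b[8]=108, out[2]=a[9]=9, ...; lanes 1..3
+         repeat the pattern for bytes 24..31, 40..47 and 56..63. */
+      for (int i = 0; i < 16; i++) printf("%u ", (unsigned)out[i]);
+      printf("\n");
+      return 0;
+  }
+-->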
+<intrinsic tech="AVX-512" name="_mm_mask_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKHBW" form="xmm {k}, xmm, xmm" xed="VPUNPCKHBW_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKHBW" form="xmm {z}, xmm, xmm" xed="VPUNPCKHBW_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHWD" form="ymm {k}, ymm, ymm" xed="VPUNPCKHWD_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHWD" form="ymm {z}, ymm, ymm" xed="VPUNPCKHWD_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHWD" form="zmm {k}, zmm, zmm" xed="VPUNPCKHWD_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHWD" form="zmm {z}, zmm, zmm" xed="VPUNPCKHWD_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHWD" form="zmm, zmm, zmm" xed="VPUNPCKHWD_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKHWD" form="xmm {k}, xmm, xmm" xed="VPUNPCKHWD_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKHWD" form="xmm {z}, xmm, xmm" xed="VPUNPCKHWD_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
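+<!--
+  Usage sketch (illustrative): the mask_ and maskz_ unpack variants differ
+  only in what happens at positions whose mask bit is clear; those elements
+  are taken from "src" or zeroed, respectively. Assuming AVX512VL plus
+  AVX512BW; the function name is made up here.
+
+  #include <immintrin.h>
+
+  static inline __m128i unpackhi_demo(__m128i src, __m128i a, __m128i b) {
+      /* Mask 0x0F keeps words 0..3 of the interleave; words 4..7 come
+         from src in the merged form and are zero in the zeroed form. */
+      __m128i merged = _mm_mask_unpackhi_epi16(src, (__mmask8)0x0F, a, b);
+      __m128i zeroed = _mm_maskz_unpackhi_epi16((__mmask8)0x0F, a, b);
+      return _mm_xor_si128(merged, zeroed);  /* arbitrary combine for the demo */
+  }
+-->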
+<intrinsic tech="AVX-512" name="_mm256_mask_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLBW" form="ymm {k}, ymm, ymm" xed="VPUNPCKLBW_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLBW" form="ymm {z}, ymm, ymm" xed="VPUNPCKLBW_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLBW" form="zmm {k}, zmm, zmm" xed="VPUNPCKLBW_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLBW" form="zmm {z}, zmm, zmm" xed="VPUNPCKLBW_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLBW" form="zmm, zmm, zmm" xed="VPUNPCKLBW_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
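+<!--
+  Usage sketch (illustrative): interleaving with zero is the classic way to
+  zero-extend packed bytes to words. Because the unpacks work per 128-bit
+  lane, "lo" gets bytes 0..7, 16..23, 32..39, 48..55 of "v" and "hi" gets
+  the other half, so the two outputs are lane-interleaved rather than the
+  low and high 32 bytes. Assuming AVX512BW; the helper name is made up here.
+
+  #include <immintrin.h>
+
+  static inline void widen_u8_to_u16(__m512i v, __m512i *lo, __m512i *hi) {
+      const __m512i z = _mm512_setzero_si512();
+      *lo = _mm512_unpacklo_epi8(v, z);  /* zero-extended low bytes per lane */
+      *hi = _mm512_unpackhi_epi8(v, z);  /* zero-extended high bytes per lane */
+  }
+-->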
+<intrinsic tech="AVX-512" name="_mm_mask_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKLBW" form="xmm {k}, xmm, xmm" xed="VPUNPCKLBW_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := tmp_dst[i+7:i]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKLBW" form="xmm {z}, xmm, xmm" xed="VPUNPCKLBW_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLWD" form="ymm {k}, ymm, ymm" xed="VPUNPCKLWD_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLWD" form="ymm {z}, ymm, ymm" xed="VPUNPCKLWD_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLWD" form="zmm {k}, zmm, zmm" xed="VPUNPCKLWD_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLWD" form="zmm {z}, zmm, zmm" xed="VPUNPCKLWD_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLWD" form="zmm, zmm, zmm" xed="VPUNPCKLWD_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKLWD" form="xmm {k}, xmm, xmm" xed="VPUNPCKLWD_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := tmp_dst[i+15:i]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKLWD" form="xmm {z}, xmm, xmm" xed="VPUNPCKLWD_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
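+<!--
+  Usage sketch (illustrative): a common use of the 16-bit unpacks is to zip
+  two coordinate vectors into packed (x, y) pairs. Within each 128-bit lane,
+  unpacklo pairs words 0..3 and unpackhi pairs words 4..7. Assuming
+  AVX512BW; the helper name is made up here.
+
+  #include <immintrin.h>
+
+  static inline void zip_words(__m512i x, __m512i y, __m512i *lo, __m512i *hi) {
+      *lo = _mm512_unpacklo_epi16(x, y);  /* x0,y0,x1,y1,... per lane */
+      *hi = _mm512_unpackhi_epi16(x, y);  /* x4,y4,x5,y5,... per lane */
+  }
+-->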
+<intrinsic tech="AVX-512" name="_mm512_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI16" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Store 512-bits (composed of 32 packed 16-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVDQU16" form="m512, zmm" xed="VMOVDQU16_MEMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI8" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Store 512-bits (composed of 64 packed 8-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVDQU8" form="m512, zmm" xed="VMOVDQU8_MEMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI16" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Store 256-bits (composed of 16 packed 16-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVDQU16" form="m256, ymm" xed="VMOVDQU16_MEMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI8" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Store 256-bits (composed of 32 packed 8-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVDQU8" form="m256, ymm" xed="VMOVDQU8_MEMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI16" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Store 128-bits (composed of 8 packed 16-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="VMOVDQU16" form="m128, xmm" xed="VMOVDQU16_MEMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI8" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Store 128-bits (composed of 16 packed 8-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="VMOVDQU8" form="m128, xmm" xed="VMOVDQU8_MEMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
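+<!--
+  Usage sketch (illustrative): the storeu forms take a plain void* and place
+  no alignment requirement on it. Assuming AVX512BW; the helper name is
+  made up here.
+
+  #include <immintrin.h>
+  #include <stdint.h>
+
+  /* Fill 64 bytes at an arbitrary, possibly unaligned, address. */
+  static inline void fill64(void *dst, uint8_t byte) {
+      _mm512_storeu_epi8(dst, _mm512_set1_epi8((char)byte));
+  }
+-->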
+<intrinsic tech="AVX-512" name="_mm512_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="512"/>
+ <description>Load 512-bits (composed of 32 packed 16-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="zmm, m512" xed="VMOVDQU16_ZMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="512"/>
+ <description>Load 512-bits (composed of 64 packed 8-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="zmm, m512" xed="VMOVDQU8_ZMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="256"/>
+ <description>Load 256-bits (composed of 16 packed 16-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="ymm, m256" xed="VMOVDQU16_YMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="256"/>
+ <description>Load 256-bits (composed of 32 packed 8-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="ymm, m256" xed="VMOVDQU8_YMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_loadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="128"/>
+ <description>Load 128-bits (composed of 8 packed 16-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU16" form="xmm, m128" xed="VMOVDQU16_XMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_loadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI8" memwidth="128"/>
+ <description>Load 128 bits (composed of 16 packed 8-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU8" form="xmm, m128" xed="VMOVDQU8_XMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
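+<!--
+A minimal C sketch of how the unaligned *_loadu_epi8 forms above are
+typically paired with a byte compare; the helper name, the buffer, and the
+companion intrinsics (_mm512_cmpeq_epi8_mask, _mm_popcnt_u64) are
+illustrative, assuming a target with AVX512BW and POPCNT.
+
+#include <immintrin.h>
+#include <stddef.h>
+
+/* Count occurrences of `needle` in one 64-byte chunk; `buf` may sit at
+   any address, which is exactly what VMOVDQU8 permits. */
+static size_t count_byte_512(const unsigned char *buf, unsigned char needle)
+{
+    __m512i chunk = _mm512_loadu_epi8(buf);            /* unaligned load */
+    __m512i ref   = _mm512_set1_epi8((char)needle);
+    __mmask64 eq  = _mm512_cmpeq_epi8_mask(chunk, ref);
+    return (size_t)_mm_popcnt_u64(_cvtmask64_u64(eq));
+}
+-->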
+<intrinsic tech="AVX-512" name="_kadd_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Add 32-bit masks in "a" and "b", and store the result in "k".</description>
+ <operation>
+k[31:0] := a[31:0] + b[31:0]
+k[MAX:32] := 0
+ </operation>
+ <instruction name="KADDD" form="k, k, k" xed="KADDD_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kadd_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Add 64-bit masks in "a" and "b", and store the result in "k".</description>
+ <operation>
+k[63:0] := a[63:0] + b[63:0]
+k[MAX:64] := 0
+ </operation>
+ <instruction name="KADDQ" form="k, k, k" xed="KADDQ_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
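+<!--
+A sketch of the (admittedly niche) KADD ops: treating the k-register as a
+plain integer enables carry tricks on predicates. The helper below is
+illustrative and assumes AVX512BW.
+
+#include <immintrin.h>
+
+/* Collapse the lowest contiguous run of set lanes: adding the lowest set
+   bit ripples a carry through that run, clearing it and setting the bit
+   just above it. */
+static __mmask64 collapse_lowest_run(__mmask64 k)
+{
+    unsigned long long bits   = (unsigned long long)k;
+    unsigned long long lowest = bits & (0ULL - bits);  /* lowest set bit */
+    return _kadd_mask64(k, (__mmask64)lowest);
+}
+-->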
+<intrinsic tech="AVX-512" name="_kand_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 32-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[31:0] := a[31:0] AND b[31:0]
+k[MAX:32] := 0
+ </operation>
+ <instruction name="KANDD" form="k, k, k" xed="KANDD_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kand_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 64-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[63:0] := a[63:0] AND b[63:0]
+k[MAX:64] := 0
+ </operation>
+ <instruction name="KANDQ" form="k, k, k" xed="KANDQ_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kandn_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise NOT of 32-bit mask "a" and then AND with "b", and store the result in "k".</description>
+ <operation>
+k[31:0] := (NOT a[31:0]) AND b[31:0]
+k[MAX:32] := 0
+ </operation>
+ <instruction name="KANDND" form="k, k, k" xed="KANDND_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kandn_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise NOT of 64-bit mask "a" and then AND with "b", and store the result in "k".</description>
+ <operation>
+k[63:0] := (NOT a[63:0]) AND b[63:0]
+k[MAX:64] := 0
+ </operation>
+ <instruction name="KANDNQ" form="k, k, k" xed="KANDNQ_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_knot_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <description>Compute the bitwise NOT of 32-bit mask "a", and store the result in "k".</description>
+ <operation>
+k[31:0] := NOT a[31:0]
+k[MAX:32] := 0
+ </operation>
+ <instruction name="KNOTD" form="k, k" xed="KNOTD_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_knot_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <description>Compute the bitwise NOT of 64-bit mask "a", and store the result in "k".</description>
+ <operation>
+k[63:0] := NOT a[63:0]
+k[MAX:64] := 0
+ </operation>
+ <instruction name="KNOTQ" form="k, k" xed="KNOTQ_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kor_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 32-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[31:0] := a[31:0] OR b[31:0]
+k[MAX:32] := 0
+ </operation>
+ <instruction name="KORD" form="k, k, k" xed="KORD_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kor_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 64-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[63:0] := a[63:0] OR b[63:0]
+k[MAX:64] := 0
+ </operation>
+ <instruction name="KORQ" form="k, k, k" xed="KORQ_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kxnor_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XNOR of 32-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[31:0] := NOT (a[31:0] XOR b[31:0])
+k[MAX:32] := 0
+ </operation>
+ <instruction name="KXNORD" form="k, k, k" xed="KXNORD_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kxnor_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XNOR of 64-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[63:0] := NOT (a[63:0] XOR b[63:0])
+k[MAX:64] := 0
+ </operation>
+ <instruction name="KXNORQ" form="k, k, k" xed="KXNORQ_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kxor_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XOR of 32-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[31:0] := a[31:0] XOR b[31:0]
+k[MAX:32] := 0
+ </operation>
+ <instruction name="KXORD" form="k, k, k" xed="KXORD_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kxor_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XOR of 64-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[63:0] := a[63:0] XOR b[63:0]
+k[MAX:64] := 0
+ </operation>
+ <instruction name="KXORQ" form="k, k, k" xed="KXORQ_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
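+<!--
+A sketch of folding two 32-lane predicates with the k-register logic ops
+above; ka/kb are assumed to come from 32-lane compares such as
+_mm512_cmpgt_epi16_mask, and the helper names are illustrative
+(requires AVX512BW).
+
+#include <immintrin.h>
+
+static __mmask32 in_a_but_not_b(__mmask32 ka, __mmask32 kb)
+{
+    return _kandn_mask32(kb, ka);      /* (NOT kb) AND ka */
+}
+
+static __mmask32 lanes_that_agree(__mmask32 ka, __mmask32 kb)
+{
+    return _kxnor_mask32(ka, kb);      /* 1 where ka and kb match */
+}
+-->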
+<intrinsic tech="AVX-512" name="_kshiftli_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="unsigned int" varname="count" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of 32-bit mask "a" left by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k".</description>
+ <operation>
+k[MAX:0] := 0
+IF count[7:0] &lt;= 31
+ k[31:0] := a[31:0] &lt;&lt; count[7:0]
+FI
+ </operation>
+ <instruction name="KSHIFTLD" form="k, k, imm8" xed="KSHIFTLD_MASKmskw_MASKmskw_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kshiftli_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="unsigned int" varname="count" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of 64-bit mask "a" left by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k".</description>
+ <operation>
+k[MAX:0] := 0
+IF count[7:0] &lt;= 63
+ k[63:0] := a[63:0] &lt;&lt; count[7:0]
+FI
+ </operation>
+ <instruction name="KSHIFTLQ" form="k, k, imm8" xed="KSHIFTLQ_MASKmskw_MASKmskw_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kshiftri_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="unsigned int" varname="count" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of 32-bit mask "a" right by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k".</description>
+ <operation>
+k[MAX:0] := 0
+IF count[7:0] &lt;= 31
+ k[31:0] := a[31:0] &gt;&gt; count[7:0]
+FI
+ </operation>
+ <instruction name="KSHIFTRD" form="k, k, imm8" xed="KSHIFTRD_MASKmskw_MASKmskw_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kshiftri_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="unsigned int" varname="count" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of 64-bit mask "a" right by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k".</description>
+ <operation>
+k[MAX:0] := 0
+IF count[7:0] &lt;= 63
+ k[63:0] := a[63:0] &gt;&gt; count[7:0]
+FI
+ </operation>
+ <instruction name="KSHIFTRQ" form="k, k, imm8" xed="KSHIFTRQ_MASKmskw_MASKmskw_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
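+<!--
+A sketch of the shift-immediate forms: moving a predicate by a fixed lane
+count lets each lane consult a neighbour's verdict. The count operand must
+be an immediate (1 here); the helper name is illustrative, and AVX512BW is
+assumed.
+
+#include <immintrin.h>
+
+static __mmask32 agrees_with_right_neighbour(__mmask32 k)
+{
+    __mmask32 shifted = _kshiftri_mask32(k, 1);  /* bit j gets bit j+1 */
+    return _kxnor_mask32(k, shifted);            /* 1 where they match */
+}
+-->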
+<intrinsic tech="AVX-512" name="_load_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__mmask32*" varname="mem_addr" etype="MASK" memwidth="32"/>
+ <description>Load 32-bit mask from memory into "k".</description>
+ <operation>
+k[31:0] := MEM[mem_addr+31:mem_addr]
+ </operation>
+ <instruction name="KMOVD" form="k, m32" xed="KMOVD_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_load_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Load</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__mmask64*" varname="mem_addr" etype="MASK" memwidth="64"/>
+ <description>Load 64-bit mask from memory into "k".</description>
+ <operation>
+k[63:0] := MEM[mem_addr+63:mem_addr]
+ </operation>
+ <instruction name="KMOVQ" form="k, m64" xed="KMOVQ_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_store_mask32">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__mmask32*" varname="mem_addr" etype="MASK" memwidth="32"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <description>Store 32-bit mask from "a" into memory.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+ </operation>
+ <instruction name="KMOVD" form="m32, k" xed="KMOVD_MEMu32_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_store_mask64">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__mmask64*" varname="mem_addr" etype="MASK" memwidth="64"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <description>Store 64-bit mask from "a" into memory.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+ </operation>
+ <instruction name="KMOVQ" form="m64, k" xed="KMOVQ_MEMu64_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
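+<!--
+A sketch of spilling a k-register through memory with the load/store pair
+above, e.g. to hand a predicate to a later pass; assumes AVX512BW.
+
+#include <immintrin.h>
+
+static __mmask64 round_trip_mask(__mmask64 k)
+{
+    __mmask64 slot;
+    _store_mask64(&slot, k);     /* KMOVQ m64, k */
+    return _load_mask64(&slot);  /* KMOVQ k, m64 */
+}
+-->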
+<intrinsic tech="AVX-512" name="_kortest_mask32_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <parameter type="unsigned char*" varname="all_ones" etype="UI8" memwidth="8"/>
+ <description>Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones".</description>
+ <operation>
+tmp[31:0] := a[31:0] OR b[31:0]
+IF tmp[31:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+IF tmp[31:0] == 0xFFFFFFFF
+ MEM[all_ones+7:all_ones] := 1
+ELSE
+ MEM[all_ones+7:all_ones] := 0
+FI
+ </operation>
+ <instruction name="KORTESTD" form="k, k" xed="KORTESTD_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortestz_mask32_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[31:0] := a[31:0] OR b[31:0]
+IF tmp[31:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KORTESTD" form="k, k" xed="KORTESTD_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortestc_mask32_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[31:0] := a[31:0] OR b[31:0]
+IF tmp[31:0] == 0xFFFFFFFF
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KORTESTD" form="k, k" xed="KORTESTD_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortest_mask64_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <parameter type="unsigned char*" varname="all_ones" etype="UI8" memwidth="8"/>
+ <description>Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones".</description>
+ <operation>
+tmp[63:0] := a[63:0] OR b[63:0]
+IF tmp[63:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+IF tmp[63:0] == 0xFFFFFFFFFFFFFFFF
+ MEM[all_ones+7:all_ones] := 1
+ELSE
+ MEM[all_ones+7:all_ones] := 0
+FI
+ </operation>
+ <instruction name="KORTESTQ" form="k, k" xed="KORTESTQ_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortestz_mask64_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[63:0] := a[63:0] OR b[63:0]
+IF tmp[63:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KORTESTQ" form="k, k" xed="KORTESTQ_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortestc_mask64_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[63:0] := a[63:0] OR b[63:0]
+IF tmp[63:0] == 0xFFFFFFFFFFFFFFFF
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KORTESTQ" form="k, k" xed="KORTESTQ_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
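+<!--
+A sketch of the KORTEST idiom: fold two masks and branch on all-zero or
+all-ones in a single test, a common early exit in vectorised scans.
+Helper names are illustrative; requires AVX512BW.
+
+#include <immintrin.h>
+
+static int any_lane_set(__mmask32 lo, __mmask32 hi)
+{
+    /* _kortestz_* returns 1 when (lo OR hi) == 0, so invert it. */
+    return !_kortestz_mask32_u8(lo, hi);
+}
+
+static int every_lane_set(__mmask32 lo, __mmask32 hi)
+{
+    return _kortestc_mask32_u8(lo, hi);  /* 1 when the OR is all ones */
+}
+-->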
+<intrinsic tech="AVX-512" name="_ktest_mask32_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <parameter type="unsigned char*" varname="and_not" etype="UI8" memwidth="8"/>
+ <description>Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b"; if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not".</description>
+ <operation>
+tmp1[31:0] := a[31:0] AND b[31:0]
+IF tmp1[31:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+tmp2[31:0] := (NOT a[31:0]) AND b[31:0]
+IF tmp2[31:0] == 0x0
+ MEM[and_not+7:and_not] := 1
+ELSE
+ MEM[and_not+7:and_not] := 0
+FI
+ </operation>
+ <instruction name="KTESTD" form="k, k" xed="KTESTD_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktestz_mask32_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[31:0] := a[31:0] AND b[31:0]
+IF tmp[31:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KTESTD" form="k, k" xed="KTESTD_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktestc_mask32_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <parameter type="__mmask32" varname="b" etype="MASK"/>
+ <description>Compute the bitwise NOT of 32-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[31:0] := (NOT a[31:0]) AND b[31:0]
+IF tmp[31:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KTESTD" form="k, k" xed="KTESTD_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktest_mask64_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <parameter type="unsigned char*" varname="and_not" etype="UI8" memwidth="8"/>
+ <description>Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b"; if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not".</description>
+ <operation>
+tmp1[63:0] := a[63:0] AND b[63:0]
+IF tmp1[63:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+tmp2[63:0] := (NOT a[63:0]) AND b[63:0]
+IF tmp2[63:0] == 0x0
+ MEM[and_not+7:and_not] := 1
+ELSE
+ MEM[and_not+7:and_not] := 0
+FI
+ </operation>
+ <instruction name="KTESTQ" form="k, k" xed="KTESTQ_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktestz_mask64_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[63:0] := a[63:0] AND b[63:0]
+IF tmp[63:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KTESTQ" form="k, k" xed="KTESTQ_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktestc_mask64_u8">
+ <type>Mask</type>
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <parameter type="__mmask64" varname="b" etype="MASK"/>
+ <description>Compute the bitwise NOT of 64-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[63:0] := (NOT a[63:0]) AND b[63:0]
+IF tmp[63:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KTESTQ" form="k, k" xed="KTESTQ_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
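+<!--
+A sketch of what KTEST buys over plain mask arithmetic: disjointness and
+subset checks in one instruction. Helper names are illustrative; requires
+AVX512BW.
+
+#include <immintrin.h>
+
+static int masks_disjoint(__mmask64 a, __mmask64 b)
+{
+    return _ktestz_mask64_u8(a, b);   /* 1 when a AND b == 0 */
+}
+
+static int b_is_subset_of_a(__mmask64 a, __mmask64 b)
+{
+    /* (NOT a) AND b == 0 exactly when every set bit of b is set in a. */
+    return _ktestc_mask64_u8(a, b);
+}
+-->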
+<intrinsic tech="AVX-512" name="_cvtmask32_u32">
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask32" varname="a" etype="MASK"/>
+ <description>Convert 32-bit mask "a" into an integer value, and store the result in "dst".</description>
+ <operation>
+dst := ZeroExtend32(a[31:0])
+ </operation>
+ <instruction name="KMOVD" form="r32, k" xed="KMOVD_GPR32u32_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_cvtmask64_u64">
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask64" varname="a" etype="MASK"/>
+ <description>Convert 64-bit mask "a" into an integer value, and store the result in "dst".</description>
+ <operation>
+dst := ZeroExtend64(a[63:0])
+ </operation>
+ <instruction name="KMOVQ" form="r64, k" xed="KMOVQ_GPR64u64_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_cvtu32_mask32">
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Convert integer value "a" into an 32-bit mask, and store the result in "k".</description>
+ <operation>
+k := ZeroExtend32(a[31:0])
+ </operation>
+ <instruction name="KMOVD" form="k, r32" xed="KMOVD_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_cvtu64_mask64">
+ <CPUID>AVX512BW</CPUID>
+ <category>Mask</category>
+ <return type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Convert integer value "a" into an 64-bit mask, and store the result in "k".</description>
+ <operation>
+k := ZeroExtend64(a[63:0])
+ </operation>
+ <instruction name="KMOVQ" form="k, r64" xed="KMOVQ_MASKmskw_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
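+<!--
+A sketch of the mask/integer converts: once a predicate is an ordinary
+integer, scalar bit tricks apply, e.g. visiting each set lane. Assumes
+AVX512BW plus BMI1 for _tzcnt_u32; the callback shape is illustrative.
+
+#include <immintrin.h>
+
+static void for_each_set_lane(__mmask32 k, void (*fn)(unsigned))
+{
+    unsigned int bits = _cvtmask32_u32(k);  /* KMOVD r32, k */
+    while (bits) {
+        fn(_tzcnt_u32(bits));               /* index of lowest set lane */
+        bits &= bits - 1;                   /* clear that lane's bit */
+    }
+}
+-->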
+<intrinsic tech="AVX-512" name="_mm256_broadcastmb_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Broadcast the low 8 bits from input mask "k" to all 64-bit elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ZeroExtend64(k[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTMB2Q" form="ymm" xed="VPBROADCASTMB2Q_YMMu64_MASKu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_broadcastmb_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Broadcast the low 8 bits from input mask "k" to all 64-bit elements of "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ZeroExtend64(k[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTMB2Q" form="xmm" xed="VPBROADCASTMB2Q_XMMu64_MASKu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_broadcastmw_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Broadcast the low 16 bits from input mask "k" to all 32-bit elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ZeroExtend32(k[15:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTMW2D" form="ymm" xed="VPBROADCASTMW2D_YMMu32_MASKu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_broadcastmw_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Broadcast the low 16 bits from input mask "k" to all 32-bit elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ZeroExtend32(k[15:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTMW2D" form="xmm" xed="VPBROADCASTMW2D_XMMu32_MASKu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
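+<!--
+A sketch of VPBROADCASTMB2Q: it materialises a k-register into every
+64-bit lane, useful when a predicate must feed vector arithmetic rather
+than act as a writemask. Requires AVX512CD and AVX512VL.
+
+#include <immintrin.h>
+
+static __m256i mask_in_every_qword(__mmask8 k)
+{
+    return _mm256_broadcastmb_epi64(k);  /* each lane = ZeroExtend64(k) */
+}
+-->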
+<intrinsic tech="AVX-512" name="_mm256_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ FOR k := 0 to j-1
+ m := k*32
+ dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="ymm, ymm" xed="VPCONFLICTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*32
+ dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="ymm {k}, ymm" xed="VPCONFLICTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*32
+ dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="ymm {z}, ymm" xed="VPCONFLICTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ FOR k := 0 to j-1
+ m := k*32
+ dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="xmm, xmm" xed="VPCONFLICTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*32
+ dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="xmm {k}, xmm" xed="VPCONFLICTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*32
+ dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="xmm {z}, xmm" xed="VPCONFLICTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ FOR k := 0 to j-1
+ m := k*64
+ dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="ymm, ymm" xed="VPCONFLICTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*64
+ dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="ymm {k}, ymm" xed="VPCONFLICTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*64
+ dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="ymm {z}, ymm" xed="VPCONFLICTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ FOR k := 0 to j-1
+ m := k*64
+ dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="xmm, xmm" xed="VPCONFLICTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*64
+ dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="xmm {k}, xmm" xed="VPCONFLICTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*64
+ dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="xmm {z}, xmm" xed="VPCONFLICTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
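+<!--
+A sketch of the classic VPCONFLICTD use: guarding vectorised histogram or
+scatter updates. A lane whose conflict word is zero collides with no lower
+lane and is safe to update this round. `idx` holds eight bucket indices;
+the helper name is illustrative. Requires AVX512CD, AVX512VL and AVX512F.
+
+#include <immintrin.h>
+
+static __mmask8 lanes_safe_to_scatter(__m256i idx)
+{
+    __m256i conf = _mm256_conflict_epi32(idx);
+    return _mm256_cmpeq_epi32_mask(conf, _mm256_setzero_si256());
+}
+-->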
+<intrinsic tech="AVX-512" name="_mm256_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="ymm, ymm" xed="VPLZCNTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="ymm {k}, ymm" xed="VPLZCNTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="ymm {z}, ymm" xed="VPLZCNTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="xmm, xmm" xed="VPLZCNTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="xmm {k}, xmm" xed="VPLZCNTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="xmm {z}, xmm" xed="VPLZCNTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="ymm, ymm" xed="VPLZCNTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="ymm {k}, ymm" xed="VPLZCNTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="ymm {z}, ymm" xed="VPLZCNTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="xmm, xmm" xed="VPLZCNTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="xmm {k}, xmm" xed="VPLZCNTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="xmm {z}, xmm" xed="VPLZCNTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
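+<!--
+A sketch of a common VPLZCNTD application: per-lane floor(log2(x)) via the
+identity floor(log2(x)) = 31 - lzcnt(x) for x > 0. Lanes holding 0 yield
+-1 here (lzcnt reports 32), which callers must treat as a sentinel.
+Requires AVX512CD and AVX512VL.
+
+#include <immintrin.h>
+
+static __m256i log2_floor_epi32(__m256i x)
+{
+    __m256i lz = _mm256_lzcnt_epi32(x);  /* 32 when a lane is 0 */
+    return _mm256_sub_epi32(_mm256_set1_epi32(31), lz);
+}
+-->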
+<intrinsic tech="AVX-512" name="_mm512_broadcastmb_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Broadcast the low 8 bits from input mask "k" to all 64-bit elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ZeroExtend64(k[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTMB2Q" form="zmm" xed="VPBROADCASTMB2Q_ZMMu64_MASKu64_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcastmw_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Broadcast the low 16 bits from input mask "k" to all 32-bit elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ZeroExtend32(k[15:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTMW2D" form="zmm" xed="VPBROADCASTMW2D_ZMMu32_MASKu32_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ FOR k := 0 to j-1
+ m := k*32
+ dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="zmm, zmm" xed="VPCONFLICTD_ZMMu32_MASKmskw_ZMMu32_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*32
+ dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="zmm {k}, zmm" xed="VPCONFLICTD_ZMMu32_MASKmskw_ZMMu32_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_conflict_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*32
+ dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+ ENDFOR
+ dst[i+31:i+j] := 0
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCONFLICTD" form="zmm {z}, zmm" xed="VPCONFLICTD_ZMMu32_MASKmskw_ZMMu32_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ FOR k := 0 to j-1
+ m := k*64
+ dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="zmm, zmm" xed="VPCONFLICTQ_ZMMu64_MASKmskw_ZMMu64_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*64
+ dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="zmm {k}, zmm" xed="VPCONFLICTQ_ZMMu64_MASKmskw_ZMMu64_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_conflict_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Compare</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+  <description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero-extended bit vector in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ FOR l := 0 to j-1
+ m := l*64
+ dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+ ENDFOR
+ dst[i+63:i+j] := 0
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCONFLICTQ" form="zmm {z}, zmm" xed="VPCONFLICTQ_ZMMu64_MASKmskw_ZMMu64_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
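+<!--
+  A hedged sketch of the classic use of the conflict intrinsics: detecting duplicate
+  64-bit indices (e.g. before a scatter, where later lanes must not overwrite earlier
+  ones). Assumes AVX512F + AVX512CD; the helper name is illustrative.
+
+  #include <immintrin.h>
+  #include <stdbool.h>
+
+  // true if any lane of idx repeats a lane closer to the least significant end
+  static bool has_duplicate_indices(__m512i idx) {
+      __m512i conf = _mm512_conflict_epi64(idx);
+      // a nonzero conflict word flags a repeated value
+      return _mm512_test_epi64_mask(conf, conf) != 0;
+  }
+
+  int main(void) {
+      __m512i idx = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 3);  // lane 7 repeats lane 3
+      return has_duplicate_indices(idx) ? 0 : 1;
+  }
+-->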
+<intrinsic tech="AVX-512" name="_mm512_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+  <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="zmm, zmm" xed="VPLZCNTD_ZMMu32_MASKmskw_ZMMu32_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+  <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="zmm {k}, zmm" xed="VPLZCNTD_ZMMu32_MASKmskw_ZMMu32_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_lzcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+  <description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp := 31
+ dst[i+31:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+31:i] := dst[i+31:i] + 1
+ OD
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPLZCNTD" form="zmm {z}, zmm" xed="VPLZCNTD_ZMMu32_MASKmskw_ZMMu32_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+  <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="zmm, zmm" xed="VPLZCNTQ_ZMMu64_MASKmskw_ZMMu64_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+  <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="zmm {k}, zmm" xed="VPLZCNTQ_ZMMu64_MASKmskw_ZMMu64_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_lzcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512CD</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+  <description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp := 63
+ dst[i+63:i] := 0
+ DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+ tmp := tmp - 1
+ dst[i+63:i] := dst[i+63:i] + 1
+ OD
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPLZCNTQ" form="zmm {z}, zmm" xed="VPLZCNTQ_ZMMu64_MASKmskw_ZMMu64_AVX512CD"/>
+ <header>immintrin.h</header>
+</intrinsic>
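+<!--
+  A small C sketch of a common VPLZCNT application: floor(log2(x)) for nonzero
+  lanes is 31 minus the leading-zero count. Assumes AVX512F + AVX512CD; values
+  are illustrative.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m512i x = _mm512_setr_epi32(1, 2, 3, 4, 255, 256, 1024, 65535,
+                                    1, 1, 1, 1, 1, 1, 1, 1);
+      __m512i lz = _mm512_lzcnt_epi32(x);
+      // floor(log2(x)) = 31 - lzcnt(x), valid for x > 0
+      __m512i log2x = _mm512_sub_epi32(_mm512_set1_epi32(31), lz);
+
+      int out[16];
+      _mm512_storeu_si512(out, log2x);
+      printf("floor(log2(256)) = %d\n", out[5]);  // prints 8
+      return 0;
+  }
+-->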
+<intrinsic tech="AVX-512" name="_mm256_mask_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDNPD" form="ymm {k}, ymm, ymm" xed="VANDNPD_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDNPD" form="ymm {z}, ymm, ymm" xed="VANDNPD_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDNPD" form="zmm, zmm, zmm" xed="VANDNPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDNPD" form="zmm {k}, zmm, zmm" xed="VANDNPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDNPD" form="zmm {z}, zmm, zmm" xed="VANDNPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VANDNPD" form="xmm {k}, xmm, xmm" xed="VANDNPD_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VANDNPD" form="xmm {z}, xmm, xmm" xed="VANDNPD_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDNPS" form="ymm {k}, ymm, ymm" xed="VANDNPS_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDNPS" form="ymm {z}, ymm, ymm" xed="VANDNPS_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDNPS" form="zmm, zmm, zmm" xed="VANDNPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDNPS" form="zmm {k}, zmm, zmm" xed="VANDNPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDNPS" form="zmm {z}, zmm, zmm" xed="VANDNPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VANDNPS" form="xmm {k}, xmm, xmm" xed="VANDNPS_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VANDNPS" form="xmm {z}, xmm, xmm" xed="VANDNPS_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
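+<!--
+  A minimal sketch of the standard ANDNOT idiom: since the operation is
+  (NOT a) AND b, passing -0.0f (sign bit only) as "a" clears every sign bit,
+  giving a branch-free absolute value. Assumes AVX512DQ for the 512-bit form.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m512 x = _mm512_set1_ps(-3.5f);
+      // (NOT sign_mask) AND x keeps everything except the sign bit
+      __m512 abs_x = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), x);
+
+      float out[16];
+      _mm512_storeu_ps(out, abs_x);
+      printf("%f\n", out[0]);  // 3.500000
+      return 0;
+  }
+-->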
+<intrinsic tech="AVX-512" name="_mm256_mask_and_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDPD" form="ymm {k}, ymm, ymm" xed="VANDPD_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_and_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDPD" form="ymm {z}, ymm, ymm" xed="VANDPD_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_and_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDPD" form="zmm, zmm, zmm" xed="VANDPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_and_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDPD" form="zmm {k}, zmm, zmm" xed="VANDPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_and_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDPD" form="zmm {z}, zmm, zmm" xed="VANDPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_and_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VANDPD" form="xmm {k}, xmm, xmm" xed="VANDPD_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_and_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VANDPD" form="xmm {z}, xmm, xmm" xed="VANDPD_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_and_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDPS" form="ymm {k}, ymm, ymm" xed="VANDPS_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_and_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VANDPS" form="ymm {z}, ymm, ymm" xed="VANDPS_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_and_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDPS" form="zmm, zmm, zmm" xed="VANDPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_and_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDPS" form="zmm {k}, zmm, zmm" xed="VANDPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_and_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VANDPS" form="zmm {z}, zmm, zmm" xed="VANDPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_and_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VANDPS" form="xmm {k}, xmm, xmm" xed="VANDPS_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_and_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VANDPS" form="xmm {z}, xmm, xmm" xed="VANDPS_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
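+<!--
+  A companion sketch for the AND intrinsics: the same bit-level view lets a
+  plain AND isolate the sign bits of double lanes (useful for copysign-style
+  tricks). Assumes AVX512DQ; values are illustrative.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m512d x = _mm512_setr_pd(-1.0, 2.0, -3.0, 4.0, -5.0, 6.0, -7.0, 8.0);
+      // AND with -0.0 (sign bit only) keeps just each lane's sign
+      __m512d signs = _mm512_and_pd(x, _mm512_set1_pd(-0.0));
+
+      double out[8];
+      _mm512_storeu_pd(out, signs);
+      printf("%f %f\n", out[0], out[1]);  // -0.000000 0.000000
+      return 0;
+  }
+-->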
+<intrinsic tech="AVX-512" name="_mm256_broadcast_f32x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 2)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X2" form="ymm, xmm" xed="VBROADCASTF32X2_YMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_broadcast_f32x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X2" form="ymm {k}, xmm" xed="VBROADCASTF32X2_YMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_broadcast_f32x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X2" form="ymm {z}, xmm" xed="VBROADCASTF32X2_YMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcast_f32x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 2)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X2" form="zmm, xmm" xed="VBROADCASTF32X2_ZMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcast_f32x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X2" form="zmm {k}, xmm" xed="VBROADCASTF32X2_ZMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcast_f32x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X2" form="zmm {z}, xmm" xed="VBROADCASTF32X2_ZMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
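+<!--
+  A short C sketch for the f32x2 broadcasts: repeating a two-element pattern,
+  e.g. an (re, im) pair for interleaved complex arithmetic. Assumes AVX512DQ;
+  only the lower two floats of the __m128 argument matter.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128 pair = _mm_setr_ps(1.0f, 2.0f, 0.0f, 0.0f);
+      __m512 v = _mm512_broadcast_f32x2(pair);  // 1,2,1,2,... across 16 lanes
+
+      float out[16];
+      _mm512_storeu_ps(out, v);
+      printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
+      return 0;
+  }
+-->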
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_broadcast_f32x8">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 8)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X8" form="zmm, m256" xed="VBROADCASTF32X8_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_mask_broadcast_f32x8">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 8)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X8" form="zmm {k}, m256" xed="VBROADCASTF32X8_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_maskz_broadcast_f32x8">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 8)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X8" form="zmm {z}, m256" xed="VBROADCASTF32X8_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
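+<!--
+  A sketch for the f32x8 broadcast: duplicating one 256-bit vector into both
+  halves of a 512-bit register, e.g. to reuse an 8-entry table in each lane
+  group. Assumes AVX512DQ.
+
+  #include <immintrin.h>
+
+  int main(void) {
+      float eight[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+      __m256 lo = _mm256_loadu_ps(eight);
+      __m512 v = _mm512_broadcast_f32x8(lo);  // both halves hold the same 8 floats
+
+      float out[16];
+      _mm512_storeu_ps(out, v);
+      return out[8] == 0.0f ? 0 : 1;  // lane 8 repeats lane 0
+  }
+-->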
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_broadcast_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ n := (j % 2)*64
+ dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X2" form="ymm, m128" xed="VBROADCASTF64X2_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_mask_broadcast_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ n := (j % 2)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X2" form="ymm {k}, m128" xed="VBROADCASTF64X2_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_maskz_broadcast_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ n := (j % 2)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X2" form="ymm {z}, m128" xed="VBROADCASTF64X2_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_broadcast_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 2)*64
+ dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X2" form="zmm, m128" xed="VBROADCASTF64X2_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_mask_broadcast_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 2)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X2" form="zmm {k}, m128" xed="VBROADCASTF64X2_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_maskz_broadcast_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 2)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X2" form="zmm {z}, m128" xed="VBROADCASTF64X2_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
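+<!--
+  A sketch of the masked f64x2 broadcast: filling selected 128-bit pairs with a
+  2-element vector while keeping "src" elsewhere. Assumes AVX512DQ; the mask
+  value is illustrative.
+
+  #include <immintrin.h>
+
+  int main(void) {
+      __m128d xy = _mm_setr_pd(3.0, 4.0);
+      __m512d base = _mm512_set1_pd(0.0);
+      // 0x33 selects lanes 0,1 and 4,5; the other lanes keep base
+      __m512d v = _mm512_mask_broadcast_f64x2(base, 0x33, xy);
+
+      double out[8];
+      _mm512_storeu_pd(out, v);
+      return (out[1] == 4.0 && out[2] == 0.0) ? 0 : 1;
+  }
+-->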
+<intrinsic tech="AVX-512" name="_mm256_broadcast_i32x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+  <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 2)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="ymm, xmm" xed="VBROADCASTI32X2_YMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_broadcast_i32x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="ymm {k}, xmm" xed="VBROADCASTI32X2_YMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_broadcast_i32x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="ymm {z}, xmm" xed="VBROADCASTI32X2_YMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcast_i32x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+  <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 2)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="zmm, xmm" xed="VBROADCASTI32X2_ZMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcast_i32x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="zmm {k}, xmm" xed="VBROADCASTI32X2_ZMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcast_i32x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="zmm {z}, xmm" xed="VBROADCASTI32X2_ZMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_broadcast_i32x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+  <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ n := (j % 2)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="xmm, xmm" xed="VBROADCASTI32X2_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_broadcast_i32x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="xmm {k}, xmm" xed="VBROADCASTI32X2_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_broadcast_i32x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ n := (j % 2)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X2" form="xmm {z}, xmm" xed="VBROADCASTI32X2_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
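+<!--
+  The integer i32x2 broadcasts mirror the f32x2 forms; a minimal zeromask sketch
+  (AVX512DQ assumed, mask value illustrative):
+
+  #include <immintrin.h>
+
+  int main(void) {
+      __m128i pair = _mm_setr_epi32(10, 20, 0, 0);
+      // keep only the first three pairs (lanes 0..5); the rest are zeroed
+      __m512i v = _mm512_maskz_broadcast_i32x2(0x003F, pair);
+
+      int out[16];
+      _mm512_storeu_si512(out, v);
+      return (out[5] == 20 && out[6] == 0) ? 0 : 1;
+  }
+-->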
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_broadcast_i32x8">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 8)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X8" form="zmm, m256" xed="VBROADCASTI32X8_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_mask_broadcast_i32x8">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 8)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X8" form="zmm {k}, m256" xed="VBROADCASTI32X8_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_maskz_broadcast_i32x8">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 8)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X8" form="zmm {z}, m256" xed="VBROADCASTI32X8_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_broadcast_i64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ n := (j % 2)*64
+ dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X2" form="ymm, m128" xed="VBROADCASTI64X2_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_mask_broadcast_i64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ n := (j % 2)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X2" form="ymm {k}, m128" xed="VBROADCASTI64X2_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_maskz_broadcast_i64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ n := (j % 2)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X2" form="ymm {z}, m128" xed="VBROADCASTI64X2_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_broadcast_i64x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 2)*64
+ dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X2" form="zmm, m128" xed="VBROADCASTI64X2_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_mask_broadcast_i64x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 2)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X2" form="zmm {k}, m128" xed="VBROADCASTI64X2_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_maskz_broadcast_i64x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 2)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X2" form="zmm {z}, m128" xed="VBROADCASTI64X2_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
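+<!-- Usage sketch (editorial note, not part of the upstream Intel data): the
+     zeromasked VBROADCASTI64X2 form in C, per the IF k[j] pseudocode above;
+     assumes AVX512DQ, names are illustrative.
+
+         __m128i pair = _mm_set_epi64x(1, 0);
+         /* 0x0F keeps elements 0..3; elements 4..7 are zeroed. */
+         __m512i dst  = _mm512_maskz_broadcast_i64x2((__mmask8)0x0F, pair);
+-->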
+<intrinsic tech="AVX-512" name="_mm256_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="ymm, ymm" xed="VCVTPD2QQ_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="ymm {k}, ymm" xed="VCVTPD2QQ_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="ymm {z}, ymm" xed="VCVTPD2QQ_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="zmm, zmm {er}" xed="VCVTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
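+<!-- Usage sketch (editorial note, not part of the upstream Intel data): the
+     [round_note] placeholder refers to the _MM_FROUND constants named by
+     immtype above; a typical call pins the rounding mode and suppresses
+     exceptions. Assumes AVX512DQ; names are illustrative.
+
+         __m512d x   = _mm512_set1_pd(2.5);
+         __m512i dst = _mm512_cvt_roundpd_epi64(
+             x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+-->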
+<intrinsic tech="AVX-512" name="_mm512_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="zmm, zmm" xed="VCVTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="zmm {k}, zmm {er}" xed="VCVTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="zmm {k}, zmm" xed="VCVTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="zmm {z}, zmm {er}" xed="VCVTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="zmm {z}, zmm" xed="VCVTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="xmm, xmm" xed="VCVTPD2QQ_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="xmm {k}, xmm" xed="VCVTPD2QQ_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2QQ" form="xmm {z}, xmm" xed="VCVTPD2QQ_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
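+<!-- Usage sketch (editorial note, not part of the upstream Intel data): the
+     mask_ variants of VCVTPD2QQ blend converted elements with "src" per the
+     pseudocode above. Assumes AVX512DQ and AVX512VL; names are illustrative.
+
+         __m256d a   = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
+         __m256i src = _mm256_set1_epi64x(-1);
+         /* 0x5 = 0b0101: convert elements 0 and 2, keep src in 1 and 3. */
+         __m256i dst = _mm256_mask_cvtpd_epi64(src, (__mmask8)0x5, a);
+-->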
+<intrinsic tech="AVX-512" name="_mm256_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="ymm, ymm" xed="VCVTPD2UQQ_YMMu64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="ymm {k}, ymm" xed="VCVTPD2UQQ_YMMu64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="ymm {z}, ymm" xed="VCVTPD2UQQ_YMMu64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="zmm, zmm {er}" xed="VCVTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="zmm, zmm" xed="VCVTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="zmm {k}, zmm {er}" xed="VCVTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="zmm {k}, zmm" xed="VCVTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="zmm {z}, zmm {er}" xed="VCVTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="zmm {z}, zmm" xed="VCVTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="xmm, xmm" xed="VCVTPD2UQQ_XMMu64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="xmm {k}, xmm" xed="VCVTPD2UQQ_XMMu64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2UQQ" form="xmm {z}, xmm" xed="VCVTPD2UQQ_XMMu64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
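+<!-- Usage sketch (editorial note, not part of the upstream Intel data): the
+     VCVTPD2UQQ family follows the same pattern as VCVTPD2QQ but targets the
+     unsigned 64-bit range; the non-{er} form rounds per the current MXCSR
+     mode. Assumes AVX512DQ; names are illustrative.
+
+         __m512d a   = _mm512_set1_pd(3.9);
+         __m512i dst = _mm512_cvtpd_epu64(a);
+-->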
+<intrinsic tech="AVX-512" name="_mm256_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="ymm, xmm" xed="VCVTPS2QQ_YMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="ymm {k}, xmm" xed="VCVTPS2QQ_YMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="ymm {z}, xmm" xed="VCVTPS2QQ_YMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="zmm, ymm {er}" xed="VCVTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="zmm, ymm" xed="VCVTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="zmm {k}, ymm {er}" xed="VCVTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="zmm {k}, ymm" xed="VCVTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="zmm {z}, ymm {er}" xed="VCVTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="zmm {z}, ymm" xed="VCVTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="xmm, xmm" xed="VCVTPS2QQ_XMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="xmm {k}, xmm" xed="VCVTPS2QQ_XMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2QQ" form="xmm {z}, xmm" xed="VCVTPS2QQ_XMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
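+<!-- Usage sketch (editorial note, not part of the upstream Intel data): note
+     the index mapping in the VCVTPS2QQ pseudocode (i := j*64, l := j*32);
+     each 32-bit source lane widens to a 64-bit destination lane, so a __m256
+     of 8 floats fills a __m512i. Assumes AVX512DQ; names are illustrative.
+
+         __m256  f   = _mm256_set1_ps(1.5f);
+         __m512i dst = _mm512_cvtps_epi64(f);
+-->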
+<intrinsic tech="AVX-512" name="_mm256_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="ymm, xmm" xed="VCVTPS2UQQ_YMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="ymm {k}, xmm" xed="VCVTPS2UQQ_YMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="ymm {z}, xmm" xed="VCVTPS2UQQ_YMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="zmm, ymm {er}" xed="VCVTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="zmm, ymm" xed="VCVTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="zmm {k}, ymm {er}" xed="VCVTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="zmm {k}, ymm" xed="VCVTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="zmm {z}, ymm {er}" xed="VCVTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="zmm {z}, ymm" xed="VCVTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="xmm, xmm" xed="VCVTPS2UQQ_XMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="xmm {k}, xmm" xed="VCVTPS2UQQ_XMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2UQQ" form="xmm {z}, xmm" xed="VCVTPS2UQQ_XMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
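+<!-- Usage sketch (editorial note, not part of the upstream Intel data): the
+     128-bit zeromasked VCVTPS2UQQ form; only the low two floats of "a"
+     participate. Assumes AVX512DQ and AVX512VL; names are illustrative.
+
+         __m128  f   = _mm_set_ps(0.0f, 0.0f, 7.0f, 3.0f);
+         __m128i dst = _mm_maskz_cvtps_epu64((__mmask8)0x3, f);
+-->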
+<intrinsic tech="AVX-512" name="_mm256_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="ymm, ymm" xed="VCVTQQ2PD_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="ymm {k}, ymm" xed="VCVTQQ2PD_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="ymm {z}, ymm" xed="VCVTQQ2PD_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="zmm, zmm {er}" xed="VCVTQQ2PD_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="zmm, zmm" xed="VCVTQQ2PD_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="zmm {k}, zmm {er}" xed="VCVTQQ2PD_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="zmm {k}, zmm" xed="VCVTQQ2PD_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="zmm {z}, zmm {er}" xed="VCVTQQ2PD_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="zmm {z}, zmm" xed="VCVTQQ2PD_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="xmm, xmm" xed="VCVTQQ2PD_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="xmm {k}, xmm" xed="VCVTQQ2PD_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTQQ2PD" form="xmm {z}, xmm" xed="VCVTQQ2PD_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="xmm, ymm" xed="VCVTQQ2PS_XMMf32_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
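+<!-- Example (C sketch, not from the Intel data): four 64-bit lanes narrow to
+     four FP32 lanes, so a 256-bit integer input yields a 128-bit float result;
+     magnitudes above 2^24 may round. Assumes AVX512DQ+AVX512VL.
+#include <immintrin.h>
+
+__m128 quad_to_ps(__m256i a) {
+    return _mm256_cvtepi64_ps(a);   /* __m256i (4 x i64) -> __m128 (4 x f32) */
+}
+-->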
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="xmm {k}, ymm" xed="VCVTQQ2PS_XMMf32_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="xmm {z}, ymm" xed="VCVTQQ2PS_XMMf32_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="ymm, zmm {er}" xed="VCVTQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="ymm, zmm" xed="VCVTQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
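+<!-- Example (C sketch, not from the Intel data) of the 512-bit form: eight i64
+     lanes narrow to a __m256 of eight f32 lanes. Assumes AVX512DQ; helper name
+     is ours.
+#include <immintrin.h>
+
+void epi64_to_f32(const long long in[8], float out[8]) {
+    __m512i a = _mm512_loadu_si512((const void *)in);
+    _mm256_storeu_ps(out, _mm512_cvtepi64_ps(a));
+}
+-->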
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="ymm {k}, zmm {er}" xed="VCVTQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="ymm {k}, zmm" xed="VCVTQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="ymm {z}, zmm {er}" xed="VCVTQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="ymm {z}, zmm" xed="VCVTQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="xmm, xmm" xed="VCVTQQ2PS_XMMf32_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="xmm {k}, xmm" xed="VCVTQQ2PS_XMMf32_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTQQ2PS" form="xmm {z}, xmm" xed="VCVTQQ2PS_XMMf32_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
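+<!-- Example (C sketch, not from the Intel data) of the zeromask form above:
+     only the low two f32 lanes carry results, and per the pseudocode the upper
+     64 bits of the xmm are zeroed as well. Assumes AVX512DQ+AVX512VL; the mask
+     value is an arbitrary example.
+#include <immintrin.h>
+
+__m128 low_pair_to_ps(__m128i a) {
+    return _mm_maskz_cvtepi64_ps((__mmask8)0x3, a); /* both lanes selected */
+}
+-->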
+<intrinsic tech="AVX-512" name="_mm256_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="ymm, ymm" xed="VCVTTPD2QQ_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
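+<!-- Example (C sketch, not from the Intel data) of truncating conversion:
+     VCVTTPD2QQ rounds toward zero, so 2.9 becomes 2 and the fraction is
+     discarded. Assumes AVX512DQ+AVX512VL.
+#include <immintrin.h>
+
+__m256i trunc_pd(void) {
+    __m256d a = _mm256_set_pd(-2.9, -0.5, 0.5, 2.9); /* lane0 = 2.9  */
+    return _mm256_cvttpd_epi64(a);                   /* 2, 0, 0, -2  */
+}
+-->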
+<intrinsic tech="AVX-512" name="_mm256_mask_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="ymm {k}, ymm" xed="VCVTTPD2QQ_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="ymm {z}, ymm" xed="VCVTTPD2QQ_YMMi64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtt_roundpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="zmm, zmm {sae}" xed="VCVTTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
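+<!-- Example (C sketch, not from the Intel data) of the {sae} form: truncation
+     already fixes the rounding direction, so the extra argument only suppresses
+     floating-point exceptions. It must be a compile-time constant. Assumes
+     AVX512DQ.
+#include <immintrin.h>
+
+__m512i trunc_quiet(__m512d a) {
+    return _mm512_cvtt_roundpd_epi64(a, _MM_FROUND_NO_EXC);
+}
+-->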
+<intrinsic tech="AVX-512" name="_mm512_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="zmm, zmm" xed="VCVTTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtt_roundpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="zmm {k}, zmm {sae}" xed="VCVTTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="zmm {k}, zmm" xed="VCVTTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtt_roundpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="zmm {z}, zmm {sae}" xed="VCVTTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="zmm {z}, zmm" xed="VCVTTPD2QQ_ZMMi64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="xmm, xmm" xed="VCVTTPD2QQ_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="xmm {k}, xmm" xed="VCVTTPD2QQ_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvttpd_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2QQ" form="xmm {z}, xmm" xed="VCVTTPD2QQ_XMMi64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="ymm, ymm" xed="VCVTTPD2UQQ_YMMu64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
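+<!-- Example (C sketch, not from the Intel data) of the unsigned form. Negative
+     inputs and values outside the UInt64 range are out of domain for
+     VCVTTPD2UQQ; keep inputs in [0, 2^64). Assumes AVX512DQ+AVX512VL.
+#include <immintrin.h>
+
+__m256i trunc_pd_u64(__m256d nonnegative) {
+    return _mm256_cvttpd_epu64(nonnegative);
+}
+-->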
+<intrinsic tech="AVX-512" name="_mm256_mask_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="ymm {k}, ymm" xed="VCVTTPD2UQQ_YMMu64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="ymm {z}, ymm" xed="VCVTTPD2UQQ_YMMu64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtt_roundpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="zmm, zmm {sae}" xed="VCVTTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="zmm, zmm" xed="VCVTTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtt_roundpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="zmm {k}, zmm {sae}" xed="VCVTTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="zmm {k}, zmm" xed="VCVTTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtt_roundpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="zmm {z}, zmm {sae}" xed="VCVTTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="zmm {z}, zmm" xed="VCVTTPD2UQQ_ZMMu64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="xmm, xmm" xed="VCVTTPD2UQQ_XMMu64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="xmm {k}, xmm" xed="VCVTTPD2UQQ_XMMu64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvttpd_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2UQQ" form="xmm {z}, xmm" xed="VCVTTPD2UQQ_XMMu64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="ymm, xmm" xed="VCVTTPS2QQ_YMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
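+<!-- Example (C sketch, not from the Intel data): the FP32 source is only 128
+     bits wide because each f32 lane widens to an i64 lane in the 256-bit
+     result. Assumes AVX512DQ+AVX512VL.
+#include <immintrin.h>
+
+__m256i ps_to_i64(__m128 a) {
+    return _mm256_cvttps_epi64(a);  /* 4 x f32 -> 4 x i64, truncating */
+}
+-->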
+<intrinsic tech="AVX-512" name="_mm256_mask_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="ymm {k}, xmm" xed="VCVTTPS2QQ_YMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="ymm {z}, xmm" xed="VCVTTPS2QQ_YMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtt_roundps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="zmm, ymm {sae}" xed="VCVTTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="zmm, ymm" xed="VCVTTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtt_roundps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="zmm {k}, ymm {sae}" xed="VCVTTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="zmm {k}, ymm" xed="VCVTTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtt_roundps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="zmm {z}, ymm {sae}" xed="VCVTTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="zmm {z}, ymm" xed="VCVTTPS2QQ_ZMMi64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="xmm, xmm" xed="VCVTTPS2QQ_XMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="xmm {k}, xmm" xed="VCVTTPS2QQ_XMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvttps_epi64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2QQ" form="xmm {z}, xmm" xed="VCVTTPS2QQ_XMMi64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="ymm, xmm" xed="VCVTTPS2UQQ_YMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="ymm {k}, xmm" xed="VCVTTPS2UQQ_YMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="ymm {z}, xmm" xed="VCVTTPS2UQQ_YMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtt_roundps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="zmm, ymm {sae}" xed="VCVTTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="zmm, ymm" xed="VCVTTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtt_roundps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="zmm {k}, ymm {sae}" xed="VCVTTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="zmm {k}, ymm" xed="VCVTTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtt_roundps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="zmm {z}, ymm {sae}" xed="VCVTTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="zmm {z}, ymm" xed="VCVTTPS2UQQ_ZMMu64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="xmm, xmm" xed="VCVTTPS2UQQ_XMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
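+<!-- Example (C sketch, not from the Intel data) for the 128-bit unsigned form:
+     only the low two f32 lanes of "a" are consumed. Assumes
+     AVX512DQ+AVX512VL; the wrapper name is ours.
+#include <immintrin.h>
+
+__m128i low2_ps_to_u64(__m128 a) {
+    return _mm_cvttps_epu64(a);     /* lanes 2..3 of a are ignored */
+}
+-->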
+<intrinsic tech="AVX-512" name="_mm_mask_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="xmm {k}, xmm" xed="VCVTTPS2UQQ_XMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvttps_epu64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2UQQ" form="xmm {z}, xmm" xed="VCVTTPS2UQQ_XMMu64_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
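+<!-- Editorial sketch, not part of the Intel data: the writemask ("_mask_") variants
+     blend with "src" instead of zeroing. A hedged 128-bit example (requires
+     AVX512DQ plus AVX512VL):
+__m128i src = _mm_set1_epi64x(-1);                 // fallback for masked-off lanes
+__m128  a   = _mm_setr_ps(1.5f, 2.5f, 0.f, 0.f);   // only lanes 0 and 1 map to u64 lanes
+__m128i r   = _mm_mask_cvttps_epu64(src, 0x1, a);  // r = { 1, 0xFFFFFFFFFFFFFFFF }
+-->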
+<intrinsic tech="AVX-512" name="_mm256_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+	dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="ymm, ymm" xed="VCVTUQQ2PD_YMMf64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="ymm {k}, ymm" xed="VCVTUQQ2PD_YMMf64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="ymm {z}, ymm" xed="VCVTUQQ2PD_YMMf64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+	dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="zmm, zmm {er}" xed="VCVTUQQ2PD_ZMMf64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
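+<!-- Editorial sketch, not part of the Intel data: the "_round" variants take the
+     rounding mode as a compile-time immediate. A typical call (assumes AVX512DQ;
+     "a" is an illustrative __m512i of unsigned 64-bit lanes):
+__m512d d = _mm512_cvt_roundepu64_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+-->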
+<intrinsic tech="AVX-512" name="_mm512_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+	dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="zmm, zmm" xed="VCVTUQQ2PD_ZMMf64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
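+<!-- Editorial sketch, not part of the Intel data: u64 and f64 lanes are the same
+     width, so this conversion is lane-for-lane (assumes AVX512DQ):
+__m512i v = _mm512_set1_epi64(42);
+__m512d d = _mm512_cvtepu64_pd(v); // all eight lanes become 42.0
+-->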
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="zmm {k}, zmm {er}" xed="VCVTUQQ2PD_ZMMf64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="zmm {k}, zmm" xed="VCVTUQQ2PD_ZMMf64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="zmm {z}, zmm {er}" xed="VCVTUQQ2PD_ZMMf64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="zmm {z}, zmm" xed="VCVTUQQ2PD_ZMMf64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+	dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="xmm, xmm" xed="VCVTUQQ2PD_XMMf64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="xmm {k}, xmm" xed="VCVTUQQ2PD_XMMf64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu64_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := Convert_UInt64_To_FP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PD" form="xmm {z}, xmm" xed="VCVTUQQ2PD_XMMf64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+	dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="xmm, ymm" xed="VCVTUQQ2PS_XMMf32_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="xmm {k}, ymm" xed="VCVTUQQ2PS_XMMf32_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="xmm {z}, ymm" xed="VCVTUQQ2PS_XMMf32_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+	dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="ymm, zmm {er}" xed="VCVTUQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+	dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="ymm, zmm" xed="VCVTUQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
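+<!-- Editorial sketch, not part of the Intel data: unlike the _pd forms, converting
+     to 32-bit floats halves the vector width, so eight u64 lanes in a zmm source
+     yield a ymm of eight floats (assumes AVX512DQ):
+__m512i v = _mm512_set1_epi64(1 << 20);
+__m256  f = _mm512_cvtepu64_ps(v); // one float per input lane, 1048576.0f each
+-->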
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="ymm {k}, zmm {er}" xed="VCVTUQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="ymm {k}, zmm" xed="VCVTUQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="ymm {z}, zmm {er}" xed="VCVTUQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="ymm {z}, zmm" xed="VCVTUQQ2PS_YMMf32_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+	dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="xmm, xmm" xed="VCVTUQQ2PS_XMMf32_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="xmm {k}, xmm" xed="VCVTUQQ2PS_XMMf32_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu64_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[l+31:l] := Convert_UInt64_To_FP32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTUQQ2PS" form="xmm {z}, xmm" xed="VCVTUQQ2PS_XMMf32_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
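+<!-- Editorial sketch, not part of the Intel data: the 128-bit form fills only the
+     low two float lanes and zeroes the rest, matching dst[MAX:64] := 0 above
+     (requires AVX512DQ plus AVX512VL):
+__m128i v = _mm_set_epi64x(7, 3);  // lane 0 = 3, lane 1 = 7
+__m128  f = _mm_cvtepu64_ps(v);    // f = { 3.0f, 7.0f, 0.0f, 0.0f }
+-->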
+<intrinsic tech="AVX-512" name="_mm512_extractf32x8_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[255:0] := a[255:0]
+1: dst[255:0] := a[511:256]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X8" form="ymm, zmm, imm8" xed="VEXTRACTF32X8_YMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
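+<!-- Editorial sketch, not part of the Intel data: imm8 selects which 256-bit half
+     is extracted and must be a compile-time constant (assumes AVX512DQ):
+__m256 lo = _mm512_extractf32x8_ps(z, 0); // bits 255:0 of z
+__m256 hi = _mm512_extractf32x8_ps(z, 1); // bits 511:256 of z
+-->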
+<intrinsic tech="AVX-512" name="_mm512_mask_extractf32x8_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X8" form="ymm {k}, zmm, imm8" xed="VEXTRACTF32X8_YMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_extractf32x8_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X8" form="ymm {z}, zmm, imm8" xed="VEXTRACTF32X8_YMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_extractf64x2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X2" form="xmm, ymm, imm8" xed="VEXTRACTF64X2_XMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_extractf64x2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X2" form="xmm {k}, ymm, imm8" xed="VEXTRACTF64X2_XMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_extractf64x2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X2" form="xmm {z}, ymm, imm8" xed="VEXTRACTF64X2_XMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_extractf64x2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[1:0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+2: dst[127:0] := a[383:256]
+3: dst[127:0] := a[511:384]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X2" form="xmm, zmm, imm8" xed="VEXTRACTF64X2_XMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
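+<!-- Editorial sketch, not part of the Intel data: a zmm holds four 128-bit lanes,
+     so imm8[1:0] picks one of them (assumes AVX512DQ):
+__m128d lane2 = _mm512_extractf64x2_pd(z, 2); // bits 383:256 of z
+-->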
+<intrinsic tech="AVX-512" name="_mm512_mask_extractf64x2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[1:0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X2" form="xmm {k}, zmm, imm8" xed="VEXTRACTF64X2_XMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_extractf64x2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[1:0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X2" form="xmm {z}, zmm, imm8" xed="VEXTRACTF64X2_XMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_extracti32x8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[255:0] := a[255:0]
+1: dst[255:0] := a[511:256]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X8" form="ymm, zmm, imm8" xed="VEXTRACTI32X8_YMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_extracti32x8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X8" form="ymm {k}, zmm, imm8" xed="VEXTRACTI32X8_YMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_extracti32x8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X8" form="ymm {z}, zmm, imm8" xed="VEXTRACTI32X8_YMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_extracti64x2_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X2" form="xmm, ymm, imm8" xed="VEXTRACTI64X2_XMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_extracti64x2_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X2" form="xmm {k}, ymm, imm8" xed="VEXTRACTI64X2_XMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_extracti64x2_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X2" form="xmm {z}, ymm, imm8" xed="VEXTRACTI64X2_XMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_extracti64x2_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[1:0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+2: dst[127:0] := a[383:256]
+3: dst[127:0] := a[511:384]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X2" form="xmm, zmm, imm8" xed="VEXTRACTI64X2_XMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_extracti64x2_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[1:0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X2" form="xmm {k}, zmm, imm8" xed="VEXTRACTI64X2_XMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_extracti64x2_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[1:0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X2" form="xmm {z}, zmm, imm8" xed="VEXTRACTI64X2_XMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
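+<!-- Editorial sketch, not part of the Intel data: the integer extract forms mirror
+     the floating-point ones, and masking applies after the 128-bit lane has been
+     selected (assumes AVX512DQ; AVX512VL for the ymm-source forms):
+__m128i r = _mm512_maskz_extracti64x2_epi64(0x1, z, 3); // element 0 of bits 511:384; element 1 zeroed
+-->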
+<intrinsic tech="AVX-512" name="_mm256_fpclass_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VFPCLASSPD" form="k, ymm, imm8" xed="VFPCLASSPD_MASKmskw_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fpclass_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VFPCLASSPD" form="k {k}, ymm, imm8" xed="VFPCLASSPD_MASKmskw_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_fpclass_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VFPCLASSPD" form="k, zmm, imm8" xed="VFPCLASSPD_MASKmskw_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
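+<!-- Editorial sketch, not part of the Intel data: imm8 is a bitwise OR of the
+     category flags listed in the guide's [fpclass_note] (commonly documented as
+     0x01 QNaN, 0x08 +Inf, 0x10 -Inf, 0x80 SNaN, among others), so screening for
+     non-finite lanes looks like this (assumes AVX512DQ):
+__mmask8 bad = _mm512_fpclass_pd_mask(v, 0x01 | 0x80 | 0x08 | 0x10); // set where lane is NaN or infinity
+-->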
+<intrinsic tech="AVX-512" name="_mm512_mask_fpclass_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VFPCLASSPD" form="k {k}, zmm, imm8" xed="VFPCLASSPD_MASKmskw_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fpclass_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VFPCLASSPD" form="k, xmm, imm8" xed="VFPCLASSPD_MASKmskw_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fpclass_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VFPCLASSPD" form="k {k}, xmm, imm8" xed="VFPCLASSPD_MASKmskw_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_fpclass_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VFPCLASSPS" form="k, ymm, imm8" xed="VFPCLASSPS_MASKmskw_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fpclass_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VFPCLASSPS" form="k {k}, ymm, imm8" xed="VFPCLASSPS_MASKmskw_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_fpclass_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VFPCLASSPS" form="k, zmm, imm8" xed="VFPCLASSPS_MASKmskw_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fpclass_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VFPCLASSPS" form="k {k}, zmm, imm8" xed="VFPCLASSPS_MASKmskw_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fpclass_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VFPCLASSPS" form="k, xmm, imm8" xed="VFPCLASSPS_MASKmskw_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fpclass_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+ [fpclass_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VFPCLASSPS" form="k {k}, xmm, imm8" xed="VFPCLASSPS_MASKmskw_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fpclass_sd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k".
+ [fpclass_note]</description>
+ <operation>k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VFPCLASSSD" form="k, xmm, imm8" xed="VFPCLASSSD_MASKmskw_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fpclass_sd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set).
+ [fpclass_note]</description>
+ <operation>IF k1[0]
+ k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
+ELSE
+ k[0] := 0
+FI
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VFPCLASSSD" form="k {k}, xmm, imm8" xed="VFPCLASSSD_MASKmskw_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fpclass_ss_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k".
+ [fpclass_note]</description>
+ <operation>k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VFPCLASSSS" form="k, xmm, imm8" xed="VFPCLASSSS_MASKmskw_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fpclass_ss_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set).
+ [fpclass_note]</description>
+ <operation>IF k1[0]
+ k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
+ELSE
+ k[0] := 0
+FI
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VFPCLASSSS" form="k {k}, xmm, imm8" xed="VFPCLASSSS_MASKmskw_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
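+<!-- Editorial sketch, not part of the Intel data: the _sd/_ss forms classify only
+     the lowest element and answer in mask bit 0 (assumes AVX512DQ; 0x20 is the
+     denormal flag per the guide's fpclass note):
+int is_denormal = _mm_fpclass_ss_mask(x, 0x20) & 1;
+-->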
+<intrinsic tech="AVX-512" name="_mm512_insertf32x8">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: dst[255:0] := b[255:0]
+1: dst[511:256] := b[255:0]
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF32X8" form="zmm, zmm, ymm, imm8" xed="VINSERTF32X8_ZMMf32_MASKmskw_ZMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_insertf32x8">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF32X8" form="zmm {k}, zmm, ymm, imm8" xed="VINSERTF32X8_ZMMf32_MASKmskw_ZMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_insertf32x8">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF32X8" form="zmm {z}, zmm, ymm, imm8" xed="VINSERTF32X8_ZMMf32_MASKmskw_ZMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
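+<!-- Illustrative usage sketch (not part of the Intel data schema): _mm512_insertf32x8
+     replaces one 256-bit half of a 512-bit vector; imm8 selects the half and must be
+     a compile-time constant. Helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* Replace the upper 8 floats of "v" with "hi"; the lower 8 are kept. */
+       static inline __m512 set_upper_half(__m512 v, __m256 hi) {
+           return _mm512_insertf32x8(v, hi, 1);
+       }
+-->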
+<intrinsic tech="AVX-512" name="_mm256_insertf64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[255:0] := a[255:0]
+CASE imm8[0] OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF64X2" form="ymm, ymm, xmm, imm8" xed="VINSERTF64X2_YMMf64_MASKmskw_YMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_insertf64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF64X2" form="ymm {k}, ymm, xmm, imm8" xed="VINSERTF64X2_YMMf64_MASKmskw_YMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_insertf64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF64X2" form="ymm {z}, ymm, xmm, imm8" xed="VINSERTF64X2_YMMf64_MASKmskw_YMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_insertf64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[511:0] := a[511:0]
+CASE imm8[1:0] OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+2: dst[383:256] := b[127:0]
+3: dst[511:384] := b[127:0]
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF64X2" form="zmm, zmm, xmm, imm8" xed="VINSERTF64X2_ZMMf64_MASKmskw_ZMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_insertf64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF64X2" form="zmm {k}, zmm, xmm, imm8" xed="VINSERTF64X2_ZMMf64_MASKmskw_ZMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_insertf64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF64X2" form="zmm {z}, zmm, xmm, imm8" xed="VINSERTF64X2_ZMMf64_MASKmskw_ZMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
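+<!-- Illustrative usage sketch (not part of the Intel data schema): the masked form of
+     VINSERTF64X2 first builds the inserted vector, then blends it with "src" under the
+     writemask, as in the pseudocode above. Assumes AVX512DQ; helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* Insert "b" into 128-bit lane 2 of "a" (bits 383:256), letting only the
+          elements selected by "k" through; the rest are taken from "src". */
+       static inline __m512d insert_lane2_masked(__m512d src, __mmask8 k,
+                                                 __m512d a, __m128d b) {
+           return _mm512_mask_insertf64x2(src, k, a, b, 2);
+       }
+-->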
+<intrinsic tech="AVX-512" name="_mm512_inserti32x8">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[511:0] := a[511:0]
+CASE imm8[0] OF
+0: dst[255:0] := b[255:0]
+1: dst[511:256] := b[255:0]
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI32X8" form="zmm, zmm, ymm, imm8" xed="VINSERTI32X8_ZMMu32_MASKmskw_ZMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_inserti32x8">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI32X8" form="zmm {k}, zmm, ymm, imm8" xed="VINSERTI32X8_ZMMu32_MASKmskw_ZMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_inserti32x8">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI32X8" form="zmm {z}, zmm, ymm, imm8" xed="VINSERTI32X8_ZMMu32_MASKmskw_ZMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_inserti64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[255:0] := a[255:0]
+CASE imm8[0] OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTI64X2" form="ymm, ymm, xmm, imm8" xed="VINSERTI64X2_YMMu64_MASKmskw_YMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_inserti64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTI64X2" form="ymm {k}, ymm, xmm, imm8" xed="VINSERTI64X2_YMMu64_MASKmskw_YMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_inserti64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTI64X2" form="ymm {z}, ymm, xmm, imm8" xed="VINSERTI64X2_YMMu64_MASKmskw_YMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_inserti64x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[511:0] := a[511:0]
+CASE imm8[1:0] OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+2: dst[383:256] := b[127:0]
+3: dst[511:384] := b[127:0]
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI64X2" form="zmm, zmm, xmm, imm8" xed="VINSERTI64X2_ZMMu64_MASKmskw_ZMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_inserti64x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI64X2" form="zmm {k}, zmm, xmm, imm8" xed="VINSERTI64X2_ZMMu64_MASKmskw_ZMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_inserti64x2">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI64X2" form="zmm {z}, zmm, xmm, imm8" xed="VINSERTI64X2_ZMMu64_MASKmskw_ZMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
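+<!-- Illustrative usage sketch (not part of the Intel data schema): the integer inserts
+     mirror the floating-point ones; building a 512-bit vector from two 256-bit halves
+     is a common pattern. Helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* Concatenate two __m256i values into one __m512i (lo in bits 255:0).
+          The cast leaves the upper half undefined, but the insert overwrites it. */
+       static inline __m512i concat256(__m256i lo, __m256i hi) {
+           __m512i v = _mm512_castsi256_si512(lo);
+           return _mm512_inserti32x8(v, hi, 1);
+       }
+-->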
+<intrinsic tech="AVX-512" name="_mm256_mask_or_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VORPD" form="ymm {k}, ymm, ymm" xed="VORPD_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_or_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VORPD" form="ymm {z}, ymm, ymm" xed="VORPD_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_or_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VORPD" form="zmm {k}, zmm, zmm" xed="VORPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_or_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VORPD" form="zmm {z}, zmm, zmm" xed="VORPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_or_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VORPD" form="zmm, zmm, zmm" xed="VORPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_or_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VORPD" form="xmm {k}, xmm, xmm" xed="VORPD_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_or_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VORPD" form="xmm {z}, xmm, xmm" xed="VORPD_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
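+<!-- Illustrative usage sketch (not part of the Intel data schema): VORPD operates on
+     the raw bit patterns, so OR-ing with a vector of -0.0 sets every sign bit, i.e.
+     computes -|x| per element. Assumes AVX512DQ; helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* Force each double in "x" negative by setting its sign bit. */
+       static inline __m512d force_negative(__m512d x) {
+           return _mm512_or_pd(x, _mm512_set1_pd(-0.0));
+       }
+-->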
+<intrinsic tech="AVX-512" name="_mm256_mask_or_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VORPS" form="ymm {k}, ymm, ymm" xed="VORPS_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_or_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VORPS" form="ymm {z}, ymm, ymm" xed="VORPS_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_or_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VORPS" form="zmm {k}, zmm, zmm" xed="VORPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_or_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VORPS" form="zmm {z}, zmm, zmm" xed="VORPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_or_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VORPS" form="zmm, zmm, zmm" xed="VORPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_or_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VORPS" form="xmm {k}, xmm, xmm" xed="VORPS_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_or_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VORPS" form="xmm {z}, xmm, xmm" xed="VORPS_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
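+<!-- Illustrative usage sketch (not part of the Intel data schema): the zeromasked form
+     combines the bitwise OR with mask-driven zeroing in one instruction. Helper name
+     and the choice of mask are hypothetical.
+
+       #include <immintrin.h>
+       /* OR the bit patterns of "a" and "b", keeping only even-indexed floats
+          (mask 0x5555); odd-indexed elements of the result are zeroed. */
+       static inline __m512 or_even_lanes(__m512 a, __m512 b) {
+           return _mm512_maskz_or_ps((__mmask16)0x5555, a, b);
+       }
+-->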
+<intrinsic tech="AVX-512" name="_mm256_movepi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF a[i+31]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPMOVD2M" form="k, ymm" xed="VPMOVD2M_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movepi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF a[i+31]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVD2M" form="k, zmm" xed="VPMOVD2M_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_movepi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF a[i+31]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPMOVD2M" form="k, xmm" xed="VPMOVD2M_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
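+<!-- Illustrative usage sketch (not part of the Intel data schema): VPMOVD2M collects
+     the sign bit of every element, much like the classic MOVMSKPS, so it can test
+     whether any element is negative. Helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* Nonzero iff any of the 16 signed 32-bit elements of "v" is negative. */
+       static inline int any_negative_epi32(__m512i v) {
+           return _mm512_movepi32_mask(v) != 0;
+       }
+-->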
+<intrinsic tech="AVX-512" name="_mm256_movm_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := 0xFFFFFFFF
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVM2D" form="ymm" xed="VPMOVM2D_YMMu32_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movm_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := 0xFFFFFFFF
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVM2D" form="zmm" xed="VPMOVM2D_ZMMu32_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_movm_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := 0xFFFFFFFF
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVM2D" form="xmm" xed="VPMOVM2D_XMMu32_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_movm_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVM2Q" form="ymm" xed="VPMOVM2Q_YMMu64_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movm_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVM2Q" form="zmm" xed="VPMOVM2Q_ZMMu64_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_movm_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVM2Q" form="xmm" xed="VPMOVM2Q_XMMu64_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
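+<!-- Illustrative usage sketch (not part of the Intel data schema): VPMOVM2D expands a
+     mask register into a vector of all-ones/all-zeros elements, which is handy when a
+     mask must feed a plain bitwise blend. Helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* Materialize "k" as 32-bit lanes: 0xFFFFFFFF where k[j] = 1, else 0. */
+       static inline __m512i mask_to_vector(__mmask16 k) {
+           return _mm512_movm_epi32(k);
+       }
+-->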
+<intrinsic tech="AVX-512" name="_mm256_movepi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF a[i+63]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPMOVQ2M" form="k, ymm" xed="VPMOVQ2M_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movepi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF a[i+63]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPMOVQ2M" form="k, zmm" xed="VPMOVQ2M_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_movepi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF a[i+63]
+ k[j] := 1
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPMOVQ2M" form="k, xmm" xed="VPMOVQ2M_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
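+<!-- Illustrative usage sketch (not part of the Intel data schema): on vectors whose
+     elements are all-ones or all-zeros, VPMOVQ2M is the exact inverse of VPMOVM2Q,
+     so the two form a round trip. Helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* k == _mm512_movepi64_mask(_mm512_movm_epi64(k)) for every __mmask8 k. */
+       static inline __mmask8 roundtrip_mask8(__mmask8 k) {
+           return _mm512_movepi64_mask(_mm512_movm_epi64(k));
+       }
+-->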
+<intrinsic tech="AVX-512" name="_mm256_mask_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="ymm {k}, ymm, ymm" xed="VPMULLQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="ymm {z}, ymm, ymm" xed="VPMULLQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="ymm, ymm, ymm" xed="VPMULLQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="zmm {k}, zmm, zmm" xed="VPMULLQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="zmm {z}, zmm, zmm" xed="VPMULLQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="zmm, zmm, zmm" xed="VPMULLQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="xmm {k}, xmm, xmm" xed="VPMULLQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="xmm {z}, xmm, xmm" xed="VPMULLQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mullo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ tmp[127:0] := a[i+63:i] * b[i+63:i]
+ dst[i+63:i] := tmp[63:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULLQ" form="xmm, xmm, xmm" xed="VPMULLQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
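+<!-- Illustrative usage sketch (not part of the Intel data schema): before AVX512DQ a
+     lane-wise 64-bit multiply had to be assembled from 32-bit partial products;
+     VPMULLQ does it in one instruction. Helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* Low 64 bits of a[j]*b[j] for each of the 8 lanes; wrap-around on
+          overflow matches ordinary uint64_t multiplication. */
+       static inline __m512i mul_u64(__m512i a, __m512i b) {
+           return _mm512_mullo_epi64(a, b);
+       }
+-->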
+<intrinsic tech="AVX-512" name="_mm256_mask_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="ymm {k}, ymm, ymm, imm8" xed="VRANGEPD_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="ymm {z}, ymm, ymm, imm8" xed="VRANGEPD_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="ymm, ymm, ymm, imm8" xed="VRANGEPD_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
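+<!-- Illustrative usage sketch (not part of the Intel data schema): with imm8 = 0x0B
+     (opCtl = 11b absolute max, signSelCtl = 10b clear sign bit) VRANGEPD computes
+     max(|a|,|b|) per element, always positive. Helper name is hypothetical.
+
+       #include <immintrin.h>
+       /* Per-element maximum magnitude, returned with a cleared sign bit. */
+       static inline __m256d max_magnitude(__m256d a, __m256d b) {
+           return _mm256_range_pd(a, b, 0x0B);
+       }
+-->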
+<intrinsic tech="AVX-512" name="_mm512_mask_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="zmm {k}, zmm, zmm, imm8" xed="VRANGEPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_range_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="zmm {k}, zmm, zmm {sae}, imm8" xed="VRANGEPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="zmm {z}, zmm, zmm, imm8" xed="VRANGEPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_range_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="zmm {z}, zmm, zmm {sae}, imm8" xed="VRANGEPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="zmm, zmm, zmm, imm8" xed="VRANGEPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
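+<!-- Editor's note: a hedged example, not part of the generated data; the
+     helper name is hypothetical. imm8 = 0x0A encodes opCtl = absolute max
+     (imm8[1:0] = 10) with the sign bit cleared (imm8[3:2] = 10), so each
+     lane yields max(|a|, |b|):
+
+     #include <immintrin.h>
+
+     static inline __m512d range_absmax_pd(__m512d a, __m512d b) {
+         /* requires AVX512DQ */
+         return _mm512_range_pd(a, b, 0x0A);
+     }
+-->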
+<intrinsic tech="AVX-512" name="_mm512_range_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="zmm, zmm, zmm {sae}, imm8" xed="VRANGEPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="xmm {k}, xmm, xmm, imm8" xed="VRANGEPD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="xmm {z}, xmm, xmm, imm8" xed="VRANGEPD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_range_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGEPD" form="xmm, xmm, xmm, imm8" xed="VRANGEPD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
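+<!-- Editor's note: illustrative sketch only (hypothetical helper name).
+     imm8 = 0x0B encodes opCtl = absolute min (imm8[1:0] = 11) with the sign
+     bit cleared (imm8[3:2] = 10), yielding min(|a|, |b|) per lane:
+
+     #include <immintrin.h>
+
+     static inline __m128d range_absmin_pd(__m128d a, __m128d b) {
+         /* requires AVX512VL + AVX512DQ */
+         return _mm_range_pd(a, b, 0x0B);
+     }
+-->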
+<intrinsic tech="AVX-512" name="_mm256_mask_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="ymm {k}, ymm, ymm, imm8" xed="VRANGEPS_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="ymm {z}, ymm, ymm, imm8" xed="VRANGEPS_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="ymm, ymm, ymm, imm8" xed="VRANGEPS_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="zmm {k}, zmm, zmm, imm8" xed="VRANGEPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
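+<!-- Editor's note: a hedged sketch of the writemask form (hypothetical
+     helper name). Lanes whose mask bit is 0 are passed through from "src"
+     unchanged; here imm8 = 0x05 is a plain per-lane maximum:
+
+     #include <immintrin.h>
+
+     static inline __m512 masked_max_ps(__m512 src, __mmask16 k,
+                                        __m512 a, __m512 b) {
+         /* requires AVX512DQ */
+         return _mm512_mask_range_ps(src, k, a, b, 0x05);
+     }
+-->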
+<intrinsic tech="AVX-512" name="_mm512_mask_range_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="zmm {k}, zmm, zmm {sae}, imm8" xed="VRANGEPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="zmm {z}, zmm, zmm, imm8" xed="VRANGEPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_range_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="zmm {z}, zmm, zmm {sae}, imm8" xed="VRANGEPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="zmm, zmm, zmm, imm8" xed="VRANGEPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_range_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="zmm, zmm, zmm {sae}, imm8" xed="VRANGEPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="xmm {k}, xmm, xmm, imm8" xed="VRANGEPS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="xmm {z}, xmm, xmm, imm8" xed="VRANGEPS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_range_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGEPS" form="xmm, xmm, xmm, imm8" xed="VRANGEPS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
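+<!-- Editor's note: illustrative only (hypothetical helper name). imm8 = 0x08
+     encodes opCtl = min (imm8[1:0] = 00) with the sign bit cleared
+     (imm8[3:2] = 10), so each lane yields the magnitude of min(a, b):
+
+     #include <immintrin.h>
+
+     static inline __m128 range_min_magnitude_ps(__m128 a, __m128 b) {
+         /* requires AVX512VL + AVX512DQ */
+         return _mm_range_ps(a, b, 0x08);
+     }
+-->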
+<intrinsic tech="AVX-512" name="_mm_mask_range_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+IF k[0]
+ dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESD" form="xmm {k}, xmm, xmm {sae}, imm8" xed="VRANGESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_range_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+IF k[0]
+ dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESD" form="xmm {k}, xmm, xmm, imm8" xed="VRANGESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_range_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+IF k[0]
+ dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESD" form="xmm {z}, xmm, xmm {sae}, imm8" xed="VRANGESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_range_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+IF k[0]
+ dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESD" form="xmm {z}, xmm, xmm, imm8" xed="VRANGESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_range_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+ 1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+ 2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+ 3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+ 1: dst[63:0] := tmp[63:0]
+ 2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+ 3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+ ESAC
+
+ RETURN dst
+}
+dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESD" form="xmm, xmm, xmm {sae}, imm8" xed="VRANGESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
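+<!-- Editor's note: a hedged sketch of the {sae} form (hypothetical helper
+     name). Passing _MM_FROUND_NO_EXC suppresses floating-point exceptions;
+     imm8 = 0x05 is a plain scalar maximum in the low lane, with the upper
+     lane copied from "a":
+
+     #include <immintrin.h>
+
+     static inline __m128d range_max_sd_noexc(__m128d a, __m128d b) {
+         /* requires AVX512DQ */
+         return _mm_range_round_sd(a, b, 0x05, _MM_FROUND_NO_EXC);
+     }
+-->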
+<intrinsic tech="AVX-512" name="_mm_mask_range_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+ 1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+IF k[0]
+ dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESS" form="xmm {k}, xmm, xmm {sae}, imm8" xed="VRANGESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_range_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+ 1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+IF k[0]
+ dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESS" form="xmm {k}, xmm, xmm, imm8" xed="VRANGESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_range_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+ 1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+IF k[0]
+ dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESS" form="xmm {z}, xmm, xmm {sae}, imm8" xed="VRANGESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_range_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+ 1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+IF k[0]
+ dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESS" form="xmm {z}, xmm, xmm, imm8" xed="VRANGESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_range_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+ imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note]</description>
+ <operation>
+DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) {
+ CASE opCtl[1:0] OF
+ 0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+ 1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+ 2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+ 3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+ ESAC
+
+ CASE signSelCtl[1:0] OF
+ 0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+ 1: dst[31:0] := tmp[31:0]
+ 2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+ 3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+ ESAC
+
+ RETURN dst
+}
+dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRANGESS" form="xmm, xmm, xmm {sae}, imm8" xed="VRANGESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
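+<!-- Editor's note: an illustrative call, not from the source data, showing
+     the {sae} convention: the extra sae argument takes _MM_FROUND_NO_EXC to
+     suppress floating-point exceptions (or _MM_FROUND_CUR_DIRECTION).
+__m128 a = _mm_set_ss(-2.0f), b = _mm_set_ss(1.0f);
+__m128 r = _mm_range_round_ss(a, b, 0x05, _MM_FROUND_NO_EXC);
+/* 0x05 = sign 01 (take sign of compare result), op 01 (max): r0 = 1.0f */
+-->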
+<intrinsic tech="AVX-512" name="_mm256_mask_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="ymm {k}, ymm, imm8" xed="VREDUCEPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="ymm {z}, ymm, imm8" xed="VREDUCEPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="ymm, ymm, imm8" xed="VREDUCEPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
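+<!-- Editor's note: a worked example of ReduceArgumentPD, added for clarity
+     and not part of the source data. With imm8 = 0x10, M = imm8[7:4] = 1
+     fraction bit is preserved and imm8[3:0] = 0 selects
+     round-to-nearest-even, so for x = 1.75: ROUND(2 * 1.75) = ROUND(3.5) = 4,
+     the preserved part is 4 / 2 = 2.0, and the reduced argument is
+     1.75 - 2.0 = -0.25.
+__m256d r = _mm256_reduce_pd(_mm256_set1_pd(1.75), 0x10); /* all lanes -0.25 */
+-->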
+<intrinsic tech="AVX-512" name="_mm512_mask_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="zmm {k}, zmm, imm8" xed="VREDUCEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_reduce_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="zmm {k}, zmm {sae}, imm8" xed="VREDUCEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="zmm {z}, zmm, imm8" xed="VREDUCEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
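+<!-- Editor's note: an illustrative contrast between the writemask and
+     zeromask forms, not from the source data; values are arbitrary.
+__m512d a   = _mm512_set1_pd(1.75);
+__m512d src = _mm512_set1_pd(9.0);
+/* With k = 0x0F only lanes 0..3 are computed; lanes 4..7 keep src in the
+   mask form but are forced to 0.0 in the maskz form. */
+__m512d keep = _mm512_mask_reduce_pd(src, 0x0F, a, 0x10);
+__m512d zero = _mm512_maskz_reduce_pd(0x0F, a, 0x10);
+-->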
+<intrinsic tech="AVX-512" name="_mm512_maskz_reduce_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="zmm {z}, zmm {sae}, imm8" xed="VREDUCEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="zmm, zmm, imm8" xed="VREDUCEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_reduce_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="zmm, zmm {sae}, imm8" xed="VREDUCEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
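+<!-- Editor's note: an illustrative call, not from the source data, showing
+     that the _round_ reduce variants take two immediates: the reduce control
+     imm8 plus a separate sae argument; _MM_FROUND_NO_EXC only suppresses
+     exceptions and leaves the imm8[3:0] rounding selection unchanged.
+__m512d a = _mm512_set1_pd(1.75);
+__m512d r = _mm512_reduce_round_pd(a, 0x10, _MM_FROUND_NO_EXC);
+-->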
+<intrinsic tech="AVX-512" name="_mm_mask_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="xmm {k}, xmm, imm8" xed="VREDUCEPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="xmm {z}, xmm, imm8" xed="VREDUCEPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_reduce_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCEPD" form="xmm, xmm, imm8" xed="VREDUCEPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="ymm {k}, ymm, imm8" xed="VREDUCEPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="ymm {z}, ymm, imm8" xed="VREDUCEPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+  tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="ymm, ymm, imm8" xed="VREDUCEPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
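+<!-- Editor's note: an illustrative edge case, not from the source data. For
+     inputs large enough that scaling by POW(2, M) overflows, the scaled
+     ROUND yields infinity and the IsInf guard defines the reduced argument
+     as 0.0 rather than x - inf; huge floats are already integers at any
+     preserved precision, so 0.0 is also the mathematically expected result.
+__m256 r = _mm256_reduce_ps(_mm256_set1_ps(3.0e38f), 0x10); /* lanes: 0.0f */
+-->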
+<intrinsic tech="AVX-512" name="_mm512_mask_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="zmm {k}, zmm, imm8" xed="VREDUCEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_reduce_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="zmm {k}, zmm {sae}, imm8" xed="VREDUCEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="zmm {z}, zmm, imm8" xed="VREDUCEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_reduce_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="zmm {z}, zmm {sae}, imm8" xed="VREDUCEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="zmm, zmm, imm8" xed="VREDUCEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_reduce_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="zmm, zmm {sae}, imm8" xed="VREDUCEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="xmm {k}, xmm, imm8" xed="VREDUCEPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="xmm {z}, xmm, imm8" xed="VREDUCEPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_reduce_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCEPS" form="xmm, xmm, imm8" xed="VREDUCEPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_reduce_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+IF k[0]
+ dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESD" form="xmm {k}, xmm, xmm, imm8" xed="VREDUCESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_reduce_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+IF k[0]
+ dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESD" form="xmm {k}, xmm, xmm {sae}, imm8" xed="VREDUCESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_reduce_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+IF k[0]
+ dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESD" form="xmm {z}, xmm, xmm, imm8" xed="VREDUCESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_reduce_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+IF k[0]
+ dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESD" form="xmm {z}, xmm, xmm {sae}, imm8" xed="VREDUCESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_reduce_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESD" form="xmm, xmm, xmm, imm8" xed="VREDUCESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
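+<!-- Editor's note: a hypothetical fractional-part helper, not part of the
+     source data. imm8 = 0x03 preserves M = 0 fraction bits and selects
+     round-toward-zero, so the lower lane becomes b0 - TRUNC(b0), the signed
+     fractional part of b0; the upper lane is copied from a.
+static __m128d frac_lower(__m128d a, __m128d b) {
+    return _mm_reduce_sd(a, b, 0x03);
+}
+-->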
+<intrinsic tech="AVX-512" name="_mm_reduce_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ tmp[63:0] := src1[63:0] - tmp[63:0]
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := FP64(0.0)
+ FI
+ RETURN tmp[63:0]
+}
+dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESD" form="xmm, xmm, xmm {sae}, imm8" xed="VREDUCESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_reduce_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+IF k[0]
+ dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESS" form="xmm {k}, xmm, xmm, imm8" xed="VREDUCESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_reduce_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+IF k[0]
+ dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESS" form="xmm {k}, xmm, xmm {sae}, imm8" xed="VREDUCESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_reduce_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+IF k[0]
+ dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESS" form="xmm {z}, xmm, xmm, imm8" xed="VREDUCESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_reduce_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+IF k[0]
+ dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESS" form="xmm {z}, xmm, xmm {sae}, imm8" xed="VREDUCESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_reduce_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESS" form="xmm, xmm, xmm, imm8" xed="VREDUCESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_reduce_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ tmp[31:0] := src1[31:0] - tmp[31:0]
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := FP32(0.0)
+ FI
+ RETURN tmp[31:0]
+}
+dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VREDUCESS" form="xmm, xmm, xmm {sae}, imm8" xed="VREDUCESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VXORPD" form="ymm {k}, ymm, ymm" xed="VXORPD_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VXORPD" form="ymm {z}, ymm, ymm" xed="VXORPD_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VXORPD" form="zmm {k}, zmm, zmm" xed="VXORPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VXORPD" form="zmm {z}, zmm, zmm" xed="VXORPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VXORPD" form="zmm, zmm, zmm" xed="VXORPD_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VXORPD" form="xmm {k}, xmm, xmm" xed="VXORPD_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VXORPD" form="xmm {z}, xmm, xmm" xed="VXORPD_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VXORPS" form="ymm {k}, ymm, ymm" xed="VXORPS_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VXORPS" form="ymm {z}, ymm, ymm" xed="VXORPS_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VXORPS" form="zmm {k}, zmm, zmm" xed="VXORPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VXORPS" form="zmm {z}, zmm, zmm" xed="VXORPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VXORPS" form="zmm, zmm, zmm" xed="VXORPS_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VXORPS" form="xmm {k}, xmm, xmm" xed="VXORPS_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VXORPS" form="xmm {z}, xmm, xmm" xed="VXORPS_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
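+<!-- A minimal sketch of the masked XOR family above, assuming an
+     AVX512VL+AVX512DQ target; names and values are illustrative. XOR against
+     -0.0 (sign bit only) negates an element, so the writemask selects which
+     lanes are negated and which are passed through from "src".
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m256d v    = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
+         __m256d sign = _mm256_set1_pd(-0.0);
+         __mmask8 k   = 0x5;                  /* negate elements 0 and 2 */
+         __m256d r = _mm256_mask_xor_pd(v, k, v, sign);
+         double out[4];
+         _mm256_storeu_pd(out, r);
+         printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
+         /* expected: -1 2 -3 4 */
+         return 0;
+     }
+-->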
+<intrinsic tech="AVX-512" name="_kadd_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <description>Add 8-bit masks in "a" and "b", and store the result in "k".</description>
+ <operation>
+k[7:0] := a[7:0] + b[7:0]
+k[MAX:8] := 0
+ </operation>
+ <instruction name="KADDB" form="k, k, k" xed="KADDB_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kadd_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Add 16-bit masks in "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] + b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KADDW" form="k, k, k" xed="KADDW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kand_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 8-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[7:0] := a[7:0] AND b[7:0]
+k[MAX:8] := 0
+ </operation>
+ <instruction name="KANDB" form="k, k, k" xed="KANDB_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kandn_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+	<description>Compute the bitwise NOT of 8-bit mask "a" and then AND with "b", and store the result in "k".</description>
+ <operation>
+k[7:0] := (NOT a[7:0]) AND b[7:0]
+k[MAX:8] := 0
+ </operation>
+ <instruction name="KANDNB" form="k, k, k" xed="KANDNB_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_knot_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <description>Compute the bitwise NOT of 8-bit mask "a", and store the result in "k".</description>
+ <operation>
+k[7:0] := NOT a[7:0]
+k[MAX:8] := 0
+ </operation>
+ <instruction name="KNOTB" form="k, k" xed="KNOTB_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kor_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 8-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[7:0] := a[7:0] OR b[7:0]
+k[MAX:8] := 0
+ </operation>
+ <instruction name="KORB" form="k, k, k" xed="KORB_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kxnor_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XNOR of 8-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[7:0] := NOT (a[7:0] XOR b[7:0])
+k[MAX:8] := 0
+ </operation>
+ <instruction name="KXNORB" form="k, k, k" xed="KXNORB_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kxor_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XOR of 8-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[7:0] := a[7:0] XOR b[7:0]
+k[MAX:8] := 0
+ </operation>
+ <instruction name="KXORB" form="k, k, k" xed="KXORB_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
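+<!-- A minimal sketch of the 8-bit mask arithmetic/logic intrinsics above,
+     assuming an AVX512DQ target and using the _cvtu32_mask8/_cvtmask8_u32
+     conversions defined further below; the two input values are arbitrary.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __mmask8 a = _cvtu32_mask8(0xB2);                         /* 1011 0010 */
+         __mmask8 b = _cvtu32_mask8(0x56);                         /* 0101 0110 */
+         printf("add  %02x\n", _cvtmask8_u32(_kadd_mask8(a, b)));  /* 08, carry dropped */
+         printf("and  %02x\n", _cvtmask8_u32(_kand_mask8(a, b)));  /* 12 */
+         printf("andn %02x\n", _cvtmask8_u32(_kandn_mask8(a, b))); /* 44 */
+         printf("not  %02x\n", _cvtmask8_u32(_knot_mask8(a)));     /* 4d */
+         printf("or   %02x\n", _cvtmask8_u32(_kor_mask8(a, b)));   /* f6 */
+         printf("xnor %02x\n", _cvtmask8_u32(_kxnor_mask8(a, b))); /* 1b */
+         printf("xor  %02x\n", _cvtmask8_u32(_kxor_mask8(a, b)));  /* e4 */
+         return 0;
+     }
+-->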
+<intrinsic tech="AVX-512" name="_kshiftli_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="unsigned int" varname="count" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of 8-bit mask "a" left by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k".</description>
+ <operation>
+k[MAX:0] := 0
+IF count[7:0] &lt;= 7
+ k[7:0] := a[7:0] &lt;&lt; count[7:0]
+FI
+ </operation>
+ <instruction name="KSHIFTLB" form="k, k, imm8" xed="KSHIFTLB_MASKmskw_MASKmskw_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kshiftri_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="unsigned int" varname="count" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of 8-bit mask "a" right by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k".</description>
+ <operation>
+k[MAX:0] := 0
+IF count[7:0] &lt;= 7
+ k[7:0] := a[7:0] &gt;&gt; count[7:0]
+FI
+ </operation>
+ <instruction name="KSHIFTRB" form="k, k, imm8" xed="KSHIFTRB_MASKmskw_MASKmskw_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
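+<!-- A minimal sketch of the mask shift intrinsics above (AVX512DQ target
+     assumed); "count" must be a compile-time constant.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __mmask8 a = _cvtu32_mask8(0x0F);
+         printf("%02x\n", _cvtmask8_u32(_kshiftli_mask8(a, 5)));  /* e0 */
+         printf("%02x\n", _cvtmask8_u32(_kshiftri_mask8(a, 2)));  /* 03 */
+         return 0;
+     }
+-->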
+<intrinsic tech="AVX-512" name="_load_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Load</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8*" varname="mem_addr" etype="MASK" memwidth="8"/>
+ <description>Load 8-bit mask from memory into "k".</description>
+ <operation>
+k[7:0] := MEM[mem_addr+7:mem_addr]
+ </operation>
+ <instruction name="KMOVB" form="k, m8" xed="KMOVB_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_store_mask8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__mmask8*" varname="mem_addr" etype="MASK" memwidth="8"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <description>Store 8-bit mask from "a" into memory.</description>
+ <operation>
+MEM[mem_addr+7:mem_addr] := a[7:0]
+ </operation>
+ <instruction name="KMOVB" form="m8, k" xed="KMOVB_MEMu8_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
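+<!-- A minimal round-trip sketch for the mask load/store intrinsics above
+     (AVX512DQ target assumed):
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __mmask8 m = _cvtu32_mask8(0xA5);
+         __mmask8 spilled;
+         _store_mask8(&spilled, m);              /* KMOVB m8, k */
+         __mmask8 back = _load_mask8(&spilled);  /* KMOVB k, m8 */
+         printf("%02x\n", _cvtmask8_u32(back));  /* a5 */
+         return 0;
+     }
+-->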
+<intrinsic tech="AVX-512" name="_kortest_mask8_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <parameter type="unsigned char*" varname="all_ones" etype="UI8" memwidth="8"/>
+ <description>Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones".</description>
+ <operation>
+tmp[7:0] := a[7:0] OR b[7:0]
+IF tmp[7:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+IF tmp[7:0] == 0xFF
+ MEM[all_ones+7:all_ones] := 1
+ELSE
+ MEM[all_ones+7:all_ones] := 0
+FI
+ </operation>
+ <instruction name="KORTESTB" form="k, k" xed="KORTESTB_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortestz_mask8_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+	<description>Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[7:0] := a[7:0] OR b[7:0]
+IF tmp[7:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KORTESTB" form="k, k" xed="KORTESTB_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortestc_mask8_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[7:0] := a[7:0] OR b[7:0]
+IF tmp[7:0] == 0xFF
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KORTESTB" form="k, k" xed="KORTESTB_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
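+<!-- A minimal sketch of the KORTESTB intrinsics above (AVX512DQ target
+     assumed). The return value reflects the all-zeros test and the output
+     parameter the all-ones test:
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __mmask8 a = _cvtu32_mask8(0xF0);
+         __mmask8 b = _cvtu32_mask8(0x0F);
+         unsigned char all_ones = 0;
+         unsigned char all_zeros = _kortest_mask8_u8(a, b, &all_ones);
+         printf("all_zeros=%u all_ones=%u\n", all_zeros, all_ones);
+         /* a OR b == 0xFF, so: all_zeros=0 all_ones=1 */
+         return 0;
+     }
+-->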
+<intrinsic tech="AVX-512" name="_ktest_mask8_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <parameter type="unsigned char*" varname="and_not" etype="UI8" memwidth="8"/>
+	<description>Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b"; if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not".</description>
+ <operation>
+tmp1[7:0] := a[7:0] AND b[7:0]
+IF tmp1[7:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+tmp2[7:0] := (NOT a[7:0]) AND b[7:0]
+IF tmp2[7:0] == 0x0
+ MEM[and_not+7:and_not] := 1
+ELSE
+ MEM[and_not+7:and_not] := 0
+FI
+ </operation>
+ <instruction name="KTESTB" form="k, k" xed="KTESTB_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktestz_mask8_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[7:0] := a[7:0] AND b[7:0]
+IF tmp[7:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KTESTB" form="k, k" xed="KTESTB_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktestc_mask8_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <parameter type="__mmask8" varname="b" etype="MASK"/>
+	<description>Compute the bitwise NOT of 8-bit mask "a" and then AND with "b"; if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[7:0] := (NOT a[7:0]) AND b[7:0]
+IF tmp[7:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KTESTB" form="k, k" xed="KTESTB_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktest_mask16_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <parameter type="unsigned char*" varname="and_not" etype="UI8" memwidth="8"/>
+	<description>Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b"; if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not".</description>
+ <operation>
+tmp1[15:0] := a[15:0] AND b[15:0]
+IF tmp1[15:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+tmp2[15:0] := (NOT a[15:0]) AND b[15:0]
+IF tmp2[15:0] == 0x0
+ MEM[and_not+7:and_not] := 1
+ELSE
+ MEM[and_not+7:and_not] := 0
+FI
+ </operation>
+ <instruction name="KTESTW" form="k, k" xed="KTESTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktestz_mask16_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[15:0] := a[15:0] AND b[15:0]
+IF tmp[15:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KTESTW" form="k, k" xed="KTESTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_ktestc_mask16_u8">
+ <type>Mask</type>
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+	<description>Compute the bitwise NOT of 16-bit mask "a" and then AND with "b"; if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[15:0] := (NOT a[15:0]) AND b[15:0]
+IF tmp[15:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KTESTW" form="k, k" xed="KTESTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
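+<!-- A minimal sketch of the KTESTB intrinsics above (AVX512DQ target
+     assumed): _ktestz tests whether two masks select disjoint lanes, and
+     _ktestc tests whether "b" is a subset of "a".
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __mmask8 a = _cvtu32_mask8(0x0F);
+         __mmask8 b = _cvtu32_mask8(0xF0);
+         printf("disjoint: %u\n", _ktestz_mask8_u8(a, b));  /* 1: a AND b == 0 */
+         printf("subset:   %u\n", _ktestc_mask8_u8(a, b));  /* 0: (NOT a) AND b != 0 */
+         return 0;
+     }
+-->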
+<intrinsic tech="AVX-512" name="_cvtmask8_u32">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="a" etype="MASK"/>
+ <description>Convert 8-bit mask "a" into an integer value, and store the result in "dst".</description>
+ <operation>
+dst := ZeroExtend32(a[7:0])
+ </operation>
+ <instruction name="KMOVB" form="r32, k" xed="KMOVB_GPR32u32_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_cvtu32_mask8">
+ <CPUID>AVX512DQ</CPUID>
+ <category>Mask</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="unsigned int" varname="a" etype="UI8"/>
+ <description>Convert integer value "a" into an 8-bit mask, and store the result in "k".</description>
+ <operation>
+k := a[7:0]
+ </operation>
+ <instruction name="KMOVB" form="k, r32" xed="KMOVB_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
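+<!-- A minimal round-trip sketch for the mask/integer conversions above
+     (AVX512DQ target assumed); only the low 8 bits of the integer survive:
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __mmask8 k = _cvtu32_mask8(0x1A5);     /* truncated to 0xA5 */
+         unsigned int bits = _cvtmask8_u32(k);  /* zero-extended to 32 bits */
+         printf("%#x\n", bits);                 /* 0xa5 */
+         return 0;
+     }
+-->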
+<intrinsic tech="AVX-512" name="_mm512_exp2a23_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PS" form="zmm, zmm {sae}" xed="VEXP2PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_exp2a23_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PS" form="zmm, zmm" xed="VEXP2PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_exp2a23_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PS" form="zmm {k}, zmm {sae}" xed="VEXP2PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_exp2a23_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PS" form="zmm {k}, zmm" xed="VEXP2PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_exp2a23_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PS" form="zmm {z}, zmm {sae}" xed="VEXP2PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_exp2a23_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PS" form="zmm {z}, zmm" xed="VEXP2PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_exp2a23_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PD" form="zmm, zmm {sae}" xed="VEXP2PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_exp2a23_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PD" form="zmm, zmm" xed="VEXP2PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_exp2a23_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PD" form="zmm {k}, zmm {sae}" xed="VEXP2PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_exp2a23_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PD" form="zmm {k}, zmm" xed="VEXP2PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_exp2a23_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PD" form="zmm {z}, zmm {sae}" xed="VEXP2PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_exp2a23_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP2PD" form="zmm {z}, zmm" xed="VEXP2PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
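+<!-- A minimal sketch for the exp2a23 intrinsics above. Note that AVX512ER is
+     only implemented on Xeon Phi (Knights Landing/Knights Mill) parts, so this
+     compiles with -mavx512er but runs only on such hardware; the input value
+     is illustrative.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512 x = _mm512_set1_ps(3.5f);
+         __m512 r = _mm512_exp2a23_ps(x);  /* each lane ~ 2^3.5 = 11.3137...,
+                                              to within a relative 2^-23 */
+         float out[16];
+         _mm512_storeu_ps(out, r);
+         printf("%f\n", out[0]);
+         return 0;
+     }
+-->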
+<intrinsic tech="AVX-512" name="_mm_rcp28_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+dst[63:0] := (1.0 / b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SD" form="xmm, xmm, xmm {sae}" xed="VRCP28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rcp28_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+dst[63:0] := (1.0 / b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SD" form="xmm, xmm, xmm" xed="VRCP28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rcp28_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SD" form="xmm {k}, xmm, xmm {sae}" xed="VRCP28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rcp28_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SD" form="xmm {k}, xmm, xmm" xed="VRCP28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rcp28_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SD" form="xmm {z}, xmm, xmm {sae}" xed="VRCP28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rcp28_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SD" form="xmm {z}, xmm, xmm" xed="VRCP28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rcp28_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+dst[31:0] := (1.0 / b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SS" form="xmm, xmm, xmm {sae}" xed="VRCP28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rcp28_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+dst[31:0] := (1.0 / b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SS" form="xmm, xmm, xmm" xed="VRCP28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rcp28_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SS" form="xmm {k}, xmm, xmm {sae}" xed="VRCP28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rcp28_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SS" form="xmm {k}, xmm, xmm" xed="VRCP28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rcp28_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SS" form="xmm {z}, xmm, xmm {sae}" xed="VRCP28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rcp28_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP28SS" form="xmm {z}, xmm, xmm" xed="VRCP28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
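+<!-- A minimal sketch for the scalar rcp28 intrinsics above (AVX512ER
+     hardware required, as for exp2a23):
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128 a = _mm_set1_ps(0.0f);
+         __m128 b = _mm_set_ss(3.0f);
+         __m128 r = _mm_rcp28_ss(a, b);  /* lower lane ~ 1/3; upper lanes from a */
+         printf("%.9f\n", _mm_cvtss_f32(r));
+         return 0;
+     }
+-->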
+<intrinsic tech="AVX-512" name="_mm512_rcp28_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PS" form="zmm, zmm {sae}" xed="VRCP28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rcp28_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PS" form="zmm, zmm" xed="VRCP28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rcp28_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PS" form="zmm {k}, zmm {sae}" xed="VRCP28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rcp28_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PS" form="zmm {k}, zmm" xed="VRCP28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rcp28_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PS" form="zmm {z}, zmm {sae}" xed="VRCP28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rcp28_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PS" form="zmm {z}, zmm" xed="VRCP28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
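+<!-- A minimal C usage sketch for the VRCP28PS family above, assuming an
+     AVX512ER-capable target (e.g. built with -mavx512er); the helper names
+     are illustrative, not part of the Intel data.
+
+#include <immintrin.h>
+
+/* Approximate 1.0f/x for 16 packed floats; the relative error is below
+   2^-28, so a Newton refinement step is normally unnecessary. */
+static inline __m512 recip16(__m512 x) {
+    return _mm512_rcp28_ps(x);
+}
+
+/* Masked form: lanes whose mask bit is clear keep the value from "fallback". */
+static inline __m512 recip16_masked(__m512 fallback, __mmask16 k, __m512 x) {
+    return _mm512_mask_rcp28_ps(fallback, k, x);
+}
+-->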
+<intrinsic tech="AVX-512" name="_mm512_rcp28_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PD" form="zmm, zmm {sae}" xed="VRCP28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rcp28_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PD" form="zmm, zmm" xed="VRCP28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rcp28_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PD" form="zmm {k}, zmm {sae}" xed="VRCP28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rcp28_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PD" form="zmm {k}, zmm" xed="VRCP28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rcp28_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PD" form="zmm {z}, zmm {sae}" xed="VRCP28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rcp28_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRCP28PD" form="zmm {z}, zmm" xed="VRCP28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
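+<!-- The double-precision variants follow the same pattern; the *_round_*
+     forms take an extra "sae" argument that suppresses floating-point
+     exceptions. A hedged sketch, assuming _MM_FROUND_NO_EXC as the sae value:
+
+#include <immintrin.h>
+
+/* Approximate 1.0/x for 8 packed doubles without raising FP exceptions. */
+static inline __m512d recip8_noexc(__m512d x) {
+    return _mm512_rcp28_round_pd(x, _MM_FROUND_NO_EXC);
+}
+-->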
+<intrinsic tech="AVX-512" name="_mm_rsqrt28_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+dst[63:0] := (1.0 / SQRT(b[63:0]))
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SD" form="xmm, xmm, xmm {sae}" xed="VRSQRT28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rsqrt28_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+dst[63:0] := (1.0 / SQRT(b[63:0]))
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SD" form="xmm, xmm, xmm" xed="VRSQRT28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rsqrt28_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / SQRT(b[63:0]))
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SD" form="xmm {k}, xmm, xmm {sae}" xed="VRSQRT28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rsqrt28_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / SQRT(b[63:0]))
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SD" form="xmm {k}, xmm, xmm" xed="VRSQRT28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rsqrt28_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / SQRT(b[63:0]))
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SD" form="xmm {z}, xmm, xmm {sae}" xed="VRSQRT28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rsqrt28_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / SQRT(b[63:0]))
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SD" form="xmm {z}, xmm, xmm" xed="VRSQRT28SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
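+<!-- Scalar double sketch: only the low element of "b" is computed, and the
+     upper element is always taken from "a" (helper name illustrative):
+
+#include <immintrin.h>
+
+/* dst[63:0] = approx 1/sqrt(b[63:0]); dst[127:64] = a[127:64]. */
+static inline __m128d rsqrt28_lo(__m128d a, __m128d b) {
+    return _mm_rsqrt28_sd(a, b);
+}
+-->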
+<intrinsic tech="AVX-512" name="_mm_rsqrt28_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+dst[31:0] := (1.0 / SQRT(b[31:0]))
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SS" form="xmm, xmm, xmm {sae}" xed="VRSQRT28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rsqrt28_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+dst[31:0] := (1.0 / SQRT(b[31:0]))
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SS" form="xmm, xmm, xmm" xed="VRSQRT28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rsqrt28_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / SQRT(b[31:0]))
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SS" form="xmm {k}, xmm, xmm {sae}" xed="VRSQRT28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rsqrt28_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / SQRT(b[31:0]))
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SS" form="xmm {k}, xmm, xmm" xed="VRSQRT28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rsqrt28_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / SQRT(b[31:0]))
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SS" form="xmm {z}, xmm, xmm {sae}" xed="VRSQRT28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rsqrt28_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / SQRT(b[31:0]))
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT28SS" form="xmm {z}, xmm, xmm" xed="VRSQRT28SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
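+<!-- The single-precision scalar forms mirror the double ones; with maskz the
+     low lane is zeroed when bit 0 of "k" is clear, while lanes 1..3 still
+     come from "a". Sketch:
+
+#include <immintrin.h>
+
+static inline __m128 rsqrt28_lo_or_zero(__mmask8 k, __m128 a, __m128 b) {
+    return _mm_maskz_rsqrt28_ss(k, a, b);
+}
+-->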
+<intrinsic tech="AVX-512" name="_mm512_rsqrt28_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PS" form="zmm, zmm {sae}" xed="VRSQRT28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rsqrt28_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PS" form="zmm, zmm" xed="VRSQRT28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rsqrt28_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PS" form="zmm {k}, zmm {sae}" xed="VRSQRT28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rsqrt28_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PS" form="zmm {k}, zmm" xed="VRSQRT28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rsqrt28_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PS" form="zmm {z}, zmm {sae}" xed="VRSQRT28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rsqrt28_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PS" form="zmm {z}, zmm" xed="VRSQRT28PS_ZMMf32_MASKmskw_ZMMf32_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
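+<!-- A common use of the packed rsqrt28 forms is fast scaling by 1/sqrt, e.g.
+     vector normalization. Sketch (the multiply is plain AVX512F):
+
+#include <immintrin.h>
+
+/* Multiply 16 floats by the approximate reciprocal square roots of len2. */
+static inline __m512 scale_by_invsqrt(__m512 v, __m512 len2) {
+    return _mm512_mul_ps(v, _mm512_rsqrt28_ps(len2));
+}
+-->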
+<intrinsic tech="AVX-512" name="_mm512_rsqrt28_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PD" form="zmm, zmm {sae}" xed="VRSQRT28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rsqrt28_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PD" form="zmm, zmm" xed="VRSQRT28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rsqrt28_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PD" form="zmm {k}, zmm {sae}" xed="VRSQRT28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rsqrt28_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PD" form="zmm {k}, zmm" xed="VRSQRT28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rsqrt28_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PD" form="zmm {z}, zmm {sae}" xed="VRSQRT28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rsqrt28_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512ER</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VRSQRT28PD" form="zmm {z}, zmm" xed="VRSQRT28PD_ZMMf64_MASKmskw_ZMMf64_AVX512ER"/>
+ <header>immintrin.h</header>
+</intrinsic>
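+<!-- Masked double-precision sketch: compute 1/sqrt only where "k" is set and
+     keep "src" elsewhere, e.g. to leave non-positive inputs untouched:
+
+#include <immintrin.h>
+
+static inline __m512d invsqrt_where(__m512d src, __mmask8 k, __m512d x) {
+    return _mm512_mask_rsqrt28_pd(src, k, x);
+}
+-->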
+<intrinsic tech="AVX-512" name="_mm256_mask_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VADDPD" form="ymm {k}, ymm, ymm" xed="VADDPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VADDPD" form="ymm {z}, ymm, ymm" xed="VADDPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDPD" form="xmm {k}, xmm, xmm" xed="VADDPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDPD" form="xmm {z}, xmm, xmm" xed="VADDPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VADDPS" form="ymm {k}, ymm, ymm" xed="VADDPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VADDPS" form="ymm {z}, ymm, ymm" xed="VADDPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDPS" form="xmm {k}, xmm, xmm" xed="VADDPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDPS" form="xmm {z}, xmm, xmm" xed="VADDPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
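+<!-- The AVX512VL masked adds give per-lane conditional addition on 128- and
+     256-bit vectors. Passing "a" as the merge source makes unselected lanes
+     pass through unchanged. Sketch (the mask constant is illustrative):
+
+#include <immintrin.h>
+
+/* Add b into a only in lanes 0 and 2 (k = 0101b); lanes 1 and 3 keep a. */
+static inline __m256d add_even_lanes(__m256d a, __m256d b) {
+    return _mm256_mask_add_pd(a, (__mmask8)0x5, a, b);
+}
+-->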
+<intrinsic tech="AVX-512" name="_mm256_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="3"/>
+ <description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst".</description>
+ <operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (32*imm8[2:0])
+dst[255:0] := temp[255:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VALIGND" form="ymm, ymm, ymm, imm8" xed="VALIGND_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="3"/>
+ <description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (32*imm8[2:0])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := temp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VALIGND" form="ymm {k}, ymm, ymm, imm8" xed="VALIGND_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="3"/>
+ <description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (32*imm8[2:0])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := temp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VALIGND" form="ymm {z}, ymm, ymm, imm8" xed="VALIGND_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst".</description>
+ <operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (32*imm8[1:0])
+dst[127:0] := temp[127:0]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VALIGND" form="xmm, xmm, xmm, imm8" xed="VALIGND_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (32*imm8[1:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := temp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VALIGND" form="xmm {k}, xmm, xmm, imm8" xed="VALIGND_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (32*imm8[1:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := temp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VALIGND" form="xmm {z}, xmm, xmm, imm8" xed="VALIGND_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
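+<!-- VALIGND is a lane-crossing funnel shift: it concatenates two vectors and
+     extracts a window. Passing the same register twice turns it into an
+     element rotate, a classic use. Sketch:
+
+#include <immintrin.h>
+
+/* Rotate the 8 dwords of "a" right by one element position. */
+static inline __m256i rotate_dwords_right1(__m256i a) {
+    return _mm256_alignr_epi32(a, a, 1);
+}
+-->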
+<intrinsic tech="AVX-512" name="_mm256_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst".</description>
+ <operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (64*imm8[1:0])
+dst[255:0] := temp[255:0]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="ymm, ymm, ymm, imm8" xed="VALIGNQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (64*imm8[1:0])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := temp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="ymm {k}, ymm, ymm, imm8" xed="VALIGNQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (64*imm8[1:0])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := temp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="ymm {z}, ymm, ymm, imm8" xed="VALIGNQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst".</description>
+ <operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (64*imm8[0])
+dst[127:0] := temp[127:0]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="xmm, xmm, xmm, imm8" xed="VALIGNQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (64*imm8[0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := temp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="xmm {k}, xmm, xmm, imm8" xed="VALIGNQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (64*imm8[0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := temp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="xmm {z}, xmm, xmm, imm8" xed="VALIGNQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
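+<!-- The 64-bit form behaves identically at qword granularity; note that only
+     imm8 bit 0 is used by the 128-bit variant. Sketch:
+
+#include <immintrin.h>
+
+/* Low qword of dst = high qword of b; high qword of dst = low qword of a. */
+static inline __m128i qword_window(__m128i a, __m128i b) {
+    return _mm_alignr_epi64(a, b, 1);
+}
+-->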
+<intrinsic tech="AVX-512" name="_mm256_mask_blend_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBLENDMPD" form="ymm {k}, ymm, ymm" xed="VBLENDMPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_blend_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBLENDMPD" form="xmm {k}, xmm, xmm" xed="VBLENDMPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_blend_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBLENDMPS" form="ymm {k}, ymm, ymm" xed="VBLENDMPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_blend_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBLENDMPS" form="xmm {k}, xmm, xmm" xed="VBLENDMPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
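+<!-- VBLENDMPD/VBLENDMPS select per lane: a set mask bit picks "b", a clear
+     bit picks "a". Because the selector is a mask register, it composes
+     directly with AVX-512 compare results. Sketch (literal mask):
+
+#include <immintrin.h>
+
+/* Lanes 0..1 from a, lanes 2..3 from b (k = 1100b). */
+static inline __m256d splice_halves(__m256d a, __m256d b) {
+    return _mm256_mask_blend_pd((__mmask8)0xC, a, b);
+}
+-->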
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_broadcast_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 4)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X4" form="ymm, m128" xed="VBROADCASTF32X4_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_mask_broadcast_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 4)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X4" form="ymm {k}, m128" xed="VBROADCASTF32X4_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_maskz_broadcast_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 4)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X4" form="ymm {z}, m128" xed="VBROADCASTF32X4_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_broadcast_i32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 4)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X4" form="ymm, m128" xed="VBROADCASTI32X4_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_mask_broadcast_i32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 4)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X4" form="ymm {k}, m128" xed="VBROADCASTI32X4_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm256_maskz_broadcast_i32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := (j % 4)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X4" form="ymm {z}, m128" xed="VBROADCASTI32X4_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
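+<!-- Illustrative usage (not part of the upstream data): _mm256_broadcast_i32x4
+     repeats one 128-bit lane across both halves of a 256-bit vector; a minimal
+     sketch assuming an AVX-512VL target.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i quad = _mm_setr_epi32(1, 2, 3, 4);
+    __m256i r = _mm256_broadcast_i32x4(quad);      /* 1 2 3 4 1 2 3 4 */
+    int out[8];
+    _mm256_storeu_si256((__m256i *)out, r);
+    for (int j = 0; j < 8; j++) printf("%d ", out[j]);
+    printf("\n");
+    return 0;
+}
+-->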
+<intrinsic tech="AVX-512" name="_mm256_mask_broadcastsd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTSD" form="ymm {k}, xmm" xed="VBROADCASTSD_YMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_broadcastsd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTSD" form="ymm {z}, xmm" xed="VBROADCASTSD_YMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="ymm {k}, xmm" xed="VBROADCASTSS_YMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="ymm {z}, xmm" xed="VBROADCASTSS_YMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="xmm {k}, xmm" xed="VBROADCASTSS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="xmm {z}, xmm" xed="VBROADCASTSS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
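+<!-- Illustrative usage (not part of the upstream data): masked scalar broadcast
+     merges with "src" wherever the mask bit is clear; a minimal sketch assuming
+     an AVX-512VL target.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128 a   = _mm_set_ss(7.0f);            /* low element is 7.0f */
+    __m256 src = _mm256_set1_ps(-1.0f);
+    __m256 r   = _mm256_mask_broadcastss_ps(src, 0xF0, a);
+    float out[8];
+    _mm256_storeu_ps(out, r);
+    /* lanes 0..3 keep src (-1), lanes 4..7 receive the broadcast 7 */
+    for (int j = 0; j < 8; j++) printf("%g ", out[j]);
+    printf("\n");
+    return 0;
+}
+-->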
+<intrinsic tech="AVX-512" name="_mm256_cmp_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, ymm, ymm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, ymm, ymm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, xmm, xmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, xmm, xmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmp_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, ymm, ymm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, ymm, ymm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, xmm, xmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, xmm, xmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
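+<!-- Illustrative usage (not part of the upstream data): the VCMPPD/VCMPPS mask
+     compares take one of the 32 predicate constants enumerated above
+     (_CMP_EQ_OQ, _CMP_LT_OQ, ...) and return a __mmask8 rather than a vector;
+     a minimal sketch assuming an AVX-512VL target.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);    /* lanes 3..0 */
+    __m256d b = _mm256_set1_pd(2.5);
+    __mmask8 lt = _mm256_cmp_pd_mask(a, b, _CMP_LT_OQ);
+    printf("0x%x\n", (unsigned)lt);    /* lanes 0 and 1 compare true: 0x3 */
+    return 0;
+}
+-->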
+<intrinsic tech="AVX-512" name="_mm256_mask_compress_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCOMPRESSPD" form="ymm {k}, ymm" xed="VCOMPRESSPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compressstoreu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 64
+m := base_addr
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ MEM[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VCOMPRESSPD" form="m256 {k}, ymm" xed="VCOMPRESSPD_MEMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_compress_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCOMPRESSPD" form="ymm {z}, ymm" xed="VCOMPRESSPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compress_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCOMPRESSPD" form="xmm {k}, xmm" xed="VCOMPRESSPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compressstoreu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 64
+m := base_addr
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ MEM[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VCOMPRESSPD" form="m128 {k}, xmm" xed="VCOMPRESSPD_MEMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_compress_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCOMPRESSPD" form="xmm {z}, xmm" xed="VCOMPRESSPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compress_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCOMPRESSPS" form="ymm {k}, ymm" xed="VCOMPRESSPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compressstoreu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 32
+m := base_addr
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ MEM[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VCOMPRESSPS" form="m256 {k}, ymm" xed="VCOMPRESSPS_MEMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_compress_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCOMPRESSPS" form="ymm {z}, ymm" xed="VCOMPRESSPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compress_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCOMPRESSPS" form="xmm {k}, xmm" xed="VCOMPRESSPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compressstoreu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 32
+m := base_addr
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ MEM[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VCOMPRESSPS" form="m128 {k}, xmm" xed="VCOMPRESSPS_MEMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_compress_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCOMPRESSPS" form="xmm {z}, xmm" xed="VCOMPRESSPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
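+<!-- Illustrative usage (not part of the upstream data): compress packs the
+     mask-selected lanes contiguously, which gives a branch-free filter when
+     combined with a compare mask; a minimal sketch assuming an AVX-512VL target.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
+    __mmask8 k = _mm256_cmp_pd_mask(a, _mm256_set1_pd(2.5), _CMP_GT_OQ); /* 0xC */
+    double packed[4] = {0.0, 0.0, 0.0, 0.0};
+    _mm256_mask_compressstoreu_pd(packed, k, a);    /* writes 3.0 then 4.0 */
+    printf("%g %g %g %g\n", packed[0], packed[1], packed[2], packed[3]);
+    return 0;
+}
+-->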
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ IF k[j]
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ ELSE
+ dst[m+63:m] := src[m+63:m]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="ymm {k}, xmm" xed="VCVTDQ2PD_YMMf64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ IF k[j]
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ ELSE
+ dst[m+63:m] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="ymm {z}, xmm" xed="VCVTDQ2PD_YMMf64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ IF k[j]
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ ELSE
+ dst[m+63:m] := src[m+63:m]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="xmm {k}, xmm" xed="VCVTDQ2PD_XMMf64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ IF k[j]
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ ELSE
+ dst[m+63:m] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="xmm {z}, xmm" xed="VCVTDQ2PD_XMMf64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="ymm {k}, ymm" xed="VCVTDQ2PS_YMMf32_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="ymm {z}, ymm" xed="VCVTDQ2PS_YMMf32_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="xmm {k}, xmm" xed="VCVTDQ2PS_XMMf32_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="xmm {z}, xmm" xed="VCVTDQ2PS_XMMf32_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
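+<!-- Illustrative usage (not part of the upstream data): zero-masked
+     int32-to-double widening; the 128-bit source supplies one 32-bit lane per
+     64-bit result. A minimal sketch assuming an AVX-512VL target.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i ints = _mm_setr_epi32(10, 20, 30, 40);
+    __m256d d = _mm256_maskz_cvtepi32_pd(0x9, ints);  /* lanes 0 and 3 active */
+    double out[4];
+    _mm256_storeu_pd(out, d);
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 10 0 0 40 */
+    return 0;
+}
+-->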
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="xmm {k}, ymm" xed="VCVTPD2DQ_XMMi32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="xmm {z}, ymm" xed="VCVTPD2DQ_XMMi32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="xmm {k}, xmm" xed="VCVTPD2DQ_XMMi32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="xmm {z}, xmm" xed="VCVTPD2DQ_XMMi32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="xmm {k}, ymm" xed="VCVTPD2PS_XMMf32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="xmm {z}, ymm" xed="VCVTPD2PS_XMMf32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="xmm {k}, xmm" xed="VCVTPD2PS_XMMf32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="xmm {z}, xmm" xed="VCVTPD2PS_XMMf32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
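+<!-- Illustrative usage (not part of the upstream data): the double-to-int32
+     narrowing follows the current rounding mode (round-to-nearest-even by
+     default), and the masked form merges with "src" on inactive lanes; a
+     minimal sketch assuming an AVX-512VL target.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256d d = _mm256_set_pd(3.5, 2.5, 1.5, 0.5);    /* lanes 3..0 */
+    __m128i src = _mm_set1_epi32(99);
+    __m128i r = _mm256_mask_cvtpd_epi32(src, 0x5, d);
+    int out[4];
+    _mm_storeu_si128((__m128i *)out, r);
+    /* nearest-even: 0.5 rounds to 0, 2.5 rounds to 2; lanes 1 and 3 keep 99 */
+    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 0 99 2 99 */
+    return 0;
+}
+-->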
+<intrinsic tech="AVX-512" name="_mm256_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="xmm, ymm" xed="VCVTPD2UDQ_XMMu32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="xmm {k}, ymm" xed="VCVTPD2UDQ_XMMu32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="xmm {z}, ymm" xed="VCVTPD2UDQ_XMMu32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="xmm, xmm" xed="VCVTPD2UDQ_XMMu32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="xmm {k}, xmm" xed="VCVTPD2UDQ_XMMu32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="xmm {z}, xmm" xed="VCVTPD2UDQ_XMMu32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
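+<!-- Illustrative usage (not part of the upstream data): the unsigned conversion
+     has no SSE2/AVX predecessor and reaches values up to 2^32-1 that a signed
+     int32 cannot hold; inputs outside that range follow the architecture's
+     documented overflow behavior. A minimal sketch assuming an AVX-512VL target.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256d d = _mm256_set_pd(4000000000.0, 3.0, 2.0, 1.0);
+    __m128i u = _mm256_cvtpd_epu32(d);
+    unsigned out[4];
+    _mm_storeu_si128((__m128i *)out, u);
+    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); /* 1 2 3 4000000000 */
+    return 0;
+}
+-->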
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*16
+ IF k[j]
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="ymm {k}, xmm" xed="VCVTPH2PS_YMMf32_MASKmskw_XMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*16
+ IF k[j]
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="ymm {z}, xmm" xed="VCVTPH2PS_YMMf32_MASKmskw_XMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*16
+ IF k[j]
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="xmm {k}, xmm" xed="VCVTPH2PS_XMMf32_MASKmskw_XMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*16
+ IF k[j]
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="xmm {z}, xmm" xed="VCVTPH2PS_XMMf32_MASKmskw_XMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
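+<!-- A short C sketch of the masked VCVTPH2PS forms above (illustrative; assumes
+     AVX512F and AVX512VL, plus F16C for the setup conversion). The low 16-bit
+     lanes of the __m128i argument hold the FP16 inputs:
+
+       __m128i half = _mm_cvtps_ph(_mm_set1_ps(0.5f), _MM_FROUND_TO_NEAREST_INT);
+       __m128  some = _mm_maskz_cvtph_ps(0x5, half);  /* lanes 0 and 2 become 0.5f, others 0 */
+-->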
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="ymm {k}, ymm" xed="VCVTPS2DQ_YMMi32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="ymm {z}, ymm" xed="VCVTPS2DQ_YMMi32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="xmm {k}, xmm" xed="VCVTPS2DQ_XMMi32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="xmm {z}, xmm" xed="VCVTPS2DQ_XMMi32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
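+<!-- Writemask versus zeromask, shown with the VCVTPS2DQ forms above (a hedged C
+     sketch; AVX512F and AVX512VL assumed, values illustrative):
+
+       __m256  a   = _mm256_set1_ps(2.5f);
+       __m256i src = _mm256_set1_epi32(-1);
+       __m256i w = _mm256_mask_cvtps_epi32(src, 0x0F, a);  /* low 4 lanes = 2, high 4 lanes = -1 */
+       __m256i z = _mm256_maskz_cvtps_epi32(0x0F, a);      /* low 4 lanes = 2, high 4 lanes = 0  */
+
+     2.5f converts to 2 because the default MXCSR rounding mode is round to
+     nearest even.
+-->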
+<intrinsic tech="AVX-512" name="_mm256_mask_cvt_roundps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_ROUND_MODE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm {k}, ymm, imm8" xed="VCVTPS2PH_XMMf16_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_ROUND_MODE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm {k}, ymm, imm8" xed="VCVTPS2PH_XMMf16_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvt_roundps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_ROUND_MODE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm {z}, ymm, imm8" xed="VCVTPS2PH_XMMf16_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_ROUND_MODE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm {z}, ymm, imm8" xed="VCVTPS2PH_XMMf16_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvt_roundps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_ROUND_MODE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+FOR j := 0 to 3
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm {k}, xmm, imm8" xed="VCVTPS2PH_XMMf16_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_ROUND_MODE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+FOR j := 0 to 3
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm {k}, xmm, imm8" xed="VCVTPS2PH_XMMf16_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvt_roundps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_ROUND_MODE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+FOR j := 0 to 3
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm {z}, xmm, imm8" xed="VCVTPS2PH_XMMf16_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_ROUND_MODE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+FOR j := 0 to 3
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm {z}, xmm, imm8" xed="VCVTPS2PH_XMMf16_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
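+<!-- The VCVTPS2PH intrinsics take a rounding immediate rather than reading
+     MXCSR; a minimal sketch (AVX512F and AVX512VL assumed, values illustrative):
+
+       __m256  a = _mm256_set1_ps(1.0f);
+       __m128i h = _mm256_maskz_cvtps_ph(0xFF, a, _MM_FROUND_TO_NEAREST_INT);
+       /* h holds eight FP16 values of 1.0; a clear mask bit would zero that lane */
+-->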
+<intrinsic tech="AVX-512" name="_mm256_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="ymm, ymm" xed="VCVTPS2UDQ_YMMu32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="ymm {k}, ymm" xed="VCVTPS2UDQ_YMMu32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="ymm {z}, ymm" xed="VCVTPS2UDQ_YMMu32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="xmm, xmm" xed="VCVTPS2UDQ_XMMu32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="xmm {k}, xmm" xed="VCVTPS2UDQ_XMMu32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="xmm {z}, xmm" xed="VCVTPS2UDQ_XMMu32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
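+<!-- Unsigned float-to-int conversion, VCVTPS2UDQ, covers values above INT32_MAX
+     that the signed forms cannot represent; a hedged C sketch (AVX512F and
+     AVX512VL assumed):
+
+       __m128  a = _mm_set1_ps(3.0e9f);   /* 3e9 is exactly representable as float */
+       __m128i u = _mm_cvtps_epu32(a);    /* each lane = 3000000000u */
+-->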
+<intrinsic tech="AVX-512" name="_mm256_mask_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="xmm {k}, ymm" xed="VCVTTPD2DQ_XMMi32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="xmm {z}, ymm" xed="VCVTTPD2DQ_XMMi32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="xmm {k}, xmm" xed="VCVTTPD2DQ_XMMi32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="xmm {z}, xmm" xed="VCVTTPD2DQ_XMMi32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
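+<!-- Truncating ("tt") versus rounding conversion, contrasted in C for the masked
+     VCVTTPD2DQ forms above (illustrative; AVX512F and AVX512VL assumed):
+
+       __m256d a = _mm256_set1_pd(2.9);
+       __m128i t = _mm256_mask_cvttpd_epi32(_mm_setzero_si128(), 0xF, a);  /* lanes = 2, truncated */
+       __m128i r = _mm256_mask_cvtpd_epi32(_mm_setzero_si128(), 0xF, a);   /* lanes = 3, rounded   */
+-->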
+<intrinsic tech="AVX-512" name="_mm256_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="xmm, ymm" xed="VCVTTPD2UDQ_XMMu32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="xmm {k}, ymm" xed="VCVTTPD2UDQ_XMMu32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="xmm {z}, ymm" xed="VCVTTPD2UDQ_XMMu32_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="xmm, xmm" xed="VCVTTPD2UDQ_XMMu32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="xmm {k}, xmm" xed="VCVTTPD2UDQ_XMMu32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="xmm {z}, xmm" xed="VCVTTPD2UDQ_XMMu32_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
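+<!-- The VCVTTPD2UDQ forms follow the same truncate-toward-zero rule while
+     narrowing to unsigned lanes (illustrative; AVX512F and AVX512VL assumed):
+
+       __m128d a = _mm_set_pd(7.9, 0.5);  /* lanes: {0.5, 7.9} */
+       __m128i u = _mm_cvttpd_epu32(a);   /* {0, 7, 0, 0}; fractional parts dropped */
+-->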
+<intrinsic tech="AVX-512" name="_mm256_mask_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="ymm {k}, ymm" xed="VCVTTPS2DQ_YMMi32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="ymm {z}, ymm" xed="VCVTTPS2DQ_YMMi32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="xmm {k}, xmm" xed="VCVTTPS2DQ_XMMi32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="xmm {z}, xmm" xed="VCVTTPS2DQ_XMMi32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
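+<!-- The truncating VCVTTPS2DQ matches C cast semantics, which truncate toward
+     zero; a small sketch (AVX512F and AVX512VL assumed, values illustrative):
+
+       __m256  a = _mm256_set1_ps(-1.7f);
+       __m256i t = _mm256_maskz_cvttps_epi32(0xFF, a);  /* every lane = -1, like (int)(-1.7f) */
+-->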
+<intrinsic tech="AVX-512" name="_mm256_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="ymm, ymm" xed="VCVTTPS2UDQ_YMMu32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="ymm {k}, ymm" xed="VCVTTPS2UDQ_YMMu32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="ymm {z}, ymm" xed="VCVTTPS2UDQ_YMMu32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="xmm, xmm" xed="VCVTTPS2UDQ_XMMu32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="xmm {k}, xmm" xed="VCVTTPS2UDQ_XMMu32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="xmm {z}, xmm" xed="VCVTTPS2UDQ_XMMu32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
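+<!-- And the unsigned counterpart, VCVTTPS2UDQ, in the same pattern (illustrative;
+     AVX512F and AVX512VL assumed):
+
+       __m128  a = _mm_set1_ps(9.99f);
+       __m128i u = _mm_maskz_cvttps_epu32(0x3, a);  /* {9, 9, 0, 0} */
+-->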
+<intrinsic tech="AVX-512" name="_mm256_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="ymm, xmm" xed="VCVTUDQ2PD_YMMf64_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="ymm {k}, xmm" xed="VCVTUDQ2PD_YMMf64_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="ymm {z}, xmm" xed="VCVTUDQ2PD_YMMf64_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="xmm, xmm" xed="VCVTUDQ2PD_XMMf64_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="xmm {k}, xmm" xed="VCVTUDQ2PD_XMMf64_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="xmm {z}, xmm" xed="VCVTUDQ2PD_XMMf64_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
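+<!-- Why the unsigned widening convert matters: a signed convert reinterprets any
+     input above INT32_MAX as negative. A hedged C sketch (AVX512F and AVX512VL
+     assumed, values illustrative):
+
+       __m128i a = _mm_set1_epi32((int)4000000000u);
+       __m256d u = _mm256_cvtepu32_pd(a);  /* lanes = 4000000000.0 */
+       __m256d s = _mm256_cvtepi32_pd(a);  /* lanes = -294967296.0 (signed view) */
+-->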
+<intrinsic tech="AVX-512" name="_mm256_mask_div_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDIVPD" form="ymm {k}, ymm, ymm" xed="VDIVPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_div_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDIVPD" form="ymm {z}, ymm, ymm" xed="VDIVPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_div_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVPD" form="xmm {k}, xmm, xmm" xed="VDIVPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_div_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVPD" form="xmm {z}, xmm, xmm" xed="VDIVPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_div_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDIVPS" form="ymm {k}, ymm, ymm" xed="VDIVPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_div_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDIVPS" form="ymm {z}, ymm, ymm" xed="VDIVPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_div_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVPS" form="xmm {k}, xmm, xmm" xed="VDIVPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_div_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVPS" form="xmm {z}, xmm, xmm" xed="VDIVPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
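+<!-- A common use of the masked divides above: divide only where the divisor is
+     nonzero, since masked-off lanes neither compute nor raise exceptions. A
+     hedged C sketch (AVX512F and AVX512VL assumed, values illustrative):
+
+       __m256d num = _mm256_set1_pd(1.0);
+       __m256d den = _mm256_setr_pd(2.0, 0.0, 4.0, 0.0);
+       __mmask8 nz = _mm256_cmp_pd_mask(den, _mm256_setzero_pd(), _CMP_NEQ_OQ);
+       __m256d q   = _mm256_maskz_div_pd(nz, num, den);  /* {0.5, 0.0, 0.25, 0.0} */
+-->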
+<intrinsic tech="AVX-512" name="_mm256_mask_expand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="ymm {k}, ymm" xed="VEXPANDPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expandloadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="ymm {k}, m256" xed="VEXPANDPD_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="ymm {z}, ymm" xed="VEXPANDPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expandloadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="ymm {z}, m256" xed="VEXPANDPD_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="xmm {k}, xmm" xed="VEXPANDPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expandloadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="xmm {k}, m128" xed="VEXPANDPD_XMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="xmm {z}, xmm " xed="VEXPANDPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expandloadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="xmm {z}, m128" xed="VEXPANDPD_XMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="ymm {k}, ymm" xed="VEXPANDPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expandloadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="ymm {k}, m256" xed="VEXPANDPS_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="ymm {z}, ymm" xed="VEXPANDPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expandloadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="ymm {z}, m256" xed="VEXPANDPS_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="xmm {k}, xmm" xed="VEXPANDPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expandloadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="xmm {k}, m128" xed="VEXPANDPS_XMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="xmm {z}, xmm" xed="VEXPANDPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expandloadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="xmm {z}, m128" xed="VEXPANDPS_XMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
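+<!-- Taken together, the expand/expandloadu family is the inverse of the
+     compress/compressstoreu family: a zero-masked expand-load can re-inflate
+     data that an earlier compress-store packed out. A small C sketch with
+     illustrative data, assuming AVX512F/AVX512VL:
+
+float compacted[3] = { 7.0f, 8.0f, 9.0f };
+__mmask8 k = 0x7;                              /* lanes 0..2 active */
+__m128 v = _mm_maskz_expandloadu_ps(k, compacted);
+/* v lanes: 7, 8, 9, 0  (lane 3 is zeroed; only three floats are read) */
+-->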
+<intrinsic tech="AVX-512" name="_mm256_extractf32x4_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X4" form="xmm, ymm, imm8" xed="VEXTRACTF32X4_XMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
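+<!-- "imm8" must be a compile-time constant here; bit 0 picks the low or
+     high 128-bit half. A minimal C sketch, assuming AVX512F/AVX512VL:
+
+__m256 v  = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);
+__m128 lo = _mm256_extractf32x4_ps(v, 0);   /* lanes: 1, 2, 3, 4 */
+__m128 hi = _mm256_extractf32x4_ps(v, 1);   /* lanes: 5, 6, 7, 8 */
+-->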
+<intrinsic tech="AVX-512" name="_mm256_mask_extractf32x4_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X4" form="xmm {k}, ymm, imm8" xed="VEXTRACTF32X4_XMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_extractf32x4_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X4" form="xmm {z}, ymm, imm8" xed="VEXTRACTF32X4_XMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_extracti32x4_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X4" form="xmm, ymm, imm8" xed="VEXTRACTI32X4_XMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
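+<!-- The integer counterpart behaves identically on the raw 128-bit halves;
+     a brief C sketch, assuming AVX512F/AVX512VL:
+
+__m256i v  = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+__m128i hi = _mm256_extracti32x4_epi32(v, 1);   /* lanes: 4, 5, 6, 7 */
+-->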
+<intrinsic tech="AVX-512" name="_mm256_mask_extracti32x4_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X4" form="xmm {k}, ymm, imm8" xed="VEXTRACTI32X4_XMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_extracti32x4_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X4" form="xmm {z}, ymm, imm8" xed="VEXTRACTI32X4_XMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="ymm, ymm, ymm, imm8" xed="VFIXUPIMMPD_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
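+<!-- A worked example of the table encoding above: each 64-bit element of
+     "c" holds eight 4-bit token responses, indexed by the token that "b"
+     classifies to. Setting nibble 0 (the QNaN token) to response 8 ("+0")
+     and leaving every other nibble at response 0 ("keep src1") turns
+     fixupimm into "replace quiet NaNs with +0". A hedged C sketch (NAN is
+     from <math.h>; imm8 = 0 requests no exception flags), assuming
+     AVX512F/AVX512VL:
+
+__m256d x = _mm256_set_pd(3.0, NAN, 2.0, 1.0);    /* lanes: 1, 2, NaN, 3 */
+__m256i c = _mm256_set1_epi64x(0x8);              /* QNaN token -> +0 */
+__m256d fixed = _mm256_fixupimm_pd(x, x, c, 0);
+/* fixed lanes: 1.0, 2.0, +0.0, 3.0 */
+-->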
+<intrinsic tech="AVX-512" name="_mm256_mask_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="ymm {k}, ymm, ymm, imm8" xed="VFIXUPIMMPD_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="ymm {z}, ymm, ymm, imm8" xed="VFIXUPIMMPD_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="xmm, xmm, xmm, imm8" xed="VFIXUPIMMPD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="xmm {k}, xmm, xmm, imm8" xed="VFIXUPIMMPD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="xmm {z}, xmm, xmm, imm8" xed="VFIXUPIMMPD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="ymm, ymm, ymm, imm8" xed="VFIXUPIMMPS_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="ymm {k}, ymm, ymm, imm8" xed="VFIXUPIMMPS_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="ymm {z}, ymm, ymm, imm8" xed="VFIXUPIMMPS_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="xmm, xmm, xmm, imm8" xed="VFIXUPIMMPS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="xmm {k}, xmm, xmm, imm8" xed="VFIXUPIMMPS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN: j := 0
+ SNAN_TOKEN: j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="xmm {z}, xmm, xmm, imm8" xed="VFIXUPIMMPS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask3_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="ymm {k}, ymm, ymm" xed="VFMADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="ymm {k}, ymm, ymm" xed="VFMADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="ymm {k}, ymm, ymm" xed="VFMADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
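+<!-- In the mask3 form the mask comes last and inactive lanes keep "c",
+     which suits masked accumulator updates. A short C sketch with
+     illustrative values, assuming AVX512F/AVX512VL:
+
+__m256d a = _mm256_set1_pd(2.0);
+__m256d b = _mm256_set1_pd(3.0);
+__m256d acc = _mm256_set_pd(400.0, 300.0, 200.0, 100.0);  /* lanes: 100..400 */
+acc = _mm256_mask3_fmadd_pd(a, b, acc, 0x3);
+/* lanes 0,1: 2*3 + acc = 106, 206; lanes 2,3 keep acc: 300, 400 */
+-->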
+<intrinsic tech="AVX-512" name="_mm256_mask_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="ymm {k}, ymm, ymm" xed="VFMADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="ymm {k}, ymm, ymm" xed="VFMADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="ymm {k}, ymm, ymm" xed="VFMADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="ymm {z}, ymm, ymm" xed="VFMADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="ymm {z}, ymm, ymm" xed="VFMADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="ymm {z}, ymm, ymm" xed="VFMADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
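+<!-- The three masked fmadd forms compute the same product-sum and differ
+     only in what an inactive lane holds afterwards: "a" (mask), "c" (mask3),
+     or zero (maskz). A side-by-side C sketch, assuming AVX512F/AVX512VL:
+
+__m256d a = _mm256_set1_pd(2.0), b = _mm256_set1_pd(3.0), c = _mm256_set1_pd(10.0);
+__mmask8 k = 0x1;                                 /* only lane 0 active */
+__m256d r1 = _mm256_mask_fmadd_pd(a, k, b, c);    /* lanes: 16,  2,  2,  2 */
+__m256d r2 = _mm256_mask3_fmadd_pd(a, b, c, k);   /* lanes: 16, 10, 10, 10 */
+__m256d r3 = _mm256_maskz_fmadd_pd(k, a, b, c);   /* lanes: 16,  0,  0,  0 */
+-->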
+<intrinsic tech="AVX-512" name="_mm_mask3_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="xmm {k}, xmm, xmm" xed="VFMADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="xmm {k}, xmm, xmm" xed="VFMADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="xmm {k}, xmm, xmm" xed="VFMADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="xmm {k}, xmm, xmm" xed="VFMADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="xmm {k}, xmm, xmm" xed="VFMADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="xmm {k}, xmm, xmm" xed="VFMADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="xmm {z}, xmm, xmm" xed="VFMADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="xmm {z}, xmm, xmm" xed="VFMADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="xmm {z}, xmm, xmm" xed="VFMADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask3_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="ymm {k}, ymm, ymm" xed="VFMADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="ymm {k}, ymm, ymm" xed="VFMADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="ymm {k}, ymm, ymm" xed="VFMADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="ymm {k}, ymm, ymm" xed="VFMADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="ymm {k}, ymm, ymm" xed="VFMADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="ymm {k}, ymm, ymm" xed="VFMADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="ymm {z}, ymm, ymm" xed="VFMADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="ymm {z}, ymm, ymm" xed="VFMADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="ymm {z}, ymm, ymm" xed="VFMADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="xmm {k}, xmm, xmm" xed="VFMADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="xmm {k}, xmm, xmm" xed="VFMADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="xmm {k}, xmm, xmm" xed="VFMADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="xmm {k}, xmm, xmm" xed="VFMADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="xmm {k}, xmm, xmm" xed="VFMADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="xmm {k}, xmm, xmm" xed="VFMADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="xmm {z}, xmm, xmm" xed="VFMADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="xmm {z}, xmm, xmm" xed="VFMADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="xmm {z}, xmm, xmm" xed="VFMADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
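+<!--
+  Editor's note: an illustrative C sketch, not part of the upstream data,
+  contrasting the three masked forms of the fmadd intrinsics defined above.
+  The function and variable names are hypothetical; the intrinsic calls and
+  parameter orders follow the definitions in this file and require AVX512F
+  and AVX512VL support.
+
+  #include <immintrin.h>
+
+  static void fmadd_mask_forms(__m128d a, __m128d b, __m128d c, __mmask8 k) {
+      /* mask:  lanes whose mask bit is clear keep the value of "a" */
+      __m128d r_mask  = _mm_mask_fmadd_pd(a, k, b, c);
+      /* maskz: lanes whose mask bit is clear are zeroed */
+      __m128d r_maskz = _mm_maskz_fmadd_pd(k, a, b, c);
+      /* mask3: lanes whose mask bit is clear keep the value of "c" */
+      __m128d r_mask3 = _mm_mask3_fmadd_pd(a, b, c, k);
+      (void)r_mask; (void)r_maskz; (void)r_mask3;
+  }
+-->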
+<intrinsic tech="AVX-512" name="_mm256_mask3_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="ymm {k}, ymm, ymm" xed="VFMADDSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="ymm {k}, ymm, ymm" xed="VFMADDSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="ymm {k}, ymm, ymm" xed="VFMADDSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="ymm {k}, ymm, ymm" xed="VFMADDSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="ymm {k}, ymm, ymm" xed="VFMADDSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="ymm {k}, ymm, ymm" xed="VFMADDSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="ymm {z}, ymm, ymm" xed="VFMADDSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="ymm {z}, ymm, ymm" xed="VFMADDSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="ymm {z}, ymm, ymm" xed="VFMADDSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="xmm {k}, xmm, xmm" xed="VFMADDSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="xmm {k}, xmm, xmm" xed="VFMADDSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="xmm {k}, xmm, xmm" xed="VFMADDSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="xmm {k}, xmm, xmm" xed="VFMADDSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="xmm {k}, xmm, xmm" xed="VFMADDSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="xmm {k}, xmm, xmm" xed="VFMADDSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="xmm {z}, xmm, xmm" xed="VFMADDSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="xmm {z}, xmm, xmm" xed="VFMADDSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="xmm {z}, xmm, xmm" xed="VFMADDSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask3_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="ymm {k}, ymm, ymm" xed="VFMADDSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="ymm {k}, ymm, ymm" xed="VFMADDSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="ymm {k}, ymm, ymm" xed="VFMADDSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="ymm {k}, ymm, ymm" xed="VFMADDSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="ymm {k}, ymm, ymm" xed="VFMADDSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="ymm {k}, ymm, ymm" xed="VFMADDSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="ymm {z}, ymm, ymm" xed="VFMADDSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="ymm {z}, ymm, ymm" xed="VFMADDSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="ymm {z}, ymm, ymm" xed="VFMADDSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="xmm {k}, xmm, xmm" xed="VFMADDSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="xmm {k}, xmm, xmm" xed="VFMADDSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="xmm {k}, xmm, xmm" xed="VFMADDSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="xmm {k}, xmm, xmm" xed="VFMADDSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="xmm {k}, xmm, xmm" xed="VFMADDSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="xmm {k}, xmm, xmm" xed="VFMADDSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="xmm {z}, xmm, xmm" xed="VFMADDSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="xmm {z}, xmm, xmm" xed="VFMADDSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="xmm {z}, xmm, xmm" xed="VFMADDSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
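+<!--
+  Editor's note: a minimal C sketch, not part of the upstream data, of the
+  fmaddsub pattern defined above: even-indexed lanes compute a*b minus c and
+  odd-indexed lanes compute a*b plus c, matching the per-element pseudocode.
+  Names are illustrative; requires AVX512F and AVX512VL.
+
+  #include <immintrin.h>
+
+  static __m256 fmaddsub_all_lanes(__m256 a, __m256 b, __m256 c) {
+      /* A full mask (0xFF) selects all eight FP32 lanes, so the result is
+         equivalent to the unmasked FMA intrinsic _mm256_fmaddsub_ps. */
+      return _mm256_mask_fmaddsub_ps(a, (__mmask8)0xFF, b, c);
+  }
+-->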
+<intrinsic tech="AVX-512" name="_mm256_mask3_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="ymm {k}, ymm, ymm" xed="VFMSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="ymm {k}, ymm, ymm" xed="VFMSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="ymm {k}, ymm, ymm" xed="VFMSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="ymm {k}, ymm, ymm" xed="VFMSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="ymm {k}, ymm, ymm" xed="VFMSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="ymm {k}, ymm, ymm" xed="VFMSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="ymm {z}, ymm, ymm" xed="VFMSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="ymm {z}, ymm, ymm" xed="VFMSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="ymm {z}, ymm, ymm" xed="VFMSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="xmm {k}, xmm, xmm" xed="VFMSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="xmm {k}, xmm, xmm" xed="VFMSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="xmm {k}, xmm, xmm" xed="VFMSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="xmm {k}, xmm, xmm" xed="VFMSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="xmm {k}, xmm, xmm" xed="VFMSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="xmm {k}, xmm, xmm" xed="VFMSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="xmm {z}, xmm, xmm" xed="VFMSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="xmm {z}, xmm, xmm" xed="VFMSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="xmm {z}, xmm, xmm" xed="VFMSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask3_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="ymm {k}, ymm, ymm" xed="VFMSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="ymm {k}, ymm, ymm" xed="VFMSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="ymm {k}, ymm, ymm" xed="VFMSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="ymm {k}, ymm, ymm" xed="VFMSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="ymm {k}, ymm, ymm" xed="VFMSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="ymm {k}, ymm, ymm" xed="VFMSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="ymm {z}, ymm, ymm" xed="VFMSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="ymm {z}, ymm, ymm" xed="VFMSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="ymm {z}, ymm, ymm" xed="VFMSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="xmm {k}, xmm, xmm" xed="VFMSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="xmm {k}, xmm, xmm" xed="VFMSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="xmm {k}, xmm, xmm" xed="VFMSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="xmm {k}, xmm, xmm" xed="VFMSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="xmm {k}, xmm, xmm" xed="VFMSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="xmm {k}, xmm, xmm" xed="VFMSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="xmm {z}, xmm, xmm" xed="VFMSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="xmm {z}, xmm, xmm" xed="VFMSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="xmm {z}, xmm, xmm" xed="VFMSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
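+<!--
+  Editor's note: illustrative only, not upstream data. In the maskz forms
+  above, a clear mask bit zeroes the lane instead of preserving an input,
+  which gives a predicated a*b minus c without a separate blend. The function
+  name and mask value are hypothetical.
+
+  #include <immintrin.h>
+
+  static __m256 fmsub_first_four(__m256 a, __m256 b, __m256 c) {
+      /* 0x0F keeps lanes 0..3 (a*b minus c) and zeroes lanes 4..7 */
+      return _mm256_maskz_fmsub_ps((__mmask8)0x0F, a, b, c);
+  }
+-->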
+<intrinsic tech="AVX-512" name="_mm256_mask3_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="ymm {k}, ymm, ymm" xed="VFMSUBADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="ymm {k}, ymm, ymm" xed="VFMSUBADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="ymm {k}, ymm, ymm" xed="VFMSUBADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="ymm {k}, ymm, ymm" xed="VFMSUBADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="ymm {k}, ymm, ymm" xed="VFMSUBADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="ymm {k}, ymm, ymm" xed="VFMSUBADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="ymm {z}, ymm, ymm" xed="VFMSUBADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="ymm {z}, ymm, ymm" xed="VFMSUBADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="ymm {z}, ymm, ymm" xed="VFMSUBADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="xmm {k}, xmm, xmm" xed="VFMSUBADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="xmm {k}, xmm, xmm" xed="VFMSUBADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="xmm {k}, xmm, xmm" xed="VFMSUBADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="xmm {k}, xmm, xmm" xed="VFMSUBADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="xmm {k}, xmm, xmm" xed="VFMSUBADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="xmm {k}, xmm, xmm" xed="VFMSUBADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="xmm {z}, xmm, xmm" xed="VFMSUBADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="xmm {z}, xmm, xmm" xed="VFMSUBADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="xmm {z}, xmm, xmm" xed="VFMSUBADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask3_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="ymm {k}, ymm, ymm" xed="VFMSUBADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="ymm {k}, ymm, ymm" xed="VFMSUBADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="ymm {k}, ymm, ymm" xed="VFMSUBADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="ymm {k}, ymm, ymm" xed="VFMSUBADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="ymm {k}, ymm, ymm" xed="VFMSUBADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="ymm {k}, ymm, ymm" xed="VFMSUBADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="ymm {z}, ymm, ymm" xed="VFMSUBADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="ymm {z}, ymm, ymm" xed="VFMSUBADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="ymm {z}, ymm, ymm" xed="VFMSUBADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="xmm {k}, xmm, xmm" xed="VFMSUBADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="xmm {k}, xmm, xmm" xed="VFMSUBADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="xmm {k}, xmm, xmm" xed="VFMSUBADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="xmm {k}, xmm, xmm" xed="VFMSUBADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="xmm {k}, xmm, xmm" xed="VFMSUBADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="xmm {k}, xmm, xmm" xed="VFMSUBADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="xmm {z}, xmm, xmm" xed="VFMSUBADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="xmm {z}, xmm, xmm" xed="VFMSUBADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="xmm {z}, xmm, xmm" xed="VFMSUBADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
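+<!--
+  Illustrative usage (editor sketch, not part of the Intel data): across the
+  fmsubadd entries above, the _mask form merges inactive lanes from "a",
+  _mask3 from "c", and _maskz zeroes them; even lanes add "c", odd lanes
+  subtract it. Assumes a CPU with AVX512F and AVX512VL, compiled with e.g.
+  gcc -mavx512f -mavx512vl.
+
+    #include <immintrin.h>
+    #include <stdio.h>
+
+    int main(void) {
+        __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes 3..0 */
+        __m128 b = _mm_set1_ps(2.0f);
+        __m128 c = _mm_set1_ps(1.0f);
+        __mmask8 k = 0x5;                    /* lanes 0 and 2 active */
+        __m128 m = _mm_mask_fmsubadd_ps(a, k, b, c);  /* 3 2 7 4 */
+        __m128 z = _mm_maskz_fmsubadd_ps(k, a, b, c); /* 3 0 7 0 */
+        float mf[4], zf[4];
+        _mm_storeu_ps(mf, m);
+        _mm_storeu_ps(zf, z);
+        printf("%g %g %g %g\n", mf[0], mf[1], mf[2], mf[3]);
+        printf("%g %g %g %g\n", zf[0], zf[1], zf[2], zf[3]);
+    }
+-->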
+<intrinsic tech="AVX-512" name="_mm256_mask3_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="ymm {k}, ymm, ymm" xed="VFNMADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="ymm {k}, ymm, ymm" xed="VFNMADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="ymm {k}, ymm, ymm" xed="VFNMADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="ymm {k}, ymm, ymm" xed="VFNMADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="ymm {k}, ymm, ymm" xed="VFNMADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="ymm {k}, ymm, ymm" xed="VFNMADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="ymm {z}, ymm, ymm" xed="VFNMADD132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="ymm {z}, ymm, ymm" xed="VFNMADD213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="ymm {z}, ymm, ymm" xed="VFNMADD231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="xmm {k}, xmm, xmm" xed="VFNMADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="xmm {k}, xmm, xmm" xed="VFNMADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="xmm {k}, xmm, xmm" xed="VFNMADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="xmm {k}, xmm, xmm" xed="VFNMADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="xmm {k}, xmm, xmm" xed="VFNMADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="xmm {k}, xmm, xmm" xed="VFNMADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="xmm {z}, xmm, xmm" xed="VFNMADD132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="xmm {z}, xmm, xmm" xed="VFNMADD213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="xmm {z}, xmm, xmm" xed="VFNMADD231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask3_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="ymm {k}, ymm, ymm" xed="VFNMADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="ymm {k}, ymm, ymm" xed="VFNMADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="ymm {k}, ymm, ymm" xed="VFNMADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="ymm {k}, ymm, ymm" xed="VFNMADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="ymm {k}, ymm, ymm" xed="VFNMADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="ymm {k}, ymm, ymm" xed="VFNMADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="ymm {z}, ymm, ymm" xed="VFNMADD132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="ymm {z}, ymm, ymm" xed="VFNMADD213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="ymm {z}, ymm, ymm" xed="VFNMADD231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="xmm {k}, xmm, xmm" xed="VFNMADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="xmm {k}, xmm, xmm" xed="VFNMADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="xmm {k}, xmm, xmm" xed="VFNMADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="xmm {k}, xmm, xmm" xed="VFNMADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="xmm {k}, xmm, xmm" xed="VFNMADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="xmm {k}, xmm, xmm" xed="VFNMADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="xmm {z}, xmm, xmm" xed="VFNMADD132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="xmm {z}, xmm, xmm" xed="VFNMADD213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="xmm {z}, xmm, xmm" xed="VFNMADD231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
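+<!--
+  Illustrative usage (editor sketch, not part of the Intel data): fnmadd
+  computes -(a*b) + c per lane; with an all-ones mask the _maskz form
+  reduces to the plain operation. Assumes AVX512F and AVX512VL.
+
+    #include <immintrin.h>
+    #include <stdio.h>
+
+    int main(void) {
+        __m256 a = _mm256_set1_ps(3.0f);
+        __m256 b = _mm256_set1_ps(2.0f);
+        __m256 c = _mm256_set1_ps(10.0f);
+        __m256 r = _mm256_maskz_fnmadd_ps(0xFF, a, b, c);
+        float f[8];
+        _mm256_storeu_ps(f, r);
+        printf("%g\n", f[0]);  /* -(3*2) + 10 = 4 in every lane */
+    }
+-->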
+<intrinsic tech="AVX-512" name="_mm256_mask3_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="ymm {k}, ymm, ymm" xed="VFNMSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="ymm {k}, ymm, ymm" xed="VFNMSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="ymm {k}, ymm, ymm" xed="VFNMSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="ymm {k}, ymm, ymm" xed="VFNMSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="ymm {k}, ymm, ymm" xed="VFNMSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="ymm {k}, ymm, ymm" xed="VFNMSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="ymm {z}, ymm, ymm" xed="VFNMSUB132PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="ymm {z}, ymm, ymm" xed="VFNMSUB213PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="ymm {z}, ymm, ymm" xed="VFNMSUB231PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="xmm {k}, xmm, xmm" xed="VFNMSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="xmm {k}, xmm, xmm" xed="VFNMSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="xmm {k}, xmm, xmm" xed="VFNMSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="xmm {k}, xmm, xmm" xed="VFNMSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="xmm {k}, xmm, xmm" xed="VFNMSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="xmm {k}, xmm, xmm" xed="VFNMSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="xmm {z}, xmm, xmm" xed="VFNMSUB132PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="xmm {z}, xmm, xmm" xed="VFNMSUB213PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="xmm {z}, xmm, xmm" xed="VFNMSUB231PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask3_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="ymm {k}, ymm, ymm" xed="VFNMSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="ymm {k}, ymm, ymm" xed="VFNMSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="ymm {k}, ymm, ymm" xed="VFNMSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="ymm {k}, ymm, ymm" xed="VFNMSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="ymm {k}, ymm, ymm" xed="VFNMSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="ymm {k}, ymm, ymm" xed="VFNMSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="ymm {z}, ymm, ymm" xed="VFNMSUB132PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="ymm {z}, ymm, ymm" xed="VFNMSUB213PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="ymm {z}, ymm, ymm" xed="VFNMSUB231PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="xmm {k}, xmm, xmm" xed="VFNMSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="xmm {k}, xmm, xmm" xed="VFNMSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="xmm {k}, xmm, xmm" xed="VFNMSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="xmm {k}, xmm, xmm" xed="VFNMSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="xmm {k}, xmm, xmm" xed="VFNMSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="xmm {k}, xmm, xmm" xed="VFNMSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="xmm {z}, xmm, xmm" xed="VFNMSUB132PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="xmm {z}, xmm, xmm" xed="VFNMSUB213PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="xmm {z}, xmm, xmm" xed="VFNMSUB231PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
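+<!--
+  Illustrative usage (editor sketch, not part of the Intel data): fnmsub is
+  the fourth FMA flavor, -(a*b) - c per lane; the _mask3 form keeps "c" in
+  inactive lanes. Assumes AVX512F and AVX512VL.
+
+    #include <immintrin.h>
+    #include <stdio.h>
+
+    int main(void) {
+        __m128 a = _mm_set1_ps(2.0f);
+        __m128 b = _mm_set1_ps(3.0f);
+        __m128 c = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f); /* lanes 3..0 */
+        __m128 r = _mm_mask3_fnmsub_ps(a, b, c, 0x3); /* lanes 0..1 active */
+        float f[4];
+        _mm_storeu_ps(f, r);
+        printf("%g %g %g %g\n", f[0], f[1], f[2], f[3]); /* -16 -26 30 40 */
+    }
+-->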
+<intrinsic tech="AVX-512" name="_mm256_mmask_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="ymm {k}, vm32x" xed="VGATHERDPD_YMMf64_MASKmskw_MEMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mmask_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="xmm {k}, vm32x" xed="VGATHERDPD_XMMf64_MASKmskw_MEMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mmask_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="ymm {k}, vm32y" xed="VGATHERDPS_YMMf32_MASKmskw_MEMf32_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mmask_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="xmm {k}, vm32x" xed="VGATHERDPS_XMMf32_MASKmskw_MEMf32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
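+<!--
+  Illustrative usage (editor sketch, not part of the Intel data): for the
+  32-bit-index gathers above, "scale" is the element stride in bytes, so 4
+  for a packed float array. Assumes AVX512F and AVX512VL.
+
+    #include <immintrin.h>
+    #include <stdio.h>
+
+    int main(void) {
+        float table[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+        __m128i idx = _mm_set_epi32(6, 4, 2, 0);   /* lanes 3..0 */
+        __m128 src = _mm_set1_ps(99.0f);
+        /* all lanes active: gathers every other element */
+        __m128 r = _mm_mmask_i32gather_ps(src, 0xF, idx, table, 4);
+        float f[4];
+        _mm_storeu_ps(f, r);
+        printf("%g %g %g %g\n", f[0], f[1], f[2], f[3]); /* 0 2 4 6 */
+    }
+-->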
+<intrinsic tech="AVX-512" name="_mm256_mmask_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERQPD" form="ymm {k}, vm64y" xed="VGATHERQPD_YMMf64_MASKmskw_MEMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mmask_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERQPD" form="xmm {k}, vm64x" xed="VGATHERQPD_XMMf64_MASKmskw_MEMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mmask_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGATHERQPS" form="ymm {k}, vm64y" xed="VGATHERQPS_YMMf32_MASKmskw_MEMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mmask_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VGATHERQPS" form="xmm {k}, vm64x" xed="VGATHERQPS_XMMf32_MASKmskw_MEMf32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
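+<!--
+  Illustrative usage (editor sketch, not part of the Intel data): a
+  64-bit-index gather with a partial mask; inactive lanes keep "src".
+  Assumes AVX512F and AVX512VL.
+
+    #include <immintrin.h>
+    #include <stdio.h>
+
+    int main(void) {
+        double table[4] = {10, 20, 30, 40};
+        __m128i idx = _mm_set_epi64x(3, 1);   /* lane 1 = 3, lane 0 = 1 */
+        __m128d src = _mm_set1_pd(-1.0);
+        /* only lane 0 active: lane 1 keeps src */
+        __m128d r = _mm_mmask_i64gather_pd(src, 0x1, idx, table, 8);
+        double d[2];
+        _mm_storeu_pd(d, r);
+        printf("%g %g\n", d[0], d[1]);  /* 20 -1 */
+    }
+-->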
+<intrinsic tech="AVX-512" name="_mm256_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="ymm, ymm" xed="VGETEXPPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="ymm {k}, ymm" xed="VGETEXPPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="ymm {z}, ymm" xed="VGETEXPPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="xmm, xmm" xed="VGETEXPPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="xmm {k}, xmm" xed="VGETEXPPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="xmm {z}, xmm" xed="VGETEXPPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="ymm, ymm" xed="VGETEXPPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="ymm {k}, ymm" xed="VGETEXPPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="ymm {z}, ymm" xed="VGETEXPPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="xmm, xmm" xed="VGETEXPPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="xmm {k}, xmm" xed="VGETEXPPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="xmm {z}, xmm" xed="VGETEXPPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
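+<!-- Editor's note: the following C sketch is illustrative only and is not part
+     of the upstream Intel data file. It shows the getexp family defined above,
+     assuming an AVX512F/AVX512VL-capable CPU and a compiler run with
+     -mavx512f -mavx512vl. For positive normal inputs, each lane of the result
+     is floor(log2(x)) expressed as a float.
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128 a = _mm_set_ps(32.0f, 8.0f, 1.5f, 0.25f); // lanes: 0.25, 1.5, 8, 32
+         __m128 e = _mm_getexp_ps(a);                     // lanes: -2, 0, 3, 5
+         float out[4];
+         _mm_storeu_ps(out, e);
+         return (int)out[3];                              // 5
+     }
+-->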
+<intrinsic tech="AVX-512" name="_mm256_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="ymm, ymm, imm8" xed="VGETMANTPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="ymm {k}, ymm, imm8" xed="VGETMANTPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="ymm {z}, ymm, imm8" xed="VGETMANTPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="xmm, xmm, imm8" xed="VGETMANTPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="xmm {k}, xmm, imm8" xed="VGETMANTPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="xmm {z}, xmm, imm8" xed="VGETMANTPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
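+<!-- Editor's note: an illustrative C sketch (not part of the upstream data) for
+     the VGETMANTPD entries above, assuming AVX512F/AVX512VL support. With the
+     interval _MM_MANT_NORM_1_2 the mantissa is normalized into [1, 2), and
+     _MM_MANT_SIGN_src keeps the source sign, so 48.0 = 1.5 * 2^5 yields 1.5.
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128d a = _mm_set1_pd(48.0);
+         __m128d m = _mm_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+         double out[2];
+         _mm_storeu_pd(out, m);                           // out[0] == out[1] == 1.5
+         return 0;
+     }
+-->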
+<intrinsic tech="AVX-512" name="_mm256_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="ymm, ymm, imm8" xed="VGETMANTPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="ymm {k}, ymm, imm8" xed="VGETMANTPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="ymm {z}, ymm, imm8" xed="VGETMANTPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="xmm, xmm, imm8" xed="VGETMANTPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="xmm {k}, xmm, imm8" xed="VGETMANTPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="xmm {z}, xmm, imm8" xed="VGETMANTPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
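+<!-- Editor's note: illustrative writemask usage (not upstream data) for the
+     VGETMANTPS entries above, under the same AVX512F/AVX512VL assumptions.
+     Lanes whose mask bit is clear are copied from "src" instead of computed.
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m256 src = _mm256_set1_ps(9.0f);
+         __m256 a   = _mm256_set1_ps(6.0f);               // 6 = 1.5 * 2^2
+         __mmask8 k = 0x0F;                               // lanes 0..3 active
+         __m256 m = _mm256_mask_getmant_ps(src, k, a,
+                        _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+         // lanes 0..3 hold 1.5, lanes 4..7 hold 9.0 (copied from src)
+         return 0;
+     }
+-->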
+<intrinsic tech="AVX-512" name="_mm256_insertf32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF32X4" form="ymm, ymm, xmm, imm8" xed="VINSERTF32X4_YMMf32_MASKmskw_YMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_insertf32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF32X4" form="ymm {k}, ymm, xmm, imm8" xed="VINSERTF32X4_YMMf32_MASKmskw_YMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_insertf32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTF32X4" form="ymm {z}, ymm, xmm, imm8" xed="VINSERTF32X4_YMMf32_MASKmskw_YMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
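+<!-- Editor's note: illustrative only (not upstream data). VINSERTF32X4 with
+     imm8 = 1 replaces the upper 128-bit half while the lower half is kept;
+     assumes AVX512F/AVX512VL as above.
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m256 a = _mm256_set1_ps(1.0f);
+         __m128 b = _mm_set1_ps(2.0f);
+         __m256 r = _mm256_insertf32x4(a, b, 1);          // lanes 0..3 = 1, 4..7 = 2
+         float out[8];
+         _mm256_storeu_ps(out, r);
+         return (int)out[7];                              // 2
+     }
+-->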
+<intrinsic tech="AVX-512" name="_mm256_inserti32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTI32X4" form="ymm, ymm, xmm, imm8" xed="VINSERTI32X4_YMMu32_MASKmskw_YMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_inserti32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTI32X4" form="ymm {k}, ymm, xmm, imm8" xed="VINSERTI32X4_YMMu32_MASKmskw_YMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_inserti32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VINSERTI32X4" form="ymm {z}, ymm, xmm, imm8" xed="VINSERTI32X4_YMMu32_MASKmskw_YMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
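+<!-- Editor's note: illustrative only (not upstream data). The masked integer
+     variant first builds the inserted value in "tmp", then blends it with
+     "src" one 32-bit lane at a time; assumes AVX512F/AVX512VL as above.
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m256i src = _mm256_set1_epi32(0);
+         __m256i a   = _mm256_set1_epi32(7);
+         __m128i b   = _mm_set1_epi32(9);
+         __mmask8 k  = 0x0F;                              // keep lanes 0..3 of tmp
+         __m256i r = _mm256_mask_inserti32x4(src, k, a, b, 0);
+         // lanes 0..3 = 9 (from tmp), lanes 4..7 = 0 (from src)
+         int out[8];
+         _mm256_storeu_si256((__m256i *)out, r);
+         return out[0];                                   // 9
+     }
+-->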
+<intrinsic tech="AVX-512" name="_mm256_mask_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMAXPD" form="ymm {k}, ymm, ymm" xed="VMAXPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMAXPD" form="ymm {z}, ymm, ymm" xed="VMAXPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXPD" form="xmm {k}, xmm, xmm" xed="VMAXPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXPD" form="xmm {z}, xmm, xmm" xed="VMAXPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMAXPS" form="ymm {k}, ymm, ymm" xed="VMAXPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMAXPS" form="ymm {z}, ymm, ymm" xed="VMAXPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXPS" form="xmm {k}, xmm, xmm" xed="VMAXPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXPS" form="xmm {z}, xmm, xmm" xed="VMAXPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
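+<!-- Editor's note: illustrative only (not upstream data). A zeromask turns
+     inactive lanes into 0.0 rather than copying them from a source operand;
+     assumes AVX512F/AVX512VL as above.
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m256 a = _mm256_set1_ps(1.0f);
+         __m256 b = _mm256_set1_ps(3.0f);
+         __mmask8 k = 0xAA;                               // odd lanes active
+         __m256 r = _mm256_maskz_max_ps(k, a, b);
+         // even lanes = 0.0, odd lanes = max(1, 3) = 3.0
+         return 0;
+     }
+-->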
+<intrinsic tech="AVX-512" name="_mm256_mask_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMINPD" form="ymm {k}, ymm, ymm" xed="VMINPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMINPD" form="ymm {z}, ymm, ymm" xed="VMINPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINPD" form="xmm {k}, xmm, xmm" xed="VMINPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINPD" form="xmm {z}, xmm, xmm" xed="VMINPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMINPS" form="ymm {k}, ymm, ymm" xed="VMINPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMINPS" form="ymm {z}, ymm, ymm" xed="VMINPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINPS" form="xmm {k}, xmm, xmm" xed="VMINPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINPS" form="xmm {z}, xmm, xmm" xed="VMINPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
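+<!-- Editor's note: illustrative only (not upstream data). The writemask form
+     of VMINPD computes minima only where the mask bit is set; assumes
+     AVX512F/AVX512VL as above.
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128d src = _mm_set1_pd(10.0);
+         __m128d a   = _mm_set1_pd(2.0);
+         __m128d b   = _mm_set1_pd(5.0);
+         __m128d r   = _mm_mask_min_pd(src, 0x1, a, b);   // lane 0 active
+         double out[2];
+         _mm_storeu_pd(out, r);                           // out = { 2.0, 10.0 }
+         return 0;
+     }
+-->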
+<intrinsic tech="AVX-512" name="_mm256_mask_load_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="ymm {k}, m64" xed="VMOVAPD_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_mov_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="ymm {k}, ymm" xed="VMOVAPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_store_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPD" form="m256 {k}, ymm" xed="VMOVAPD_MEMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_load_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="ymm {z}, m256" xed="VMOVAPD_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mov_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="ymm {z}, ymm" xed="VMOVAPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_load_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="xmm {k}, m128" xed="VMOVAPD_XMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mov_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="xmm {k}, xmm" xed="VMOVAPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_store_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPD" form="m128 {k}, xmm" xed="VMOVAPD_MEMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_load_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="xmm {z}, m128" xed="VMOVAPD_XMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mov_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="xmm {z}, xmm" xed="VMOVAPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
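+<!-- Editor's note: illustrative only (not upstream data). The aligned masked
+     load/store pair above requires 32-byte alignment for the 256-bit forms;
+     a masked store leaves memory untouched where the mask bit is clear.
+     Assumes AVX512F/AVX512VL as above.
+
+     #include <immintrin.h>
+     #include <stdalign.h>
+
+     int main(void) {
+         alignas(32) double in[4]  = { 1.0, 2.0, 3.0, 4.0 };
+         alignas(32) double out[4] = { 9.0, 9.0, 9.0, 9.0 };
+         __mmask8 k = 0x5;                                // lanes 0 and 2
+         __m256d v = _mm256_mask_load_pd(_mm256_setzero_pd(), k, in);
+         // v = { 1.0, 0.0, 3.0, 0.0 }
+         _mm256_mask_store_pd(out, k, v);
+         // out = { 1.0, 9.0, 3.0, 9.0 }; lanes 1 and 3 were not written
+         return 0;
+     }
+-->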
+<intrinsic tech="AVX-512" name="_mm256_mask_load_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="ymm {k}, m256" xed="VMOVAPS_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_mov_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="ymm {k}, ymm" xed="VMOVAPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_store_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPS" form="m256 {k}, ymm" xed="VMOVAPS_MEMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_load_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="ymm {z}, m256" xed="VMOVAPS_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mov_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="ymm {z}, ymm" xed="VMOVAPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_load_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="xmm {k}, m128" xed="VMOVAPS_XMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mov_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="xmm {k}, xmm" xed="VMOVAPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_store_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPS" form="m128 {k}, xmm" xed="VMOVAPS_MEMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_load_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="xmm {z}, m128" xed="VMOVAPS_XMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mov_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="xmm {z}, xmm" xed="VMOVAPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
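+<!-- Editor's note (illustrative, not upstream Intel data): the writemask (mask_)
+     and zeromask (maskz_) move variants differ only in what lands in unselected
+     lanes. A hedged sketch, assuming <immintrin.h>:
+
+__m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   /* lanes {1,2,3,4}, low to high */
+__m128 s = _mm_set1_ps(9.0f);
+__mmask8 k = 0x5;                                /* select lanes 0 and 2 */
+__m128 w = _mm_mask_mov_ps(s, k, a);             /* {1, 9, 3, 9}: lanes from s where k is 0 */
+__m128 z = _mm_maskz_mov_ps(k, a);               /* {1, 0, 3, 0}: zeroed where k is 0 */
+-->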
+<intrinsic tech="AVX-512" name="_mm256_mask_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+tmp[191:128] := a[191:128]
+tmp[255:192] := a[191:128]
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDDUP" form="ymm {k}, ymm" xed="VMOVDDUP_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+tmp[191:128] := a[191:128]
+tmp[255:192] := a[191:128]
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDDUP" form="ymm {z}, ymm" xed="VMOVDDUP_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDDUP" form="xmm {k}, xmm" xed="VMOVDDUP_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDDUP" form="xmm {z}, xmm" xed="VMOVDDUP_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
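+<!-- Editor's note (illustrative, not upstream Intel data): VMOVDDUP broadcasts
+     each even-indexed lane into the odd lane above it, then the mask selects the
+     surviving lanes. A minimal sketch, assuming <immintrin.h>:
+
+__m128d a = _mm_set_pd(2.0, 7.0);             /* lanes {7.0, 2.0} */
+__m128d d = _mm_maskz_movedup_pd(0x3, a);     /* lane 0 duplicated: {7.0, 7.0} */
+-->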
+<intrinsic tech="AVX-512" name="_mm256_mask_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="ymm {k}, m64" xed="VMOVDQA32_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_mov_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="ymm {k}, ymm" xed="VMOVDQA32_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_store_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Store packed 32-bit integers from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA32" form="m256 {k}, ymm" xed="VMOVDQA32_MEMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="ymm {z}, m64" xed="VMOVDQA32_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mov_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="ymm {z}, ymm" xed="VMOVDQA32_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="xmm {k}, m64" xed="VMOVDQA32_XMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mov_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="xmm {k}, xmm" xed="VMOVDQA32_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_store_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Store packed 32-bit integers from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA32" form="m128 {k}, xmm" xed="VMOVDQA32_MEMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="xmm {z}, m64" xed="VMOVDQA32_XMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mov_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="xmm {z}, xmm" xed="VMOVDQA32_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
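+<!-- Editor's note (illustrative, not upstream Intel data): the aligned masked
+     load/store pair supports read-modify-write of selected lanes only. A hedged
+     sketch, assuming <immintrin.h>, <stdint.h>, and a 16-byte-aligned "p":
+
+void double_selected(int32_t *p, __mmask8 k) {
+    __m128i v = _mm_maskz_load_epi32(k, p);   /* masked-off lanes read as 0 */
+    v = _mm_slli_epi32(v, 1);                 /* shift left by 1 = multiply by 2 */
+    _mm_mask_store_epi32(p, k, v);            /* masked-off memory stays untouched */
+}
+-->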
+<intrinsic tech="AVX-512" name="_mm256_mask_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="ymm {k}, m64" xed="VMOVDQA64_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_mov_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="ymm {k}, ymm" xed="VMOVDQA64_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_store_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Store packed 64-bit integers from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA64" form="m256 {k}, ymm" xed="VMOVDQA64_MEMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="ymm {z}, m64" xed="VMOVDQA64_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mov_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="ymm {z}, ymm" xed="VMOVDQA64_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="xmm {k}, m64" xed="VMOVDQA64_XMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mov_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="xmm {k}, xmm" xed="VMOVDQA64_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_store_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Store packed 64-bit integers from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA64" form="m128 {k}, xmm" xed="VMOVDQA64_MEMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="xmm {z}, m64" xed="VMOVDQA64_XMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mov_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="xmm {z}, xmm" xed="VMOVDQA64_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
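+<!-- Editor's note (illustrative, not upstream Intel data): mask_mov also acts as
+     a two-source blend driven by a mask register. A minimal sketch, assuming
+     <immintrin.h>:
+
+__m128i pick_low_lane(__m128i fallback, __m128i a) {
+    return _mm_mask_mov_epi64(fallback, 0x1, a);   /* lane 0 from a, lane 1 from fallback */
+}
+-->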
+<intrinsic tech="AVX-512" name="_mm256_mask_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="ymm {k}, m64" xed="VMOVDQU32_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Store packed 32-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU32" form="m256 {k}, ymm" xed="VMOVDQU32_MEMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="ymm {z}, m64" xed="VMOVDQU32_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="xmm {k}, m64" xed="VMOVDQU32_XMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Store packed 32-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU32" form="m128 {k}, xmm" xed="VMOVDQU32_MEMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="xmm {z}, m64" xed="VMOVDQU32_XMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
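+<!-- Editor's note (illustrative, not upstream Intel data): maskz_loadu removes
+     the scalar tail loop from reductions, since masked-off lanes suppress faults
+     and read as zero. A hedged sketch, assuming <immintrin.h>, <stdint.h>, and
+     <stddef.h>:
+
+int32_t sum_i32(const int32_t *p, size_t n) {
+    __m128i acc = _mm_setzero_si128();
+    size_t i = 0;
+    for (; i + 4 <= n; i += 4)
+        acc = _mm_add_epi32(acc, _mm_loadu_si128((const __m128i *)(p + i)));
+    __mmask8 k = (__mmask8)((1u << (n - i)) - 1);             /* 0 to 3 remaining lanes */
+    acc = _mm_add_epi32(acc, _mm_maskz_loadu_epi32(k, p + i));
+    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0x4E));   /* add upper half to lower */
+    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0xB1));   /* add adjacent pairs */
+    return _mm_cvtsi128_si32(acc);
+}
+-->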
+<intrinsic tech="AVX-512" name="_mm256_mask_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="ymm {k}, m64" xed="VMOVDQU64_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_storeu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Store packed 64-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU64" form="m256 {k}, ymm" xed="VMOVDQU64_MEMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="ymm {z}, m64" xed="VMOVDQU64_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="xmm {k}, m64" xed="VMOVDQU64_XMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_storeu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Store packed 64-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU64" form="m128 {k}, xmm" xed="VMOVDQU64_MEMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="xmm {z}, m64" xed="VMOVDQU64_XMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
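+<!-- Editor's note (illustrative, not upstream Intel data): the unaligned masked
+     load/store pair gives a lane-selective copy with no alignment requirement.
+     A minimal sketch, assuming <immintrin.h> and <stdint.h>:
+
+void copy_selected_i64(int64_t *dst, const int64_t *src2, __mmask8 k) {
+    __m128i v = _mm_maskz_loadu_epi64(k, src2);
+    _mm_mask_storeu_epi64(dst, k, v);          /* only selected lanes are written */
+}
+-->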
+<intrinsic tech="AVX-512" name="_mm256_mask_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[63:32]
+tmp[63:32] := a[63:32]
+tmp[95:64] := a[127:96]
+tmp[127:96] := a[127:96]
+tmp[159:128] := a[191:160]
+tmp[191:160] := a[191:160]
+tmp[223:192] := a[255:224]
+tmp[255:224] := a[255:224]
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVSHDUP" form="ymm {k}, ymm" xed="VMOVSHDUP_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[63:32]
+tmp[63:32] := a[63:32]
+tmp[95:64] := a[127:96]
+tmp[127:96] := a[127:96]
+tmp[159:128] := a[191:160]
+tmp[191:160] := a[191:160]
+tmp[223:192] := a[255:224]
+tmp[255:224] := a[255:224]
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVSHDUP" form="ymm {z}, ymm" xed="VMOVSHDUP_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[63:32]
+tmp[63:32] := a[63:32]
+tmp[95:64] := a[127:96]
+tmp[127:96] := a[127:96]
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVSHDUP" form="xmm {k}, xmm" xed="VMOVSHDUP_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[63:32]
+tmp[63:32] := a[63:32]
+tmp[95:64] := a[127:96]
+tmp[127:96] := a[127:96]
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVSHDUP" form="xmm {z}, xmm" xed="VMOVSHDUP_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[31:0]
+tmp[63:32] := a[31:0]
+tmp[95:64] := a[95:64]
+tmp[127:96] := a[95:64]
+tmp[159:128] := a[159:128]
+tmp[191:160] := a[159:128]
+tmp[223:192] := a[223:192]
+tmp[255:224] := a[223:192]
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVSLDUP" form="ymm {k}, ymm" xed="VMOVSLDUP_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[31:0]
+tmp[63:32] := a[31:0]
+tmp[95:64] := a[95:64]
+tmp[127:96] := a[95:64]
+tmp[159:128] := a[159:128]
+tmp[191:160] := a[159:128]
+tmp[223:192] := a[223:192]
+tmp[255:224] := a[223:192]
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVSLDUP" form="ymm {z}, ymm" xed="VMOVSLDUP_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[31:0]
+tmp[63:32] := a[31:0]
+tmp[95:64] := a[95:64]
+tmp[127:96] := a[95:64]
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVSLDUP" form="xmm {k}, xmm" xed="VMOVSLDUP_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[31:0]
+tmp[63:32] := a[31:0]
+tmp[95:64] := a[95:64]
+tmp[127:96] := a[95:64]
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVSLDUP" form="xmm {z}, xmm" xed="VMOVSLDUP_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
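+<!-- Editor's note (illustrative, not upstream Intel data): with interleaved
+     complex data (re, im, re, im, ...), moveldup/movehdup broadcast the real and
+     imaginary parts across each pair, a common building block for complex
+     multiplication. A minimal sketch, assuming <immintrin.h>:
+
+__m128 a  = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   /* lanes {1,2,3,4} */
+__m128 re = _mm_maskz_moveldup_ps(0xF, a);        /* even lanes duplicated: {1,1,3,3} */
+__m128 im = _mm_maskz_movehdup_ps(0xF, a);        /* odd lanes duplicated:  {2,2,4,4} */
+-->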
+<intrinsic tech="AVX-512" name="_mm256_mask_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVUPD" form="ymm {k}, m64" xed="VMOVUPD_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_storeu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVUPD" form="m256 {k}, ymm" xed="VMOVUPD_MEMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="256"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVUPD" form="ymm {z}, m64" xed="VMOVUPD_YMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVUPD" form="xmm {k}, m64" xed="VMOVUPD_XMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_storeu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVUPD" form="m128 {k}, xmm" xed="VMOVUPD_MEMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVUPD" form="xmm {z}, m64" xed="VMOVUPD_XMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVUPS" form="ymm {k}, m64" xed="VMOVUPS_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_storeu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVUPS" form="m256 {k}, ymm" xed="VMOVUPS_MEMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="256"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVUPS" form="ymm {z}, m64" xed="VMOVUPS_YMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVUPS" form="xmm {k}, m64" xed="VMOVUPS_XMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_storeu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVUPS" form="m128 {k}, xmm" xed="VMOVUPS_MEMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVUPS" form="xmm {z}, m64" xed="VMOVUPS_XMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
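+<!-- Editor's note (illustrative, not upstream Intel data): unaligned masked
+     loads/stores let a whole array be processed in vector code regardless of its
+     length or alignment. A hedged sketch, assuming <immintrin.h> and <stddef.h>:
+
+void scale_inplace(float *p, size_t n, float s) {
+    __m128 vs = _mm_set1_ps(s);
+    for (size_t i = 0; i < n; i += 4) {
+        size_t rem = n - i;
+        __mmask8 k = (rem >= 4) ? (__mmask8)0xF : (__mmask8)((1u << rem) - 1);
+        __m128 v = _mm_maskz_loadu_ps(k, p + i);
+        _mm_mask_storeu_ps(p + i, k, _mm_mul_ps(v, vs));
+    }
+}
+-->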
+<intrinsic tech="AVX-512" name="_mm256_mask_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMULPD" form="ymm {k}, ymm, ymm" xed="VMULPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMULPD" form="ymm {z}, ymm, ymm" xed="VMULPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULPD" form="xmm {k}, xmm, xmm" xed="VMULPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULPD" form="xmm {z}, xmm, xmm" xed="VMULPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMULPS" form="ymm {k}, ymm, ymm" xed="VMULPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMULPS" form="ymm {z}, ymm, ymm" xed="VMULPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULPS" form="xmm {k}, xmm, xmm" xed="VMULPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULPS" form="xmm {z}, xmm, xmm" xed="VMULPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_abs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ABS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSD" form="ymm {k}, ymm" xed="VPABSD_YMMi32_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_abs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ABS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSD" form="ymm {z}, ymm" xed="VPABSD_YMMi32_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_abs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ABS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSD" form="xmm {k}, xmm" xed="VPABSD_XMMi32_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_abs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ABS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSD" form="xmm {z}, xmm" xed="VPABSD_XMMi32_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
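+<!-- A minimal sketch of the masked abs pattern above, assuming AVX512F and
+     AVX512VL; values are illustrative only.
+       __m256i v = _mm256_set_epi32(-8, 7, -6, 5, -4, 3, -2, 1); /* lanes 7..0 */
+       __m256i r = _mm256_maskz_abs_epi32(0x0F, v);
+       /* r = { 1, 2, 3, 4, 0, 0, 0, 0 }  (lanes 0..3 get |v|, lanes 4..7 zeroed) */
+-->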
+<intrinsic tech="AVX-512" name="_mm256_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ABS(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSQ" form="ymm, ymm" xed="VPABSQ_YMMi64_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ABS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSQ" form="ymm {k}, ymm" xed="VPABSQ_YMMi64_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ABS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPABSQ" form="ymm {z}, ymm" xed="VPABSQ_YMMi64_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ABS(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSQ" form="xmm, xmm" xed="VPABSQ_XMMi64_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ABS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSQ" form="xmm {k}, xmm" xed="VPABSQ_XMMi64_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ABS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPABSQ" form="xmm {z}, xmm" xed="VPABSQ_XMMi64_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
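+<!-- Unlike the 32-bit form, packed 64-bit absolute value (VPABSQ) has no
+     SSE/AVX2 predecessor; with AVX512VL it becomes usable on 128/256-bit
+     vectors. A minimal sketch with illustrative values:
+       __m128i v = _mm_set_epi64x(-9, 4);   /* lane 1 = -9, lane 0 = 4 */
+       __m128i r = _mm_abs_epi64(v);
+       /* r = { 4, 9 }  (lanes 0..1) */
+-->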
+<intrinsic tech="AVX-512" name="_mm256_mask_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDD" form="ymm {k}, ymm, ymm" xed="VPADDD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDD" form="ymm {z}, ymm, ymm" xed="VPADDD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDD" form="xmm {k}, xmm, xmm" xed="VPADDD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDD" form="xmm {z}, xmm, xmm" xed="VPADDD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDQ" form="ymm {k}, ymm, ymm" xed="VPADDQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+		dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPADDQ" form="ymm {z}, ymm, ymm" xed="VPADDQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDQ" form="xmm {k}, xmm, xmm" xed="VPADDQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPADDQ" form="xmm {z}, xmm, xmm" xed="VPADDQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
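+<!-- The masked adds above give a branch-free conditional accumulate: lanes
+     whose mask bit is set take a+b, the rest keep the accumulator. A sketch
+     under illustrative values:
+       __m256i acc = _mm256_set1_epi64x(100);
+       __m256i inc = _mm256_set1_epi64x(1);
+       acc = _mm256_mask_add_epi64(acc, 0x3, acc, inc);
+       /* acc = { 101, 101, 100, 100 }  (only lanes 0 and 1 advance) */
+-->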
+<intrinsic tech="AVX-512" name="_mm256_mask_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDD" form="ymm {k}, ymm, ymm" xed="VPANDD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDD" form="ymm {z}, ymm, ymm" xed="VPANDD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPANDD" form="xmm {k}, xmm, xmm" xed="VPANDD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPANDD" form="xmm {z}, xmm, xmm" xed="VPANDD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
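+<!-- VPANDD and VPANDQ produce the same result bits for a full-vector AND;
+     they differ only in the lane width the mask applies to. A sketch with
+     illustrative values:
+       __m256i a = _mm256_set1_epi32(0xFF);
+       __m256i b = _mm256_set1_epi32(0x0F);
+       __m256i r = _mm256_maskz_and_epi32(0x01, a, b);
+       /* r = { 0x0F, 0, 0, 0, 0, 0, 0, 0 }  (only lane 0 survives) */
+-->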
+<intrinsic tech="AVX-512" name="_mm256_mask_andnot_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDND" form="ymm {k}, ymm, ymm" xed="VPANDND_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_andnot_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDND" form="ymm {z}, ymm, ymm" xed="VPANDND_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_andnot_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPANDND" form="xmm {k}, xmm, xmm" xed="VPANDND_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_andnot_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPANDND" form="xmm {z}, xmm, xmm" xed="VPANDND_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_andnot_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDNQ" form="ymm {k}, ymm, ymm" xed="VPANDNQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_andnot_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDNQ" form="ymm {z}, ymm, ymm" xed="VPANDNQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_andnot_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPANDNQ" form="xmm {k}, xmm, xmm" xed="VPANDNQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_andnot_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPANDNQ" form="xmm {z}, xmm, xmm" xed="VPANDNQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
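+<!-- Note the operand order in the andnot family: the FIRST argument is
+     negated, so these compute (NOT a) AND b, which clears from "b" the bits
+     set in "a". A sketch with illustrative values:
+       __m128i mask = _mm_set1_epi64x(0xFF);     /* bits to clear */
+       __m128i data = _mm_set1_epi64x(0x1234);
+       __m128i r = _mm_maskz_andnot_epi64(0x3, mask, data);
+       /* r = { 0x1200, 0x1200 }  ((NOT 0xFF) AND 0x1234 in both lanes) */
+-->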
+<intrinsic tech="AVX-512" name="_mm256_mask_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDQ" form="ymm {k}, ymm, ymm" xed="VPANDQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPANDQ" form="ymm {z}, ymm, ymm" xed="VPANDQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPANDQ" form="xmm {k}, xmm, xmm" xed="VPANDQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPANDQ" form="xmm {z}, xmm, xmm" xed="VPANDQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_blend_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBLENDMD" form="ymm {k}, ymm, ymm" xed="VPBLENDMD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_blend_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBLENDMD" form="xmm {k}, xmm, xmm" xed="VPBLENDMD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_blend_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBLENDMQ" form="ymm {k}, ymm, ymm" xed="VPBLENDMQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_blend_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBLENDMQ" form="xmm {k}, xmm, xmm" xed="VPBLENDMQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
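+<!-- mask_blend is a pure per-lane select: a set bit picks "b", a clear bit
+     picks "a"; no arithmetic is performed. A sketch with illustrative values:
+       __m256i a = _mm256_set1_epi32(0);
+       __m256i b = _mm256_set1_epi32(1);
+       __m256i r = _mm256_mask_blend_epi32(0xF0, a, b);
+       /* r = { 0, 0, 0, 0, 1, 1, 1, 1 }  (lanes 4..7 come from b) */
+-->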
+<intrinsic tech="AVX-512" name="_mm256_mask_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="ymm {k}, xmm" xed="VPBROADCASTD_YMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_set1_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="ymm {k}, r32" xed="VPBROADCASTD_YMMu32_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="ymm {z}, xmm" xed="VPBROADCASTD_YMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_set1_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="ymm {z}, r32" xed="VPBROADCASTD_YMMu32_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="xmm {k}, xmm" xed="VPBROADCASTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_set1_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="xmm {k}, r32" xed="VPBROADCASTD_XMMu32_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="xmm {z}, xmm" xed="VPBROADCASTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_set1_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="xmm {z}, r32" xed="VPBROADCASTD_XMMu32_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
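+<!-- broadcastd_epi32 and set1_epi32 map to the same VPBROADCASTD; they differ
+     only in where the scalar comes from (the low lane of an xmm vs. a GPR).
+     A sketch with illustrative values:
+       __m256i r = _mm256_maskz_set1_epi32(0x55, 7);
+       /* r = { 7, 0, 7, 0, 7, 0, 7, 0 }  (even lanes get 7) */
+-->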
+<intrinsic tech="AVX-512" name="_mm256_mask_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="ymm {k}, xmm" xed="VPBROADCASTQ_YMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_set1_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="ymm {k}, r64" xed="VPBROADCASTQ_YMMu64_MASKmskw_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="ymm {z}, xmm" xed="VPBROADCASTQ_YMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_set1_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="ymm {z}, r64" xed="VPBROADCASTQ_YMMu64_MASKmskw_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="xmm {k}, xmm" xed="VPBROADCASTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_set1_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="xmm {k}, r64" xed="VPBROADCASTQ_XMMu64_MASKmskw_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="xmm {z}, xmm" xed="VPBROADCASTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_set1_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="xmm {z}, r64" xed="VPBROADCASTQ_XMMu64_MASKmskw_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
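+<!-- Same pattern at 64-bit granularity; note that the r64 form of
+     VPBROADCASTQ is only encodable in 64-bit mode. A sketch with
+     illustrative values:
+       __m128i r = _mm_maskz_set1_epi64(0x1, 42);
+       /* r = { 42, 0 }  (lane 1 zeroed) */
+-->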
+<intrinsic tech="AVX-512" name="_mm256_cmp_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, ymm, ymm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpeq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpge_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpgt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmple_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmplt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpneq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, ymm, ymm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
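+<!-- Usage sketch for the zeromasked, predicate-driven compare above: the
+     imm8 predicate is one of the _MM_CMPINT_ENUM values listed in the
+     operation, and lanes whose k1 bit is clear come back as 0. Assumes
+     AVX512F and AVX512VL; the name and inputs are illustrative.
+
+     #include <immintrin.h>
+
+     __mmask8 demo_mask_cmp_epi32(void) {
+         __mmask8 k1 = 0x0F;                        /* keep only lanes 0..3 */
+         __m256i a = _mm256_setr_epi32(-1, 5, 3, 7, -1, 5, 3, 7);
+         __m256i b = _mm256_setr_epi32( 0, 4, 3, 8,  0, 4, 3, 8);
+         /* Lanes 0..3 under _MM_CMPINT_LT: -1 < 0 holds, 5 < 4 fails,
+            3 < 3 fails, 7 < 8 holds; lanes 4..7 are zeroed by k1, so
+            the result should be 0x09. */
+         return _mm256_mask_cmp_epi32_mask(k1, a, b, _MM_CMPINT_LT);
+     }
+-->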
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpeq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpge_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpgt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmple_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmplt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpneq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, ymm, ymm" xed="VPCMPD_MASKmskw_MASKmskw_YMMi32_YMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, xmm, xmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
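+<!-- The named helpers in this family fix the predicate of the generic form
+     above; e.g. _MM_CMPINT_NLE is the signed greater-than test, so the two
+     calls below should yield the same mask for any inputs. A sketch
+     assuming AVX512F and AVX512VL; the function name is illustrative.
+
+     #include <immintrin.h>
+
+     int demo_cmp_epi32_equiv(__m128i a, __m128i b) {
+         __mmask8 k_gen = _mm_cmp_epi32_mask(a, b, _MM_CMPINT_NLE);
+         __mmask8 k_gt  = _mm_cmpgt_epi32_mask(a, b);
+         return k_gen == k_gt;   /* expected to be 1 */
+     }
+-->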
+<intrinsic tech="AVX-512" name="_mm_cmpeq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpge_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpgt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmple_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmplt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpneq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, xmm, xmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpeq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpge_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpgt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmple_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmplt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpneq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, xmm, xmm" xed="VPCMPD_MASKmskw_MASKmskw_XMMi32_XMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
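+<!-- Zeromasked compares chain naturally: feeding one compare's result in as
+     k1 of the next computes a per-lane conjunction, here a range test
+     lo <= x < hi. A sketch assuming AVX512F and AVX512VL; the name is
+     illustrative.
+
+     #include <immintrin.h>
+
+     __mmask8 demo_in_range(__m128i x, __m128i lo, __m128i hi) {
+         __mmask8 ge_lo = _mm_cmpge_epi32_mask(x, lo);
+         /* Lanes failing x >= lo are zeroed by ge_lo, so the final mask
+            is set only where both conditions hold. */
+         return _mm_mask_cmplt_epi32_mask(ge_lo, x, hi);
+     }
+-->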
+<intrinsic tech="AVX-512" name="_mm256_cmp_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, ymm, ymm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
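+<!-- With 64-bit lanes a 256-bit vector holds four elements, so only bits
+     0..3 of the returned __mmask8 are meaningful (k[MAX:4] := 0 in the
+     operation above). A sketch assuming AVX512F and AVX512VL; the name and
+     inputs are illustrative.
+
+     #include <immintrin.h>
+
+     __mmask8 demo_cmp_epi64(void) {
+         __m256i a = _mm256_setr_epi64x(1, 2, 3, 4);
+         __m256i b = _mm256_setr_epi64x(1, 0, 3, 9);
+         /* _MM_CMPINT_EQ: lanes 0 and 2 match, so the mask should be 0x05. */
+         return _mm256_cmp_epi64_mask(a, b, _MM_CMPINT_EQ);
+     }
+-->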
+<intrinsic tech="AVX-512" name="_mm256_cmpeq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpge_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpgt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmple_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmplt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpneq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, ymm, ymm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpeq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpge_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpgt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmple_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmplt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpneq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, ymm, ymm" xed="VPCMPQ_MASKmskw_MASKmskw_YMMi64_YMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, xmm, xmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpeq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
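+<!-- The 128-bit form has only two 64-bit lanes, so at most bits 0 and 1 of
+     the mask can be set. A sketch assuming AVX512F and AVX512VL; the name
+     and inputs are illustrative.
+
+     #include <immintrin.h>
+
+     __mmask8 demo_cmpeq_epi64(void) {
+         __m128i a = _mm_set_epi64x(10, -5);   /* lane 1 = 10, lane 0 = -5 */
+         __m128i b = _mm_set_epi64x(10,  0);
+         /* Only lane 1 matches, so the result should be 0x02. */
+         return _mm_cmpeq_epi64_mask(a, b);
+     }
+-->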
+<intrinsic tech="AVX-512" name="_mm_cmpge_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpgt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmple_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmplt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpneq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, xmm, xmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpeq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpge_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpgt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmple_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmplt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpneq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, xmm, xmm" xed="VPCMPQ_MASKmskw_MASKmskw_XMMi64_XMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmp_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, ymm, ymm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpeq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpge_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpgt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmple_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmplt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
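+<!-- The epu32 variants reinterpret the same lane bits as unsigned, which
+     changes the ordering predicates: 0xFFFFFFFF is -1 when signed but the
+     largest value when unsigned. A sketch assuming AVX512F and AVX512VL;
+     the name is illustrative.
+
+     #include <immintrin.h>
+
+     void demo_signedness(__mmask8 *ks, __mmask8 *ku) {
+         __m256i a = _mm256_set1_epi32(-1);     /* every lane is 0xFFFFFFFF */
+         __m256i b = _mm256_setzero_si256();
+         *ks = _mm256_cmplt_epi32_mask(a, b);   /* signed: -1 < 0, so 0xFF */
+         *ku = _mm256_cmplt_epu32_mask(a, b);   /* unsigned: never, so 0x00 */
+     }
+-->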
+<intrinsic tech="AVX-512" name="_mm256_cmpneq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, ymm, ymm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpeq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpge_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpgt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmple_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmplt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpneq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, ymm, ymm" xed="VPCMPUD_MASKmskw_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
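+<!-- A minimal C sketch of chaining the masked compares above: the zeromask "k1"
+     argument lets a second compare act only on lanes that passed a first one,
+     fusing an unsigned range test into two compares with no separate mask AND.
+     Assumes an AVX512F+AVX512VL target (e.g. gcc/clang with -mavx512f -mavx512vl);
+     names and values are illustrative.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m256i v  = _mm256_set_epi32(90, 80, 70, 60, 50, 40, 30, 20);
+         __m256i lo = _mm256_set1_epi32(30);
+         __m256i hi = _mm256_set1_epi32(80);
+         __mmask8 ge = _mm256_cmpge_epu32_mask(v, lo);            /* v >= lo        */
+         __mmask8 in = _mm256_mask_cmplt_epu32_mask(ge, v, hi);   /* ... and v < hi */
+         printf("0x%02x\n", (unsigned)in);                        /* prints 0x3e    */
+         return 0;
+     }
+-->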
+<intrinsic tech="AVX-512" name="_mm_cmp_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, xmm, xmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
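+<!-- The fixed-predicate compare intrinsics in this family are shorthands for the
+     imm8 form above: imm8 selects one of the eight _MM_CMPINT_* predicates and
+     must be a compile-time constant. A minimal sketch, assuming an
+     AVX512F+AVX512VL target; the function name is illustrative.
+
+     #include <immintrin.h>
+
+     __mmask8 le_mask(__m128i a, __m128i b) {
+         /* identical to _mm_cmple_epu32_mask(a, b) */
+         return _mm_cmp_epu32_mask(a, b, _MM_CMPINT_LE);
+     }
+-->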
+<intrinsic tech="AVX-512" name="_mm_cmpeq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpge_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpgt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmple_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmplt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpneq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, xmm, xmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpeq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpge_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpgt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmple_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmplt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpneq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, xmm, xmm" xed="VPCMPUD_MASKmskw_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
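+<!-- Because a __mmask8 is an ordinary 8-bit integer, the masked compares above
+     combine directly with scalar bit operations. A minimal sketch that counts
+     matching lanes, assuming an AVX512F+AVX512VL target (POPCNT is available
+     on any such CPU); names are illustrative.
+
+     #include <immintrin.h>
+
+     /* Number of 32-bit lanes where a == b, ignoring lanes cleared in "valid". */
+     unsigned count_eq(__mmask8 valid, __m128i a, __m128i b) {
+         __mmask8 k = _mm_mask_cmpeq_epu32_mask(valid, a, b);
+         return (unsigned)_mm_popcnt_u32((unsigned)k);
+     }
+-->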
+<intrinsic tech="AVX-512" name="_mm256_cmp_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, ymm, ymm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpeq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpge_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpgt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmple_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmplt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cmpneq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
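+<!-- The epu64 predicates are unsigned, which matters for all-ones bit patterns:
+     a lane holding 0xFFFFFFFFFFFFFFFF compares as the largest value here, while
+     the epi64 (signed) family would treat it as -1. A minimal sketch, assuming
+     an AVX512F+AVX512VL target; names are illustrative.
+
+     #include <immintrin.h>
+
+     __mmask8 demo(void) {
+         __m256i a = _mm256_set1_epi64x(-1);      /* all-ones in every lane */
+         __m256i b = _mm256_set1_epi64x(1);
+         return _mm256_cmpgt_epu64_mask(a, b);    /* 0x0f: unsigned max > 1 */
+     }
+-->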
+<intrinsic tech="AVX-512" name="_mm256_mask_cmp_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, ymm, ymm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpeq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpge_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpgt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmple_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmplt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cmpneq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, ymm, ymm" xed="VPCMPUQ_MASKmskw_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, xmm, xmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpeq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpge_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpgt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmple_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmplt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmpneq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, xmm, xmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpeq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpge_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpgt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmple_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmplt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmpneq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, xmm, xmm" xed="VPCMPUQ_MASKmskw_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
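+<!-- The mask produced by any of the compares above can drive ordinary control
+     flow with no extract step. A minimal sketch, assuming an AVX512F+AVX512VL
+     target; the function name is illustrative.
+
+     #include <immintrin.h>
+
+     int any_lane_differs(__m128i a, __m128i b) {
+         return _mm_cmpneq_epu64_mask(a, b) != 0;   /* only bits 0-1 can be set */
+     }
+-->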
+<intrinsic tech="AVX-512" name="_mm256_mask_compress_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCOMPRESSD" form="ymm {k}, ymm" xed="VPCOMPRESSD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compressstoreu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 32
+m := base_addr
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ MEM[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSD" form="m256 {k}, ymm" xed="VPCOMPRESSD_MEMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
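+<!-- Compare masks and VPCOMPRESSD together give branch-free stream compaction.
+     A minimal filter-loop sketch, assuming an AVX512F+AVX512VL target, n a
+     multiple of 8, and at least n writable elements at "out"; names are
+     illustrative.
+
+     #include <immintrin.h>
+     #include <stddef.h>
+
+     /* Copy the values of src[0..n) that are >= limit into out, packed,
+        returning how many were kept. */
+     size_t keep_ge(unsigned *out, const unsigned *src, size_t n, unsigned limit) {
+         unsigned *p = out;
+         __m256i lim = _mm256_set1_epi32((int)limit);
+         for (size_t i = 0; i < n; i += 8) {
+             __m256i v  = _mm256_loadu_si256((const __m256i *)(src + i));
+             __mmask8 k = _mm256_cmpge_epu32_mask(v, lim);
+             _mm256_mask_compressstoreu_epi32(p, k, v);   /* contiguous store of kept lanes */
+             p += _mm_popcnt_u32((unsigned)k);            /* advance by the kept count */
+         }
+         return (size_t)(p - out);
+     }
+-->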
+<intrinsic tech="AVX-512" name="_mm256_maskz_compress_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCOMPRESSD" form="ymm {z}, ymm" xed="VPCOMPRESSD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compress_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCOMPRESSD" form="xmm {k}, xmm" xed="VPCOMPRESSD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compressstoreu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 32
+m := base_addr
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ MEM[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSD" form="m128 {k}, xmm" xed="VPCOMPRESSD_MEMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_compress_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCOMPRESSD" form="xmm {z}, xmm" xed="VPCOMPRESSD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compress_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="ymm {k}, ymm" xed="VPCOMPRESSQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compressstoreu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 64
+m := base_addr
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ MEM[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="m256 {k}, ymm" xed="VPCOMPRESSQ_MEMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_compress_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="ymm {z}, ymm" xed="VPCOMPRESSQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compress_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="xmm {k}, xmm" xed="VPCOMPRESSQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compressstoreu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 64
+m := base_addr
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ MEM[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="m128 {k}, xmm" xed="VPCOMPRESSQ_MEMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_compress_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="xmm {z}, xmm" xed="VPCOMPRESSQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
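+<!-- The maskz_compress forms do the same left-packing entirely in registers:
+     kept lanes move toward lane 0 and the tail is zeroed, with no memory
+     traffic. A one-line sketch, assuming an AVX512F+AVX512VL target; names
+     are illustrative.
+
+     #include <immintrin.h>
+
+     __m256i pack_low_epi64(__mmask8 keep, __m256i a) {
+         return _mm256_maskz_compress_epi64(keep, a);
+     }
+-->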
+<intrinsic tech="AVX-512" name="_mm256_mask_permutexvar_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ id := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMD" form="ymm {k}, ymm, ymm" xed="VPERMD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutexvar_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ id := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMD" form="ymm {z}, ymm, ymm" xed="VPERMD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutexvar_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ id := idx[i+2:i]*32
+ dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMD" form="ymm, ymm, ymm" xed="VPERMD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
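+<!-- VPERMD reads, for each destination lane, the source lane named by the low
+     three bits of the matching idx element, so an index vector of 7..0 reverses
+     the register. A minimal sketch, assuming an AVX512F+AVX512VL target; note
+     that "idx" is the first operand of _mm256_permutexvar_epi32.
+
+     #include <immintrin.h>
+
+     __m256i reverse_epi32(__m256i a) {
+         const __m256i idx = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+         return _mm256_permutexvar_epi32(idx, a);
+     }
+-->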
+<intrinsic tech="AVX-512" name="_mm256_mask2_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ off := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := idx[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="ymm {k}, ymm, ymm" xed="VPERMI2D_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ off := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMT2D" form="ymm {k}, ymm, ymm" xed="VPERMT2D_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ off := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="ymm {z}, ymm, ymm" xed="VPERMI2D_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <instruction name="VPERMT2D" form="ymm {z}, ymm, ymm" xed="VPERMT2D_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
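+<!--
+A hedged sketch of zeromasking (values assumed for illustration): lanes
+with a clear k bit become 0, instead of keeping idx or "a" as in the
+writemask forms above.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256i a   = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    __m256i b   = _mm256_setr_epi32(9, 10, 11, 12, 13, 14, 15, 16);
+    __m256i idx = _mm256_setr_epi32(0, 15, 1, 14, 2, 13, 3, 12);
+    __mmask8 k  = 0xAA;                  /* odd lanes only */
+    __m256i dst = _mm256_maskz_permutex2var_epi32(k, a, idx, b);
+
+    int out[8];
+    _mm256_storeu_si256((__m256i *)out, dst);
+    for (int j = 0; j < 8; j++)
+        printf("%d ", out[j]);           /* prints: 0 16 0 15 0 14 0 13 */
+    return 0;
+}
+-->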
+<intrinsic tech="AVX-512" name="_mm256_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ off := idx[i+2:i]*32
+ dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="ymm, ymm, ymm" xed="VPERMI2D_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <instruction name="VPERMT2D" form="ymm, ymm, ymm" xed="VPERMT2D_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
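+<!--
+A minimal sketch of the unmasked form (sample data assumed): (a, b) act
+as one 16-entry lookup table, with idx bit 3 choosing between them, per
+the pseudocode above.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256i a   = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    __m256i b   = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+    __m256i idx = _mm256_setr_epi32(15, 13, 11, 9, 7, 5, 3, 1);
+    __m256i dst = _mm256_permutex2var_epi32(a, idx, b);
+
+    int out[8];
+    _mm256_storeu_si256((__m256i *)out, dst);
+    for (int j = 0; j < 8; j++)
+        printf("%d ", out[j]);           /* prints: 15 13 11 9 7 5 3 1 */
+    return 0;
+}
+-->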
+<intrinsic tech="AVX-512" name="_mm_mask2_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="idx" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ off := idx[i+1:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := idx[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="xmm {k}, xmm, xmm" xed="VPERMI2D_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ off := idx[i+1:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMT2D" form="xmm {k}, xmm, xmm" xed="VPERMT2D_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="idx" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ off := idx[i+1:i]*32
+ IF k[j]
+ dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="xmm {z}, xmm, xmm" xed="VPERMI2D_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <instruction name="VPERMT2D" form="xmm {z}, xmm, xmm" xed="VPERMT2D_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="idx" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ off := idx[i+1:i]*32
+ dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="xmm, xmm, xmm" xed="VPERMI2D_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <instruction name="VPERMT2D" form="xmm, xmm, xmm" xed="VPERMT2D_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask2_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ off := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := idx[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="ymm {k}, ymm, ymm" xed="VPERMI2PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ off := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMT2PD" form="ymm {k}, ymm, ymm" xed="VPERMT2PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ off := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="ymm {z}, ymm, ymm" xed="VPERMI2PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VPERMT2PD" form="ymm {z}, ymm, ymm" xed="VPERMT2PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ off := idx[i+1:i]*64
+ dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="ymm, ymm, ymm" xed="VPERMI2PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <instruction name="VPERMT2PD" form="ymm, ymm, ymm" xed="VPERMT2PD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask2_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128i" varname="idx" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ off := idx[i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := idx[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="xmm {k}, xmm, xmm" xed="VPERMI2PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ off := idx[i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMT2PD" form="xmm {k}, xmm, xmm" xed="VPERMT2PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128i" varname="idx" etype="UI64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ off := idx[i]*64
+ IF k[j]
+ dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="xmm {z}, xmm, xmm" xed="VPERMI2PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VPERMT2PD" form="xmm {z}, xmm, xmm" xed="VPERMT2PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128i" varname="idx" etype="UI64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ off := idx[i]*64
+ dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="xmm, xmm, xmm" xed="VPERMI2PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VPERMT2PD" form="xmm, xmm, xmm" xed="VPERMT2PD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
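+<!--
+A small sketch for the 128-bit double form (values assumed): each 64-bit
+idx element uses bit 0 as the lane index and bit 1 as the a/b selector.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128d a   = _mm_setr_pd(1.0, 2.0);
+    __m128d b   = _mm_setr_pd(3.0, 4.0);
+    __m128i idx = _mm_set_epi64x(0, 3); /* element 0 = 3 (b[1]), element 1 = 0 (a[0]) */
+    __m128d dst = _mm_permutex2var_pd(a, idx, b);
+
+    double out[2];
+    _mm_storeu_pd(out, dst);
+    printf("%g %g\n", out[0], out[1]);  /* prints: 4 1 */
+    return 0;
+}
+-->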
+<intrinsic tech="AVX-512" name="_mm256_mask2_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ off := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := idx[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="ymm {k}, ymm, ymm" xed="VPERMI2PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ off := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMT2PS" form="ymm {k}, ymm, ymm" xed="VPERMT2PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ off := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="ymm {z}, ymm, ymm" xed="VPERMI2PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VPERMT2PS" form="ymm {z}, ymm, ymm" xed="VPERMT2PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ off := idx[i+2:i]*32
+ dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="ymm, ymm, ymm" xed="VPERMI2PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <instruction name="VPERMT2PS" form="ymm, ymm, ymm" xed="VPERMT2PS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask2_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128i" varname="idx" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ off := idx[i+1:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := idx[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="xmm {k}, xmm, xmm" xed="VPERMI2PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ off := idx[i+1:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMT2PS" form="xmm {k}, xmm, xmm" xed="VPERMT2PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128i" varname="idx" etype="UI32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ off := idx[i+1:i]*32
+ IF k[j]
+ dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="xmm {z}, xmm, xmm" xed="VPERMI2PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VPERMT2PS" form="xmm {z}, xmm, xmm" xed="VPERMT2PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128i" varname="idx" etype="UI32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ off := idx[i+1:i]*32
+ dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="xmm, xmm, xmm" xed="VPERMI2PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VPERMT2PS" form="xmm, xmm, xmm" xed="VPERMT2PS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
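+<!--
+A sketch for the 128-bit float form (values assumed): idx bits [1:0] pick
+the lane and bit 2 picks "a" versus "b".
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128  a   = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
+    __m128  b   = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
+    __m128i idx = _mm_setr_epi32(6, 2, 7, 3);   /* b[2], a[2], b[3], a[3] */
+    __m128  dst = _mm_permutex2var_ps(a, idx, b);
+
+    float out[4];
+    _mm_storeu_ps(out, dst);
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 6 2 7 3 */
+    return 0;
+}
+-->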
+<intrinsic tech="AVX-512" name="_mm256_mask2_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ off := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := idx[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="ymm {k}, ymm, ymm" xed="VPERMI2Q_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ off := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMT2Q" form="ymm {k}, ymm, ymm" xed="VPERMT2Q_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ off := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="ymm {z}, ymm, ymm" xed="VPERMI2Q_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <instruction name="VPERMT2Q" form="ymm {z}, ymm, ymm" xed="VPERMT2Q_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ off := idx[i+1:i]*64
+ dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="ymm, ymm, ymm" xed="VPERMI2Q_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <instruction name="VPERMT2Q" form="ymm, ymm, ymm" xed="VPERMT2Q_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask2_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="idx" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ off := idx[i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := idx[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="xmm {k}, xmm, xmm" xed="VPERMI2Q_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ off := idx[i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMT2Q" form="xmm {k}, xmm, xmm" xed="VPERMT2Q_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="idx" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ off := idx[i]*64
+ IF k[j]
+ dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="xmm {z}, xmm, xmm" xed="VPERMI2Q_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <instruction name="VPERMT2Q" form="xmm {z}, xmm, xmm" xed="VPERMT2Q_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="idx" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ off := idx[i]*64
+ dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="xmm, xmm, xmm" xed="VPERMI2Q_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <instruction name="VPERMT2Q" form="xmm, xmm, xmm" xed="VPERMT2Q_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
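+<!--
+A sketch for the 64-bit integer form (values assumed): one call gathers
+arbitrary quadwords from two vectors.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i a   = _mm_set_epi64x(11, 10);   /* lanes: {10, 11} */
+    __m128i b   = _mm_set_epi64x(21, 20);   /* lanes: {20, 21} */
+    __m128i idx = _mm_set_epi64x(0, 3);     /* lane 0 reads b[1], lane 1 reads a[0] */
+    __m128i dst = _mm_permutex2var_epi64(a, idx, b);
+
+    long long out[2];
+    _mm_storeu_si128((__m128i *)out, dst);
+    printf("%lld %lld\n", out[0], out[1]);  /* prints: 21 10 */
+    return 0;
+}
+-->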
+<intrinsic tech="AVX-512" name="_mm256_mask_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI
+IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI
+IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI
+IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI
+IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="ymm {k}, ymm, imm8" xed="VPERMILPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI
+IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI
+IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI
+IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI
+IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="ymm {k}, ymm, ymm" xed="VPERMILPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI
+IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI
+IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI
+IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI
+IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="ymm {z}, ymm, imm8" xed="VPERMILPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI
+IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI
+IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI
+IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI
+IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="ymm {z}, ymm, ymm" xed="VPERMILPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="xmm {k}, xmm, imm8" xed="VPERMILPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="xmm {k}, xmm, xmm" xed="VPERMILPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="xmm {z}, xmm, imm8" xed="VPERMILPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="xmm {z}, xmm, xmm" xed="VPERMILPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
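+<!--
+A sketch of the VPERMILPD quirk visible in the pseudocode above (b[1],
+b[65], and so on): the selector is bit 1 of each 64-bit control element,
+so control value 2, not 1, picks the high lane. Sample values are assumed.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128d a   = _mm_setr_pd(1.5, 2.5);
+    __m128i ctl = _mm_set_epi64x(0, 2);      /* bit 1 set in element 0 only */
+    __m128d dst = _mm_maskz_permutevar_pd(0x3, a, ctl); /* both k bits set */
+
+    double out[2];
+    _mm_storeu_pd(out, dst);
+    printf("%g %g\n", out[0], out[1]);       /* prints: 2.5 1.5 */
+    return 0;
+}
+-->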
+<intrinsic tech="AVX-512" name="_mm256_mask_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="ymm {k}, ymm, imm8" xed="VPERMILPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
+tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
+tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
+tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="ymm {k}, ymm, ymm" xed="VPERMILPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="ymm {z}, ymm, imm8" xed="VPERMILPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
+tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
+tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
+tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="ymm {z}, ymm, ymm" xed="VPERMILPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="xmm {k}, xmm, imm8" xed="VPERMILPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="xmm {k}, xmm, xmm" xed="VPERMILPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="xmm {z}, xmm, imm8" xed="VPERMILPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="xmm {z}, xmm, xmm" xed="VPERMILPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
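+<!--
+A sketch of the in-lane float permute with a variable control (values
+assumed): bits [1:0] of each control element select the source lane, and
+the zeromask drops lane 3 to 0 here.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128  a   = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
+    __m128i ctl = _mm_setr_epi32(3, 2, 1, 0);           /* reverse the lanes */
+    __m128  dst = _mm_maskz_permutevar_ps(0x7, a, ctl); /* lane 3 masked off */
+
+    float out[4];
+    _mm_storeu_ps(out, dst);
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 40 30 20 0 */
+    return 0;
+}
+-->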
+<intrinsic tech="AVX-512" name="_mm256_mask_permutex_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPD" form="ymm {k}, ymm, imm8" xed="VPERMPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutexvar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ id := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := a[id+63:id]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPD" form="ymm {k}, ymm, ymm" xed="VPERMPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutex_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPD" form="ymm {z}, ymm, imm8" xed="VPERMPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutexvar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ id := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := a[id+63:id]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPD" form="ymm {z}, ymm, ymm" xed="VPERMPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutex_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPD" form="ymm, ymm, imm8" xed="VPERMPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
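+<!-- Usage sketch (illustrative): the four 2-bit fields of "imm8" pick source
+     lanes, so imm8 = _MM_SHUFFLE(0, 1, 2, 3) (0x1B) reverses the vector.
+
+       #include <immintrin.h>
+       __m256d v = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+       __m256d r = _mm256_permutex_pd(v, _MM_SHUFFLE(0, 1, 2, 3));
+       /* r = {4.0, 3.0, 2.0, 1.0} */
+-->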
+<intrinsic tech="AVX-512" name="_mm256_permutexvar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ id := idx[i+1:i]*64
+ dst[i+63:i] := a[id+63:id]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPD" form="ymm, ymm, ymm" xed="VPERMPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutexvar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ id := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPS" form="ymm {k}, ymm, ymm" xed="VPERMPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutexvar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ id := idx[i+2:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPS" form="ymm {z}, ymm, ymm" xed="VPERMPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutexvar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256i" varname="idx" etype="UI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ id := idx[i+2:i]*32
+ dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMPS" form="ymm, ymm, ymm" xed="VPERMPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
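+<!-- Usage sketch (illustrative): unlike the in-lane VPERMILPS forms above,
+     VPERMPS indexes across the full 256-bit register (3-bit indices). Note
+     that "idx" is the first argument.
+
+       #include <immintrin.h>
+       __m256  v   = _mm256_setr_ps(0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f);
+       __m256i idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+       __m256  r   = _mm256_permutexvar_ps(idx, v);
+       /* r = {7.f, 6.f, 5.f, 4.f, 3.f, 2.f, 1.f, 0.f} */
+-->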
+<intrinsic tech="AVX-512" name="_mm256_mask_permutex_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMQ" form="ymm {k}, ymm, imm8" xed="VPERMQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
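+<!-- Usage sketch (illustrative): with a writemask, inactive lanes are merged
+     from "src" rather than zeroed.
+
+       #include <immintrin.h>
+       __m256i src = _mm256_set1_epi64x(-1);
+       __m256i a   = _mm256_setr_epi64x(10, 20, 30, 40);
+       __m256i r   = _mm256_mask_permutex_epi64(src, 0x9, a, 0x1B); /* k = 0b1001 */
+       /* r = {40, -1, -1, 10}: lanes 1 and 2 keep the values from src */
+-->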
+<intrinsic tech="AVX-512" name="_mm256_mask_permutexvar_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ id := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := a[id+63:id]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMQ" form="ymm {k}, ymm, ymm" xed="VPERMQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutex_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMQ" form="ymm {z}, ymm, imm8" xed="VPERMQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutexvar_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ id := idx[i+1:i]*64
+ IF k[j]
+ dst[i+63:i] := a[id+63:id]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMQ" form="ymm {z}, ymm, ymm" xed="VPERMQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutex_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMQ" form="ymm, ymm, imm8" xed="VPERMQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutexvar_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="idx" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ id := idx[i+1:i]*64
+ dst[i+63:i] := a[id+63:id]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMQ" form="ymm, ymm, ymm" xed="VPERMQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expand_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="ymm {k}, ymm" xed="VPEXPANDD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expandloadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="ymm {k}, m64" xed="VPEXPANDD_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expand_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="ymm {z}, ymm" xed="VPEXPANDD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
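+<!-- Usage sketch (illustrative): expand is the inverse of compress; it
+     places consecutive source elements into the positions selected by "k".
+
+       #include <immintrin.h>
+       __m256i a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+       __m256i r = _mm256_maskz_expand_epi32(0x55, a);  /* k = 0b01010101 */
+       /* r = {1, 0, 2, 0, 3, 0, 4, 0}: a[0..3] land in the even lanes */
+-->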
+<intrinsic tech="AVX-512" name="_mm256_maskz_expandloadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="ymm {z}, m64" xed="VPEXPANDD_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expand_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="xmm {k}, xmm" xed="VPEXPANDD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expandloadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="xmm {k}, m64" xed="VPEXPANDD_XMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expand_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="xmm {z}, xmm" xed="VPEXPANDD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expandloadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="xmm {z}, m64" xed="VPEXPANDD_XMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expand_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="ymm {k}, ymm" xed="VPEXPANDQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expandloadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="ymm {k}, m64" xed="VPEXPANDQ_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expand_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="ymm {z}, ymm" xed="VPEXPANDQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expandloadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="ymm {z}, m64" xed="VPEXPANDQ_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expand_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="xmm {k}, xmm" xed="VPEXPANDQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expandloadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="xmm {k}, m64" xed="VPEXPANDQ_XMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expand_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="xmm {z}, xmm" xed="VPEXPANDQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expandloadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="xmm {z}, m64" xed="VPEXPANDQ_XMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
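+<!-- Usage sketch (illustrative): per the operation above, an expanding load
+     reads only as many elements from memory as there are set bits in "k",
+     which makes it usable near the end of a buffer. Assuming a one-element
+     source:
+
+       #include <immintrin.h>
+       long long buf[1] = {42};
+       __m128i r = _mm_maskz_expandloadu_epi64(0x2, buf);  /* k = 0b10 */
+       /* reads a single 64-bit value; r = {0, 42} */
+-->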
+<intrinsic tech="AVX-512" name="_mm256_mmask_i32gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="ymm {k}, vm32y" xed="VPGATHERDD_YMMu32_MASKmskw_MEMu32_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
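+<!-- Usage sketch (illustrative): a masked gather of table[idx[j]] for the
+     active lanes; "scale" is the element size in bytes (4 here).
+
+       #include <immintrin.h>
+       int table[16] = {0};
+       __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+       __m256i r   = _mm256_mmask_i32gather_epi32(
+           _mm256_setzero_si256(), 0xFF, idx, table, 4);
+       /* r[j] = table[idx[j]] for every lane, since all mask bits are set */
+-->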
+<intrinsic tech="AVX-512" name="_mm_mmask_i32gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="xmm {k}, vm32x" xed="VPGATHERDD_XMMu32_MASKmskw_MEMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mmask_i32gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="ymm {k}, vm32x" xed="VPGATHERDQ_YMMu64_MASKmskw_MEMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mmask_i32gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="xmm {k}, vm32x" xed="VPGATHERDQ_XMMu64_MASKmskw_MEMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mmask_i64gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERQD" form="xmm {k}, vm64y" xed="VPGATHERQD_XMMu32_MASKmskw_MEMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mmask_i64gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPGATHERQD" form="xmm {k}, vm64x" xed="VPGATHERQD_XMMu32_MASKmskw_MEMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mmask_i64gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERQQ" form="ymm {k}, vm64y" xed="VPGATHERQQ_YMMu64_MASKmskw_MEMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mmask_i64gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPGATHERQQ" form="xmm {k}, vm64x" xed="VPGATHERQQ_XMMu64_MASKmskw_MEMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
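+<!-- Usage sketch (illustrative; the record layout here is hypothetical):
+     64-bit indices allow gathering one field out of an array of 16-byte
+     records, using scale 8 and pre-doubled indices.
+
+       #include <immintrin.h>
+       struct rec { long long key, val; } recs[8] = {0};
+       __m256i idx  = _mm256_setr_epi64x(0, 2, 4, 6);   /* 2 qwords per record */
+       __m256i keys = _mm256_mmask_i64gather_epi64(
+           _mm256_setzero_si256(), 0xF, idx, recs, 8);  /* keys of records 0..3 */
+-->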
+<intrinsic tech="AVX-512" name="_mm256_mask_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSD" form="ymm {k}, ymm, ymm" xed="VPMAXSD_YMMi32_MASKmskw_YMMi32_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSD" form="ymm {z}, ymm, ymm" xed="VPMAXSD_YMMi32_MASKmskw_YMMi32_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSD" form="xmm {k}, xmm, xmm" xed="VPMAXSD_XMMi32_MASKmskw_XMMi32_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSD" form="xmm {z}, xmm, xmm" xed="VPMAXSD_XMMi32_MASKmskw_XMMi32_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="ymm {k}, ymm, ymm" xed="VPMAXSQ_YMMi64_MASKmskw_YMMi64_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="ymm {z}, ymm, ymm" xed="VPMAXSQ_YMMi64_MASKmskw_YMMi64_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="ymm, ymm, ymm" xed="VPMAXSQ_YMMi64_MASKmskw_YMMi64_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="xmm {k}, xmm, xmm" xed="VPMAXSQ_XMMi64_MASKmskw_XMMi64_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="xmm {z}, xmm, xmm" xed="VPMAXSQ_XMMi64_MASKmskw_XMMi64_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="xmm, xmm, xmm" xed="VPMAXSQ_XMMi64_MASKmskw_XMMi64_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
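+<!-- Usage sketch (illustrative): packed signed 64-bit max has no SSE/AVX2
+     equivalent; VPMAXSQ (AVX512F with VL) provides it directly.
+
+       #include <immintrin.h>
+       __m128i a = _mm_set_epi64x(9, -5);   /* {-5, 9} */
+       __m128i b = _mm_set_epi64x(7,  3);   /* { 3, 7} */
+       __m128i r = _mm_max_epi64(a, b);     /* { 3, 9}, compared as signed */
+-->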
+<intrinsic tech="AVX-512" name="_mm256_mask_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUD" form="ymm {k}, ymm, ymm" xed="VPMAXUD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUD" form="ymm {z}, ymm, ymm" xed="VPMAXUD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUD" form="xmm {k}, xmm, xmm" xed="VPMAXUD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUD" form="xmm {z}, xmm, xmm" xed="VPMAXUD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="ymm {k}, ymm, ymm" xed="VPMAXUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="ymm {z}, ymm, ymm" xed="VPMAXUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="ymm, ymm, ymm" xed="VPMAXUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="xmm {k}, xmm, xmm" xed="VPMAXUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="xmm {z}, xmm, xmm" xed="VPMAXUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="xmm, xmm, xmm" xed="VPMAXUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
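+<!-- Editorial usage sketch, not part of the upstream Intel data: the three
+     max_epu64 forms above (plain, writemask, zeromask) all lower to VPMAXUQ.
+     A minimal C example with hypothetical inputs; it assumes a toolchain and
+     CPU with AVX512F and AVX512VL (e.g. GCC/Clang with -mavx512f -mavx512vl).
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         /* _mm256_set_epi64x lists lanes from high (3) down to low (0). */
+         __m256i a = _mm256_set_epi64x(-1LL, 10, 20, 30);  /* lane 3 = 0xFF..F */
+         __m256i b = _mm256_set_epi64x(5, 50, 15, 40);
+         /* Unsigned compare: lane 3 keeps 0xFF..F, not 5. */
+         __m256i plain  = _mm256_max_epu64(a, b);              /* {40,20,50,0xFF..F} */
+         /* k = 0x5 keeps lanes 0 and 2; the zeromask form zeroes the rest ... */
+         __m256i zeroed = _mm256_maskz_max_epu64(0x5, a, b);   /* {40,0,50,0} */
+         /* ... while the writemask form copies them from src (here a). */
+         __m256i merged = _mm256_mask_max_epu64(a, 0x5, a, b); /* {40,20,50,0xFF..F} */
+         unsigned long long out[4];
+         _mm256_storeu_si256((__m256i *)out, plain);
+         printf("%llu %llu %llu %llu\n", out[0], out[1], out[2], out[3]);
+         (void)zeroed; (void)merged;
+         return 0;
+     }
+-->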
+<intrinsic tech="AVX-512" name="_mm256_mask_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSD" form="ymm {k}, ymm, ymm" xed="VPMINSD_YMMi32_MASKmskw_YMMi32_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSD" form="ymm {z}, ymm, ymm" xed="VPMINSD_YMMi32_MASKmskw_YMMi32_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSD" form="xmm {k}, xmm, xmm" xed="VPMINSD_XMMi32_MASKmskw_XMMi32_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSD" form="xmm {z}, xmm, xmm" xed="VPMINSD_XMMi32_MASKmskw_XMMi32_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
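+<!-- Editorial usage sketch, not part of the upstream Intel data: the unmasked
+     _mm_min_epi32 is plain SSE4.1; what AVX512VL adds is the per-lane masking
+     shown above. A minimal C sketch (the helper name and mask are illustrative):
+
+     #include <immintrin.h>
+
+     /* Clamp only lanes 0 and 1 against a limit; merge-masking leaves
+        lanes 2 and 3 untouched (copied from src, here v itself). */
+     static inline __m128i clamp_low_two(__m128i v, __m128i limit) {
+         return _mm_mask_min_epi32(v, 0x3, v, limit);
+     }
+-->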
+<intrinsic tech="AVX-512" name="_mm256_mask_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="ymm {k}, ymm, ymm" xed="VPMINSQ_YMMi64_MASKmskw_YMMi64_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="ymm {z}, ymm, ymm" xed="VPMINSQ_YMMi64_MASKmskw_YMMi64_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="ymm, ymm, ymm" xed="VPMINSQ_YMMi64_MASKmskw_YMMi64_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="xmm {k}, xmm, xmm" xed="VPMINSQ_XMMi64_MASKmskw_XMMi64_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="xmm {z}, xmm, xmm" xed="VPMINSQ_XMMi64_MASKmskw_XMMi64_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="xmm, xmm, xmm" xed="VPMINSQ_XMMi64_MASKmskw_XMMi64_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
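+<!-- Editorial usage sketch, not part of the upstream Intel data: packed signed
+     64-bit min has no SSE/AVX2 equivalent; VPMINSQ via these intrinsics is the
+     first direct form. A minimal C sketch under that assumption:
+
+     #include <immintrin.h>
+
+     /* Lane-wise signed 64-bit min; pre-AVX512 this needs a compare+blend. */
+     static inline __m256i min4_i64(__m256i a, __m256i b) {
+         return _mm256_min_epi64(a, b);
+     }
+-->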
+<intrinsic tech="AVX-512" name="_mm256_mask_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUD" form="ymm {k}, ymm, ymm" xed="VPMINUD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUD" form="ymm {z}, ymm, ymm" xed="VPMINUD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUD" form="xmm {k}, xmm, xmm" xed="VPMINUD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUD" form="xmm {z}, xmm, xmm" xed="VPMINUD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
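+<!-- Editorial usage sketch, not part of the upstream Intel data: a common use
+     of the zeromask form above is a loop tail, where a computed mask covers
+     only the n valid lanes. Hypothetical C sketch (names are illustrative):
+
+     #include <immintrin.h>
+
+     /* Unsigned 32-bit min over the low n lanes (n <= 4); the output lanes
+        at index n and above come back as 0. */
+     static inline __m128i min_tail(__m128i a, __m128i b, unsigned n) {
+         __mmask8 k = (__mmask8)((1u << n) - 1);
+         return _mm_maskz_min_epu32(k, a, b);
+     }
+-->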
+<intrinsic tech="AVX-512" name="_mm256_mask_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="ymm {k}, ymm, ymm" xed="VPMINUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="ymm {z}, ymm, ymm" xed="VPMINUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="ymm, ymm, ymm" xed="VPMINUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="xmm {k}, xmm, xmm" xed="VPMINUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="xmm {z}, xmm, xmm" xed="VPMINUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="xmm, xmm, xmm" xed="VPMINUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
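+<!-- Editorial usage sketch, not part of the upstream Intel data: as with
+     max_epu64 above, the compare here is unsigned, which matters for values
+     with the top bit set. Minimal C sketch:
+
+     #include <immintrin.h>
+
+     static inline __m128i min2_u64(__m128i a, __m128i b) {
+         /* 0xFF..F is the largest unsigned value, so it never wins a min. */
+         return _mm_min_epu64(a, b);
+     }
+-->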
+<intrinsic tech="AVX-512" name="_mm256_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := Truncate8(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm, ymm" xed="VPMOVDB_XMMu8_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm {k}, ymm" xed="VPMOVDB_XMMu8_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVDB" form="m64 {k}, ymm" xed="VPMOVDB_MEMu8_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm {z}, ymm" xed="VPMOVDB_XMMu8_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := Truncate8(a[i+31:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm, xmm" xed="VPMOVDB_XMMu8_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm {k}, xmm" xed="VPMOVDB_XMMu8_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVDB" form="m64 {k}, xmm" xed="VPMOVDB_MEMu8_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm {z}, xmm" xed="VPMOVDB_XMMu8_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
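+<!-- Editorial usage sketch, not part of the upstream Intel data: VPMOVDB
+     narrows by truncation, keeping the low 8 bits of each lane (300, i.e.
+     0x12C, becomes 0x2C = 44); the saturating VPMOVSDB forms further down
+     clamp instead. Hypothetical C sketch of the masked-store variant:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     /* Write only the bytes whose mask bit is set; masked-off byte
+        positions in dst8 are left untouched in memory. */
+     static inline void narrow_store(uint8_t *dst8, __mmask8 live, __m256i v) {
+         _mm256_mask_cvtepi32_storeu_epi8(dst8, live, v);
+     }
+-->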
+<intrinsic tech="AVX-512" name="_mm256_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := Truncate16(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="xmm, ymm" xed="VPMOVDW_XMMu16_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="xmm {k}, ymm" xed="VPMOVDW_XMMu16_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVDW" form="m128 {k}, ymm" xed="VPMOVDW_MEMu16_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="xmm {z}, ymm" xed="VPMOVDW_XMMu16_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := Truncate16(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="xmm, xmm" xed="VPMOVDW_XMMu16_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="xmm {k}, xmm" xed="VPMOVDW_XMMu16_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVDW" form="m64 {k}, xmm" xed="VPMOVDW_MEMu16_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="xmm {z}, xmm" xed="VPMOVDW_XMMu16_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
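+<!-- Editorial usage sketch, not part of the upstream Intel data: the 256-bit
+     source case above fills the whole 128-bit result (8 x 16 bits), so it can
+     be stored directly. Minimal C sketch, illustrative names:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     static inline void narrow_to_u16(uint16_t out[8], __m256i v) {
+         _mm_storeu_si128((__m128i *)out, _mm256_cvtepi32_epi16(v));
+     }
+-->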
+<intrinsic tech="AVX-512" name="_mm256_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := Truncate8(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm, ymm" xed="VPMOVQB_XMMu8_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm {k}, ymm" xed="VPMOVQB_XMMu8_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQB" form="m32 {k}, ymm" xed="VPMOVQB_MEMu8_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm {z}, ymm" xed="VPMOVQB_XMMu8_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := Truncate8(a[i+63:i])
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm, xmm" xed="VPMOVQB_XMMu8_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm {k}, xmm" xed="VPMOVQB_XMMu8_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQB" form="m16 {k}, xmm" xed="VPMOVQB_MEMu8_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm {z}, xmm" xed="VPMOVQB_XMMu8_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
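+<!-- Editorial usage sketch, not part of the upstream Intel data: VPMOVQB from
+     a ymm source produces only four bytes, landing in the low 32 bits of the
+     result (the rest is zeroed per the pseudocode above), so extract just
+     that dword. Hypothetical C sketch:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <string.h>
+
+     static inline void narrow_q_to_b(uint8_t out[4], __m256i v) {
+         uint32_t lo = (uint32_t)_mm_cvtsi128_si32(_mm256_cvtepi64_epi8(v));
+         memcpy(out, &lo, 4);   /* four truncated bytes, lane 0 first */
+     }
+-->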
+<intrinsic tech="AVX-512" name="_mm256_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := Truncate32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="xmm, ymm" xed="VPMOVQD_XMMu32_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Truncate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="xmm {k}, ymm" xed="VPMOVQD_XMMu32_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQD" form="m128 {k}, ymm" xed="VPMOVQD_MEMu32_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Truncate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="xmm {z}, ymm" xed="VPMOVQD_XMMu32_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := Truncate32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="xmm, xmm" xed="VPMOVQD_XMMu32_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Truncate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="xmm {k}, xmm" xed="VPMOVQD_XMMu32_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQD" form="m64 {k}, xmm" xed="VPMOVQD_MEMu32_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Truncate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="xmm {z}, xmm" xed="VPMOVQD_XMMu32_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
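+<!-- Editorial usage sketch, not part of the upstream Intel data: a typical use
+     is narrowing 64-bit intermediate results that are known to fit in 32 bits,
+     e.g. array indices. Minimal C sketch, illustrative names:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     static inline void narrow_q_to_d(int32_t out[4], __m256i idx64) {
+         _mm_storeu_si128((__m128i *)out, _mm256_cvtepi64_epi32(idx64));
+     }
+-->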
+<intrinsic tech="AVX-512" name="_mm256_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := Truncate16(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm, ymm" xed="VPMOVQW_XMMu16_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm {k}, ymm" xed="VPMOVQW_XMMu16_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQW" form="m64 {k}, ymm" xed="VPMOVQW_MEMu16_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm {z}, ymm" xed="VPMOVQW_XMMu16_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := Truncate16(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm, xmm" xed="VPMOVQW_XMMu16_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm {k}, xmm" xed="VPMOVQW_XMMu16_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQW" form="m32 {k}, xmm" xed="VPMOVQW_MEMu16_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm {z}, xmm" xed="VPMOVQW_XMMu16_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
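+<!-- Editorial usage sketch, not part of the upstream Intel data: here the four
+     16-bit results occupy only the low 64 bits of the destination, so a scalar
+     extract suffices. Hypothetical C sketch (x86-64 only, where
+     _mm_cvtsi128_si64 is available):
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <string.h>
+
+     static inline void narrow_q_to_w(uint16_t out[4], __m256i v) {
+         uint64_t lo = (uint64_t)_mm_cvtsi128_si64(_mm256_cvtepi64_epi16(v));
+         memcpy(out, &lo, 8);   /* lanes 0..3, truncated to 16 bits each */
+     }
+-->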
+<intrinsic tech="AVX-512" name="_mm256_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := Saturate8(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm, ymm" xed="VPMOVSDB_XMMi8_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm {k}, ymm" xed="VPMOVSDB_XMMi8_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSDB" form="m64 {k}, ymm" xed="VPMOVSDB_MEMi8_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm {z}, ymm" xed="VPMOVSDB_XMMi8_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := Saturate8(a[i+31:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm, xmm" xed="VPMOVSDB_XMMi8_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm {k}, xmm" xed="VPMOVSDB_XMMi8_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSDB" form="m32 {k}, xmm" xed="VPMOVSDB_MEMi8_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm {z}, xmm" xed="VPMOVSDB_XMMi8_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
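+<!-- Editor's note (editorial addition): the three xmm forms above differ only in
+     how unselected lanes are handled. A small sketch contrasting the writemask
+     and zeromask variants, under the same toolchain assumptions as earlier notes:
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i a   = _mm_setr_epi32(1000, -1000, 7, 8);
+    __m128i src = _mm_set1_epi8(99);
+    __m128i m = _mm_mask_cvtsepi32_epi8(src, 0x5, a);  // lanes 1, 3 copied from src
+    __m128i z = _mm_maskz_cvtsepi32_epi8(0x5, a);      // lanes 1, 3 zeroed
+    signed char mb[16], zb[16];
+    _mm_storeu_si128((__m128i *)mb, m);
+    _mm_storeu_si128((__m128i *)zb, z);
+    for (int j = 0; j < 4; j++)
+        printf("%d/%d ", mb[j], zb[j]);  // prints: 127/127 99/0 7/7 99/0
+    printf("\n");
+    return 0;
+}
+-->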
+<intrinsic tech="AVX-512" name="_mm256_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := Saturate16(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="xmm, ymm" xed="VPMOVSDW_XMMi16_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="xmm {k}, ymm" xed="VPMOVSDW_XMMi16_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI16" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSDW" form="m128 {k}, ymm" xed="VPMOVSDW_MEMi16_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="xmm {z}, ymm" xed="VPMOVSDW_XMMi16_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := Saturate16(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="xmm, xmm" xed="VPMOVSDW_XMMi16_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="xmm {k}, xmm" xed="VPMOVSDW_XMMi16_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI16" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSDW" form="m64 {k}, xmm" xed="VPMOVSDW_MEMi16_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="xmm {z}, xmm" xed="VPMOVSDW_XMMi16_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
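+<!-- Editor's note (editorial addition): the VPMOVSDW family above is the 16-bit
+     analogue of VPMOVSDB. A minimal sketch of the unmasked ymm form; the clamped
+     values follow from Saturate16.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    // 70000 is above INT16_MAX and -70000 is below INT16_MIN.
+    __m256i a = _mm256_setr_epi32(70000, -70000, 1, 2, 3, 4, 5, 6);
+    __m128i r = _mm256_cvtsepi32_epi16(a);  // fills all 128 bits of dst
+    short out[8];
+    _mm_storeu_si128((__m128i *)out, r);
+    for (int j = 0; j < 8; j++)
+        printf("%d ", out[j]);  // prints: 32767 -32768 1 2 3 4 5 6
+    printf("\n");
+    return 0;
+}
+-->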
+<intrinsic tech="AVX-512" name="_mm256_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := Saturate8(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm, ymm" xed="VPMOVSQB_XMMi8_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm {k}, ymm" xed="VPMOVSQB_XMMi8_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQB" form="m32 {k}, ymm" xed="VPMOVSQB_MEMi8_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm {z}, ymm" xed="VPMOVSQB_XMMi8_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := Saturate8(a[i+63:i])
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm, xmm" xed="VPMOVSQB_XMMi8_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm {k}, xmm" xed="VPMOVSQB_XMMi8_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQB" form="m16 {k}, xmm" xed="VPMOVSQB_MEMi8_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm {z}, xmm" xed="VPMOVSQB_XMMi8_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
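+<!-- Editor's note (editorial addition): the xmm VPMOVSQB form above produces only
+     two result bytes, hence dst[MAX:16] := 0 in the pseudocode. A minimal sketch:
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    // _mm_set_epi64x takes the high element first, so element 0 is 777.
+    __m128i a = _mm_set_epi64x(-123456789LL, 777LL);
+    __m128i r = _mm_cvtsepi64_epi8(a);
+    signed char out[16];
+    _mm_storeu_si128((__m128i *)out, r);
+    printf("%d %d\n", out[0], out[1]);  // prints: 127 -128
+    return 0;
+}
+-->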
+<intrinsic tech="AVX-512" name="_mm256_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := Saturate32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="xmm, ymm" xed="VPMOVSQD_XMMi32_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Saturate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="xmm {k}, ymm" xed="VPMOVSQD_XMMi32_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQD" form="m128 {k}, ymm" xed="VPMOVSQD_MEMi32_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Saturate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="xmm {z}, ymm" xed="VPMOVSQD_XMMi32_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := Saturate32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="xmm, xmm" xed="VPMOVSQD_XMMi32_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Saturate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="xmm {k}, xmm" xed="VPMOVSQD_XMMi32_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI32" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQD" form="m64 {k}, xmm" xed="VPMOVSQD_MEMi32_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Saturate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="xmm {z}, xmm" xed="VPMOVSQD_XMMi32_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
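+<!-- Editor's note (editorial addition): VPMOVSQD narrows four signed 64-bit lanes
+     to four 32-bit lanes in one instruction. A minimal sketch of the unmasked
+     ymm form, under the same toolchain assumptions as earlier notes:
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256i a = _mm256_setr_epi64x(1LL << 40, -(1LL << 40), 42, -42);
+    __m128i r = _mm256_cvtsepi64_epi32(a);
+    int out[4];
+    _mm_storeu_si128((__m128i *)out, r);
+    for (int j = 0; j < 4; j++)
+        printf("%d ", out[j]);  // prints: 2147483647 -2147483648 42 -42
+    printf("\n");
+    return 0;
+}
+-->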
+<intrinsic tech="AVX-512" name="_mm256_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := Saturate16(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm, ymm" xed="VPMOVSQW_XMMi16_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="src" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm {k}, ymm" xed="VPMOVSQW_XMMi16_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtsepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI16" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQW" form="m64 {k}, ymm" xed="VPMOVSQW_MEMi16_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm {z}, ymm" xed="VPMOVSQW_XMMi16_MASKmskw_YMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := Saturate16(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm, xmm" xed="VPMOVSQW_XMMi16_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="src" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm {k}, xmm" xed="VPMOVSQW_XMMi16_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI16" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQW" form="m32 {k}, xmm" xed="VPMOVSQW_MEMi16_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm {z}, xmm" xed="VPMOVSQW_XMMi16_MASKmskw_XMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
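+<!-- Editor's note (editorial addition): a sketch of the masked-store member of
+     the VPMOVSQW family above; lanes whose mask bit is clear leave memory
+     unmodified, matching the FOR/IF pseudocode.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256i a = _mm256_setr_epi64x(100000, 1, -100000, 2);
+    short buf[4] = {9, 9, 9, 9};
+    _mm256_mask_cvtsepi64_storeu_epi16(buf, 0x5, a);  // store lanes 0 and 2
+    for (int j = 0; j < 4; j++)
+        printf("%d ", buf[j]);  // prints: 32767 9 -32768 9
+    printf("\n");
+    return 0;
+}
+-->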
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBD" form="ymm {k}, xmm" xed="VPMOVSXBD_YMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBD" form="ymm {z}, xmm" xed="VPMOVSXBD_YMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXBD" form="xmm {k}, xmm" xed="VPMOVSXBD_XMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXBD" form="xmm {z}, xmm" xed="VPMOVSXBD_XMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
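+<!-- Editor's note (editorial addition): the entries above switch from saturating
+     narrowing to masked sign extension (VPMOVSXBD). A minimal sketch of the
+     zeromask ymm form:
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    // Only the low 8 bytes of a are consumed; the upper 8 are ignored.
+    __m128i a = _mm_setr_epi8(-1, -2, -3, -4, -5, -6, -7, -8,
+                              0, 0, 0, 0, 0, 0, 0, 0);
+    __m256i r = _mm256_maskz_cvtepi8_epi32(0xF0, a);  // keep lanes 4..7
+    int out[8];
+    _mm256_storeu_si256((__m256i *)out, r);
+    for (int j = 0; j < 8; j++)
+        printf("%d ", out[j]);  // prints: 0 0 0 0 -5 -6 -7 -8
+    printf("\n");
+    return 0;
+}
+-->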
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__m256i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBQ" form="ymm {k}, xmm" xed="VPMOVSXBQ_YMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXBQ" form="ymm {z}, xmm" xed="VPMOVSXBQ_YMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXBQ" form="xmm {k}, xmm" xed="VPMOVSXBQ_XMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXBQ" form="xmm {z}, xmm" xed="VPMOVSXBQ_XMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
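+<!-- Editor's note (editorial addition): VPMOVSXBQ reads only as many input bytes
+     as there are output lanes (two, for the xmm form above). A writemask sketch:
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i a   = _mm_setr_epi8(-100, 100, 0, 0, 0, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0);
+    __m128i src = _mm_set1_epi64x(-1);
+    __m128i r = _mm_mask_cvtepi8_epi64(src, 0x1, a);  // lane 1 copied from src
+    long long out[2];
+    _mm_storeu_si128((__m128i *)out, r);
+    printf("%lld %lld\n", out[0], out[1]);  // prints: -100 -1
+    return 0;
+}
+-->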
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXDQ" form="ymm {k}, xmm" xed="VPMOVSXDQ_YMMi64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXDQ" form="ymm {z}, xmm" xed="VPMOVSXDQ_YMMi64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXDQ" form="xmm {k}, xmm" xed="VPMOVSXDQ_XMMi64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXDQ" form="xmm {z}, xmm" xed="VPMOVSXDQ_XMMi64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
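+<!-- Editor's note (editorial addition): a minimal zeromask sketch for the
+     VPMOVSXDQ xmm form above:
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i a = _mm_setr_epi32(-7, 7, 0, 0);  // only the low two lanes matter
+    __m128i r = _mm_maskz_cvtepi32_epi64(0x3, a);
+    long long out[2];
+    _mm_storeu_si128((__m128i *)out, r);
+    printf("%lld %lld\n", out[0], out[1]);  // prints: -7 7
+    return 0;
+}
+-->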
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXWD" form="ymm {k}, xmm" xed="VPMOVSXWD_YMMi32_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXWD" form="ymm {z}, xmm" xed="VPMOVSXWD_YMMi32_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXWD" form="xmm {k}, xmm" xed="VPMOVSXWD_XMMi32_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXWD" form="xmm {z}, xmm" xed="VPMOVSXWD_XMMi32_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
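+<!-- Editor's note (editorial addition): a writemask sketch for the VPMOVSXWD ymm
+     form above; unselected lanes come from src rather than being zeroed.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i a   = _mm_setr_epi16(-30000, 30000, -1, 1, 0, 0, 0, 0);
+    __m256i src = _mm256_set1_epi32(55);
+    __m256i r = _mm256_mask_cvtepi16_epi32(src, 0x0F, a);  // lanes 4..7 from src
+    int out[8];
+    _mm256_storeu_si256((__m256i *)out, r);
+    for (int j = 0; j < 8; j++)
+        printf("%d ", out[j]);  // prints: -30000 30000 -1 1 55 55 55 55
+    printf("\n");
+    return 0;
+}
+-->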
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__m256i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXWQ" form="ymm {k}, xmm" xed="VPMOVSXWQ_YMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSXWQ" form="ymm {z}, xmm" xed="VPMOVSXWQ_YMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXWQ" form="xmm {k}, xmm" xed="VPMOVSXWQ_XMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSXWQ" form="xmm {z}, xmm" xed="VPMOVSXWQ_XMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
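+<!-- Editor's note (editorial addition): for the VPMOVSXWQ xmm form only the low
+     4 bytes of "a" (two 16-bit values) feed the conversion. A zeromask sketch:
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i a = _mm_setr_epi16(-2, 3, 0, 0, 0, 0, 0, 0);
+    __m128i r = _mm_maskz_cvtepi16_epi64(0x3, a);
+    long long out[2];
+    _mm_storeu_si128((__m128i *)out, r);
+    printf("%lld %lld\n", out[0], out[1]);  // prints: -2 3
+    return 0;
+}
+-->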
+<intrinsic tech="AVX-512" name="_mm256_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := SaturateU8(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm, ymm" xed="VPMOVUSDB_XMMu8_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm {k}, ymm" xed="VPMOVUSDB_XMMu8_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSDB" form="m64 {k}, ymm" xed="VPMOVUSDB_MEMu8_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm {z}, ymm" xed="VPMOVUSDB_XMMu8_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
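+<!-- Editor's note (editorial addition): VPMOVUSDB uses SaturateU8, which clamps
+     to 0..255 rather than the signed range used by VPMOVSDB earlier. A minimal
+     sketch showing the difference:
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    // 300 clamps to 255 here; the signed variant would clamp it to 127.
+    __m256i a = _mm256_setr_epi32(0, 1, 255, 256, 300, 1000000, 42, 128);
+    __m128i r = _mm256_cvtusepi32_epi8(a);
+    unsigned char out[16];
+    _mm_storeu_si128((__m128i *)out, r);
+    for (int j = 0; j < 8; j++)
+        printf("%u ", out[j]);  // prints: 0 1 255 255 255 255 42 128
+    printf("\n");
+    return 0;
+}
+-->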
+<intrinsic tech="AVX-512" name="_mm_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := SaturateU8(a[i+31:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm, xmm" xed="VPMOVUSDB_XMMu8_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm {k}, xmm" xed="VPMOVUSDB_XMMu8_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSDB" form="m32 {k}, xmm" xed="VPMOVUSDB_MEMu8_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm {z}, xmm" xed="VPMOVUSDB_XMMu8_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
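+<!-- Usage sketch (illustrative; not part of the generated Intel data). Shows the
+     _mm_cvtusepi32_epi8 family above; values are made up, and the code assumes a
+     compiler with AVX512F and AVX512VL enabled (e.g. gcc/clang with
+     -mavx512f -mavx512vl).
+
+     #include <immintrin.h>
+
+     __m128i a = _mm_set_epi32(300, 255, 7, 0);     // lanes 3..0; 300 saturates to 255
+     __m128i r = _mm_cvtusepi32_epi8(a);            // low 4 bytes: 0, 7, 255, 255
+
+     // Writemask: unselected lanes come from src; zeromask: unselected lanes are 0.
+     __m128i src = _mm_set1_epi8(0x55);
+     __m128i rk  = _mm_mask_cvtusepi32_epi8(src, 0x5, a);  // lanes 0 and 2 converted
+     __m128i rz  = _mm_maskz_cvtusepi32_epi8(0x5, a);
+-->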
+<intrinsic tech="AVX-512" name="_mm256_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := SaturateU16(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="xmm, ymm" xed="VPMOVUSDW_XMMu16_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="xmm {k}, ymm" xed="VPMOVUSDW_XMMu16_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSDW" form="m128 {k}, ymm" xed="VPMOVUSDW_MEMu16_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="xmm {z}, ymm" xed="VPMOVUSDW_XMMu16_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := SaturateU16(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="xmm, xmm" xed="VPMOVUSDW_XMMu16_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="xmm {k}, xmm" xed="VPMOVUSDW_XMMu16_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSDW" form="m64 {k}, xmm" xed="VPMOVUSDW_MEMu16_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="xmm {z}, xmm" xed="VPMOVUSDW_XMMu16_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
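+<!-- Usage sketch (illustrative; not part of the generated Intel data) for the
+     cvtusepi32_epi16 group above, including the masked store form, which writes
+     only the selected 16-bit lanes and leaves the other bytes of memory untouched.
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     __m128i  a = _mm_set_epi32(70000, 65535, 1, 0); // 70000 saturates to 65535
+     __m128i  r = _mm_cvtusepi32_epi16(a);           // low 4 words: 0, 1, 65535, 65535
+
+     uint16_t buf[4] = {0};
+     _mm_mask_cvtusepi32_storeu_epi16(buf, 0x3, a);  // writes buf[0] and buf[1] only
+-->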
+<intrinsic tech="AVX-512" name="_mm256_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := SaturateU8(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm, ymm" xed="VPMOVUSQB_XMMu8_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm {k}, ymm" xed="VPMOVUSQB_XMMu8_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQB" form="m32 {k}, ymm" xed="VPMOVUSQB_MEMu8_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm {z}, ymm" xed="VPMOVUSQB_XMMu8_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := SaturateU8(a[i+63:i])
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm, xmm" xed="VPMOVUSQB_XMMu8_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm {k}, xmm" xed="VPMOVUSQB_XMMu8_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQB" form="m16 {k}, xmm" xed="VPMOVUSQB_MEMu8_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm {z}, xmm" xed="VPMOVUSQB_XMMu8_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
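+<!-- Usage sketch (illustrative; not part of the generated Intel data) for the
+     cvtusepi64_epi8 group above. A 128-bit source holds only two 64-bit lanes, so
+     just the low two result bytes are meaningful and dst[MAX:16] is zeroed.
+
+     #include <immintrin.h>
+
+     __m128i a = _mm_set_epi64x(1000, 42);   // lane 1 = 1000, lane 0 = 42
+     __m128i r = _mm_cvtusepi64_epi8(a);     // byte 0 = 42, byte 1 = 255 (saturated)
+-->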
+<intrinsic tech="AVX-512" name="_mm256_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := SaturateU32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="xmm, ymm" xed="VPMOVUSQD_XMMu32_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := SaturateU32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="xmm {k}, ymm" xed="VPMOVUSQD_XMMu32_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQD" form="m128 {k}, ymm" xed="VPMOVUSQD_MEMu32_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := SaturateU32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="xmm {z}, ymm" xed="VPMOVUSQD_XMMu32_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := SaturateU32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="xmm, xmm" xed="VPMOVUSQD_XMMu32_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := SaturateU32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="xmm {k}, xmm" xed="VPMOVUSQD_XMMu32_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQD" form="m64 {k}, xmm" xed="VPMOVUSQD_MEMu32_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := SaturateU32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="xmm {z}, xmm" xed="VPMOVUSQD_XMMu32_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
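+<!-- Usage sketch (illustrative; not part of the generated Intel data) for the
+     cvtusepi64_epi32 group above, with a writemask variant.
+
+     #include <immintrin.h>
+
+     __m256i a   = _mm256_set_epi64x(1, 2, 0x100000000LL, 3);
+     __m128i r   = _mm256_cvtusepi64_epi32(a);  // dwords: 3, 0xFFFFFFFF, 2, 1
+
+     __m128i src = _mm_set1_epi32(-1);
+     __m128i rk  = _mm256_mask_cvtusepi64_epi32(src, 0x9, a); // lanes 0 and 3 converted
+-->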
+<intrinsic tech="AVX-512" name="_mm256_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := SaturateU16(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm, ymm" xed="VPMOVUSQW_XMMu16_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm {k}, ymm" xed="VPMOVUSQW_XMMu16_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtusepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQW" form="m64 {k}, ymm" xed="VPMOVUSQW_MEMu16_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm {z}, ymm" xed="VPMOVUSQW_XMMu16_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := SaturateU16(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm, xmm" xed="VPMOVUSQW_XMMu16_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm {k}, xmm" xed="VPMOVUSQW_XMMu16_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtusepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQW" form="m32 {k}, xmm" xed="VPMOVUSQW_MEMu16_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm {z}, xmm" xed="VPMOVUSQW_XMMu16_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
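+<!-- Usage sketch (illustrative; not part of the generated Intel data) for the
+     cvtusepi64_epi16 group above; the masked store writes only the selected words.
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     __m256i  a = _mm256_set_epi64x(4, 3, 70000, 1);
+     __m128i  r = _mm256_cvtusepi64_epi16(a);          // words: 1, 65535, 3, 4
+
+     uint16_t buf[4] = {0};
+     _mm256_mask_cvtusepi64_storeu_epi16(buf, 0x6, a); // writes buf[1] and buf[2]
+-->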
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBD" form="ymm {k}, xmm" xed="VPMOVZXBD_YMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBD" form="ymm {z}, xmm" xed="VPMOVZXBD_YMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXBD" form="xmm {k}, xmm" xed="VPMOVZXBD_XMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXBD" form="xmm {z}, xmm" xed="VPMOVZXBD_XMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBQ" form="ymm {k}, xmm" xed="VPMOVZXBQ_YMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXBQ" form="ymm {z}, xmm" xed="VPMOVZXBQ_YMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXBQ" form="xmm {k}, xmm" xed="VPMOVZXBQ_XMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXBQ" form="xmm {z}, xmm" xed="VPMOVZXBQ_XMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
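+<!-- Usage sketch (illustrative; not part of the generated Intel data) for the
+     masked VPMOVZXB* zero-extension group above. The unmasked conversions are the
+     older SSE4.1/AVX2 intrinsics (_mm_cvtepu8_epi32, _mm256_cvtepu8_epi32, ...);
+     AVX512VL adds the write- and zero-masked forms listed here.
+
+     #include <immintrin.h>
+
+     __m128i a   = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 4,3,2,1);
+     __m256i src = _mm256_set1_epi32(-1);
+     __m256i rk  = _mm256_mask_cvtepu8_epi32(src, 0x0F, a); // lanes 0..3 widened, 4..7 = -1
+     __m256i rz  = _mm256_maskz_cvtepu8_epi32(0x0F, a);     // lanes 4..7 zeroed
+-->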
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXDQ" form="ymm {k}, xmm" xed="VPMOVZXDQ_YMMi64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXDQ" form="ymm {z}, xmm" xed="VPMOVZXDQ_YMMi64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXDQ" form="xmm {k}, xmm" xed="VPMOVZXDQ_XMMi64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXDQ" form="xmm {z}, xmm" xed="VPMOVZXDQ_XMMi64_MASKmskw_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
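+<!-- Usage sketch (illustrative; not part of the generated Intel data) for the
+     masked VPMOVZXDQ group above: widening unsigned 32-bit lanes to 64 bits
+     under a mask.
+
+     #include <immintrin.h>
+
+     __m128i a   = _mm_set_epi32(8, 7, 6, 5);
+     __m256i src = _mm256_setzero_si256();
+     __m256i rk  = _mm256_mask_cvtepu32_epi64(src, 0x5, a); // lanes 0,2 = 5,7; lanes 1,3 = 0
+-->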
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXWD" form="ymm {k}, xmm" xed="VPMOVZXWD_YMMi32_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXWD" form="ymm {z}, xmm" xed="VPMOVZXWD_YMMi32_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXWD" form="xmm {k}, xmm" xed="VPMOVZXWD_XMMi32_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXWD" form="xmm {z}, xmm" xed="VPMOVZXWD_XMMi32_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXWQ" form="ymm {k}, xmm" xed="VPMOVZXWQ_YMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVZXWQ" form="ymm {z}, xmm" xed="VPMOVZXWQ_YMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXWQ" form="xmm {k}, xmm" xed="VPMOVZXWQ_XMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVZXWQ" form="xmm {z}, xmm" xed="VPMOVZXWQ_XMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
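+<!-- Usage sketch (illustrative; not part of the generated Intel data) for the
+     masked VPMOVZXW* group above.
+
+     #include <immintrin.h>
+
+     __m128i a  = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+     __m128i rz = _mm_maskz_cvtepu16_epi32(0x3, a); // dwords: 1, 2, 0, 0
+     __m128i rq = _mm_maskz_cvtepu16_epi64(0x1, a); // qwords: 1, 0
+-->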
+<intrinsic tech="AVX-512" name="_mm256_mask_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__m256i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULDQ" form="ymm {k}, ymm, ymm" xed="VPMULDQ_YMMi64_MASKmskw_YMMi32_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULDQ" form="ymm {z}, ymm, ymm" xed="VPMULDQ_YMMi64_MASKmskw_YMMi32_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULDQ" form="xmm {k}, xmm, xmm" xed="VPMULDQ_XMMi64_MASKmskw_XMMi32_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULDQ" form="xmm {z}, xmm, xmm" xed="VPMULDQ_XMMi64_MASKmskw_XMMi32_XMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
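+<!-- Usage sketch (illustrative; not part of the generated Intel data) for the
+     masked VPMULDQ group above. Only the low (even-indexed) 32-bit half of each
+     64-bit lane participates; the products are full signed 64-bit results.
+
+     #include <immintrin.h>
+
+     __m128i a = _mm_set_epi32(99, -3, 99, 2);   // odd dwords are ignored
+     __m128i b = _mm_set_epi32(99,  4, 99, 5);
+     __m128i r = _mm_maskz_mul_epi32(0x3, a, b); // qwords: 2*5 = 10, -3*4 = -12
+-->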
+<intrinsic tech="AVX-512" name="_mm256_mask_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLD" form="ymm {k}, ymm, ymm" xed="VPMULLD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULLD" form="ymm {z}, ymm, ymm" xed="VPMULLD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLD" form="zmm {z}, zmm, zmm" xed="VPMULLD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULLD" form="xmm {k}, xmm, xmm" xed="VPMULLD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULLD" form="xmm {z}, xmm, xmm" xed="VPMULLD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
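+<!--
+  Usage sketch (illustrative only; assumes an AVX512F+AVX512VL CPU and a compiler
+  invocation such as "gcc -O2 -mavx512f -mavx512vl", plus the usual set/store
+  helpers from immintrin.h). It exercises the zeromask low multiply specified
+  above: active lanes keep the low 32 bits of the 64-bit product, inactive lanes
+  are zeroed.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m256i a = _mm256_set1_epi32(100000);
+      __m256i b = _mm256_set1_epi32(100000);
+      __mmask8 k = 0x0F;                 /* lanes 0..3 active, lanes 4..7 zeroed */
+      __m256i r = _mm256_maskz_mullo_epi32(k, a, b);
+      int out[8];
+      _mm256_storeu_si256((__m256i *) out, r);
+      for (int j = 0; j < 8; j++)
+          printf("%d ", out[j]);         /* 1410065408 four times, then 0 four
+                                            times: 10^10 mod 2^32 = 1410065408 */
+      return 0;
+  }
+-->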
+<intrinsic tech="AVX-512" name="_mm256_mask_mul_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULUDQ" form="ymm {k}, ymm, ymm" xed="VPMULUDQ_YMMu64_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_mul_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULUDQ" form="ymm {z}, ymm, ymm" xed="VPMULUDQ_YMMu64_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mul_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULUDQ" form="xmm {k}, xmm, xmm" xed="VPMULUDQ_XMMu64_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mul_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULUDQ" form="xmm {z}, xmm, xmm" xed="VPMULUDQ_XMMu64_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
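+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL). Unlike the "mullo"
+  family, "mul_epu32" reads only the low unsigned dword of each 64-bit lane and
+  writes the full 64-bit product; here the unselected lanes are merged from "src".
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m256i a   = _mm256_set1_epi64x(0xFFFFFFFFLL);   /* low dword = 2^32 - 1 */
+      __m256i b   = _mm256_set1_epi64x(0xFFFFFFFFLL);
+      __m256i src = _mm256_set1_epi64x(7);
+      __mmask8 k  = 0x5;                 /* 64-bit lanes 0 and 2 active */
+      __m256i r = _mm256_mask_mul_epu32(src, k, a, b);
+      unsigned long long out[4];
+      _mm256_storeu_si256((__m256i *) out, r);
+      for (int j = 0; j < 4; j++)
+          printf("%llu ", out[j]);       /* 18446744065119617025 7 ... since
+                                            (2^32 - 1)^2 = 2^64 - 2^33 + 1 */
+      return 0;
+  }
+-->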
+<intrinsic tech="AVX-512" name="_mm256_mask_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPORD" form="ymm {k}, ymm, ymm" xed="VPORD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPORD" form="ymm {z}, ymm, ymm" xed="VPORD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPORD" form="xmm {k}, xmm, xmm" xed="VPORD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPORD" form="xmm {z}, xmm, xmm" xed="VPORD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
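+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL): the merge behaviour
+  of the writemask OR above. Active lanes receive a OR b, inactive lanes pass
+  "src" through unchanged.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128i a   = _mm_set1_epi32(0x0F0F0F0F);
+      __m128i b   = _mm_set1_epi32(0x30303030);
+      __m128i src = _mm_set1_epi32(-1);
+      __mmask8 k  = 0x9;                 /* lanes 0 and 3 active */
+      __m128i r = _mm_mask_or_epi32(src, k, a, b);
+      unsigned out[4];
+      _mm_storeu_si128((__m128i *) out, r);
+      for (int j = 0; j < 4; j++)
+          printf("%08x ", out[j]);       /* 3f3f3f3f ffffffff ffffffff 3f3f3f3f */
+      return 0;
+  }
+-->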
+<intrinsic tech="AVX-512" name="_mm256_mask_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPORQ" form="ymm {k}, ymm, ymm" xed="VPORQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPORQ" form="ymm {z}, ymm, ymm" xed="VPORQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPORQ" form="xmm {k}, xmm, xmm" xed="VPORQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPORQ" form="xmm {z}, xmm, xmm" xed="VPORQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
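+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL). With 64-bit lanes
+  the OR itself is bitwise identical to the dword form; only the mask granularity
+  changes (four ymm lanes here), which is why the encoding is VPORQ rather than
+  VPORD.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m256i a = _mm256_set1_epi64x(0x00FF00FF00FF00FFLL);
+      __m256i b = _mm256_set1_epi64x(0x0F0F0F0F0F0F0F0FLL);
+      __mmask8 k = 0x6;                  /* lanes 1 and 2 active, 0 and 3 zeroed */
+      __m256i r = _mm256_maskz_or_epi64(k, a, b);
+      unsigned long long out[4];
+      _mm256_storeu_si256((__m256i *) out, r);
+      for (int j = 0; j < 4; j++)
+          printf("%016llx ", out[j]);    /* 0000000000000000 0fff0fff0fff0fff
+                                            0fff0fff0fff0fff 0000000000000000 */
+      return 0;
+  }
+-->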
+<intrinsic tech="AVX-512" name="_mm256_mask_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLD" form="ymm {k}, ymm, imm8" xed="VPROLD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLD" form="ymm {z}, ymm, imm8" xed="VPROLD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLD" form="ymm, ymm, imm8" xed="VPROLD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLD" form="xmm {k}, xmm, imm8" xed="VPROLD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLD" form="xmm {z}, xmm, imm8" xed="VPROLD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLD" form="xmm, xmm, imm8" xed="VPROLD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
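+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL). "imm8" must be a
+  compile-time constant; the rotate count is taken modulo 32, exactly as in the
+  LEFT_ROTATE_DWORDS pseudocode above.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128i a = _mm_set1_epi32((int) 0x80000001);
+      __m128i r = _mm_rol_epi32(a, 4);   /* 0x80000001 rol 4 = 0x00000018 */
+      unsigned out[4];
+      _mm_storeu_si128((__m128i *) out, r);
+      printf("%08x\n", out[0]);          /* prints 00000018 */
+      return 0;
+  }
+-->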
+<intrinsic tech="AVX-512" name="_mm256_mask_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLQ" form="ymm {k}, ymm, imm8" xed="VPROLQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLQ" form="ymm {z}, ymm, imm8" xed="VPROLQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLQ" form="ymm, ymm, imm8" xed="VPROLQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLQ" form="xmm {k}, xmm, imm8" xed="VPROLQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLQ" form="xmm {z}, xmm, imm8" xed="VPROLQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLQ" form="xmm, xmm, imm8" xed="VPROLQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
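+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL): the writemask form
+  of the 64-bit immediate rotate. Lanes 0 and 1 are rotated, lanes 2 and 3 are
+  copied from "src".
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m256i a   = _mm256_set1_epi64x(1);
+      __m256i src = _mm256_set1_epi64x(-1);
+      __mmask8 k  = 0x3;
+      __m256i r = _mm256_mask_rol_epi64(src, k, a, 63);
+      unsigned long long out[4];
+      _mm256_storeu_si256((__m256i *) out, r);
+      for (int j = 0; j < 4; j++)
+          printf("%016llx ", out[j]);    /* 8000000000000000 twice, then
+                                            ffffffffffffffff twice */
+      return 0;
+  }
+-->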
+<intrinsic tech="AVX-512" name="_mm256_mask_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLVD" form="ymm {k}, ymm, ymm" xed="VPROLVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLVD" form="ymm {z}, ymm, ymm" xed="VPROLVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLVD" form="ymm, ymm, ymm" xed="VPROLVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLVD" form="xmm {k}, xmm, xmm" xed="VPROLVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLVD" form="xmm {z}, xmm, xmm" xed="VPROLVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLVD" form="xmm, xmm, xmm" xed="VPROLVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
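+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL). The "rolv" forms
+  take a per-lane count from "b"; counts are reduced modulo 32, so 33 behaves
+  as 1.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128i a      = _mm_set1_epi32(1);
+      __m128i counts = _mm_setr_epi32(0, 1, 8, 33);
+      __m128i r = _mm_rolv_epi32(a, counts);
+      unsigned out[4];
+      _mm_storeu_si128((__m128i *) out, r);
+      for (int j = 0; j < 4; j++)
+          printf("%u ", out[j]);         /* 1 2 256 2 */
+      return 0;
+  }
+-->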
+<intrinsic tech="AVX-512" name="_mm256_mask_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="ymm {k}, ymm, ymm" xed="VPROLVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="ymm {z}, ymm, ymm" xed="VPROLVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="ymm, ymm, ymm" xed="VPROLVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="xmm {k}, xmm, xmm" xed="VPROLVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="xmm {z}, xmm, xmm" xed="VPROLVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="xmm, xmm, xmm" xed="VPROLVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
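+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL): the 64-bit variable
+  rotate, with counts reduced modulo 64 (so 64 behaves as 0).
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m256i a      = _mm256_set1_epi64x(3);
+      __m256i counts = _mm256_setr_epi64x(0, 1, 62, 64);
+      __m256i r = _mm256_rolv_epi64(a, counts);
+      unsigned long long out[4];
+      _mm256_storeu_si256((__m256i *) out, r);
+      for (int j = 0; j < 4; j++)
+          printf("%016llx ", out[j]);    /* 0000000000000003 0000000000000006
+                                            c000000000000000 0000000000000003 */
+      return 0;
+  }
+-->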
+<intrinsic tech="AVX-512" name="_mm256_mask_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORD" form="ymm {k}, ymm, imm8" xed="VPRORD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORD" form="ymm {z}, ymm, imm8" xed="VPRORD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORD" form="ymm, ymm, imm8" xed="VPRORD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORD" form="xmm {k}, xmm, imm8" xed="VPRORD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORD" form="xmm {z}, xmm, imm8" xed="VPRORD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORD" form="xmm, xmm, imm8" xed="VPRORD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
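+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL). A right rotate by n
+  is the same bit permutation as a left rotate by 32 - n, which the pair below
+  confirms.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128i a  = _mm_set1_epi32(0x12345678);
+      __m128i r1 = _mm_ror_epi32(a, 8);
+      __m128i r2 = _mm_rol_epi32(a, 24);
+      unsigned o1[4], o2[4];
+      _mm_storeu_si128((__m128i *) o1, r1);
+      _mm_storeu_si128((__m128i *) o2, r2);
+      printf("%08x %08x\n", o1[0], o2[0]);   /* 78123456 78123456 */
+      return 0;
+  }
+-->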
+<intrinsic tech="AVX-512" name="_mm256_mask_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORQ" form="ymm {k}, ymm, imm8" xed="VPRORQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORQ" form="ymm {z}, ymm, imm8" xed="VPRORQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORQ" form="ymm, ymm, imm8" xed="VPRORQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORQ" form="xmm {k}, xmm, imm8" xed="VPRORQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORQ" form="xmm {z}, xmm, imm8" xed="VPRORQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORQ" form="xmm, xmm, imm8" xed="VPRORQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
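+<!--
+  Usage sketch (illustrative only; assumes AVX512F+AVX512VL): the zeromask form
+  of the 64-bit right rotate. Lane 0 is rotated; lane 1 is zeroed.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128i a  = _mm_set1_epi64x(0x0123456789ABCDEFLL);
+      __mmask8 k = 0x1;
+      __m128i r  = _mm_maskz_ror_epi64(k, a, 4);
+      unsigned long long out[2];
+      _mm_storeu_si128((__m128i *) out, r);
+      printf("%016llx %016llx\n", out[0], out[1]);
+      /* prints f0123456789abcde 0000000000000000 */
+      return 0;
+  }
+-->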
+<intrinsic tech="AVX-512" name="_mm256_mask_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORVD" form="ymm {k}, ymm, ymm" xed="VPRORVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORVD" form="ymm {z}, ymm, ymm" xed="VPRORVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORVD" form="ymm, ymm, ymm" xed="VPRORVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORVD" form="xmm {k}, xmm, xmm" xed="VPRORVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORVD" form="xmm {z}, xmm, xmm" xed="VPRORVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORVD" form="xmm, xmm, xmm" xed="VPRORVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
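+<!-- A minimal C usage sketch for the rorv_epi32 family above, assuming a target with
+     AVX512F and AVX512VL enabled and the immintrin.h header; the variable names and
+     values are illustrative only.
+         __m256i data   = _mm256_set1_epi32(0x12345678);
+         __m256i counts = _mm256_set1_epi32(8);
+         __m256i r      = _mm256_rorv_epi32(data, counts);  /* each lane: 0x78123456 */
+     Because the count is reduced modulo 32, a per-lane count of 40 behaves like 8. -->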
+<intrinsic tech="AVX-512" name="_mm256_mask_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="ymm {k}, ymm, ymm" xed="VPRORVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="ymm {z}, ymm, ymm" xed="VPRORVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="ymm, ymm, ymm" xed="VPRORVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="xmm {k}, xmm, xmm" xed="VPRORVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="xmm {z}, xmm, xmm" xed="VPRORVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="xmm, xmm, xmm" xed="VPRORVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
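+<!-- A minimal C sketch of the masked 64-bit rotate entries above, under the same
+     AVX512F+AVX512VL assumption; names are hypothetical.
+         __m128i src = _mm_set1_epi64x(0);
+         __m128i a   = _mm_set1_epi64x(0x1122334455667788LL);
+         __m128i b   = _mm_set1_epi64x(16);
+         __m128i r   = _mm_mask_rorv_epi64(src, 0x1, a, b);
+     Only element 0 is rotated (to 0x7788112233445566); element 1 is copied from src. -->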
+<intrinsic tech="AVX-512" name="_mm256_i32scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDD" form="vm32y, ymm" xed="VPSCATTERDD_MEMu32_MASKmskw_YMMu32_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_i32scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDD" form="vm32y {k}, ymm" xed="VPSCATTERDD_MEMu32_MASKmskw_YMMu32_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_i32scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDD" form="vm32x, xmm" xed="VPSCATTERDD_MEMu32_MASKmskw_XMMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_i32scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDD" form="vm32x {k}, xmm" xed="VPSCATTERDD_MEMu32_MASKmskw_XMMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
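+<!-- A minimal C sketch of the 32-bit scatter entries above (AVX512F+AVX512VL assumed;
+     buffer and index values are hypothetical). "scale" must be a compile-time 1, 2, 4,
+     or 8; here scale=4 matches the 4-byte element size of the int array.
+         int buf[16] = {0};
+         __m128i idx = _mm_setr_epi32(3, 0, 9, 12);
+         __m128i val = _mm_setr_epi32(30, 10, 90, 120);
+         _mm_i32scatter_epi32(buf, idx, val, 4);
+     Afterwards buf[3]==30, buf[0]==10, buf[9]==90, and buf[12]==120. -->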
+<intrinsic tech="AVX-512" name="_mm256_i32scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="vm32x, ymm" xed="VPSCATTERDQ_MEMu64_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_i32scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="vm32x {k}, ymm" xed="VPSCATTERDQ_MEMu64_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_i32scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="vm32x, xmm" xed="VPSCATTERDQ_MEMu64_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_i32scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="vm32x {k}, xmm" xed="VPSCATTERDQ_MEMu64_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
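+<!-- A short C sketch of the 64-bit-data / 32-bit-index scatter above, under the same
+     AVX512F+AVX512VL assumption; values are illustrative. Note that vindex is a
+     __m128i holding four 32-bit indices even though the data vector is 256 bits wide.
+         long long buf[8] = {0};
+         __m128i idx = _mm_setr_epi32(1, 3, 5, 7);
+         __m256i val = _mm256_set1_epi64x(42);
+         _mm256_i32scatter_epi64(buf, idx, val, 8);
+     This writes 42 to buf[1], buf[3], buf[5], and buf[7]. -->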
+<intrinsic tech="AVX-512" name="_mm256_i64scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQD" form="vm64y, xmm" xed="VPSCATTERQD_MEMu32_MASKmskw_XMMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_i64scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQD" form="vm64y {k}, xmm" xed="VPSCATTERQD_MEMu32_MASKmskw_XMMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_i64scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQD" form="vm64x, xmm" xed="VPSCATTERQD_MEMu32_MASKmskw_XMMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_i64scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQD" form="vm64x {k}, xmm" xed="VPSCATTERQD_MEMu32_MASKmskw_XMMu32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
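+<!-- A small C sketch of the 32-bit-data / 64-bit-index scatter above (AVX512F+AVX512VL
+     assumed; names hypothetical). With only two 64-bit indices in vindex, just the low
+     two 32-bit lanes of the data vector participate.
+         int buf[8] = {0};
+         __m128i idx = _mm_set_epi64x(6, 2);
+         __m128i val = _mm_setr_epi32(20, 60, 0, 0);
+         _mm_i64scatter_epi32(buf, idx, val, 4);
+     Afterwards buf[2]==20 and buf[6]==60; the upper lanes of "val" are ignored. -->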
+<intrinsic tech="AVX-512" name="_mm256_i64scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQQ" form="vm64y, ymm" xed="VPSCATTERQQ_MEMu64_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_i64scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQQ" form="vm64y {k}, ymm" xed="VPSCATTERQQ_MEMu64_MASKmskw_YMMu64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_i64scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQQ" form="vm64x, xmm" xed="VPSCATTERQQ_MEMu64_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_i64scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQQ" form="vm64x {k}, xmm" xed="VPSCATTERQQ_MEMu64_MASKmskw_XMMu64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
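+<!-- A masked-scatter C sketch for the entries above, assuming AVX512F+AVX512VL;
+     buffer contents are illustrative. Lanes whose mask bit is clear perform no store
+     at all, so the corresponding memory is left untouched.
+         long long buf[4] = {-1, -1, -1, -1};
+         __m256i idx = _mm256_setr_epi64x(0, 1, 2, 3);
+         __m256i val = _mm256_set1_epi64x(7);
+         _mm256_mask_i64scatter_epi64(buf, 0x5, idx, val, 8);
+     Mask 0x5 selects lanes 0 and 2, so buf becomes {7, -1, 7, -1}. -->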
+<intrinsic tech="AVX-512" name="_mm256_mask_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFD" form="ymm {k}, ymm, imm8" xed="VPSHUFD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHUFD" form="ymm {z}, ymm, imm8" xed="VPSHUFD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHUFD" form="xmm {k}, xmm, imm8" xed="VPSHUFD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHUFD" form="xmm {z}, xmm, imm8" xed="VPSHUFD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
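+<!-- A brief C sketch of the masked VPSHUFD shuffle above (AVX512F+AVX512VL assumed;
+     inputs are hypothetical). In the _MM_PERM_ENUM names, letters A through D select
+     source elements 0 through 3, written from the high selector down, so _MM_PERM_ABCD
+     (0x1B) reverses a 128-bit lane.
+         __m128i a = _mm_setr_epi32(10, 11, 12, 13);
+         __m128i r = _mm_maskz_shuffle_epi32(0xF, a, _MM_PERM_ABCD);
+     With a full mask this yields {13, 12, 11, 10}; a partial mask zeroes the rest. -->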
+<intrinsic tech="AVX-512" name="_mm256_mask_sll_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLD" form="ymm {k}, ymm, xmm" xed="VPSLLD_YMMu32_MASKmskw_YMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_slli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLD" form="ymm {k}, ymm, imm8" xed="VPSLLD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sll_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLD" form="ymm {z}, ymm, xmm" xed="VPSLLD_YMMu32_MASKmskw_YMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_slli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLD" form="ymm {z}, ymm, imm8" xed="VPSLLD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sll_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLD" form="xmm {k}, xmm, xmm" xed="VPSLLD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_slli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLD" form="xmm {k}, xmm, imm8" xed="VPSLLD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sll_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLD" form="xmm {z}, xmm, xmm" xed="VPSLLD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_slli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLD" form="xmm {z}, xmm, imm8" xed="VPSLLD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
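+<!-- A compact C sketch contrasting the two 32-bit left-shift forms above
+     (AVX512F+AVX512VL assumed; values illustrative). "sll" takes one count from the
+     low 64 bits of an __m128i and applies it to every lane; "slli" takes an
+     immediate. Counts above 31 zero the result instead of wrapping.
+         __m256i a  = _mm256_set1_epi32(1);
+         __m256i r1 = _mm256_mask_slli_epi32(a, 0xFF, a, 4);
+         __m256i r2 = _mm256_mask_sll_epi32(a, 0xFF, a, _mm_cvtsi32_si128(40));
+     r1 has 16 in every lane; r2 is all zeros because the count 40 exceeds 31. -->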
+<intrinsic tech="AVX-512" name="_mm256_mask_sll_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="ymm {k}, ymm, xmm" xed="VPSLLQ_YMMu64_MASKmskw_YMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_slli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="ymm {k}, ymm, imm8" xed="VPSLLQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sll_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="ymm {z}, ymm, xmm" xed="VPSLLQ_YMMu64_MASKmskw_YMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_slli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="ymm {z}, ymm, imm8" xed="VPSLLQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sll_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="xmm {k}, xmm, xmm" xed="VPSLLQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_slli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="xmm {k}, xmm, imm8" xed="VPSLLQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sll_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="xmm {z}, xmm, xmm" xed="VPSLLQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_slli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="xmm {z}, xmm, imm8" xed="VPSLLQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
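+<!-- A one-line C sketch of the zero-masked 64-bit shift above (AVX512F+AVX512VL
+     assumed; inputs hypothetical). The maskz variants zero unselected lanes rather
+     than preserving a src operand.
+         __m128i a = _mm_set1_epi64x(3);
+         __m128i r = _mm_maskz_slli_epi64(0x1, a, 2);
+     Lane 0 becomes 12; lane 1 is zeroed because its mask bit is clear. -->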
+<intrinsic tech="AVX-512" name="_mm256_mask_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="ymm {k}, ymm, ymm" xed="VPSLLVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="ymm {z}, ymm, ymm" xed="VPSLLVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="xmm {k}, xmm, xmm" xed="VPSLLVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="xmm {z}, xmm, xmm" xed="VPSLLVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
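+<!-- A short C sketch of the per-lane variable shift above (AVX512F+AVX512VL assumed;
+     values illustrative). Unlike sll/slli, sllv reads an independent count from each
+     lane of "count", and any lane count of 32 or more produces 0 in that lane.
+         __m128i a   = _mm_set1_epi32(1);
+         __m128i cnt = _mm_setr_epi32(0, 1, 2, 35);
+         __m128i r   = _mm_maskz_sllv_epi32(0xF, a, cnt);
+     Result lanes: {1, 2, 4, 0}. -->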
+<intrinsic tech="AVX-512" name="_mm256_mask_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="ymm {k}, ymm, ymm" xed="VPSLLVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="ymm {z}, ymm, ymm" xed="VPSLLVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="xmm {k}, xmm, xmm" xed="VPSLLVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="xmm {z}, xmm, xmm" xed="VPSLLVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
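+<!-- Editor's note: a minimal usage sketch for the masked variable left
+     shifts above. The intrinsic and its signature come from this data
+     set; the demo values, helper name, and toolchain assumption (a C
+     compiler with AVX512F and AVX512VL enabled) are illustrative only.
+
+#include <immintrin.h>
+
+/* Writemask form: lanes with a clear mask bit keep the value from src;
+   a per-lane count of 64 or more zeroes that lane. */
+static inline __m128i demo_mask_sllv_epi64(void) {
+    __m128i a     = _mm_set1_epi64x(3);
+    __m128i count = _mm_set_epi64x(64, 2);   /* lane0 = 2, lane1 = 64 */
+    __m128i src   = _mm_set1_epi64x(7);
+    /* k = 0x3: both lanes selected; result = {3 << 2, 0} = {12, 0} */
+    return _mm_mask_sllv_epi64(src, 0x3, a, count);
+}
+-->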
+<intrinsic tech="AVX-512" name="_mm256_mask_sra_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAD" form="ymm {k}, ymm, xmm" xed="VPSRAD_YMMu32_MASKmskw_YMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_srai_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAD" form="ymm {k}, ymm, imm8" xed="VPSRAD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sra_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAD" form="ymm {z}, ymm, xmm" xed="VPSRAD_YMMu32_MASKmskw_YMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srai_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAD" form="ymm {z}, ymm, imm8" xed="VPSRAD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sra_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAD" form="xmm {k}, xmm, xmm" xed="VPSRAD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srai_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAD" form="xmm {k}, xmm, imm8" xed="VPSRAD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sra_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAD" form="xmm {z}, xmm, xmm" xed="VPSRAD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srai_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="6"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAD" form="xmm {z}, xmm, imm8" xed="VPSRAD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
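+<!-- Editor's note: a usage sketch for the masked 32-bit arithmetic right
+     shifts above; the demo values and helper name are assumptions, the
+     intrinsic signature is as documented. Arithmetic shifts replicate
+     the sign bit, and a count above 31 floods each lane with it.
+
+#include <immintrin.h>
+
+static inline __m256i demo_mask_srai_epi32(void) {
+    __m256i a   = _mm256_set1_epi32(-64);
+    __m256i src = _mm256_setzero_si256();
+    /* k = 0x0F: lanes 0..3 become -64 >> 3 = -8 (sign bits shifted in),
+       lanes 4..7 are copied from src and stay 0. */
+    return _mm256_mask_srai_epi32(src, 0x0F, a, 3);
+}
+-->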
+<intrinsic tech="AVX-512" name="_mm256_mask_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="ymm {k}, ymm, xmm" xed="VPSRAQ_YMMu64_MASKmskw_YMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="ymm {k}, ymm, imm8" xed="VPSRAQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="ymm {z}, ymm, xmm" xed="VPSRAQ_YMMu64_MASKmskw_YMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="ymm {z}, ymm, imm8" xed="VPSRAQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="ymm, ymm, xmm" xed="VPSRAQ_YMMu64_MASKmskw_YMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="ymm, ymm, imm8" xed="VPSRAQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="xmm {k}, xmm, xmm" xed="VPSRAQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="xmm {k}, xmm, imm8" xed="VPSRAQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="xmm {z}, xmm, xmm" xed="VPSRAQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="7"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="xmm {z}, xmm, imm8" xed="VPSRAQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="xmm, xmm, xmm" xed="VPSRAQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="7"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="xmm, xmm, imm8" xed="VPSRAQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
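+<!-- Editor's note: sketch for the 64-bit arithmetic right shifts above.
+     Unlike the 32-bit forms, these have no SSE2/AVX2 counterpart; VPSRAQ
+     is new with AVX-512. Demo values are illustrative assumptions.
+
+#include <immintrin.h>
+
+static inline __m128i demo_srai_epi64(void) {
+    __m128i a = _mm_set_epi64x(-16, 16);   /* lane0 = 16, lane1 = -16 */
+    /* result = {16 >> 2, -16 >> 2} = {4, -4}; the sign is preserved */
+    return _mm_srai_epi64(a, 2);
+}
+-->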
+<intrinsic tech="AVX-512" name="_mm256_mask_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="ymm {k}, ymm, ymm" xed="VPSRAVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="ymm {z}, ymm, ymm" xed="VPSRAVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="xmm {k}, xmm, xmm" xed="VPSRAVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="xmm {z}, xmm, xmm" xed="VPSRAVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
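+<!-- Editor's note: sketch for the per-lane arithmetic right shifts above
+     (zeromask form); demo values are assumptions. Note the interaction of
+     the two "off" paths: an out-of-range count floods a lane with the
+     sign bit, but a clear mask bit zeroes it regardless.
+
+#include <immintrin.h>
+
+static inline __m128i demo_maskz_srav_epi32(void) {
+    __m128i a     = _mm_set1_epi32(-32);
+    __m128i count = _mm_setr_epi32(1, 2, 3, 32);   /* lane3 out of range */
+    /* k = 0x7: result = {-16, -8, -4, 0}; lane3 is zeroed by the mask
+       before its sign-flooded value could be observed. */
+    return _mm_maskz_srav_epi32(0x7, a, count);
+}
+-->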
+<intrinsic tech="AVX-512" name="_mm256_mask_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="ymm {k}, ymm, ymm" xed="VPSRAVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="ymm {z}, ymm, ymm" xed="VPSRAVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="SI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="ymm, ymm, ymm" xed="VPSRAVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="xmm {k}, xmm, xmm" xed="VPSRAVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="xmm {z}, xmm, xmm" xed="VPSRAVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="xmm, xmm, xmm" xed="VPSRAVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
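+<!-- Editor's note: sketch for the unmasked per-lane 64-bit arithmetic
+     right shift above; demo values are assumptions.
+
+#include <immintrin.h>
+
+static inline __m256i demo_srav_epi64(void) {
+    __m256i a     = _mm256_set1_epi64x(-1024);
+    __m256i count = _mm256_setr_epi64x(0, 4, 10, 64);
+    /* result = {-1024, -64, -1, -1}: a count of 64 or more floods the
+       lane with the sign bit, here all ones. */
+    return _mm256_srav_epi64(a, count);
+}
+-->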
+<intrinsic tech="AVX-512" name="_mm256_mask_srl_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLD" form="ymm {k}, ymm, xmm" xed="VPSRLD_YMMu32_MASKmskw_YMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_srli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLD" form="ymm {k}, ymm, imm8" xed="VPSRLD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srl_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLD" form="ymm {z}, ymm, xmm" xed="VPSRLD_YMMu32_MASKmskw_YMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLD" form="ymm {z}, ymm, imm8" xed="VPSRLD_YMMu32_MASKmskw_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srl_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLD" form="xmm {k}, xmm, xmm" xed="VPSRLD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLD" form="xmm {k}, xmm, imm8" xed="VPSRLD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srl_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLD" form="xmm {z}, xmm, xmm" xed="VPSRLD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLD" form="xmm {z}, xmm, imm8" xed="VPSRLD_XMMu32_MASKmskw_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
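+<!-- Editor's note: sketch for the 32-bit logical right shifts above. In
+     the srl/srli (uniform-count) forms every lane is shifted by the same
+     amount, taken from bits 63:0 of an __m128i. Values are assumptions.
+
+#include <immintrin.h>
+
+static inline __m256i demo_maskz_srl_epi32(void) {
+    __m256i a   = _mm256_set1_epi32((int)0x80000000u);
+    __m128i cnt = _mm_cvtsi32_si128(31);   /* shift amount in count[63:0] */
+    /* k = 0xAA: odd lanes become 0x80000000 >> 31 = 1 (zeros shifted in),
+       even lanes are zeroed by the mask. */
+    return _mm256_maskz_srl_epi32(0xAA, a, cnt);
+}
+-->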
+<intrinsic tech="AVX-512" name="_mm256_mask_srl_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="ymm {k}, ymm, xmm" xed="VPSRLQ_YMMu64_MASKmskw_YMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_srli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="ymm {k}, ymm, imm8" xed="VPSRLQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srl_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="ymm {z}, ymm, xmm" xed="VPSRLQ_YMMu64_MASKmskw_YMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="ymm {z}, ymm, imm8" xed="VPSRLQ_YMMu64_MASKmskw_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srl_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="xmm {k}, xmm, xmm" xed="VPSRLQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="xmm {k}, xmm, imm8" xed="VPSRLQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srl_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="xmm {z}, xmm, xmm" xed="VPSRLQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="xmm {z}, xmm, imm8" xed="VPSRLQ_XMMu64_MASKmskw_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
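+<!-- Editor's note: sketch for the 64-bit logical right shift by immediate
+     (writemask form); demo values are assumptions.
+
+#include <immintrin.h>
+
+static inline __m128i demo_mask_srli_epi64(void) {
+    __m128i a   = _mm_set1_epi64x(0x100);
+    __m128i src = _mm_set1_epi64x(7);
+    /* k = 0x1: lane0 = 0x100 >> 4 = 0x10, lane1 keeps 7 from src */
+    return _mm_mask_srli_epi64(src, 0x1, a, 4);
+}
+-->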
+<intrinsic tech="AVX-512" name="_mm256_mask_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="ymm {k}, ymm, ymm" xed="VPSRLVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="ymm {z}, ymm, ymm" xed="VPSRLVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="xmm {k}, xmm, xmm" xed="VPSRLVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="xmm {z}, xmm, xmm" xed="VPSRLVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
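+<!-- Editor's note: sketch for the per-lane 32-bit logical right shifts
+     above (writemask form); values are assumptions. Unlike the arithmetic
+     variants, an out-of-range count yields 0, not a sign flood.
+
+#include <immintrin.h>
+
+static inline __m256i demo_mask_srlv_epi32(void) {
+    __m256i a     = _mm256_set1_epi32(0x1000);
+    __m256i count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 32);
+    __m256i src   = _mm256_set1_epi32(-1);
+    /* k = 0x7F: lanes 0..6 = 0x1000 >> count, lane7 keeps -1 from src;
+       with k = 0xFF lane7 would be 0 because its count is out of range. */
+    return _mm256_mask_srlv_epi32(src, 0x7F, a, count);
+}
+-->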
+<intrinsic tech="AVX-512" name="_mm256_mask_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="ymm {k}, ymm, ymm" xed="VPSRLVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="ymm {z}, ymm, ymm" xed="VPSRLVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="xmm {k}, xmm, xmm" xed="VPSRLVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="xmm {z}, xmm, xmm" xed="VPSRLVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
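+<!-- A minimal C usage sketch for the srlv_epi64 family above (all values and
+     the file name below are invented for illustration, not part of the Intel
+     data). It contrasts the writemask and zeromask forms:
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m256i a     = _mm256_set1_epi64x(0x80);
+         __m256i count = _mm256_set_epi64x(64, 4, 2, 1); /* lane 3 shifts to 0 */
+         __m256i src   = _mm256_set1_epi64x(-1);
+         __mmask8 k    = 0x5;                            /* select lanes 0, 2 */
+         __m256i w = _mm256_mask_srlv_epi64(src, k, a, count); /* else: src */
+         __m256i z = _mm256_maskz_srlv_epi64(k, a, count);     /* else: 0   */
+         long long wv[4], zv[4];
+         _mm256_storeu_si256((__m256i *)wv, w);
+         _mm256_storeu_si256((__m256i *)zv, z);
+         for (int i = 0; i < 4; i++)
+             printf("lane %d: mask=%llx maskz=%llx\n", i,
+                    (unsigned long long)wv[i], (unsigned long long)zv[i]);
+         return 0;
+     }
+
+     Build with AVX512F and AVX512VL enabled, e.g. "cc -mavx512vl demo.c".
+-->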
+<intrinsic tech="AVX-512" name="_mm256_mask_sub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBD" form="ymm {k}, ymm, ymm" xed="VPSUBD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBD" form="ymm {z}, ymm, ymm" xed="VPSUBD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBD" form="xmm {k}, xmm, xmm" xed="VPSUBD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBD" form="xmm {z}, xmm, xmm" xed="VPSUBD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
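+<!-- A hedged sketch of the masked 32-bit subtraction above (helper name and
+     mask value invented). Subtraction wraps per lane; unselected lanes come
+     from "src" in the mask form and are zeroed in the maskz form:
+
+     #include <immintrin.h>
+     __m256i sub_demo(__m256i src, __m256i a, __m256i b) {
+         /* lanes 0..3 := a - b; lanes 4..7 copied from src */
+         return _mm256_mask_sub_epi32(src, (__mmask8)0x0F, a, b);
+     }
+-->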
+<intrinsic tech="AVX-512" name="_mm256_mask_sub_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBQ" form="ymm {k}, ymm, ymm" xed="VPSUBQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sub_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSUBQ" form="ymm {z}, ymm, ymm" xed="VPSUBQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sub_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBQ" form="xmm {k}, xmm, xmm" xed="VPSUBQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSUBQ" form="xmm {z}, xmm, xmm" xed="VPSUBQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
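+<!-- The 64-bit forms mirror the 32-bit ones; a minimal sketch (helper name
+     invented) of the 128-bit zero-masking variant:
+
+     #include <immintrin.h>
+     __m128i sub64_low_lane(__m128i a, __m128i b) {
+         /* lane 0 := a0 - b0; lane 1 := 0 */
+         return _mm_maskz_sub_epi64((__mmask8)0x1, a, b);
+     }
+-->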
+<intrinsic tech="AVX-512" name="_mm256_mask_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 32-bit integer, the corresponding bits from "src", "a", and "b" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ FOR h := 0 to 31
+ index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="ymm {k}, ymm, ymm, imm8" xed="VPTERNLOGD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 32-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ FOR h := 0 to 31
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="ymm {z}, ymm, ymm, imm8" xed="VPTERNLOGD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 32-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ FOR h := 0 to 31
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="ymm, ymm, ymm, imm8" xed="VPTERNLOGD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 32-bit integer, the corresponding bits from "src", "a", and "b" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ FOR h := 0 to 31
+ index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="xmm {k}, xmm, xmm, imm8" xed="VPTERNLOGD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 32-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ FOR h := 0 to 31
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="xmm {z}, xmm, xmm, imm8" xed="VPTERNLOGD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 32-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ FOR h := 0 to 31
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="xmm, xmm, xmm, imm8" xed="VPTERNLOGD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
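+<!-- The "imm8" is simply the 8-entry truth table of the 3-input function,
+     indexed by (op1 << 2) | (op2 << 1) | op3. A hedged sketch (helper name
+     and constant worked out here, not taken from the Intel data): 0xCA,
+     binary 11001010, encodes the bitwise select "op1 ? op2 : op3":
+
+     #include <immintrin.h>
+     __m256i select_bits(__m256i mask, __m256i t, __m256i f) {
+         /* per bit: mask ? t : f */
+         return _mm256_ternarylogic_epi32(mask, t, f, 0xCA);
+     }
+-->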
+<intrinsic tech="AVX-512" name="_mm256_mask_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 64-bit integer, the corresponding bits from "src", "a", and "b" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ FOR h := 0 to 63
+ index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="ymm {k}, ymm, ymm, imm8" xed="VPTERNLOGQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 64-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ FOR h := 0 to 63
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="ymm {z}, ymm, ymm, imm8" xed="VPTERNLOGQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 64-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ FOR h := 0 to 63
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="ymm, ymm, ymm, imm8" xed="VPTERNLOGQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 64-bit integer, the corresponding bits from "src", "a", and "b" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ FOR h := 0 to 63
+ index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="xmm {k}, xmm, xmm, imm8" xed="VPTERNLOGQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 64-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ FOR h := 0 to 63
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="xmm {z}, xmm, xmm, imm8" xed="VPTERNLOGQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand bitwise function; the specific function is selected by the value in "imm8". For each bit position in each packed 64-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ FOR h := 0 to 63
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="xmm, xmm, xmm, imm8" xed="VPTERNLOGQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
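+<!-- The same instruction at 64-bit mask granularity; another sketch (helper
+     name invented, constant derived here): 0x96, binary 10010110, is the
+     truth table of a three-way XOR, handy for carry-save addition:
+
+     #include <immintrin.h>
+     __m128i xor3(__m128i a, __m128i b, __m128i c) {
+         /* per bit: a ^ b ^ c */
+         return _mm_ternarylogic_epi64(a, b, c, 0x96);
+     }
+-->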
+<intrinsic tech="AVX-512" name="_mm256_mask_test_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTMD" form="k {k}, ymm, ymm" xed="VPTESTMD_MASKmskw_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_test_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTMD" form="k, ymm, ymm" xed="VPTESTMD_MASKmskw_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_test_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPTESTMD" form="k {k}, xmm, xmm" xed="VPTESTMD_MASKmskw_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_test_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPTESTMD" form="k, xmm, xmm" xed="VPTESTMD_MASKmskw_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_test_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPTESTMQ" form="k {k}, ymm, ymm" xed="VPTESTMQ_MASKmskw_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_test_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPTESTMQ" form="k, ymm, ymm" xed="VPTESTMQ_MASKmskw_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_test_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPTESTMQ" form="k {k}, xmm, xmm" xed="VPTESTMQ_MASKmskw_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_test_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPTESTMQ" form="k, xmm, xmm" xed="VPTESTMQ_MASKmskw_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
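+<!-- A hedged sketch (helper and parameter names invented) of using the test
+     intrinsics above to branch on whether any lane has a flag bit set:
+
+     #include <immintrin.h>
+     int any_flagged(__m256i v, __m256i flag_bits) {
+         __mmask8 k = _mm256_test_epi32_mask(v, flag_bits);
+         return k != 0;   /* 1 if (v & flag_bits) is non-zero in any lane */
+     }
+-->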
+<intrinsic tech="AVX-512" name="_mm256_mask_testn_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k1[j]
+ k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTNMD" form="k {k}, ymm, ymm" xed="VPTESTNMD_MASKmskw_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_testn_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTNMD" form="k, ymm, ymm" xed="VPTESTNMD_MASKmskw_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_testn_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k1[j]
+ k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPTESTNMD" form="k {k}, xmm, xmm" xed="VPTESTNMD_MASKmskw_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_testn_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPTESTNMD" form="k, xmm, xmm" xed="VPTESTNMD_MASKmskw_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_testn_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k1[j]
+ k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPTESTNMQ" form="k {k}, ymm, ymm" xed="VPTESTNMQ_MASKmskw_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_testn_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+ </operation>
+ <instruction name="VPTESTNMQ" form="k, ymm, ymm" xed="VPTESTNMQ_MASKmskw_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_testn_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k1[j]
+ k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPTESTNMQ" form="k {k}, xmm, xmm" xed="VPTESTNMQ_MASKmskw_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_testn_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+ </operation>
+ <instruction name="VPTESTNMQ" form="k, xmm, xmm" xed="VPTESTNMQ_MASKmskw_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
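+<!-- testn is the complementary predicate: it reports lanes where the AND is
+     zero. A minimal sketch (helper and parameter names invented):
+
+     #include <immintrin.h>
+     __mmask8 lanes_clear(__m128i v, __m128i flag_bits) {
+         /* bit j is set iff (v & flag_bits) == 0 in lane j */
+         return _mm_testn_epi32_mask(v, flag_bits);
+     }
+-->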
+<intrinsic tech="AVX-512" name="_mm256_mask_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHDQ" form="ymm {k}, ymm, ymm" xed="VPUNPCKHDQ_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHDQ" form="ymm {z}, ymm, ymm" xed="VPUNPCKHDQ_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKHDQ" form="xmm {k}, xmm, xmm" xed="VPUNPCKHDQ_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKHDQ" form="xmm {z}, xmm, xmm" xed="VPUNPCKHDQ_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHQDQ" form="ymm {k}, ymm, ymm" xed="VPUNPCKHQDQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKHQDQ" form="ymm {z}, ymm, ymm" xed="VPUNPCKHQDQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKHQDQ" form="xmm {k}, xmm, xmm" xed="VPUNPCKHQDQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKHQDQ" form="xmm {z}, xmm, xmm" xed="VPUNPCKHQDQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
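+<!-- Note that the 256-bit unpacks interleave within each 128-bit lane, not
+     across the whole register. A hedged sketch (values invented) making the
+     in-lane ordering concrete:
+
+     #include <immintrin.h>
+     void unpackhi_demo(void) {
+         __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+         __m256i b = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
+         /* r = {2,12,3,13, 6,16,7,17}: high half of each 128-bit lane */
+         __m256i r = _mm256_maskz_unpackhi_epi32((__mmask8)0xFF, a, b);
+         (void)r;
+     }
+-->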
+<intrinsic tech="AVX-512" name="_mm256_mask_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLDQ" form="ymm {k}, ymm, ymm" xed="VPUNPCKLDQ_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLDQ" form="ymm {z}, ymm, ymm" xed="VPUNPCKLDQ_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKLDQ" form="xmm {k}, xmm, xmm" xed="VPUNPCKLDQ_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKLDQ" form="xmm {z}, xmm, xmm" xed="VPUNPCKLDQ_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLQDQ" form="ymm {k}, ymm, ymm" xed="VPUNPCKLQDQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPUNPCKLQDQ" form="ymm {z}, ymm, ymm" xed="VPUNPCKLQDQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
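+<!--
+  Sketch of the per-128-bit-lane behaviour described above (the helper name
+  is illustrative, not part of the source data):
+
+  #include <immintrin.h>
+
+  __m256i unpacklo_qwords_all_lanes(__m256i a, __m256i b) {
+      /* 0xF keeps all four 64-bit result lanes, so this reduces to the
+         plain lane-wise interleave. */
+      return _mm256_maskz_unpacklo_epi64(0xF, a, b);
+  }
+
+  Note the result is {a0, b0, a2, b2}, not {a0, b0, a1, b1}: the interleave
+  runs independently inside each 128-bit lane.
+-->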
+<intrinsic tech="AVX-512" name="_mm_mask_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKLQDQ" form="xmm {k}, xmm, xmm" xed="VPUNPCKLQDQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPUNPCKLQDQ" form="xmm {z}, xmm, xmm" xed="VPUNPCKLQDQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPXORD" form="ymm {k}, ymm, ymm" xed="VPXORD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPXORD" form="ymm {z}, ymm, ymm" xed="VPXORD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPXORD" form="xmm {k}, xmm, xmm" xed="VPXORD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPXORD" form="xmm {z}, xmm, xmm" xed="VPXORD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
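+<!--
+  Sketch of merge-masked XOR (the helper name and mask are illustrative):
+
+  #include <immintrin.h>
+
+  __m128i xor_low_two_lanes(__m128i src, __m128i a, __m128i b) {
+      /* Lanes 0 and 1 receive a ^ b; lanes 2 and 3 are copied from src. */
+      return _mm_mask_xor_epi32(src, 0x3, a, b);
+  }
+-->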
+<intrinsic tech="AVX-512" name="_mm256_mask_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPXORQ" form="ymm {k}, ymm, ymm" xed="VPXORQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPXORQ" form="ymm {z}, ymm, ymm" xed="VPXORQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPXORQ" form="xmm {k}, xmm, xmm" xed="VPXORQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPXORQ" form="xmm {z}, xmm, xmm" xed="VPXORQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
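+<!--
+  Sketch of the zero-masked form: XOR in the low qword only, forcing the
+  high qword to zero (the helper name is illustrative):
+
+  #include <immintrin.h>
+
+  __m128i xor_low_qword_only(__m128i a, __m128i b) {
+      return _mm_maskz_xor_epi64(0x1, a, b);
+  }
+-->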
+<intrinsic tech="AVX-512" name="_mm256_mask_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="ymm {k}, ymm" xed="VRCP14PD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="ymm {z}, ymm" xed="VRCP14PD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="ymm, ymm" xed="VRCP14PD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="xmm {k}, xmm" xed="VRCP14PD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="xmm {z}, xmm" xed="VRCP14PD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="xmm, xmm" xed="VRCP14PD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
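+<!--
+  The 2^-14 approximation is commonly refined with one Newton-Raphson step,
+  x1 = x0 * (2 - a*x0), which roughly squares the relative error. A sketch
+  under that standard identity (the helper name is illustrative):
+
+  #include <immintrin.h>
+
+  __m128d rcp_refined_pd(__m128d a) {
+      __m128d x0  = _mm_rcp14_pd(a);        /* |rel. err| < 2^-14 */
+      __m128d two = _mm_set1_pd(2.0);
+      /* One refinement step; the error drops to roughly 2^-28. */
+      return _mm_mul_pd(x0, _mm_sub_pd(two, _mm_mul_pd(a, x0)));
+  }
+-->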
+<intrinsic tech="AVX-512" name="_mm256_mask_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="ymm {k}, ymm" xed="VRCP14PS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="ymm {z}, ymm" xed="VRCP14PS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="ymm, ymm" xed="VRCP14PS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="xmm {k}, xmm" xed="VRCP14PS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="xmm {z}, xmm" xed="VRCP14PS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="xmm, xmm" xed="VRCP14PS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
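+<!--
+  Sketch: zero-masked reciprocal approximation, e.g. to avoid producing Inf
+  for lanes known to hold zero (mask argument and helper name illustrative):
+
+  #include <immintrin.h>
+
+  __m128 guarded_rcp_ps(__m128 a, __mmask8 nonzero_lanes) {
+      /* Lanes whose mask bit is clear become 0 instead of 1/0 = Inf. */
+      return _mm_maskz_rcp14_ps(nonzero_lanes, a);
+  }
+-->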
+<intrinsic tech="AVX-512" name="_mm256_mask_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="ymm {k}, ymm, imm8" xed="VRNDSCALEPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="ymm {z}, ymm, imm8" xed="VRNDSCALEPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="ymm, ymm, imm8" xed="VRNDSCALEPD_YMMf64_MASKmskw_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="xmm {k}, xmm, imm8" xed="VRNDSCALEPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="xmm {z}, xmm, imm8" xed="VRNDSCALEPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="xmm, xmm, imm8" xed="VRNDSCALEPD_XMMf64_MASKmskw_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
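+<!--
+  Sketch of the imm8 layout used above: bits [7:4] hold M, the number of
+  binary fraction bits to preserve, and bits [3:0] hold the rounding
+  control (0x0 nearest even, 0x1 down, 0x2 up, 0x3 truncate; OR in 0x8 to
+  suppress the precision exception). The helper name is illustrative:
+
+  #include <immintrin.h>
+
+  __m128d round_to_halves(__m128d a) {
+      /* M = 1, round to nearest: each element becomes a multiple of 0.5,
+         e.g. 1.3 becomes 1.5 and 2.7 becomes 2.5. */
+      return _mm_roundscale_pd(a, (1 << 4) | 0x0);
+  }
+-->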
+<intrinsic tech="AVX-512" name="_mm256_mask_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="ymm {k}, ymm, imm8" xed="VRNDSCALEPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="ymm {z}, ymm, imm8" xed="VRNDSCALEPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="ymm, ymm, imm8" xed="VRNDSCALEPS_YMMf32_MASKmskw_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="xmm {k}, xmm, imm8" xed="VRNDSCALEPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="xmm {z}, xmm, imm8" xed="VRNDSCALEPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="xmm, xmm, imm8" xed="VRNDSCALEPS_XMMf32_MASKmskw_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
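+<!--
+  Sketch: merge-masked rounding that leaves unselected lanes untouched
+  (mask value and helper name illustrative):
+
+  #include <immintrin.h>
+
+  __m128 truncate_low_pair(__m128 src, __m128 a) {
+      /* imm8 = (0 << 4) | 0x3: M = 0 fraction bits, truncate. Lanes 0 and
+         1 get trunc(a); lanes 2 and 3 are copied from src. */
+      return _mm_mask_roundscale_ps(src, 0x3, a, (0 << 4) | 0x3);
+  }
+-->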
+<intrinsic tech="AVX-512" name="_mm256_mask_rsqrt14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRSQRT14PD" form="ymm {k}, ymm" xed="VRSQRT14PD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rsqrt14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRSQRT14PD" form="ymm {z}, ymm" xed="VRSQRT14PD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rsqrt14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14PD" form="xmm {k}, xmm" xed="VRSQRT14PD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rsqrt14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14PD" form="xmm {z}, xmm" xed="VRSQRT14PD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
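+<!--
+  As with RCP14, the 2^-14 reciprocal square root is often refined with one
+  Newton-Raphson step, x1 = x0 * (1.5 - 0.5*a*x0*x0). A sketch only; the
+  helper name is illustrative:
+
+  #include <immintrin.h>
+
+  __m128d rsqrt_refined_pd(__m128d a) {
+      __m128d x0  = _mm_rsqrt14_pd(a);                 /* |err| < 2^-14 */
+      __m128d axx = _mm_mul_pd(_mm_mul_pd(a, x0), x0); /* a * x0^2 */
+      return _mm_mul_pd(x0, _mm_sub_pd(_mm_set1_pd(1.5),
+                                       _mm_mul_pd(_mm_set1_pd(0.5), axx)));
+  }
+-->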
+<intrinsic tech="AVX-512" name="_mm256_mask_rsqrt14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRSQRT14PS" form="ymm {k}, ymm" xed="VRSQRT14PS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_rsqrt14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VRSQRT14PS" form="ymm {z}, ymm" xed="VRSQRT14PS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rsqrt14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14PS" form="xmm {k}, xmm" xed="VRSQRT14PS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rsqrt14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14PS" form="xmm {z}, xmm" xed="VRSQRT14PS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
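+<!--
+  Sketch: zeroing lanes that would otherwise yield Inf (1/sqrt(0)) or NaN
+  (negative inputs); the helper name is illustrative:
+
+  #include <immintrin.h>
+
+  __m128 guarded_rsqrt_ps(__m128 a) {
+      /* Keep only lanes where a > 0. */
+      __mmask8 pos = _mm_cmp_ps_mask(a, _mm_setzero_ps(), _CMP_GT_OQ);
+      return _mm_maskz_rsqrt14_ps(pos, a);
+  }
+-->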
+<intrinsic tech="AVX-512" name="_mm256_mask_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="ymm {k}, ymm, ymm" xed="VSCALEFPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="ymm {z}, ymm, ymm" xed="VSCALEFPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="ymm, ymm, ymm" xed="VSCALEFPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="xmm {k}, xmm, xmm" xed="VSCALEFPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="xmm {z}, xmm, xmm" xed="VSCALEFPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="xmm, xmm, xmm" xed="VSCALEFPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
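+<!--
+  SCALEF is effectively a vectorised ldexp: dst = a * 2^floor(b), with the
+  NaN/denormal handling spelled out in the pseudocode above. A sketch (the
+  helper name is illustrative):
+
+  #include <immintrin.h>
+
+  __m128d ldexp_pd(__m128d a, __m128d e) {
+      /* e.g. a = {3.0, 5.0}, e = {2.0, -1.0} gives {12.0, 2.5}. */
+      return _mm_scalef_pd(a, e);
+  }
+-->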
+<intrinsic tech="AVX-512" name="_mm256_mask_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="ymm {k}, ymm, ymm" xed="VSCALEFPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="ymm {z}, ymm, ymm" xed="VSCALEFPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 7
+ i := j*32
+  dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="ymm, ymm, ymm" xed="VSCALEFPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+    dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="xmm {k}, xmm, xmm" xed="VSCALEFPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+    dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="xmm {z}, xmm, xmm" xed="VSCALEFPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 3
+ i := j*32
+  dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="xmm, xmm, xmm" xed="VSCALEFPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
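+<!--
+  A minimal C usage sketch for the scalef intrinsics above, assuming a compiler
+  targeting AVX512F and AVX512VL (for example -mavx512f -mavx512vl); the helper
+  name is illustrative, not part of any API.
+
+  #include <immintrin.h>
+
+  /* Per lane: a[i] * 2^floor(b[i]); lanes with a clear mask bit keep src. */
+  static __m256 scalef_blend(__m256 src, __mmask8 k, __m256 a, __m256 b) {
+      return _mm256_mask_scalef_ps(src, k, a, b);
+  }
+-->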
+<intrinsic tech="AVX-512" name="_mm256_i32scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="vm32x, ymm" xed="VSCATTERDPD_MEMf64_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_i32scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="vm32x {k}, ymm" xed="VSCATTERDPD_MEMf64_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_i32scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="vm32x, xmm" xed="VSCATTERDPD_MEMf64_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_i32scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="vm32x {k}, xmm" xed="VSCATTERDPD_MEMf64_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
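+<!--
+  A hedged sketch of _mm256_i32scatter_pd under the same AVX512F/AVX512VL
+  assumptions; the buffer size and index values are illustrative. Note the
+  scale of 8 matches sizeof(double).
+
+  #include <immintrin.h>
+
+  static void scatter_even(double *buf /* at least 7 elements */) {
+      __m128i idx = _mm_setr_epi32(0, 2, 4, 6);  /* 32-bit indices */
+      __m256d v   = _mm256_set1_pd(1.5);
+      _mm256_i32scatter_pd(buf, idx, v, 8);      /* buf[0,2,4,6] = 1.5 */
+  }
+-->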
+<intrinsic tech="AVX-512" name="_mm256_i32scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPS" form="vm32y, ymm" xed="VSCATTERDPS_MEMf32_MASKmskw_YMMf32_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_i32scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPS" form="vm32y {k}, ymm" xed="VSCATTERDPS_MEMf32_MASKmskw_YMMf32_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_i32scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPS" form="vm32x, xmm" xed="VSCATTERDPS_MEMf32_MASKmskw_XMMf32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_i32scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPS" form="vm32x {k}, xmm" xed="VSCATTERDPS_MEMf32_MASKmskw_XMMf32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
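+<!--
+  A sketch of the masked 128-bit variant, _mm_mask_i32scatter_ps: only lanes
+  whose mask bit is set touch memory. Same AVX512F/AVX512VL assumption; the
+  values are illustrative.
+
+  #include <immintrin.h>
+
+  static void scatter_some(float *buf) {
+      __m128i idx = _mm_setr_epi32(0, 1, 2, 3);
+      __m128  v   = _mm_set1_ps(2.0f);
+      _mm_mask_i32scatter_ps(buf, 0x5, idx, v, 4);  /* writes buf[0] and buf[2] */
+  }
+-->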
+<intrinsic tech="AVX-512" name="_mm256_i64scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPD" form="vm64y, ymm" xed="VSCATTERQPD_MEMf64_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_i64scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPD" form="vm64y {k}, ymm" xed="VSCATTERQPD_MEMf64_MASKmskw_YMMf64_AVX512_VL256"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_i64scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPD" form="vm64x, xmm" xed="VSCATTERQPD_MEMf64_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_i64scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPD" form="vm64x {k}, xmm" xed="VSCATTERQPD_MEMf64_MASKmskw_XMMf64_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
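+<!--
+  A sketch of _mm256_i64scatter_pd, which takes 64-bit indices in a __m256i.
+  Same AVX512F/AVX512VL assumption; names are illustrative.
+
+  #include <immintrin.h>
+
+  static void scatter_q(double *buf) {
+      __m256i idx = _mm256_setr_epi64x(0, 1, 2, 3);
+      __m256d v   = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
+      _mm256_i64scatter_pd(buf, idx, v, 8);  /* buf[j] = (double)j */
+  }
+-->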
+<intrinsic tech="AVX-512" name="_mm256_i64scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPS" form="vm64y, xmm" xed="VSCATTERQPS_MEMf32_MASKmskw_XMMf32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_i64scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPS" form="vm64y {k}, xmm" xed="VSCATTERQPS_MEMf32_MASKmskw_XMMf32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_i64scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPS" form="vm64x, xmm" xed="VSCATTERQPS_MEMf32_MASKmskw_XMMf32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_i64scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="vindex" etype="SI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPS" form="vm64x {k}, xmm" xed="VSCATTERQPS_MEMf32_MASKmskw_XMMf32_AVX512_VL128"/>
+ <header>immintrin.h</header>
+</intrinsic>
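+<!--
+  A sketch of _mm256_i64scatter_ps: four 64-bit indices in a __m256i select the
+  destinations for four floats held in a __m128. Assumes AVX512F/AVX512VL;
+  index values are illustrative.
+
+  #include <immintrin.h>
+
+  static void scatter_q_ps(float *buf) {
+      __m256i idx = _mm256_setr_epi64x(0, 8, 16, 24);
+      __m128  v   = _mm_set1_ps(3.0f);
+      _mm256_i64scatter_ps(buf, idx, v, 4);  /* writes buf[0,8,16,24] */
+  }
+-->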
+<intrinsic tech="AVX-512" name="_mm256_mask_shuffle_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst.m128[0] := a.m128[imm8[0]]
+tmp_dst.m128[1] := b.m128[imm8[1]]
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFF32X4" form="ymm {k}, ymm, ymm, imm8" xed="VSHUFF32X4_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shuffle_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst.m128[0] := a.m128[imm8[0]]
+tmp_dst.m128[1] := b.m128[imm8[1]]
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFF32X4" form="ymm {z}, ymm, ymm, imm8" xed="VSHUFF32X4_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shuffle_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst.m128[0] := a.m128[imm8[0]]
+dst.m128[1] := b.m128[imm8[1]]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFF32X4" form="ymm, ymm, ymm, imm8" xed="VSHUFF32X4_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
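+<!--
+  A sketch of the common "swap 128-bit halves" use of _mm256_shuffle_f32x4:
+  bit 0 of imm8 picks the lane taken from "a", bit 1 the lane taken from "b".
+  Assumes AVX512F/AVX512VL; the helper name is illustrative.
+
+  #include <immintrin.h>
+
+  static __m256 swap_halves(__m256 a) {
+      /* imm8 = 1: dst.m128[0] = a.m128[1], dst.m128[1] = a.m128[0] */
+      return _mm256_shuffle_f32x4(a, a, 0x1);
+  }
+-->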
+<intrinsic tech="AVX-512" name="_mm256_mask_shuffle_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst.m128[0] := a.m128[imm8[0]]
+tmp_dst.m128[1] := b.m128[imm8[1]]
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFF64X2" form="ymm {k}, ymm, ymm, imm8" xed="VSHUFF64X2_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shuffle_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst.m128[0] := a.m128[imm8[0]]
+tmp_dst.m128[1] := b.m128[imm8[1]]
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFF64X2" form="ymm {z}, ymm, ymm, imm8" xed="VSHUFF64X2_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shuffle_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst.m128[0] := a.m128[imm8[0]]
+dst.m128[1] := b.m128[imm8[1]]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFF64X2" form="ymm, ymm, ymm, imm8" xed="VSHUFF64X2_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
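+<!--
+  A sketch of _mm256_shuffle_f64x2 combining one 128-bit lane from each operand,
+  under the same AVX512F/AVX512VL assumptions as above.
+
+  #include <immintrin.h>
+
+  static __m256d lo_of_a_hi_of_b(__m256d a, __m256d b) {
+      /* imm8 = 2 (bit 0 = 0, bit 1 = 1): dst = { a.m128[0], b.m128[1] } */
+      return _mm256_shuffle_f64x2(a, b, 0x2);
+  }
+-->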
+<intrinsic tech="AVX-512" name="_mm256_mask_shuffle_i32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst.m128[0] := a.m128[imm8[0]]
+tmp_dst.m128[1] := b.m128[imm8[1]]
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFI32X4" form="ymm {k}, ymm, ymm, imm8" xed="VSHUFI32X4_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shuffle_i32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst.m128[0] := a.m128[imm8[0]]
+tmp_dst.m128[1] := b.m128[imm8[1]]
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFI32X4" form="ymm {z}, ymm, ymm, imm8" xed="VSHUFI32X4_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shuffle_i32x4">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst.m128[0] := a.m128[imm8[0]]
+dst.m128[1] := b.m128[imm8[1]]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFI32X4" form="ymm, ymm, ymm, imm8" xed="VSHUFI32X4_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shuffle_i64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst.m128[0] := a.m128[imm8[0]]
+tmp_dst.m128[1] := b.m128[imm8[1]]
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFI64X2" form="ymm {k}, ymm, ymm, imm8" xed="VSHUFI64X2_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shuffle_i64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst.m128[0] := a.m128[imm8[0]]
+tmp_dst.m128[1] := b.m128[imm8[1]]
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFI64X2" form="ymm {z}, ymm, ymm, imm8" xed="VSHUFI64X2_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shuffle_i64x2">
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst.m128[0] := a.m128[imm8[0]]
+dst.m128[1] := b.m128[imm8[1]]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFI64X2" form="ymm, ymm, ymm, imm8" xed="VSHUFI64X2_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
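+<!--
+  The integer forms follow the same lane-selection rule. A sketch concatenating
+  the low 128 bits of two integer vectors (assumes AVX512F/AVX512VL):
+
+  #include <immintrin.h>
+
+  static __m256i concat_lows(__m256i a, __m256i b) {
+      /* imm8 = 0: dst = { a.m128[0], b.m128[0] } */
+      return _mm256_shuffle_i32x4(a, b, 0x0);
+  }
+-->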
+<intrinsic tech="AVX-512" name="_mm256_mask_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFPD" form="ymm {k}, ymm, ymm, imm8" xed="VSHUFPD_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFPD" form="ymm {z}, ymm, ymm, imm8" xed="VSHUFPD_YMMf64_MASKmskw_YMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSHUFPD" form="xmm {k}, xmm, xmm, imm8" xed="VSHUFPD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSHUFPD" form="xmm {z}, xmm, xmm, imm8" xed="VSHUFPD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
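+<!--
+  A sketch of the zero-masked pd shuffle: imm8 bit j picks the high (1) or
+  low (0) element of the source lane feeding dst element j. Assumes
+  AVX512F/AVX512VL; the helper name is illustrative.
+
+  #include <immintrin.h>
+
+  static __m256d dup_odd(__mmask8 k, __m256d a) {
+      /* imm8 = 0xF duplicates the odd element of each 128-bit lane;
+         lanes with a clear mask bit are zeroed */
+      return _mm256_maskz_shuffle_pd(k, a, a, 0xF);
+  }
+-->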
+<intrinsic tech="AVX-512" name="_mm256_mask_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFPS" form="ymm {k}, ymm, ymm, imm8" xed="VSHUFPS_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSHUFPS" form="ymm {z}, ymm, ymm, imm8" xed="VSHUFPS_YMMf32_MASKmskw_YMMf32_YMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSHUFPS" form="xmm {k}, xmm, xmm, imm8" xed="VSHUFPS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSHUFPS" form="xmm {z}, xmm, xmm, imm8" xed="VSHUFPS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
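+<!--
+  A sketch of the write-masked ps shuffle using the standard _MM_SHUFFLE helper
+  from immintrin.h; passing "a" for both operands permutes a single vector.
+  Assumes AVX512F/AVX512VL; the helper name is illustrative.
+
+  #include <immintrin.h>
+
+  static __m128 reverse_blend(__m128 src, __mmask8 k, __m128 a) {
+      /* _MM_SHUFFLE(0,1,2,3) reverses the four elements; masked-off lanes keep src */
+      return _mm_mask_shuffle_ps(src, k, a, a, _MM_SHUFFLE(0, 1, 2, 3));
+  }
+-->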
+<intrinsic tech="AVX-512" name="_mm256_mask_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="ymm {k}, ymm" xed="VSQRTPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="ymm {z}, ymm" xed="VSQRTPD_YMMf64_MASKmskw_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="xmm {k}, xmm" xed="VSQRTPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="xmm {z}, xmm" xed="VSQRTPD_XMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
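+<!--
+  A sketch of the write-masked double sqrt: masked-off lanes keep the values
+  already in "fallback". Assumes AVX512F/AVX512VL; names are illustrative.
+
+  #include <immintrin.h>
+
+  static __m256d sqrt_blend(__m256d fallback, __mmask8 k, __m256d a) {
+      return _mm256_mask_sqrt_pd(fallback, k, a);
+  }
+-->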
+<intrinsic tech="AVX-512" name="_mm256_mask_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="ymm {k}, ymm" xed="VSQRTPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="ymm {z}, ymm" xed="VSQRTPS_YMMf32_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="xmm {k}, xmm" xed="VSQRTPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="xmm {z}, xmm" xed="VSQRTPS_XMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
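+<!--
+  A common mask idiom with the ps sqrt: build the mask from a comparison so the
+  square root is taken only where the input is non-negative, and other lanes are
+  zeroed. Assumes AVX512F/AVX512VL; the helper name is illustrative.
+
+  #include <immintrin.h>
+
+  static __m256 sqrt_nonneg(__m256 a) {
+      __mmask8 nonneg = _mm256_cmp_ps_mask(a, _mm256_setzero_ps(), _CMP_GE_OQ);
+      return _mm256_maskz_sqrt_ps(nonneg, a);
+  }
+-->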
+<intrinsic tech="AVX-512" name="_mm256_mask_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSUBPD" form="ymm {k}, ymm, ymm" xed="VSUBPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSUBPD" form="ymm {z}, ymm, ymm" xed="VSUBPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBPD" form="xmm {k}, xmm, xmm" xed="VSUBPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBPD" form="xmm {z}, xmm, xmm" xed="VSUBPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSUBPS" form="ymm {k}, ymm, ymm" xed="VSUBPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VSUBPS" form="ymm {z}, ymm, ymm" xed="VSUBPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBPS" form="xmm {k}, xmm, xmm" xed="VSUBPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBPS" form="xmm {z}, xmm, xmm" xed="VSUBPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
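+<!-- Usage sketch (editorial, not part of the upstream data): a small C example
+     contrasting the writemask and zeromask forms of masked subtraction; the
+     operand values are made up for illustration (AVX512VL+AVX512F target).
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256d a   = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
+    __m256d b   = _mm256_set1_pd(1.0);
+    __m256d src = _mm256_set1_pd(-1.0);               /* writemask fallback */
+    __mmask8 k  = 0x3;                                /* lanes 0 and 1 active */
+    __m256d r1 = _mm256_mask_sub_pd(src, k, a, b);    /* {9, 19, -1, -1} */
+    __m256d r2 = _mm256_maskz_sub_pd(k, a, b);        /* {9, 19,  0,  0} */
+    double out[4];
+    _mm256_storeu_pd(out, r1);
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
+    _mm256_storeu_pd(out, r2);
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
+    return 0;
+}
+-->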
+<intrinsic tech="AVX-512" name="_mm256_mask_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKHPD" form="ymm {k}, ymm, ymm" xed="VUNPCKHPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKHPD" form="ymm {z}, ymm, ymm" xed="VUNPCKHPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VUNPCKHPD" form="xmm {k}, xmm, xmm" xed="VUNPCKHPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VUNPCKHPD" form="xmm {z}, xmm, xmm" xed="VUNPCKHPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKHPS" form="ymm {k}, ymm, ymm" xed="VUNPCKHPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKHPS" form="ymm {z}, ymm, ymm" xed="VUNPCKHPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VUNPCKHPS" form="xmm {k}, xmm, xmm" xed="VUNPCKHPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VUNPCKHPS" form="xmm {z}, xmm, xmm" xed="VUNPCKHPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
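+<!-- Usage sketch (editorial, not part of the upstream data): a C example of the
+     per-128-bit-lane behaviour of unpackhi; inputs are chosen so every source
+     element is identifiable, and a full mask shows the unmasked interleave.
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256d a = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);     /* a[j] = j */
+    __m256d b = _mm256_setr_pd(10.0, 11.0, 12.0, 13.0);
+    /* high qword of each 128-bit lane, interleaved a then b */
+    __m256d r = _mm256_maskz_unpackhi_pd(0xF, a, b);    /* {1, 11, 3, 13} */
+    double out[4];
+    _mm256_storeu_pd(out, r);
+    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
+    return 0;
+}
+-->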
+<intrinsic tech="AVX-512" name="_mm256_mask_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKLPD" form="ymm {k}, ymm, ymm" xed="VUNPCKLPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKLPD" form="ymm {z}, ymm, ymm" xed="VUNPCKLPD_YMMf64_MASKmskw_YMMf64_YMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VUNPCKLPD" form="xmm {k}, xmm, xmm" xed="VUNPCKLPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VUNPCKLPD" form="xmm {z}, xmm, xmm" xed="VUNPCKLPD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKLPS" form="ymm {k}, ymm, ymm" xed="VUNPCKLPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VUNPCKLPS" form="ymm {z}, ymm, ymm" xed="VUNPCKLPS_YMMf32_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VUNPCKLPS" form="xmm {k}, xmm, xmm" xed="VUNPCKLPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VUNPCKLPS" form="xmm {z}, xmm, xmm" xed="VUNPCKLPS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
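+<!-- Usage sketch (editorial, not part of the upstream data): pairing unpacklo
+     with unpackhi fully interleaves two vectors; a C example with made-up
+     inputs, assuming an AVX512VL+AVX512F target.
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
+    __m128 b = _mm_setr_ps(10.0f, 11.0f, 12.0f, 13.0f);
+    __m128 lo = _mm_maskz_unpacklo_ps(0xF, a, b);   /* {0, 10, 1, 11} */
+    __m128 hi = _mm_maskz_unpackhi_ps(0xF, a, b);   /* {2, 12, 3, 13} */
+    float out[8];
+    _mm_storeu_ps(out, lo);
+    _mm_storeu_ps(out + 4, hi);
+    for (int i = 0; i < 8; i++) printf("%g ", out[i]);
+    printf("\n");                                   /* 0 10 1 11 2 12 3 13 */
+    return 0;
+}
+-->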
+<intrinsic tech="AVX-512" name="_mm512_storeu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVDQU64" form="m512, zmm" xed="VMOVDQU64_MEMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVDQU32" form="m512, zmm" xed="VMOVDQU32_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_storeu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVDQU64" form="m256, ymm" xed="VMOVDQU64_MEMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVDQU32" form="m256, ymm" xed="VMOVDQU32_MEMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_storeu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="VMOVDQU64" form="m128, xmm" xed="VMOVDQU64_MEMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="VMOVDQU32" form="m128, xmm" xed="VMOVDQU32_MEMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
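+<!-- Usage sketch (editorial, not part of the upstream data): the unaligned-store
+     intrinsics above accept any address; a C example storing into a
+     deliberately misaligned slot of a plain int array (assumed AVX512VL+AVX512F).
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    int buf[9] = {0};
+    __m256i v = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    _mm256_storeu_epi32(&buf[1], v);    /* &buf[1] is not 32-byte aligned */
+    for (int i = 0; i < 9; i++) printf("%d ", buf[i]);   /* 0 1 2 3 4 5 6 7 8 */
+    printf("\n");
+    return 0;
+}
+-->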
+<intrinsic tech="AVX-512" name="_mm256_store_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVDQA64" form="m256, ymm" xed="VMOVDQA64_MEMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_store_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory.
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+ </operation>
+ <instruction name="VMOVDQA32" form="m256, ymm" xed="VMOVDQA32_MEMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_store_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="VMOVDQA64" form="m128, xmm" xed="VMOVDQA64_MEMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_store_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="VMOVDQA32" form="m128, xmm" xed="VMOVDQA32_MEMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
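+<!-- Usage sketch (editorial, not part of the upstream data): the aligned-store
+     intrinsics above require 32-byte (ymm) or 16-byte (xmm) alignment; a C11
+     example using alignas to satisfy the ymm requirement.
+#include <immintrin.h>
+#include <stdalign.h>
+#include <stdio.h>
+
+int main(void) {
+    alignas(32) int buf[8];                        /* 32-byte aligned storage */
+    _mm256_store_epi32(buf, _mm256_set1_epi32(7));
+    printf("%d %d\n", buf[0], buf[7]);             /* 7 7 */
+    return 0;
+}
+-->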
+<intrinsic tech="AVX-512" name="_mm512_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="zmm, m512" xed="VMOVDQU64_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <description>Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="zmm, m512" xed="VMOVDQU32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <description>Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="ymm, m256" xed="VMOVDQU64_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <description>Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="ymm, m256" xed="VMOVDQU32_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <description>Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="xmm, m128" xed="VMOVDQU64_XMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <description>Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="xmm, m128" xed="VMOVDQU32_XMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
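+<!-- Usage sketch (editorial, not part of the upstream data): a C example of an
+     unaligned load from an arbitrary offset, e.g. a sliding window over an
+     int array (assumed AVX512VL+AVX512F target).
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    int data[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    /* offset 3 is not 16-byte aligned; _mm_loadu_epi32 allows it anyway */
+    __m128i w = _mm_loadu_epi32(&data[3]);          /* {3, 4, 5, 6} */
+    int out[4];
+    _mm_storeu_epi32(out, w);
+    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
+    return 0;
+}
+-->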
+<intrinsic tech="AVX-512" name="_mm256_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="256"/>
+ <description>Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="ymm, m256" xed="VMOVDQA64_YMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="256"/>
+ <description>Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst".
+ "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="ymm, m256" xed="VMOVDQA32_YMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="128"/>
+ <description>Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="xmm, m128" xed="VMOVDQA64_XMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <description>Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="xmm, m128" xed="VMOVDQA32_XMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
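+<!-- Usage sketch (editorial, not part of the upstream data): for the aligned
+     loads above the buffer must meet the stated boundary; a C11 example using
+     aligned_alloc (note the size must be a multiple of the alignment).
+#include <immintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(void) {
+    int *p = aligned_alloc(32, 8 * sizeof(int));   /* 32-byte aligned */
+    if (!p) return 1;
+    for (int i = 0; i < 8; i++) p[i] = i * i;
+    __m256i v = _mm256_load_epi32(p);              /* aligned ymm load */
+    int out[8];
+    _mm256_storeu_epi32(out, v);
+    printf("%d %d\n", out[0], out[7]);             /* 0 49 */
+    free(p);
+    return 0;
+}
+-->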
+<intrinsic tech="AVX-512" name="_mm256_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPXORQ" form="ymm, ymm" xed="VPXORQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPXORD" form="ymm, ymm" xed="VPXORD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPXORQ" form="xmm, xmm" xed="VPXORQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPXORD" form="xmm, xmm" xed="VPXORD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
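+<!-- Usage sketch (editorial, not part of the upstream data): a C example of
+     _mm256_xor_epi32 toggling the low bit of every element; the inputs are
+     assumptions (target: AVX512VL+AVX512F).
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m256i v = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    __m256i t = _mm256_xor_epi32(v, _mm256_set1_epi32(1));
+    int out[8];
+    _mm256_storeu_epi32(out, t);
+    for (int i = 0; i < 8; i++) printf("%d ", out[i]);   /* 1 0 3 2 5 4 7 6 */
+    printf("\n");
+    return 0;
+}
+-->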
+<intrinsic tech="AVX-512" name="_mm256_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPORQ" form="ymm, ymm" xed="VPORQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPORD" form="ymm, ymm" xed="VPORD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPORQ" form="xmm, xmm" xed="VPORQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPORD" form="xmm, xmm" xed="VPORD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
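+<!-- Usage sketch (editorial, not part of the upstream data): a C example
+     merging two per-element flag sets with _mm_or_epi32; the flag values are
+     made up for illustration.
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128i lo_flags = _mm_set1_epi32(0x0F);
+    __m128i hi_flags = _mm_set1_epi32(0xF0);
+    __m128i merged = _mm_or_epi32(lo_flags, hi_flags);
+    int out[4];
+    _mm_storeu_epi32(out, merged);
+    printf("0x%02X\n", (unsigned)out[0]);          /* 0xFF in every element */
+    return 0;
+}
+-->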
+<intrinsic tech="Other" name="_mm512_aesenclast_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>VAES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m512i" varname="dst" etype="M128"/>
+ <parameter type="__m512i" varname="a" etype="M128"/>
+ <parameter type="__m512i" varname="RoundKey" etype="M128"/>
+	<description>Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*128
+ a[i+127:i] := ShiftRows(a[i+127:i])
+ a[i+127:i] := SubBytes(a[i+127:i])
+ dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VAESENCLAST" form="zmm, zmm" xed="VAESENCLAST_ZMMu128_ZMMu128_ZMMu128_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_aesenc_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>VAES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m512i" varname="dst" etype="M128"/>
+ <parameter type="__m512i" varname="a" etype="M128"/>
+ <parameter type="__m512i" varname="RoundKey" etype="M128"/>
+	<description>Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*128
+ a[i+127:i] := ShiftRows(a[i+127:i])
+ a[i+127:i] := SubBytes(a[i+127:i])
+ a[i+127:i] := MixColumns(a[i+127:i])
+ dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VAESENC" form="zmm, zmm" xed="VAESENC_ZMMu128_ZMMu128_ZMMu128_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_aesdeclast_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>VAES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m512i" varname="dst" etype="M128"/>
+ <parameter type="__m512i" varname="a" etype="M128"/>
+ <parameter type="__m512i" varname="RoundKey" etype="M128"/>
+ <description>Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*128
+ a[i+127:i] := InvShiftRows(a[i+127:i])
+ a[i+127:i] := InvSubBytes(a[i+127:i])
+ dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VAESDECLAST" form="zmm, zmm" xed="VAESDECLAST_ZMMu128_ZMMu128_ZMMu128_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_aesdec_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <CPUID>VAES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m512i" varname="dst" etype="M128"/>
+ <parameter type="__m512i" varname="a" etype="M128"/>
+ <parameter type="__m512i" varname="RoundKey" etype="M128"/>
+ <description>Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*128
+ a[i+127:i] := InvShiftRows(a[i+127:i])
+ a[i+127:i] := InvSubBytes(a[i+127:i])
+ a[i+127:i] := InvMixColumns(a[i+127:i])
+ dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VAESDEC" form="zmm, zmm" xed="VAESDEC_ZMMu128_ZMMu128_ZMMu128_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
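+<!-- Usage sketch (editorial, not part of the upstream data): each VAES
+     intrinsic above processes four AES blocks per call. A C sketch of the
+     standard AES-128 round sequence; the expanded key rk[0..10] is assumed to
+     be supplied by the caller (each round key broadcast to all four 128-bit
+     lanes), and deriving it is out of scope here. Target: VAES with AVX512F.
+#include <immintrin.h>
+
+static __m512i aes128_encrypt_x4(__m512i blocks, const __m512i rk[11]) {
+    __m512i s = _mm512_xor_si512(blocks, rk[0]);    /* initial AddRoundKey */
+    for (int r = 1; r < 10; r++)
+        s = _mm512_aesenc_epi128(s, rk[r]);         /* rounds 1 through 9 */
+    return _mm512_aesenclast_epi128(s, rk[10]);     /* final round, no MixColumns */
+}
+-->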
+<intrinsic tech="AVX-512" name="_kand_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] AND b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KANDW" form="k, k, k" xed="KANDW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kandn_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := (NOT a[15:0]) AND b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KANDNW" form="k, k, k" xed="KANDNW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_knot_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <description>Compute the bitwise NOT of 16-bit mask "a", and store the result in "k".</description>
+ <operation>
+k[15:0] := NOT a[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KNOTW" form="k, k" xed="KNOTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kor_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] OR b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KORW" form="k, k, k" xed="KORW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kxnor_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := NOT (a[15:0] XOR b[15:0])
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KXNORW" form="k, k, k" xed="KXNORW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kxor_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] XOR b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KXORW" form="k, k, k" xed="KXORW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
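+<!-- Usage sketch (editorial, not part of the upstream data): a C example
+     combining compare masks with the mask operations above to build a range
+     test; the bounds are assumptions (target: AVX512F).
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m512i v = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
+                                  8, 9, 10, 11, 12, 13, 14, 15);
+    __mmask16 gt2 = _mm512_cmpgt_epi32_mask(v, _mm512_set1_epi32(2));
+    __mmask16 lt9 = _mm512_cmplt_epi32_mask(v, _mm512_set1_epi32(9));
+    __mmask16 in  = _kand_mask16(gt2, lt9);         /* 2 < x < 9: bits 3..8 */
+    __mmask16 out = _knot_mask16(in);               /* the complement */
+    printf("in=0x%04X out=0x%04X\n",                /* in=0x01F8 out=0xFE07 */
+           _cvtmask16_u32(in), _cvtmask16_u32(out));
+    return 0;
+}
+-->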
+<intrinsic tech="AVX-512" name="_kshiftli_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="unsigned int" varname="count" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of 16-bit mask "a" left by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k".</description>
+ <operation>
+k[MAX:0] := 0
+IF count[7:0] &lt;= 15
+ k[15:0] := a[15:0] &lt;&lt; count[7:0]
+FI
+ </operation>
+ <instruction name="KSHIFTLW" form="k, k, imm8" xed="KSHIFTLW_MASKmskw_MASKmskw_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kshiftri_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="unsigned int" varname="count" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of 16-bit mask "a" right by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k".</description>
+ <operation>
+k[MAX:0] := 0
+IF count[7:0] &lt;= 15
+ k[15:0] := a[15:0] &gt;&gt; count[7:0]
+FI
+ </operation>
+ <instruction name="KSHIFTRW" form="k, k, imm8" xed="KSHIFTRW_MASKmskw_MASKmskw_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
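+<!-- Usage sketch (editorial, not part of the upstream data): a C example of the
+     mask shifts above; shifting a mask aligns each lane's decision with a
+     neighbouring lane, and the count must be a compile-time constant.
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __mmask16 m = _cvtu32_mask16(0x0101);
+    __mmask16 l = _kshiftli_mask16(m, 1);           /* 0x0202 */
+    __mmask16 r = _kshiftri_mask16(m, 1);           /* 0x0080 */
+    printf("0x%04X 0x%04X\n", _cvtmask16_u32(l), _cvtmask16_u32(r));
+    return 0;
+}
+-->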
+<intrinsic tech="AVX-512" name="_load_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16*" varname="mem_addr" etype="MASK" memwidth="16"/>
+ <description>Load 16-bit mask from memory into "k".</description>
+ <operation>
+k[15:0] := MEM[mem_addr+15:mem_addr]
+ </operation>
+ <instruction name="KMOVW" form="k, m16" xed="KMOVW_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_store_mask16">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__mmask16*" varname="mem_addr" etype="MASK" memwidth="16"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <description>Store 16-bit mask from "a" into memory.</description>
+ <operation>
+MEM[mem_addr+15:mem_addr] := a[15:0]
+ </operation>
+ <instruction name="KMOVW" form="m16, k" xed="KMOVW_MEMu16_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
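+<!-- Usage sketch (editorial, not part of the upstream data): a C example
+     round-tripping a mask through memory with the load/store pair above
+     (target: AVX512F).
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __mmask16 m = _cvtu32_mask16(0xA5A5);
+    __mmask16 slot;
+    _store_mask16(&slot, m);                        /* KMOVW m16, k */
+    __mmask16 back = _load_mask16(&slot);           /* KMOVW k, m16 */
+    printf("0x%04X\n", _cvtmask16_u32(back));       /* 0xA5A5 */
+    return 0;
+}
+-->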
+<intrinsic tech="AVX-512" name="_kortest_mask16_u8">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <parameter type="unsigned char*" varname="all_ones" etype="UI8" memwidth="8"/>
+ <description>Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones".</description>
+ <operation>
+tmp[15:0] := a[15:0] OR b[15:0]
+IF tmp[15:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+IF tmp[15:0] == 0xFFFF
+ MEM[all_ones+7:all_ones] := 1
+ELSE
+ MEM[all_ones+7:all_ones] := 0
+FI
+ </operation>
+ <instruction name="KORTESTW" form="k, k" xed="KORTESTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortestz_mask16_u8">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[15:0] := a[15:0] OR b[15:0]
+IF tmp[15:0] == 0x0
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KORTESTW" form="k, k" xed="KORTESTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_kortestc_mask16_u8">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst".</description>
+ <operation>
+tmp[15:0] := a[15:0] OR b[15:0]
+IF tmp[15:0] == 0xFFFF
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="KORTESTW" form="k, k" xed="KORTESTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
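+<!-- Usage sketch for the KORTESTW family: branch on whether two masks
+     together select no lanes or all lanes. Values are illustrative.
+
+     __mmask16 a = _cvtu32_mask16(0x00FF);
+     __mmask16 b = _cvtu32_mask16(0xFF00);
+     if (_kortestz_mask16_u8(a, b)) { /* a | b == 0x0000: no lane set   */ }
+     if (_kortestc_mask16_u8(a, b)) { /* a | b == 0xFFFF: all lanes set */ }
+-->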
+<intrinsic tech="AVX-512" name="_cvtmask16_u32">
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <description>Convert 16-bit mask "a" into an integer value, and store the result in "dst".</description>
+ <operation>
+dst := ZeroExtend32(a[15:0])
+ </operation>
+ <instruction name="KMOVW" form="r32, k" xed="KMOVW_GPR32u32_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_cvtu32_mask16">
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="unsigned int" varname="a" etype="UI16"/>
+ <description>Convert integer value "a" into a 16-bit mask, and store the result in "k".</description>
+ <operation>
+k := ZeroExtend16(a[15:0])
+ </operation>
+ <instruction name="KMOVW" form="k, r32" xed="KMOVW_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
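+<!-- Usage sketch for the KMOVW conversions: move mask bits between a
+     general-purpose register and a k-register. Value is illustrative.
+
+     unsigned int bits = 0x8001;
+     __mmask16 k = _cvtu32_mask16(bits);
+     unsigned int back = _cvtmask16_u32(k);  /* back == 0x8001 */
+-->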
+<intrinsic tech="AVX-512" name="_mm512_kandn">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := (NOT a[15:0]) AND b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KANDNW" form="k, k, k" xed="KANDNW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kand">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] AND b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KANDW" form="k, k, k" xed="KANDW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kmov">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <description>Copy 16-bit mask "a" to "k".</description>
+ <operation>
+k[15:0] := a[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KMOVW" form="k, k" xed="KMOVW_MASKmskw_MASKu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_knot">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <description>Compute the bitwise NOT of 16-bit mask "a", and store the result in "k".</description>
+ <operation>
+k[15:0] := NOT a[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KNOTW" form="k, k" xed="KNOTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kor">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] OR b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KORW" form="k, k, k" xed="KORW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kunpackb">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Unpack and interleave 8 bits from masks "a" and "b", and store the 16-bit result in "k".</description>
+ <operation>
+k[7:0] := b[7:0]
+k[15:8] := a[7:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KUNPCKBW" form="k, k, k" xed="KUNPCKBW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kxnor">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := NOT (a[15:0] XOR b[15:0])
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KXNORW" form="k, k, k" xed="KXNORW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kxor">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] XOR b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KXORW" form="k, k, k" xed="KXORW_MASKmskw_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
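+<!-- Usage sketch for the _mm512_k* logic intrinsics above (kand, kandn,
+     kor, kxor, kxnor, knot, kunpackb): a minimal C example with
+     illustrative values, assuming -mavx512f.
+
+     __mmask16 a = _cvtu32_mask16(0x0F0F);
+     __mmask16 b = _cvtu32_mask16(0x00FF);
+     __mmask16 and_k  = _mm512_kand(a, b);      /* 0x000F */
+     __mmask16 andn_k = _mm512_kandn(a, b);     /* (~a) & b = 0x00F0 */
+     __mmask16 xor_k  = _mm512_kxor(a, b);      /* 0x0FF0 */
+     __mmask16 pack_k = _mm512_kunpackb(a, b);  /* a[7:0]:b[7:0] = 0x0FFF */
+-->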
+<intrinsic tech="AVX-512" name="_mm512_maskz_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPD" form="zmm {z}, zmm, zmm" xed="VADDPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_add_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPD" form="zmm {z}, zmm, zmm {er}" xed="VADDPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPS" form="zmm {z}, zmm, zmm" xed="VADDPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_add_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPS" form="zmm {z}, zmm, zmm {er}" xed="VADDPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
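+<!-- Usage sketch for the zero-masked packed adds: lanes whose mask bit is
+     0 are forced to 0.0 rather than left unchanged. Values illustrative.
+
+     __m512d a = _mm512_set1_pd(1.0);
+     __m512d b = _mm512_set1_pd(2.0);
+     __m512d r = _mm512_maskz_add_pd(0x0F, a, b);  /* lanes 0..3 = 3.0,
+                                                      lanes 4..7 = 0.0 */
+     /* The _round_ variants additionally take e.g.
+        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC as "rounding". */
+-->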
+<intrinsic tech="AVX-512" name="_mm_add_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := a[63:0] + b[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSD" form="xmm, xmm, xmm {er}" xed="VADDSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] + b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSD" form="xmm {k}, xmm, xmm {er}" xed="VADDSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] + b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSD" form="xmm {k}, xmm, xmm" xed="VADDSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] + b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSD" form="xmm {z}, xmm, xmm {er}" xed="VADDSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] + b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSD" form="xmm {z}, xmm, xmm" xed="VADDSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_add_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := a[31:0] + b[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSS" form="xmm, xmm, xmm {er}" xed="VADDSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] + b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSS" form="xmm {k}, xmm, xmm {er}" xed="VADDSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_add_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] + b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSS" form="xmm {k}, xmm, xmm" xed="VADDSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] + b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSS" form="xmm {z}, xmm, xmm {er}" xed="VADDSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_add_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] + b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VADDSS" form="xmm {z}, xmm, xmm" xed="VADDSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
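+<!-- Usage sketch for the masked scalar adds: only element 0 is computed;
+     the upper element(s) always come from "a". Values illustrative.
+
+     __m128d a = _mm_set_pd(9.0, 1.0);          /* {lo=1.0, hi=9.0} */
+     __m128d b = _mm_set_pd(0.0, 2.0);          /* {lo=2.0, hi=0.0} */
+     __m128d r = _mm_maskz_add_sd(0x1, a, b);   /* {3.0, 9.0} */
+     __m128d z = _mm_maskz_add_sd(0x0, a, b);   /* {0.0, 9.0} */
+-->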
+<intrinsic tech="AVX-512" name="_mm512_maskz_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and stores the low 64 bytes (16 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (32*imm8[3:0])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := temp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VALIGND" form="zmm {z}, zmm, zmm, imm8" xed="VALIGND_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="3"/>
+ <description>Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst".</description>
+ <operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (64*imm8[2:0])
+dst[511:0] := temp[511:0]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="zmm, zmm, zmm, imm8" xed="VALIGNQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="3"/>
+ <description>Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (64*imm8[2:0])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := temp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="zmm {k}, zmm, zmm, imm8" xed="VALIGNQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_alignr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="3"/>
+ <description>Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 64-bit elements, and stores the low 64 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (64*imm8[2:0])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := temp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VALIGNQ" form="zmm {z}, zmm, zmm, imm8" xed="VALIGNQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
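+<!-- Usage sketch for VALIGNQ: "b" supplies the low half and "a" the high
+     half of the 1024-bit intermediate, which is then shifted right by
+     whole 64-bit elements. Values illustrative.
+
+     __m512i a = _mm512_set1_epi64(7);
+     __m512i b = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+     __m512i r = _mm512_alignr_epi64(a, b, 2);  /* {2,3,4,5,6,7,7,7} */
+-->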
+<intrinsic tech="AVX-512" name="_mm512_broadcast_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 4)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X4" form="zmm, m128" xed="VBROADCASTF32X4_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcast_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 4)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X4" form="zmm {k}, m128" xed="VBROADCASTF32X4_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcast_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 4)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF32X4" form="zmm {z}, m128" xed="VBROADCASTF32X4_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcast_f64x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 4)*64
+ dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X4" form="zmm, m256" xed="VBROADCASTF64X4_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcast_f64x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 4)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X4" form="zmm {k}, m256" xed="VBROADCASTF64X4_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcast_f64x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 4)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTF64X4" form="zmm {z}, m256" xed="VBROADCASTF64X4_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcast_i32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 4)*32
+ dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X4" form="zmm, m128" xed="VBROADCASTI32X4_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcast_i32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 4)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X4" form="zmm {k}, m128" xed="VBROADCASTI32X4_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcast_i32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ n := (j % 4)*32
+ IF k[j]
+ dst[i+31:i] := a[n+31:n]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI32X4" form="zmm {z}, m128" xed="VBROADCASTI32X4_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcast_i64x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 4)*64
+ dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X4" form="zmm, m256" xed="VBROADCASTI64X4_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcast_i64x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 4)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X4" form="zmm {k}, m256" xed="VBROADCASTI64X4_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcast_i64x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ n := (j % 4)*64
+ IF k[j]
+ dst[i+63:i] := a[n+63:n]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTI64X4" form="zmm {z}, m256" xed="VBROADCASTI64X4_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
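+<!-- Usage sketch for the x4 broadcasts: repeat a 128-bit (or 256-bit)
+     block across the whole ZMM register. Values illustrative.
+
+     __m128 lane = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
+     __m512 r    = _mm512_broadcast_f32x4(lane);  /* 0,1,2,3 repeated 4x */
+-->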
+<intrinsic tech="AVX-512" name="_mm512_broadcastsd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTSD" form="zmm, xmm" xed="VBROADCASTSD_ZMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcastsd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTSD" form="zmm {k}, xmm" xed="VBROADCASTSD_ZMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcastsd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTSD" form="zmm {z}, xmm" xed="VBROADCASTSD_ZMMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="zmm, xmm" xed="VBROADCASTSS_ZMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="zmm {k}, xmm" xed="VBROADCASTSS_ZMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcastss_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBROADCASTSS" form="zmm {z}, xmm" xed="VBROADCASTSS_ZMMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
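+<!-- Usage sketch for the scalar broadcasts: splat the low element of an
+     XMM register to all ZMM lanes, optionally under a mask. Values
+     illustrative.
+
+     __m128d s = _mm_set_sd(3.5);
+     __m512d v = _mm512_broadcastsd_pd(s);             /* all lanes 3.5 */
+     __m512d w = _mm512_maskz_broadcastsd_pd(0x81, s); /* lanes 0 and 7 = 3.5,
+                                                          rest 0.0 */
+-->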
+<intrinsic tech="AVX-512" name="_mm_cmp_round_sd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VCMPSD" form="k, xmm, xmm {sae}, imm8" xed="VCMPSD_MASKmskw_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_sd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VCMPSD" form="k, xmm, xmm, imm8" xed="VCMPSD_MASKmskw_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_round_sd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+IF k1[0]
+ k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
+ELSE
+ k[0] := 0
+FI
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VCMPSD" form="k {k}, xmm, xmm {sae}, imm8" xed="VCMPSD_MASKmskw_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_sd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set).</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+IF k1[0]
+ k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
+ELSE
+ k[0] := 0
+FI
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VCMPSD" form="k {k}, xmm, xmm, imm8" xed="VCMPSD_MASKmskw_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_round_ss_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VCMPSS" form="k, xmm, xmm {sae}, imm8" xed="VCMPSS_MASKmskw_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cmp_ss_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VCMPSS" form="k, xmm, xmm, imm8" xed="VCMPSS_MASKmskw_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_round_ss_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+IF k1[0]
+ k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
+ELSE
+ k[0] := 0
+FI
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VCMPSS" form="k {k}, xmm, xmm {sae}, imm8" xed="VCMPSS_MASKmskw_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cmp_ss_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set).</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+IF k1[0]
+ k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
+ELSE
+ k[0] := 0
+FI
+k[MAX:1] := 0
+ </operation>
+ <instruction name="VCMPSS" form="k {k}, xmm, xmm, imm8" xed="VCMPSS_MASKmskw_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
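+<!-- Usage sketch for the mask-producing scalar compares: the predicate is
+     one of the _CMP_* constants enumerated above, and the result is a
+     single mask bit rather than an all-ones/all-zeros XMM lane.
+
+     __m128d a = _mm_set_sd(1.0);
+     __m128d b = _mm_set_sd(2.0);
+     __mmask8 lt = _mm_cmp_sd_mask(a, b, _CMP_LT_OQ);  /* bit 0 == 1 */
+-->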
+<intrinsic tech="AVX-512" name="_mm_comi_round_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="VCOMISD" form="xmm, xmm {sae}" xed="VCOMISD_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_comi_round_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="VCOMISS" form="xmm, xmm {sae}" xed="VCOMISS_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VUCOMISS" form="xmm, xmm {sae}" xed="VUCOMISS_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
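+<!-- Hypothetical usage sketch, not part of the data records above (assumes an
+     AVX512F target and a C compiler providing immintrin.h): the comi_round
+     intrinsics give a branchable scalar compare with exceptions suppressed:
+
+       #include <immintrin.h>
+       __m128 a = _mm_set_ss(1.0f), b = _mm_set_ss(1.0f);
+       /* 1 if the low floats compare ordered-equal, else 0; SAE keeps the
+          compare from raising floating point exception flags */
+       int eq = _mm_comi_round_ss(a, b, _CMP_EQ_OQ, _MM_FROUND_NO_EXC);
+-->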
+<intrinsic tech="AVX-512" name="_mm512_mask_compress_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCOMPRESSPD" form="zmm {k}, zmm" xed="VCOMPRESSPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
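+<!-- Hypothetical sketch (assumes AVX512F and immintrin.h): compress packs the
+     mask-selected lanes toward element 0, the building block for branch-free
+     stream compaction:
+
+       #include <immintrin.h>
+       __m512d v   = _mm512_set_pd(8, 7, 6, 5, 4, 3, 2, 1);
+       /* keep lanes 0, 2, 4, 6: values 1, 3, 5, 7 land in lanes 0..3;
+          lanes 4..7 are passed through from src (here zero) */
+       __m512d out = _mm512_mask_compress_pd(_mm512_setzero_pd(), 0x55, v);
+-->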
+<intrinsic tech="AVX-512" name="_mm512_mask_compressstoreu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 64
+m := base_addr
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ MEM[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VCOMPRESSPD" form="m512 {k}, zmm" xed="VCOMPRESSPD_MEMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
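+<!-- Hypothetical sketch (assumes AVX512F plus POPCNT, immintrin.h, and two
+     hypothetical double* cursors src_ptr and out_ptr): compressstoreu appends
+     only the selected lanes, contiguously and unaligned, so a filter loop
+     needs no shuffle:
+
+       #include <immintrin.h>
+       __m512d  v = _mm512_loadu_pd(src_ptr);
+       __mmask8 m = _mm512_cmp_pd_mask(v, _mm512_setzero_pd(), _CMP_GT_OQ);
+       _mm512_mask_compressstoreu_pd(out_ptr, m, v); /* writes popcount(m) doubles */
+       out_ptr += _mm_popcnt_u32(m);                 /* advance past what was kept */
+-->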
+<intrinsic tech="AVX-512" name="_mm512_maskz_compress_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCOMPRESSPD" form="zmm {z}, zmm" xed="VCOMPRESSPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_compress_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCOMPRESSPS" form="zmm {k}, zmm" xed="VCOMPRESSPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_compressstoreu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 32
+m := base_addr
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ MEM[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VCOMPRESSPS" form="m512 {k}, zmm" xed="VCOMPRESSPS_MEMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_compress_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCOMPRESSPS" form="zmm {z}, zmm" xed="VCOMPRESSPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="zmm, ymm" xed="VCVTDQ2PD_ZMMf64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
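+<!-- Hypothetical sketch (assumes AVX512F and immintrin.h): widening 8 x i32 to
+     8 x f64 is always exact, which is why this direction has no rounding
+     variant:
+
+       #include <immintrin.h>
+       __m256i ints    = _mm256_set1_epi32(-42);
+       __m512d doubles = _mm512_cvtepi32_pd(ints);   /* each lane becomes -42.0 */
+-->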
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ ELSE
+ dst[m+63:m] := src[m+63:m]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="zmm {k}, ymm" xed="VCVTDQ2PD_ZMMf64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ ELSE
+ dst[m+63:m] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="zmm {z}, ymm" xed="VCVTDQ2PD_ZMMf64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="zmm, zmm {er}" xed="VCVTDQ2PS_ZMMf32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
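+<!-- Hypothetical sketch (assumes AVX512F and immintrin.h): i32 to f32 can lose
+     precision above 2^24, and the round variant picks the direction per call
+     instead of reading MXCSR:
+
+       #include <immintrin.h>
+       __m512i big  = _mm512_set1_epi32(16777217);   /* 2^24 + 1 */
+       __m512  down = _mm512_cvt_roundepi32_ps(big,
+                          _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+       /* every lane rounds down to 16777216.0f */
+-->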
+<intrinsic tech="AVX-512" name="_mm512_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="zmm, zmm" xed="VCVTDQ2PS_ZMMf32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="zmm {k}, zmm {er}" xed="VCVTDQ2PS_ZMMf32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="zmm {k}, zmm" xed="VCVTDQ2PS_ZMMf32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="zmm {z}, zmm {er}" xed="VCVTDQ2PS_ZMMf32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PS" form="zmm {z}, zmm" xed="VCVTDQ2PS_ZMMf32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="ymm, zmm {er}" xed="VCVTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="ymm, zmm" xed="VCVTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
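+<!-- Hypothetical sketch (assumes AVX512F and immintrin.h): the f64 to i32
+     narrowing halves the width, so 8 doubles in a zmm come back as 8 ints in
+     a ymm:
+
+       #include <immintrin.h>
+       __m512d v = _mm512_set1_pd(2.5);
+       __m256i r = _mm512_cvtpd_epi32(v);  /* 2 per lane: ties-to-even under default MXCSR */
+-->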
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="ymm {k}, zmm {er}" xed="VCVTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="ymm {k}, zmm" xed="VCVTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="ymm {z}, zmm {er}" xed="VCVTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2DQ" form="ymm {z}, zmm" xed="VCVTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="ymm, zmm {er}" xed="VCVTPD2PS_YMMf32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="ymm, zmm" xed="VCVTPD2PS_YMMf32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
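+<!-- Hypothetical sketch (assumes AVX512F and immintrin.h): narrowing f64 to f32
+     rounds per MXCSR; the round variant above fixes the direction per call:
+
+       #include <immintrin.h>
+       __m512d wide   = _mm512_set1_pd(1.0 / 3.0);
+       __m256  narrow = _mm512_cvtpd_ps(wide);   /* nearest f32 to 0.333... */
+-->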
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="ymm {k}, zmm {er}" xed="VCVTPD2PS_YMMf32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="ymm {k}, zmm" xed="VCVTPD2PS_YMMf32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="ymm {z}, zmm {er}" xed="VCVTPD2PS_YMMf32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="ymm {z}, zmm" xed="VCVTPD2PS_YMMf32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="ymm, zmm {er}" xed="VCVTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="ymm, zmm" xed="VCVTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
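+<!-- Hypothetical sketch (assumes AVX512F, immintrin.h, and a hypothetical
+     __m512d input x): the unsigned variant covers [0, 2^32); negative or
+     out-of-range inputs are not representable, so clamp first if the domain
+     is not guaranteed:
+
+       #include <immintrin.h>
+       __m512d v = _mm512_max_pd(x, _mm512_setzero_pd());  /* clamp negatives to 0 */
+       __m256i u = _mm512_cvtpd_epu32(v);
+-->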
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="ymm {k}, zmm {er}" xed="VCVTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="ymm {k}, zmm" xed="VCVTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="ymm {z}, zmm {er}" xed="VCVTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPD2UDQ" form="ymm {z}, zmm" xed="VCVTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m256i" varname="a" etype="FP16"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*16
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="zmm, ymm {sae}" xed="VCVTPH2PS_ZMMf32_MASKmskw_YMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m256i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*16
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="zmm, ymm" xed="VCVTPH2PS_ZMMf32_MASKmskw_YMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
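+<!-- Hypothetical sketch (assumes AVX512F, immintrin.h, and a hypothetical
+     half_buf array of 16 binary16 values stored as uint16_t): FP16 data is
+     loaded as integer bits and widened losslessly, since every binary16 value
+     is exactly representable in f32:
+
+       #include <immintrin.h>
+       __m256i halves = _mm256_loadu_si256((const __m256i *)half_buf);
+       __m512  floats = _mm512_cvtph_ps(halves);
+-->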
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="FP16"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*16
+ IF k[j]
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="zmm {k}, ymm {sae}" xed="VCVTPH2PS_ZMMf32_MASKmskw_YMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*16
+ IF k[j]
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="zmm {k}, ymm" xed="VCVTPH2PS_ZMMf32_MASKmskw_YMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="FP16"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*16
+ IF k[j]
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="zmm {z}, ymm {sae}" xed="VCVTPH2PS_ZMMf32_MASKmskw_YMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtph_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*16
+ IF k[j]
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="zmm {z}, ymm" xed="VCVTPH2PS_ZMMf32_MASKmskw_YMMf16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="zmm, zmm {er}" xed="VCVTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="zmm, zmm" xed="VCVTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
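+<!-- Hypothetical sketch (assumes AVX512F and immintrin.h): the masked form
+     keeps unselected lanes from src, which saves a separate blend:
+
+       #include <immintrin.h>
+       __m512  v    = _mm512_set1_ps(7.9f);
+       __m512i keep = _mm512_set1_epi32(-1);
+       /* convert only the low 8 lanes (7.9f rounds to 8); the high 8 stay -1 */
+       __m512i r = _mm512_mask_cvtps_epi32(keep, 0x00FF, v);
+-->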
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="zmm {k}, zmm {er}" xed="VCVTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="zmm {k}, zmm" xed="VCVTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="zmm {z}, zmm {er}" xed="VCVTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2DQ" form="zmm {z}, zmm" xed="VCVTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="zmm, ymm {sae}" xed="VCVTPS2PD_ZMMf64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="zmm, ymm" xed="VCVTPS2PD_ZMMf64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
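+<!-- Hypothetical sketch (assumes AVX512F and immintrin.h): widening f32 to f64
+     is exact, which is why this direction offers only an SAE variant rather
+     than a rounding one:
+
+       #include <immintrin.h>
+       __m256  s = _mm256_set1_ps(0.1f);
+       __m512d d = _mm512_cvtps_pd(s);   /* exact f64 copies of the f32 value 0.1f */
+-->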
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="zmm {k}, ymm {sae}" xed="VCVTPS2PD_ZMMf64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="zmm {k}, ymm" xed="VCVTPS2PD_ZMMf64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="zmm {z}, ymm {sae}" xed="VCVTPS2PD_ZMMf64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="zmm {z}, ymm" xed="VCVTPS2PD_ZMMf64_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 32*j
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="ymm, zmm {sae}" xed="VCVTPS2PH_YMMf16_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 32*j
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="ymm, zmm {sae}" xed="VCVTPS2PH_YMMf16_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
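+<!-- Hypothetical sketch (assumes AVX512F, immintrin.h, and hypothetical names
+     floats, a __m512, and half_buf, an array of 16 uint16_t): f32 to binary16
+     is lossy, the imm selects the rounding, and results come back as integer
+     bit patterns:
+
+       #include <immintrin.h>
+       __m256i halves = _mm512_cvtps_ph(floats,
+                            _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+       _mm256_storeu_si256((__m256i *)half_buf, halves);  /* 16 x uint16_t out */
+-->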
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="ymm {k}, zmm {sae}" xed="VCVTPS2PH_YMMf16_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="ymm {k}, zmm {sae}" xed="VCVTPS2PH_YMMf16_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="ymm {z}, zmm {sae}" xed="VCVTPS2PH_YMMf16_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtps_ph">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 16*j
+ l := 32*j
+ IF k[j]
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="ymm {z}, zmm {sae}" xed="VCVTPS2PH_YMMf16_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="zmm, zmm {er}" xed="VCVTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="zmm, zmm" xed="VCVTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
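+<!-- Hypothetical sketch (assumes AVX512F and immintrin.h): useful for values in
+     [2^31, 2^32) that a signed conversion could not represent:
+
+       #include <immintrin.h>
+       __m512  v = _mm512_set1_ps(3.0e9f);   /* exactly representable in f32 */
+       __m512i u = _mm512_cvtps_epu32(v);    /* 3000000000 fits UI32, not SI32 */
+-->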
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="zmm {k}, zmm {er}" xed="VCVTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="zmm {k}, zmm" xed="VCVTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="zmm {z}, zmm {er}" xed="VCVTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2UDQ" form="zmm {z}, zmm" xed="VCVTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
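+<!-- Illustrative usage sketch, not part of the Intel data: converting FP32 lanes to
+     unsigned 32-bit integers with the unmasked and zero-masked forms defined above,
+     assuming a compiler exposing these AVX-512F intrinsics (e.g. gcc or clang with
+     -mavx512f) on AVX-512F hardware.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512 a = _mm512_set1_ps(3.7f);
+         unsigned int out[16];
+         /* zeromask: lanes 0..7 convert (3.7f to 4), lanes 8..15 are zeroed */
+         _mm512_storeu_si512(out, _mm512_maskz_cvtps_epu32((__mmask16)0x00FF, a));
+         printf("%u %u\n", out[0], out[15]); /* prints: 4 0 */
+         return 0;
+     }
+-->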
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsd_i32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2SI" form="r32, xmm {er}" xed="VCVTSD2SI_GPR32i32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsd_i64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2SI" form="r64, xmm {er}" xed="VCVTSD2SI_GPR64i64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsd_si32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2SI" form="r32, xmm {er}" xed="VCVTSD2SI_GPR32i32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsd_si64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2SI" form="r64, xmm {er}" xed="VCVTSD2SI_GPR64i64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsd_i32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2SI" form="r32, xmm" xed="VCVTSD2SI_GPR32i32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsd_i64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2SI" form="r64, xmm" xed="VCVTSD2SI_GPR64i64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
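+<!-- Illustrative usage sketch, not part of the Intel data: scalar FP64 to signed
+     32-bit conversion with and without an explicit rounding override, assuming a
+     compiler exposing these AVX-512F intrinsics (e.g. gcc or clang with -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128d a = _mm_set_sd(2.6);
+         int n = _mm_cvtsd_i32(a); /* current MXCSR mode (default nearest): 3 */
+         int t = _mm_cvt_roundsd_i32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); /* 2 */
+         return (n == 3 && t == 2) ? 0 : 1;
+     }
+-->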
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSD2SS" form="xmm, xmm, xmm {er}" xed="VCVTSD2SS_XMMf32_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvt_roundsd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSD2SS" form="xmm {k}, xmm, xmm {er}" xed="VCVTSD2SS_XMMf32_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtsd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSD2SS" form="xmm {k}, xmm, xmm" xed="VCVTSD2SS_XMMf32_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvt_roundsd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSD2SS" form="xmm {z}, xmm, xmm {er}" xed="VCVTSD2SS_XMMf32_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtsd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSD2SS" form="xmm {z}, xmm, xmm" xed="VCVTSD2SS_XMMf32_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
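+<!-- Illustrative usage sketch, not part of the Intel data: write-masked FP64 to FP32
+     scalar conversion; when mask bit 0 is clear, lane 0 comes from "src" instead.
+     Assumes a compiler exposing these AVX-512F intrinsics (e.g. gcc or clang with
+     -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128  src = _mm_set1_ps(9.0f);
+         __m128  a   = _mm_set1_ps(1.0f);
+         __m128d b   = _mm_set_sd(2.5);
+         float lo = _mm_cvtss_f32(_mm_mask_cvtsd_ss(src, (__mmask8)1, a, b)); /* 2.5f */
+         float fb = _mm_cvtss_f32(_mm_mask_cvtsd_ss(src, (__mmask8)0, a, b)); /* 9.0f */
+         return (int)(lo + fb); /* 11 */
+     }
+-->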
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsd_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_UInt32(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2USI" form="r32, xmm {er}" xed="VCVTSD2USI_GPR32u32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsd_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_UInt64(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2USI" form="r64, xmm {er}" xed="VCVTSD2USI_GPR64u64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsd_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_UInt32(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2USI" form="r32, xmm" xed="VCVTSD2USI_GPR32u32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtsd_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_UInt64(a[63:0])
+ </operation>
+ <instruction name="VCVTSD2USI" form="r64, xmm" xed="VCVTSD2USI_GPR64u64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
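+<!-- Illustrative usage sketch, not part of the Intel data: _mm_cvtsd_u32 converts
+     values above INT_MAX directly, where the signed variant would not represent the
+     result. Assumes a compiler exposing these AVX-512F intrinsics (e.g. gcc or clang
+     with -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         /* 3000000000.0 is exact in FP64 and exceeds INT_MAX */
+         unsigned int u = _mm_cvtsd_u32(_mm_set_sd(3000000000.0));
+         return (u == 3000000000u) ? 0 : 1;
+     }
+-->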
+<intrinsic tech="AVX-512" name="_mm_cvt_roundi64_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SD" form="xmm, xmm, r64 {er}" xed="VCVTSI2SD_XMMf64_XMMf64_GPR64i64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsi64_sd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SD" form="xmm, xmm, r64 {er}" xed="VCVTSI2SD_XMMf64_XMMf64_GPR64i64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvti32_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="b" etype="SI32"/>
+ <description>Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := Convert_Int32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SD" form="xmm, xmm, r32" xed="VCVTSI2SD_XMMf64_XMMf64_GPR32i32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvti64_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <description>Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SD" form="xmm, xmm, r64" xed="VCVTSI2SD_XMMf64_XMMf64_GPR64i64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
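+<!-- Illustrative usage sketch, not part of the Intel data: widening a signed 32-bit
+     integer into the low FP64 lane while the upper lane is carried over from "a".
+     Assumes a compiler exposing these AVX-512F intrinsics (e.g. gcc or clang with
+     -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128d a = _mm_set_pd(7.0, 0.0); /* upper lane 7.0, lower lane 0.0 */
+         __m128d r = _mm_cvti32_sd(a, 42); /* r = { 42.0, 7.0 } */
+         return (int)_mm_cvtsd_f64(r);     /* 42 */
+     }
+-->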
+<intrinsic tech="AVX-512" name="_mm_cvt_roundi32_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="b" etype="SI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SS" form="xmm, xmm, r32 {er}" xed="VCVTSI2SS_XMMf32_XMMf32_GPR32i32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundi64_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SS" form="xmm, xmm, r64 {er}" xed="VCVTSI2SS_XMMf32_XMMf32_GPR64i64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsi32_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="b" etype="SI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SS" form="xmm, xmm, r32 {er}" xed="VCVTSI2SS_XMMf32_XMMf32_GPR32i32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundsi64_ss">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SS" form="xmm, xmm, r64 {er}" xed="VCVTSI2SS_XMMf32_XMMf32_GPR64i64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvti32_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="b" etype="SI32"/>
+ <description>Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SS" form="xmm, xmm, r32" xed="VCVTSI2SS_XMMf32_XMMf32_GPR32i32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvti64_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <description>Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSI2SS" form="xmm, xmm, r64" xed="VCVTSI2SS_XMMf32_XMMf32_GPR64i64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
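+<!-- Illustrative usage sketch, not part of the Intel data: int64 to FP32 is inexact
+     for large magnitudes, which is why the round variants exist; pinning the rounding
+     direction changes the result. Assumes x86-64 and a compiler exposing these
+     AVX-512F intrinsics (e.g. gcc or clang with -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128 a = _mm_setzero_ps();
+         long long big = 0x7FFFFFFFFFFFFFFFLL; /* not representable in FP32 */
+         float dn = _mm_cvtss_f32(_mm_cvt_roundsi64_ss(a, big,
+                        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+         float nr = _mm_cvtss_f32(_mm_cvt_roundsi64_ss(a, big,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+         return (dn < nr) ? 0 : 1; /* toward zero picks the smaller representable value */
+     }
+-->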
+<intrinsic tech="AVX-512" name="_mm_cvt_roundss_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [sae_note]</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSS2SD" form="xmm, xmm, xmm {sae}" xed="VCVTSS2SD_XMMf64_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvt_roundss_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSS2SD" form="xmm {k}, xmm, xmm {sae}" xed="VCVTSS2SD_XMMf64_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtss_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSS2SD" form="xmm {k}, xmm, xmm" xed="VCVTSS2SD_XMMf64_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvt_roundss_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSS2SD" form="xmm {z}, xmm, xmm {sae}" xed="VCVTSS2SD_XMMf64_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtss_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTSS2SD" form="xmm {z}, xmm, xmm" xed="VCVTSS2SD_XMMf64_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
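+<!-- Illustrative usage sketch, not part of the Intel data: zero-masked FP32 to FP64
+     scalar widening; a clear mask bit 0 zeroes the low lane. Assumes a compiler
+     exposing these AVX-512F intrinsics (e.g. gcc or clang with -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128d a = _mm_set1_pd(8.0);
+         __m128  b = _mm_set_ss(1.5f);
+         double kept = _mm_cvtsd_f64(_mm_maskz_cvtss_sd((__mmask8)1, a, b)); /* 1.5 */
+         double zero = _mm_cvtsd_f64(_mm_maskz_cvtss_sd((__mmask8)0, a, b)); /* 0.0 */
+         return (int)(kept + zero); /* 1 */
+     }
+-->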
+<intrinsic tech="AVX-512" name="_mm_cvt_roundss_i32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2SI" form="r32, xmm {er}" xed="VCVTSS2SI_GPR32i32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundss_i64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2SI" form="r64, xmm {er}" xed="VCVTSS2SI_GPR64i64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundss_si32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2SI" form="r32, xmm {er}" xed="VCVTSS2SI_GPR32i32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundss_si64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2SI" form="r64, xmm {er}" xed="VCVTSS2SI_GPR64i64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtss_i32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2SI" form="r32, xmm" xed="VCVTSS2SI_GPR32i32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtss_i64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2SI" form="r64, xmm" xed="VCVTSS2SI_GPR64i64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
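+<!-- Illustrative usage sketch, not part of the Intel data: the same FP32 value
+     converted under two explicit rounding directions via the round variant. Assumes
+     a compiler exposing these AVX-512F intrinsics (e.g. gcc or clang with -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m128 a = _mm_set_ss(2.5f);
+         int up = _mm_cvt_roundss_i32(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); /* 3 */
+         int dn = _mm_cvt_roundss_i32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); /* 2 */
+         return (up == 3 && dn == 2) ? 0 : 1;
+     }
+-->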
+<intrinsic tech="AVX-512" name="_mm_cvt_roundss_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_UInt32(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2USI" form="r32, xmm {er}" xed="VCVTSS2USI_GPR32u32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundss_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_UInt64(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2USI" form="r64, xmm {er}" xed="VCVTSS2USI_GPR64u64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtss_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_UInt32(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2USI" form="r32, xmm" xed="VCVTSS2USI_GPR32u32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtss_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_UInt64(a[31:0])
+ </operation>
+ <instruction name="VCVTSS2USI" form="r64, xmm" xed="VCVTSS2USI_GPR64u64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
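+<!-- Illustrative usage sketch, not part of the Intel data: direct FP32 to unsigned
+     conversion for a value above INT_MAX. Assumes a compiler exposing these AVX-512F
+     intrinsics (e.g. gcc or clang with -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         /* 3.0e9f is exactly representable in FP32 and exceeds INT_MAX */
+         unsigned int u = _mm_cvtss_u32(_mm_set_ss(3.0e9f)); /* 3000000000 */
+         return (u > 2147483647u) ? 0 : 1;
+     }
+-->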
+<intrinsic tech="AVX-512" name="_mm512_cvtt_roundpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="ymm, zmm {sae}" xed="VCVTTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="ymm, zmm" xed="VCVTTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtt_roundpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="ymm {k}, zmm {sae}" xed="VCVTTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="ymm {k}, zmm" xed="VCVTTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtt_roundpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="ymm {z}, zmm {sae}" xed="VCVTTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2DQ" form="ymm {z}, zmm" xed="VCVTTPD2DQ_YMMi32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
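+<!-- Illustrative usage sketch, not part of the Intel data: the VCVTTPD2DQ forms
+     truncate toward zero regardless of MXCSR, narrowing eight FP64 lanes into a
+     __m256i. Assumes a compiler exposing these AVX-512F intrinsics (e.g. gcc or
+     clang with -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m512d a = _mm512_set1_pd(-2.9);
+         int out[8];
+         _mm256_storeu_si256((__m256i *)out, _mm512_cvttpd_epi32(a));
+         return (out[0] == -2) ? 0 : 1; /* truncation, not rounding to -3 */
+     }
+-->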
+<intrinsic tech="AVX-512" name="_mm512_cvtt_roundpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="ymm, zmm {sae}" xed="VCVTTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="ymm, zmm" xed="VCVTTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtt_roundpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="ymm {k}, zmm {sae}" xed="VCVTTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="ymm {k}, zmm" xed="VCVTTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtt_roundpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="ymm {z}, zmm {sae}" xed="VCVTTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvttpd_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 32*j
+ l := 64*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTTPD2UDQ" form="ymm {z}, zmm" xed="VCVTTPD2UDQ_YMMu32_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
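+<!-- Illustrative usage sketch, not part of the Intel data: write-masked truncating
+     FP64 to unsigned conversion; unselected lanes keep their value from "src".
+     Assumes a compiler exposing these AVX-512F intrinsics (e.g. gcc or clang with
+     -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m256i src = _mm256_set1_epi32(77);
+         __m512d a   = _mm512_set1_pd(5.9);
+         unsigned int out[8];
+         _mm256_storeu_si256((__m256i *)out,
+                             _mm512_mask_cvttpd_epu32(src, (__mmask8)0x0F, a));
+         return (out[0] == 5 && out[7] == 77) ? 0 : 1;
+     }
+-->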
+<intrinsic tech="AVX-512" name="_mm512_cvtt_roundps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="zmm, zmm {sae}" xed="VCVTTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="zmm, zmm" xed="VCVTTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtt_roundps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="zmm {k}, zmm {sae}" xed="VCVTTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="zmm {k}, zmm" xed="VCVTTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtt_roundps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="zmm {z}, zmm {sae}" xed="VCVTTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2DQ" form="zmm {z}, zmm" xed="VCVTTPS2DQ_ZMMi32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
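+<!-- Illustrative usage sketch, not part of the Intel data: full-width truncating
+     FP32 to signed conversion (contrast with the non-truncating _mm512_cvtps_epi32,
+     which honors the MXCSR rounding mode). Assumes a compiler exposing these
+     AVX-512F intrinsics (e.g. gcc or clang with -mavx512f).
+
+     #include <immintrin.h>
+
+     int main(void) {
+         __m512 a = _mm512_set1_ps(1.99f);
+         int out[16];
+         _mm512_storeu_si512(out, _mm512_cvttps_epi32(a));
+         return (out[0] == 1) ? 0 : 1; /* 1.99f truncates to 1 */
+     }
+-->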
+<intrinsic tech="AVX-512" name="_mm512_cvtt_roundps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="zmm, zmm {sae}" xed="VCVTTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="zmm, zmm" xed="VCVTTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtt_roundps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="zmm {k}, zmm {sae}" xed="VCVTTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="zmm {k}, zmm" xed="VCVTTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtt_roundps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="zmm {z}, zmm {sae}" xed="VCVTTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvttps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTTPS2UDQ" form="zmm {z}, zmm" xed="VCVTTPS2UDQ_ZMMu32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
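+<!-- A hedged sketch (names hypothetical, not from the Intel data) of the
+     zeromask form above: only lanes whose mask bit is set are converted,
+     every other lane of the result reads as zero.
+
+     #include <immintrin.h>
+
+     __m512i convert_even_lanes(__m512 a) {
+         __mmask16 k = 0x5555;                    // lanes 0,2,4,... active
+         return _mm512_maskz_cvttps_epu32(k, a);  // inactive lanes become 0
+     }
+-->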
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundsd_i32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2SI" form="r32, xmm {sae}" xed="VCVTTSD2SI_GPR32i32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundsd_i64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2SI" form="r64, xmm {sae}" xed="VCVTTSD2SI_GPR64i64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundsd_si32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2SI" form="r32, xmm {sae}" xed="VCVTTSD2SI_GPR32i32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundsd_si64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2SI" form="r64, xmm {sae}" xed="VCVTTSD2SI_GPR64i64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttsd_i32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2SI" form="r32, xmm" xed="VCVTTSD2SI_GPR32i32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttsd_i64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2SI" form="r64, xmm" xed="VCVTTSD2SI_GPR64i64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundsd_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2USI" form="r32, xmm {sae}" xed="VCVTTSD2USI_GPR32u32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundsd_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2USI" form="r64, xmm {sae}" xed="VCVTTSD2USI_GPR64u64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttsd_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2USI" form="r32, xmm" xed="VCVTTSD2USI_GPR32u32_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttsd_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0])
+ </operation>
+ <instruction name="VCVTTSD2USI" form="r64, xmm" xed="VCVTTSD2USI_GPR64u64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
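+<!-- Sketch of the scalar truncating conversions above (illustrative; the
+     64-bit forms such as _mm_cvttsd_u64 require an x86-64 AVX512F target):
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     uint64_t double_to_u64_trunc(double x) {
+         __m128d v = _mm_set_sd(x);   // x in the lower element
+         return _mm_cvttsd_u64(v);    // VCVTTSD2USI, truncate toward zero
+     }
+-->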
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundss_i32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2SI" form="r32, xmm {sae}" xed="VCVTTSS2SI_GPR32i32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundss_i64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2SI" form="r64, xmm {sae}" xed="VCVTTSS2SI_GPR64i64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundss_si32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2SI" form="r32, xmm {sae}" xed="VCVTTSS2SI_GPR32i32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundss_si64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2SI" form="r64, xmm {sae}" xed="VCVTTSS2SI_GPR64i64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttss_i32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2SI" form="r32, xmm" xed="VCVTTSS2SI_GPR32i32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttss_i64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2SI" form="r64, xmm" xed="VCVTTSS2SI_GPR64i64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundss_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2USI" form="r32, xmm {sae}" xed="VCVTTSS2USI_GPR32u32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtt_roundss_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst".
+ [sae_note]</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2USI" form="r64, xmm {sae}" xed="VCVTTSS2USI_GPR64u64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttss_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2USI" form="r32, xmm" xed="VCVTTSS2USI_GPR32u32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvttss_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0])
+ </operation>
+ <instruction name="VCVTTSS2USI" form="r64, xmm" xed="VCVTTSS2USI_GPR64u64_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
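+<!-- The explicit-SAE forms above behave like their plain counterparts but
+     suppress floating-point exceptions, per the [sae_note]. A hypothetical
+     sketch:
+
+     #include <immintrin.h>
+
+     unsigned int float_to_u32_quiet(__m128 a) {
+         // Same result as _mm_cvttss_u32, but no exception flags are raised.
+         return _mm_cvtt_roundss_u32(a, _MM_FROUND_NO_EXC);
+     }
+-->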
+<intrinsic tech="AVX-512" name="_mm512_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+	dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="zmm, ymm" xed="VCVTUDQ2PD_ZMMf64_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="zmm {k}, ymm" xed="VCVTUDQ2PD_ZMMf64_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+		dst[i+63:i] := Convert_UInt32_To_FP64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="zmm {z}, ymm" xed="VCVTUDQ2PD_ZMMf64_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
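+<!-- Sketch (names assumed) of the widening conversion above: 8 unsigned
+     32-bit integers in a __m256i become 8 doubles in a __m512d, exactly,
+     since every uint32 value is representable in FP64.
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     void u32_to_f64(const uint32_t in[8], double out[8]) {
+         __m256i a = _mm256_loadu_si256((const __m256i *)in);
+         _mm512_storeu_pd(out, _mm512_cvtepu32_pd(a));
+     }
+-->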
+<intrinsic tech="AVX-512" name="_mm512_cvt_roundepu32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+	dst[i+31:i] := Convert_UInt32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PS" form="zmm, zmm {er}" xed="VCVTUDQ2PS_ZMMf32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepu32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+	dst[i+31:i] := Convert_UInt32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PS" form="zmm, zmm" xed="VCVTUDQ2PS_ZMMf32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvt_roundepu32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+		dst[i+31:i] := Convert_UInt32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PS" form="zmm {k}, zmm {er}" xed="VCVTUDQ2PS_ZMMf32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+		dst[i+31:i] := Convert_UInt32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PS" form="zmm {k}, zmm" xed="VCVTUDQ2PS_ZMMf32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvt_roundepu32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+		dst[i+31:i] := Convert_UInt32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PS" form="zmm {z}, zmm {er}" xed="VCVTUDQ2PS_ZMMf32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+		dst[i+31:i] := Convert_UInt32_To_FP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PS" form="zmm {z}, zmm" xed="VCVTUDQ2PS_ZMMf32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
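+<!-- uint32 to float is inexact above 2^24, so the explicit-rounding form
+     above lets the caller pick the direction per call instead of via MXCSR.
+     A hypothetical sketch:
+
+     #include <immintrin.h>
+
+     __m512 u32_to_f32_down(__m512i a) {
+         // Round toward minus infinity, suppressing exceptions.
+         return _mm512_cvt_roundepu32_ps(a,
+             _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+     }
+-->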
+<intrinsic tech="AVX-512" name="_mm_cvt_roundu64_sd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := Convert_UInt64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUSI2SD" form="xmm, xmm, r64 {er}" xed="VCVTUSI2SD_XMMf64_XMMf64_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtu32_sd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="unsigned int" varname="b" etype="UI32"/>
+ <description>Convert the unsigned 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := Convert_UInt32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUSI2SD" form="xmm, xmm, r32" xed="VCVTUSI2SD_XMMf64_XMMf64_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtu64_sd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+ <description>Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := Convert_UInt64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUSI2SD" form="xmm, xmm, r64" xed="VCVTUSI2SD_XMMf64_XMMf64_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundu32_ss">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="unsigned int" varname="b" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_UInt32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUSI2SS" form="xmm, xmm, r32 {er}" xed="VCVTUSI2SS_XMMf32_XMMf32_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvt_roundu64_ss">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := Convert_UInt64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUSI2SS" form="xmm, xmm, r64 {er}" xed="VCVTUSI2SS_XMMf32_XMMf32_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtu32_ss">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="unsigned int" varname="b" etype="UI32"/>
+ <description>Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_UInt32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUSI2SS" form="xmm, xmm, r32" xed="VCVTUSI2SS_XMMf32_XMMf32_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_cvtu64_ss">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+ <description>Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_UInt64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTUSI2SS" form="xmm, xmm, r64" xed="VCVTUSI2SS_XMMf32_XMMf32_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
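+<!-- Sketch of the scalar insert-style conversions above: the converted
+     value lands in lane 0 and the upper lanes carry over from "a"
+     (illustrative; the 64-bit integer forms require an x86-64 target):
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     __m128 replace_lane0(__m128 a, uint64_t b) {
+         return _mm_cvtu64_ss(a, b);  // { (float)b, a[1], a[2], a[3] }
+     }
+-->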
+<intrinsic tech="AVX-512" name="_mm512_div_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPD" form="zmm, zmm, zmm" xed="VDIVPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_div_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPD" form="zmm, zmm, zmm {er}" xed="VDIVPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_div_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPD" form="zmm {k}, zmm, zmm" xed="VDIVPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_div_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPD" form="zmm {k}, zmm, zmm {er}" xed="VDIVPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_div_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPD" form="zmm {z}, zmm, zmm" xed="VDIVPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_div_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPD" form="zmm {z}, zmm, zmm {er}" xed="VDIVPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
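+<!-- A sketch (helper name hypothetical) combining the packed divide with a
+     writemask per the semantics above: lanes whose mask bit is clear keep
+     the value from "src", so division by zero can be sidestepped.
+
+     #include <immintrin.h>
+
+     __m512d div_where_nonzero(__m512d src, __m512d a, __m512d b) {
+         __mmask8 k = _mm512_cmp_pd_mask(b, _mm512_setzero_pd(), _CMP_NEQ_UQ);
+         return _mm512_mask_div_pd(src, k, a, b);
+     }
+-->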
+<intrinsic tech="AVX-512" name="_mm512_div_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPS" form="zmm, zmm, zmm" xed="VDIVPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_div_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPS" form="zmm, zmm, zmm {er}" xed="VDIVPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_div_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPS" form="zmm {k}, zmm, zmm" xed="VDIVPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_div_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPS" form="zmm {k}, zmm, zmm {er}" xed="VDIVPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_div_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPS" form="zmm {z}, zmm, zmm" xed="VDIVPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_div_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDIVPS" form="zmm {z}, zmm, zmm {er}" xed="VDIVPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
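+<!-- The {er} divide forms above encode the IEEE rounding mode in the
+     instruction rather than reading it from MXCSR. A minimal assumed
+     sketch:
+
+     #include <immintrin.h>
+
+     __m512 div_round_to_zero(__m512 a, __m512 b) {
+         return _mm512_div_round_ps(a, b,
+             _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+     }
+-->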
+<intrinsic tech="AVX-512" name="_mm_div_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := a[63:0] / b[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSD" form="xmm, xmm, xmm {er}" xed="VDIVSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_div_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] / b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSD" form="xmm {k}, xmm, xmm {er}" xed="VDIVSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_div_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] / b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSD" form="xmm {k}, xmm, xmm" xed="VDIVSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_div_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] / b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSD" form="xmm {z}, xmm, xmm {er}" xed="VDIVSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_div_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] / b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSD" form="xmm {z}, xmm, xmm" xed="VDIVSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
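+<!-- Sketch of the scalar masked divide above: lane 0 of the result is a/b
+     when bit 0 of "k" is set and 0.0 otherwise, while the upper lane always
+     comes from "a" (names illustrative):
+
+     #include <immintrin.h>
+
+     __m128d div_lane0_if(__mmask8 k, __m128d a, __m128d b) {
+         return _mm_maskz_div_sd(k, a, b);
+     }
+-->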
+<intrinsic tech="AVX-512" name="_mm_div_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := a[31:0] / b[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSS" form="xmm, xmm, xmm {er}" xed="VDIVSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_div_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] / b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSS" form="xmm {k}, xmm, xmm {er}" xed="VDIVSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_div_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] / b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSS" form="xmm {k}, xmm, xmm" xed="VDIVSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_div_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] / b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSS" form="xmm {z}, xmm, xmm {er}" xed="VDIVSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_div_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] / b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDIVSS" form="xmm {z}, xmm, xmm" xed="VDIVSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="zmm {k}, zmm" xed="VEXPANDPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expandloadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="zmm {k}, m512" xed="VEXPANDPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="zmm {z}, zmm" xed="VEXPANDPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expandloadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXPANDPD" form="zmm {z}, m512" xed="VEXPANDPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
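+<!-- Usage sketch (illustrative only, not upstream data; the helper name is an
+     assumption). expandloadu reads popcount(k) consecutive doubles starting at
+     mem_addr and scatters them into the active lanes, which makes it the
+     natural inverse of a compress store when reloading a packed sparse buffer.
+#include <immintrin.h>
+
+static __m512d load_packed_sparse(const double *packed, __mmask8 live) {
+    // inactive lanes are zeroed; only the active element count is read
+    return _mm512_maskz_expandloadu_pd(live, packed);
+}
+-->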
+<intrinsic tech="AVX-512" name="_mm512_mask_expand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="zmm {k}, zmm" xed="VEXPANDPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expandloadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="zmm {k}, m512" xed="VEXPANDPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="zmm {z}, zmm" xed="VEXPANDPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expandloadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXPANDPS" form="zmm {z}, m512" xed="VEXPANDPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_extractf32x4_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[1:0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+2: dst[127:0] := a[383:256]
+3: dst[127:0] := a[511:384]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X4" form="xmm, zmm, imm8" xed="VEXTRACTF32X4_XMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
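+<!-- Usage sketch (illustrative only, not upstream data; the helper name is an
+     assumption). imm8 selects which 128-bit lane of the zmm source is copied
+     out, and because it is an IMM operand it must be a compile-time constant.
+#include <immintrin.h>
+
+static __m128 sum_upper_quarters(__m512 v) {
+    __m128 q2 = _mm512_extractf32x4_ps(v, 2);   // bits 383:256, elements 8..11
+    __m128 q3 = _mm512_extractf32x4_ps(v, 3);   // bits 511:384, elements 12..15
+    return _mm_add_ps(q2, q3);
+}
+-->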
+<intrinsic tech="AVX-512" name="_mm512_mask_extractf32x4_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[1:0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X4" form="xmm {k}, zmm, imm8" xed="VEXTRACTF32X4_XMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_extractf32x4_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[1:0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTF32X4" form="xmm {z}, zmm, imm8" xed="VEXTRACTF32X4_XMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_extractf64x4_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[255:0] := a[255:0]
+1: dst[255:0] := a[511:256]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X4" form="ymm, zmm, imm8" xed="VEXTRACTF64X4_YMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_extractf64x4_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X4" form="ymm {k}, zmm, imm8" xed="VEXTRACTF64X4_YMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_extractf64x4_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTF64X4" form="ymm {z}, zmm, imm8" xed="VEXTRACTF64X4_YMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_extracti32x4_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[1:0] OF
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+2: dst[127:0] := a[383:256]
+3: dst[127:0] := a[511:384]
+ESAC
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X4" form="xmm, zmm, imm8" xed="VEXTRACTI32X4_XMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_extracti32x4_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[1:0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X4" form="xmm {k}, zmm, imm8" xed="VEXTRACTI32X4_XMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_extracti32x4_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[1:0] OF
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VEXTRACTI32X4" form="xmm {z}, zmm, imm8" xed="VEXTRACTI32X4_XMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_extracti64x4_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+CASE imm8[0] OF
+0: dst[255:0] := a[255:0]
+1: dst[255:0] := a[511:256]
+ESAC
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X4" form="ymm, zmm, imm8" xed="VEXTRACTI64X4_YMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_extracti64x4_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X4" form="ymm {k}, zmm, imm8" xed="VEXTRACTI64X4_YMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_extracti64x4_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+CASE imm8[0] OF
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VEXTRACTI64X4" form="ymm {z}, zmm, imm8" xed="VEXTRACTI64X4_YMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
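+<!-- Usage sketch (illustrative only, not upstream data; the helper name is an
+     assumption). The masked extract forms fuse the lane extract with a blend
+     or zeroing step, exactly as the pseudocode above spells out.
+#include <immintrin.h>
+
+static __m256i upper_half_even_qwords(__m512i v) {
+    // take qwords 4..7 (imm8 = 1), keep results 0 and 2 (mask 0x5),
+    // zero results 1 and 3
+    return _mm512_maskz_extracti64x4_epi64(0x5, v, 1);
+}
+-->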
+<intrinsic tech="AVX-512" name="_mm512_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="zmm, zmm, zmm, imm8" xed="VFIXUPIMMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
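+<!-- Usage sketch (illustrative only, not upstream data; the helper name is an
+     assumption). It exercises the token table above: the 4-bit response for
+     ZERO_VALUE_TOKEN (j = 2) sits in c[11:8], response 6 selects
+     tsrc.sign ? -INF : +INF, and the remaining nibbles of 0 leave every other
+     lane equal to src1 (here, a).
+#include <immintrin.h>
+
+static __m512d zeros_to_signed_inf(__m512d a, __m512d x) {
+    __m512i c = _mm512_set1_epi64(0x600);   // nibble j=2 (ZERO token) := 6
+    return _mm512_fixupimm_pd(a, x, c, 0);  // imm8 = 0: request no flags
+}
+-->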
+<intrinsic tech="AVX-512" name="_mm512_fixupimm_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="zmm, zmm, zmm, imm8 {sae}" xed="VFIXUPIMMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
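+<!-- Usage sketch (illustrative only, not upstream data; the helper name is an
+     assumption). The _round variants add an sae operand; passing
+     _MM_FROUND_NO_EXC suppresses any exception reporting that the imm8 bits
+     would otherwise request.
+#include <immintrin.h>
+
+static __m512d fixup_quiet(__m512d a, __m512d x, __m512i c) {
+    // imm8 = 0x11 would request #ZE on zero and #IE on SNaN inputs,
+    // but SAE keeps the operation flag-free
+    return _mm512_fixupimm_round_pd(a, x, c, 0x11, _MM_FROUND_NO_EXC);
+}
+-->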
+<intrinsic tech="AVX-512" name="_mm512_mask_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="zmm {k}, zmm, zmm, imm8" xed="VFIXUPIMMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fixupimm_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="zmm {k}, zmm, zmm, imm8 {sae}" xed="VFIXUPIMMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fixupimm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="zmm {z}, zmm, zmm, imm8" xed="VFIXUPIMMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fixupimm_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPD" form="zmm {z}, zmm, zmm, imm8 {sae}" xed="VFIXUPIMMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="zmm, zmm, zmm, imm8" xed="VFIXUPIMMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_fixupimm_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="zmm, zmm, zmm, imm8 {sae}" xed="VFIXUPIMMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="zmm {k}, zmm, zmm, imm8" xed="VFIXUPIMMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fixupimm_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="zmm {k}, zmm, zmm, imm8 {sae}" xed="VFIXUPIMMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fixupimm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="zmm {z}, zmm, zmm, imm8" xed="VFIXUPIMMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fixupimm_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPIMMPS" form="zmm {z}, zmm, zmm, imm8 {sae}" xed="VFIXUPIMMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fixupimm_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSD" form="xmm, xmm, xmm, imm8 {sae}" xed="VFIXUPIMMSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fixupimm_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSD" form="xmm, xmm, xmm, imm8" xed="VFIXUPIMMSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
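+<!-- Usage sketch (illustrative only, not upstream data; the helper name is an
+     assumption). The sd form fixes up only lane 0; the upper double of dst is
+     copied straight from a.
+#include <immintrin.h>
+
+static __m128d scalar_zero_to_inf(__m128d a, __m128d x) {
+    __m128i c = _mm_set1_epi64x(0x600);    // ZERO_VALUE_TOKEN -> signed INF
+    return _mm_fixupimm_sd(a, x, c, 0);    // dst[127:64] = a[127:64]
+}
+-->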
+<intrinsic tech="AVX-512" name="_mm_mask_fixupimm_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+IF k[0]
+ dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSD" form="xmm {k}, xmm, xmm, imm8 {sae}" xed="VFIXUPIMMSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fixupimm_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+IF k[0]
+ dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSD" form="xmm {k}, xmm, xmm, imm8" xed="VFIXUPIMMSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fixupimm_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+IF k[0]
+ dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSD" form="xmm {z}, xmm, xmm, imm8 {sae}" xed="VFIXUPIMMSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fixupimm_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
+ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+ CASE(tsrc[63:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[63:0] := src1[63:0]
+ 1 : dest[63:0] := tsrc[63:0]
+ 2 : dest[63:0] := QNaN(tsrc[63:0])
+ 3 : dest[63:0] := QNAN_Indefinite
+ 4 : dest[63:0] := -INF
+ 5 : dest[63:0] := +INF
+ 6 : dest[63:0] := tsrc.sign? -INF : +INF
+ 7 : dest[63:0] := -0
+ 8 : dest[63:0] := +0
+ 9 : dest[63:0] := -1
+ 10: dest[63:0] := +1
+ 11: dest[63:0] := 1/2
+ 12: dest[63:0] := 90.0
+ 13: dest[63:0] := PI/2
+ 14: dest[63:0] := MAX_FLOAT
+ 15: dest[63:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[63:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[63:0]
+}
+IF k[0]
+ dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSD" form="xmm {z}, xmm, xmm, imm8" xed="VFIXUPIMMSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
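+<!-- Illustrative usage (a hedged sketch, not part of the upstream Intel data):
+     the 4-bit nibbles of the lower integer in "c" form the fixup response
+     table; token class j reads bits [4*j+3:4*j]. The table value 0x11111A22
+     below is a made-up example that quiets NaNs, maps zero inputs to +1.0,
+     and passes everything else through. Assumes an AVX512F-capable
+     toolchain (e.g. gcc -mavx512f).
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         /* nibble j=0 (QNAN) -> 2: quiet the NaN; j=1 (SNAN) -> 2;
+            j=2 (ZERO) -> 10: return +1.0; j=3..7 -> 1: pass source through */
+         __m128i c = _mm_cvtsi32_si128(0x11111A22);
+         __m128d a = _mm_set_sd(0.0);   /* fallback lane (response 0)  */
+         __m128d b = _mm_set_sd(-0.0);  /* the value being fixed up    */
+         __m128d r = _mm_fixupimm_sd(a, b, c, 0); /* imm8=0: no flag reporting */
+         printf("%f\n", _mm_cvtsd_f64(r)); /* prints 1.000000 */
+         return 0;
+     }
+-->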
+<intrinsic tech="AVX-512" name="_mm_fixupimm_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSS" form="xmm, xmm, xmm, imm8 {sae}" xed="VFIXUPIMMSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fixupimm_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSS" form="xmm, xmm, xmm, imm8" xed="VFIXUPIMMSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fixupimm_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+IF k[0]
+ dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSS" form="xmm {k}, xmm, xmm, imm8 {sae}" xed="VFIXUPIMMSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fixupimm_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+IF k[0]
+ dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSS" form="xmm {k}, xmm, xmm, imm8" xed="VFIXUPIMMSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fixupimm_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.
+ [sae_note]</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+IF k[0]
+ dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSS" form="xmm {z}, xmm, xmm, imm8 {sae}" xed="VFIXUPIMMSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fixupimm_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.</description>
+ <operation>enum TOKEN_TYPE {
+ QNAN_TOKEN := 0, \
+ SNAN_TOKEN := 1, \
+ ZERO_VALUE_TOKEN := 2, \
+ ONE_VALUE_TOKEN := 3, \
+ NEG_INF_TOKEN := 4, \
+ POS_INF_TOKEN := 5, \
+ NEG_VALUE_TOKEN := 6, \
+ POS_VALUE_TOKEN := 7
+}
+DEFINE FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
+ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+ CASE(tsrc[31:0]) OF
+ QNAN_TOKEN:j := 0
+ SNAN_TOKEN:j := 1
+ ZERO_VALUE_TOKEN: j := 2
+ ONE_VALUE_TOKEN: j := 3
+ NEG_INF_TOKEN: j := 4
+ POS_INF_TOKEN: j := 5
+ NEG_VALUE_TOKEN: j := 6
+ POS_VALUE_TOKEN: j := 7
+ ESAC
+
+ token_response[3:0] := src3[3+4*j:4*j]
+
+ CASE(token_response[3:0]) OF
+ 0 : dest[31:0] := src1[31:0]
+ 1 : dest[31:0] := tsrc[31:0]
+ 2 : dest[31:0] := QNaN(tsrc[31:0])
+ 3 : dest[31:0] := QNAN_Indefinite
+ 4 : dest[31:0] := -INF
+ 5 : dest[31:0] := +INF
+ 6 : dest[31:0] := tsrc.sign? -INF : +INF
+ 7 : dest[31:0] := -0
+ 8 : dest[31:0] := +0
+ 9 : dest[31:0] := -1
+ 10: dest[31:0] := +1
+ 11: dest[31:0] := 1/2
+ 12: dest[31:0] := 90.0
+ 13: dest[31:0] := PI/2
+ 14: dest[31:0] := MAX_FLOAT
+ 15: dest[31:0] := -MAX_FLOAT
+ ESAC
+
+ CASE(tsrc[31:0]) OF
+ ZERO_VALUE_TOKEN:
+ IF (imm8[0]) #ZE; FI
+ ZERO_VALUE_TOKEN:
+ IF (imm8[1]) #IE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[2]) #ZE; FI
+ ONE_VALUE_TOKEN:
+ IF (imm8[3]) #IE; FI
+ SNAN_TOKEN:
+ IF (imm8[4]) #IE; FI
+ NEG_INF_TOKEN:
+ IF (imm8[5]) #IE; FI
+ NEG_VALUE_TOKEN:
+ IF (imm8[6]) #IE; FI
+ POS_INF_TOKEN:
+ IF (imm8[7]) #IE; FI
+ ESAC
+ RETURN dest[31:0]
+}
+IF k[0]
+ dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFIXUPIMMSS" form="xmm {z}, xmm, xmm, imm8" xed="VFIXUPIMMSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
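+<!-- Illustrative usage (hedged sketch, not from the Intel data): in the
+     masked scalar forms above, the mask governs only element 0; the upper
+     elements always come from "a". Same AVX512F toolchain assumption and the
+     same hypothetical 0x11111A22 response table as the earlier sketch.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128  a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
+         __m128  b = _mm_set_ss(0.0f);
+         __m128i c = _mm_cvtsi32_si128(0x11111A22);
+         /* k bit 0 is clear, so lane 0 is zeroed; lanes 1..3 still copy "a" */
+         __m128  r = _mm_maskz_fixupimm_ss(0x0, a, b, c, 0);
+         float out[4];
+         _mm_storeu_ps(out, r);
+         printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
+         /* prints 0.000000 2.000000 3.000000 4.000000 */
+         return 0;
+     }
+-->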
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="zmm {z}, zmm, zmm" xed="VFMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="zmm {z}, zmm, zmm" xed="VFMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="zmm {z}, zmm, zmm" xed="VFMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="zmm {z}, zmm, zmm {er}" xed="VFMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="zmm {z}, zmm, zmm {er}" xed="VFMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="zmm {z}, zmm, zmm {er}" xed="VFMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="zmm {z}, zmm, zmm" xed="VFMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="zmm {z}, zmm, zmm" xed="VFMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="zmm {z}, zmm, zmm" xed="VFMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="zmm {z}, zmm, zmm {er}" xed="VFMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="zmm {z}, zmm, zmm {er}" xed="VFMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="zmm {z}, zmm, zmm {er}" xed="VFMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
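+<!-- Illustrative usage (hedged sketch, not from the Intel data): the
+     zero-masked packed FMA above computes a*b+c only in lanes whose mask
+     bit is set and zeroes the rest. Assumes an AVX512F toolchain.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512d a = _mm512_set1_pd(2.0);
+         __m512d b = _mm512_set1_pd(3.0);
+         __m512d c = _mm512_set1_pd(1.0);
+         __m512d r = _mm512_maskz_fmadd_pd(0x0F, a, b, c); /* lanes 0..3 only */
+         double out[8];
+         _mm512_storeu_pd(out, r);
+         for (int j = 0; j < 8; j++)
+             printf("%.1f ", out[j]); /* 7.0 7.0 7.0 7.0 0.0 0.0 0.0 0.0 */
+         printf("\n");
+         return 0;
+     }
+-->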
+<intrinsic tech="AVX-512" name="_mm_fmadd_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SD" form="xmm, xmm, xmm {er}" xed="VFMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213SD" form="xmm, xmm, xmm {er}" xed="VFMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231SD" form="xmm, xmm, xmm {er}" xed="VFMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmadd_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SD" form="xmm {k}, xmm, xmm {er}" xed="VFMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213SD" form="xmm {k}, xmm, xmm {er}" xed="VFMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231SD" form="xmm {k}, xmm, xmm {er}" xed="VFMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmadd_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SD" form="xmm {k}, xmm, xmm" xed="VFMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213SD" form="xmm {k}, xmm, xmm" xed="VFMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231SD" form="xmm {k}, xmm, xmm" xed="VFMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmadd_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SD" form="xmm {k}, xmm, xmm {er}" xed="VFMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213SD" form="xmm {k}, xmm, xmm {er}" xed="VFMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231SD" form="xmm {k}, xmm, xmm {er}" xed="VFMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmadd_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SD" form="xmm {k}, xmm, xmm" xed="VFMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213SD" form="xmm {k}, xmm, xmm" xed="VFMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231SD" form="xmm {k}, xmm, xmm" xed="VFMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmadd_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SD" form="xmm {z}, xmm, xmm {er}" xed="VFMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213SD" form="xmm {z}, xmm, xmm {er}" xed="VFMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231SD" form="xmm {z}, xmm, xmm {er}" xed="VFMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmadd_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SD" form="xmm {z}, xmm, xmm" xed="VFMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD213SD" form="xmm {z}, xmm, xmm" xed="VFMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMADD231SD" form="xmm {z}, xmm, xmm" xed="VFMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
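+<!-- Illustrative usage (hedged sketch, not from the Intel data): the
+     _mm_mask_* and _mm_mask3_* scalar FMA forms above differ only in the
+     fallback source when mask bit 0 is clear ("a" versus "c", including the
+     upper element). Assumes an AVX512F toolchain.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128d a = _mm_set_pd(10.0, 2.0);  /* upper=10, lower=2 */
+         __m128d b = _mm_set_pd(20.0, 3.0);
+         __m128d c = _mm_set_pd(30.0, 4.0);
+         __m128d r1 = _mm_mask_fmadd_sd(a, 0x0, b, c);  /* falls back to a */
+         __m128d r2 = _mm_mask3_fmadd_sd(a, b, c, 0x0); /* falls back to c */
+         printf("%f %f\n", _mm_cvtsd_f64(r1), _mm_cvtsd_f64(r2)); /* 2.0 4.0 */
+         return 0;
+     }
+-->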
+<intrinsic tech="AVX-512" name="_mm_mask3_fmadd_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SS" form="xmm {k}, xmm, xmm {er}" xed="VFMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213SS" form="xmm {k}, xmm, xmm {er}" xed="VFMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231SS" form="xmm {k}, xmm, xmm {er}" xed="VFMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SS" form="xmm {k}, xmm, xmm" xed="VFMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213SS" form="xmm {k}, xmm, xmm" xed="VFMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231SS" form="xmm {k}, xmm, xmm" xed="VFMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_fmadd_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SS" form="xmm, xmm, xmm {er}" xed="VFMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213SS" form="xmm, xmm, xmm {er}" xed="VFMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231SS" form="xmm, xmm, xmm {er}" xed="VFMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
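+<!-- Illustrative usage (hedged sketch, not from the Intel data): the
+     *_round_* forms take the rounding override as a compile-time constant;
+     pairing a rounding mode with _MM_FROUND_NO_EXC gives the {er} encoding
+     shown in the instruction forms. Assumes an AVX512F toolchain.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128 a = _mm_set_ss(2.0f);
+         __m128 b = _mm_set_ss(3.0f);
+         __m128 c = _mm_set_ss(1.0f);
+         __m128 r = _mm_fmadd_round_ss(a, b, c,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+         printf("%f\n", _mm_cvtss_f32(r)); /* prints 7.000000 */
+         return 0;
+     }
+-->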
+<intrinsic tech="AVX-512" name="_mm_mask_fmadd_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SS" form="xmm {k}, xmm, xmm {er}" xed="VFMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213SS" form="xmm {k}, xmm, xmm {er}" xed="VFMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231SS" form="xmm {k}, xmm, xmm {er}" xed="VFMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SS" form="xmm {k}, xmm, xmm" xed="VFMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213SS" form="xmm {k}, xmm, xmm" xed="VFMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231SS" form="xmm {k}, xmm, xmm" xed="VFMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmadd_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SS" form="xmm {z}, xmm, xmm {er}" xed="VFMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213SS" form="xmm {z}, xmm, xmm {er}" xed="VFMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231SS" form="xmm {z}, xmm, xmm {er}" xed="VFMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SS" form="xmm {z}, xmm, xmm" xed="VFMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD213SS" form="xmm {z}, xmm, xmm" xed="VFMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMADD231SS" form="xmm {z}, xmm, xmm" xed="VFMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="zmm, zmm, zmm" xed="VFMADDSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="zmm, zmm, zmm" xed="VFMADDSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="zmm, zmm, zmm" xed="VFMADDSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
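+<!-- Illustrative usage (hedged sketch, not from the Intel data): in
+     FMADDSUB, even-indexed lanes subtract "c" and odd-indexed lanes add it,
+     as the pseudocode above shows. Assumes an AVX512F toolchain.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512d a = _mm512_set1_pd(2.0);
+         __m512d b = _mm512_set1_pd(3.0);
+         __m512d c = _mm512_set1_pd(1.0);
+         __m512d r = _mm512_fmaddsub_pd(a, b, c);
+         double out[8];
+         _mm512_storeu_pd(out, r);
+         for (int j = 0; j < 8; j++)
+             printf("%.1f ", out[j]); /* 5.0 7.0 5.0 7.0 5.0 7.0 5.0 7.0 */
+         printf("\n");
+         return 0;
+     }
+-->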
+<intrinsic tech="AVX-512" name="_mm512_fmaddsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="zmm, zmm, zmm {er}" xed="VFMADDSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="zmm, zmm, zmm {er}" xed="VFMADDSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="zmm, zmm, zmm {er}" xed="VFMADDSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask3_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="zmm {k}, zmm, zmm" xed="VFMADDSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="zmm {k}, zmm, zmm" xed="VFMADDSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="zmm {k}, zmm, zmm" xed="VFMADDSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask3_fmaddsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="zmm {k}, zmm, zmm" xed="VFMADDSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="zmm {k}, zmm, zmm" xed="VFMADDSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="zmm {k}, zmm, zmm" xed="VFMADDSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fmaddsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+		<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="zmm {z}, zmm, zmm" xed="VFMADDSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="zmm {z}, zmm, zmm" xed="VFMADDSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="zmm {z}, zmm, zmm" xed="VFMADDSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmaddsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+		<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="zmm {z}, zmm, zmm {er}" xed="VFMADDSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB213PD" form="zmm {z}, zmm, zmm {er}" xed="VFMADDSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADDSUB231PD" form="zmm {z}, zmm, zmm {er}" xed="VFMADDSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
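+<!-- Usage sketch (illustrative, not part of the intrinsic data): a minimal C example,
+     assuming an AVX-512F target, of the zero-masked fmaddsub family documented above.
+     Even-indexed lanes compute a*b - c, odd-indexed lanes compute a*b + c, and lanes
+     whose mask bit is clear are zeroed. The function name and mask value are invented
+     for illustration.
+
+     #include <immintrin.h>
+
+     __m512d fmaddsub_low_half(__m512d a, __m512d b, __m512d c) {
+         /* 0x0F keeps result lanes 0 to 3; lanes 4 to 7 are zeroed. */
+         return _mm512_maskz_fmaddsub_pd(0x0F, a, b, c);
+     }
+-->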
+<intrinsic tech="AVX-512" name="_mm512_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="zmm, zmm, zmm" xed="VFMADDSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="zmm, zmm, zmm" xed="VFMADDSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="zmm, zmm, zmm" xed="VFMADDSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_fmaddsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="zmm, zmm, zmm {er}" xed="VFMADDSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="zmm, zmm, zmm {er}" xed="VFMADDSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="zmm, zmm, zmm {er}" xed="VFMADDSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask3_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="zmm {k}, zmm, zmm" xed="VFMADDSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="zmm {k}, zmm, zmm" xed="VFMADDSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="zmm {k}, zmm, zmm" xed="VFMADDSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask3_fmaddsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="zmm {k}, zmm, zmm" xed="VFMADDSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="zmm {k}, zmm, zmm" xed="VFMADDSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="zmm {k}, zmm, zmm" xed="VFMADDSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fmaddsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADDSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="zmm {z}, zmm, zmm" xed="VFMADDSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="zmm {z}, zmm, zmm" xed="VFMADDSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="zmm {z}, zmm, zmm" xed="VFMADDSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmaddsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="zmm {z}, zmm, zmm {er}" xed="VFMADDSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB213PS" form="zmm {z}, zmm, zmm {er}" xed="VFMADDSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADDSUB231PS" form="zmm {z}, zmm, zmm {er}" xed="VFMADDSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
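+<!-- Usage sketch (illustrative): the interleaved add/subtract of fmaddsub is what makes
+     it useful for complex arithmetic. Assuming an AVX-512F target and complex numbers
+     stored as interleaved (re, im) pairs, a common multiply pattern is sketched below;
+     the function name is invented for illustration.
+
+     #include <immintrin.h>
+
+     /* Multiply eight interleaved complex float pairs: (ar + i*ai) * (br + i*bi). */
+     __m512 cmul_ps(__m512 a, __m512 b) {
+         __m512 b_re = _mm512_moveldup_ps(b);        /* [br, br, ...] */
+         __m512 b_im = _mm512_movehdup_ps(b);        /* [bi, bi, ...] */
+         __m512 a_sw = _mm512_permute_ps(a, 0xB1);   /* [ai, ar, ...] */
+         /* even lanes: ar*br - ai*bi (real part); odd lanes: ai*br + ar*bi (imag part) */
+         return _mm512_fmaddsub_ps(a, b_re, _mm512_mul_ps(a_sw, b_im));
+     }
+-->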
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="zmm {z}, zmm, zmm" xed="VFMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="zmm {z}, zmm, zmm" xed="VFMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="zmm {z}, zmm, zmm" xed="VFMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="zmm {z}, zmm, zmm {er}" xed="VFMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="zmm {z}, zmm, zmm {er}" xed="VFMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="zmm {z}, zmm, zmm {er}" xed="VFMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="zmm {z}, zmm, zmm" xed="VFMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="zmm {z}, zmm, zmm" xed="VFMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="zmm {z}, zmm, zmm" xed="VFMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="zmm {z}, zmm, zmm {er}" xed="VFMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="zmm {z}, zmm, zmm {er}" xed="VFMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="zmm {z}, zmm, zmm {er}" xed="VFMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
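+<!-- Usage sketch (illustrative): the zero-masked fmsub variants above pair naturally
+     with a compare mask, so the fused result is produced only where a predicate holds.
+     A minimal example assuming an AVX-512F target; the function name and predicate are
+     invented for illustration.
+
+     #include <immintrin.h>
+
+     /* a*b - c in lanes where c >= 0; all other lanes are zeroed. */
+     __m512 fmsub_where_c_nonneg(__m512 a, __m512 b, __m512 c) {
+         __mmask16 k = _mm512_cmp_ps_mask(c, _mm512_setzero_ps(), _CMP_GE_OQ);
+         return _mm512_maskz_fmsub_ps(k, a, b, c);
+     }
+-->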
+<intrinsic tech="AVX-512" name="_mm_fmsub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SD" form="xmm, xmm, xmm {er}" xed="VFMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213SD" form="xmm, xmm, xmm {er}" xed="VFMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231SD" form="xmm, xmm, xmm {er}" xed="VFMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmsub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SD" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213SD" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231SD" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmsub_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SD" form="xmm {k}, xmm, xmm" xed="VFMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213SD" form="xmm {k}, xmm, xmm" xed="VFMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231SD" form="xmm {k}, xmm, xmm" xed="VFMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmsub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SD" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213SD" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231SD" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmsub_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SD" form="xmm {k}, xmm, xmm" xed="VFMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213SD" form="xmm {k}, xmm, xmm" xed="VFMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231SD" form="xmm {k}, xmm, xmm" xed="VFMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmsub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SD" form="xmm {z}, xmm, xmm {er}" xed="VFMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213SD" form="xmm {z}, xmm, xmm {er}" xed="VFMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231SD" form="xmm {z}, xmm, xmm {er}" xed="VFMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmsub_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SD" form="xmm {z}, xmm, xmm" xed="VFMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB213SD" form="xmm {z}, xmm, xmm" xed="VFMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFMSUB231SD" form="xmm {z}, xmm, xmm" xed="VFMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
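+<!-- Usage sketch (illustrative): the _round_ scalar forms take an explicit rounding
+     mode instead of reading MXCSR. A minimal example assuming an AVX-512F target; note
+     that the static rounding override is combined with _MM_FROUND_NO_EXC. The function
+     name is invented for illustration.
+
+     #include <immintrin.h>
+
+     /* Lower lane: a*b - c rounded toward zero with exceptions suppressed;
+        the upper lane is copied from a. */
+     __m128d fmsub_sd_trunc(__m128d a, __m128d b, __m128d c) {
+         return _mm_fmsub_round_sd(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+     }
+-->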
+<intrinsic tech="AVX-512" name="_mm_fmsub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SS" form="xmm, xmm, xmm {er}" xed="VFMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213SS" form="xmm, xmm, xmm {er}" xed="VFMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231SS" form="xmm, xmm, xmm {er}" xed="VFMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmsub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SS" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213SS" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231SS" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fmsub_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SS" form="xmm {k}, xmm, xmm" xed="VFMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213SS" form="xmm {k}, xmm, xmm" xed="VFMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231SS" form="xmm {k}, xmm, xmm" xed="VFMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmsub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SS" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213SS" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231SS" form="xmm {k}, xmm, xmm {er}" xed="VFMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fmsub_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SS" form="xmm {k}, xmm, xmm" xed="VFMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213SS" form="xmm {k}, xmm, xmm" xed="VFMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231SS" form="xmm {k}, xmm, xmm" xed="VFMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmsub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SS" form="xmm {z}, xmm, xmm {er}" xed="VFMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213SS" form="xmm {z}, xmm, xmm {er}" xed="VFMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231SS" form="xmm {z}, xmm, xmm {er}" xed="VFMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fmsub_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SS" form="xmm {z}, xmm, xmm" xed="VFMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB213SS" form="xmm {z}, xmm, xmm" xed="VFMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFMSUB231SS" form="xmm {z}, xmm, xmm" xed="VFMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
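+<!-- Usage sketch (illustrative): for the masked scalar forms only mask bit 0 matters.
+     A minimal example assuming an AVX-512F target; the function name is invented for
+     illustration.
+
+     #include <immintrin.h>
+
+     /* If k[0] is set, lane 0 becomes a*b - c; otherwise lane 0 keeps a's value.
+        Lanes 1 to 3 are always copied from a. */
+     __m128 fmsub_ss_conditional(__mmask8 k, __m128 a, __m128 b, __m128 c) {
+         return _mm_mask_fmsub_ss(a, k, b, c);
+     }
+-->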
+<intrinsic tech="AVX-512" name="_mm512_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="zmm, zmm, zmm" xed="VFMSUBADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="zmm, zmm, zmm" xed="VFMSUBADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="zmm, zmm, zmm" xed="VFMSUBADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_fmsubadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="zmm, zmm, zmm {er}" xed="VFMSUBADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="zmm, zmm, zmm {er}" xed="VFMSUBADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="zmm, zmm, zmm {er}" xed="VFMSUBADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask3_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="zmm {k}, zmm, zmm" xed="VFMSUBADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="zmm {k}, zmm, zmm" xed="VFMSUBADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="zmm {k}, zmm, zmm" xed="VFMSUBADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask3_fmsubadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="zmm {k}, zmm, zmm" xed="VFMSUBADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="zmm {k}, zmm, zmm" xed="VFMSUBADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="zmm {k}, zmm, zmm" xed="VFMSUBADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fmsubadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="zmm {z}, zmm, zmm" xed="VFMSUBADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="zmm {z}, zmm, zmm" xed="VFMSUBADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="zmm {z}, zmm, zmm" xed="VFMSUBADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmsubadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="zmm {z}, zmm, zmm {er}" xed="VFMSUBADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD213PD" form="zmm {z}, zmm, zmm {er}" xed="VFMSUBADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUBADD231PD" form="zmm {z}, zmm, zmm {er}" xed="VFMSUBADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
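+<!-- Usage sketch (illustrative): fmsubadd mirrors fmaddsub, so even lanes compute
+     a*b + c and odd lanes compute a*b - c. Summing the two therefore yields 2*a*b in
+     every lane, which makes the lane convention easy to verify. Assumes an AVX-512F
+     target; the function name is invented for illustration.
+
+     #include <immintrin.h>
+
+     __m512d check_lane_convention(__m512d a, __m512d b, __m512d c) {
+         __m512d addsub = _mm512_fmaddsub_pd(a, b, c); /* even: a*b - c, odd: a*b + c */
+         __m512d subadd = _mm512_fmsubadd_pd(a, b, c); /* even: a*b + c, odd: a*b - c */
+         return _mm512_add_pd(addsub, subadd);         /* 2*a*b in every lane */
+     }
+-->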
+<intrinsic tech="AVX-512" name="_mm512_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="zmm, zmm, zmm" xed="VFMSUBADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="zmm, zmm, zmm" xed="VFMSUBADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="zmm, zmm, zmm" xed="VFMSUBADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_fmsubadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="zmm, zmm, zmm {er}" xed="VFMSUBADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="zmm, zmm, zmm {er}" xed="VFMSUBADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="zmm, zmm, zmm {er}" xed="VFMSUBADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask3_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="zmm {k}, zmm, zmm" xed="VFMSUBADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="zmm {k}, zmm, zmm" xed="VFMSUBADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="zmm {k}, zmm, zmm" xed="VFMSUBADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask3_fmsubadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="zmm {k}, zmm, zmm" xed="VFMSUBADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="zmm {k}, zmm, zmm" xed="VFMSUBADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="zmm {k}, zmm, zmm" xed="VFMSUBADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_fmsubadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUBADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="zmm {z}, zmm, zmm" xed="VFMSUBADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="zmm {z}, zmm, zmm" xed="VFMSUBADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="zmm {z}, zmm, zmm" xed="VFMSUBADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fmsubadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="zmm {z}, zmm, zmm {er}" xed="VFMSUBADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD213PS" form="zmm {z}, zmm, zmm {er}" xed="VFMSUBADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUBADD231PS" form="zmm {z}, zmm, zmm {er}" xed="VFMSUBADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
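+<!-- Illustrative only, not part of the upstream Intel data: a hypothetical
+     sketch of the zeromask + rounding form. The rounding operand must be a
+     compile-time constant; a directed mode is combined with
+     _MM_FROUND_NO_EXC to suppress exceptions.
+
+     #include <immintrin.h>
+
+     __m512 fmsubadd_rz(__mmask16 k, __m512 a, __m512 b, __m512 c) {
+         /* Masked-off lanes are zeroed; all lanes round toward zero. */
+         return _mm512_maskz_fmsubadd_round_ps(
+             k, a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+     }
+-->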
+<intrinsic tech="AVX-512" name="_mm512_maskz_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="zmm {z}, zmm, zmm" xed="VFNMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="zmm {z}, zmm, zmm" xed="VFNMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="zmm {z}, zmm, zmm" xed="VFNMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fnmadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="zmm {z}, zmm, zmm {er}" xed="VFNMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="zmm {z}, zmm, zmm {er}" xed="VFNMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="zmm {z}, zmm, zmm {er}" xed="VFNMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="zmm {z}, zmm, zmm" xed="VFNMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="zmm {z}, zmm, zmm" xed="VFNMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="zmm {z}, zmm, zmm" xed="VFNMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fnmadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="zmm {z}, zmm, zmm {er}" xed="VFNMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="zmm {z}, zmm, zmm {er}" xed="VFNMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="zmm {z}, zmm, zmm {er}" xed="VFNMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
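+<!-- Illustrative only, not part of the upstream Intel data: fnmadd computes
+     c - a*b with a single rounding, which makes it the natural building
+     block for Newton refinement. A hypothetical sketch using the unmasked
+     AVX512F variant (one step: x' = x * (2 - d*x)):
+
+     #include <immintrin.h>
+
+     __m512 recip_refine(__m512 d, __m512 x0) {
+         __m512 two = _mm512_set1_ps(2.0f);
+         __m512 t   = _mm512_fnmadd_ps(d, x0, two);   /* 2 - d*x0 */
+         return _mm512_mul_ps(x0, t);
+     }
+-->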
+<intrinsic tech="AVX-512" name="_mm_fnmadd_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SD" form="xmm, xmm, xmm {er}" xed="VFNMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213SD" form="xmm, xmm, xmm {er}" xed="VFNMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231SD" form="xmm, xmm, xmm {er}" xed="VFNMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmadd_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmadd_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SD" form="xmm {k}, xmm, xmm" xed="VFNMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213SD" form="xmm {k}, xmm, xmm" xed="VFNMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231SD" form="xmm {k}, xmm, xmm" xed="VFNMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmadd_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmadd_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SD" form="xmm {k}, xmm, xmm" xed="VFNMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213SD" form="xmm {k}, xmm, xmm" xed="VFNMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231SD" form="xmm {k}, xmm, xmm" xed="VFNMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmadd_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SD" form="xmm {z}, xmm, xmm {er}" xed="VFNMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD213SD" form="xmm {z}, xmm, xmm {er}" xed="VFNMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231SD" form="xmm {z}, xmm, xmm {er}" xed="VFNMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmadd_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD213SD" form="xmm {z}, xmm, xmm" xed="VFNMADD213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD231SD" form="xmm {z}, xmm, xmm" xed="VFNMADD231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMADD132SD" form="xmm {z}, xmm, xmm" xed="VFNMADD132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
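+<!-- Illustrative only, not part of the upstream Intel data: in the scalar
+     sd forms only bits 63:0 are computed. The "mask" variants take the
+     merge value and the upper element from a, the "mask3" variants take
+     both from c. A hypothetical check of the zeromask form:
+
+     #include <immintrin.h>
+
+     double fnmadd_low(double a, double b, double c, __mmask8 k) {
+         __m128d r = _mm_maskz_fnmadd_sd(k, _mm_set_sd(a),
+                                            _mm_set_sd(b),
+                                            _mm_set_sd(c));
+         return _mm_cvtsd_f64(r);   /* k&1 ? c - a*b : 0.0 */
+     }
+-->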
+<intrinsic tech="AVX-512" name="_mm_fnmadd_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SS" form="xmm, xmm, xmm {er}" xed="VFNMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213SS" form="xmm, xmm, xmm {er}" xed="VFNMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231SS" form="xmm, xmm, xmm {er}" xed="VFNMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmadd_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SS" form="xmm {k}, xmm, xmm" xed="VFNMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213SS" form="xmm {k}, xmm, xmm" xed="VFNMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231SS" form="xmm {k}, xmm, xmm" xed="VFNMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmadd_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SS" form="xmm {k}, xmm, xmm" xed="VFNMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213SS" form="xmm {k}, xmm, xmm" xed="VFNMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231SS" form="xmm {k}, xmm, xmm" xed="VFNMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmadd_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SS" form="xmm {z}, xmm, xmm {er}" xed="VFNMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213SS" form="xmm {z}, xmm, xmm {er}" xed="VFNMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231SS" form="xmm {z}, xmm, xmm {er}" xed="VFNMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SS" form="xmm {z}, xmm, xmm" xed="VFNMADD132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD213SS" form="xmm {z}, xmm, xmm" xed="VFNMADD213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMADD231SS" form="xmm {z}, xmm, xmm" xed="VFNMADD231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
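+<!-- Illustrative only, not part of the upstream Intel data: the ss forms
+     preserve the upper three float lanes from a, so the result can flow
+     straight back into further 128-bit code. A hypothetical merge-masked
+     step:
+
+     #include <immintrin.h>
+
+     __m128 fnmadd_low_if(__m128 a, __m128 b, __m128 c, int cond) {
+         __mmask8 k = cond ? 1 : 0;
+         /* lane 0: k ? c - a*b : a unchanged; lanes 1..3 copied from a */
+         return _mm_mask_fnmadd_ss(a, k, b, c);
+     }
+-->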
+<intrinsic tech="AVX-512" name="_mm512_maskz_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="zmm {z}, zmm, zmm" xed="VFNMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="zmm {z}, zmm, zmm" xed="VFNMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="zmm {z}, zmm, zmm" xed="VFNMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fnmsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="zmm {z}, zmm, zmm {er}" xed="VFNMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="zmm {z}, zmm, zmm {er}" xed="VFNMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="zmm {z}, zmm, zmm {er}" xed="VFNMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="zmm {z}, zmm, zmm" xed="VFNMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="zmm {z}, zmm, zmm" xed="VFNMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="zmm {z}, zmm, zmm" xed="VFNMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_fnmsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="const int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="zmm {z}, zmm, zmm {er}" xed="VFNMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="zmm {z}, zmm, zmm {er}" xed="VFNMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="zmm {z}, zmm, zmm {er}" xed="VFNMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
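+<!-- Illustrative only, not part of the upstream Intel data: fnmsub computes
+     -(a*b) - c, the element-wise negation of fmadd's a*b + c, with one
+     rounding. A hypothetical sketch negating an accumulated term under a
+     zeromask:
+
+     #include <immintrin.h>
+
+     __m512 neg_acc(__mmask16 k, __m512 a, __m512 b, __m512 acc) {
+         /* active lanes: -(a*b) - acc; inactive lanes: 0 */
+         return _mm512_maskz_fnmsub_ps(k, a, b, acc);
+     }
+-->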
+<intrinsic tech="AVX-512" name="_mm_fnmsub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SD" form="xmm, xmm, xmm {er}" xed="VFNMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213SD" form="xmm, xmm, xmm {er}" xed="VFNMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231SD" form="xmm, xmm, xmm {er}" xed="VFNMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmsub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmsub_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SD" form="xmm {k}, xmm, xmm" xed="VFNMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213SD" form="xmm {k}, xmm, xmm" xed="VFNMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231SD" form="xmm {k}, xmm, xmm" xed="VFNMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmsub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231SD" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmsub_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SD" form="xmm {k}, xmm, xmm" xed="VFNMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213SD" form="xmm {k}, xmm, xmm" xed="VFNMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231SD" form="xmm {k}, xmm, xmm" xed="VFNMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmsub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SD" form="xmm {z}, xmm, xmm {er}" xed="VFNMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213SD" form="xmm {z}, xmm, xmm {er}" xed="VFNMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231SD" form="xmm {z}, xmm, xmm {er}" xed="VFNMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmsub_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SD" form="xmm {z}, xmm, xmm" xed="VFNMSUB132SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB213SD" form="xmm {z}, xmm, xmm" xed="VFNMSUB213SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <instruction name="VFNMSUB231SD" form="xmm {z}, xmm, xmm" xed="VFNMSUB231SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
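+<!-- Illustrative only, not part of the upstream Intel data: a hypothetical
+     use of the explicit-rounding sd form; _MM_FROUND_CUR_DIRECTION keeps
+     the MXCSR rounding mode, matching the no-suffix intrinsic.
+
+     #include <immintrin.h>
+
+     __m128d fnmsub_cur(__m128d a, __m128d b, __m128d c) {
+         return _mm_fnmsub_round_sd(a, b, c, _MM_FROUND_CUR_DIRECTION);
+     }
+-->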
+<intrinsic tech="AVX-512" name="_mm_fnmsub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SS" form="xmm, xmm, xmm {er}" xed="VFNMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213SS" form="xmm, xmm, xmm {er}" xed="VFNMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231SS" form="xmm, xmm, xmm {er}" xed="VFNMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmsub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask3_fnmsub_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SS" form="xmm {k}, xmm, xmm" xed="VFNMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213SS" form="xmm {k}, xmm, xmm" xed="VFNMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231SS" form="xmm {k}, xmm, xmm" xed="VFNMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmsub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231SS" form="xmm {k}, xmm, xmm {er}" xed="VFNMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_fnmsub_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SS" form="xmm {k}, xmm, xmm" xed="VFNMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213SS" form="xmm {k}, xmm, xmm" xed="VFNMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231SS" form="xmm {k}, xmm, xmm" xed="VFNMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmsub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SS" form="xmm {z}, xmm, xmm {er}" xed="VFNMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213SS" form="xmm {z}, xmm, xmm {er}" xed="VFNMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231SS" form="xmm {z}, xmm, xmm {er}" xed="VFNMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_fnmsub_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SS" form="xmm {z}, xmm, xmm" xed="VFNMSUB132SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB213SS" form="xmm {z}, xmm, xmm" xed="VFNMSUB213SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <instruction name="VFNMSUB231SS" form="xmm {z}, xmm, xmm" xed="VFNMSUB231SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
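+<!-- Illustrative only, not part of the upstream Intel data: under the
+     default round-to-nearest mode, fnmsub is exactly the negation of fmadd.
+     A hypothetical lane-0 identity check using the unmasked FMA forms:
+
+     #include <immintrin.h>
+     #include <assert.h>
+
+     void check(__m128 a, __m128 b, __m128 c) {
+         float n = _mm_cvtss_f32(_mm_fnmsub_ss(a, b, c));
+         float p = _mm_cvtss_f32(_mm_fmadd_ss(a, b, c));
+         assert(n == -p || (n != n && p != p)); /* equal up to sign, or both NaN */
+     }
+-->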
+<intrinsic tech="AVX-512" name="_mm512_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale)
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="zmm, vm32y" xed="VGATHERDPD_ZMMf64_MASKmskw_MEMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale)
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="zmm {k}, vm32y" xed="VGATHERDPD_ZMMf64_MASKmskw_MEMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
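+<!-- Editorial usage sketch, not part of the Intel data file (AVX-512F assumed):
+     gather dst[j] = src[idx[j]] for 8 doubles. "scale" must be an immediate
+     1, 2, 4 or 8; scale 8 = sizeof(double) turns element indices into byte
+     offsets, matching the addr = base_addr + index*scale pseudocode above.
+#include <immintrin.h>
+__m512d gather8(const double *src, __m256i idx) {
+    return _mm512_i32gather_pd(idx, src, 8);
+}
+__m512d gather8_masked(__m512d fallback, __mmask8 k, const double *src, __m256i idx) {
+    /* lanes whose mask bit is clear keep the corresponding lane of "fallback" */
+    return _mm512_mask_i32gather_pd(fallback, k, idx, src, 8);
+}
+-->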
+<intrinsic tech="AVX-512" name="_mm512_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale)
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERQPD" form="zmm, vm32z" xed="VGATHERQPD_ZMMf64_MASKmskw_MEMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale)
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERQPD" form="zmm {k}, vm32z" xed="VGATHERQPD_ZMMf64_MASKmskw_MEMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
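+<!-- Editorial sketch, not part of the Intel data file (AVX-512F assumed): the
+     qword-indexed form is the same gather, but the 8 indices are 64-bit, so
+     element offsets need not fit in 32 bits.
+#include <immintrin.h>
+__m512d gather8_q(const double *src, __m512i idx64) {
+    return _mm512_i64gather_pd(idx64, src, 8);
+}
+-->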
+<intrinsic tech="AVX-512" name="_mm512_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale)
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERQPS" form="ymm, vm64z" xed="VGATHERQPS_YMMf32_MASKmskw_MEMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale)
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGATHERQPS" form="ymm {k}, vm64z" xed="VGATHERQPS_YMMf32_MASKmskw_MEMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
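+<!-- Editorial sketch, not part of the Intel data file (AVX-512F assumed): eight
+     64-bit indices select eight 32-bit floats, so the result narrows to a
+     __m256; scale 4 = sizeof(float) for element indices.
+#include <immintrin.h>
+__m256 gather8_ps_q(const float *src, __m512i idx64) {
+    return _mm512_i64gather_ps(idx64, src, 4);
+}
+-->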
+<intrinsic tech="AVX-512" name="_mm512_maskz_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="zmm {z}, zmm" xed="VGETEXPPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_getexp_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.
+ [sae_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="zmm {z}, zmm {sae}" xed="VGETEXPPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="zmm {z}, zmm" xed="VGETEXPPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_getexp_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.
+ [sae_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="zmm {z}, zmm {sae}" xed="VGETEXPPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
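+<!-- Editorial sketch, not part of the Intel data file (AVX-512F assumed): getexp
+     returns the exponent, essentially floor(log2(x)), as a float per lane;
+     e.g. 8.0 -> 3.0 and 0.3 -> -2.0. A zeromask of 0xFFFF keeps all 16 lanes.
+#include <immintrin.h>
+__m512 exponents(__m512 x) {
+    return _mm512_maskz_getexp_ps((__mmask16)0xFFFF, x);
+}
+-->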
+<intrinsic tech="AVX-512" name="_mm_getexp_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+ [sae_note]</description>
+ <operation>dst[63:0] := ConvertExpFP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSD" form="xmm, xmm, xmm {sae}" xed="VGETEXPSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getexp_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+ <operation>dst[63:0] := ConvertExpFP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSD" form="xmm, xmm, xmm" xed="VGETEXPSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getexp_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+ [sae_note]</description>
+ <operation>IF k[0]
+ dst[63:0] := ConvertExpFP64(b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSD" form="xmm {k}, xmm, xmm {sae}" xed="VGETEXPSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getexp_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+ <operation>IF k[0]
+ dst[63:0] := ConvertExpFP64(b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSD" form="xmm {k}, xmm, xmm" xed="VGETEXPSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getexp_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+ [sae_note]</description>
+ <operation>IF k[0]
+ dst[63:0] := ConvertExpFP64(b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSD" form="xmm {z}, xmm, xmm {sae}" xed="VGETEXPSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getexp_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+ <operation>IF k[0]
+ dst[63:0] := ConvertExpFP64(b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSD" form="xmm {z}, xmm, xmm" xed="VGETEXPSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getexp_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+ [sae_note]</description>
+ <operation>dst[31:0] := ConvertExpFP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSS" form="xmm, xmm, xmm {sae}" xed="VGETEXPSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getexp_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+ <operation>dst[31:0] := ConvertExpFP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSS" form="xmm, xmm, xmm" xed="VGETEXPSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getexp_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+ [sae_note]</description>
+ <operation>IF k[0]
+ dst[31:0] := ConvertExpFP32(b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSS" form="xmm {k}, xmm, xmm {sae}" xed="VGETEXPSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getexp_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+ <operation>IF k[0]
+ dst[31:0] := ConvertExpFP32(b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSS" form="xmm {k}, xmm, xmm" xed="VGETEXPSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getexp_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+ [sae_note]</description>
+ <operation>IF k[0]
+ dst[31:0] := ConvertExpFP32(b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSS" form="xmm {z}, xmm, xmm {sae}" xed="VGETEXPSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getexp_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+ <operation>IF k[0]
+ dst[31:0] := ConvertExpFP32(b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETEXPSS" form="xmm {z}, xmm, xmm" xed="VGETEXPSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
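+<!-- Editorial sketch, not part of the Intel data file (AVX-512F assumed): the
+     scalar form reads only lane 0 of "b" and passes lanes 1..3 of "a" through,
+     so passing the same vector twice gives a plain scalar getexp.
+#include <immintrin.h>
+float exp_of(float x) {
+    __m128 v = _mm_set_ss(x);
+    return _mm_cvtss_f32(_mm_getexp_ss(v, v));  /* e.g. 12.0f -> 3.0f */
+}
+-->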
+<intrinsic tech="AVX-512" name="_mm512_maskz_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="zmm {z}, zmm, imm8" xed="VGETMANTPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_getmant_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="zmm {z}, zmm, imm8 {sae}" xed="VGETMANTPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="zmm {z}, zmm, imm8" xed="VGETMANTPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_getmant_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="zmm {z}, zmm, imm8 {sae}" xed="VGETMANTPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
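+<!-- Editorial sketch, not part of the Intel data file (AVX-512F assumed): getmant
+     extracts a normalized significand; with the [1, 2) interval and the source
+     sign, 12.0f (1.5 * 2^3) yields 1.5f in every selected lane.
+#include <immintrin.h>
+__m512 mantissas(__m512 x, __mmask16 k) {
+    return _mm512_maskz_getmant_ps(k, x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+}
+-->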
+<intrinsic tech="AVX-512" name="_mm_getmant_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+ <operation>dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSD" form="xmm, xmm, xmm, imm8 {sae}" xed="VGETMANTSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getmant_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSD" form="xmm, xmm, xmm, imm8" xed="VGETMANTSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getmant_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+ <operation>IF k[0]
+ dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSD" form="xmm {k}, xmm, xmm, imm8 {sae}" xed="VGETMANTSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getmant_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>IF k[0]
+ dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSD" form="xmm {k}, xmm, xmm, imm8" xed="VGETMANTSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getmant_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+ <operation>IF k[0]
+ dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSD" form="xmm {z}, xmm, xmm, imm8 {sae}" xed="VGETMANTSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getmant_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>IF k[0]
+ dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSD" form="xmm {z}, xmm, xmm, imm8" xed="VGETMANTSD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getmant_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissa of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+ <operation>dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv)
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSS" form="xmm, xmm, xmm, imm8 {sae}" xed="VGETMANTSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_getmant_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissa of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv)
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSS" form="xmm, xmm, xmm, imm8" xed="VGETMANTSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getmant_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissa of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+ <operation>IF k[0]
+ dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv)
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSS" form="xmm {k}, xmm, xmm, imm8 {sae}" xed="VGETMANTSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_getmant_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissa of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>IF k[0]
+ dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv)
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSS" form="xmm {k}, xmm, xmm, imm8" xed="VGETMANTSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getmant_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissa of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+ <operation>IF k[0]
+ dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv)
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSS" form="xmm {z}, xmm, xmm, imm8 {sae}" xed="VGETMANTSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_getmant_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissa of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+ <operation>IF k[0]
+ dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv)
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGETMANTSS" form="xmm {z}, xmm, xmm, imm8" xed="VGETMANTSS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
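+<!-- Editorial sketch, not part of the Intel data file (AVX-512F assumed): a
+     finite normal x can be rebuilt from the two previous primitives as
+     x == getmant(x) * 2^getexp(x) (interval [1, 2), source sign).
+#include <immintrin.h>
+#include <math.h>
+float rebuild(float x) {
+    __m128 v = _mm_set_ss(x);
+    float m = _mm_cvtss_f32(_mm_getmant_ss(v, v, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src));
+    float e = _mm_cvtss_f32(_mm_getexp_ss(v, v));
+    return m * exp2f(e);  /* equals x for finite normal inputs */
+}
+-->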
+<intrinsic tech="AVX-512" name="_mm512_insertf32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+2: dst[383:256] := b[127:0]
+3: dst[511:384] := b[127:0]
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF32X4" form="zmm, zmm, xmm, imm8" xed="VINSERTF32X4_ZMMf32_MASKmskw_ZMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_insertf32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF32X4" form="zmm {k}, zmm, xmm, imm8" xed="VINSERTF32X4_ZMMf32_MASKmskw_ZMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_insertf32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF32X4" form="zmm {z}, zmm, xmm, imm8" xed="VINSERTF32X4_ZMMf32_MASKmskw_ZMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
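+<!-- Editorial sketch, not part of the Intel data file (AVX-512F assumed): imm8
+     selects which of the four 128-bit lanes of "a" is replaced by "b"; it must
+     be a compile-time constant 0..3.
+#include <immintrin.h>
+__m512 replace_top_lane(__m512 a, __m128 b) {
+    return _mm512_insertf32x4(a, b, 3);  /* overwrite bits 511:384 */
+}
+-->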
+<intrinsic tech="AVX-512" name="_mm512_insertf64x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: dst[255:0] := b[255:0]
+1: dst[511:256] := b[255:0]
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF64X4" form="zmm, zmm, ymm, imm8" xed="VINSERTF64X4_ZMMf64_MASKmskw_ZMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_insertf64x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF64X4" form="zmm {k}, zmm, ymm, imm8" xed="VINSERTF64X4_ZMMf64_MASKmskw_ZMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_insertf64x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTF64X4" form="zmm {z}, zmm, ymm, imm8" xed="VINSERTF64X4_ZMMf64_MASKmskw_ZMMf64_YMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
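+<!-- Editorial sketch, not part of the Intel data file (AVX-512F assumed): the
+     64x4 form has only two 256-bit positions, so imm8 contributes a single bit;
+     the masked variant additionally blends the result with "src" per element.
+#include <immintrin.h>
+__m512d replace_low_half(__m512d src, __mmask8 k, __m512d a, __m256d b) {
+    return _mm512_mask_insertf64x4(src, k, a, b, 0);  /* bits 255:0 from b */
+}
+-->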
+<intrinsic tech="AVX-512" name="_mm512_inserti32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+2: dst[383:256] := b[127:0]
+3: dst[511:384] := b[127:0]
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI32X4" form="zmm, zmm, xmm, imm8" xed="VINSERTI32X4_ZMMu32_MASKmskw_ZMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_inserti32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI32X4" form="zmm {k}, zmm, xmm, imm8" xed="VINSERTI32X4_ZMMu32_MASKmskw_ZMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_inserti32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI32X4" form="zmm {z}, zmm, xmm, imm8" xed="VINSERTI32X4_ZMMu32_MASKmskw_ZMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
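+<!-- Sketch contrasting the write-masked and zero-masked forms above
+     (illustrative only; all names invented). Lanes with a clear k bit come
+     from src in the mask form and become 0 in the maskz form.
+#include <immintrin.h>
+
+static void demo_masked_inserti32x4(__m512i src, __mmask16 k, __m512i a,
+                                    __m128i b, __m512i *merged,
+                                    __m512i *zeroed) {
+    *merged = _mm512_mask_inserti32x4(src, k, a, b, 0);
+    *zeroed = _mm512_maskz_inserti32x4(k, a, b, 0);
+}
+-->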
+<intrinsic tech="AVX-512" name="_mm512_inserti64x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: dst[255:0] := b[255:0]
+1: dst[511:256] := b[255:0]
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI64X4" form="zmm, zmm, ymm, imm8" xed="VINSERTI64X4_ZMMu64_MASKmskw_ZMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_inserti64x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI64X4" form="zmm {k}, zmm, ymm, imm8" xed="VINSERTI64X4_ZMMu64_MASKmskw_ZMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_inserti64x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VINSERTI64X4" form="zmm {z}, zmm, ymm, imm8" xed="VINSERTI64X4_ZMMu64_MASKmskw_ZMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
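+<!-- Illustrative sketch for the 64x4 insert family (helper name invented):
+     imm8 is a single bit choosing the lower or upper 256-bit half.
+#include <immintrin.h>
+
+static __m512i demo_mask_inserti64x4(__m512i src, __mmask8 k, __m512i a,
+                                     __m256i b) {
+    /* put b in bits 255:0; lanes where k[j] == 0 keep the values from src */
+    return _mm512_mask_inserti64x4(src, k, a, b, 0);
+}
+-->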
+<intrinsic tech="AVX-512" name="_mm512_mask_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPD" form="zmm {k}, zmm, zmm" xed="VMAXPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_max_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPD" form="zmm {k}, zmm, zmm {sae}" xed="VMAXPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPD" form="zmm {z}, zmm, zmm" xed="VMAXPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPD" form="zmm {z}, zmm, zmm {sae}" xed="VMAXPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPD" form="zmm, zmm, zmm" xed="VMAXPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPD" form="zmm, zmm, zmm {sae}" xed="VMAXPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
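+<!-- Sketch of the VMAXPD family above (illustrative; helper name invented).
+     The _round_ forms take an SAE operand such as _MM_FROUND_NO_EXC, which
+     suppresses floating-point exceptions without changing the result.
+#include <immintrin.h>
+
+static __m512d demo_max_pd(__m512d src, __mmask8 k, __m512d a, __m512d b) {
+    __m512d quiet = _mm512_max_round_pd(a, b, _MM_FROUND_NO_EXC);
+    return _mm512_mask_max_pd(src, k, quiet, b); /* src survives where k[j]==0 */
+}
+-->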
+<intrinsic tech="AVX-512" name="_mm512_mask_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPS" form="zmm {k}, zmm, zmm" xed="VMAXPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_max_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPS" form="zmm {k}, zmm, zmm {sae}" xed="VMAXPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPS" form="zmm {z}, zmm, zmm" xed="VMAXPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPS" form="zmm {z}, zmm, zmm {sae}" xed="VMAXPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPS" form="zmm, zmm, zmm" xed="VMAXPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMAXPS" form="zmm, zmm, zmm {sae}" xed="VMAXPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
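+<!-- Single-precision counterpart sketch (illustrative; name invented):
+     sixteen FP32 lanes, so the mask widens to __mmask16.
+#include <immintrin.h>
+
+static __m512 demo_maskz_max_ps(__mmask16 k, __m512 a, __m512 b) {
+    /* lanes with a clear k bit come back as 0.0f */
+    return _mm512_maskz_max_ps(k, a, b);
+}
+-->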
+<intrinsic tech="AVX-512" name="_mm_mask_max_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MAX(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSD" form="xmm {k}, xmm, xmm {sae}" xed="VMAXSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MAX(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSD" form="xmm {k}, xmm, xmm" xed="VMAXSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MAX(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSD" form="xmm {z}, xmm, xmm {sae}" xed="VMAXSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MAX(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSD" form="xmm {z}, xmm, xmm" xed="VMAXSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_max_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note]</description>
+ <operation>
+dst[63:0] := MAX(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSD" form="xmm, xmm, xmm {sae}" xed="VMAXSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MAX(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSS" form="xmm {k}, xmm, xmm {sae}" xed="VMAXSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_max_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MAX(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSS" form="xmm {k}, xmm, xmm" xed="VMAXSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MAX(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSS" form="xmm {z}, xmm, xmm {sae}" xed="VMAXSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_max_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MAX(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSS" form="xmm {z}, xmm, xmm" xed="VMAXSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_max_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note]</description>
+ <operation>
+dst[31:0] := MAX(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMAXSS" form="xmm, xmm, xmm {sae}" xed="VMAXSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
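+<!-- Scalar sketch (illustrative; name invented): only element 0 is computed,
+     and bits 127:64 (sd) or 127:32 (ss) pass through from "a" unchanged.
+#include <immintrin.h>
+
+static __m128d demo_mask_max_sd(__m128d src, __mmask8 k, __m128d a, __m128d b) {
+    /* lower lane: max(a0, b0) if k[0] is set, else src0; upper lane from a */
+    return _mm_mask_max_sd(src, k, a, b);
+}
+-->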
+<intrinsic tech="AVX-512" name="_mm512_mask_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPD" form="zmm {k}, zmm, zmm" xed="VMINPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_min_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPD" form="zmm {k}, zmm, zmm {sae}" xed="VMINPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPD" form="zmm {z}, zmm, zmm" xed="VMINPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPD" form="zmm {z}, zmm, zmm {sae}" xed="VMINPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPD" form="zmm, zmm, zmm" xed="VMINPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPD" form="zmm, zmm, zmm {sae}" xed="VMINPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
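+<!-- Illustrative VMINPD sketch (name invented): identical masking rules to
+     the max family, with MIN applied per 64-bit lane.
+#include <immintrin.h>
+
+static __m512d demo_maskz_min_pd(__mmask8 k, __m512d a, __m512d b) {
+    return _mm512_maskz_min_round_pd(k, a, b, _MM_FROUND_NO_EXC);
+}
+-->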
+<intrinsic tech="AVX-512" name="_mm512_mask_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPS" form="zmm {k}, zmm, zmm" xed="VMINPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_min_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPS" form="zmm {k}, zmm, zmm {sae}" xed="VMINPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPS" form="zmm {z}, zmm, zmm" xed="VMINPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPS" form="zmm {z}, zmm, zmm {sae}" xed="VMINPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPS" form="zmm, zmm, zmm" xed="VMINPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMINPS" form="zmm, zmm, zmm {sae}" xed="VMINPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
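+<!-- Illustrative FP32 idiom built on the unmasked forms above (names
+     invented): a per-lane clamp computed as min(max(x, lo), hi).
+#include <immintrin.h>
+
+static __m512 demo_clamp_ps(__m512 x, __m512 lo, __m512 hi) {
+    return _mm512_min_ps(_mm512_max_ps(x, lo), hi);
+}
+-->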
+<intrinsic tech="AVX-512" name="_mm_mask_min_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MIN(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSD" form="xmm {k}, xmm, xmm {sae}" xed="VMINSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MIN(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSD" form="xmm {k}, xmm, xmm" xed="VMINSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MIN(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSD" form="xmm {z}, xmm, xmm {sae}" xed="VMINSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MIN(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSD" form="xmm {z}, xmm, xmm" xed="VMINSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_min_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note]</description>
+ <operation>
+dst[63:0] := MIN(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSD" form="xmm, xmm, xmm {sae}" xed="VMINSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MIN(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSS" form="xmm {k}, xmm, xmm {sae}" xed="VMINSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_min_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MIN(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSS" form="xmm {k}, xmm, xmm" xed="VMINSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MIN(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSS" form="xmm {z}, xmm, xmm {sae}" xed="VMINSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_min_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MIN(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSS" form="xmm {z}, xmm, xmm" xed="VMINSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_min_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note]</description>
+ <operation>
+dst[31:0] := MIN(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMINSS" form="xmm, xmm, xmm {sae}" xed="VMINSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
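+<!-- Scalar min sketch (illustrative; name invented): zero-masked lower lane,
+     upper three elements copied from "a" as the pseudocode specifies.
+#include <immintrin.h>
+
+static __m128 demo_maskz_min_ss(__mmask8 k, __m128 a, __m128 b) {
+    return _mm_maskz_min_ss(k, a, b);
+}
+-->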
+<intrinsic tech="AVX-512" name="_mm512_maskz_load_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="zmm {z}, m512" xed="VMOVAPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mov_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="zmm {z}, zmm" xed="VMOVAPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_load_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="zmm {z}, m512" xed="VMOVAPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mov_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="zmm {z}, zmm" xed="VMOVAPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
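+<!-- Load/move sketch (illustrative; all names invented). The maskz_load
+     forms require a 64-byte-aligned address, while maskz_mov operates purely
+     on registers.
+#include <immintrin.h>
+
+static __m512 demo_even_lanes(const float *p64 /* 64-byte aligned */,
+                              __m512 a) {
+    __mmask16 even = 0x5555;                           /* bits 0,2,4,... */
+    __m512 from_mem = _mm512_maskz_load_ps(even, p64);
+    __m512 from_reg = _mm512_maskz_mov_ps(even, a);
+    return _mm512_add_ps(from_mem, from_reg);
+}
+-->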
+<intrinsic tech="AVX-512" name="_mm512_mask_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+tmp[191:128] := a[191:128]
+tmp[255:192] := a[191:128]
+tmp[319:256] := a[319:256]
+tmp[383:320] := a[319:256]
+tmp[447:384] := a[447:384]
+tmp[511:448] := a[447:384]
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDDUP" form="zmm {k}, zmm" xed="VMOVDDUP_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+tmp[191:128] := a[191:128]
+tmp[255:192] := a[191:128]
+tmp[319:256] := a[319:256]
+tmp[383:320] := a[319:256]
+tmp[447:384] := a[447:384]
+tmp[511:448] := a[447:384]
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDDUP" form="zmm {z}, zmm" xed="VMOVDDUP_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := a[63:0]
+dst[191:128] := a[191:128]
+dst[255:192] := a[191:128]
+dst[319:256] := a[319:256]
+dst[383:320] := a[319:256]
+dst[447:384] := a[447:384]
+dst[511:448] := a[447:384]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDDUP" form="zmm, zmm" xed="VMOVDDUP_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
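+<!-- VMOVDDUP sketch (illustrative; name invented): each even-indexed FP64
+     element is duplicated into the odd slot above it, so the lanes become
+     a0,a0,a2,a2,a4,a4,a6,a6.
+#include <immintrin.h>
+
+static __m512d demo_movedup_pd(__m512d a) {
+    return _mm512_movedup_pd(a);
+}
+-->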
+<intrinsic tech="AVX-512" name="_mm512_maskz_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="zmm {z}, m512" xed="VMOVDQA32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mov_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="zmm {z}, zmm" xed="VMOVDQA32_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="zmm {z}, m512" xed="VMOVDQA64_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mov_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="zmm {z}, zmm" xed="VMOVDQA64_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
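+<!--
+  Editor's sketch (not upstream data): zero-masking moves and aligned
+  masked loads, assuming AVX-512F; the names and the 0x0F mask are
+  illustrative.
+
+  #include <immintrin.h>
+
+  /* Keep only the lanes selected by the mask, zeroing the rest. */
+  static inline __m512i keep_low_four_qwords(__m512i a) {
+      return _mm512_maskz_mov_epi64((__mmask8)0x0F, a);
+  }
+
+  /* Masked aligned load: src must be 64-byte aligned, as noted above. */
+  static inline __m512i load_selected_dwords(const void *src, __mmask16 k) {
+      return _mm512_maskz_load_epi32(k, src);
+  }
+-->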
+<intrinsic tech="AVX-512" name="_mm512_loadu_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load 512-bits of integer data from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="zmm, m512" xed="VMOVDQU32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="zmm {k}, m512" xed="VMOVDQU32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Store packed 32-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU32" form="m512 {k}, zmm" xed="VMOVDQU32_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_loadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU32" form="zmm {z}, m512" xed="VMOVDQU32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_storeu_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="M512" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <description>Store 512-bits of integer data from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVDQU32" form="m512, zmm" xed="VMOVDQU32_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
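+<!--
+  Editor's sketch (not upstream data): the unaligned 512-bit load/store
+  pair used as a simple 64-byte copy; assuming AVX-512F, with no
+  alignment requirement on either pointer.
+
+  #include <immintrin.h>
+
+  static inline void copy_64_bytes(void *dst, const void *src) {
+      __m512i v = _mm512_loadu_si512(src);
+      _mm512_storeu_si512(dst, v);
+  }
+-->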
+<intrinsic tech="AVX-512" name="_mm512_mask_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="zmm {k}, m512" xed="VMOVDQU64_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_storeu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Store packed 64-bit integers from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQU64" form="m512 {k}, zmm" xed="VMOVDQU64_MEMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_loadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQU64" form="zmm {z}, m512" xed="VMOVDQU64_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_stream_load_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="void const*" varname="mem_addr" etype="M512" memwidth="512"/>
+ <description>Load 512-bits of integer data from memory into "dst" using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVNTDQA" form="zmm, m512" xed="VMOVNTDQA_ZMMu32_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_stream_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="M512" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <description>Store 512-bits of integer data from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVNTDQ" form="m512, zmm" xed="VMOVNTDQ_MEMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_stream_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVNTPD" form="m512, zmm" xed="VMOVNTPD_MEMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_stream_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVNTPS" form="m512, zmm" xed="VMOVNTPS_MEMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
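+<!--
+  Editor's sketch (not upstream data): filling a large buffer with
+  non-temporal stores so the data bypasses the cache; assumes AVX-512F,
+  a 64-byte-aligned destination, and is written for lengths that are a
+  multiple of 16 floats. An _mm_sfence() is the conventional fence
+  after a run of streaming stores.
+
+  #include <immintrin.h>
+  #include <stddef.h>
+
+  static void fill_nt(float *dst, float value, size_t n) {
+      __m512 v = _mm512_set1_ps(value);
+      for (size_t i = 0; i + 16 <= n; i += 16)
+          _mm512_stream_ps(dst + i, v);   /* dst stays 64-byte aligned */
+      _mm_sfence();                       /* order the streaming stores */
+  }
+-->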
+<intrinsic tech="AVX-512" name="_mm_mask_load_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="const double*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MEM[mem_addr+63:mem_addr]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VMOVSD" form="xmm {k}, m64" xed="VMOVSD_XMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_move_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVSD" form="xmm {k}, xmm, xmm" xed="VMOVSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_store_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store the lower double-precision (64-bit) floating-point element from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+IF k[0]
+ MEM[mem_addr+63:mem_addr] := a[63:0]
+FI
+ </operation>
+ <instruction name="VMOVSD" form="m64 {k}, xmm" xed="VMOVSD_MEMf64_MASKmskw_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_load_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="const double*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := MEM[mem_addr+63:mem_addr]
+ELSE
+ dst[63:0] := 0
+FI
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VMOVSD" form="xmm {z}, m64" xed="VMOVSD_XMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_move_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVSD" form="xmm {z}, xmm, xmm" xed="VMOVSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
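+<!--
+  Editor's sketch (not upstream data): a branch-free conditional scalar
+  merge built on _mm_mask_move_sd; assuming AVX-512F, with illustrative
+  names. Bit 0 of the mask decides whether the low lane comes from "b"
+  or from "src"; the upper lane always comes from "a".
+
+  #include <immintrin.h>
+
+  static inline __m128d select_low(__m128d src, __m128d a, __m128d b,
+                                   int take_b) {
+      return _mm_mask_move_sd(src, (__mmask8)(take_b & 1), a, b);
+  }
+-->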
+<intrinsic tech="AVX-512" name="_mm512_mask_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[63:32]
+tmp[63:32] := a[63:32]
+tmp[95:64] := a[127:96]
+tmp[127:96] := a[127:96]
+tmp[159:128] := a[191:160]
+tmp[191:160] := a[191:160]
+tmp[223:192] := a[255:224]
+tmp[255:224] := a[255:224]
+tmp[287:256] := a[319:288]
+tmp[319:288] := a[319:288]
+tmp[351:320] := a[383:352]
+tmp[383:352] := a[383:352]
+tmp[415:384] := a[447:416]
+tmp[447:416] := a[447:416]
+tmp[479:448] := a[511:480]
+tmp[511:480] := a[511:480]
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVSHDUP" form="zmm {k}, zmm" xed="VMOVSHDUP_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[63:32]
+tmp[63:32] := a[63:32]
+tmp[95:64] := a[127:96]
+tmp[127:96] := a[127:96]
+tmp[159:128] := a[191:160]
+tmp[191:160] := a[191:160]
+tmp[223:192] := a[255:224]
+tmp[255:224] := a[255:224]
+tmp[287:256] := a[319:288]
+tmp[319:288] := a[319:288]
+tmp[351:320] := a[383:352]
+tmp[383:352] := a[383:352]
+tmp[415:384] := a[447:416]
+tmp[447:416] := a[447:416]
+tmp[479:448] := a[511:480]
+tmp[511:480] := a[511:480]
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVSHDUP" form="zmm {z}, zmm" xed="VMOVSHDUP_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32]
+dst[63:32] := a[63:32]
+dst[95:64] := a[127:96]
+dst[127:96] := a[127:96]
+dst[159:128] := a[191:160]
+dst[191:160] := a[191:160]
+dst[223:192] := a[255:224]
+dst[255:224] := a[255:224]
+dst[287:256] := a[319:288]
+dst[319:288] := a[319:288]
+dst[351:320] := a[383:352]
+dst[383:352] := a[383:352]
+dst[415:384] := a[447:416]
+dst[447:416] := a[447:416]
+dst[479:448] := a[511:480]
+dst[511:480] := a[511:480]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVSHDUP" form="zmm, zmm" xed="VMOVSHDUP_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[31:0]
+tmp[63:32] := a[31:0]
+tmp[95:64] := a[95:64]
+tmp[127:96] := a[95:64]
+tmp[159:128] := a[159:128]
+tmp[191:160] := a[159:128]
+tmp[223:192] := a[223:192]
+tmp[255:224] := a[223:192]
+tmp[287:256] := a[287:256]
+tmp[319:288] := a[287:256]
+tmp[351:320] := a[351:320]
+tmp[383:352] := a[351:320]
+tmp[415:384] := a[415:384]
+tmp[447:416] := a[415:384]
+tmp[479:448] := a[479:448]
+tmp[511:480] := a[479:448]
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVSLDUP" form="zmm {k}, zmm" xed="VMOVSLDUP_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp[31:0] := a[31:0]
+tmp[63:32] := a[31:0]
+tmp[95:64] := a[95:64]
+tmp[127:96] := a[95:64]
+tmp[159:128] := a[159:128]
+tmp[191:160] := a[159:128]
+tmp[223:192] := a[223:192]
+tmp[255:224] := a[223:192]
+tmp[287:256] := a[287:256]
+tmp[319:288] := a[287:256]
+tmp[351:320] := a[351:320]
+tmp[383:352] := a[351:320]
+tmp[415:384] := a[415:384]
+tmp[447:416] := a[415:384]
+tmp[479:448] := a[479:448]
+tmp[511:480] := a[479:448]
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVSLDUP" form="zmm {z}, zmm" xed="VMOVSLDUP_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := a[31:0]
+dst[95:64] := a[95:64]
+dst[127:96] := a[95:64]
+dst[159:128] := a[159:128]
+dst[191:160] := a[159:128]
+dst[223:192] := a[223:192]
+dst[255:224] := a[223:192]
+dst[287:256] := a[287:256]
+dst[319:288] := a[287:256]
+dst[351:320] := a[351:320]
+dst[383:352] := a[351:320]
+dst[415:384] := a[415:384]
+dst[447:416] := a[415:384]
+dst[479:448] := a[479:448]
+dst[511:480] := a[479:448]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVSLDUP" form="zmm, zmm" xed="VMOVSLDUP_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
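+<!--
+  Editor's sketch (not upstream data): the classic use of moveldup and
+  movehdup together is interleaved complex multiplication. Assuming
+  AVX-512F and vectors of eight (re, im) float pairs; names are
+  illustrative.
+
+  #include <immintrin.h>
+
+  static inline __m512 complex_mul(__m512 a, __m512 b) {
+      __m512 ar = _mm512_moveldup_ps(a);          /* (ar, ar) per pair */
+      __m512 ai = _mm512_movehdup_ps(a);          /* (ai, ai) per pair */
+      __m512 bswap = _mm512_permute_ps(b, 0xB1);  /* (bi, br) per pair */
+      /* even lanes: ar*br - ai*bi, odd lanes: ar*bi + ai*br */
+      return _mm512_fmaddsub_ps(ar, b, _mm512_mul_ps(ai, bswap));
+  }
+-->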
+<intrinsic tech="AVX-512" name="_mm_mask_load_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="const float*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <description>Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MEM[mem_addr+31:mem_addr]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VMOVSS" form="xmm {k}, m32" xed="VMOVSS_XMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_move_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVSS" form="xmm {k}, xmm, xmm" xed="VMOVSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_store_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store the lower single-precision (32-bit) floating-point element from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+IF k[0]
+ MEM[mem_addr+31:mem_addr] := a[31:0]
+FI
+ </operation>
+ <instruction name="VMOVSS" form="m32 {k}, xmm" xed="VMOVSS_MEMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_load_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="const float*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <description>Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := MEM[mem_addr+31:mem_addr]
+ELSE
+ dst[31:0] := 0
+FI
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VMOVSS" form="xmm {z}, m32" xed="VMOVSS_XMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_move_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMOVSS" form="xmm {z}, xmm, xmm" xed="VMOVSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <description>Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVUPD" form="zmm, m512" xed="VMOVUPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVUPD" form="zmm {k}, m512" xed="VMOVUPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_storeu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVUPD" form="m512 {k}, zmm" xed="VMOVUPD_MEMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVUPD" form="zmm {z}, m512" xed="VMOVUPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_storeu_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVUPD" form="m512, zmm" xed="VMOVUPD_MEMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <description>Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVUPS" form="zmm, m512" xed="VMOVUPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVUPS" form="zmm {k}, m512" xed="VMOVUPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_storeu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVUPS" form="m512 {k}, zmm" xed="VMOVUPS_MEMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVUPS" form="zmm {z}, m512" xed="VMOVUPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_storeu_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVUPS" form="m512, zmm" xed="VMOVUPS_MEMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
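+<!--
+  Editor's sketch (not upstream data): the masked unaligned forms above
+  are the standard way to handle a loop remainder without scalar code.
+  Assuming AVX-512F and n < 16; the helper name is illustrative.
+
+  #include <immintrin.h>
+  #include <stddef.h>
+
+  /* dst[i] = 2*src[i] for the n leftover floats, in one masked step. */
+  static void scale_tail(float *dst, const float *src, size_t n) {
+      __mmask16 m = (__mmask16)((1u << n) - 1);  /* low n bits set, n < 16 */
+      __m512 v = _mm512_maskz_loadu_ps(m, src);
+      _mm512_mask_storeu_ps(dst, m, _mm512_add_ps(v, v));
+  }
+-->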
+<intrinsic tech="AVX-512" name="_mm512_maskz_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPD" form="zmm {z}, zmm, zmm" xed="VMULPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mul_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPD" form="zmm {z}, zmm, zmm {er}" xed="VMULPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPS" form="zmm {z}, zmm, zmm" xed="VMULPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mul_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPS" form="zmm {z}, zmm, zmm {er}" xed="VMULPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mul_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] * b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSD" form="xmm {k}, xmm, xmm {er}" xed="VMULSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mul_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] * b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSD" form="xmm {k}, xmm, xmm" xed="VMULSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mul_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] * b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSD" form="xmm {z}, xmm, xmm {er}" xed="VMULSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mul_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] * b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSD" form="xmm {z}, xmm, xmm" xed="VMULSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mul_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := a[63:0] * b[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSD" form="xmm, xmm, xmm {er}" xed="VMULSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
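+<!--
+  Editor's sketch (not upstream data): the {er} forms take a rounding
+  mode per call instead of reading MXCSR. Assuming AVX-512F; the
+  rounding argument must be a compile-time immediate, and a directed
+  rounding mode is combined with _MM_FROUND_NO_EXC (SAE).
+
+  #include <immintrin.h>
+
+  static inline __m128d mul_low_round_down(__m128d a, __m128d b) {
+      return _mm_mul_round_sd(a, b,
+             _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  }
+-->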
+<intrinsic tech="AVX-512" name="_mm_mask_mul_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] * b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSS" form="xmm {k}, xmm, xmm {er}" xed="VMULSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_mul_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] * b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSS" form="xmm {k}, xmm, xmm" xed="VMULSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mul_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] * b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSS" form="xmm {z}, xmm, xmm {er}" xed="VMULSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_mul_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] * b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSS" form="xmm {z}, xmm, xmm" xed="VMULSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mul_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := a[31:0] * b[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VMULSS" form="xmm, xmm, xmm {er}" xed="VMULSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_abs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ABS(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSD" form="zmm, zmm" xed="VPABSD_ZMMi32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_abs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ABS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSD" form="zmm {k}, zmm" xed="VPABSD_ZMMi32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_abs_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ABS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSD" form="zmm {z}, zmm" xed="VPABSD_ZMMi32_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
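+<!--
+  Editor's sketch (not upstream data): VPABSD via the plain and masked
+  intrinsics above; assuming AVX-512F, with illustrative names. Note
+  that ABS(INT32_MIN) wraps back to INT32_MIN, as in the scalar case.
+
+  #include <immintrin.h>
+
+  static inline __m512i abs_all(__m512i a) {
+      return _mm512_abs_epi32(a);
+  }
+
+  /* Take |a| only where the mask bit is set, zeroing the other lanes. */
+  static inline __m512i abs_selected(__mmask16 k, __m512i a) {
+      return _mm512_maskz_abs_epi32(k, a);
+  }
+-->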
+<intrinsic tech="AVX-512" name="_mm512_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ABS(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSQ" form="zmm, zmm" xed="VPABSQ_ZMMi64_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ABS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSQ" form="zmm {k}, zmm" xed="VPABSQ_ZMMi64_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_abs_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ABS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPABSQ" form="zmm {z}, zmm" xed="VPABSQ_ZMMi64_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDD" form="zmm {z}, zmm, zmm" xed="VPADDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDQ" form="zmm, zmm, zmm" xed="VPADDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDQ" form="zmm {k}, zmm, zmm" xed="VPADDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDQ" form="zmm {z}, zmm, zmm" xed="VPADDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDD" form="zmm {z}, zmm, zmm" xed="VPANDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_andnot_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDND" form="zmm {z}, zmm, zmm" xed="VPANDND_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_andnot_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDNQ" form="zmm {z}, zmm, zmm" xed="VPANDNQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDQ" form="zmm {z}, zmm, zmm" xed="VPANDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
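+<!-- Usage sketch (non-normative, assuming AVX-512F): note that the andnot forms
+     compute (NOT a) AND b, so the complement applies to the first operand.
+     __m512i mask_lo = _mm512_set1_epi64(0xFF);
+     __m512i x       = _mm512_set1_epi64(0x1234);
+     __m512i lo = _mm512_maskz_and_epi64(0xFF, x, mask_lo);    // 0x34 in each lane
+     __m512i hi = _mm512_maskz_andnot_epi64(0xFF, mask_lo, x); // 0x1200 in each lane
+-->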
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set1_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTB" form="zmm, r8" xed="VPBROADCASTB_ZMMu8_MASKmskw_GPR32u8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="zmm, xmm" xed="VPBROADCASTD_ZMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="zmm {k}, xmm" xed="VPBROADCASTD_ZMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_set1_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="zmm {k}, r32" xed="VPBROADCASTD_ZMMu32_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcastd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="zmm {z}, xmm" xed="VPBROADCASTD_ZMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_set1_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[31:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="zmm {z}, r32" xed="VPBROADCASTD_ZMMu32_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_set1_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTD" form="zmm, r32" xed="VPBROADCASTD_ZMMu32_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
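+<!-- Usage sketch (non-normative, assuming AVX-512F): set1 broadcasts from a GPR
+     while broadcastd broadcasts lane 0 of an XMM register; both fill all 16 lanes.
+     __m512i c1 = _mm512_set1_epi32(42);
+     __m512i c2 = _mm512_broadcastd_epi32(_mm_set1_epi32(42));
+     // c1 and c2 hold the same value in every 32-bit lane.
+-->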
+<intrinsic tech="AVX-512" name="_mm512_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="zmm, xmm" xed="VPBROADCASTQ_ZMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="zmm {k}, xmm" xed="VPBROADCASTQ_ZMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_set1_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="zmm {k}, r64" xed="VPBROADCASTQ_ZMMu64_MASKmskw_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_broadcastq_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="zmm {z}, xmm" xed="VPBROADCASTQ_ZMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_set1_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[63:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="zmm {z}, r64" xed="VPBROADCASTQ_ZMMu64_MASKmskw_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_set1_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTQ" form="zmm, r64" xed="VPBROADCASTQ_ZMMu64_MASKmskw_GPR64u64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set1_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+ <description>Broadcast 16-bit integer "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBROADCASTW" form="zmm, r16" xed="VPBROADCASTW_ZMMu16_MASKmskw_GPR32u16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmplt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmplt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
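+<!-- Usage sketch (non-normative, assuming AVX-512F): the k1 operand chains
+     compares, yielding k = k1 AND (a < b) lane-wise.
+     __m512i a = _mm512_set1_epi32(-3), zero = _mm512_setzero_si512();
+     __mmask16 odd  = 0x5555;                                     // every other lane
+     __mmask16 negs = _mm512_mask_cmplt_epi32_mask(odd, a, zero); // = 0x5555 here
+-->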
+<intrinsic tech="AVX-512" name="_mm512_cmp_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
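+<!-- Usage sketch (non-normative, assuming AVX-512F): the named cmpeq/cmplt/cmple
+     forms are this intrinsic with a fixed _MM_CMPINT predicate.
+     __m512i a = _mm512_set1_epi64(1), b = _mm512_set1_epi64(2);
+     __mmask8 lt = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT); // 0xFF: 1 < 2 in all lanes
+-->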
+<intrinsic tech="AVX-512" name="_mm512_cmpeq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPEQQ" form="k, zmm, zmm" xed="VPCMPEQQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpge_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpgt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPGTQ" form="k, zmm, zmm" xed="VPCMPGTQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmple_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmplt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpneq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmp_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpeq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPEQQ" form="k {k}, zmm, zmm" xed="VPCMPEQQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpge_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpgt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPGTQ" form="k {k}, zmm, zmm" xed="VPCMPGTQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmple_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmplt_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpneq_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPQ_MASKmskw_MASKmskw_ZMMi64_ZMMi64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmp_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpeq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpge_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpgt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmple_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmplt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cmpneq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
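+<!-- Usage sketch (non-normative, assuming AVX-512F): the epu compares are
+     unsigned, so lanes with the top bit set order differently than with epi.
+     __m512i a = _mm512_set1_epi64(-1), b = _mm512_setzero_si512();
+     __mmask8 s = _mm512_cmplt_epi64_mask(a, b); // 0xFF: -1 < 0 signed
+     __mmask8 u = _mm512_cmplt_epu64_mask(a, b); // 0x00: 0xFFFF... is largest unsigned
+-->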
+<intrinsic tech="AVX-512" name="_mm512_mask_cmp_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpeq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpge_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpgt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmple_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmplt_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cmpneq_epu64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPCMPUQ" form="k {k}, zmm, zmm, imm8" xed="VPCMPUQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_compress_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCOMPRESSD" form="zmm {k}, zmm" xed="VPCOMPRESSD_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_compressstoreu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 32
+m := base_addr
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ MEM[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSD" form="m32 {k}, zmm" xed="VPCOMPRESSD_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
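+<!-- Usage sketch (non-normative, assuming AVX-512F): compress-store is a one-pass
+     filter that packs the selected lanes contiguously into memory.
+     int out[16];
+     __m512i v   = _mm512_set1_epi32(7);
+     __mmask16 k = _mm512_cmpgt_epi32_mask(v, _mm512_setzero_si512());
+     _mm512_mask_compressstoreu_epi32(out, k, v); // writes popcount(k) ints
+-->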
+<intrinsic tech="AVX-512" name="_mm512_maskz_compress_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 32
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[m+size-1:m] := a[i+31:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCOMPRESSD" form="zmm {z}, zmm" xed="VPCOMPRESSD_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_compress_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="zmm {k}, zmm" xed="VPCOMPRESSQ_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_compressstoreu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 64
+m := base_addr
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ MEM[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="m64 {k}, zmm" xed="VPCOMPRESSQ_MEMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_compress_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 64
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[m+size-1:m] := a[i+63:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCOMPRESSQ" form="zmm {z}, zmm" xed="VPCOMPRESSQ_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutexvar_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ id := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMD" form="zmm {k}, zmm, zmm" xed="VPERMD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutexvar_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ id := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMD" form="zmm {z}, zmm, zmm" xed="VPERMD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutexvar_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ id := idx[i+3:i]*32
+ dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMD" form="zmm, zmm, zmm" xed="VPERMD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
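+<!-- Usage sketch (non-normative, assuming AVX-512F): each idx lane selects a
+     source lane, so a descending index vector reverses the register.
+     __m512i v   = _mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+     __m512i idx = _mm512_set_epi32(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+     __m512i rev = _mm512_permutexvar_epi32(idx, v); // lanes now hold 15 down to 0
+-->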
+<intrinsic tech="AVX-512" name="_mm512_mask2_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ off := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := idx[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="zmm {k}, zmm, zmm" xed="VPERMI2D_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ off := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMT2D" form="zmm {k}, zmm, zmm" xed="VPERMT2D_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ off := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="zmm {z}, zmm, zmm" xed="VPERMI2D_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <instruction name="VPERMT2D" form="zmm {z}, zmm, zmm" xed="VPERMT2D_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutex2var_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ off := idx[i+3:i]*32
+ dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2D" form="zmm, zmm, zmm" xed="VPERMI2D_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <instruction name="VPERMT2D" form="zmm, zmm, zmm" xed="VPERMT2D_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
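+<!-- Usage sketch (non-normative, assuming AVX-512F): bit 4 of each index selects
+     between the two sources, so indices 0..31 address a virtual 32-lane table.
+     __m512i lo  = _mm512_set1_epi32(0), hi = _mm512_set1_epi32(1);
+     __m512i idx = _mm512_set_epi32(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16);
+     __m512i r   = _mm512_permutex2var_epi32(lo, idx, hi); // every lane taken from hi
+-->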
+<intrinsic tech="AVX-512" name="_mm512_mask2_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ off := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := idx[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="zmm {k}, zmm, zmm" xed="VPERMI2PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ off := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMT2PD" form="zmm {k}, zmm, zmm" xed="VPERMT2PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ off := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="zmm {z}, zmm, zmm" xed="VPERMI2PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VPERMT2PD" form="zmm {z}, zmm, zmm" xed="VPERMT2PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutex2var_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ off := idx[i+2:i]*64
+ dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2PD" form="zmm, zmm, zmm" xed="VPERMI2PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VPERMT2PD" form="zmm, zmm, zmm" xed="VPERMT2PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
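+<!-- Editor's note (illustrative, not part of the Intel data): a minimal C
+     sketch of the unmasked form above, assuming an AVX512F-capable CPU and
+     a compiler invoked with something like "-mavx512f". Bit 3 of each idx
+     element selects between a and b; bits 2:0 pick the element within the
+     chosen source.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m512d a = _mm512_set_pd(7, 6, 5, 4, 3, 2, 1, 0);          /* a[j] = j    */
+    __m512d b = _mm512_set_pd(17, 16, 15, 14, 13, 12, 11, 10);  /* b[j] = 10+j */
+    /* interleave the even elements of a and b: index 8+j means "b[j]" */
+    __m512i idx = _mm512_set_epi64(8 + 6, 6, 8 + 4, 4, 8 + 2, 2, 8 + 0, 0);
+    double r[8];
+    _mm512_storeu_pd(r, _mm512_permutex2var_pd(a, idx, b));
+    for (int j = 0; j < 8; j++)
+        printf("%g ", r[j]);   /* prints: 0 10 2 12 4 14 6 16 */
+    return 0;
+}
+-->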
+<intrinsic tech="AVX-512" name="_mm512_mask2_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ off := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := idx[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="zmm {k}, zmm, zmm" xed="VPERMI2PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ off := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMT2PS" form="zmm {k}, zmm, zmm" xed="VPERMT2PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ off := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="zmm {z}, zmm, zmm" xed="VPERMI2PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VPERMT2PS" form="zmm {z}, zmm, zmm" xed="VPERMT2PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutex2var_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ off := idx[i+3:i]*32
+ dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2PS" form="zmm, zmm, zmm" xed="VPERMI2PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VPERMT2PS" form="zmm, zmm, zmm" xed="VPERMT2PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
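+<!-- Editor's note (illustrative, not part of the Intel data): a hedged C
+     sketch of the zero-masking PS form, same toolchain assumptions as the
+     previous example. Lanes whose mask bit is clear come out as 0.0f no
+     matter what idx says.
+
+#include <immintrin.h>
+
+/* Lanes 0..7 take the even floats of a; lanes 8..15, which would take the
+   even floats of b, are zeroed by mask 0x00FF. */
+static __m512 evens_or_zero(__m512 a, __m512 b) {
+    __m512i idx = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16,
+                                   14, 12, 10,  8,  6,  4,  2,  0);
+    return _mm512_maskz_permutex2var_ps(0x00FF, a, idx, b);
+}
+-->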
+<intrinsic tech="AVX-512" name="_mm512_mask2_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ off := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := idx[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="zmm {k}, zmm, zmm" xed="VPERMI2Q_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ off := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMT2Q" form="zmm {k}, zmm, zmm" xed="VPERMT2Q_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ off := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="zmm {z}, zmm, zmm" xed="VPERMI2Q_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <instruction name="VPERMT2Q" form="zmm {z}, zmm, zmm" xed="VPERMT2Q_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutex2var_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ off := idx[i+2:i]*64
+ dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2Q" form="zmm, zmm, zmm" xed="VPERMI2Q_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <instruction name="VPERMT2Q" form="zmm, zmm, zmm" xed="VPERMT2Q_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
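+<!-- Editor's note (illustrative, not part of the Intel data): the mask_
+     and mask2_ forms above differ only in what survives in inactive lanes
+     ("a" for VPERMT2Q, which overwrites its first source; "idx" for
+     VPERMI2Q, which overwrites the index register). A small C sketch of
+     the unmasked qword form, same assumptions as above:
+
+#include <immintrin.h>
+
+/* Rotate the 16-element concatenation b:a left by one qword: lane j
+   takes element j+1, and lane 7 takes b[0] (index 8 has bit 3 set). */
+static __m512i rotate1(__m512i a, __m512i b) {
+    __m512i idx = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1);
+    return _mm512_permutex2var_epi64(a, idx, b);
+}
+-->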
+<intrinsic tech="AVX-512" name="_mm512_mask_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI
+IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI
+IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI
+IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI
+IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI
+IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI
+IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI
+IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI
+IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI
+IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI
+IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI
+IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI
+IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="zmm {k}, zmm, imm8" xed="VPERMILPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI
+IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI
+IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI
+IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI
+IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI
+IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI
+IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI
+IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI
+IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI
+IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI
+IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI
+IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI
+IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="zmm {k}, zmm, zmm" xed="VPERMILPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI
+IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI
+IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI
+IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI
+IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI
+IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI
+IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI
+IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI
+IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI
+IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI
+IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI
+IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI
+IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="zmm {z}, zmm, imm8" xed="VPERMILPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI
+IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI
+IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI
+IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI
+IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI
+IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI
+IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI
+IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI
+IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI
+IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI
+IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI
+IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI
+IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="zmm {z}, zmm, zmm" xed="VPERMILPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permute_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI
+IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI
+IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI
+IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI
+IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI
+IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI
+IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI
+IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI
+IF (imm8[4] == 0) dst[319:256] := a[319:256]; FI
+IF (imm8[4] == 1) dst[319:256] := a[383:320]; FI
+IF (imm8[5] == 0) dst[383:320] := a[319:256]; FI
+IF (imm8[5] == 1) dst[383:320] := a[383:320]; FI
+IF (imm8[6] == 0) dst[447:384] := a[447:384]; FI
+IF (imm8[6] == 1) dst[447:384] := a[511:448]; FI
+IF (imm8[7] == 0) dst[511:448] := a[447:384]; FI
+IF (imm8[7] == 1) dst[511:448] := a[511:448]; FI
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="zmm, zmm, imm8" xed="VPERMILPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutevar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
+ <operation>
+IF (b[1] == 0) dst[63:0] := a[63:0]; FI
+IF (b[1] == 1) dst[63:0] := a[127:64]; FI
+IF (b[65] == 0) dst[127:64] := a[63:0]; FI
+IF (b[65] == 1) dst[127:64] := a[127:64]; FI
+IF (b[129] == 0) dst[191:128] := a[191:128]; FI
+IF (b[129] == 1) dst[191:128] := a[255:192]; FI
+IF (b[193] == 0) dst[255:192] := a[191:128]; FI
+IF (b[193] == 1) dst[255:192] := a[255:192]; FI
+IF (b[257] == 0) dst[319:256] := a[319:256]; FI
+IF (b[257] == 1) dst[319:256] := a[383:320]; FI
+IF (b[321] == 0) dst[383:320] := a[319:256]; FI
+IF (b[321] == 1) dst[383:320] := a[383:320]; FI
+IF (b[385] == 0) dst[447:384] := a[447:384]; FI
+IF (b[385] == 1) dst[447:384] := a[511:448]; FI
+IF (b[449] == 0) dst[511:448] := a[447:384]; FI
+IF (b[449] == 1) dst[511:448] := a[511:448]; FI
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPD" form="zmm, zmm, zmm" xed="VPERMILPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
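+<!-- Editor's note (illustrative, not part of the Intel data): VPERMILPD
+     shuffles only within each 128-bit pair, so one imm8 bit per lane is
+     enough. A minimal C sketch, same assumptions as above:
+
+#include <immintrin.h>
+
+/* 0x55 = 01010101b: every even lane takes the high double of its pair and
+   every odd lane the low one, i.e. {a1,a0, a3,a2, a5,a4, a7,a6}. */
+static __m512d swap_within_pairs(__m512d a) {
+    return _mm512_permute_pd(a, 0x55);
+}
+-->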
+<intrinsic tech="AVX-512" name="_mm512_mask_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="zmm {k}, zmm, imm8" xed="VPERMILPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
+tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
+tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
+tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
+tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
+tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
+tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
+tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
+tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
+tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
+tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
+tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="zmm {k}, zmm, zmm" xed="VPERMILPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="zmm {z}, zmm, imm8" xed="VPERMILPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
+tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
+tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
+tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
+tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
+tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
+tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
+tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
+tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
+tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
+tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
+tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="zmm {z}, zmm, zmm" xed="VPERMILPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permute_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="zmm, zmm, imm8" xed="VPERMILPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutevar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], b[1:0])
+dst[63:32] := SELECT4(a[127:0], b[33:32])
+dst[95:64] := SELECT4(a[127:0], b[65:64])
+dst[127:96] := SELECT4(a[127:0], b[97:96])
+dst[159:128] := SELECT4(a[255:128], b[129:128])
+dst[191:160] := SELECT4(a[255:128], b[161:160])
+dst[223:192] := SELECT4(a[255:128], b[193:192])
+dst[255:224] := SELECT4(a[255:128], b[225:224])
+dst[287:256] := SELECT4(a[383:256], b[257:256])
+dst[319:288] := SELECT4(a[383:256], b[289:288])
+dst[351:320] := SELECT4(a[383:256], b[321:320])
+dst[383:352] := SELECT4(a[383:256], b[353:352])
+dst[415:384] := SELECT4(a[511:384], b[385:384])
+dst[447:416] := SELECT4(a[511:384], b[417:416])
+dst[479:448] := SELECT4(a[511:384], b[449:448])
+dst[511:480] := SELECT4(a[511:384], b[481:480])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMILPS" form="zmm, zmm, zmm" xed="VPERMILPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
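+<!-- Editor's note (illustrative, not part of the Intel data): the imm8
+     form applies one SELECT4 pattern to all four 128-bit lanes. A short
+     C sketch, same assumptions as above:
+
+#include <immintrin.h>
+
+/* 0x1B = (0<<6)|(1<<4)|(2<<2)|3 reverses the four floats inside each
+   128-bit lane; _MM_SHUFFLE(0, 1, 2, 3) spells the same constant. */
+static __m512 reverse_within_lanes(__m512 a) {
+    return _mm512_permute_ps(a, 0x1B);
+}
+-->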
+<intrinsic tech="AVX-512" name="_mm512_mask_permutex_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPD" form="zmm {k}, zmm, imm8" xed="VPERMPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutexvar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ id := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := a[id+63:id]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPD" form="zmm {k}, zmm, zmm" xed="VPERMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutex_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPD" form="zmm {z}, zmm, imm8" xed="VPERMPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutexvar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ id := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := a[id+63:id]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPD" form="zmm {z}, zmm, zmm" xed="VPERMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutex_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPD" form="zmm, zmm, imm8" xed="VPERMPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutexvar_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ id := idx[i+2:i]*64
+ dst[i+63:i] := a[id+63:id]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPD" form="zmm, zmm, zmm" xed="VPERMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
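+<!-- Editor's note (illustrative, not part of the Intel data): unlike
+     VPERMILPD, VPERMPD with a vector index reaches across the whole
+     register. A minimal C sketch, same assumptions as above:
+
+#include <immintrin.h>
+
+/* Reverse all eight doubles: lane j takes a[7-j]. Note the idx operand
+   comes first in the permutexvar intrinsics. */
+static __m512d reverse_pd(__m512d a) {
+    return _mm512_permutexvar_pd(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
+}
+-->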
+<intrinsic tech="AVX-512" name="_mm512_mask_permutexvar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ id := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPS" form="zmm {k}, zmm, zmm" xed="VPERMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutexvar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ id := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPS" form="zmm {z}, zmm, zmm" xed="VPERMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutexvar_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ id := idx[i+3:i]*32
+ dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMPS" form="zmm, zmm, zmm" xed="VPERMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
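+<!-- Editor's note (illustrative, not part of the Intel data): with an
+     all-equal index VPERMPS degenerates into a broadcast; the masked form
+     then merges that broadcast into src lane by lane. A C sketch, same
+     assumptions as above:
+
+#include <immintrin.h>
+
+/* Copy a[5] into every lane where k is set; keep src elsewhere. */
+static __m512 splat_lane5_where(__m512 src, __mmask16 k, __m512 a) {
+    return _mm512_mask_permutexvar_ps(src, k, _mm512_set1_epi32(5), a);
+}
+-->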
+<intrinsic tech="AVX-512" name="_mm512_mask_permutex_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMQ" form="zmm {k}, zmm, imm8" xed="VPERMQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutexvar_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ id := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := a[id+63:id]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMQ" form="zmm {k}, zmm, zmm" xed="VPERMQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutex_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMQ" form="zmm {z}, zmm, imm8" xed="VPERMQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutexvar_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ id := idx[i+2:i]*64
+ IF k[j]
+ dst[i+63:i] := a[id+63:id]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMQ" form="zmm {z}, zmm, zmm" xed="VPERMQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutex_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[63:0] := src[63:0]
+ 1: tmp[63:0] := src[127:64]
+ 2: tmp[63:0] := src[191:128]
+ 3: tmp[63:0] := src[255:192]
+ ESAC
+ RETURN tmp[63:0]
+}
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMQ" form="zmm, zmm, imm8" xed="VPERMQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_permutexvar_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="idx" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ id := idx[i+2:i]*64
+ dst[i+63:i] := a[id+63:id]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMQ" form="zmm, zmm, zmm" xed="VPERMQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
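+<!-- Editor's note (illustrative, not part of the Intel data): the imm8
+     form works per 256-bit half, so it cannot move a qword between halves;
+     the vector-index form can. A C sketch of the contrast, same
+     assumptions as above:
+
+#include <immintrin.h>
+
+/* Reverses each group of four qwords, but not the halves themselves. */
+static __m512i reverse_quads(__m512i a) { return _mm512_permutex_epi64(a, 0x1B); }
+
+/* Reverses the whole vector, crossing the 256-bit boundary. */
+static __m512i reverse_all(__m512i a) {
+    return _mm512_permutexvar_epi64(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
+}
+-->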
+<intrinsic tech="AVX-512" name="_mm512_mask_expand_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="zmm {k}, zmm" xed="VPEXPANDD_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expandloadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="zmm {k}, m32" xed="VPEXPANDD_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expand_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[m+31:m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="zmm {z}, zmm" xed="VPEXPANDD_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expandloadu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+ m := m + 32
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDD" form="zmm {z}, m32" xed="VPEXPANDD_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
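+<!-- Editor's note (illustrative, not part of the Intel data): expand is
+     the inverse of compress; it spreads the low-order contiguous elements
+     of the source into the lanes whose mask bits are set. A C sketch,
+     same assumptions as above:
+
+#include <immintrin.h>
+
+/* With k = 0xAAAA the first eight ints of a land in the odd lanes and the
+   even lanes are zeroed. */
+static __m512i spread_to_odd_lanes(__m512i a) {
+    return _mm512_maskz_expand_epi32(0xAAAA, a);
+}
+-->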
+<intrinsic tech="AVX-512" name="_mm512_mask_expand_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="zmm {k}, zmm" xed="VPEXPANDQ_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expandloadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="zmm {k}, m64" xed="VPEXPANDQ_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expand_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[m+63:m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="zmm {z}, zmm" xed="VPEXPANDQ_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expandloadu_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+ m := m + 64
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDQ" form="zmm {z}, m64" xed="VPEXPANDQ_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
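+<!-- Editor's note (illustrative, not part of the Intel data): the
+     expandloadu forms read only as many contiguous elements from memory as
+     the mask has set bits, so the buffer can be exactly that short. A C
+     sketch of a hypothetical "patch three lanes" helper, same assumptions
+     as above:
+
+#include <immintrin.h>
+
+/* Merge three packed qwords from memory into lanes 0, 4 and 7 of old. */
+static __m512i patch_lanes(__m512i old, const long long *three_vals) {
+    __mmask8 k = (1 << 0) | (1 << 4) | (1 << 7);   /* 0x91 */
+    return _mm512_mask_expandloadu_epi64(old, k, three_vals);
+}
+-->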
+<intrinsic tech="AVX-512" name="_mm512_i32gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="zmm, vm32y" xed="VPGATHERDQ_ZMMu64_MASKmskw_MEMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i32gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale)
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="zmm {k}, vm32y" xed="VPGATHERDQ_ZMMu64_MASKmskw_MEMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_i64gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale)
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERQD" form="ymm, vm64z" xed="VPGATHERQD_YMMu32_MASKmskw_MEMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i64gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale)
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPGATHERQD" form="ymm {k}, vm64z" xed="VPGATHERQD_YMMu32_MASKmskw_MEMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_i64gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale)
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERQQ" form="zmm, vm64z" xed="VPGATHERQQ_ZMMu64_MASKmskw_MEMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i64gather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale)
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERQQ" form="zmm {k}, vm64z" xed="VPGATHERQQ_ZMMu64_MASKmskw_MEMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
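+<!--
+Illustrative note (not part of the Intel data): in the masked gather forms,
+inactive lanes perform no memory access at all, so they may hold out-of-bounds
+indices without faulting. A minimal sketch with assumed names:
+
+    #include <immintrin.h>
+    #include <stdint.h>
+
+    /* Lanes with k[j] clear keep fallback and never touch memory. */
+    static inline __m512i gather_or(__m512i fallback, __mmask8 k,
+                                    __m512i idx, const int64_t *table) {
+        return _mm512_mask_i64gather_epi64(fallback, k, idx, table, 8);
+    }
+-->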
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSD" form="zmm {z}, zmm, zmm" xed="VPMAXSD_ZMMi32_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="zmm {k}, zmm, zmm" xed="VPMAXSQ_ZMMi64_MASKmskw_ZMMi64_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="zmm {z}, zmm, zmm" xed="VPMAXSQ_ZMMi64_MASKmskw_ZMMi64_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSQ" form="zmm, zmm, zmm" xed="VPMAXSQ_ZMMi64_MASKmskw_ZMMi64_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUD" form="zmm {z}, zmm, zmm" xed="VPMAXUD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="zmm {k}, zmm, zmm" xed="VPMAXUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="zmm {z}, zmm, zmm" xed="VPMAXUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUQ" form="zmm, zmm, zmm" xed="VPMAXUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
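+<!--
+Illustrative note (not part of the Intel data): VPMAXSQ/VPMAXUQ give AVX-512F a
+direct 64-bit integer max, which earlier SSE/AVX levels lacked. As a small
+usage sketch, the unmasked 32-bit form (documented elsewhere in this file)
+implements a lane-wise ReLU:
+
+    #include <immintrin.h>
+
+    /* max(v, 0) per signed 32-bit lane. */
+    static inline __m512i relu_i32(__m512i v) {
+        return _mm512_max_epi32(v, _mm512_setzero_si512());
+    }
+-->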
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSD" form="zmm {z}, zmm, zmm" xed="VPMINSD_ZMMi32_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="zmm {k}, zmm, zmm" xed="VPMINSQ_ZMMi64_MASKmskw_ZMMi64_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="zmm {z}, zmm, zmm" xed="VPMINSQ_ZMMi64_MASKmskw_ZMMi64_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSQ" form="zmm, zmm, zmm" xed="VPMINSQ_ZMMi64_MASKmskw_ZMMi64_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUD" form="zmm {z}, zmm, zmm" xed="VPMINUD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="zmm {k}, zmm, zmm" xed="VPMINUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="zmm {z}, zmm, zmm" xed="VPMINUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUQ" form="zmm, zmm, zmm" xed="VPMINUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
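+<!--
+Illustrative note (not part of the Intel data): composing the max and min forms
+above gives a branch-free clamp. A minimal sketch:
+
+    #include <immintrin.h>
+
+    /* Clamp each unsigned 64-bit lane of v into [lo, hi]. */
+    static inline __m512i clamp_u64(__m512i v, __m512i lo, __m512i hi) {
+        return _mm512_min_epu64(_mm512_max_epu64(v, lo), hi);
+    }
+-->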
+<intrinsic tech="AVX-512" name="_mm512_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := Truncate8(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm, zmm" xed="VPMOVDB_XMMu8_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm {k}, zmm" xed="VPMOVDB_XMMu8_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="128"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVDB" form="m128 {k}, zmm" xed="VPMOVDB_MEMu8_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVDB" form="xmm {z}, zmm" xed="VPMOVDB_XMMu8_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
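+<!--
+Illustrative note (not part of the Intel data): VPMOVDB narrows by plain
+truncation (the low byte of each lane), so values outside [-128, 127] wrap
+rather than saturate; see the VPMOVSDB entries below for the saturating form.
+A minimal sketch:
+
+    #include <immintrin.h>
+
+    /* Keep the low 8 bits of each of the sixteen 32-bit lanes. */
+    static inline __m128i narrow_trunc(__m512i v) {
+        return _mm512_cvtepi32_epi8(v);
+    }
+-->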
+<intrinsic tech="AVX-512" name="_mm512_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := Truncate16(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="ymm, zmm" xed="VPMOVDW_YMMu16_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="ymm {k}, zmm" xed="VPMOVDW_YMMu16_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="256"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVDW" form="m256 {k}, zmm" xed="VPMOVDW_MEMu16_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVDW" form="ymm {z}, zmm" xed="VPMOVDW_YMMu16_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := Truncate8(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm, zmm" xed="VPMOVQB_XMMu8_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm {k}, zmm" xed="VPMOVQB_XMMu8_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQB" form="m64 {k}, zmm" xed="VPMOVQB_MEMu8_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Truncate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVQB" form="xmm {z}, zmm" xed="VPMOVQB_XMMu8_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
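+<!--
+Illustrative note (not part of the Intel data): eight 64-bit lanes narrow to
+only 64 bits of output, so the register forms define just the low quadword of
+"dst" and the memory form stores m64. A sketch of the store form, assuming the
+caller provides at least 8 writable bytes:
+
+    #include <immintrin.h>
+    #include <stdint.h>
+
+    /* Store the low byte of each active 64-bit lane j of v to out[j]. */
+    static inline void store_low_bytes(uint8_t *out, __mmask8 k, __m512i v) {
+        _mm512_mask_cvtepi64_storeu_epi8(out, k, v);
+    }
+-->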
+<intrinsic tech="AVX-512" name="_mm512_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := Truncate32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="ymm, zmm" xed="VPMOVQD_YMMu32_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Truncate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="ymm {k}, zmm" xed="VPMOVQD_YMMu32_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQD" form="m256 {k}, zmm" xed="VPMOVQD_MEMu32_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Truncate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVQD" form="ymm {z}, zmm" xed="VPMOVQD_YMMu32_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := Truncate16(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm, zmm" xed="VPMOVQW_XMMu16_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm {k}, zmm" xed="VPMOVQW_XMMu16_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVQW" form="m128 {k}, zmm" xed="VPMOVQW_MEMu16_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Truncate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVQW" form="xmm {z}, zmm" xed="VPMOVQW_XMMu16_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := Saturate8(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm, zmm" xed="VPMOVSDB_XMMi8_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="SI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm {k}, zmm" xed="VPMOVSDB_XMMi8_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="128"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSDB" form="m128 {k}, zmm" xed="VPMOVSDB_MEMi8_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtsepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSDB" form="xmm {z}, zmm" xed="VPMOVSDB_XMMi8_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
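+<!--
+Illustrative note (not part of the Intel data): the Saturate8 forms clamp to
+[-128, 127] instead of wrapping, which is usually what narrowing arithmetic
+wants. A small contrast sketch:
+
+    #include <immintrin.h>
+
+    __m512i v = _mm512_set1_epi32(300);
+    __m128i t = _mm512_cvtepi32_epi8(v);   /* truncates: each byte = 300 mod 256 = 44 */
+    __m128i s = _mm512_cvtsepi32_epi8(v);  /* saturates: each byte = 127 */
+-->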
+<intrinsic tech="AVX-512" name="_mm512_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := Saturate16(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="ymm, zmm" xed="VPMOVSDW_YMMi16_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__m256i" varname="src" etype="SI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="ymm {k}, zmm" xed="VPMOVSDW_YMMi16_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI16" memwidth="256"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSDW" form="m256 {k}, zmm" xed="VPMOVSDW_MEMi16_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtsepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSDW" form="ymm {z}, zmm" xed="VPMOVSDW_YMMi16_MASKmskw_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := Saturate8(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm, zmm" xed="VPMOVSQB_XMMi8_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="src" etype="SI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm {k}, zmm" xed="VPMOVSQB_XMMi8_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQB" form="m64 {k}, zmm" xed="VPMOVSQB_MEMi8_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtsepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := Saturate8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVSQB" form="xmm {z}, zmm" xed="VPMOVSQB_XMMi8_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := Saturate32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="ymm, zmm" xed="VPMOVSQD_YMMi32_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Saturate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="ymm {k}, zmm" xed="VPMOVSQD_YMMi32_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQD" form="m256 {k}, zmm" xed="VPMOVSQD_MEMi32_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtsepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := Saturate32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVSQD" form="ymm {z}, zmm" xed="VPMOVSQD_YMMi32_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := Saturate16(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm, zmm" xed="VPMOVSQW_XMMi16_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="src" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm {k}, zmm" xed="VPMOVSQW_XMMi16_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtsepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="SI16" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVSQW" form="m128 {k}, zmm" xed="VPMOVSQW_MEMi16_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtsepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := Saturate16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVSQW" form="xmm {z}, zmm" xed="VPMOVSQW_XMMi16_MASKmskw_ZMMi64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 8*j
+ dst[i+31:i] := SignExtend32(a[k+7:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBD" form="zmm, xmm" xed="VPMOVSXBD_ZMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBD" form="zmm {k}, xmm" xed="VPMOVSXBD_ZMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBD" form="zmm {z}, xmm" xed="VPMOVSXBD_ZMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 8*j
+ dst[i+63:i] := SignExtend64(a[k+7:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBQ" form="zmm, xmm" xed="VPMOVSXBQ_ZMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__m512i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBQ" form="zmm {k}, xmm" xed="VPMOVSXBQ_ZMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXBQ" form="zmm {z}, xmm" xed="VPMOVSXBQ_ZMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := SignExtend64(a[k+31:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXDQ" form="zmm, ymm" xed="VPMOVSXDQ_ZMMi64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__m512i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXDQ" form="zmm {k}, ymm" xed="VPMOVSXDQ_ZMMi64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXDQ" form="zmm {z}, ymm" xed="VPMOVSXDQ_ZMMi64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
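+<!--
+  Usage sketch (illustrative, non-normative) for the VPMOVSXDQ forms above,
+  assuming an AVX512F C toolchain; names are hypothetical.
+
+  #include <immintrin.h>
+
+  __m512i widen_dwords(__m256i dwords, __mmask8 keep, __m512i fallback) {
+      /* Sign-extend eight 32-bit lanes to eight 64-bit lanes. */
+      __m512i merged = _mm512_mask_cvtepi32_epi64(fallback, keep, dwords);
+      (void)merged;
+      return _mm512_cvtepi32_epi64(dwords);
+  }
+-->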
+<intrinsic tech="AVX-512" name="_mm512_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 16*j
+ dst[i+31:i] := SignExtend32(a[k+15:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXWD" form="zmm, ymm" xed="VPMOVSXWD_ZMMi32_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXWD" form="zmm {k}, ymm" xed="VPMOVSXWD_ZMMi32_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := SignExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXWD" form="zmm {z}, ymm" xed="VPMOVSXWD_ZMMi32_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 16*j
+ dst[i+63:i] := SignExtend64(a[k+15:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXWQ" form="zmm, xmm" xed="VPMOVSXWQ_ZMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__m512i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXWQ" form="zmm {k}, xmm" xed="VPMOVSXWQ_ZMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVSXWQ" form="zmm {z}, xmm" xed="VPMOVSXWQ_ZMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
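+<!--
+  Usage sketch (illustrative, non-normative) for the word-widening forms
+  above (VPMOVSXWD, VPMOVSXWQ), assuming an AVX512F C toolchain:
+
+  #include <immintrin.h>
+
+  void widen_words(__m256i w16x16, __m128i w16x8, __m512i *out32, __m512i *out64) {
+      *out32 = _mm512_cvtepi16_epi32(w16x16); /* 16 words to 16 dwords */
+      *out64 = _mm512_cvtepi16_epi64(w16x8);  /* low 8 words to 8 qwords */
+  }
+-->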
+<intrinsic tech="AVX-512" name="_mm512_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 8*j
+ dst[k+7:k] := SaturateU8(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm, zmm" xed="VPMOVUSDB_XMMu8_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm {k}, zmm" xed="VPMOVUSDB_XMMu8_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi32_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="128"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSDB" form="m128 {k}, zmm" xed="VPMOVUSDB_MEMu8_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtusepi32_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+31:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSDB" form="xmm {z}, zmm" xed="VPMOVUSDB_XMMu8_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
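+<!--
+  Usage sketch (illustrative, non-normative) for the VPMOVUSDB forms above,
+  contrasting the register and masked-store variants; assumes an AVX512F C
+  toolchain, and "out" is a hypothetical 16-byte buffer.
+
+  #include <immintrin.h>
+  #include <stdint.h>
+
+  void narrow_u32_to_u8(__m512i u32s, __mmask16 keep, uint8_t out[16]) {
+      /* Register form: 16 dwords clamped to [0, 255]. */
+      __m128i packed = _mm512_cvtusepi32_epi8(u32s);
+      (void)packed;
+      /* Store form: only lanes whose mask bit is set touch memory. */
+      _mm512_mask_cvtusepi32_storeu_epi8(out, keep, u32s);
+  }
+-->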
+<intrinsic tech="AVX-512" name="_mm512_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 16*j
+ dst[k+15:k] := SaturateU16(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="ymm, zmm" xed="VPMOVUSDW_YMMu16_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="ymm {k}, zmm" xed="VPMOVUSDW_YMMu16_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi32_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="256"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSDW" form="m256 {k}, zmm" xed="VPMOVUSDW_MEMu16_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtusepi32_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+31:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSDW" form="ymm {z}, zmm" xed="VPMOVUSDW_YMMu16_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 8*j
+ dst[k+7:k] := SaturateU8(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm, zmm" xed="VPMOVUSQB_XMMu8_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := src[l+7:l]
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm {k}, zmm" xed="VPMOVUSQB_XMMu8_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi64_storeu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQB" form="m64 {k}, zmm" xed="VPMOVUSQB_MEMu8_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtusepi64_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[l+7:l] := SaturateU8(a[i+63:i])
+ ELSE
+ dst[l+7:l] := 0
+ FI
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPMOVUSQB" form="xmm {z}, zmm" xed="VPMOVUSQB_XMMu8_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 32*j
+ dst[k+31:k] := SaturateU32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="ymm, zmm" xed="VPMOVUSQD_YMMu32_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := SaturateU32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="ymm {k}, zmm" xed="VPMOVUSQD_YMMu32_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi64_storeu_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32" memwidth="256"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQD" form="m256 {k}, zmm" xed="VPMOVUSQD_MEMu32_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtusepi64_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[l+31:l] := SaturateU32(a[i+63:i])
+ ELSE
+ dst[l+31:l] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMOVUSQD" form="ymm {z}, zmm" xed="VPMOVUSQD_YMMu32_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 16*j
+ dst[k+15:k] := SaturateU16(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm, zmm" xed="VPMOVUSQW_XMMu16_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := src[l+15:l]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm {k}, zmm" xed="VPMOVUSQW_XMMu16_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtusepi64_storeu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPMOVUSQW" form="m128 {k}, zmm" xed="VPMOVUSQW_MEMu16_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtusepi64_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[l+15:l] := SaturateU16(a[i+63:i])
+ ELSE
+ dst[l+15:l] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMOVUSQW" form="xmm {z}, zmm" xed="VPMOVUSQW_XMMu16_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
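+<!--
+  Usage sketch (illustrative, non-normative) for the qword-narrowing forms
+  above (VPMOVUSQD, VPMOVUSQW, VPMOVUSQB), assuming an AVX512F C toolchain:
+
+  #include <immintrin.h>
+
+  void narrow_u64(__m512i u64s, __m256i *d32, __m128i *d16, __m128i *d8) {
+      *d32 = _mm512_cvtusepi64_epi32(u64s); /* clamp to [0, 2^32 - 1] */
+      *d16 = _mm512_cvtusepi64_epi16(u64s); /* clamp to [0, 65535] */
+      *d8  = _mm512_cvtusepi64_epi8(u64s);  /* clamp to [0, 255] */
+  }
+-->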
+<intrinsic tech="AVX-512" name="_mm512_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 8*j
+ dst[i+31:i] := ZeroExtend32(a[k+7:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBD" form="zmm, xmm" xed="VPMOVZXBD_ZMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBD" form="zmm {k}, xmm" xed="VPMOVZXBD_ZMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 8*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+7:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBD" form="zmm {z}, xmm" xed="VPMOVZXBD_ZMMi32_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
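+<!--
+  Usage sketch (illustrative, non-normative) for the VPMOVZXBD forms above,
+  assuming an AVX512F C toolchain:
+
+  #include <immintrin.h>
+
+  __m512i widen_u8(__m128i bytes, __mmask16 keep) {
+      /* Zero-extend 16 bytes to 16 dwords; unkept lanes become 0. */
+      return _mm512_maskz_cvtepu8_epi32(keep, bytes);
+  }
+-->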
+<intrinsic tech="AVX-512" name="_mm512_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 8*j
+ dst[i+63:i] := ZeroExtend64(a[k+7:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBQ" form="zmm, xmm" xed="VPMOVZXBQ_ZMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBQ" form="zmm {k}, xmm" xed="VPMOVZXBQ_ZMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 8*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+7:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXBQ" form="zmm {z}, xmm" xed="VPMOVZXBQ_ZMMi64_MASKmskw_XMMi8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := ZeroExtend64(a[k+31:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXDQ" form="zmm, ymm" xed="VPMOVZXDQ_ZMMi64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXDQ" form="zmm {k}, ymm" xed="VPMOVZXDQ_ZMMi64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 32*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+31:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXDQ" form="zmm {z}, ymm" xed="VPMOVZXDQ_ZMMi64_MASKmskw_YMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ k := 16*j
+ dst[i+31:i] := ZeroExtend32(a[k+15:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXWD" form="zmm, ymm" xed="VPMOVZXWD_ZMMi32_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXWD" form="zmm {k}, ymm" xed="VPMOVZXWD_ZMMi32_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ l := 16*j
+ IF k[j]
+ dst[i+31:i] := ZeroExtend32(a[l+15:l])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXWD" form="zmm {z}, ymm" xed="VPMOVZXWD_ZMMi32_MASKmskw_YMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ k := 16*j
+ dst[i+63:i] := ZeroExtend64(a[k+15:k])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXWQ" form="zmm, xmm" xed="VPMOVZXWQ_ZMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXWQ" form="zmm {k}, xmm" xed="VPMOVZXWQ_ZMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ l := 16*j
+ IF k[j]
+ dst[i+63:i] := ZeroExtend64(a[l+15:l])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMOVZXWQ" form="zmm {z}, xmm" xed="VPMOVZXWQ_ZMMi64_MASKmskw_XMMi16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
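+<!--
+  Usage sketch (illustrative, non-normative) for the remaining zero-extend
+  forms above (VPMOVZXDQ, VPMOVZXWD, VPMOVZXWQ), assuming an AVX512F C
+  toolchain:
+
+  #include <immintrin.h>
+
+  void widen_unsigned(__m256i u32x8, __m256i u16x16, __m128i u16x8,
+                      __m512i *q, __m512i *d, __m512i *q2) {
+      *q  = _mm512_cvtepu32_epi64(u32x8);  /* 8 dwords to 8 qwords */
+      *d  = _mm512_cvtepu16_epi32(u16x16); /* 16 words to 16 dwords */
+      *q2 = _mm512_cvtepu16_epi64(u16x8);  /* 8 words to 8 qwords */
+  }
+-->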
+<intrinsic tech="AVX-512" name="_mm512_mask_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__m512i" varname="src" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULDQ" form="zmm {k}, zmm, zmm" xed="VPMULDQ_ZMMi64_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULDQ" form="zmm {z}, zmm, zmm" xed="VPMULDQ_ZMMi64_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULDQ" form="zmm, zmm, zmm" xed="VPMULDQ_ZMMi64_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
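+<!--
+  Usage sketch (illustrative, non-normative) for VPMULDQ, assuming an
+  AVX512F C toolchain. Only the low (even-indexed) 32-bit element of each
+  64-bit lane participates; the odd elements are ignored.
+
+  #include <immintrin.h>
+
+  __m512i widen_mul_signed(__m512i a, __m512i b) {
+      /* Eight full signed 64-bit products of the even 32-bit elements. */
+      return _mm512_mul_epi32(a, b);
+  }
+-->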
+<intrinsic tech="AVX-512" name="_mm512_mask_mul_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULUDQ" form="zmm {k}, zmm, zmm" xed="VPMULUDQ_ZMMu64_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_mul_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULUDQ" form="zmm {z}, zmm, zmm" xed="VPMULUDQ_ZMMu64_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mul_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULUDQ" form="zmm, zmm, zmm" xed="VPMULUDQ_ZMMu64_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
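+<!--
+  Usage sketch (illustrative, non-normative) for VPMULUDQ, the unsigned
+  counterpart of VPMULDQ above, assuming an AVX512F C toolchain:
+
+  #include <immintrin.h>
+
+  __m512i widen_mul_unsigned(__m512i a, __m512i b) {
+      /* Eight unsigned 64-bit products of the even 32-bit elements. */
+      return _mm512_mul_epu32(a, b);
+  }
+-->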
+<intrinsic tech="AVX-512" name="_mm512_maskz_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPORD" form="zmm {z}, zmm, zmm" xed="VPORD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPORQ" form="zmm {z}, zmm, zmm" xed="VPORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
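+<!--
+  Usage sketch (illustrative, non-normative) for the zeromasked OR forms
+  above (VPORD, VPORQ), assuming an AVX512F C toolchain:
+
+  #include <immintrin.h>
+
+  __m512i or_selected(__mmask16 keep32, __mmask8 keep64, __m512i a, __m512i b) {
+      __m512i d = _mm512_maskz_or_epi32(keep32, a, b); /* per 32-bit lane */
+      return _mm512_maskz_or_epi64(keep64, d, b);      /* per 64-bit lane */
+  }
+-->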
+<intrinsic tech="AVX-512" name="_mm512_mask_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLD" form="zmm {k}, zmm, imm8" xed="VPROLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLD" form="zmm {z}, zmm, imm8" xed="VPROLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rol_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLD" form="zmm, zmm, imm8" xed="VPROLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLQ" form="zmm {k}, zmm, imm8" xed="VPROLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLQ" form="zmm {z}, zmm, imm8" xed="VPROLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rol_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLQ" form="zmm, zmm, imm8" xed="VPROLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
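+<!--
+  Usage sketch (illustrative, non-normative) for the immediate rotates above
+  (VPROLD, VPROLQ), assuming an AVX512F C toolchain. The imm8 argument must
+  be a compile-time constant, and the count is reduced modulo the lane width.
+
+  #include <immintrin.h>
+
+  __m512i rotl_lanes(__m512i a) {
+      __m512i d = _mm512_rol_epi32(a, 7); /* each dword rotated left by 7 */
+      return _mm512_rol_epi64(d, 63);     /* each qword rotated left by 63 */
+  }
+-->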
+<intrinsic tech="AVX-512" name="_mm512_mask_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLVD" form="zmm {k}, zmm, zmm" xed="VPROLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLVD" form="zmm {z}, zmm, zmm" xed="VPROLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rolv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLVD" form="zmm, zmm, zmm" xed="VPROLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="zmm {k}, zmm, zmm" xed="VPROLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="zmm {z}, zmm, zmm" xed="VPROLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rolv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE LEFT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPROLVQ" form="zmm, zmm, zmm" xed="VPROLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
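+<!-- Illustrative sketch: the _mask_ rotate forms merge with "src"; lanes
+     whose mask bit is clear keep src's value instead of the rotated one.
+     Assumes AVX-512F; the helper name is hypothetical.
+
+     #include <immintrin.h>
+
+     __m512i rotl64_even_lanes(__m512i src, __m512i a, __m512i counts) {
+         /* 0x55 selects lanes 0, 2, 4, 6; odd lanes pass through from src */
+         return _mm512_mask_rolv_epi64(src, (__mmask8)0x55, a, counts);
+     }
+-->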
+<intrinsic tech="AVX-512" name="_mm512_mask_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORD" form="zmm {k}, zmm, imm8" xed="VPRORD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORD" form="zmm {z}, zmm, imm8" xed="VPRORD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_ror_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORD" form="zmm, zmm, imm8" xed="VPRORD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
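+<!-- Illustrative sketch: the immediate forms encode the count in the
+     instruction, so "imm8" must be a compile-time constant. A fixed
+     SHA-256-style rotate, assuming AVX-512F:
+
+     #include <immintrin.h>
+
+     __m512i rotr32_by_7(__m512i a) {
+         return _mm512_ror_epi32(a, 7);   /* VPRORD zmm, zmm, 7 */
+     }
+-->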
+<intrinsic tech="AVX-512" name="_mm512_mask_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORQ" form="zmm {k}, zmm, imm8" xed="VPRORQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORQ" form="zmm {z}, zmm, imm8" xed="VPRORQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_ror_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORQ" form="zmm, zmm, imm8" xed="VPRORQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORVD" form="zmm {k}, zmm, zmm" xed="VPRORVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORVD" form="zmm {z}, zmm, zmm" xed="VPRORVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rorv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_DWORDS(src, count_src) {
+ count := count_src % 32
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORVD" form="zmm, zmm, zmm" xed="VPRORVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
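+<!-- Illustrative sketch: because counts are reduced modulo the lane width,
+     a variable right rotate by n equals a left rotate by (32 - n) % 32, so
+     _mm512_rorv_epi32 and _mm512_rolv_epi32 can emulate each other.
+     Assumes AVX-512F.
+
+     #include <immintrin.h>
+
+     __m512i rotr32_lanes(__m512i a, __m512i n) {
+         /* same lane-wise result as rolv with counts (32 - n) % 32 */
+         return _mm512_rorv_epi32(a, n);
+     }
+-->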
+<intrinsic tech="AVX-512" name="_mm512_mask_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="zmm {k}, zmm, zmm" xed="VPRORVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="zmm {z}, zmm, zmm" xed="VPRORVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rorv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst".</description>
+ <operation>
+DEFINE RIGHT_ROTATE_QWORDS(src, count_src) {
+ count := count_src % 64
+ RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPRORVQ" form="zmm, zmm, zmm" xed="VPRORVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_i32scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="vm32y, zmm" xed="VPSCATTERDQ_MEMu64_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i32scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="vm32y {k}, zmm" xed="VPSCATTERDQ_MEMu64_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
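+<!-- Illustrative sketch: scattering eight 64-bit values into arbitrary
+     slots of an array. With 8-byte elements, scale = 8 turns each dword
+     index into a byte offset; if indices repeat, the highest lane wins,
+     since the pseudocode stores lanes in order. Assumes AVX-512F.
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     void scatter_qwords(int64_t *table, __m256i idx, __m512i vals) {
+         /* table[idx[j]] = vals[j] for j = 0..7 */
+         _mm512_i32scatter_epi64(table, idx, vals, 8);
+     }
+-->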
+<intrinsic tech="AVX-512" name="_mm512_i64scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQD" form="vm64z, ymm" xed="VPSCATTERQD_MEMu32_MASKmskw_YMMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i64scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQD" form="vm64z {k}, ymm" xed="VPSCATTERQD_MEMu32_MASKmskw_YMMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
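+<!-- Illustrative sketch: the masked scatter skips the store entirely for
+     lanes whose mask bit is clear, so those lanes' addresses are never
+     accessed. Assumes AVX-512F; names are hypothetical.
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     void scatter_live_dwords(int32_t *table, __mmask8 live,
+                              __m512i idx, __m256i vals) {
+         /* only lanes with live[j] = 1 perform table[idx[j]] = vals[j] */
+         _mm512_mask_i64scatter_epi32(table, live, idx, vals, 4);
+     }
+-->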
+<intrinsic tech="AVX-512" name="_mm512_i64scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQQ" form="vm64z, zmm" xed="VPSCATTERQQ_MEMu64_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i64scatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERQQ" form="vm64z {k}, zmm" xed="VPSCATTERQQ_MEMu64_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFD" form="zmm {z}, zmm, imm8" xed="VPSHUFD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
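+<!-- Illustrative sketch: VPSHUFD applies one 4-element permutation inside
+     each of the four 128-bit lanes. _MM_PERM_CDAB (0xB1) swaps adjacent
+     32-bit pairs; the zeromask clears unselected lanes. Assumes AVX-512F.
+
+     #include <immintrin.h>
+
+     __m512i swap_pairs_low_half(__m512i a) {
+         /* low 8 lanes get pair-swapped values, high 8 lanes become 0 */
+         return _mm512_maskz_shuffle_epi32((__mmask16)0x00FF, a,
+                                           _MM_PERM_CDAB);
+     }
+-->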
+<intrinsic tech="AVX-512" name="_mm512_mask_sll_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLD" form="zmm {k}, zmm, xmm" xed="VPSLLD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sll_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLD" form="zmm {z}, zmm, xmm" xed="VPSLLD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_slli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLD" form="zmm {z}, zmm, imm8" xed="VPSLLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sll_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLD" form="zmm, zmm, xmm" xed="VPSLLD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
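+<!-- Illustrative sketch: the non-immediate, non-"v" shift forms read one
+     count for all lanes from the low 64 bits of an XMM register; counts
+     above 31 zero every lane rather than wrapping. Assumes AVX-512F.
+
+     #include <immintrin.h>
+
+     __m512i shl32_all(__m512i a, int n) {
+         /* _mm_cvtsi32_si128 zero-extends n into count[63:0] */
+         return _mm512_sll_epi32(a, _mm_cvtsi32_si128(n));
+     }
+-->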
+<intrinsic tech="AVX-512" name="_mm512_mask_sll_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="zmm {k}, zmm, xmm" xed="VPSLLQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_slli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="zmm {k}, zmm, imm8" xed="VPSLLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sll_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="zmm {z}, zmm, xmm" xed="VPSLLQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_slli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="zmm {z}, zmm, imm8" xed="VPSLLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sll_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="zmm, zmm, xmm" xed="VPSLLQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_slli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLQ" form="zmm, zmm, imm8" xed="VPSLLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
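+<!-- Illustrative sketch: with a constant count the immediate form needs no
+     count register; a power-of-two multiply compiles to a single
+     VPSLLQ zmm, zmm, imm8. Assumes AVX-512F.
+
+     #include <immintrin.h>
+
+     __m512i times_16_u64(__m512i a) {
+         return _mm512_slli_epi64(a, 4);   /* x << 4 == x * 16 mod 2^64 */
+     }
+-->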
+<intrinsic tech="AVX-512" name="_mm512_maskz_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="zmm {z}, zmm, zmm" xed="VPSLLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="zmm {k}, zmm, zmm" xed="VPSLLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="zmm {z}, zmm, zmm" xed="VPSLLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sllv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVQ" form="zmm, zmm, zmm" xed="VPSLLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
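+<!-- Illustrative sketch: the "v" (variable) forms take a per-lane count
+     vector; unlike the rotates, counts of 64 or more yield 0 instead of
+     wrapping. Assumes AVX-512F; the helper name is made up.
+
+     #include <immintrin.h>
+
+     __m512i shl64_per_lane(__m512i a, __m512i counts) {
+         /* dst[j] = counts[j] < 64 ? a[j] << counts[j] : 0 */
+         return _mm512_sllv_epi64(a, counts);
+     }
+-->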
+<intrinsic tech="AVX-512" name="_mm512_mask_sra_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAD" form="zmm {k}, zmm, xmm" xed="VPSRAD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sra_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAD" form="zmm {z}, zmm, xmm" xed="VPSRAD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srai_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAD" form="zmm {z}, zmm, imm8" xed="VPSRAD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sra_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAD" form="zmm, zmm, xmm" xed="VPSRAD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
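+<!-- Illustrative sketch: arithmetic right shifts replicate the sign bit,
+     so oversized counts saturate a lane to 0 or all-ones. Shifting right
+     by n divides signed lanes by 2^n, rounding toward negative infinity.
+     Assumes AVX-512F.
+
+     #include <immintrin.h>
+
+     __m512i div_by_8_floor_i32(__m512i a) {
+         return _mm512_sra_epi32(a, _mm_cvtsi32_si128(3));
+     }
+-->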
+<intrinsic tech="AVX-512" name="_mm512_mask_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="zmm {k}, zmm, xmm" xed="VPSRAQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="zmm {k}, zmm, imm8" xed="VPSRAQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="zmm {z}, zmm, xmm" xed="VPSRAQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="zmm {z}, zmm, imm8" xed="VPSRAQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sra_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="zmm, zmm, xmm" xed="VPSRAQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srai_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0)
+ ELSE
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAQ" form="zmm, zmm, imm8" xed="VPSRAQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
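+<!-- Illustrative note: 64-bit arithmetic right shifts (VPSRAQ) are new in
+     AVX-512F; SSE2 and AVX2 only provide 16- and 32-bit element widths.
+     A shift by 63 extracts a lane-wise sign mask:
+
+     #include <immintrin.h>
+
+     __m512i sign_mask_i64(__m512i a) {
+         /* 0 for non-negative lanes, all-ones for negative lanes */
+         return _mm512_srai_epi64(a, 63);
+     }
+-->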
+<intrinsic tech="AVX-512" name="_mm512_maskz_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="zmm {z}, zmm, zmm" xed="VPSRAVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="zmm {k}, zmm, zmm" xed="VPSRAVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="zmm {z}, zmm, zmm" xed="VPSRAVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srav_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := SignExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0)
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVQ" form="zmm, zmm, zmm" xed="VPSRAVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
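+<!-- Illustrative sketch: per-lane arithmetic shifts; lanes with counts of
+     64 or more collapse to their sign (0 or all-ones), matching the
+     pseudocode above. Assumes AVX-512F.
+
+     #include <immintrin.h>
+
+     __m512i sar64_per_lane(__m512i a, __m512i counts) {
+         return _mm512_srav_epi64(a, counts);
+     }
+-->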
+<intrinsic tech="AVX-512" name="_mm512_mask_srl_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLD" form="zmm {k}, zmm, xmm" xed="VPSRLD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srl_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLD" form="zmm {z}, zmm, xmm" xed="VPSRLD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLD" form="zmm {z}, zmm, imm8" xed="VPSRLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srl_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLD" form="zmm, zmm, xmm" xed="VPSRLD_ZMMu32_MASKmskw_ZMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srl_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="zmm {k}, zmm, xmm" xed="VPSRLQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="zmm {k}, zmm, imm8" xed="VPSRLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srl_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="zmm {z}, zmm, xmm" xed="VPSRLQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="zmm {z}, zmm, imm8" xed="VPSRLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srl_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="zmm, zmm, xmm" xed="VPSRLQ_ZMMu64_MASKmskw_ZMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srli_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLQ" form="zmm, zmm, imm8" xed="VPSRLQ_ZMMu64_MASKmskw_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
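+<!-- A minimal C usage sketch for the qword right shifts above, assuming a compiler
+with AVX-512F enabled (e.g. -mavx512f); the helper name is illustrative only.
+
+#include <immintrin.h>
+
+// Unsigned divide-by-8 of eight packed qwords. Lanes whose bit in `k` is
+// clear are passed through from `src` (the writemask semantics above).
+static inline __m512i div8_masked(__m512i src, __mmask8 k, __m512i v) {
+    return _mm512_mask_srli_epi64(src, k, v, 3);
+}
+-->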
+<intrinsic tech="AVX-512" name="_mm512_maskz_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="zmm {z}, zmm, zmm" xed="VPSRLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="zmm {k}, zmm, zmm" xed="VPSRLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="zmm {z}, zmm, zmm" xed="VPSRLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_srlv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF count[i+63:i] &lt; 64
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVQ" form="zmm, zmm, zmm" xed="VPSRLVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
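+<!-- A sketch of the variable-count form, same assumptions as above: lane j of
+"count" shifts lane j of "a", and counts of 64 or more zero the lane, exactly as
+in the pseudocode.
+
+#include <immintrin.h>
+
+// Shift each qword of `v` right by its own count; counts >= 64 yield 0.
+static inline __m512i shift_each(__m512i v, __m512i counts) {
+    return _mm512_srlv_epi64(v, counts);
+}
+-->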
+<intrinsic tech="AVX-512" name="_mm512_maskz_sub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBD" form="zmm {z}, zmm, zmm" xed="VPSUBD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_sub_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBQ" form="zmm {k}, zmm, zmm" xed="VPSUBQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sub_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBQ" form="zmm {z}, zmm, zmm" xed="VPSUBQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sub_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBQ" form="zmm, zmm, zmm" xed="VPSUBQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
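+<!-- A short sketch of the zeromask subtraction (illustrative helper name,
+AVX-512F assumed): unselected lanes are forced to zero rather than copied.
+
+#include <immintrin.h>
+
+// Lane-wise a - b, but only in the lanes selected by `k`; the rest become 0.
+static inline __m512i diff_or_zero(__mmask8 k, __m512i a, __m512i b) {
+    return _mm512_maskz_sub_epi64(k, a, b);
+}
+-->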
+<intrinsic tech="AVX-512" name="_mm512_mask_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand Boolean function; the specific function is specified by the value of "imm8". For each bit in each packed 32-bit integer, the corresponding bits from "src", "a", and "b" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit of "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ FOR h := 0 to 31
+ index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="zmm {k}, zmm, zmm, imm8" xed="VPTERNLOGD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand Boolean function; the specific function is specified by the value of "imm8". For each bit in each packed 32-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit of "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ FOR h := 0 to 31
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="zmm {z}, zmm, zmm, imm8" xed="VPTERNLOGD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_ternarylogic_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand Boolean function; the specific function is specified by the value of "imm8". For each bit in each packed 32-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ FOR h := 0 to 31
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPTERNLOGD" form="zmm, zmm, zmm, imm8" xed="VPTERNLOGD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand Boolean function; the specific function is specified by the value of "imm8". For each bit in each packed 64-bit integer, the corresponding bits from "src", "a", and "b" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit of "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ FOR h := 0 to 63
+ index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="zmm {k}, zmm, zmm, imm8" xed="VPTERNLOGQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand Boolean function; the specific function is specified by the value of "imm8". For each bit in each packed 64-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit of "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ FOR h := 0 to 63
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="zmm {z}, zmm, zmm, imm8" xed="VPTERNLOGQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_ternarylogic_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Bitwise ternary logic that can implement any three-operand Boolean function; the specific function is specified by the value of "imm8". For each bit in each packed 64-bit integer, the corresponding bits from "a", "b", and "c" form a 3-bit index into "imm8", and the bit of "imm8" at that index is written to the corresponding bit of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ FOR h := 0 to 63
+ index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+ dst[i+h] := imm8[index[2:0]]
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPTERNLOGQ" form="zmm, zmm, zmm, imm8" xed="VPTERNLOGQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
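+<!-- The imm8 operand is simply the truth table of the desired function, indexed by
+the bit triple formed from the three operands. A classic instance (a sketch, not
+part of this data file) is a bitwise select, whose table works out to 0xCA:
+
+#include <immintrin.h>
+
+// Bitwise select: where a bit of `m` is 1, take the bit from `t`, else from `f`.
+// Truth table over index (m<<2)|(t<<1)|f for indices 7..0: 1,1,0,0,1,0,1,0 = 0xCA.
+static inline __m512i bit_select(__m512i m, __m512i t, __m512i f) {
+    return _mm512_ternarylogic_epi64(m, t, f, 0xCA);
+}
+-->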
+<intrinsic tech="AVX-512" name="_mm512_mask_test_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTMQ" form="k {k}, zmm, zmm" xed="VPTESTMQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_test_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTMQ" form="k, zmm, zmm" xed="VPTESTMQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_testn_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTNMD" form="k {k}, zmm, zmm" xed="VPTESTNMD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_testn_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTNMD" form="k, zmm, zmm" xed="VPTESTNMD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_testn_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTNMQ" form="k {k}, zmm, zmm" xed="VPTESTNMQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_testn_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VPTESTNMQ" form="k, zmm, zmm" xed="VPTESTNMQ_MASKmskw_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
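+<!-- A sketch of the two unmasked mask-producing tests (AVX-512F assumed; helper
+names are illustrative): TESTM answers "any selected bit set?", TESTNM the inverse.
+
+#include <immintrin.h>
+
+// Bit j of the result = ((v & flags) != 0) in qword lane j.
+static inline __mmask8 lanes_with_any_flag(__m512i v, __m512i flags) {
+    return _mm512_test_epi64_mask(v, flags);
+}
+
+// Bit j of the result = ((v & flags) == 0) in qword lane j.
+static inline __mmask8 lanes_with_no_flag(__m512i v, __m512i flags) {
+    return _mm512_testn_epi64_mask(v, flags);
+}
+-->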
+<intrinsic tech="AVX-512" name="_mm512_mask_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHDQ" form="zmm {k}, zmm, zmm" xed="VPUNPCKHDQ_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHDQ" form="zmm {z}, zmm, zmm" xed="VPUNPCKHDQ_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHDQ" form="zmm, zmm, zmm" xed="VPUNPCKHDQ_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHQDQ" form="zmm {k}, zmm, zmm" xed="VPUNPCKHQDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHQDQ" form="zmm {z}, zmm, zmm" xed="VPUNPCKHQDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKHQDQ" form="zmm, zmm, zmm" xed="VPUNPCKHQDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
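+<!-- Note that the interleaves above operate within each 128-bit lane, not across
+the whole register. A sketch (AVX-512F assumed) making the qword ordering explicit:
+
+#include <immintrin.h>
+
+// For a = {a0..a7} and b = {b0..b7} (qwords, low to high), the result is
+// {a1,b1, a3,b3, a5,b5, a7,b7}: the high qword of each 128-bit lane of `a`
+// paired with the matching qword of `b`.
+static inline __m512i unpackhi_qwords(__m512i a, __m512i b) {
+    return _mm512_unpackhi_epi64(a, b);
+}
+-->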
+<intrinsic tech="AVX-512" name="_mm512_mask_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLDQ" form="zmm {k}, zmm, zmm" xed="VPUNPCKLDQ_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLDQ" form="zmm {z}, zmm, zmm" xed="VPUNPCKLDQ_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLDQ" form="zmm, zmm, zmm" xed="VPUNPCKLDQ_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLQDQ" form="zmm {k}, zmm, zmm" xed="VPUNPCKLQDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLQDQ" form="zmm {z}, zmm, zmm" xed="VPUNPCKLQDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPUNPCKLQDQ" form="zmm, zmm, zmm" xed="VPUNPCKLQDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
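+<!-- The low and high interleaves are typically used as a pair; a sketch (same
+assumptions as the examples above):
+
+#include <immintrin.h>
+
+// lo = {a0,b0, a2,b2, a4,b4, a6,b6},  hi = {a1,b1, a3,b3, a5,b5, a7,b7}.
+// Together they pair every qword of `a` with the matching qword of `b`,
+// lane by 128-bit lane.
+static inline void interleave_qwords(__m512i a, __m512i b, __m512i *lo, __m512i *hi) {
+    *lo = _mm512_unpacklo_epi64(a, b);
+    *hi = _mm512_unpackhi_epi64(a, b);
+}
+-->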
+<intrinsic tech="AVX-512" name="_mm512_maskz_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPXORD" form="zmm {z}, zmm, zmm" xed="VPXORD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPXORQ" form="zmm {z}, zmm, zmm" xed="VPXORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
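+<!-- A sketch of the zeromask XOR (illustrative helper, AVX-512F assumed):
+selected lanes are toggled, unselected lanes are zeroed rather than preserved.
+
+#include <immintrin.h>
+
+// XOR `bits` into the qword lanes of `v` selected by `k`; other lanes become 0.
+static inline __m512i toggle_or_zero(__mmask8 k, __m512i v, __m512i bits) {
+    return _mm512_maskz_xor_epi64(k, v, bits);
+}
+-->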
+<intrinsic tech="AVX-512" name="_mm512_mask_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="zmm {k}, zmm" xed="VRCP14PD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="zmm {z}, zmm" xed="VRCP14PD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rcp14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRCP14PD" form="zmm, zmm" xed="VRCP14PD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="zmm {k}, zmm" xed="VRCP14PS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="zmm {z}, zmm" xed="VRCP14PS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rcp14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRCP14PS" form="zmm, zmm" xed="VRCP14PS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
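+<!-- Because the approximation is only good to about 2^-14, a common follow-up is
+one Newton-Raphson step, which roughly squares the error and brings the result to
+near full single precision. A sketch (AVX-512F assumed; the FMA form is one of
+several equivalent refinements):
+
+#include <immintrin.h>
+
+// x0 ~= 1/a to ~14 bits; one Newton step x1 = x0*(2 - a*x0) refines it.
+static inline __m512 fast_recip_ps(__m512 a) {
+    __m512 x0 = _mm512_rcp14_ps(a);
+    __m512 t  = _mm512_fnmadd_ps(a, x0, _mm512_set1_ps(2.0f)); // 2 - a*x0
+    return _mm512_mul_ps(x0, t);
+}
+-->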
+<intrinsic tech="AVX-512" name="_mm_mask_rcp14_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14SD" form="xmm {k}, xmm, xmm" xed="VRCP14SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rcp14_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14SD" form="xmm {z}, xmm, xmm" xed="VRCP14SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rcp14_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+dst[63:0] := (1.0 / b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14SD" form="xmm, xmm, xmm" xed="VRCP14SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rcp14_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14SS" form="xmm {k}, xmm, xmm" xed="VRCP14SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rcp14_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14SS" form="xmm {z}, xmm, xmm" xed="VRCP14SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rcp14_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+dst[31:0] := (1.0 / b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRCP14SS" form="xmm, xmm, xmm" xed="VRCP14SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="zmm {k}, zmm, imm8" xed="VRNDSCALEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_roundscale_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="zmm {k}, zmm, imm8 {sae}" xed="VRNDSCALEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="zmm {z}, zmm, imm8" xed="VRNDSCALEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_roundscale_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="zmm {z}, zmm, imm8 {sae}" xed="VRNDSCALEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_roundscale_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="zmm, zmm, imm8" xed="VRNDSCALEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
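+<!-- Usage sketch for _mm512_roundscale_pd (illustrative only, not part of the
+     Intel data; assumes an AVX512F-capable host and a flag such as
+     gcc -O2 -mavx512f). Per RoundScaleFP64 above, imm8[7:4] is M, the number
+     of fraction bits kept, and imm8[3:0] is the rounding mode (0 = nearest),
+     so each element is rounded to the nearest multiple of 2^-M.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m512d v = _mm512_set1_pd(1.23456789);
+    /* M = 2 fraction bits, rounding mode 0 (nearest): round each element
+       to the nearest multiple of 2^-2 = 0.25 */
+    __m512d r = _mm512_roundscale_pd(v, (2 << 4) | 0);
+    double out[8];
+    _mm512_storeu_pd(out, r);
+    printf("%f\n", out[0]);   /* prints 1.250000 */
+    return 0;
+}
+-->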
+<intrinsic tech="AVX-512" name="_mm512_roundscale_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPD" form="zmm, zmm, imm8 {sae}" xed="VRNDSCALEPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
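+<!-- Sketch of the {sae} form (illustrative; same build assumptions as the
+     example above, and the helper name is invented). The extra sae operand
+     lets the caller suppress floating-point exceptions for this one
+     operation.
+
+#include <immintrin.h>
+
+__m512d truncate_to_sixteenths(__m512d v) {
+    /* M = 4 fraction bits, rounding mode 3 (truncate), exceptions
+       suppressed via _MM_FROUND_NO_EXC */
+    return _mm512_roundscale_round_pd(v, (4 << 4) | 3, _MM_FROUND_NO_EXC);
+}
+-->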
+<intrinsic tech="AVX-512" name="_mm512_mask_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="zmm {k}, zmm, imm8" xed="VRNDSCALEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_roundscale_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="zmm {k}, zmm, imm8 {sae}" xed="VRNDSCALEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="zmm {z}, zmm, imm8" xed="VRNDSCALEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
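+<!-- Sketch contrasting the writemask and zeromask ps forms (illustrative;
+     same build assumptions as above, helper name invented): unselected
+     lanes are taken from src in the mask form and are zeroed in the maskz
+     form.
+
+#include <immintrin.h>
+
+void mask_vs_maskz(__m512 src, __m512 a, __m512 *merged, __m512 *zeroed) {
+    __mmask16 k = 0x00FF;  /* operate on the low 8 of 16 lanes only */
+    *merged = _mm512_mask_roundscale_ps(src, k, a, (3 << 4) | 0);
+    *zeroed = _mm512_maskz_roundscale_ps(k, a, (3 << 4) | 0);
+}
+-->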
+<intrinsic tech="AVX-512" name="_mm512_maskz_roundscale_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="zmm {z}, zmm, imm8 {sae}" xed="VRNDSCALEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_roundscale_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="zmm, zmm, imm8" xed="VRNDSCALEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_roundscale_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDSCALEPS" form="zmm, zmm, imm8 {sae}" xed="VRNDSCALEPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_roundscale_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+IF k[0]
+ dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESD" form="xmm {k}, xmm, xmm, imm8 {sae}" xed="VRNDSCALESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_roundscale_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+IF k[0]
+ dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESD" form="xmm {k}, xmm, xmm, imm8" xed="VRNDSCALESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_roundscale_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+IF k[0]
+ dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESD" form="xmm {z}, xmm, xmm, imm8 {sae}" xed="VRNDSCALESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_roundscale_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+IF k[0]
+ dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESD" form="xmm {z}, xmm, xmm, imm8" xed="VRNDSCALESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_roundscale_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESD" form="xmm, xmm, xmm, imm8 {sae}" xed="VRNDSCALESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_roundscale_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) {
+ m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0])
+ IF IsInf(tmp[63:0])
+ tmp[63:0] := src1[63:0]
+ FI
+ RETURN tmp[63:0]
+}
+dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESD" form="xmm, xmm, xmm, imm8" xed="VRNDSCALESD_XMMf64_MASKmskw_XMMf64_XMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
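+<!-- Sketch of the scalar sd form (illustrative; same build assumptions as
+     above): only b[63:0] is rounded, and the upper element of dst comes
+     from a.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m128d a = _mm_set_pd(99.0, 0.0);     /* upper lane = 99.0 */
+    __m128d b = _mm_set_pd(0.0, 2.71828);  /* lower lane = e    */
+    /* M = 1 fraction bit, nearest: 2.71828 rounds to 2.5 */
+    __m128d r = _mm_roundscale_sd(a, b, (1 << 4) | 0);
+    double out[2];
+    _mm_storeu_pd(out, r);
+    printf("%f %f\n", out[0], out[1]);     /* 2.500000 99.000000 */
+    return 0;
+}
+-->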
+<intrinsic tech="AVX-512" name="_mm_mask_roundscale_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+IF k[0]
+ dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESS" form="xmm {k}, xmm, xmm, imm8 {sae}" xed="VRNDSCALESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_roundscale_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+IF k[0]
+ dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESS" form="xmm {k}, xmm, xmm, imm8" xed="VRNDSCALESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_roundscale_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+IF k[0]
+ dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESS" form="xmm {z}, xmm, xmm, imm8 {sae}" xed="VRNDSCALESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_roundscale_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+IF k[0]
+ dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESS" form="xmm {z}, xmm, xmm, imm8" xed="VRNDSCALESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_roundscale_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESS" form="xmm, xmm, xmm, imm8 {sae}" xed="VRNDSCALESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_roundscale_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_MM_REDUCE"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note]</description>
+ <operation>
+DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) {
+ m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved
+ tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0])
+ IF IsInf(tmp[31:0])
+ tmp[31:0] := src1[31:0]
+ FI
+ RETURN tmp[31:0]
+}
+dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRNDSCALESS" form="xmm, xmm, xmm, imm8" xed="VRNDSCALESS_XMMf32_MASKmskw_XMMf32_XMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
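+<!-- Sketch of the scalar ss form (illustrative; same build assumptions as
+     above, helper name invented): lane 0 is rounded from b, lanes 1..3 are
+     copied from a.
+
+#include <immintrin.h>
+
+__m128 round_low_to_quarters(__m128 a, __m128 b) {
+    return _mm_roundscale_ss(a, b, (2 << 4) | 0);  /* nearest 0.25 */
+}
+-->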
+<intrinsic tech="AVX-512" name="_mm512_mask_rsqrt14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRSQRT14PD" form="zmm {k}, zmm" xed="VRSQRT14PD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rsqrt14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRSQRT14PD" form="zmm {z}, zmm" xed="VRSQRT14PD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rsqrt14_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (1.0 / SQRT(a[i+63:i]))
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRSQRT14PD" form="zmm, zmm" xed="VRSQRT14PD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
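+<!-- Sketch (illustrative; same build assumptions as above, helper name
+     invented): the 2^-14 approximation can be refined with one
+     Newton-Raphson step, y' = y * (1.5 - 0.5 * x * y * y), a standard
+     follow-up technique that is not part of the intrinsic itself.
+
+#include <immintrin.h>
+
+__m512d rsqrt_refined(__m512d x) {
+    __m512d y  = _mm512_rsqrt14_pd(x);
+    __m512d hx = _mm512_mul_pd(_mm512_set1_pd(0.5), x);
+    /* t = 1.5 - (0.5 * x * y) * y via fused negate-multiply-add */
+    __m512d t  = _mm512_fnmadd_pd(_mm512_mul_pd(hx, y), y,
+                                  _mm512_set1_pd(1.5));
+    return _mm512_mul_pd(y, t);
+}
+-->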
+<intrinsic tech="AVX-512" name="_mm512_mask_rsqrt14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRSQRT14PS" form="zmm {k}, zmm" xed="VRSQRT14PS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_rsqrt14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRSQRT14PS" form="zmm {z}, zmm" xed="VRSQRT14PS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_rsqrt14_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRSQRT14PS" form="zmm, zmm" xed="VRSQRT14PS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
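+<!-- Sketch (illustrative; same build assumptions as above, helper name
+     invented): with relative error below 2^-14, the ps form is often
+     accurate enough on its own, e.g. for normalizing vectors by their
+     squared length.
+
+#include <immintrin.h>
+
+__m512 normalize_by_len2(__m512 v, __m512 len2) {
+    return _mm512_mul_ps(v, _mm512_rsqrt14_ps(len2));
+}
+-->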
+<intrinsic tech="AVX-512" name="_mm_mask_rsqrt14_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / SQRT(b[63:0]))
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14SD" form="xmm {k}, xmm, xmm" xed="VRSQRT14SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rsqrt14_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+IF k[0]
+ dst[63:0] := (1.0 / SQRT(b[63:0]))
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14SD" form="xmm {z}, xmm, xmm" xed="VRSQRT14SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rsqrt14_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+dst[63:0] := (1.0 / SQRT(b[63:0]))
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14SD" form="xmm, xmm, xmm" xed="VRSQRT14SD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_rsqrt14_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / SQRT(b[31:0]))
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14SS" form="xmm {k}, xmm, xmm" xed="VRSQRT14SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_rsqrt14_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+IF k[0]
+ dst[31:0] := (1.0 / SQRT(b[31:0]))
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14SS" form="xmm {z}, xmm, xmm" xed="VRSQRT14SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_rsqrt14_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+ <operation>
+dst[31:0] := (1.0 / SQRT(b[31:0]))
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VRSQRT14SS" form="xmm, xmm, xmm" xed="VRSQRT14SS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
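+<!-- Sketch of the scalar ss form (illustrative; same build assumptions as
+     above, helper name invented): only lane 0 is computed; lanes 1..3 are
+     copied from a, matching the operation above.
+
+#include <immintrin.h>
+
+__m128 rsqrt_low(__m128 a, __m128 b) {
+    return _mm_rsqrt14_ss(a, b);
+}
+-->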
+<intrinsic tech="AVX-512" name="_mm512_mask_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="zmm {k}, zmm, zmm" xed="VSCALEFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_scalef_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="zmm {k}, zmm, zmm {er}" xed="VSCALEFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="zmm {z}, zmm, zmm" xed="VSCALEFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_scalef_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="zmm {z}, zmm, zmm {er}" xed="VSCALEFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_scalef_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="zmm, zmm, zmm" xed="VSCALEFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
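+<!-- Usage sketch for _mm512_scalef_pd (illustrative; same build assumptions
+     as above): per the SCALE pseudocode, scalef computes a * 2^FLOOR(b),
+     i.e. a vectorized ldexp whose exponents are supplied as doubles.
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    __m512d a = _mm512_set1_pd(3.0);
+    __m512d e = _mm512_set1_pd(4.7);  /* FLOOR(4.7) = 4 */
+    double out[8];
+    _mm512_storeu_pd(out, _mm512_scalef_pd(a, e));
+    printf("%f\n", out[0]);           /* 3.0 * 2^4 = 48.000000 */
+    return 0;
+}
+-->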
+<intrinsic tech="AVX-512" name="_mm512_scalef_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPD" form="zmm, zmm, zmm {er}" xed="VSCALEFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="zmm {k}, zmm, zmm" xed="VSCALEFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_scalef_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="zmm {k}, zmm, zmm {er}" xed="VSCALEFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="zmm {z}, zmm, zmm" xed="VSCALEFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_scalef_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="zmm {z}, zmm, zmm {er}" xed="VSCALEFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_scalef_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="zmm, zmm, zmm" xed="VSCALEFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
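+<!-- Sketch (illustrative; same build assumptions as above, helper name
+     invented): scaling by a negated exponent gives an exact
+     divide-by-power-of-two with no division instruction, assuming n holds
+     whole numbers.
+
+#include <immintrin.h>
+
+__m512 div_pow2(__m512 x, __m512 n) {
+    return _mm512_scalef_ps(x, _mm512_sub_ps(_mm512_setzero_ps(), n));
+}
+-->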
+<intrinsic tech="AVX-512" name="_mm512_scalef_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEFPS" form="zmm, zmm, zmm {er}" xed="VSCALEFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
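+<!-- Editor's note: a minimal C usage sketch for the packed scalef family
+     above (illustration only, not part of the Intel data file). It assumes
+     an AVX-512F capable CPU and a compiler flag such as -mavx512f; the
+     intrinsic computes a * 2^FLOOR(b) in each 32-bit lane.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512 a = _mm512_set1_ps(3.0f);
+         __m512 b = _mm512_set1_ps(2.5f);      /* FLOOR(2.5) = 2 */
+         __m512 r = _mm512_scalef_ps(a, b);    /* 3.0 * 2^2 = 12.0 per lane */
+         float out[16];
+         _mm512_storeu_ps(out, r);
+         printf("%f\n", out[0]);               /* prints 12.000000 */
+         return 0;
+     }
+-->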
+<intrinsic tech="AVX-512" name="_mm_mask_scalef_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+IF k[0]
+ dst[63:0] := SCALE(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSD" form="xmm {k}, xmm, xmm {er}" xed="VSCALEFSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_scalef_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+IF k[0]
+ dst[63:0] := SCALE(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSD" form="xmm {k}, xmm, xmm" xed="VSCALEFSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_scalef_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+IF k[0]
+ dst[63:0] := SCALE(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSD" form="xmm {z}, xmm, xmm {er}" xed="VSCALEFSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_scalef_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+IF k[0]
+ dst[63:0] := SCALE(a[63:0], b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSD" form="xmm {z}, xmm, xmm" xed="VSCALEFSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_scalef_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+dst[63:0] := SCALE(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSD" form="xmm, xmm, xmm {er}" xed="VSCALEFSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_scalef_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0]))
+ RETURN dst[63:0]
+}
+dst[63:0] := SCALE(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSD" form="xmm, xmm, xmm" xed="VSCALEFSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
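+<!-- Editor's note: a scalar sketch for _mm_scalef_sd (illustration only,
+     assuming AVX-512F). Only the low double is scaled; the upper double is
+     copied straight from "a".
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128d a = _mm_set_pd(7.0, 1.5);  /* high = 7.0, low = 1.5 */
+         __m128d b = _mm_set_pd(0.0, 3.0);  /* only the low element is used */
+         __m128d r = _mm_scalef_sd(a, b);   /* low: 1.5 * 2^3 = 12.0 */
+         double out[2];
+         _mm_storeu_pd(out, r);
+         printf("low=%f high=%f\n", out[0], out[1]); /* 12.000000, 7.000000 */
+         return 0;
+     }
+-->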
+<intrinsic tech="AVX-512" name="_mm_mask_scalef_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+IF k[0]
+ dst[31:0] := SCALE(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSS" form="xmm {k}, xmm, xmm {er}" xed="VSCALEFSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_scalef_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+IF k[0]
+ dst[31:0] := SCALE(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSS" form="xmm {k}, xmm, xmm" xed="VSCALEFSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_scalef_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+IF k[0]
+ dst[31:0] := SCALE(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSS" form="xmm {z}, xmm, xmm {er}" xed="VSCALEFSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_scalef_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+IF k[0]
+ dst[31:0] := SCALE(a[31:0], b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSS" form="xmm {z}, xmm, xmm" xed="VSCALEFSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_scalef_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+dst[31:0] := SCALE(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSS" form="xmm, xmm, xmm {er}" xed="VSCALEFSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_scalef_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>DEFINE SCALE(src1, src2) {
+ IF (src2 == NaN)
+ IF (src2 == SNaN)
+ RETURN QNAN(src2)
+ FI
+ ELSE IF (src1 == NaN)
+ IF (src1 == SNaN)
+ RETURN QNAN(src1)
+ FI
+ IF (src2 != INF)
+ RETURN QNAN(src1)
+ FI
+ ELSE
+ tmp_src2 := src2
+ tmp_src1 := src1
+ IF (IS_DENORMAL(src2) AND MXCSR.DAZ)
+ tmp_src2 := 0
+ FI
+ IF (IS_DENORMAL(src1) AND MXCSR.DAZ)
+ tmp_src1 := 0
+ FI
+ FI
+ dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0]))
+ RETURN dst[31:0]
+}
+dst[31:0] := SCALE(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSCALEFSS" form="xmm, xmm, xmm" xed="VSCALEFSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
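+<!-- Editor's note: a zero-masking sketch for the scalar ss forms above
+     (illustration only, assuming AVX-512F). With mask bit 0 clear,
+     _mm_maskz_scalef_ss zeroes the low lane but still copies the upper
+     three lanes from "a".
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
+         __m128 b = _mm_set1_ps(2.0f);
+         __m128 r = _mm_maskz_scalef_ss((__mmask8)0, a, b);
+         float out[4];
+         _mm_storeu_ps(out, r);
+         printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]); /* 0 2 3 4 */
+         return 0;
+     }
+-->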
+<intrinsic tech="AVX-512" name="_mm512_i32scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="vm32y, zmm" xed="VSCATTERDPD_MEMf64_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i32scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="vm32y {k}, zmm" xed="VSCATTERDPD_MEMf64_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
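+<!-- Editor's note: a scatter sketch (illustration only, assuming
+     AVX-512F). With scale = 8 each 32-bit index counts 8-byte slots, so
+     the eight doubles land at buf[0], buf[2], ..., buf[14].
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         double buf[16] = {0};
+         __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+         __m512d a   = _mm512_set1_pd(1.0);
+         _mm512_i32scatter_pd(buf, idx, a, 8);
+         printf("%f %f\n", buf[0], buf[1]);  /* 1.000000 0.000000 */
+         return 0;
+     }
+-->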
+<intrinsic tech="AVX-512" name="_mm512_i64scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPD" form="vm32z, zmm" xed="VSCATTERQPD_MEMf64_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i64scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPD" form="vm32z {k}, zmm" xed="VSCATTERQPD_MEMf64_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_i64scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPS" form="vm32z, ymm" xed="VSCATTERQPS_MEMf32_MASKmskw_YMMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_i64scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERQPS" form="vm32z {k}, ymm" xed="VSCATTERQPS_MEMf32_MASKmskw_YMMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
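+<!-- Editor's note: a masked-scatter sketch (illustration only, assuming
+     AVX-512F). Only lanes whose mask bit is set are written; here the low
+     four of the eight floats are stored with scale = 4 (4-byte elements).
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         float buf[8] = {0};
+         __m512i idx = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+         __m256  a   = _mm256_set1_ps(5.0f);
+         _mm512_mask_i64scatter_ps(buf, 0x0F, idx, a, 4);
+         printf("%f %f\n", buf[3], buf[4]);  /* 5.000000 0.000000 */
+         return 0;
+     }
+-->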
+<intrinsic tech="AVX-512" name="_mm512_mask_shuffle_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFF32X4" form="zmm {k}, zmm, zmm, imm8" xed="VSHUFF32X4_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shuffle_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFF32X4" form="zmm {z}, zmm, zmm, imm8" xed="VSHUFF32X4_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shuffle_f32x4">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFF32X4" form="zmm, zmm, zmm, imm8" xed="VSHUFF32X4_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
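+<!-- Editor's note: a lane-shuffle sketch (illustration only, assuming
+     AVX-512F). imm8 holds four 2-bit selectors: the low two pick 128-bit
+     lanes of "a" for the low half of dst, the high two pick lanes of "b"
+     for the high half. 0x4E selects a.lane2, a.lane3, b.lane0, b.lane1.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512 a = _mm512_setr_ps(0, 1, 2, 3, 4, 5, 6, 7,
+                                   8, 9, 10, 11, 12, 13, 14, 15);
+         __m512 b = _mm512_set1_ps(99.0f);
+         __m512 r = _mm512_shuffle_f32x4(a, b, 0x4E);
+         float out[16];
+         _mm512_storeu_ps(out, r);
+         printf("%f %f\n", out[0], out[8]);  /* 8.000000 99.000000 */
+         return 0;
+     }
+-->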
+<intrinsic tech="AVX-512" name="_mm512_mask_shuffle_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFF64X2" form="zmm {k}, zmm, zmm, imm8" xed="VSHUFF64X2_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shuffle_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFF64X2" form="zmm {z}, zmm, zmm, imm8" xed="VSHUFF64X2_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shuffle_f64x2">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFF64X2" form="zmm, zmm, zmm, imm8" xed="VSHUFF64X2_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shuffle_i32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFI32X4" form="zmm {k}, zmm, zmm, imm8" xed="VSHUFI32X4_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shuffle_i32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFI32X4" form="zmm {z}, zmm, zmm, imm8" xed="VSHUFI32X4_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shuffle_i32x4">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFI32X4" form="zmm, zmm, zmm, imm8" xed="VSHUFI32X4_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shuffle_i64x2">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFI64X2" form="zmm {k}, zmm, zmm, imm8" xed="VSHUFI64X2_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shuffle_i64x2">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFI64X2" form="zmm {z}, zmm, zmm, imm8" xed="VSHUFI64X2_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shuffle_i64x2">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFI64X2" form="zmm, zmm, zmm, imm8" xed="VSHUFI64X2_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
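+<!-- Editor's note: the same 2-bit lane-selection scheme applies to the
+     integer forms (illustration only, assuming AVX-512F). imm8 = 0
+     broadcasts lane 0 of "a" into both low lanes and lane 0 of "b" into
+     both high lanes.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512i a = _mm512_setr_epi64(10, 11, 12, 13, 14, 15, 16, 17);
+         __m512i b = _mm512_setr_epi64(20, 21, 22, 23, 24, 25, 26, 27);
+         __m512i r = _mm512_shuffle_i64x2(a, b, 0x00);
+         long long out[8];
+         _mm512_storeu_si512((__m512i*)out, r);
+         printf("%lld %lld %lld\n", out[0], out[2], out[4]); /* 10 10 20 */
+         return 0;
+     }
+-->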
+<intrinsic tech="AVX-512" name="_mm512_mask_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
+tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
+tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
+tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFPD" form="zmm {k}, zmm, zmm, imm8" xed="VSHUFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
+tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
+tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
+tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFPD" form="zmm {z}, zmm, zmm, imm8" xed="VSHUFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
+dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
+dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
+dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFPD" form="zmm, zmm, zmm, imm8" xed="VSHUFPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
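+<!-- Editor's note: a VSHUFPD sketch (illustration only, assuming
+     AVX-512F). Each imm8 bit picks the low (0) or high (1) element of its
+     128-bit lane; even bits select from "a", odd bits from "b".
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512d a = _mm512_setr_pd(0, 1, 2, 3, 4, 5, 6, 7);
+         __m512d b = _mm512_setr_pd(10, 11, 12, 13, 14, 15, 16, 17);
+         __m512d r = _mm512_shuffle_pd(a, b, 0x03); /* lane 0 takes the high halves */
+         double out[8];
+         _mm512_storeu_pd(out, r);
+         printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]); /* 1 11 2 12 */
+         return 0;
+     }
+-->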
+<intrinsic tech="AVX-512" name="_mm512_mask_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFPS" form="zmm {k}, zmm, zmm, imm8" xed="VSHUFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFPS" form="zmm {z}, zmm, zmm, imm8" xed="VSHUFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+dst[351:320] := SELECT4(b[383:256], imm8[5:4])
+dst[383:352] := SELECT4(b[383:256], imm8[7:6])
+dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+dst[479:448] := SELECT4(b[511:384], imm8[5:4])
+dst[511:480] := SELECT4(b[511:384], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSHUFPS" form="zmm, zmm, zmm, imm8" xed="VSHUFPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
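+<!-- Editor's note: a VSHUFPS sketch using the classic _MM_SHUFFLE macro
+     (illustration only, assuming AVX-512F). The same four 2-bit selectors
+     are applied independently inside each 128-bit lane.
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m512 a = _mm512_setr_ps(0, 1, 2, 3, 4, 5, 6, 7,
+                                   8, 9, 10, 11, 12, 13, 14, 15);
+         __m512 b = _mm512_set1_ps(50.0f);
+         /* per lane: a[3], a[2], b[1], b[0] */
+         __m512 r = _mm512_shuffle_ps(a, b, _MM_SHUFFLE(0, 1, 2, 3));
+         float out[16];
+         _mm512_storeu_ps(out, r);
+         printf("%f %f %f\n", out[0], out[1], out[2]); /* 3 2 50 */
+         return 0;
+     }
+-->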
+<intrinsic tech="AVX-512" name="_mm512_mask_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="zmm {k}, zmm" xed="VSQRTPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_sqrt_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="zmm {k}, zmm {er}" xed="VSQRTPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="zmm {z}, zmm" xed="VSQRTPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sqrt_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="zmm {z}, zmm {er}" xed="VSQRTPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="zmm, zmm" xed="VSQRTPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sqrt_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".
+	[round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPD" form="zmm, zmm {er}" xed="VSQRTPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
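+<!-- Illustrative usage sketch, not part of the generated data: the "rounding"
+     operand of the _round_ variants must be a compile-time constant built
+     from the _MM_FROUND flags referenced by [round_note], for example:
+
+     #include <immintrin.h>
+
+     __m512d sqrt_rn(__m512d a) {
+         /* round to nearest, suppress floating-point exceptions (SAE) */
+         return _mm512_sqrt_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+     }
+-->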
+<intrinsic tech="AVX-512" name="_mm512_mask_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="zmm {k}, zmm" xed="VSQRTPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_sqrt_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="zmm {k}, zmm {er}" xed="VSQRTPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="zmm {z}, zmm" xed="VSQRTPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sqrt_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="zmm {z}, zmm {er}" xed="VSQRTPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="zmm, zmm" xed="VSQRTPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_sqrt_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".
+	[round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSQRTPS" form="zmm, zmm {er}" xed="VSQRTPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
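+<!-- Illustrative usage sketch, not part of the generated data: one common use
+     of the writemask form is to apply sqrt only where it is defined, keeping
+     the input in lanes that would produce NaN. "safe_sqrt" is an arbitrary name.
+
+     #include <immintrin.h>
+
+     __m512 safe_sqrt(__m512 x) {
+         __mmask16 nonneg = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_GE_OQ);
+         return _mm512_mask_sqrt_ps(x, nonneg, x);
+     }
+-->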
+<intrinsic tech="AVX-512" name="_mm_mask_sqrt_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := SQRT(b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSD" form="xmm {k}, xmm, xmm {er}" xed="VSQRTSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sqrt_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := SQRT(b[63:0])
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSD" form="xmm {k}, xmm, xmm" xed="VSQRTSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sqrt_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := SQRT(b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSD" form="xmm {z}, xmm, xmm {er}" xed="VSQRTSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sqrt_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := SQRT(b[63:0])
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSD" form="xmm {z}, xmm, xmm" xed="VSQRTSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_sqrt_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := SQRT(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSD" form="xmm, xmm, xmm {er}" xed="VSQRTSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
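+<!-- Illustrative usage sketch, not part of the generated data: the scalar
+     (_sd/_ss) forms only touch the lowest element; the upper part of "dst"
+     always comes from "a". For example:
+
+     #include <immintrin.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128d a = _mm_set_pd(7.0, 1.0);   /* {low = 1.0, high = 7.0} */
+         __m128d b = _mm_set_pd(9.0, 16.0);  /* only b[0] = 16.0 is used */
+         __m128d r = _mm_mask_sqrt_sd(a, 1, a, b);
+         double out[2];
+         _mm_storeu_pd(out, r);
+         printf("%f %f\n", out[0], out[1]);  /* 4.000000 7.000000 */
+         return 0;
+     }
+-->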
+<intrinsic tech="AVX-512" name="_mm_mask_sqrt_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := SQRT(b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSS" form="xmm {k}, xmm, xmm {er}" xed="VSQRTSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sqrt_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := SQRT(b[31:0])
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSS" form="xmm {k}, xmm, xmm" xed="VSQRTSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sqrt_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := SQRT(b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSS" form="xmm {z}, xmm, xmm {er}" xed="VSQRTSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sqrt_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := SQRT(b[31:0])
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSS" form="xmm {z}, xmm, xmm" xed="VSQRTSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_sqrt_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := SQRT(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSQRTSS" form="xmm, xmm, xmm {er}" xed="VSQRTSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPD" form="zmm {z}, zmm, zmm" xed="VSUBPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPD" form="zmm {z}, zmm, zmm {er}" xed="VSUBPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPS" form="zmm {z}, zmm, zmm" xed="VSUBPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_sub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPS" form="zmm {z}, zmm, zmm {er}" xed="VSUBPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] - b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSD" form="xmm {k}, xmm, xmm {er}" xed="VSUBSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sub_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] - b[63:0]
+ELSE
+ dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSD" form="xmm {k}, xmm, xmm" xed="VSUBSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] - b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSD" form="xmm {z}, xmm, xmm {er}" xed="VSUBSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+IF k[0]
+ dst[63:0] := a[63:0] - b[63:0]
+ELSE
+ dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSD" form="xmm {z}, xmm, xmm" xed="VSUBSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_sub_round_sd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := a[63:0] - b[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSD" form="xmm, xmm, xmm {er}" xed="VSUBSD_XMMf64_MASKmskw_XMMf64_XMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] - b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSS" form="xmm {k}, xmm, xmm {er}" xed="VSUBSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_sub_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] - b[31:0]
+ELSE
+ dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSS" form="xmm {k}, xmm, xmm" xed="VSUBSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] - b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSS" form="xmm {z}, xmm, xmm {er}" xed="VSUBSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_sub_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+IF k[0]
+ dst[31:0] := a[31:0] - b[31:0]
+ELSE
+ dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSS" form="xmm {z}, xmm, xmm" xed="VSUBSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_sub_round_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := a[31:0] - b[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VSUBSS" form="xmm, xmm, xmm {er}" xed="VSUBSS_XMMf32_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
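+<!-- Illustrative usage sketch, not part of the generated data: the masked
+     subtraction variants follow the same merge/zero pattern as the sqrt
+     entries above. "masked_diff" is an arbitrary name.
+
+     #include <immintrin.h>
+
+     /* lane i is a[i] - b[i] where bit i of k is set, 0.0f elsewhere */
+     __m512 masked_diff(__mmask16 k, __m512 a, __m512 b) {
+         return _mm512_maskz_sub_ps(k, a, b);
+     }
+-->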
+<intrinsic tech="AVX-512" name="_mm512_mask_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKHPD" form="zmm {k}, zmm, zmm" xed="VUNPCKHPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKHPD" form="zmm {z}, zmm, zmm" xed="VUNPCKHPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKHPD" form="zmm, zmm, zmm" xed="VUNPCKHPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKHPS" form="zmm {k}, zmm, zmm" xed="VUNPCKHPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKHPS" form="zmm {z}, zmm, zmm" xed="VUNPCKHPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKHPS" form="zmm, zmm, zmm" xed="VUNPCKHPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKLPD" form="zmm {k}, zmm, zmm" xed="VUNPCKLPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := tmp_dst[i+63:i]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKLPD" form="zmm {z}, zmm, zmm" xed="VUNPCKLPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKLPD" form="zmm, zmm, zmm" xed="VUNPCKLPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKLPS" form="zmm {k}, zmm, zmm" xed="VUNPCKLPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKLPS" form="zmm {z}, zmm, zmm" xed="VUNPCKLPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VUNPCKLPS" form="zmm, zmm, zmm" xed="VUNPCKLPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
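+<!-- Illustrative note, not part of the generated data: as the pseudocode above
+     shows, the 512-bit unpack intrinsics interleave within each 128-bit lane,
+     not across the full vector. With doubles a = {a0..a7} and b = {b0..b7}
+     (lowest element first):
+
+     #include <immintrin.h>
+
+     /* {a1,b1, a3,b3, a5,b5, a7,b7} */
+     __m512d hi(__m512d a, __m512d b) { return _mm512_unpackhi_pd(a, b); }
+     /* {a0,b0, a2,b2, a4,b4, a6,b6} */
+     __m512d lo(__m512d a, __m512d b) { return _mm512_unpacklo_pd(a, b); }
+-->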
+<intrinsic tech="AVX-512" name="_mm512_castpd128_pd512">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castpd256_pd512">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castpd512_pd128">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m512d to type __m128d.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castps512_ps128">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m512 to type __m128.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castpd512_pd256">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m512d to type __m256d.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castps128_ps512">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castps256_ps512">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castps512_ps256">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m512 to type __m256.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castsi128_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m128i" varname="a" etype="M512"/>
+ <description>Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castsi256_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m256i" varname="a" etype="M512"/>
+ <description>Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castsi512_si128">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m512i" varname="a" etype="M128"/>
+ <description>Cast vector of type __m512i to type __m128i.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_castsi512_si256">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m256i" varname="dst" etype="M256"/>
+ <parameter type="__m512i" varname="a" etype="M256"/>
+ <description>Cast vector of type __m512i to type __m256i.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
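+<!-- Illustrative usage sketch, not part of the generated data: the cast
+     intrinsics reinterpret a register without moving data, so they are free.
+     Widening casts leave the upper bits undefined, so only the low part of
+     the result may be relied upon:
+
+     #include <immintrin.h>
+
+     __m128d low_half(__m512d v) { return _mm512_castpd512_pd128(v); }
+     __m512d widen(__m128d v)    { return _mm512_castpd128_pd512(v); }  /* upper 384 bits undefined */
+-->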
+<intrinsic tech="AVX-512" name="_mm512_zextpd128_pd512">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_zextps128_ps512">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_zextsi128_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m128i" varname="a" etype="M512"/>
+ <description>Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_zextpd256_pd512">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_zextps256_ps512">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_zextsi256_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Cast</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m256i" varname="a" etype="M512"/>
+ <description>Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
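+<!-- Illustrative usage sketch, not part of the generated data: the zext
+     variants differ from the plain casts above in that the upper bits are
+     guaranteed zero, which may cost the compiler a zeroing move:
+
+     #include <immintrin.h>
+
+     __m512d widen_zeroed(__m128d v) { return _mm512_zextpd128_pd512(v); }  /* upper 384 bits are 0 */
+-->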
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set1_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="a" etype="FP64"/>
+ <description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set1_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+ <description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
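+<!-- A minimal sketch of the broadcast set1 forms above (assuming AVX512F;
+     names are illustrative):
+
+     #include <immintrin.h>
+
+     __m512d vd = _mm512_set1_pd(3.5);   /* all 8 FP64 lanes hold 3.5   */
+     __m512  vs = _mm512_set1_ps(2.5f);  /* all 16 FP32 lanes hold 2.5f */
+
+     These entries carry sequence="TRUE": the compiler may emit a single
+     broadcast instruction or an instruction sequence, at its discretion. -->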
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set4_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="d" etype="UI32"/>
+ <parameter type="int" varname="c" etype="UI32"/>
+ <parameter type="int" varname="b" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+	<description>Set packed 32-bit integers in "dst" with the repeated 4-element sequence.</description>
+ <operation>
+dst[31:0] := a
+dst[63:32] := b
+dst[95:64] := c
+dst[127:96] := d
+dst[159:128] := a
+dst[191:160] := b
+dst[223:192] := c
+dst[255:224] := d
+dst[287:256] := a
+dst[319:288] := b
+dst[351:320] := c
+dst[383:352] := d
+dst[415:384] := a
+dst[447:416] := b
+dst[479:448] := c
+dst[511:480] := d
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set4_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="d" etype="UI64"/>
+ <parameter type="__int64" varname="c" etype="UI64"/>
+ <parameter type="__int64" varname="b" etype="UI64"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+	<description>Set packed 64-bit integers in "dst" with the repeated 4-element sequence.</description>
+ <operation>
+dst[63:0] := a
+dst[127:64] := b
+dst[191:128] := c
+dst[255:192] := d
+dst[319:256] := a
+dst[383:320] := b
+dst[447:384] := c
+dst[511:448] := d
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set4_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="d" etype="FP64"/>
+ <parameter type="double" varname="c" etype="FP64"/>
+ <parameter type="double" varname="b" etype="FP64"/>
+ <parameter type="double" varname="a" etype="FP64"/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4-element sequence.</description>
+ <operation>
+dst[63:0] := a
+dst[127:64] := b
+dst[191:128] := c
+dst[255:192] := d
+dst[319:256] := a
+dst[383:320] := b
+dst[447:384] := c
+dst[511:448] := d
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set4_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="d" etype="FP32"/>
+ <parameter type="float" varname="c" etype="FP32"/>
+ <parameter type="float" varname="b" etype="FP32"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4-element sequence.</description>
+ <operation>
+dst[31:0] := a
+dst[63:32] := b
+dst[95:64] := c
+dst[127:96] := d
+dst[159:128] := a
+dst[191:160] := b
+dst[223:192] := c
+dst[255:224] := d
+dst[287:256] := a
+dst[319:288] := b
+dst[351:320] := c
+dst[383:352] := d
+dst[415:384] := a
+dst[447:416] := b
+dst[479:448] := c
+dst[511:480] := d
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
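+<!-- The set4 forms tile a 4-element pattern across the register. Note the
+     argument order in the pseudocode above: the last argument ("a") lands in
+     the lowest lane. A sketch (AVX512F assumed):
+
+     #include <immintrin.h>
+
+     __m512i v = _mm512_set4_epi32(1, 2, 3, 4);
+     /* lane 0 = 4, lane 1 = 3, lane 2 = 2, lane 3 = 1; pattern repeats 4x */
+     -->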
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="char" varname="e63" etype="UI8"/>
+ <parameter type="char" varname="e62" etype="UI8"/>
+ <parameter type="char" varname="e61" etype="UI8"/>
+ <parameter type="char" varname="e60" etype="UI8"/>
+ <parameter type="char" varname="e59" etype="UI8"/>
+ <parameter type="char" varname="e58" etype="UI8"/>
+ <parameter type="char" varname="e57" etype="UI8"/>
+ <parameter type="char" varname="e56" etype="UI8"/>
+ <parameter type="char" varname="e55" etype="UI8"/>
+ <parameter type="char" varname="e54" etype="UI8"/>
+ <parameter type="char" varname="e53" etype="UI8"/>
+ <parameter type="char" varname="e52" etype="UI8"/>
+ <parameter type="char" varname="e51" etype="UI8"/>
+ <parameter type="char" varname="e50" etype="UI8"/>
+ <parameter type="char" varname="e49" etype="UI8"/>
+ <parameter type="char" varname="e48" etype="UI8"/>
+ <parameter type="char" varname="e47" etype="UI8"/>
+ <parameter type="char" varname="e46" etype="UI8"/>
+ <parameter type="char" varname="e45" etype="UI8"/>
+ <parameter type="char" varname="e44" etype="UI8"/>
+ <parameter type="char" varname="e43" etype="UI8"/>
+ <parameter type="char" varname="e42" etype="UI8"/>
+ <parameter type="char" varname="e41" etype="UI8"/>
+ <parameter type="char" varname="e40" etype="UI8"/>
+ <parameter type="char" varname="e39" etype="UI8"/>
+ <parameter type="char" varname="e38" etype="UI8"/>
+ <parameter type="char" varname="e37" etype="UI8"/>
+ <parameter type="char" varname="e36" etype="UI8"/>
+ <parameter type="char" varname="e35" etype="UI8"/>
+ <parameter type="char" varname="e34" etype="UI8"/>
+ <parameter type="char" varname="e33" etype="UI8"/>
+ <parameter type="char" varname="e32" etype="UI8"/>
+ <parameter type="char" varname="e31" etype="UI8"/>
+ <parameter type="char" varname="e30" etype="UI8"/>
+ <parameter type="char" varname="e29" etype="UI8"/>
+ <parameter type="char" varname="e28" etype="UI8"/>
+ <parameter type="char" varname="e27" etype="UI8"/>
+ <parameter type="char" varname="e26" etype="UI8"/>
+ <parameter type="char" varname="e25" etype="UI8"/>
+ <parameter type="char" varname="e24" etype="UI8"/>
+ <parameter type="char" varname="e23" etype="UI8"/>
+ <parameter type="char" varname="e22" etype="UI8"/>
+ <parameter type="char" varname="e21" etype="UI8"/>
+ <parameter type="char" varname="e20" etype="UI8"/>
+ <parameter type="char" varname="e19" etype="UI8"/>
+ <parameter type="char" varname="e18" etype="UI8"/>
+ <parameter type="char" varname="e17" etype="UI8"/>
+ <parameter type="char" varname="e16" etype="UI8"/>
+ <parameter type="char" varname="e15" etype="UI8"/>
+ <parameter type="char" varname="e14" etype="UI8"/>
+ <parameter type="char" varname="e13" etype="UI8"/>
+ <parameter type="char" varname="e12" etype="UI8"/>
+ <parameter type="char" varname="e11" etype="UI8"/>
+ <parameter type="char" varname="e10" etype="UI8"/>
+ <parameter type="char" varname="e9" etype="UI8"/>
+ <parameter type="char" varname="e8" etype="UI8"/>
+ <parameter type="char" varname="e7" etype="UI8"/>
+ <parameter type="char" varname="e6" etype="UI8"/>
+ <parameter type="char" varname="e5" etype="UI8"/>
+ <parameter type="char" varname="e4" etype="UI8"/>
+ <parameter type="char" varname="e3" etype="UI8"/>
+ <parameter type="char" varname="e2" etype="UI8"/>
+ <parameter type="char" varname="e1" etype="UI8"/>
+ <parameter type="char" varname="e0" etype="UI8"/>
+ <description>Set packed 8-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[7:0] := e0
+dst[15:8] := e1
+dst[23:16] := e2
+dst[31:24] := e3
+dst[39:32] := e4
+dst[47:40] := e5
+dst[55:48] := e6
+dst[63:56] := e7
+dst[71:64] := e8
+dst[79:72] := e9
+dst[87:80] := e10
+dst[95:88] := e11
+dst[103:96] := e12
+dst[111:104] := e13
+dst[119:112] := e14
+dst[127:120] := e15
+dst[135:128] := e16
+dst[143:136] := e17
+dst[151:144] := e18
+dst[159:152] := e19
+dst[167:160] := e20
+dst[175:168] := e21
+dst[183:176] := e22
+dst[191:184] := e23
+dst[199:192] := e24
+dst[207:200] := e25
+dst[215:208] := e26
+dst[223:216] := e27
+dst[231:224] := e28
+dst[239:232] := e29
+dst[247:240] := e30
+dst[255:248] := e31
+dst[263:256] := e32
+dst[271:264] := e33
+dst[279:272] := e34
+dst[287:280] := e35
+dst[295:288] := e36
+dst[303:296] := e37
+dst[311:304] := e38
+dst[319:312] := e39
+dst[327:320] := e40
+dst[335:328] := e41
+dst[343:336] := e42
+dst[351:344] := e43
+dst[359:352] := e44
+dst[367:360] := e45
+dst[375:368] := e46
+dst[383:376] := e47
+dst[391:384] := e48
+dst[399:392] := e49
+dst[407:400] := e50
+dst[415:408] := e51
+dst[423:416] := e52
+dst[431:424] := e53
+dst[439:432] := e54
+dst[447:440] := e55
+dst[455:448] := e56
+dst[463:456] := e57
+dst[471:464] := e58
+dst[479:472] := e59
+dst[487:480] := e60
+dst[495:488] := e61
+dst[503:496] := e62
+dst[511:504] := e63
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="short" varname="e31" etype="UI16"/>
+ <parameter type="short" varname="e30" etype="UI16"/>
+ <parameter type="short" varname="e29" etype="UI16"/>
+ <parameter type="short" varname="e28" etype="UI16"/>
+ <parameter type="short" varname="e27" etype="UI16"/>
+ <parameter type="short" varname="e26" etype="UI16"/>
+ <parameter type="short" varname="e25" etype="UI16"/>
+ <parameter type="short" varname="e24" etype="UI16"/>
+ <parameter type="short" varname="e23" etype="UI16"/>
+ <parameter type="short" varname="e22" etype="UI16"/>
+ <parameter type="short" varname="e21" etype="UI16"/>
+ <parameter type="short" varname="e20" etype="UI16"/>
+ <parameter type="short" varname="e19" etype="UI16"/>
+ <parameter type="short" varname="e18" etype="UI16"/>
+ <parameter type="short" varname="e17" etype="UI16"/>
+ <parameter type="short" varname="e16" etype="UI16"/>
+ <parameter type="short" varname="e15" etype="UI16"/>
+ <parameter type="short" varname="e14" etype="UI16"/>
+ <parameter type="short" varname="e13" etype="UI16"/>
+ <parameter type="short" varname="e12" etype="UI16"/>
+ <parameter type="short" varname="e11" etype="UI16"/>
+ <parameter type="short" varname="e10" etype="UI16"/>
+ <parameter type="short" varname="e9" etype="UI16"/>
+ <parameter type="short" varname="e8" etype="UI16"/>
+ <parameter type="short" varname="e7" etype="UI16"/>
+ <parameter type="short" varname="e6" etype="UI16"/>
+ <parameter type="short" varname="e5" etype="UI16"/>
+ <parameter type="short" varname="e4" etype="UI16"/>
+ <parameter type="short" varname="e3" etype="UI16"/>
+ <parameter type="short" varname="e2" etype="UI16"/>
+ <parameter type="short" varname="e1" etype="UI16"/>
+ <parameter type="short" varname="e0" etype="UI16"/>
+ <description>Set packed 16-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[15:0] := e0
+dst[31:16] := e1
+dst[47:32] := e2
+dst[63:48] := e3
+dst[79:64] := e4
+dst[95:80] := e5
+dst[111:96] := e6
+dst[127:112] := e7
+dst[143:128] := e8
+dst[159:144] := e9
+dst[175:160] := e10
+dst[191:176] := e11
+dst[207:192] := e12
+dst[223:208] := e13
+dst[239:224] := e14
+dst[255:240] := e15
+dst[271:256] := e16
+dst[287:272] := e17
+dst[303:288] := e18
+dst[319:304] := e19
+dst[335:320] := e20
+dst[351:336] := e21
+dst[367:352] := e22
+dst[383:368] := e23
+dst[399:384] := e24
+dst[415:400] := e25
+dst[431:416] := e26
+dst[447:432] := e27
+dst[463:448] := e28
+dst[479:464] := e29
+dst[495:480] := e30
+dst[511:496] := e31
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="e15" etype="UI32"/>
+ <parameter type="int" varname="e14" etype="UI32"/>
+ <parameter type="int" varname="e13" etype="UI32"/>
+ <parameter type="int" varname="e12" etype="UI32"/>
+ <parameter type="int" varname="e11" etype="UI32"/>
+ <parameter type="int" varname="e10" etype="UI32"/>
+ <parameter type="int" varname="e9" etype="UI32"/>
+ <parameter type="int" varname="e8" etype="UI32"/>
+ <parameter type="int" varname="e7" etype="UI32"/>
+ <parameter type="int" varname="e6" etype="UI32"/>
+ <parameter type="int" varname="e5" etype="UI32"/>
+ <parameter type="int" varname="e4" etype="UI32"/>
+ <parameter type="int" varname="e3" etype="UI32"/>
+ <parameter type="int" varname="e2" etype="UI32"/>
+ <parameter type="int" varname="e1" etype="UI32"/>
+ <parameter type="int" varname="e0" etype="UI32"/>
+ <description>Set packed 32-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+dst[159:128] := e4
+dst[191:160] := e5
+dst[223:192] := e6
+dst[255:224] := e7
+dst[287:256] := e8
+dst[319:288] := e9
+dst[351:320] := e10
+dst[383:352] := e11
+dst[415:384] := e12
+dst[447:416] := e13
+dst[479:448] := e14
+dst[511:480] := e15
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="e7" etype="UI64"/>
+ <parameter type="__int64" varname="e6" etype="UI64"/>
+ <parameter type="__int64" varname="e5" etype="UI64"/>
+ <parameter type="__int64" varname="e4" etype="UI64"/>
+ <parameter type="__int64" varname="e3" etype="UI64"/>
+ <parameter type="__int64" varname="e2" etype="UI64"/>
+ <parameter type="__int64" varname="e1" etype="UI64"/>
+ <parameter type="__int64" varname="e0" etype="UI64"/>
+ <description>Set packed 64-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[63:0] := e0
+dst[127:64] := e1
+dst[191:128] := e2
+dst[255:192] := e3
+dst[319:256] := e4
+dst[383:320] := e5
+dst[447:384] := e6
+dst[511:448] := e7
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="e7" etype="FP64"/>
+ <parameter type="double" varname="e6" etype="FP64"/>
+ <parameter type="double" varname="e5" etype="FP64"/>
+ <parameter type="double" varname="e4" etype="FP64"/>
+ <parameter type="double" varname="e3" etype="FP64"/>
+ <parameter type="double" varname="e2" etype="FP64"/>
+ <parameter type="double" varname="e1" etype="FP64"/>
+ <parameter type="double" varname="e0" etype="FP64"/>
+ <description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.</description>
+ <operation>
+dst[63:0] := e0
+dst[127:64] := e1
+dst[191:128] := e2
+dst[255:192] := e3
+dst[319:256] := e4
+dst[383:320] := e5
+dst[447:384] := e6
+dst[511:448] := e7
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_set_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="e15" etype="FP32"/>
+ <parameter type="float" varname="e14" etype="FP32"/>
+ <parameter type="float" varname="e13" etype="FP32"/>
+ <parameter type="float" varname="e12" etype="FP32"/>
+ <parameter type="float" varname="e11" etype="FP32"/>
+ <parameter type="float" varname="e10" etype="FP32"/>
+ <parameter type="float" varname="e9" etype="FP32"/>
+ <parameter type="float" varname="e8" etype="FP32"/>
+ <parameter type="float" varname="e7" etype="FP32"/>
+ <parameter type="float" varname="e6" etype="FP32"/>
+ <parameter type="float" varname="e5" etype="FP32"/>
+ <parameter type="float" varname="e4" etype="FP32"/>
+ <parameter type="float" varname="e3" etype="FP32"/>
+ <parameter type="float" varname="e2" etype="FP32"/>
+ <parameter type="float" varname="e1" etype="FP32"/>
+ <parameter type="float" varname="e0" etype="FP32"/>
+ <description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values.</description>
+ <operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+dst[159:128] := e4
+dst[191:160] := e5
+dst[223:192] := e6
+dst[255:224] := e7
+dst[287:256] := e8
+dst[319:288] := e9
+dst[351:320] := e10
+dst[383:352] := e11
+dst[415:384] := e12
+dst[447:416] := e13
+dst[479:448] := e14
+dst[511:480] := e15
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
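+<!-- The set forms list elements from highest to lowest, so the first argument
+     ends up in the top lane. A sketch (AVX512F assumed):
+
+     #include <immintrin.h>
+
+     __m512i v = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+     /* dst[63:0] = 0, ..., dst[511:448] = 7, i.e. lane j holds j */
+     -->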
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_setr4_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="d" etype="UI32"/>
+ <parameter type="int" varname="c" etype="UI32"/>
+ <parameter type="int" varname="b" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+	<description>Set packed 32-bit integers in "dst" with the repeated 4-element sequence in reverse order.</description>
+ <operation>
+dst[31:0] := d
+dst[63:32] := c
+dst[95:64] := b
+dst[127:96] := a
+dst[159:128] := d
+dst[191:160] := c
+dst[223:192] := b
+dst[255:224] := a
+dst[287:256] := d
+dst[319:288] := c
+dst[351:320] := b
+dst[383:352] := a
+dst[415:384] := d
+dst[447:416] := c
+dst[479:448] := b
+dst[511:480] := a
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_setr4_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="d" etype="UI64"/>
+ <parameter type="__int64" varname="c" etype="UI64"/>
+ <parameter type="__int64" varname="b" etype="UI64"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+	<description>Set packed 64-bit integers in "dst" with the repeated 4-element sequence in reverse order.</description>
+ <operation>
+dst[63:0] := d
+dst[127:64] := c
+dst[191:128] := b
+dst[255:192] := a
+dst[319:256] := d
+dst[383:320] := c
+dst[447:384] := b
+dst[511:448] := a
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_setr4_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="d" etype="FP64"/>
+ <parameter type="double" varname="c" etype="FP64"/>
+ <parameter type="double" varname="b" etype="FP64"/>
+ <parameter type="double" varname="a" etype="FP64"/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4-element sequence in reverse order.</description>
+ <operation>
+dst[63:0] := d
+dst[127:64] := c
+dst[191:128] := b
+dst[255:192] := a
+dst[319:256] := d
+dst[383:320] := c
+dst[447:384] := b
+dst[511:448] := a
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_setr4_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="d" etype="FP32"/>
+ <parameter type="float" varname="c" etype="FP32"/>
+ <parameter type="float" varname="b" etype="FP32"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4-element sequence in reverse order.</description>
+ <operation>
+dst[31:0] := d
+dst[63:32] := c
+dst[95:64] := b
+dst[127:96] := a
+dst[159:128] := d
+dst[191:160] := c
+dst[223:192] := b
+dst[255:224] := a
+dst[287:256] := d
+dst[319:288] := c
+dst[351:320] := b
+dst[383:352] := a
+dst[415:384] := d
+dst[447:416] := c
+dst[479:448] := b
+dst[511:480] := a
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
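+<!-- setr4 is set4 with the pattern laid out in argument order from the lowest
+     lane upward. A sketch (AVX512F assumed):
+
+     #include <immintrin.h>
+
+     __m512i v = _mm512_setr4_epi32(1, 2, 3, 4);
+     /* lane 0 = 1, lane 1 = 2, lane 2 = 3, lane 3 = 4; pattern repeats 4x */
+     -->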
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_setr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="e15" etype="UI32"/>
+ <parameter type="int" varname="e14" etype="UI32"/>
+ <parameter type="int" varname="e13" etype="UI32"/>
+ <parameter type="int" varname="e12" etype="UI32"/>
+ <parameter type="int" varname="e11" etype="UI32"/>
+ <parameter type="int" varname="e10" etype="UI32"/>
+ <parameter type="int" varname="e9" etype="UI32"/>
+ <parameter type="int" varname="e8" etype="UI32"/>
+ <parameter type="int" varname="e7" etype="UI32"/>
+ <parameter type="int" varname="e6" etype="UI32"/>
+ <parameter type="int" varname="e5" etype="UI32"/>
+ <parameter type="int" varname="e4" etype="UI32"/>
+ <parameter type="int" varname="e3" etype="UI32"/>
+ <parameter type="int" varname="e2" etype="UI32"/>
+ <parameter type="int" varname="e1" etype="UI32"/>
+ <parameter type="int" varname="e0" etype="UI32"/>
+ <description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[31:0] := e15
+dst[63:32] := e14
+dst[95:64] := e13
+dst[127:96] := e12
+dst[159:128] := e11
+dst[191:160] := e10
+dst[223:192] := e9
+dst[255:224] := e8
+dst[287:256] := e7
+dst[319:288] := e6
+dst[351:320] := e5
+dst[383:352] := e4
+dst[415:384] := e3
+dst[447:416] := e2
+dst[479:448] := e1
+dst[511:480] := e0
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_setr_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="e7" etype="UI64"/>
+ <parameter type="__int64" varname="e6" etype="UI64"/>
+ <parameter type="__int64" varname="e5" etype="UI64"/>
+ <parameter type="__int64" varname="e4" etype="UI64"/>
+ <parameter type="__int64" varname="e3" etype="UI64"/>
+ <parameter type="__int64" varname="e2" etype="UI64"/>
+ <parameter type="__int64" varname="e1" etype="UI64"/>
+ <parameter type="__int64" varname="e0" etype="UI64"/>
+ <description>Set packed 64-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[63:0] := e7
+dst[127:64] := e6
+dst[191:128] := e5
+dst[255:192] := e4
+dst[319:256] := e3
+dst[383:320] := e2
+dst[447:384] := e1
+dst[511:448] := e0
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_setr_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="e7" etype="FP64"/>
+ <parameter type="double" varname="e6" etype="FP64"/>
+ <parameter type="double" varname="e5" etype="FP64"/>
+ <parameter type="double" varname="e4" etype="FP64"/>
+ <parameter type="double" varname="e3" etype="FP64"/>
+ <parameter type="double" varname="e2" etype="FP64"/>
+ <parameter type="double" varname="e1" etype="FP64"/>
+ <parameter type="double" varname="e0" etype="FP64"/>
+ <description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[63:0] := e7
+dst[127:64] := e6
+dst[191:128] := e5
+dst[255:192] := e4
+dst[319:256] := e3
+dst[383:320] := e2
+dst[447:384] := e1
+dst[511:448] := e0
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_setr_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="e15" etype="FP32"/>
+ <parameter type="float" varname="e14" etype="FP32"/>
+ <parameter type="float" varname="e13" etype="FP32"/>
+ <parameter type="float" varname="e12" etype="FP32"/>
+ <parameter type="float" varname="e11" etype="FP32"/>
+ <parameter type="float" varname="e10" etype="FP32"/>
+ <parameter type="float" varname="e9" etype="FP32"/>
+ <parameter type="float" varname="e8" etype="FP32"/>
+ <parameter type="float" varname="e7" etype="FP32"/>
+ <parameter type="float" varname="e6" etype="FP32"/>
+ <parameter type="float" varname="e5" etype="FP32"/>
+ <parameter type="float" varname="e4" etype="FP32"/>
+ <parameter type="float" varname="e3" etype="FP32"/>
+ <parameter type="float" varname="e2" etype="FP32"/>
+ <parameter type="float" varname="e1" etype="FP32"/>
+ <parameter type="float" varname="e0" etype="FP32"/>
+ <description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[31:0] := e15
+dst[63:32] := e14
+dst[95:64] := e13
+dst[127:96] := e12
+dst[159:128] := e11
+dst[191:160] := e10
+dst[223:192] := e9
+dst[255:224] := e8
+dst[287:256] := e7
+dst[319:288] := e6
+dst[351:320] := e5
+dst[383:352] := e4
+dst[415:384] := e3
+dst[447:416] := e2
+dst[479:448] := e1
+dst[511:480] := e0
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
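+<!-- set and setr are mirror images: setr consumes its arguments from the
+     lowest lane upward. Both calls below build the identity pattern
+     lane j = j (a sketch, AVX512F assumed):
+
+     #include <immintrin.h>
+
+     __m512i a = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8,
+                                   7,  6,  5,  4,  3,  2, 1, 0);
+     __m512i b = _mm512_setr_epi32(0, 1,  2,  3,  4,  5,  6,  7,
+                                   8, 9, 10, 11, 12, 13, 14, 15);
+     /* a and b hold identical contents */
+     -->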
+<intrinsic tech="AVX-512" name="_mm512_setzero">
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m512 with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="VPXORQ" form="zmm, zmm, zmm" xed="VPXORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_setzero_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <description>Return vector of type __m512i with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="VPXORQ" form="zmm, zmm, zmm" xed="VPXORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_setzero_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <description>Return vector of type __m512d with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="VPXORQ" form="zmm, zmm, zmm" xed="VPXORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_setzero_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <description>Return vector of type __m512 with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="VPXORQ" form="zmm, zmm, zmm" xed="VPXORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_setzero_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Set</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <description>Return vector of type __m512i with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="VPXORQ" form="zmm, zmm, zmm" xed="VPXORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
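+<!-- All the setzero variants above produce the same 512 bits of zero and map
+     to the same register-zeroing idiom (VPXORQ of a register with itself).
+     A sketch (AVX512F assumed):
+
+     #include <immintrin.h>
+
+     __m512i zi = _mm512_setzero_si512();  /* integer view                    */
+     __m512d zd = _mm512_setzero_pd();     /* FP64 view of the same zero bits */
+     -->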
+<intrinsic tech="AVX-512" name="_mm512_undefined">
+ <CPUID>AVX512F</CPUID>
+ <category>General Support</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m512 with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_undefined_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>General Support</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <description>Return vector of type __m512i with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_undefined_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>General Support</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <description>Return vector of type __m512d with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_undefined_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>General Support</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <description>Return vector of type __m512 with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
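+<!-- The undefined forms return a vector whose contents must not be read; they
+     are useful as a starting value that will be fully overwritten, avoiding a
+     needless zeroing. A sketch (AVX512F assumed; part0 and part1 are
+     hypothetical __m128 values):
+
+     #include <immintrin.h>
+
+     __m512 v = _mm512_undefined_ps();     /* unspecified contents, never read */
+     v = _mm512_insertf32x4(v, part0, 0);  /* overwrite lanes 0..3             */
+     v = _mm512_insertf32x4(v, part1, 1);  /* overwrite lanes 4..7, and so on  */
+     -->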
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_acos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ACOS(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_acos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ACOS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_acos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ACOS(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_acos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ACOS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
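+<!-- The SVML entries, beginning with the acos forms above, are short-vector
+     math library routines: they expand to call sequences rather than single
+     instructions and require a compiler that supplies SVML (e.g. the Intel
+     compilers). A masked usage sketch (names illustrative):
+
+     #include <immintrin.h>
+
+     __m512 x    = _mm512_set1_ps(0.5f);
+     __mmask16 k = 0x00FF;                       /* compute lanes 0..7 only     */
+     __m512 r    = _mm512_mask_acos_ps(x, k, x); /* lanes 8..15 copied from src */
+     /* active lanes hold acos(0.5) ~= 1.0472 */
+     -->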
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_acosh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ACOSH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_acosh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ACOSH(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_acosh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ACOSH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_acosh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ACOSH(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_asin_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ASIN(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_asin_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ASIN(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_asin_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ASIN(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_asin_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ASIN(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_asinh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ASINH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_asinh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ASINH(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_asinh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ASINH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_asinh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ASINH(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_atan2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_atan2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_atan2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_atan2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
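+<!-- atan2 takes its operands in (y, x) order, like the C library's atan2: the
+     first argument is the numerator, and the signs of both resolve the
+     quadrant. A sketch (SVML assumed):
+
+     #include <immintrin.h>
+
+     __m512d y = _mm512_set1_pd(1.0);
+     __m512d x = _mm512_set1_pd(-1.0);
+     __m512d t = _mm512_atan2_pd(y, x);  /* each lane = atan2(1, -1) = 3*pi/4 */
+     -->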
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_atan_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ATAN(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_atan_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ATAN(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_atan_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ATAN(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_atan_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ATAN(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_atanh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ATANH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_atanh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ATANH(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_atanh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ATANH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_atanh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ATANH(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cbrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := CubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cbrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := CubeRoot(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cbrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := CubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cbrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := CubeRoot(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
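+<!-- Unlike pow(a, 1.0/3.0), the cube root is well defined for negative
+     inputs. A sketch (SVML assumed):
+
+     #include <immintrin.h>
+
+     __m512d a = _mm512_set1_pd(-27.0);
+     __m512d r = _mm512_cbrt_pd(a);  /* each lane = -3.0 */
+     -->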
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cdfnorm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := CDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cdfnorm_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := CDFNormal(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cdfnorm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := CDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cdfnorm_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := CDFNormal(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cdfnorminv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := InverseCDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cdfnorminv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := InverseCDFNormal(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cdfnorminv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := InverseCDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cdfnorminv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := InverseCDFNormal(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
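+<!-- Usage sketch for the cdfnorminv entries above: InverseCDFNormal is the probit
+     function, so cdfnorminv(0.975) is about 1.96, the familiar 95% z-value.
+     Same SVML-compiler assumption as the sketch after the cdfnorm entries.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512 p = _mm512_set1_ps(0.975f);
+    __m512 z = _mm512_cdfnorminv_ps(p);     /* about 1.959964 per lane */
+    printf("%f\n", _mm512_cvtss_f32(z));
+    return 0;
+}
+-->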
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_ceil_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_ceil_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := CEIL(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_ceil_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_ceil_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := CEIL(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
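+<!-- Usage sketch for the ceil entries above, also showing the writemask semantics
+     shared by every _mask_ entry in this file: lanes whose mask bit is clear are
+     copied from "src". Same SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d src = _mm512_set1_pd(-1.0);
+    __m512d a   = _mm512_set1_pd(2.5);
+    __m512d r   = _mm512_mask_ceil_pd(src, (__mmask8)0x0F, a);
+    double out[8];
+    _mm512_storeu_pd(out, r);
+    printf("%f %f\n", out[0], out[7]);      /* 3.000000 -1.000000 */
+    return 0;
+}
+-->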
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := COS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := COS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
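+<!-- Usage sketch for the cos entries above (arguments in radians). Same
+     SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d pi = _mm512_set1_pd(3.14159265358979323846);
+    printf("%f\n", _mm512_cvtsd_f64(_mm512_cos_pd(pi)));  /* about -1.0 */
+    return 0;
+}
+-->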
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cosd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := COSD(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cosd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := COSD(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cosd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := COSD(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cosd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := COSD(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
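+<!-- Usage sketch for the cosd entries above: identical shape to cos, but the
+     argument is in degrees, so no pi scaling is needed. Same SVML-compiler
+     assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512 deg = _mm512_set1_ps(60.0f);
+    printf("%f\n", _mm512_cvtss_f32(_mm512_cosd_ps(deg)));  /* cosd(60) = 0.5 */
+    return 0;
+}
+-->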
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cosh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := COSH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cosh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+	<description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := COSH(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_cosh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := COSH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_cosh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+	<description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := COSH(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
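+<!-- Usage sketch for the cosh entries above. Same SVML-compiler assumption as
+     earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d r = _mm512_cosh_pd(_mm512_set1_pd(0.0));  /* cosh(0) = 1.0 */
+    printf("%f\n", _mm512_cvtsd_f64(r));
+    return 0;
+}
+-->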
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_erf_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ERF(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_erf_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ERF(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_erfc_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := 1.0 - ERF(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_erfc_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := 1.0 - ERF(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_erf_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ERF(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_erf_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ERF(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_erfc_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_erfc_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+		dst[i+31:i] := 1.0 - ERF(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
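+<!-- Usage sketch for the erf/erfc entries above, checking the identity
+     erf(x) + erfc(x) = 1 in every lane. Same SVML-compiler assumption as
+     earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d x = _mm512_set1_pd(0.5);
+    __m512d s = _mm512_add_pd(_mm512_erf_pd(x), _mm512_erfc_pd(x));
+    printf("%f\n", _mm512_cvtsd_f64(s));    /* prints 1.000000 */
+    return 0;
+}
+-->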
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_erfinv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+	dst[i+63:i] := InverseERF(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_erfinv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := InverseERF(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_erfinv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+	dst[i+31:i] := InverseERF(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_erfinv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+		dst[i+31:i] := InverseERF(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
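+<!-- Usage sketch for the erfinv entries above: InverseERF is the function inverse
+     of ERF, so erfinv(erf(x)) recovers x. Same SVML-compiler assumption as
+     earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d x = _mm512_set1_pd(0.5);
+    __m512d y = _mm512_erfinv_pd(_mm512_erf_pd(x));
+    printf("%f\n", _mm512_cvtsd_f64(y));    /* about 0.500000 */
+    return 0;
+}
+-->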
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_erfcinv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+	dst[i+63:i] := InverseERFC(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_erfcinv_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := InverseERFC(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_erfcinv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+	dst[i+31:i] := InverseERFC(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_erfcinv_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+		dst[i+31:i] := InverseERFC(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
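+<!-- Usage sketch for the erfcinv entries above: since erfc(0) = 1, erfcinv(1.0)
+     is 0.0 in every lane. Same SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512 r = _mm512_erfcinv_ps(_mm512_set1_ps(1.0f));
+    printf("%f\n", _mm512_cvtss_f32(r));    /* prints 0.000000 */
+    return 0;
+}
+-->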
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_exp10_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := POW(10.0, a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_exp10_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(10.0, a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_exp10_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := POW(FP32(10.0), a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_exp10_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(FP32(10.0), a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
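+<!-- Usage sketch for the exp10 entries above. Same SVML-compiler assumption as
+     earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d r = _mm512_exp10_pd(_mm512_set1_pd(3.0));  /* 10^3 = 1000.0 */
+    printf("%f\n", _mm512_cvtsd_f64(r));
+    return 0;
+}
+-->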
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_exp2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_exp2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_exp2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_exp2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
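+<!-- Usage sketch for the exp2 entries above. Same SVML-compiler assumption as
+     earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512 r = _mm512_exp2_ps(_mm512_set1_ps(10.0f));  /* 2^10 = 1024.0 */
+    printf("%f\n", _mm512_cvtss_f32(r));
+    return 0;
+}
+-->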
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_exp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := POW(e, a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_exp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(e, a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_exp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := POW(FP32(e), a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_exp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(FP32(e), a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
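+<!-- Usage sketch for the exp entries above ("e" is Euler's number). Same
+     SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d r = _mm512_exp_pd(_mm512_set1_pd(1.0));  /* e^1 = 2.718282... */
+    printf("%f\n", _mm512_cvtsd_f64(r));
+    return 0;
+}
+-->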
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_expm1_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := POW(e, a[i+63:i]) - 1.0
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_expm1_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(e, a[i+63:i]) - 1.0
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_expm1_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_expm1_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
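+<!-- Usage sketch for the expm1 entries above: expm1 exists because computing
+     exp(x) and then subtracting 1.0 cancels catastrophically for tiny x, while
+     expm1 stays accurate. Same SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d x = _mm512_set1_pd(1e-12);
+    printf("%.15e\n", _mm512_cvtsd_f64(_mm512_expm1_pd(x)));  /* about 1e-12 */
+    return 0;
+}
+-->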
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_floor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_floor_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FLOOR(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_floor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_floor_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FLOOR(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
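+<!-- Usage sketch for the floor entries above: FLOOR rounds toward negative
+     infinity, so floor(-1.5) is -2.0, not -1.0. Same SVML-compiler assumption
+     as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d r = _mm512_floor_pd(_mm512_set1_pd(-1.5));
+    printf("%f\n", _mm512_cvtsd_f64(r));    /* prints -2.000000 */
+    return 0;
+}
+-->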
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_hypot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0))
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_hypot_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0))
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_hypot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0))
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_hypot_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
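+<!-- Usage sketch for the hypot entries above, using the 3-4-5 triangle. Same
+     SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d a = _mm512_set1_pd(3.0);
+    __m512d b = _mm512_set1_pd(4.0);
+    printf("%f\n", _mm512_cvtsd_f64(_mm512_hypot_pd(a, b)));  /* 5.000000 */
+    return 0;
+}
+-->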
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_div_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF b[i+31:i] == 0
+ #DE
+ FI
+ dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_div_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ IF b[i+31:i] == 0
+ #DE
+ FI
+ dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_div_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="SI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := 8*j
+ IF b[i+7:i] == 0
+ #DE
+ FI
+ dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_div_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ IF b[i+15:i] == 0
+ #DE
+ FI
+ dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_div_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <parameter type="__m512i" varname="b" etype="SI64"/>
+ <description>Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ IF b[i+63:i] == 0
+ #DE
+ FI
+ dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
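+<!-- Usage sketch for the div entries above: the quotient is truncated toward
+     zero, and a zero divisor raises #DE (a hardware divide error) rather than
+     producing a value. Same SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512i q = _mm512_div_epi32(_mm512_set1_epi32(-7), _mm512_set1_epi32(2));
+    int out[16];
+    _mm512_storeu_si512(out, q);
+    printf("%d\n", out[0]);                 /* -3, truncated toward zero */
+    return 0;
+}
+-->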
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_invsqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := InvSQRT(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_invsqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := InvSQRT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_invsqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := InvSQRT(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_invsqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := InvSQRT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
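+<!-- Usage sketch for the invsqrt entries above: a full-precision 1/sqrt(x), as
+     opposed to the low-precision rsqrt14 approximation instructions. Same
+     SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512d r = _mm512_invsqrt_pd(_mm512_set1_pd(4.0));  /* 1/sqrt(4) = 0.5 */
+    printf("%f\n", _mm512_cvtsd_f64(r));
+    return 0;
+}
+-->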
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rem_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_rem_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rem_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 63
+ i := 8*j
+ dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rem_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 31
+ i := 16*j
+ dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rem_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 64*j
+ dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
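+<!-- Usage sketch for the rem entries above: the remainder takes the sign of the
+     dividend and pairs with the truncating div entries, so a = b*q + r holds.
+     Same SVML-compiler assumption as earlier sketches.
+
+#include <immintrin.h>
+#include <stdio.h>
+int main(void) {
+    __m512i r = _mm512_rem_epi32(_mm512_set1_epi32(-7), _mm512_set1_epi32(2));
+    int out[16];
+    _mm512_storeu_si512(out, r);
+    printf("%d\n", out[0]);                 /* -1, since -7 = 2*(-3) + (-1) */
+    return 0;
+}
+-->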
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_log10_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_log10_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0)
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_log10_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_log10_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0)
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_log1p_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := LOG(1.0 + a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_log1p_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LOG(1.0 + a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_log1p_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := LOG(1.0 + a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_log1p_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LOG(1.0 + a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
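+<!-- The LOG(1.0 + a) pseudocode above is only the mathematical definition: a
+     dedicated log1p is provided because forming 1.0 + a first rounds tiny
+     values away. A sketch, assuming an SVML-capable compiler:
+
+       #include <immintrin.h>
+       __m512d x  = _mm512_set1_pd(1e-18);
+       __m512d r1 = _mm512_log1p_pd(x);   /* ~1e-18, accurate */
+       __m512d r2 = _mm512_log_pd(_mm512_add_pd(_mm512_set1_pd(1.0), x));
+                                          /* 0.0: 1.0 + 1e-18 rounds to 1.0 */
+-->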
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_log2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_log2_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0)
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_log_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_log_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := LOG(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_log_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_log_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LOG(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
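+<!-- The log10/log2 pseudocode expresses the identity log_b(x) = ln(x)/ln(b);
+     the library routines compute each base directly. Sketch (SVML-capable
+     compiler assumed):
+
+       #include <immintrin.h>
+       __m512d x   = _mm512_set1_pd(8.0);
+       __m512d l2  = _mm512_log2_pd(x);    /* every lane == 3.0          */
+       __m512d l10 = _mm512_log10_pd(x);   /* every lane ~= 0.9030899870 */
+       __m512d ln  = _mm512_log_pd(x);     /* every lane ~= 2.0794415417 */
+-->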
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_logb_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(|x|))" for each element.</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_logb_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(|x|))" for each element.</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_logb_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(|x|))" for each element.</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_logb_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(|x|))" for each element.</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
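+<!-- logb extracts the unbiased binary exponent as a float, i.e.
+     floor(log2(|x|)). Sketch (SVML-capable compiler assumed):
+
+       #include <immintrin.h>
+       __m512d x = _mm512_set1_pd(48.0);   /* 48 = 1.5 * 2^5 */
+       __m512d e = _mm512_logb_pd(x);      /* every lane == 5.0 */
+-->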
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_nearbyint_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := NearbyInt(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_nearbyint_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := NearbyInt(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_nearbyint_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := NearbyInt(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_nearbyint_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := NearbyInt(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
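+<!-- NearbyInt is presumed to match the C nearbyint() family: round using the
+     current rounding mode (round-to-nearest-even by default) without raising
+     the inexact exception. Under that assumption:
+
+       #include <immintrin.h>
+       __m512d x = _mm512_set1_pd(2.5);
+       __m512d r = _mm512_nearbyint_pd(x);   /* 2.0 in the default mode (tie to even) */
+-->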
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_pow_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised to the power of the corresponding packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := POW(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_pow_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised to the power of the corresponding packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POW(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_pow_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised to the power of the corresponding packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := POW(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_pow_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised to the power of the corresponding packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POW(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
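+<!-- pow sketch, assuming an SVML-capable compiler and the usual libm special
+     cases for pow:
+
+       #include <immintrin.h>
+       __m512d a = _mm512_set1_pd(2.0);
+       __m512d b = _mm512_set1_pd(10.0);
+       __m512d r = _mm512_pow_pd(a, b);   /* every lane == 1024.0 */
+-->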
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_recip_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_recip_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (1.0 / a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_recip_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_recip_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
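+<!-- Per its pseudocode, recip computes a full-precision 1.0/a, unlike the
+     hardware estimate _mm512_rcp14_ps whose relative error may reach 2^-14.
+     Sketch (SVML-capable compiler assumed):
+
+       #include <immintrin.h>
+       __m512 x = _mm512_set1_ps(4.0f);
+       __m512 r = _mm512_recip_ps(x);   /* every lane == 0.25f */
+-->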
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rint_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, with ties rounded to even, and stores the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := RoundToNearestEven(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_rint_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, with ties rounded to even, and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := RoundToNearestEven(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rint_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, with ties rounded to even, and stores the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := RoundToNearestEven(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_rint_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, with ties rounded to even, and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := RoundToNearestEven(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_svml_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ROUND(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_svml_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ROUND(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_sin_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SIN(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_sin_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SIN(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_sin_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SIN(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_sin_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SIN(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_sinh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SINH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_sinh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SINH(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_sinh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SINH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_sinh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SINH(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_sind_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SIND(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_sind_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SIND(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_sind_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SIND(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_sind_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SIND(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
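+<!-- The *d trigonometric variants take degrees, saving an explicit
+     a * (pi/180) conversion and its rounding error. Sketch (SVML-capable
+     compiler assumed):
+
+       #include <immintrin.h>
+       __m512d deg = _mm512_set1_pd(90.0);
+       __m512d s   = _mm512_sind_pd(deg);   /* every lane == 1.0 */
+-->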
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_tan_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := TAN(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_tan_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := TAN(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_tan_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := TAN(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_tan_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := TAN(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_tand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := TAND(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_tand_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := TAND(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_tand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := TAND(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_tand_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := TAND(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_tanh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := TANH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_tanh_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := TANH(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_tanh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := TANH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_tanh_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := TANH(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_trunc_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := TRUNCATE(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_trunc_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := TRUNCATE(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_trunc_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := TRUNCATE(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_trunc_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := TRUNCATE(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_div_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF b[i+31:i] == 0
+ #DE
+ FI
+ dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_div_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ IF b[i+31:i] == 0
+ #DE
+ FI
+ dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_div_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := 8*j
+ IF b[i+7:i] == 0
+ #DE
+ FI
+ dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_div_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := 16*j
+ IF b[i+15:i] == 0
+ #DE
+ FI
+ dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_div_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 64*j
+ IF b[i+63:i] == 0
+ #DE
+ FI
+ dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
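+<!-- The div_epu* pseudocode makes the trap explicit: a zero divisor raises
+     #DE (divide error), as in scalar integer division, so zero lanes must be
+     guarded or masked out by the caller. Sketch (SVML-capable compiler):
+
+       #include <immintrin.h>
+       __m512i a = _mm512_set1_epi32(100);
+       __m512i b = _mm512_set1_epi32(7);     /* no zero lanes */
+       __m512i q = _mm512_div_epu32(a, b);   /* every 32-bit lane == 14 */
+-->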
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rem_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_rem_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := 32*j
+ IF k[j]
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rem_epu8">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 63
+ i := 8*j
+ dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rem_epu16">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 31
+ i := 16*j
+ dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_rem_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 64*j
+ dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kortestz">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0.</description>
+ <operation>dst[15:0] := k1[15:0] | k2[15:0]
+IF dst == 0
+ SetZF()
+FI
+ </operation>
+ <instruction name="KORTESTW" form="k, k" xed="KORTESTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_kortestc">
+ <type>Mask</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's.</description>
+ <operation>dst[15:0] := k1[15:0] | k2[15:0]
+IF PopCount(dst[15:0]) == 16
+ SetCF()
+FI
+ </operation>
+ <instruction name="KORTESTW" form="k, k" xed="KORTESTW_MASKmskw_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
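+<!-- kortestz/kortestc both compile to one KORTESTW over k1|k2 and report a
+     flag as an int. Taking the widely documented behavior (an assumption; the
+     operation above, read literally, leaves the result in ZF/CF): the z form
+     yields 1 when the OR is all zeros, the c form 1 when it is all ones.
+     Typical early-exit use, with v/needle/handle_match as hypothetical names:
+
+       #include <immintrin.h>
+       __mmask16 m = _mm512_cmpeq_epi32_mask(v, needle);
+       if (!_mm512_kortestz(m, m))   /* some lane matched */
+           handle_match(m);
+-->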
+<intrinsic tech="AVX-512" name="_mm512_mask2int">
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <description>Converts bit mask "k1" into an integer value, storing the result in "dst".</description>
+ <operation>
+dst := ZeroExtend32(k1)
+ </operation>
+ <instruction name="KMOVW" form="r32, k" xed="KMOVW_GPR32u32_MASKmskw_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_int2mask">
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="dst" etype="MASK"/>
+ <parameter type="int" varname="mask" etype="UI16"/>
+ <description>Converts integer "mask" into a bitmask, storing the result in "dst".</description>
+ <operation>
+dst := mask[15:0]
+ </operation>
+ <instruction name="KMOVW" form="k, r32" xed="KMOVW_MASKmskw_GPR32u32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
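+<!-- mask2int/int2mask are thin KMOVW wrappers for moving a mask through plain
+     integer code; on current compilers __mmask16 already converts to int, so
+     these mainly serve older sources (an assumption from their KMOVW mapping).
+     Sketch, with m a __mmask16 from some prior compare:
+
+       #include <immintrin.h>
+       int bits     = _mm512_mask2int(m);
+       __mmask16 m2 = _mm512_int2mask(bits | 1);   /* force lane 0 on */
+-->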
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_mullox_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Multiply the packed 64-bit integers in "a" and "b", and store the low 64 bits of each product in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="TRUE" name="_mm512_mask_mullox_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+  <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
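+<!-- Illustrative C sketch: _mm512_mullox_epi64 is a multi-instruction
+     sequence on plain AVX512F; with AVX512DQ available, _mm512_mullo_epi64
+     compiles to a single VPMULLQ instead. Names are examples only.
+#include <immintrin.h>
+
+static __m512i scale_u64(__m512i v, __m512i factors) {
+    return _mm512_mullox_epi64(v, factors);  /* low 64 bits of each product */
+}
+-->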
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_sincos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d *" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := SIN(a[i+63:i])
+ MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+cos_res[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_sincos_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d *" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="sin_src" etype="FP64"/>
+ <parameter type="__m512d" varname="cos_src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := SIN(a[i+63:i])
+ MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
+ ELSE
+ dst[i+63:i] := sin_src[i+63:i]
+ MEM[mem_addr+i+63:mem_addr+i] := cos_src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+cos_res[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_sincos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512 *" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := SIN(a[i+31:i])
+ MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+cos_res[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm512_mask_sincos_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512 *" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="sin_src" etype="FP32"/>
+ <parameter type="__m512" varname="cos_src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := SIN(a[i+31:i])
+ MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
+ ELSE
+ dst[i+31:i] := sin_src[i+31:i]
+ MEM[mem_addr+i+31:mem_addr+i] := cos_src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+cos_res[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
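+<!-- Illustrative C sketch of the two-result sincos interface (SVML, so it
+     assumes an SVML-providing compiler; names are examples only).
+#include <immintrin.h>
+
+static void sincos16(const float *in, float *sin_out, float *cos_out) {
+    __m512 a = _mm512_loadu_ps(in);
+    __m512 c;
+    __m512 s = _mm512_sincos_ps(&c, a);  /* sine returned, cosine via pointer */
+    _mm512_storeu_ps(sin_out, s);
+    _mm512_storeu_ps(cos_out, c);
+}
+-->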
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_cvtss_f32">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Copy the lower single-precision (32-bit) floating-point element of "a" to "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+ </operation>
+ <instruction name="VMOVSS" form="m32, xmm" xed="VMOVSS_MEMf32_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_cvtsd_f64">
+ <type>Floating Point</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Copy the lower double-precision (64-bit) floating-point element of "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="VMOVSD" form="m64, xmm" xed="VMOVSD_MEMq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_cvtsi512_si32">
+ <type>Integer</type>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Copy the lower 32-bit integer in "a" to "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+ </operation>
+ <instruction name="VMOVD" form="r32, xmm" xed="VMOVD_GPR32u32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
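+<!-- Illustrative C sketch (names are examples only): the three cvt*
+     intrinsics above are lane-0 extracts, costing at most one scalar move.
+#include <immintrin.h>
+
+static float  first_f32(__m512  v) { return _mm512_cvtss_f32(v); }
+static double first_f64(__m512d v) { return _mm512_cvtsd_f64(v); }
+static int    first_i32(__m512i v) { return _mm512_cvtsi512_si32(v); }
+-->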
+<intrinsic tech="AVX-512/KNC" name="_mm512_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPD" form="zmm, zmm, zmm" xed="VADDPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_add_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPD" form="zmm, zmm, zmm {er}" xed="VADDPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPD" form="zmm {k}, zmm, zmm" xed="VADDPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_add_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPD" form="zmm {k}, zmm, zmm {er}" xed="VADDPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPS" form="zmm, zmm, zmm" xed="VADDPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_add_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPS" form="zmm, zmm, zmm {er}" xed="VADDPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPS" form="zmm {k}, zmm, zmm" xed="VADDPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_add_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDPS" form="zmm {k}, zmm, zmm {er}" xed="VADDPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
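+<!-- Illustrative C sketch of writemasking combined with embedded rounding
+     (assumes an AVX-512 target; names are examples only). The rounding
+     argument must be a compile-time constant.
+#include <immintrin.h>
+
+static __m512d add_even_lanes_up(__m512d src, __m512d a, __m512d b) {
+    __mmask8 even = 0x55;  /* lanes 0, 2, 4, 6; odd lanes keep src */
+    return _mm512_mask_add_round_pd(src, even, a, b,
+                                    _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+-->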
+<intrinsic tech="AVX-512/KNC" name="_mm512_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst".</description>
+ <operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (32*imm8[3:0])
+dst[511:0] := temp[511:0]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VALIGND" form="zmm, zmm, zmm, imm8" xed="VALIGND_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_alignr_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Concatenate "a" and "b" into a 128-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (32*imm8[3:0])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := temp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VALIGND" form="zmm {k}, zmm, zmm, imm8" xed="VALIGND_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
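+<!-- Illustrative C sketch (names are examples only): valignd is a
+     lane-crossing shift-by-elements across two registers, handy for
+     stencil kernels that need a vector advanced by one element.
+#include <immintrin.h>
+
+static __m512i shift_in_one(__m512i hi, __m512i lo) {
+    return _mm512_alignr_epi32(hi, lo, 1);  /* drop lo[0], pull hi[0] in at the top */
+}
+-->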
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_blend_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBLENDMPD" form="zmm {k}, zmm, zmm" xed="VBLENDMPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_blend_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VBLENDMPS" form="zmm {k}, zmm, zmm" xed="VBLENDMPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
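+<!-- Illustrative C sketch (names are examples only): blend reads as
+     select(k, b, a), with set mask bits choosing from "b".
+#include <immintrin.h>
+
+static __m512 clamp_negatives_to_zero(__m512 v) {
+    __mmask16 neg = _mm512_cmplt_ps_mask(v, _mm512_setzero_ps());
+    return _mm512_mask_blend_ps(neg, v, _mm512_setzero_ps());
+}
+-->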
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmp_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmp_round_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm {sae}, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
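+<!-- Illustrative C sketch (names are examples only): the _CMP_* immediate
+     selects the relation; the O/U suffix fixes how NaN lanes compare and
+     S/Q whether quiet NaNs signal an exception.
+#include <immintrin.h>
+
+static __mmask8 lanes_below(__m512d a, __m512d b) {
+    return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);  /* ordered, quiet */
+}
+-->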
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpeq_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmple_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := (a[i+63:i] &lt;= b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmplt_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := (a[i+63:i] &lt; b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpneq_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpnle_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := (!(a[i+63:i] &lt;= b[i+63:i])) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpnlt_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k[j] := (!(a[i+63:i] &lt; b[i+63:i])) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpord_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpunord_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
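+<!-- Illustrative C sketch (names are examples only): cmpord/cmpunord are
+     the NaN filters; paired with a zeroing move they scrub NaN-tainted
+     lanes from a result.
+#include <immintrin.h>
+
+static __m512d zero_nan_lanes(__m512d a, __m512d b, __m512d result) {
+    __mmask8 ok = _mm512_cmpord_pd_mask(a, b);  /* 1 where neither input is NaN */
+    return _mm512_maskz_mov_pd(ok, result);     /* other lanes become +0.0 */
+}
+-->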
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmp_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmp_round_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm {sae}, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpeq_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmple_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := (a[i+63:i] &lt;= b[i+63:i]) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmplt_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := (a[i+63:i] &lt; b[i+63:i]) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpneq_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpnle_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := (!(a[i+63:i] &lt;= b[i+63:i])) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpnlt_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := (!(a[i+63:i] &lt; b[i+63:i])) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpord_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpunord_pd_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__mmask8" varname="k1" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k1[j]
+ k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:8] := 0
+ </operation>
+ <instruction name="VCMPPD" form="k {k}, zmm, zmm, imm8" xed="VCMPPD_MASKmskw_MASKmskw_ZMMf64_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
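+<!-- Illustrative C sketch (names are examples only): the zeromask variants
+     AND the incoming mask into the result, so range tests compose without
+     a separate mask-AND.
+#include <immintrin.h>
+
+static __mmask8 in_range(__m512d v, __m512d lo, __m512d hi) {
+    __mmask8 ge = _mm512_cmp_pd_mask(v, lo, _CMP_GE_OQ);
+    return _mm512_mask_cmp_pd_mask(ge, v, hi, _CMP_LE_OQ);
+}
+-->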
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmp_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmp_round_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm {sae}, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpeq_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmple_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := (a[i+31:i] &lt;= b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmplt_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := (a[i+31:i] &lt; b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpneq_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpnle_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := (!(a[i+31:i] &lt;= b[i+31:i])) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpnlt_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := (!(a[i+31:i] &lt; b[i+31:i])) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpord_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpunord_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
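+<!-- Illustrative C sketch (names are examples only). Note the NaN
+     asymmetry in the named forms, visible in the pseudocode above: cmpeq
+     reports 0 for NaN lanes, while cmpneq reports 1 for them.
+#include <immintrin.h>
+
+static __mmask16 lanes_differ(__m512 a, __m512 b) {
+    return _mm512_cmpneq_ps_mask(a, b);  /* NaN lanes count as differing */
+}
+-->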
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmp_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmp_round_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immtype="_CMP_"/>
+ <parameter type="const int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note]</description>
+ <operation>CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ
+26: OP := _CMP_NGT_UQ
+27: OP := _CMP_FALSE_OS
+28: OP := _CMP_NEQ_OS
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm {sae}, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpeq_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmple_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := (a[i+31:i] &lt;= b[i+31:i]) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmplt_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := (a[i+31:i] &lt; b[i+31:i]) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpneq_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpnle_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := (!(a[i+31:i] &lt;= b[i+31:i])) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpnlt_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := (!(a[i+31:i] &lt; b[i+31:i])) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpord_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpunord_ps_mask">
+ <type>Floating Point</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VCMPPS" form="k {k}, zmm, zmm, imm8" xed="VCMPPS_MASKmskw_MASKmskw_ZMMf32_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
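+<!-- Editor's note (not part of the Intel data): the VCMPPS mask intrinsics above differ
+     only in the predicate encoded in imm8, and the zeromask "k1" forms clear every lane
+     whose k1 bit is unset. A minimal C sketch of how that lets two tests chain into a
+     logical AND without a separate kand, assuming an AVX-512F target and <immintrin.h>;
+     in_range and its arguments are illustrative placeholders:
+
+     #include <immintrin.h>
+
+     __mmask16 in_range(__m512 v, __m512 lo, __m512 hi) {
+         // First test: v >= lo (not-less-than).
+         __mmask16 k = _mm512_cmpnlt_ps_mask(v, lo);
+         // Second test runs only where the first passed; other bits come out zero.
+         return _mm512_mask_cmple_ps_mask(k, v, hi);
+     }
+-->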
+<intrinsic tech="AVX-512/KNC" name="_mm512_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="zmm, zmm, zmm" xed="VFMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="zmm, zmm, zmm" xed="VFMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="zmm, zmm, zmm" xed="VFMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
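+<!-- Editor's note (not part of the Intel data): a minimal usage sketch for
+     _mm512_fmadd_pd, assuming an AVX-512F target. The point is that the multiply-add
+     is fused, so the intermediate product a*b is not rounded separately:
+
+     #include <immintrin.h>
+
+     // dst = a*x + y over eight doubles, with a single rounding at the end.
+     __m512d axpy8(__m512d a, __m512d x, __m512d y) {
+         return _mm512_fmadd_pd(a, x, y);
+     }
+-->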
+<intrinsic tech="AVX-512/KNC" name="_mm512_fmadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="zmm, zmm, zmm {er}" xed="VFMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="zmm, zmm, zmm {er}" xed="VFMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="zmm, zmm, zmm {er}" xed="VFMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="zmm {k}, zmm, zmm" xed="VFMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="zmm {k}, zmm, zmm" xed="VFMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="zmm {k}, zmm, zmm" xed="VFMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fmadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="zmm {k}, zmm, zmm" xed="VFMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="zmm {k}, zmm, zmm" xed="VFMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="zmm {k}, zmm, zmm" xed="VFMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fmadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD213PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMADD231PD" form="zmm {k}, zmm, zmm {er}" xed="VFMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
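+<!-- Editor's note (not part of the Intel data): _mm512_mask_fmadd_pd and
+     _mm512_mask3_fmadd_pd compute the same a*b + c; per the operations above they
+     differ only in which operand feeds the inactive lanes, which is why the mask sits
+     in a different parameter position. A sketch, assuming AVX-512F:
+
+     #include <immintrin.h>
+
+     void fmadd_mask_variants(__m512d a, __m512d b, __m512d c, __mmask8 k) {
+         // Inactive lanes keep their value from "a" (the first operand).
+         __m512d r1 = _mm512_mask_fmadd_pd(a, k, b, c);
+         // Inactive lanes keep their value from "c" (the addend).
+         __m512d r2 = _mm512_mask3_fmadd_pd(a, b, c, k);
+         (void)r1; (void)r2;
+     }
+
+     The mask3 form is the natural fit for a masked accumulator update (c += a*b). -->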
+<intrinsic tech="AVX-512/KNC" name="_mm512_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="zmm, zmm, zmm" xed="VFMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="zmm, zmm, zmm" xed="VFMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="zmm, zmm, zmm" xed="VFMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_fmadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="zmm, zmm, zmm {er}" xed="VFMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="zmm, zmm, zmm {er}" xed="VFMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="zmm, zmm, zmm {er}" xed="VFMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="zmm {k}, zmm, zmm" xed="VFMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="zmm {k}, zmm, zmm" xed="VFMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="zmm {k}, zmm, zmm" xed="VFMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fmadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="zmm {k}, zmm, zmm" xed="VFMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="zmm {k}, zmm, zmm" xed="VFMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="zmm {k}, zmm, zmm" xed="VFMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fmadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD213PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMADD231PS" form="zmm {k}, zmm, zmm {er}" xed="VFMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
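+<!-- Editor's note (not part of the Intel data): the _round_ variants take a
+     compile-time rounding control that overrides the MXCSR setting for that one
+     operation (the [round_note] placeholder above). A sketch using the standard
+     _MM_FROUND flags, assuming AVX-512F:
+
+     #include <immintrin.h>
+
+     __m512 fmadd_rz(__m512 a, __m512 b, __m512 c) {
+         // Round toward zero and suppress exception reporting for this operation only.
+         return _mm512_fmadd_round_ps(a, b, c,
+                                      _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+     }
+
+     The rounding argument must be a compile-time constant, since the bits are encoded
+     directly in the instruction. -->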
+<intrinsic tech="AVX-512/KNC" name="_mm512_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="zmm, zmm, zmm" xed="VFMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="zmm, zmm, zmm" xed="VFMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="zmm, zmm, zmm" xed="VFMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_fmsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="zmm, zmm, zmm {er}" xed="VFMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="zmm, zmm, zmm {er}" xed="VFMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="zmm, zmm, zmm {er}" xed="VFMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="zmm {k}, zmm, zmm" xed="VFMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="zmm {k}, zmm, zmm" xed="VFMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="zmm {k}, zmm, zmm" xed="VFMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fmsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="zmm {k}, zmm, zmm" xed="VFMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="zmm {k}, zmm, zmm" xed="VFMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="zmm {k}, zmm, zmm" xed="VFMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fmsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB213PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFMSUB231PD" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
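+<!-- Editor's note (not part of the Intel data): fmsub is fmadd with the sign of the
+     addend flipped, i.e. dst = a*b - c. A sketch computing a difference of products
+     a*b - c*d, assuming AVX-512F; diff_of_products and prod are illustrative names:
+
+     #include <immintrin.h>
+
+     __m512d diff_of_products(__m512d a, __m512d b, __m512d c, __m512d d) {
+         __m512d prod = _mm512_mul_pd(c, d);
+         return _mm512_fmsub_pd(a, b, prod);   // a*b - c*d, one rounding on the fmsub
+     }
+-->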
+<intrinsic tech="AVX-512/KNC" name="_mm512_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="zmm, zmm, zmm" xed="VFMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="zmm, zmm, zmm" xed="VFMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="zmm, zmm, zmm" xed="VFMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_fmsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="zmm, zmm, zmm {er}" xed="VFMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="zmm, zmm, zmm {er}" xed="VFMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="zmm, zmm, zmm {er}" xed="VFMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="zmm {k}, zmm, zmm" xed="VFMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="zmm {k}, zmm, zmm" xed="VFMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="zmm {k}, zmm, zmm" xed="VFMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fmsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="zmm {k}, zmm, zmm" xed="VFMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="zmm {k}, zmm, zmm" xed="VFMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="zmm {k}, zmm, zmm" xed="VFMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fmsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB213PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFMSUB231PS" form="zmm {k}, zmm, zmm {er}" xed="VFMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
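+<!-- Editor's note (not part of the Intel data): a masked-update sketch for the
+     single-precision fmsub family, assuming AVX-512F; "active" names the lanes to
+     update and is an illustrative parameter:
+
+     #include <immintrin.h>
+
+     __m512 masked_fmsub(__m512 a, __m512 b, __m512 acc, __mmask16 active) {
+         // Active lanes become a*b - acc; inactive lanes keep the old acc (mask3 form).
+         return _mm512_mask3_fmsub_ps(a, b, acc, active);
+     }
+-->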
+<intrinsic tech="AVX-512/KNC" name="_mm512_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="zmm, zmm, zmm" xed="VFNMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="zmm, zmm, zmm" xed="VFNMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="zmm, zmm, zmm" xed="VFNMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_fnmadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="zmm, zmm, zmm {er}" xed="VFNMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="zmm, zmm, zmm {er}" xed="VFNMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="zmm, zmm, zmm {er}" xed="VFNMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="zmm {k}, zmm, zmm" xed="VFNMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="zmm {k}, zmm, zmm" xed="VFNMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="zmm {k}, zmm, zmm" xed="VFNMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fnmadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="zmm {k}, zmm, zmm" xed="VFNMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="zmm {k}, zmm, zmm" xed="VFNMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="zmm {k}, zmm, zmm" xed="VFNMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fnmadd_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD213PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMADD231PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
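+<!-- Editor's note (not part of the Intel data): fnmadd negates the product, giving
+     dst = c - a*b, which makes it the natural primitive for residual computations.
+     A sketch of one Newton-Raphson refinement step for a reciprocal estimate, assuming
+     AVX-512F; "x" is a prior approximation of 1/d obtained elsewhere:
+
+     #include <immintrin.h>
+
+     __m512d recip_refine(__m512d d, __m512d x) {
+         // e = 1 - d*x, computed fused as fnmadd(d, x, 1).
+         __m512d e = _mm512_fnmadd_pd(d, x, _mm512_set1_pd(1.0));
+         // x' = x + x*e
+         return _mm512_fmadd_pd(x, e, x);
+     }
+-->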
+<intrinsic tech="AVX-512/KNC" name="_mm512_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="zmm, zmm, zmm" xed="VFNMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="zmm, zmm, zmm" xed="VFNMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="zmm, zmm, zmm" xed="VFNMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_fnmadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="zmm, zmm, zmm {er}" xed="VFNMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="zmm, zmm, zmm {er}" xed="VFNMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="zmm, zmm, zmm {er}" xed="VFNMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="zmm {k}, zmm, zmm" xed="VFNMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="zmm {k}, zmm, zmm" xed="VFNMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="zmm {k}, zmm, zmm" xed="VFNMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fnmadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="zmm {k}, zmm, zmm" xed="VFNMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="zmm {k}, zmm, zmm" xed="VFNMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="zmm {k}, zmm, zmm" xed="VFNMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fnmadd_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD213PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMADD231PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMADD231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="zmm, zmm, zmm" xed="VFNMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="zmm, zmm, zmm" xed="VFNMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="zmm, zmm, zmm" xed="VFNMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_fnmsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="zmm, zmm, zmm {er}" xed="VFNMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="zmm, zmm, zmm {er}" xed="VFNMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="zmm, zmm, zmm {er}" xed="VFNMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="zmm {k}, zmm, zmm" xed="VFNMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="zmm {k}, zmm, zmm" xed="VFNMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="zmm {k}, zmm, zmm" xed="VFNMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fnmsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="zmm {k}, zmm, zmm" xed="VFNMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="zmm {k}, zmm, zmm" xed="VFNMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="zmm {k}, zmm, zmm" xed="VFNMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fnmsub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="__m512d" varname="c" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB132PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB213PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB213PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <instruction name="VFNMSUB231PD" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB231PD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
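+<!-- Editor's note: a minimal usage sketch, not part of the upstream Intel
+     data. It contrasts the writemask conventions of _mm512_mask_fnmsub_pd
+     (unselected lanes come from "a") and _mm512_mask3_fnmsub_pd (unselected
+     lanes come from "c"). The function name, pointers, and mask value are
+     illustrative assumptions.
+
+#include <immintrin.h>
+
+/* dst = -(a*b) - c in the lanes selected by k. */
+void fnmsub_demo(const double *pa, const double *pb,
+                 const double *pc, double *out) {
+    __m512d a = _mm512_loadu_pd(pa);
+    __m512d b = _mm512_loadu_pd(pb);
+    __m512d c = _mm512_loadu_pd(pc);
+    __mmask8 k = 0x0F;                               /* low four lanes */
+    __m512d r1 = _mm512_mask_fnmsub_pd(a, k, b, c);  /* else lanes of a */
+    __m512d r2 = _mm512_mask3_fnmsub_pd(a, b, c, k); /* else lanes of c */
+    _mm512_storeu_pd(out, r1);
+    _mm512_storeu_pd(out + 8, r2);
+}
+-->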
+<intrinsic tech="AVX-512/KNC" name="_mm512_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="zmm, zmm, zmm" xed="VFNMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="zmm, zmm, zmm" xed="VFNMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="zmm, zmm, zmm" xed="VFNMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_fnmsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="zmm, zmm, zmm {er}" xed="VFNMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="zmm, zmm, zmm {er}" xed="VFNMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="zmm, zmm, zmm {er}" xed="VFNMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="zmm {k}, zmm, zmm" xed="VFNMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="zmm {k}, zmm, zmm" xed="VFNMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="zmm {k}, zmm, zmm" xed="VFNMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask3_fnmsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="zmm {k}, zmm, zmm" xed="VFNMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="zmm {k}, zmm, zmm" xed="VFNMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="zmm {k}, zmm, zmm" xed="VFNMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_fnmsub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="__m512" varname="c" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB132PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB213PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB213PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <instruction name="VFNMSUB231PS" form="zmm {k}, zmm, zmm {er}" xed="VFNMSUB231PS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
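+<!-- Editor's note: an illustrative sketch, not part of the upstream data.
+     Per the [round_note], the rounding argument of the _round_ variants must
+     be a compile-time constant combining a direction with _MM_FROUND_NO_EXC.
+     The wrapper name is an assumption.
+
+#include <immintrin.h>
+
+/* -(a*b) - c with round-toward-zero instead of the current MXCSR mode. */
+__m512 fnmsub_rz(__m512 a, __m512 b, __m512 c) {
+    return _mm512_fnmsub_round_ps(a, b, c,
+                                  _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+-->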
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="zmm, vm32z" xed="VGATHERDPS_ZMMf32_MASKmskw_MEMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="zmm {k}, vm32z" xed="VGATHERDPS_ZMMf32_MASKmskw_MEMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
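+<!-- Editor's note: a usage sketch, not part of the upstream data. Because
+     the gathered elements are 32-bit floats, scale is 4, so the indices are
+     element counts rather than byte offsets; masked-off lanes fall back to
+     "src" (zeros here). Names are illustrative assumptions.
+
+#include <immintrin.h>
+
+/* result[j] = k[j] ? table[idx[j]] : 0.0f, for j = 0..15 */
+__m512 gather_demo(const float *table, const int *idx, __mmask16 k) {
+    __m512i vindex = _mm512_loadu_si512(idx);
+    return _mm512_mask_i32gather_ps(_mm512_setzero_ps(), k,
+                                    vindex, table, 4);
+}
+-->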
+<intrinsic tech="AVX-512/KNC" name="_mm512_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="zmm, zmm" xed="VGETEXPPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_getexp_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.
+ [sae_note]</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="zmm, zmm {sae}" xed="VGETEXPPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_getexp_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="zmm {k}, zmm" xed="VGETEXPPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_getexp_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.
+ [sae_note]</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPD" form="zmm {k}, zmm {sae}" xed="VGETEXPPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="zmm, zmm" xed="VGETEXPPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_getexp_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.
+ [sae_note]</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="zmm, zmm {sae}" xed="VGETEXPPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_getexp_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="zmm {k}, zmm" xed="VGETEXPPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_getexp_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.
+ [sae_note]</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETEXPPS" form="zmm {k}, zmm {sae}" xed="VGETEXPPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
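+<!-- Editor's note: a small sketch, not part of the upstream data, showing
+     what "floor(log2(x))" means in practice: getexp of 10.0 is 3.0 and of
+     0.5 is -1.0 (the magnitude's unbiased exponent, returned as a double).
+
+#include <immintrin.h>
+
+__m512d exponents(__m512d x) {
+    return _mm512_getexp_pd(x);   /* per-lane floor(log2(|x|)) */
+}
+-->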
+<intrinsic tech="AVX-512/KNC" name="_mm512_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="zmm, zmm, imm8" xed="VGETMANTPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_getmant_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="zmm, zmm, imm8 {sae}" xed="VGETMANTPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_getmant_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="zmm {k}, zmm, imm8" xed="VGETMANTPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_getmant_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+	<operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPD" form="zmm {k}, zmm, imm8 {sae}" xed="VGETMANTPD_ZMMf64_MASKmskw_ZMMf64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="zmm, zmm, imm8" xed="VGETMANTPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_getmant_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="zmm, zmm, imm8 {sae}" xed="VGETMANTPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_getmant_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note]</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="zmm {k}, zmm, imm8" xed="VGETMANTPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_getmant_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_MANTISSA_NORM_ENUM" varname="interv" etype="IMM" immtype="_MM_MANTISSA_NORM"/>
+ <parameter type="_MM_MANTISSA_SIGN_ENUM" varname="sc" etype="IMM" immtype="_MM_MANTISSA_SIGN"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+ [getmant_note][sae_note]</description>
+	<operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGETMANTPS" form="zmm {k}, zmm, imm8 {sae}" xed="VGETMANTPS_ZMMf32_MASKmskw_ZMMf32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
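+<!-- Editor's note: an illustrative sketch, not part of the upstream data,
+     combining getmant and getexp into a frexp-like decomposition. For
+     finite, nonzero x each lane satisfies x == mant * 2^exp with |mant| in
+     [1,2) (sign taken from the source via _MM_MANT_SIGN_src). The function
+     name is an assumption.
+
+#include <immintrin.h>
+
+void frexp_like(__m512d x, __m512d *mant, __m512d *exp2) {
+    *mant = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+    *exp2 = _mm512_getexp_pd(x);
+}
+-->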
+<intrinsic tech="AVX-512/KNC" name="_mm512_load_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <description>Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="zmm, m512" xed="VMOVAPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_load_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="zmm {k}, m512" xed="VMOVAPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mov_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Move</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="zmm {k}, zmm" xed="VMOVAPD_ZMMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_store_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPD" form="m512 {k}, zmm" xed="VMOVAPD_MEMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_store_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVAPD" form="m512, zmm" xed="VMOVAPD_MEMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
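+<!-- Editor's note: a usage sketch, not part of the upstream data. The
+     aligned load/store forms require a 64-byte-aligned address; the masked
+     store writes only the selected lanes and leaves the rest of memory
+     untouched. Function name and mask value are illustrative.
+
+#include <immintrin.h>
+
+/* Double the even lanes of an aligned buffer of 8 doubles, in place. */
+void scale_even_lanes(double *buf /* 64-byte aligned */) {
+    __mmask8 k = 0x55;                 /* lanes 0, 2, 4, 6 */
+    __m512d v = _mm512_load_pd(buf);
+    v = _mm512_mul_pd(v, _mm512_set1_pd(2.0));
+    _mm512_mask_store_pd(buf, k, v);
+}
+-->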
+<intrinsic tech="AVX-512/KNC" name="_mm512_load_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <description>Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="zmm, m512" xed="VMOVAPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_load_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="zmm {k}, m512" xed="VMOVAPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mov_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Move</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="zmm {k}, zmm" xed="VMOVAPS_ZMMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_store_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPS" form="m512 {k}, zmm" xed="VMOVAPS_MEMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_store_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVAPS" form="m512, zmm" xed="VMOVAPS_MEMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <description>Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="zmm, m512" xed="VMOVDQA32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_load_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="void const*" varname="mem_addr" etype="M512" memwidth="512"/>
+ <description>Load 512-bits of integer data from memory into "dst".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="zmm, m512" xed="VMOVDQA32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_load_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="zmm {k}, m512" xed="VMOVDQA32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mov_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Move</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="zmm {k}, zmm" xed="VMOVDQA32_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_store_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Store packed 32-bit integers from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA32" form="m512 {k}, zmm" xed="VMOVDQA32_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_store_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVDQA32" form="m512, zmm" xed="VMOVDQA32_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_store_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="M512" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <description>Store 512-bits of integer data from "a" into memory.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVDQA32" form="m512, zmm" xed="VMOVDQA32_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
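+<!-- Editor's note: a sketch, not part of the upstream data. The masked move
+     is effectively a per-lane select driven by the mask bits; the wrapper
+     below is an illustrative assumption.
+
+#include <immintrin.h>
+
+/* lane j of the result = k[j] ? if_set[j] : if_clear[j] */
+__m512i select_epi32(__mmask16 k, __m512i if_set, __m512i if_clear) {
+    return _mm512_mask_mov_epi32(if_clear, k, if_set);
+}
+-->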
+<intrinsic tech="AVX-512/KNC" name="_mm512_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="zmm, m512" xed="VMOVDQA64_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_load_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="zmm {k}, m512" xed="VMOVDQA64_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mov_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Move</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="zmm {k}, zmm" xed="VMOVDQA64_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_store_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Store packed 64-bit integers from "a" into memory using writemask "k".
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA64" form="m512 {k}, zmm" xed="VMOVDQA64_MEMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_store_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="512"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory.
+ "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+ </operation>
+ <instruction name="VMOVDQA64" form="m512, zmm" xed="VMOVDQA64_MEMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPD" form="zmm {k}, zmm, zmm" xed="VMULPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mul_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPD" form="zmm {k}, zmm, zmm {er}" xed="VMULPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPD" form="zmm, zmm, zmm" xed="VMULPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mul_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPD" form="zmm, zmm, zmm {er}" xed="VMULPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPS" form="zmm {k}, zmm, zmm" xed="VMULPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mul_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPS" form="zmm {k}, zmm, zmm {er}" xed="VMULPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPS" form="zmm, zmm, zmm" xed="VMULPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mul_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMULPS" form="zmm, zmm, zmm {er}" xed="VMULPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
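+<!-- Editor's note: an illustrative sketch, not part of the upstream data.
+     Per-instruction rounding lets the same product be computed under
+     opposite directions, e.g. to bracket the exact result for interval
+     arithmetic. The function name is an assumption.
+
+#include <immintrin.h>
+
+void mul_brackets(__m512d a, __m512d b, __m512d *lo, __m512d *hi) {
+    *lo = _mm512_mul_round_pd(a, b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+    *hi = _mm512_mul_round_pd(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+-->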
+<intrinsic tech="AVX-512/KNC" name="_mm512_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDD" form="zmm, zmm, zmm" xed="VPADDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDD" form="zmm {k}, zmm, zmm" xed="VPADDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
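+<!-- Usage sketch (illustrative, assuming AVX-512F): the masked form copies inactive
+     lanes from "src" rather than zeroing them.
+       __m512i a = _mm512_set1_epi32(3), b = _mm512_set1_epi32(4);
+       __m512i sum  = _mm512_add_epi32(a, b);                  /* every lane = 7 */
+       __m512i msum = _mm512_mask_add_epi32(a, 0x0F0F, a, b);  /* 7 where k set, else 3 */
+-->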
+<intrinsic tech="AVX-512/KNC" name="_mm512_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDD" form="zmm, zmm, zmm" xed="VPANDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_and_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <parameter type="__m512i" varname="b" etype="M512"/>
+ <description>Compute the bitwise AND of 512 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[511:0] := (a[511:0] AND b[511:0])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDD" form="zmm, zmm, zmm" xed="VPANDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_andnot_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDND" form="zmm, zmm, zmm" xed="VPANDND_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_andnot_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <parameter type="__m512i" varname="b" etype="M512"/>
+ <description>Compute the bitwise NOT of 512 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+ <operation>
+dst[511:0] := ((NOT a[511:0]) AND b[511:0])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDND" form="zmm, zmm, zmm" xed="VPANDND_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_andnot_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDND" form="zmm {k}, zmm, zmm" xed="VPANDND_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_andnot_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in "a" and then AND with "b", and store the results in "dst".</description>
+ <operation>
+dst[511:0] := ((NOT a[511:0]) AND b[511:0])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDNQ" form="zmm, zmm, zmm" xed="VPANDNQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_andnot_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDNQ" form="zmm {k}, zmm, zmm" xed="VPANDNQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst[511:0] := (a[511:0] AND b[511:0])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDQ" form="zmm, zmm, zmm" xed="VPANDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDQ" form="zmm {k}, zmm, zmm" xed="VPANDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
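+<!-- Usage sketch (illustrative, assuming AVX-512F): the _epi32/_epi64 and _si512 logical
+     forms compute the same 512-bit result; the element width only matters for masking.
+     Note the ANDNOT operand order: the FIRST argument is complemented.
+       __m512i v    = _mm512_set1_epi32(0x12345678);
+       __m512i mask = _mm512_set1_epi32(0x0000FFFF);
+       __m512i low  = _mm512_and_si512(v, mask);     /* keep low 16 bits per lane  */
+       __m512i high = _mm512_andnot_epi32(mask, v);  /* (NOT mask) AND v           */
+-->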
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_blend_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBLENDMD" form="zmm {k}, zmm, zmm" xed="VPBLENDMD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_blend_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPBLENDMQ" form="zmm {k}, zmm, zmm" xed="VPBLENDMQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
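+<!-- Usage sketch (illustrative, assuming AVX-512F): blend selects "b" where the mask
+     bit is set and "a" elsewhere, which composes naturally with the compare masks below.
+       __m512i a = _mm512_set1_epi32(-1), b = _mm512_set1_epi32(1);
+       __m512i r = _mm512_mask_blend_epi32(0xAAAA, a, b);  /* odd lanes from b */
+-->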
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmp_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
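+<!-- Usage sketch (illustrative, assuming AVX-512F): _MM_CMPINT_* selects the predicate,
+     so this one intrinsic covers all six orderings plus the constant FALSE/TRUE cases.
+       __m512i a = _mm512_set1_epi32(2), b = _mm512_set1_epi32(5);
+       __mmask16 lt = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);  /* 0xFFFF here */
+-->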
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpeq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPEQD" form="k, zmm, zmm" xed="VPCMPEQD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpge_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpgt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPGTD" form="k, zmm, zmm" xed="VPCMPGTD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmple_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpneq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmp_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpeq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPEQD" form="k {k}, zmm, zmm" xed="VPCMPEQD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpge_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpgt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPGTD" form="k {k}, zmm, zmm" xed="VPCMPGTD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmple_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+	<description>Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpneq_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPD" form="k {k}, zmm, zmm, imm8" xed="VPCMPD_MASKmskw_MASKmskw_ZMMi32_ZMMi32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
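+<!-- Usage sketch (illustrative, assuming AVX-512F): the "mask_cmp" forms AND the result
+     with zeromask "k1", which chains range tests without manipulating k values directly.
+       __m512i v  = _mm512_set1_epi32(7);
+       __mmask16 ge0  = _mm512_cmpge_epi32_mask(v, _mm512_setzero_si512());
+       __mmask16 in01 = _mm512_mask_cmple_epi32_mask(ge0, v, _mm512_set1_epi32(10));
+       /* in01 is set only where 0 <= v AND v <= 10 */
+-->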
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmp_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpeq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpge_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpgt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmple_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmplt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cmpneq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
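+<!-- Usage sketch (illustrative, assuming AVX-512F): the _epu32 forms compare the same
+     bits as _epi32 but without sign, so all-ones is the largest value rather than -1.
+       __m512i x = _mm512_set1_epi32(-1);  /* all bits set: UINT32_MAX when unsigned */
+       __m512i y = _mm512_set1_epi32(1);
+       __mmask16 s = _mm512_cmpgt_epi32_mask(x, y);  /* 0x0000: -1 > 1 is false      */
+       __mmask16 u = _mm512_cmpgt_epu32_mask(x, y);  /* 0xFFFF: unsigned max > 1     */
+-->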
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmp_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="_MM_CMPINT_ENUM" varname="imm8" etype="IMM" immtype="_MM_CMPINT"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>CASE (imm8[2:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpeq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpge_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpgt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmple_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmplt_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cmpneq_epu32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPUD" form="k {k}, zmm, zmm, imm8" xed="VPCMPUD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_permutevar_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_mask_permutexvar_epi32", and it is recommended that you use that intrinsic name.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ id := idx[i+3:i]*32
+ IF k[j]
+ dst[i+31:i] := a[id+31:id]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMD" form="zmm {k}, zmm, zmm" xed="VPERMD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_permutevar_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="idx" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_permutexvar_epi32", and it is recommended that you use that intrinsic name.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ id := idx[i+3:i]*32
+ dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMD" form="zmm, zmm, zmm" xed="VPERMD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
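+<!-- Usage sketch (illustrative, assuming AVX-512F): note the index vector comes first;
+     as the descriptions above recommend, prefer the _mm512_permutexvar_epi32 spelling.
+       __m512i v   = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+       __m512i idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+       __m512i rev = _mm512_permutevar_epi32(idx, v);  /* lanes of v reversed: 15 .. 0 */
+-->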
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="zmm, vm32z" xed="VPGATHERDD_ZMMu32_MASKmskw_MEMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32gather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="zmm {k}, vm32z" xed="VPGATHERDD_ZMMu32_MASKmskw_MEMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
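+<!-- Usage sketch (illustrative, assuming AVX-512F): "scale" must be a compile-time
+     constant of 1, 2, 4 or 8; with 32-bit elements, scale 4 and indices 0..15 amount
+     to a contiguous 64-byte load.
+       int table[16] = {0};
+       __m512i idx = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
+                                       8, 9, 10, 11, 12, 13, 14, 15);
+       __m512i g = _mm512_i32gather_epi32(idx, table, 4);
+-->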
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSD" form="zmm {k}, zmm, zmm" xed="VPMAXSD_ZMMi32_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXSD" form="zmm, zmm, zmm" xed="VPMAXSD_ZMMi32_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUD" form="zmm {k}, zmm, zmm" xed="VPMAXUD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMAXUD" form="zmm, zmm, zmm" xed="VPMAXUD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSD" form="zmm {k}, zmm, zmm" xed="VPMINSD_ZMMi32_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINSD" form="zmm, zmm, zmm" xed="VPMINSD_ZMMi32_MASKmskw_ZMMi32_ZMMi32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUD" form="zmm {k}, zmm, zmm" xed="VPMINUD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMINUD" form="zmm, zmm, zmm" xed="VPMINUD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
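+<!-- Usage sketch (illustrative, assuming AVX-512F): min and max compose into a clamp.
+       __m512i v  = _mm512_set1_epi32(300);
+       __m512i lo = _mm512_set1_epi32(0), hi = _mm512_set1_epi32(255);
+       __m512i clamped = _mm512_min_epi32(_mm512_max_epi32(v, lo), hi);  /* lanes = 255 */
+-->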
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLD" form="zmm {k}, zmm, zmm" xed="VPMULLD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULLD" form="zmm, zmm, zmm" xed="VPMULLD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
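+<!-- Usage sketch (illustrative, assuming AVX-512F): only the low 32 bits of each 64-bit
+     intermediate product are kept, so overflow wraps exactly like scalar 32-bit multiply.
+       __m512i a = _mm512_set1_epi32(0x10000), b = _mm512_set1_epi32(0x10000);
+       __m512i p = _mm512_mullo_epi32(a, b);  /* every lane = 0: low 32 bits of 2^32 */
+-->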
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPORD" form="zmm {k}, zmm, zmm" xed="VPORD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPORD" form="zmm, zmm, zmm" xed="VPORD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_or_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <parameter type="__m512i" varname="b" etype="M512"/>
+ <description>Compute the bitwise OR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[511:0] := (a[511:0] OR b[511:0])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPORD" form="zmm, zmm, zmm" xed="VPORD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPORQ" form="zmm {k}, zmm, zmm" xed="VPORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+	<description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPORQ" form="zmm, zmm, zmm" xed="VPORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDD" form="vm32z, zmm" xed="VPSCATTERDD_MEMu32_MASKmskw_ZMMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32scatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDD" form="vm32z {k}, zmm" xed="VPSCATTERDD_MEMu32_MASKmskw_ZMMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
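+<!-- Usage sketch (illustrative, assuming AVX-512F): the masked form suppresses stores
+     for clear mask bits, so only the selected table slots are written.
+       int out[16] = {0};
+       __m512i idx = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
+                                       8, 9, 10, 11, 12, 13, 14, 15);
+       _mm512_mask_i32scatter_epi32(out, 0x00FF, idx, _mm512_set1_epi32(42), 4);
+       /* out[0..7] == 42; out[8..15] are left untouched */
+-->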
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := tmp_dst[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFD" form="zmm {k}, zmm, imm8" xed="VPSHUFD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHUFD" form="zmm, zmm, imm8" xed="VPSHUFD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
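+<!-- Usage sketch (illustrative, assuming AVX-512F): _MM_PERM_ENUM spells out the four
+     per-lane selections; _MM_PERM_AAAA replicates element 0 of each 128-bit lane.
+       __m512i v = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+       __m512i s = _mm512_shuffle_epi32(v, _MM_PERM_AAAA);
+       /* s = {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} */
+-->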
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_slli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLD" form="zmm {k}, zmm, imm8" xed="VPSLLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_slli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLD" form="zmm, zmm, imm8" xed="VPSLLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="zmm {k}, zmm, zmm" xed="VPSLLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_sllv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSLLVD" form="zmm, zmm, zmm" xed="VPSLLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
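+<!-- Usage sketch (illustrative; not part of the Intel data; immintrin.h as above).
+     Unlike the imm8 forms, the "v" variants take a per-element shift count; counts of
+     32 or more zero the lane rather than being reduced modulo 32:
+
+       __m512i ones   = _mm512_set1_epi32(1);
+       __m512i counts = _mm512_set1_epi32(3);
+       __m512i r      = _mm512_sllv_epi32(ones, counts);   // every lane: 1 << 3 = 8
+-->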
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_srai_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAD" form="zmm {k}, zmm, imm8" xed="VPSRAD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_srai_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="6"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAD" form="zmm, zmm, imm8" xed="VPSRAD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
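+<!-- Worked example (illustrative; not part of the Intel data; immintrin.h as above).
+     The arithmetic shift replicates the sign bit, while the logical shift (VPSRLD,
+     defined below) shifts in zeros:
+
+       __m512i v  = _mm512_set1_epi32(-8);                 // 0xFFFFFFF8 in every lane
+       __m512i sa = _mm512_srai_epi32(v, 1);               // -4 (0xFFFFFFFC): sign bits in
+       __m512i sl = _mm512_srli_epi32(v, 1);               // 0x7FFFFFFC: zeros shift in
+-->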
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="zmm {k}, zmm, zmm" xed="VPSRAVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_srav_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRAVD" form="zmm, zmm, zmm" xed="VPSRAVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_srli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLD" form="zmm {k}, zmm, imm8" xed="VPSRLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_srli_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLD" form="zmm, zmm, imm8" xed="VPSRLD_ZMMu32_MASKmskw_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="zmm {k}, zmm, zmm" xed="VPSRLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_srlv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF count[i+31:i] &lt; 32
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSRLVD" form="zmm, zmm, zmm" xed="VPSRLVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_sub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBD" form="zmm {k}, zmm, zmm" xed="VPSUBD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_sub_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBD" form="zmm, zmm, zmm" xed="VPSUBD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_test_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTMD" form="k {k}, zmm, zmm" xed="VPTESTMD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_test_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPTESTMD" form="k, zmm, zmm" xed="VPTESTMD_MASKmskw_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
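+<!-- Usage sketch (illustrative; not part of the Intel data; immintrin.h as above).
+     Testing a vector against itself yields a mask of its non-zero lanes, a common
+     building block for later masked operations:
+
+       __m512i v = _mm512_setr_epi32(0,1,0,2, 0,3,0,4, 0,5,0,6, 0,7,0,8);
+       __mmask16 nz = _mm512_test_epi32_mask(v, v);        // 0xAAAA: odd lanes non-zero
+-->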
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPXORD" form="zmm {k}, zmm, zmm" xed="VPXORD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_xor_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPXORD" form="zmm, zmm, zmm" xed="VPXORD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_xor_si512">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <parameter type="__m512i" varname="b" etype="M512"/>
+ <description>Compute the bitwise XOR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[511:0] := (a[511:0] XOR b[511:0])
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPXORD" form="zmm, zmm, zmm" xed="VPXORD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
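+<!-- Usage sketch (illustrative; not part of the Intel data; immintrin.h as above).
+     XORing a register with itself is the idiomatic way to materialize zero, and
+     compilers treat it as a dependency-breaking zeroing idiom:
+
+       __m512i v    = _mm512_set1_epi32(0x5A5A5A5A);
+       __m512i zero = _mm512_xor_si512(v, v);              // all 512 bits cleared
+-->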
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPXORQ" form="zmm {k}, zmm, zmm" xed="VPXORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_xor_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPXORQ" form="zmm, zmm, zmm" xed="VPXORQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPS" form="vm32z, zmm" xed="VSCATTERDPS_MEMf32_MASKmskw_ZMMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPS" form="vm32z {k}, zmm" xed="VSCATTERDPS_MEMf32_MASKmskw_ZMMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
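+<!-- Usage sketch (illustrative; not part of the Intel data; buffer and index values are
+     hypothetical). With scale = 4 the 32-bit indices count float elements from
+     base_addr, so lane j stores to table[idx[j]] when k[j] is set:
+
+       float table[64] = {0};
+       int   idx_mem[16] = {0,2,4,6, 8,10,12,14, 16,18,20,22, 24,26,28,30};
+       __m512i   idx = _mm512_loadu_si512(idx_mem);        // element indices, stride 2
+       __mmask16 k   = 0x00FF;                             // store only the low 8 lanes
+       _mm512_mask_i32scatter_ps(table, k, idx, _mm512_set1_ps(1.0f), 4);
+-->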
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPD" form="zmm {k}, zmm, zmm" xed="VSUBPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_sub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPD" form="zmm {k}, zmm, zmm {er}" xed="VSUBPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPD" form="zmm, zmm, zmm" xed="VSUBPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_sub_round_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPD" form="zmm, zmm, zmm {er}" xed="VSUBPD_ZMMf64_MASKmskw_ZMMf64_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPS" form="zmm {k}, zmm, zmm" xed="VSUBPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_sub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPS" form="zmm {k}, zmm, zmm {er}" xed="VSUBPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPS" form="zmm, zmm, zmm" xed="VSUBPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_sub_round_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBPS" form="zmm, zmm, zmm {er}" xed="VSUBPS_ZMMf32_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
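+<!-- Usage sketch (illustrative; not part of the Intel data; immintrin.h as above).
+     The _round_ variants embed a static rounding mode per call, typically combined
+     with suppress-all-exceptions:
+
+       __m512 a = _mm512_set1_ps(1.0f), b = _mm512_set1_ps(3.0f);
+       __m512 r = _mm512_sub_round_ps(a, b,
+                      _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);  // a - b, toward -inf
+-->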
+<intrinsic tech="AVX-512/KNC" name="_mm512_castpd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Cast</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m512d to type __m512.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_castpd_si512">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Cast</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m512d to type __m512i.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_castps_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Cast</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m512 to type __m512d.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_castps_si512">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Cast</category>
+ <return type="__m512i" varname="dst" etype="M512"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m512 to type __m512i.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_castsi512_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Cast</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Cast vector of type __m512i to type __m512d.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_castsi512_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Cast</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Cast vector of type __m512i to type __m512.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
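+<!-- Usage sketch (illustrative; not part of the Intel data; immintrin.h as above).
+     The casts are pure bit-pattern reinterpretations, so they compose with integer
+     ops for bit manipulation of floating-point data:
+
+       __m512  x    = _mm512_set1_ps(-2.5f);
+       __m512i bits = _mm512_castps_si512(x);              // reinterpret, no instruction
+       __m512i sgn  = _mm512_and_si512(bits, _mm512_set1_epi32((int)0x80000000u));
+       __m512  s    = _mm512_castsi512_ps(sgn);            // -0.0f in every lane
+-->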
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed 32-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a".</description>
+ <operation>
+dst[31:0] := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := dst[31:0] + a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed 64-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a".</description>
+ <operation>
+dst[63:0] := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := dst[63:0] + a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a".</description>
+ <operation>
+dst[63:0] := 0.0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := dst[63:0] + a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a".</description>
+ <operation>
+dst[31:0] := 0.0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := dst[31:0] + a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
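+<!-- Usage sketch (illustrative; not part of the Intel data; immintrin.h as above).
+     A mask produced by a compare turns these reductions into conditional horizontal
+     sums:
+
+       __m512 v = _mm512_set1_ps(2.0f);
+       __mmask16 pos = _mm512_cmp_ps_mask(v, _mm512_setzero_ps(), _CMP_GT_OQ);
+       float sum = _mm512_mask_reduce_add_ps(pos, v);      // 16 active lanes: 32.0f
+-->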
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed 32-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a".</description>
+ <operation>
+dst[31:0] := 0xFFFFFFFF
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := dst[31:0] AND a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed 64-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a".</description>
+ <operation>
+dst[63:0] := 0xFFFFFFFFFFFFFFFF
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := dst[63:0] AND a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="int" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+ <operation>
+dst[31:0] := Int32(-0x80000000)
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := (dst[31:0] &gt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__int64" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+ <operation>
+dst[63:0] := Int64(-0x8000000000000000)
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := (dst[63:0] &gt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+ <operation>
+dst[31:0] := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := (dst[31:0] &gt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed unsigned 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+ <operation>
+dst[63:0] := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := (dst[63:0] &gt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+ <operation>
+dst[63:0] := Cast_FP64(0xFFEFFFFFFFFFFFFF)
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := (dst[63:0] &gt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+ <operation>
+dst[31:0] := Cast_FP32(0xFF7FFFFF)
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := (dst[31:0] &gt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="int" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Reduce the packed signed 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+ <operation>
+dst[31:0] := Int32(0x7FFFFFFF)
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := (dst[31:0] &lt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__int64" varname="dst" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Reduce the packed signed 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+ <operation>
+dst[63:0] := Int64(0x7FFFFFFFFFFFFFFF)
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := (dst[63:0] &lt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed unsigned 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+ <operation>
+dst[31:0] := 0xFFFFFFFF
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := (dst[31:0] &lt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed unsigned 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+ <operation>
+dst[63:0] := 0xFFFFFFFFFFFFFFFF
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := (dst[63:0] &lt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+ <operation>
+dst[63:0] := Cast_FP64(0x7FEFFFFFFFFFFFFF)
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := (dst[63:0] &lt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+ <operation>
+dst[31:0] := Cast_FP32(0x7F7FFFFF)
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := (dst[31:0] &lt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed 32-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a".</description>
+ <operation>
+dst[31:0] := 1
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := dst[31:0] * a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_mul_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed 64-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a".</description>
+ <operation>
+dst[63:0] := 1
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := dst[63:0] * a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a".</description>
+ <operation>
+dst[63:0] := 1.0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := dst[63:0] * a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a".</description>
+ <operation>
+dst[31:0] := FP32(1.0)
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := dst[31:0] * a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed 32-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a".</description>
+ <operation>
+dst[31:0] := 0
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[31:0] := dst[31:0] OR a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_mask_reduce_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed 64-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a".</description>
+ <operation>
+dst[63:0] := 0
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[63:0] := dst[63:0] OR a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_add_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed 32-bit integers in "a" by addition. Returns the sum of all elements in "a".</description>
+ <operation>
+dst[31:0] := 0
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := dst[31:0] + a[i+31:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_add_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed 64-bit integers in "a" by addition. Returns the sum of all elements in "a".</description>
+ <operation>
+dst[63:0] := 0
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := dst[63:0] + a[i+63:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_add_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a".</description>
+ <operation>
+dst[63:0] := 0.0
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := dst[63:0] + a[i+63:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_add_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a".</description>
+ <operation>
+dst[31:0] := 0.0
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := dst[31:0] + a[i+31:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
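+<!-- Usage sketch (illustrative; not part of the Intel data; immintrin.h as above).
+     These sequence="TRUE" entries expand to a compiler-generated shuffle/add tree
+     rather than a single instruction:
+
+       __m512 v = _mm512_set1_ps(1.5f);
+       float total = _mm512_reduce_add_ps(v);              // 16 * 1.5f = 24.0f
+-->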
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed 32-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a".</description>
+ <operation>
+dst[31:0] := 0xFFFFFFFF
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := dst[31:0] AND a[i+31:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_and_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed 64-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a".</description>
+ <operation>
+dst[63:0] := 0xFFFFFFFFFFFFFFFF
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := dst[63:0] AND a[i+63:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_max_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="int" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Reduce the packed signed 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a".</description>
+ <operation>
+dst[31:0] := Int32(-0x80000000)
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := (dst[31:0] &gt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_max_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__int64" varname="dst" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Reduce the packed signed 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a".</description>
+ <operation>
+dst[63:0] := Int64(-0x8000000000000000)
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := (dst[63:0] &gt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_max_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed unsigned 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a".</description>
+ <operation>
+dst[31:0] := 0
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := (dst[31:0] &gt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_max_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed unsigned 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a".</description>
+ <operation>
+dst[63:0] := 0
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := (dst[63:0] &gt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_max_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a".</description>
+ <operation>
+dst[63:0] := Cast_FP64(0xFFEFFFFFFFFFFFFF)
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := (dst[63:0] &gt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_max_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a".</description>
+ <operation>
+dst[31:0] := Cast_FP32(0xFF7FFFFF)
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := (dst[31:0] &gt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_min_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="int" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <description>Reduce the packed signed 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a".</description>
+ <operation>
+dst[31:0] := Int32(0x7FFFFFFF)
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := (dst[31:0] &lt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_min_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__int64" varname="dst" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="SI64"/>
+ <description>Reduce the packed signed 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a".</description>
+ <operation>
+dst[63:0] := Int64(0x7FFFFFFFFFFFFFFF)
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := (dst[63:0] &lt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_min_epu32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed unsigned 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a".</description>
+ <operation>
+dst[31:0] := 0xFFFFFFFF
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := (dst[31:0] &lt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_min_epu64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed unsigned 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a".</description>
+ <operation>
+dst[63:0] := 0xFFFFFFFFFFFFFFFF
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := (dst[63:0] &lt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_min_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a".</description>
+ <operation>
+dst[63:0] := Cast_FP64(0x7FEFFFFFFFFFFFFF)
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := (dst[63:0] &lt; a[i+63:i] ? dst[63:0] : a[i+63:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_min_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a".</description>
+ <operation>
+dst[31:0] := Cast_FP32(0x7F7FFFFF)
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := (dst[31:0] &lt; a[i+31:i] ? dst[31:0] : a[i+31:i])
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_mul_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed 32-bit integers in "a" by multiplication. Returns the product of all elements in "a".</description>
+ <operation>
+dst[31:0] := 1
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := dst[31:0] * a[i+31:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_mul_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed 64-bit integers in "a" by multiplication. Returns the product of all elements in "a".</description>
+ <operation>
+dst[63:0] := 1
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := dst[63:0] * a[i+63:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a".</description>
+ <operation>
+dst[63:0] := 1.0
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := dst[63:0] * a[i+63:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a".</description>
+ <operation>
+dst[31:0] := FP32(1.0)
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := dst[31:0] * a[i+31:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
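+<!--
+  The multiply reductions seed "dst" with the multiplicative identity 1
+  and keep only the low 32 or 64 bits of each partial product. Scalar
+  sketch of _mm512_reduce_mul_epi32 (the name and the wrap-around
+  unsigned arithmetic are illustrative assumptions):
+
+  #include <stdint.h>
+  int32_t reduce_mul_epi32_ref(const int32_t a[16]) {
+      uint32_t p = 1;                    /* identity element */
+      for (int j = 0; j < 16; ++j)
+          p *= (uint32_t)a[j];           /* low 32 bits, as above */
+      return (int32_t)p;
+  }
+-->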
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_or_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Reduce the packed 32-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a".</description>
+ <operation>
+dst[31:0] := 0
+FOR j := 0 to 15
+ i := j*32
+ dst[31:0] := dst[31:0] OR a[i+31:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="TRUE" name="_mm512_reduce_or_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Reduce the packed 64-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a".</description>
+ <operation>
+dst[63:0] := 0
+FOR j := 0 to 7
+ i := j*64
+ dst[63:0] := dst[63:0] OR a[i+63:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
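+<!--
+  The OR reductions fold with identity 0, so the result has a bit set
+  wherever any element does. Scalar sketch of _mm512_reduce_or_epi64:
+
+  #include <stdint.h>
+  int64_t reduce_or_epi64_ref(const int64_t a[8]) {
+      uint64_t r = 0;
+      for (int j = 0; j < 8; ++j)
+          r |= (uint64_t)a[j];
+      return (int64_t)r;
+  }
+-->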
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_and_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Logical</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <description>Performs element-by-element bitwise AND between packed 32-bit integer elements of "v2" and "v3", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v2[i+31:i] &amp; v3[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDD" form="zmm {k}, zmm, zmm" xed="VPANDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
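+<!--
+  Usage sketch for the writemask form above: AND only the even lanes
+  and pass "src" through in the odd lanes. The 0x5555 mask value is an
+  illustrative assumption:
+
+  #include <immintrin.h>
+  __m512i and_even_lanes(__m512i src, __m512i v2, __m512i v3) {
+      return _mm512_mask_and_epi32(src, (__mmask16)0x5555, v2, v3);
+  }
+-->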
+<intrinsic tech="AVX-512/KNC" name="_mm512_cvtpslo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <description>Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := j*64
+ dst[n+63:n] := Convert_FP32_To_FP64(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cvtpslo_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <description>Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[l+63:l] := Convert_FP32_To_FP64(v2[i+31:i])
+ ELSE
+ dst[l+63:l] := src[l+63:l]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPS2PD" form="zmm {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
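+<!--
+  The *_cvtpslo_pd pair converts only the low eight floats of the
+  source. A sketch of the unmasked form in terms of generic AVX-512F
+  intrinsics (a behavioral equivalence, not the documented mapping):
+
+  #include <immintrin.h>
+  __m512d cvtpslo_pd_ref(__m512 v2) {
+      return _mm512_cvtps_pd(_mm512_castps512_ps256(v2));
+  }
+-->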
+<intrinsic tech="AVX-512/KNC" name="_mm512_cvtepi32lo_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="v2" etype="SI32"/>
+ <description>Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cvtepi32lo_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v2" etype="SI32"/>
+ <description>Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := j*64
+ IF k[j]
+ dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i])
+ ELSE
+ dst[n+63:n] := src[n+63:n]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTDQ2PD" form="zmm {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cvtepu32lo_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <description>Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ n := j*64
+ dst[n+63:n] := Convert_UInt32_To_FP64(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cvtepu32lo_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <description>Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ l := j*64
+ IF k[j]
+ dst[l+63:l] := Convert_UInt32_To_FP64(v2[i+31:i])
+ ELSE
+ dst[l+63:l] := src[l+63:l]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTUDQ2PD" form="zmm {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
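+<!--
+  For the epu32lo variants each source lane is treated as unsigned, so
+  0xFFFFFFFF converts to 4294967295.0 rather than -1.0. Scalar sketch
+  of one lane (exact, since every uint32 is representable in FP64):
+
+  #include <stdint.h>
+  double cvtepu32_lane(uint32_t x) { return (double)x; }
+-->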
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32extgather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const *" varname="base_addr" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 16 memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst". AVX512 only supports _MM_UPCONV_EPI32_NONE.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE: dst[i+31:i] := MEM[addr+31:addr]
+ _MM_UPCONV_EPI32_UINT8: dst[i+31:i] := ZeroExtend32(MEM[addr+7:addr])
+ _MM_UPCONV_EPI32_SINT8: dst[i+31:i] := SignExtend32(MEM[addr+7:addr])
+ _MM_UPCONV_EPI32_UINT16: dst[i+31:i] := ZeroExtend32(MEM[addr+15:addr])
+ _MM_UPCONV_EPI32_SINT16: dst[i+31:i] := SignExtend32(MEM[addr+15:addr])
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="zmm, m512" xed="VPGATHERDD_ZMMu32_MASKmskw_MEMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32extgather_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const *" varname="base_addr" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 16 32-bit integer memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). AVX512 only supports _MM_UPCONV_EPI32_NONE.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE: dst[i+31:i] := MEM[addr+31:addr]
+ _MM_UPCONV_EPI32_UINT8: dst[i+31:i] := ZeroExtend32(MEM[addr+7:addr])
+ _MM_UPCONV_EPI32_SINT8: dst[i+31:i] := SignExtend32(MEM[addr+7:addr])
+ _MM_UPCONV_EPI32_UINT16: dst[i+31:i] := ZeroExtend32(MEM[addr+15:addr])
+ _MM_UPCONV_EPI32_SINT16: dst[i+31:i] := SignExtend32(MEM[addr+15:addr])
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDD" form="zmm {k}, m512" xed="VPGATHERDD_ZMMu32_MASKmskw_MEMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
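+<!--
+  Usage sketch for the no-up-conversion case, which is the only one
+  AVX512 accepts; the scale of 4 and the _MM_HINT_NONE constant from
+  the KNC headers are illustrative assumptions:
+
+  #include <immintrin.h>
+  __m512i gather_i32(const int *table, __m512i idx) {
+      return _mm512_i32extgather_epi32(idx, table,
+                                       _MM_UPCONV_EPI32_NONE,
+                                       4, _MM_HINT_NONE);
+  }
+-->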
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32loextgather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 64-bit integer memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE: dst[i+63:i] := MEM[addr+63:addr]
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="zmm, m512" xed="VPGATHERDQ_ZMMu64_MASKmskw_MEMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32loextgather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 64-bit integer memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE: dst[i+63:i] := MEM[addr+63:addr]
+ ESAC
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="zmm {k}, m512" xed="VPGATHERDQ_ZMMu64_MASKmskw_MEMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32extgather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const *" varname="base_addr" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 16 memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in "dst". AVX512 only supports _MM_UPCONV_PS_NONE.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_UPCONV_PS_NONE: dst[i+31:i] := MEM[addr+31:addr]
+ _MM_UPCONV_PS_FLOAT16: dst[i+31:i] := Convert_FP16_To_FP32(MEM[addr+15:addr])
+ _MM_UPCONV_PS_UINT8: dst[i+31:i] := Convert_UInt8_To_FP32(MEM[addr+7:addr])
+ _MM_UPCONV_PS_SINT8: dst[i+31:i] := Convert_Int8_To_FP32(MEM[addr+7:addr])
+ _MM_UPCONV_PS_UINT16: dst[i+31:i] := Convert_UInt16_To_FP32(MEM[addr+15:addr])
+ _MM_UPCONV_PS_SINT16: dst[i+31:i] := Convert_Int16_To_FP32(MEM[addr+15:addr])
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="zmm, m512" xed="VGATHERDPS_ZMMf32_MASKmskw_MEMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32extgather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const *" varname="base_addr" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 16 single-precision (32-bit) memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). AVX512 only supports _MM_UPCONV_PS_NONE.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_UPCONV_PS_NONE: dst[i+31:i] := MEM[addr+31:addr]
+ _MM_UPCONV_PS_FLOAT16: dst[i+31:i] := Convert_FP16_To_FP32(MEM[addr+15:addr])
+ _MM_UPCONV_PS_UINT8: dst[i+31:i] := Convert_UInt8_To_FP32(MEM[addr+7:addr])
+ _MM_UPCONV_PS_SINT8: dst[i+31:i] := Convert_Int8_To_FP32(MEM[addr+7:addr])
+ _MM_UPCONV_PS_UINT16: dst[i+31:i] := Convert_UInt16_To_FP32(MEM[addr+15:addr])
+ _MM_UPCONV_PS_SINT16: dst[i+31:i] := Convert_Int16_To_FP32(MEM[addr+15:addr])
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPS" form="zmm {k}, m512" xed="VGATHERDPS_ZMMf32_MASKmskw_MEMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32loextgather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_UPCONV_PD_NONE: dst[i+63:i] := MEM[addr+63:addr]
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="zmm, m512" xed="VGATHERDPD_ZMMf64_MASKmskw_MEMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32loextgather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ dst[i+63:i] := MEM[addr+63:addr]
+ ESAC
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="zmm {k}, m512" xed="VGATHERDPD_ZMMf64_MASKmskw_MEMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32extscatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 16 packed single-precision (32-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". AVX512 only supports _MM_DOWNCONV_PS_NONE.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i]
+ _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i])
+ _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i])
+ _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i])
+ _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i])
+ _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i])
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPS" form="m512, zmm" xed="VSCATTERDPS_MEMf32_MASKmskw_ZMMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32extscatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 16 packed single-precision (32-bit) floating-point elements in "a" according to "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are written only when the corresponding mask bit is set). AVX512 only supports _MM_DOWNCONV_PS_NONE.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i]
+ _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i])
+ _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i])
+ _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i])
+ _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i])
+ _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i])
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPS" form="m512 {k}, zmm" xed="VSCATTERDPS_MEMf32_MASKmskw_ZMMf32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
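+<!--
+  Masked scatter usage sketch (mask, scale, and hint values are
+  illustrative; only _MM_DOWNCONV_PS_NONE is valid on AVX512):
+
+  #include <immintrin.h>
+  void scatter_ps_even(float *table, __m512i idx, __m512 a) {
+      _mm512_mask_i32extscatter_ps(table, (__mmask16)0x5555, idx, a,
+                                   _MM_DOWNCONV_PS_NONE, 4,
+                                   _MM_HINT_NONE);
+  }
+-->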
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32loextscatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i]
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="m512, zmm" xed="VSCATTERDPD_MEMf64_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32loextscatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i]
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="m512 {k}, zmm" xed="VSCATTERDPD_MEMf64_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32loextscatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed 64-bit integer elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i]
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="m512, zmm" xed="VPSCATTERDQ_MEMu64_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32loextscatter_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed 64-bit integer elements in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i]
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="m512 {k}, zmm" xed="VPSCATTERDQ_MEMu64_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_cvtpd_pslo">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <description>Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst". The elements are stored in the lower half of the result vector, while the remaining upper half locations are set to 0.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k := j*32
+ dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_cvtpd_pslo">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <description>Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements are stored in the lower half of the result vector, while the remaining upper half locations are set to 0.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="zmm {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
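+<!--
+  Sketch of the unmasked narrowing with generic AVX-512F intrinsics.
+  _mm512_zextps256_ps512 zeroes the upper half, as the description
+  requires; its availability varies by compiler version (assumption):
+
+  #include <immintrin.h>
+  __m512 cvtpd_pslo_ref(__m512d v2) {
+      return _mm512_zextps256_ps512(_mm512_cvtpd_ps(v2));
+  }
+-->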
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32logather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="zmm, m512" xed="VPGATHERDQ_ZMMu64_MASKmskw_MEMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32logather_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPGATHERDQ" form="zmm {k}, m512" xed="VPGATHERDQ_ZMMu64_MASKmskw_MEMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32logather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Loads 8 double-precision (64-bit) floating-point elements from memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="zmm, m512" xed="VGATHERDPD_ZMMf64_MASKmskw_MEMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32logather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Loads 8 double-precision (64-bit) floating-point elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ dst[i+63:i] := MEM[addr+63:addr]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGATHERDPD" form="zmm {k}, m512" xed="VGATHERDPD_ZMMf64_MASKmskw_MEMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32loscatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="m512, zmm" xed="VSCATTERDPD_MEMf64_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32loscatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERDPD" form="m512 {k}, zmm" xed="VSCATTERDPD_MEMf64_MASKmskw_ZMMf64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
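+<!--
+  Scalar transcription of _mm512_i32loscatter_pd for the common case
+  where "scale" equals sizeof(double); treating index*scale as a byte
+  offset is an assumption, since the operation text above also carries
+  a trailing *8 factor in its addressing notation:
+
+  #include <stdint.h>
+  void i32loscatter_pd_ref(double *base, const int32_t idx[8],
+                           const double a[8]) {
+      for (int j = 0; j < 8; ++j)
+          base[idx[j]] = a[j];
+  }
+-->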
+<intrinsic tech="AVX-512/KNC" name="_mm512_abs_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <description>Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ABS(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDD" form="zmm, zmm, m512" xed="VPANDD_ZMMu32_MASKmskw_ZMMu32_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_abs_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <description>Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ABS(v2[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDD" form="zmm {k}, zmm, m512" xed="VPANDD_ZMMu32_MASKmskw_ZMMu32_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_abs_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <description>Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ABS(v2[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDQ" form="zmm, zmm, m512" xed="VPANDQ_ZMMu64_MASKmskw_ZMMu64_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_abs_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <description>Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ABS(v2[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPANDQ" form="zmm {k}, zmm, m512" xed="VPANDQ_ZMMu64_MASKmskw_ZMMu64_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
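+<!--
+  Why the abs entries list VPANDD/VPANDQ: the absolute value of an
+  IEEE-754 float is obtained by clearing the sign bit, a bitwise AND.
+  Sketch with generic AVX-512F intrinsics:
+
+  #include <immintrin.h>
+  __m512 abs_ps_ref(__m512 v2) {
+      const __m512i m = _mm512_set1_epi32(0x7FFFFFFF); /* sign bit off */
+      return _mm512_castsi512_ps(
+          _mm512_and_si512(_mm512_castps_si512(v2), m));
+  }
+-->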
+<intrinsic tech="SVML/KNC" name="_mm512_log2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOG2PS" form="zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML/KNC" name="_mm512_mask_log2_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0)
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOG2PS" form="zmm {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
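+<!--
+  The VLOG2PS instruction listed above is KNC-only; under plain AVX512F
+  these SVML entries are typically library sequences. Scalar reference
+  matching LOG(a)/LOG(2.0):
+
+  #include <math.h>
+  float log2_ref(float x) {
+      return logf(x) / logf(2.0f);       /* equivalently log2f(x) */
+  }
+-->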
+<intrinsic tech="AVX-512/KNC" name="_mm512_i32extscatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 16 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. AVX512 only supports _MM_DOWNCONV_EPI32_NONE.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i]
+ _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i])
+ _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+31:i])
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDD" form="m512, zmm" xed="VPSCATTERDD_MEMu32_MASKmskw_ZMMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_i32extscatter_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512F/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 16 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Elements are written using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, elements are left unchanged in memory). "hint" indicates to the processor whether the data is non-temporal. AVX512 only supports _MM_DOWNCONV_EPI32_NONE.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i]
+ _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i])
+ _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+31:i])
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDD" form="m512 {k}, zmm" xed="VPSCATTERDD_MEMu32_MASKmskw_ZMMu32_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
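+<!--
+  Scalar model of the _MM_DOWNCONV_EPI32_SINT16 case used above: the
+  32-bit lane saturates to the int16 range before the 16-bit store:
+
+  #include <stdint.h>
+  int16_t saturate16(int32_t x) {
+      if (x > INT16_MAX) return INT16_MAX;
+      if (x < INT16_MIN) return INT16_MIN;
+      return (int16_t)x;
+  }
+-->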
+<intrinsic tech="AVX-512" name="_mm512_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="zmm, zmm, zmm" xed="VPMADD52LUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="zmm {k}, zmm, zmm" xed="VPMADD52LUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="zmm {z}, zmm, zmm" xed="VPMADD52LUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
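+<!--
+  Scalar model of one madd52lo lane: a 52x52 -> 104-bit product whose
+  low 52 bits are added to the 64-bit accumulator. The sketch assumes
+  the GCC/Clang unsigned __int128 extension:
+
+  #include <stdint.h>
+  uint64_t madd52lo_lane(uint64_t a, uint64_t b, uint64_t c) {
+      const uint64_t M52 = (1ULL << 52) - 1;
+      unsigned __int128 t = (unsigned __int128)(b & M52) * (c & M52);
+      return a + ((uint64_t)t & M52);
+  }
+-->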
+<intrinsic tech="AVX-512" name="_mm256_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="ymm, ymm, ymm" xed="VPMADD52LUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="ymm {k}, ymm, ymm" xed="VPMADD52LUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="ymm {z}, ymm, ymm" xed="VPMADD52LUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="xmm, xmm, xmm" xed="VPMADD52LUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="xmm {k}, xmm, xmm" xed="VPMADD52LUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_madd52lo_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADD52LUQ" form="xmm {z}, xmm, xmm" xed="VPMADD52LUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
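+<!-- Illustrative note (not part of the Intel data): a minimal C sketch of how the
+madd52lo family above is typically called, assuming a compiler with AVX512IFMA
+support (-mavx512ifma); the helper name mac52_lo is ours.
+
+#include <immintrin.h>
+
+/* Per 64-bit lane: acc[i] += low 52 bits of (b[i][51:0] * c[i][51:0]),
+   exactly the VPMADD52LUQ operation described above. */
+static inline __m512i mac52_lo(__m512i acc, __m512i b, __m512i c) {
+    return _mm512_madd52lo_epu64(acc, b, c);
+}
+-->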
+<intrinsic tech="AVX-512" name="_mm512_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="zmm, zmm, zmm" xed="VPMADD52HUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="zmm {k}, zmm, zmm" xed="VPMADD52HUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="zmm {z}, zmm, zmm" xed="VPMADD52HUQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="ymm, ymm, ymm" xed="VPMADD52HUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="ymm {k}, ymm, ymm" xed="VPMADD52HUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="ymm {z}, ymm, ymm" xed="VPMADD52HUQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="xmm, xmm, xmm" xed="VPMADD52HUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="xmm {k}, xmm, xmm" xed="VPMADD52HUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_madd52hi_epu64">
+ <CPUID>AVX512IFMA52</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+ dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMADD52HUQ" form="xmm {z}, xmm, xmm" xed="VPMADD52HUQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
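+<!-- Illustrative note (not part of the Intel data): madd52lo and madd52hi are
+designed to be paired in radix-2^52 big-integer arithmetic; a hedged C sketch
+under that assumption (limb_step is our own name).
+
+#include <immintrin.h>
+
+/* One partial-product step: the low 52 bits of b*c accumulate into this
+   limb's accumulator, the high 52 bits into the next limb's accumulator. */
+static inline void limb_step(__m512i b, __m512i c,
+                             __m512i *acc_lo, __m512i *acc_hi) {
+    *acc_lo = _mm512_madd52lo_epu64(*acc_lo, b, c);
+    *acc_hi = _mm512_madd52hi_epu64(*acc_hi, b, c);
+}
+-->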
+<intrinsic tech="AVX-512" name="_mm512_prefetch_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache. "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0QPS" form="vm64z" xed="VGATHERPF0QPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1QPS" form="vm64z" xed="VGATHERPF1QPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_prefetch_i64gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+	<description>Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are only brought into cache when their corresponding mask bit is set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0QPS" form="vm64z {k}" xed="VGATHERPF0QPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1QPS" form="vm64z {k}" xed="VGATHERPF1QPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_prefetch_i64scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+	<description>Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0QPS" form="vm64z" xed="VSCATTERPF0QPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1QPS" form="vm64z" xed="VSCATTERPF1QPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_prefetch_i64scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0QPS" form="vm64z {k}" xed="VSCATTERPF0QPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1QPS" form="vm64z {k}" xed="VSCATTERPF1QPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_prefetch_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache. "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+63:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0DPD" form="vm32y" xed="VGATHERPF0DPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1DPD" form="vm32y" xed="VGATHERPF1DPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_prefetch_i32gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+63:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0DPD" form="vm32y {k}" xed="VGATHERPF0DPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1DPD" form="vm32y {k}" xed="VGATHERPF1DPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_prefetch_i32scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+63:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0DPD" form="vm32y" xed="VSCATTERPF0DPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1DPD" form="vm32y" xed="VSCATTERPF1DPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_prefetch_i32scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="vindex" etype="SI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+63:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0DPD" form="vm32y {k}" xed="VSCATTERPF0DPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1DPD" form="vm32y {k}" xed="VSCATTERPF1DPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_prefetch_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by "hint" using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+63:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0QPD" form="vm32z" xed="VGATHERPF0QPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1QPD" form="vm32z" xed="VGATHERPF1QPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_prefetch_i64gather_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="base_addr" etype="FP64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by "hint" using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Prefetched elements are merged in cache using writemask "k" (elements are copied from memory when the corresponding mask bit is set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+63:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0QPD" form="vm32z {k}" xed="VGATHERPF0QPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1QPD" form="vm32z {k}" xed="VGATHERPF1QPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_prefetch_i64scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+63:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0QPD" form="vm32z" xed="VSCATTERPF0QPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1QPD" form="vm32z" xed="VSCATTERPF1QPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_prefetch_i64scatter_pd">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+63:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0QPD" form="vm32z {k}" xed="VSCATTERPF0QPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1QPD" form="vm32z {k}" xed="VSCATTERPF1QPD_MEMf64_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
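+<!-- Illustrative note (not part of the Intel data): the AVX512PF prefetch
+intrinsics are meant to run ahead of the matching gather; a C sketch assuming
+-mavx512pf and a table of doubles (gather_with_prefetch is our own name).
+
+#include <immintrin.h>
+
+/* Prefetch iteration t+1's elements toward L1 while gathering iteration t. */
+static void gather_with_prefetch(const double *tbl, const __m512i *idx,
+                                 __m512d *out, int n) {
+    for (int t = 0; t < n; t++) {
+        if (t + 1 < n)
+            _mm512_prefetch_i64gather_pd(idx[t + 1], tbl, 8, _MM_HINT_T0);
+        out[t] = _mm512_i64gather_pd(idx[t], tbl, 8);
+    }
+}
+-->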
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_prefetch_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetch single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0DPS" form="vm32y {k}" xed="VGATHERPF0DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1DPS" form="vm32y {k}" xed="VGATHERPF1DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_prefetch_i32extgather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const *" varname="base_addr" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+	<description>Prefetches a set of 16 single-precision (32-bit) memory locations pointed to by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
+The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0DPS" form="m512" xed="VGATHERPF0DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1DPS" form="m512" xed="VGATHERPF1DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_prefetch_i32extgather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="base_addr" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+	<description>Prefetches a set of 16 single-precision (32-bit) memory locations pointed to by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
+The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0DPS" form="m512 {k}" xed="VGATHERPF0DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1DPS" form="m512 {k}" xed="VGATHERPF1DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_prefetch_i32extscatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+	<description>Prefetches a set of 16 single-precision (32-bit) memory locations pointed to by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint", with a request for exclusive ownership. The "hint" parameter may be one of the following: _MM_HINT_T0 = 1 for prefetching to L1 cache, _MM_HINT_T1 = 2 for prefetching to L2 cache, _MM_HINT_T2 = 3 for prefetching to L2 cache non-temporal, _MM_HINT_NTA = 0 for prefetching to L1 cache non-temporal. The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent scatter intrinsic.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0DPS" form="m512" xed="VSCATTERPF0DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1DPS" form="m512" xed="VSCATTERPF1DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_prefetch_i32extscatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+	<description>Prefetches a set of 16 single-precision (32-bit) memory locations pointed to by base address "base_addr" and 32-bit integer index vector "vindex" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
+The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent scatter intrinsic. Only those elements whose corresponding mask bit in "k" is set are loaded into cache.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0DPS" form="m512 {k}" xed="VSCATTERPF0DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1DPS" form="m512 {k}" xed="VSCATTERPF1DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_prefetch_i32gather_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF/KNCNI</CPUID>
+ <category>Load</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="void const*" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VGATHERPF0DPS" form="m512" xed="VGATHERPF0DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VGATHERPF1DPS" form="m512" xed="VGATHERPF1DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_prefetch_i32scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0DPS" form="m512" xed="VSCATTERPF0DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1DPS" form="m512" xed="VSCATTERPF1DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" name="_mm512_mask_prefetch_i32scatter_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512PF/KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="IMM" hint="TRUE" immtype="_MM_HINT_PREFETCH"/>
+ <description>Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. Only those elements whose corresponding mask bit in "k" is set are loaded into the desired cache.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+ Prefetch(MEM[addr+31:addr], hint)
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VSCATTERPF0DPS" form="m512 {k}" xed="VSCATTERPF0DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <instruction name="VSCATTERPF1DPS" form="m512 {k}" xed="VSCATTERPF1DPS_MEMf32_MASKmskw_AVX512PF_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
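+<!-- Illustrative note (not part of the Intel data): a hedged C sketch of the
+masked 32-bit-index prefetch above; only lanes whose mask bit is set are
+brought toward cache (prefetch_selected is our own name).
+
+#include <immintrin.h>
+
+/* Bring only the selected floats toward L1 ahead of a masked gather. */
+static void prefetch_selected(__m512i vindex, __mmask16 k, const float *base) {
+    _mm512_mask_prefetch_i32gather_ps(vindex, k, base, 4, _MM_HINT_T0);
+}
+-->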
+<intrinsic tech="AVX-512" name="_mm256_maskz_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="ymm {z}, ymm" xed="VPOPCNTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="ymm {k}, ymm" xed="VPOPCNTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="ymm, ymm" xed="VPOPCNTQ_YMMu64_MASKmskw_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="xmm {z}, xmm" xed="VPOPCNTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="xmm {k}, xmm" xed="VPOPCNTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="xmm, xmm" xed="VPOPCNTQ_XMMu64_MASKmskw_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
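+<!-- Illustrative note (not part of the Intel data): a C sketch of VPOPCNTQ
+summing the set bits of a buffer, assuming AVX512VPOPCNTDQ support and n a
+multiple of 8 (popcount_buf is our own name).
+
+#include <immintrin.h>
+#include <stddef.h>
+#include <stdint.h>
+
+static uint64_t popcount_buf(const uint64_t *p, size_t n) {
+    __m512i acc = _mm512_setzero_si512();
+    for (size_t i = 0; i < n; i += 8)  /* 8 x 64-bit lanes per step */
+        acc = _mm512_add_epi64(acc,
+                  _mm512_popcnt_epi64(_mm512_loadu_si512(p + i)));
+    return _mm512_reduce_add_epi64(acc);  /* horizontal sum of the lanes */
+}
+-->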
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="ymm, ymm" xed="VPOPCNTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_mask_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="ymm {k}, ymm" xed="VPOPCNTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_maskz_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="ymm {z}, ymm" xed="VPOPCNTD_YMMu32_MASKmskw_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="xmm, xmm" xed="VPOPCNTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_mask_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="xmm {k}, xmm" xed="VPOPCNTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_maskz_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="xmm {z}, xmm" xed="VPOPCNTD_XMMu32_MASKmskw_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="zmm, zmm" xed="VPOPCNTD_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_mask_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="zmm {k}, zmm" xed="VPOPCNTD_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_maskz_popcnt_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := POPCNT(a[i+31:i])
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTD" form="zmm {z}, zmm" xed="VPOPCNTD_ZMMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="zmm, zmm" xed="VPOPCNTQ_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_mask_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="zmm {k}, zmm" xed="VPOPCNTQ_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_maskz_popcnt_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512VPOPCNTDQ</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := POPCNT(a[i+63:i])
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTQ" form="zmm {z}, zmm" xed="VPOPCNTQ_ZMMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
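+<!--
+  Illustrative C usage for the VPOPCNTD/VPOPCNTQ family above (a sketch, with a
+  hypothetical helper name; a compiler with AVX512VPOPCNTDQ support is assumed,
+  e.g. gcc/clang with -mavx512vpopcntdq -mavx512f):
+
+  #include <immintrin.h>
+
+  /* Count the set bits in each 32-bit lane of a 512-bit vector. */
+  __m512i popcnt_per_dword(__m512i v) {
+      return _mm512_popcnt_epi32(v);   /* emits VPOPCNTD zmm, zmm */
+  }
+-->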
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_4fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__m512" varname="a0" etype="FP32"/>
+ <parameter type="__m512" varname="a1" etype="FP32"/>
+ <parameter type="__m512" varname="a2" etype="FP32"/>
+ <parameter type="__m512" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst".</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ FOR m := 0 to 3
+ addr := b + m * 32
+ dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr])
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="V4FMADDPS" form="zmm, zmm, m128" xed="V4FMADDPS_ZMMf32_MASKmskw_ZMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_mask_4fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a0" etype="FP32"/>
+ <parameter type="__m512" varname="a1" etype="FP32"/>
+ <parameter type="__m512" varname="a2" etype="FP32"/>
+ <parameter type="__m512" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ FOR m := 0 to 3
+ addr := b + m * 32
+ IF k[i]
+ dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr])
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="V4FMADDPS" form="zmm {k}, zmm, m128" xed="V4FMADDPS_ZMMf32_MASKmskw_ZMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_maskz_4fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__m512" varname="a0" etype="FP32"/>
+ <parameter type="__m512" varname="a1" etype="FP32"/>
+ <parameter type="__m512" varname="a2" etype="FP32"/>
+ <parameter type="__m512" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate with the corresponding elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ FOR m := 0 to 3
+ addr := b + m * 32
+ IF k[i]
+ dst.fp32[i] := dst.fp32[i] + a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr])
+ ELSE
+ dst.fp32[i] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="V4FMADDPS" form="zmm {z}, zmm, m128" xed="V4FMADDPS_ZMMf32_MASKmskw_ZMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_4fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__m512" varname="a0" etype="FP32"/>
+ <parameter type="__m512" varname="a1" etype="FP32"/>
+ <parameter type="__m512" varname="a2" etype="FP32"/>
+ <parameter type="__m512" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst".</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ FOR m := 0 to 3
+ addr := b + m * 32
+ dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr])
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="V4FNMADDPS" form="zmm, zmm, m128" xed="V4FNMADDPS_ZMMf32_MASKmskw_ZMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_mask_4fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a0" etype="FP32"/>
+ <parameter type="__m512" varname="a1" etype="FP32"/>
+ <parameter type="__m512" varname="a2" etype="FP32"/>
+ <parameter type="__m512" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ FOR m := 0 to 3
+ addr := b + m * 32
+ IF k[i]
+ dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr])
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="V4FNMADDPS" form="zmm {k}, zmm, m128" xed="V4FNMADDPS_ZMMf32_MASKmskw_ZMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_maskz_4fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__m512" varname="a0" etype="FP32"/>
+ <parameter type="__m512" varname="a1" etype="FP32"/>
+ <parameter type="__m512" varname="a2" etype="FP32"/>
+ <parameter type="__m512" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the 4 corresponding packed elements in "b", accumulate the negated intermediate result with the corresponding elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ FOR m := 0 to 3
+ addr := b + m * 32
+ IF k[i]
+ dst.fp32[i] := dst.fp32[i] - a{m}.fp32[i] * Cast_FP32(MEM[addr+31:addr])
+ ELSE
+ dst.fp32[i] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="V4FNMADDPS" form="zmm {z}, zmm, m128" xed="V4FNMADDPS_ZMMf32_MASKmskw_ZMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
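+<!--
+  Illustrative C usage for the packed V4FMADDPS/V4FNMADDPS intrinsics above
+  (a sketch with a hypothetical helper name; AVX512_4FMAPS hardware, e.g.
+  Knights Mill, is assumed). Each lane accumulates four FMA steps whose second
+  factors are the four consecutive floats behind the 128-bit memory operand:
+
+  #include <immintrin.h>
+
+  /* acc[i] + a0[i]*c[0] + a1[i]*c[1] + a2[i]*c[2] + a3[i]*c[3], per lane */
+  __m512 four_step_fma(__m512 acc, __m512 a0, __m512 a1, __m512 a2,
+                       __m512 a3, __m128 *c) {
+      return _mm512_4fmadd_ps(acc, a0, a1, a2, a3, c);
+  }
+-->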
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_4fmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__m128" varname="a0" etype="FP32"/>
+ <parameter type="__m128" varname="a1" etype="FP32"/>
+ <parameter type="__m128" varname="a2" etype="FP32"/>
+ <parameter type="__m128" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the corresponding element in "b", accumulate with the lower element in "src", and store the result in the lower element of "dst".</description>
+ <operation>
+dst[127:0] := src[127:0]
+FOR m := 0 to 3
+ addr := b + m * 32
+ dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="V4FMADDSS" form="xmm, xmm, m128" xed="V4FMADDSS_XMMf32_MASKmskw_XMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_mask_4fmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a0" etype="FP32"/>
+ <parameter type="__m128" varname="a1" etype="FP32"/>
+ <parameter type="__m128" varname="a2" etype="FP32"/>
+ <parameter type="__m128" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the corresponding element in "b", accumulate with the lower element in "src", and store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set).</description>
+ <operation>
+dst[127:0] := src[127:0]
+IF k[0]
+ FOR m := 0 to 3
+ addr := b + m * 32
+ dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr])
+ ENDFOR
+FI
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="V4FMADDSS" form="xmm {k}, xmm, m128" xed="V4FMADDSS_XMMf32_MASKmskw_XMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_maskz_4fmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__m128" varname="a0" etype="FP32"/>
+ <parameter type="__m128" varname="a1" etype="FP32"/>
+ <parameter type="__m128" varname="a2" etype="FP32"/>
+ <parameter type="__m128" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the corresponding element in "b", accumulate with the lower element in "src", and store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set).</description>
+ <operation>
+dst[127:0] := src[127:0]
+IF k[0]
+ FOR m := 0 to 3
+ addr := b + m * 32
+ dst.fp32[0] := dst.fp32[0] + a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr])
+ ENDFOR
+ELSE
+ dst.fp32[0] := 0
+FI
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="V4FMADDSS" form="xmm {z}, xmm, m128" xed="V4FMADDSS_XMMf32_MASKmskw_XMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_4fnmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__m128" varname="a0" etype="FP32"/>
+ <parameter type="__m128" varname="a1" etype="FP32"/>
+ <parameter type="__m128" varname="a2" etype="FP32"/>
+ <parameter type="__m128" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst".</description>
+ <operation>
+dst[127:0] := src[127:0]
+FOR m := 0 to 3
+ addr := b + m * 32
+ dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="V4FNMADDSS" form="xmm, xmm, m128" xed="V4FNMADDSS_XMMf32_MASKmskw_XMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_mask_4fnmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a0" etype="FP32"/>
+ <parameter type="__m128" varname="a1" etype="FP32"/>
+ <parameter type="__m128" varname="a2" etype="FP32"/>
+ <parameter type="__m128" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set).</description>
+ <operation>
+dst[127:0] := src[127:0]
+IF k[0]
+ FOR m := 0 to 3
+ addr := b + m * 32
+ dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr])
+ ENDFOR
+FI
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="V4FNMADDSS" form="xmm {k}, xmm, m128" xed="V4FNMADDSS_XMMf32_MASKmskw_XMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_maskz_4fnmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>AVX512_4FMAPS</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__m128" varname="a0" etype="FP32"/>
+ <parameter type="__m128" varname="a1" etype="FP32"/>
+ <parameter type="__m128" varname="a2" etype="FP32"/>
+ <parameter type="__m128" varname="a3" etype="FP32"/>
+ <parameter type="__m128 *" varname="b" etype="FP32" memwidth="128"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "a0" through "a3" by the corresponding element in "b", accumulate the negated intermediate result with the lower element in "src", and store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set).</description>
+ <operation>
+dst[127:0] := src[127:0]
+IF k[0]
+ FOR m := 0 to 3
+ addr := b + m * 32
+ dst.fp32[0] := dst.fp32[0] - a{m}.fp32[0] * Cast_FP32(MEM[addr+31:addr])
+ ENDFOR
+ELSE
+ dst.fp32[0] := 0
+FI
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="V4FNMADDSS" form="xmm {z}, xmm, m128" xed="V4FNMADDSS_XMMf32_MASKmskw_XMMf32_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
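+<!--
+  Illustrative C usage for the scalar V4FMADDSS form (a sketch with a
+  hypothetical helper name; AVX512_4FMAPS assumed). Only element 0 is updated;
+  the upper lanes of the result pass through from "src":
+
+  #include <immintrin.h>
+
+  __m128 four_step_fma_scalar(__m128 acc, __m128 a0, __m128 a1, __m128 a2,
+                              __m128 a3, __m128 *c) {
+      return _mm_4fmadd_ss(acc, a0, a1, a2, a3, c);
+  }
+-->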
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_4dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_4VNNIW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a0" etype="SI16"/>
+ <parameter type="__m512i" varname="a1" etype="SI16"/>
+ <parameter type="__m512i" varname="a2" etype="SI16"/>
+ <parameter type="__m512i" varname="a3" etype="SI16"/>
+ <parameter type="__m128i *" varname="b" etype="SI16" memwidth="128"/>
+ <description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation, and store the results in "dst".</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ FOR m := 0 to 3
+ lim_base := b + m*32
+ t.dword := MEM[lim_base+31:lim_base]
+ p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0]))
+ p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1]))
+ dst.dword[i] := dst.dword[i] + p1.dword + p2.dword
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VP4DPWSSD" form="zmm, zmm, m128" xed="VP4DPWSSD_ZMMi32_MASKmskw_ZMMi16_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_mask_4dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_4VNNIW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a0" etype="SI16"/>
+ <parameter type="__m512i" varname="a1" etype="SI16"/>
+ <parameter type="__m512i" varname="a2" etype="SI16"/>
+ <parameter type="__m512i" varname="a3" etype="SI16"/>
+ <parameter type="__m128i *" varname="b" etype="SI16" memwidth="128"/>
+ <description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ IF k[i]
+ FOR m := 0 to 3
+ lim_base := b + m*32
+ t.dword := MEM[lim_base+31:lim_base]
+ p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0]))
+ p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1]))
+ dst.dword[i] := dst.dword[i] + p1.dword + p2.dword
+ ENDFOR
+ ELSE
+ dst.dword[i] := src.dword[i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VP4DPWSSD" form="zmm {k}, zmm, m128" xed="VP4DPWSSD_ZMMi32_MASKmskw_ZMMi16_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_maskz_4dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_4VNNIW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a0" etype="SI16"/>
+ <parameter type="__m512i" varname="a1" etype="SI16"/>
+ <parameter type="__m512i" varname="a2" etype="SI16"/>
+ <parameter type="__m512i" varname="a3" etype="SI16"/>
+ <parameter type="__m128i *" varname="b" etype="SI16" memwidth="128"/>
+ <description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ IF k[i]
+ FOR m := 0 to 3
+ lim_base := b + m*32
+ t.dword := MEM[lim_base+31:lim_base]
+ p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0]))
+ p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1]))
+ dst.dword[i] := dst.dword[i] + p1.dword + p2.dword
+ ENDFOR
+ ELSE
+ dst.dword[i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VP4DPWSSD" form="zmm {z}, zmm, m128" xed="VP4DPWSSD_ZMMi32_MASKmskw_ZMMi16_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_4dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_4VNNIW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a0" etype="SI16"/>
+ <parameter type="__m512i" varname="a1" etype="SI16"/>
+ <parameter type="__m512i" varname="a2" etype="SI16"/>
+ <parameter type="__m512i" varname="a3" etype="SI16"/>
+ <parameter type="__m128i *" varname="b" etype="SI16" memwidth="128"/>
+ <description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation and signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ FOR m := 0 to 3
+ lim_base := b + m*32
+ t.dword := MEM[lim_base+31:lim_base]
+ p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0]))
+ p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1]))
+ dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword)
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VP4DPWSSDS" form="zmm, zmm, m128" xed="VP4DPWSSDS_ZMMi32_MASKmskw_ZMMi16_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_mask_4dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_4VNNIW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a0" etype="SI16"/>
+ <parameter type="__m512i" varname="a1" etype="SI16"/>
+ <parameter type="__m512i" varname="a2" etype="SI16"/>
+ <parameter type="__m512i" varname="a3" etype="SI16"/>
+ <parameter type="__m128i *" varname="b" etype="SI16" memwidth="128"/>
+ <description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask and signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ IF k[i]
+ FOR m := 0 to 3
+ lim_base := b + m*32
+ t.dword := MEM[lim_base+31:lim_base]
+ p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0]))
+ p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1]))
+ dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword)
+ ENDFOR
+ ELSE
+ dst.dword[i] := src.dword[i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VP4DPWSSDS" form="zmm {k}, zmm, m128" xed="VP4DPWSSDS_ZMMi32_MASKmskw_ZMMi16_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_maskz_4dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_4VNNIW</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a0" etype="SI16"/>
+ <parameter type="__m512i" varname="a1" etype="SI16"/>
+ <parameter type="__m512i" varname="a2" etype="SI16"/>
+ <parameter type="__m512i" varname="a3" etype="SI16"/>
+ <parameter type="__m128i *" varname="b" etype="SI16" memwidth="128"/>
+ <description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask and signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+FOR i := 0 to 15
+ IF k[i]
+ FOR m := 0 to 3
+ lim_base := b + m*32
+ t.dword := MEM[lim_base+31:lim_base]
+ p1.dword := SignExtend32(a{m}.word[2*i+0]) * SignExtend32(Cast_Int16(t.word[0]))
+ p2.dword := SignExtend32(a{m}.word[2*i+1]) * SignExtend32(Cast_Int16(t.word[1]))
+ dst.dword[i] := Saturate32(dst.dword[i] + p1.dword + p2.dword)
+ ENDFOR
+ ELSE
+ dst.dword[i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VP4DPWSSDS" form="zmm {z}, zmm, m128" xed="VP4DPWSSDS_ZMMi32_MASKmskw_ZMMi16_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
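+<!--
+  Illustrative C usage for the VP4DPWSSD(S) family above (a sketch with a
+  hypothetical helper name; AVX512_4VNNIW hardware is assumed). Each 32-bit
+  lane accumulates four dot-products of signed 16-bit pairs, the second
+  operands coming from the 128-bit memory block "b":
+
+  #include <immintrin.h>
+
+  __m512i four_step_dotprod(__m512i acc, __m512i a0, __m512i a1, __m512i a2,
+                            __m512i a3, __m128i *b) {
+      return _mm512_4dpwssd_epi32(acc, a0, a1, a2, a3, b);
+  }
+-->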
+<intrinsic tech="AVX-512" name="_mm_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ IF j &lt; 4
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-4]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="xmm, xmm, xmm" xed="VCVTNE2PS2BF16_XMMbf16_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__m128bh" varname="src" etype="BF16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ IF j &lt; 4
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-4]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ ELSE
+ dst.word[j] := src.word[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="xmm {k}, xmm, xmm" xed="VCVTNE2PS2BF16_XMMbf16_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ IF j &lt; 4
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-4]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ ELSE
+ dst.word[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="xmm {z}, xmm, xmm" xed="VCVTNE2PS2BF16_XMMbf16_MASKmskw_XMMf32_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m256bh" varname="dst" etype="BF16"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ IF j &lt; 8
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-8]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="ymm, ymm, ymm" xed="VCVTNE2PS2BF16_YMMbf16_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m256bh" varname="dst" etype="BF16"/>
+ <parameter type="__m256bh" varname="src" etype="BF16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ IF j &lt; 8
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-8]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ ELSE
+ dst.word[j] := src.word[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="ymm {k}, ymm, ymm" xed="VCVTNE2PS2BF16_YMMbf16_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m256bh" varname="dst" etype="BF16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ IF j &lt; 8
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-8]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ ELSE
+ dst.word[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="ymm {z}, ymm, ymm" xed="VCVTNE2PS2BF16_YMMbf16_MASKmskw_YMMf32_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512bh" varname="dst" etype="BF16"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ IF j &lt; 16
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-16]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="zmm, zmm, zmm" xed="VCVTNE2PS2BF16_ZMMbf16_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512bh" varname="dst" etype="BF16"/>
+ <parameter type="__m512bh" varname="src" etype="BF16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ IF k[j]
+ IF j &lt; 16
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-16]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ ELSE
+ dst.word[j] := src.word[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="zmm {k}, zmm, zmm" xed="VCVTNE2PS2BF16_ZMMbf16_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtne2ps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m512bh" varname="dst" etype="BF16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in a single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ IF k[j]
+ IF j &lt; 16
+ t := b.fp32[j]
+ ELSE
+ t := a.fp32[j-16]
+ FI
+ dst.word[j] := Convert_FP32_To_BF16(t)
+ ELSE
+ dst.word[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTNE2PS2BF16" form="zmm {z}, zmm, zmm" xed="VCVTNE2PS2BF16_ZMMbf16_MASKmskw_ZMMf32_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
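+<!--
+  Illustrative C usage for the two-input VCVTNE2PS2BF16 forms above (a sketch
+  with a hypothetical helper name; AVX512_BF16 support is assumed, e.g.
+  -mavx512bf16). Note the operand order: the second argument fills the low
+  half of the result, per the pseudocode above:
+
+  #include <immintrin.h>
+
+  /* Pack 32 floats into 32 bf16 values; "lo" lands in words 0..15. */
+  __m512bh pack_bf16(__m512 hi, __m512 lo) {
+      return _mm512_cvtne2ps_pbh(hi, lo);
+  }
+-->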
+<intrinsic tech="AVX-512" name="_mm_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="xmm, xmm" xed="VCVTNEPS2BF16_XMMbf16_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__m128bh" varname="src" etype="BF16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ ELSE
+ dst.word[j] := src.word[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="xmm {k}, xmm" xed="VCVTNEPS2BF16_XMMbf16_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ ELSE
+ dst.word[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="xmm {z}, xmm" xed="VCVTNEPS2BF16_XMMbf16_MASKmskw_XMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="xmm, ymm" xed="VCVTNEPS2BF16_XMMbf16_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__m128bh" varname="src" etype="BF16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ ELSE
+ dst.word[j] := src.word[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="xmm {k}, ymm" xed="VCVTNEPS2BF16_XMMbf16_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Convert</category>
+ <return type="__m128bh" varname="dst" etype="BF16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ ELSE
+ dst.word[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="xmm {z}, ymm" xed="VCVTNEPS2BF16_XMMbf16_MASKmskw_YMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256bh" varname="dst" etype="BF16"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="ymm, zmm" xed="VCVTNEPS2BF16_YMMbf16_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256bh" varname="dst" etype="BF16"/>
+ <parameter type="__m256bh" varname="src" etype="BF16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ ELSE
+ dst.word[j] := src.word[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="ymm {k}, zmm" xed="VCVTNEPS2BF16_YMMbf16_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_cvtneps_pbh">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Convert</category>
+ <return type="__m256bh" varname="dst" etype="BF16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
+ ELSE
+ dst.word[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTNEPS2BF16" form="ymm {z}, zmm" xed="VCVTNEPS2BF16_YMMbf16_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
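+<!--
+  Illustrative C usage for the single-input VCVTNEPS2BF16 forms above (a
+  sketch with a hypothetical helper name; AVX512_BF16 assumed). The result is
+  half the width of the input:
+
+  #include <immintrin.h>
+
+  __m256bh narrow_to_bf16(__m512 v) {
+      return _mm512_cvtneps_pbh(v);   /* 16 fp32 lanes to 16 bf16 words */
+  }
+-->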
+<intrinsic tech="AVX-512" name="_mm_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__m128bh" varname="a" etype="BF16"/>
+ <parameter type="__m128bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst".</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 3
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="xmm, xmm, xmm" xed="VDPBF16PS_XMMf32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128bh" varname="a" etype="BF16"/>
+ <parameter type="__m128bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 3
+ IF k[j]
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="xmm {k}, xmm, xmm" xed="VDPBF16PS_XMMf32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128" varname="src" etype="FP32"/>
+ <parameter type="__m128bh" varname="a" etype="BF16"/>
+ <parameter type="__m128bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 3
+ IF k[j]
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="xmm {z}, xmm, xmm" xed="VDPBF16PS_XMMf32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__m256bh" varname="a" etype="BF16"/>
+ <parameter type="__m256bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst".</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 7
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="ymm, ymm, ymm" xed="VDPBF16PS_YMMf32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256bh" varname="a" etype="BF16"/>
+ <parameter type="__m256bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 7
+ IF k[j]
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="ymm {k}, ymm, ymm" xed="VDPBF16PS_YMMf32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256" varname="src" etype="FP32"/>
+ <parameter type="__m256bh" varname="a" etype="BF16"/>
+ <parameter type="__m256bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 7
+ IF k[j]
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="ymm {z}, ymm, ymm" xed="VDPBF16PS_YMMf32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__m512bh" varname="a" etype="BF16"/>
+ <parameter type="__m512bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst".</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 15
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="zmm, zmm, zmm" xed="VDPBF16PS_ZMMf32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512bh" varname="a" etype="BF16"/>
+ <parameter type="__m512bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 15
+ IF k[j]
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="zmm {k}, zmm, zmm" xed="VDPBF16PS_ZMMf32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_dpbf16_ps">
+ <type>Floating Point</type>
+ <CPUID>AVX512_BF16</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__m512bh" varname="a" etype="BF16"/>
+ <parameter type="__m512bh" varname="b" etype="BF16"/>
+ <description>Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE make_fp32(x[15:0]) {
+ y.fp32 := 0.0
+ y[31:16] := x[15:0]
+ RETURN y
+}
+dst := src
+FOR j := 0 to 15
+ IF k[j]
+ dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
+ dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VDPBF16PS" form="zmm {z}, zmm, zmm" xed="VDPBF16PS_ZMMf32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
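+<!-- A minimal C sketch of the unmasked 512-bit form above (illustrative; the
+     function name and inputs are assumptions, compiled with AVX512_BF16):
+
+#include <immintrin.h>
+
+/* Each FP32 lane j of the accumulator gains the pairwise product sum
+   a[2j]*b[2j] + a[2j+1]*b[2j+1], with both BF16 inputs promoted to FP32. */
+__m512 bf16_dot_step(__m512 acc, __m512bh a, __m512bh b) {
+    return _mm512_dpbf16_ps(acc, a, b);
+}
+-->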
+<intrinsic tech="AVX-512" name="_mm512_mask_bitshuffle_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__mmask64" varname="dst" etype="MASK"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+	<description>Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at the 8 bit positions controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 7 //Qword
+ FOR j := 0 to 7 // Byte
+ IF k[i*8+j]
+ m := c.qword[i].byte[j] &amp; 0x3F
+ dst[i*8+j] := b.qword[i].bit[m]
+ ELSE
+ dst[i*8+j] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPSHUFBITQMB" form="k {k}, zmm, zmm" xed="VPSHUFBITQMB_MASKmskw_MASKmskw_ZMMu64_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_bitshuffle_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__mmask64" varname="dst" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+	<description>Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at the 8 bit positions controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst".</description>
+ <operation>
+FOR i := 0 to 7 //Qword
+ FOR j := 0 to 7 // Byte
+ m := c.qword[i].byte[j] &amp; 0x3F
+ dst[i*8+j] := b.qword[i].bit[m]
+ ENDFOR
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VPSHUFBITQMB" form="k, zmm, zmm" xed="VPSHUFBITQMB_MASKmskw_MASKmskw_ZMMu64_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_bitshuffle_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__mmask32" varname="dst" etype="MASK"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+	<description>Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at the 8 bit positions controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 3 //Qword
+ FOR j := 0 to 7 // Byte
+ IF k[i*8+j]
+ m := c.qword[i].byte[j] &amp; 0x3F
+ dst[i*8+j] := b.qword[i].bit[m]
+ ELSE
+ dst[i*8+j] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPSHUFBITQMB" form="k {k}, ymm, ymm" xed="VPSHUFBITQMB_MASKmskw_MASKmskw_YMMu64_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_bitshuffle_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__mmask32" varname="dst" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+	<description>Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at the 8 bit positions controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst".</description>
+ <operation>
+FOR i := 0 to 3 //Qword
+ FOR j := 0 to 7 // Byte
+ m := c.qword[i].byte[j] &amp; 0x3F
+ dst[i*8+j] := b.qword[i].bit[m]
+ ENDFOR
+ENDFOR
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="VPSHUFBITQMB" form="k, ymm, ymm" xed="VPSHUFBITQMB_MASKmskw_MASKmskw_YMMu64_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_bitshuffle_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__mmask16" varname="dst" etype="MASK"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+	<description>Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at the 8 bit positions controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 1 //Qword
+ FOR j := 0 to 7 // Byte
+ IF k[i*8+j]
+ m := c.qword[i].byte[j] &amp; 0x3F
+ dst[i*8+j] := b.qword[i].bit[m]
+ ELSE
+ dst[i*8+j] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPSHUFBITQMB" form="k {k}, xmm, xmm" xed="VPSHUFBITQMB_MASKmskw_MASKmskw_XMMu64_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_bitshuffle_epi64_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__mmask16" varname="dst" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+	<description>Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at the 8 bit positions controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst".</description>
+ <operation>
+FOR i := 0 to 1 //Qword
+ FOR j := 0 to 7 // Byte
+ m := c.qword[i].byte[j] &amp; 0x3F
+ dst[i*8+j] := b.qword[i].bit[m]
+ ENDFOR
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="VPSHUFBITQMB" form="k, xmm, xmm" xed="VPSHUFBITQMB_MASKmskw_MASKmskw_XMMu64_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
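+<!-- One concrete use of VPSHUFBITQMB, sketched in C (the control constant and
+     function name are assumptions): selecting bit 7 of every byte yields a
+     movemask-style sign gather.
+
+#include <immintrin.h>
+
+/* Control byte j of each qword holds 8*j+7, so result bit i*8+j is the top
+   bit of byte j in qword i of v. */
+__mmask64 byte_sign_bits(__m512i v) {
+    const __m512i pos = _mm512_set1_epi64(0x3F372F271F170F07);
+    return _mm512_bitshuffle_epi64_mask(v, pos);
+}
+-->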
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="zmm, zmm" xed="VPOPCNTW_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_mask_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="zmm {k}, zmm" xed="VPOPCNTW_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_maskz_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="zmm {z}, zmm" xed="VPOPCNTW_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
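+<!-- The three 512-bit forms above differ only in merge policy; a minimal C
+     sketch (function names hypothetical) contrasting writemask and zeromask:
+
+#include <immintrin.h>
+
+/* Lane j of the merged result keeps src when k bit j is clear; the zeroed
+   variant writes 0 to that lane instead. */
+__m512i popcnt16_merged(__m512i src, __mmask32 k, __m512i a) {
+    return _mm512_mask_popcnt_epi16(src, k, a);
+}
+__m512i popcnt16_zeroed(__mmask32 k, __m512i a) {
+    return _mm512_maskz_popcnt_epi16(k, a);
+}
+-->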
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="ymm, ymm" xed="VPOPCNTW_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_mask_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="ymm {k}, ymm" xed="VPOPCNTW_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_maskz_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="ymm {z}, ymm" xed="VPOPCNTW_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="xmm, xmm" xed="VPOPCNTW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_mask_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="xmm {k}, xmm" xed="VPOPCNTW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_maskz_popcnt_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := POPCNT(a[i+15:i])
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTW" form="xmm {z}, xmm" xed="VPOPCNTW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 63
+ i := j*8
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="zmm, zmm" xed="VPOPCNTB_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_mask_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="zmm {k}, zmm" xed="VPOPCNTB_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm512_maskz_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="zmm {z}, zmm" xed="VPOPCNTB_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 31
+ i := j*8
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="ymm, ymm" xed="VPOPCNTB_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_mask_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="ymm {k}, ymm" xed="VPOPCNTB_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm256_maskz_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="ymm {z}, ymm" xed="VPOPCNTB_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="xmm, xmm" xed="VPOPCNTB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_mask_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="xmm {k}, xmm" xed="VPOPCNTB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" vexEq="TRUE" name="_mm_maskz_popcnt_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_BITALG</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE POPCNT(a) {
+ count := 0
+ DO WHILE a &gt; 0
+ count += a[0]
+ a &gt;&gt;= 1
+ OD
+ RETURN count
+}
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := POPCNT(a[i+7:i])
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPOPCNTB" form="xmm {z}, xmm" xed="VPOPCNTB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
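+<!-- VPOPCNTB composes with VPSADBW for a whole-register bit count; a C sketch
+     (assumes AVX512BW and AVX512F alongside AVX512_BITALG; names assumed):
+
+#include <stdint.h>
+#include <immintrin.h>
+
+/* Per-byte popcount, then sum each group of 8 bytes against zero with
+   VPSADBW and reduce the 8 qword partial sums. */
+uint64_t popcount_512bits(__m512i v) {
+    __m512i per_byte = _mm512_popcnt_epi8(v);
+    __m512i qword_sums = _mm512_sad_epu8(per_byte, _mm512_setzero_si512());
+    return (uint64_t)_mm512_reduce_add_epi64(qword_sums);
+}
+-->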
+<intrinsic tech="AVX-512" name="_mm512_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst".</description>
+ <operation>
+FOR i := 0 to 7
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="zmm, zmm, zmm" xed="VPMULTISHIFTQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 7
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ IF k[i*8+j]
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ELSE
+ dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="zmm {k}, zmm, zmm" xed="VPMULTISHIFTQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 7
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ IF k[i*8+j]
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ELSE
+ dst[q+j*8+7:q+j*8] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="zmm {z}, zmm, zmm" xed="VPMULTISHIFTQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst".</description>
+ <operation>
+FOR i := 0 to 3
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="ymm, ymm, ymm" xed="VPMULTISHIFTQB_YMMu8_MASKmskw_YMMu8_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 3
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ IF k[i*8+j]
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ELSE
+ dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="ymm {k}, ymm, ymm" xed="VPMULTISHIFTQB_YMMu8_MASKmskw_YMMu8_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 3
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ IF k[i*8+j]
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ELSE
+ dst[q+j*8+7:q+j*8] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="ymm {z}, ymm, ymm" xed="VPMULTISHIFTQB_YMMu8_MASKmskw_YMMu8_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst".</description>
+ <operation>
+FOR i := 0 to 1
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="xmm, xmm, xmm" xed="VPMULTISHIFTQB_XMMu8_MASKmskw_XMMu8_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 1
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ IF k[i*8+j]
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ELSE
+ dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="xmm {k}, xmm, xmm" xed="VPMULTISHIFTQB_XMMu8_MASKmskw_XMMu8_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_multishift_epi64_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR i := 0 to 1
+ q := i * 64
+ FOR j := 0 to 7
+ tmp8 := 0
+ ctrl := a[q+j*8+7:q+j*8] &amp; 63
+ FOR l := 0 to 7
+ tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+ ENDFOR
+ IF k[i*8+j]
+ dst[q+j*8+7:q+j*8] := tmp8[7:0]
+ ELSE
+ dst[q+j*8+7:q+j*8] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPMULTISHIFTQB" form="xmm {z}, xmm, xmm" xed="VPMULTISHIFTQB_XMMu8_MASKmskw_XMMu8_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
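+<!-- A C sketch of the unmasked 512-bit form (names are assumptions): byte j
+     of each qword of dst is the 8 bits of the data qword starting at the bit
+     offset held in control byte j, wrapping modulo 64, so a control byte of 4
+     extracts bits 11:4 of that qword.
+
+#include <immintrin.h>
+
+__m512i gather_unaligned_bytes(__m512i ctrl, __m512i data) {
+    return _mm512_multishift_epi64_epi8(ctrl, data);
+}
+-->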
+<intrinsic tech="AVX-512" name="_mm512_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="idx" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ id := idx[i+5:i]*8
+ dst[i+7:i] := a[id+7:id]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMB" form="zmm, zmm, zmm" xed="VPERMB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ id := idx[i+5:i]*8
+ IF k[j]
+ dst[i+7:i] := a[id+7:id]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMB" form="zmm {k}, zmm, zmm" xed="VPERMB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ id := idx[i+5:i]*8
+ IF k[j]
+ dst[i+7:i] := a[id+7:id]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMB" form="zmm {z}, zmm, zmm" xed="VPERMB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
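+<!-- VPERMB reads only the low 6 index bits at 512-bit width, so any 64-byte
+     permutation is expressible; reversing the byte order is one concrete
+     control pattern (C sketch, function name assumed):
+
+#include <immintrin.h>
+
+__m512i reverse_bytes(__m512i a) {
+    /* _mm512_set_epi8 lists byte 63 first, so idx[j] = 63 - j here. */
+    const __m512i idx = _mm512_set_epi8(
+         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+    return _mm512_permutexvar_epi8(idx, a);
+}
+-->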
+<intrinsic tech="AVX-512" name="_mm256_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="idx" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ id := idx[i+4:i]*8
+ dst[i+7:i] := a[id+7:id]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMB" form="ymm, ymm, ymm" xed="VPERMB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ id := idx[i+4:i]*8
+ IF k[j]
+ dst[i+7:i] := a[id+7:id]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMB" form="ymm {k}, ymm, ymm" xed="VPERMB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ id := idx[i+4:i]*8
+ IF k[j]
+ dst[i+7:i] := a[id+7:id]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMB" form="ymm {z}, ymm, ymm" xed="VPERMB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="idx" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ id := idx[i+3:i]*8
+ dst[i+7:i] := a[id+7:id]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMB" form="xmm, xmm, xmm" xed="VPERMB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ id := idx[i+3:i]*8
+ IF k[j]
+ dst[i+7:i] := a[id+7:id]
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMB" form="xmm {k}, xmm, xmm" xed="VPERMB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutexvar_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ id := idx[i+3:i]*8
+ IF k[j]
+ dst[i+7:i] := a[id+7:id]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMB" form="xmm {z}, xmm, xmm" xed="VPERMB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
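+<!-- At 128-bit width this differs from SSSE3 PSHUFB: only idx bits 3:0 are
+     used and no index value zeroes a lane, whereas _mm_shuffle_epi8 zeroes
+     lanes whose index byte has bit 7 set. C sketch (function name assumed):
+
+#include <immintrin.h>
+
+/* An all-zero index broadcasts byte 0 of a to every lane. */
+__m128i broadcast_byte0(__m128i a) {
+    return _mm_permutexvar_epi8(_mm_setzero_si128(), a);
+}
+-->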
+<intrinsic tech="AVX-512" name="_mm512_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="idx" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ off := 8*idx[i+5:i]
+ dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="zmm, zmm, zmm" xed="VPERMI2B_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="idx" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+5:i]
+ dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMT2B" form="zmm {k}, zmm, zmm" xed="VPERMT2B_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask2_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="idx" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+5:i]
+ dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := idx[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="zmm {k}, zmm, zmm" xed="VPERMI2B_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
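+<!-- The mask2 form matches VPERMI2B, which overwrites the index register:
+     lanes with a clear mask bit fall back to idx rather than a. C sketch
+     (function name assumed):
+
+#include <immintrin.h>
+
+__m512i permute2_keep_idx(__m512i a, __m512i idx, __mmask64 k, __m512i b) {
+    return _mm512_mask2_permutex2var_epi8(a, idx, k, b);
+}
+-->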
+<intrinsic tech="AVX-512" name="_mm512_maskz_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="idx" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+5:i]
+ dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="zmm {z}, zmm, zmm" xed="VPERMI2B_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <instruction name="VPERMT2B" form="zmm {z}, zmm, zmm" xed="VPERMT2B_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="idx" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ off := 8*idx[i+4:i]
+ dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="ymm, ymm, ymm" xed="VPERMI2B_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="idx" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+4:i]
+ dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMT2B" form="ymm {k}, ymm, ymm" xed="VPERMT2B_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask2_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="idx" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+4:i]
+ dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := idx[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="ymm {k}, ymm, ymm" xed="VPERMI2B_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="idx" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+4:i]
+ dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="ymm {z}, ymm, ymm" xed="VPERMI2B_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <instruction name="VPERMT2B" form="ymm {z}, ymm, ymm" xed="VPERMT2B_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="idx" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ off := 8*idx[i+3:i]
+ dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="xmm, xmm, xmm" xed="VPERMI2B_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="idx" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+3:i]
+ dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMT2B" form="xmm {k}, xmm, xmm" xed="VPERMT2B_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask2_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="idx" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+	<description>Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+3:i]
+ dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := idx[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="xmm {k}, xmm, xmm" xed="VPERMI2B_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_permutex2var_epi8">
+ <CPUID>AVX512_VBMI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="idx" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ off := 8*idx[i+3:i]
+ dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPERMI2B" form="xmm {z}, xmm, xmm" xed="VPERMI2B_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <instruction name="VPERMT2B" form="xmm {z}, xmm, xmm" xed="VPERMT2B_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
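+<!-- Bit 6 of each index byte selects the table (a or b) and bits 5:0 the byte
+     within it, so idx addresses a 128-byte space. Interleaving the low halves
+     of two registers is one concrete control choice (C sketch, names assumed):
+
+#include <immintrin.h>
+
+/* dst = a[0], b[0], a[1], b[1], ..., a[31], b[31]; index 64+k selects b[k]. */
+__m512i interleave_low_bytes(__m512i a, __m512i b) {
+    const __m512i idx = _mm512_set_epi8(
+        95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24,
+        87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16,
+        79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73,  9, 72,  8,
+        71,  7, 70,  6, 69,  5, 68,  4, 67,  3, 66,  2, 65,  1, 64,  0);
+    return _mm512_permutex2var_epi8(a, idx, b);
+}
+-->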
+<intrinsic tech="AVX-512" name="_mm512_maskz_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+	<description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64 bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="zmm {z}, zmm, zmm" xed="VPSHRDVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="zmm {k}, zmm, zmm" xed="VPSHRDVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="zmm, zmm, zmm" xed="VPSHRDVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="ymm {z}, ymm, ymm" xed="VPSHRDVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="ymm {k}, ymm, ymm" xed="VPSHRDVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="ymm, ymm, ymm" xed="VPSHRDVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="xmm {z}, xmm, xmm" xed="VPSHRDVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="xmm {k}, xmm, xmm" xed="VPSHRDVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shrdv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; (c[i+63:i] &amp; 63)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVQ" form="xmm, xmm, xmm" xed="VPSHRDVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
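+<!-- A minimal C sketch of the VPSHRDVQ lane operation defined above
+     (illustrative only): each lane is the low 64 bits of the 128-bit value
+     b:a shifted right by c modulo 64. Portable C has no standard 128-bit
+     integer type, so the shift is split across the two halves.
+
+     #include <stdint.h>
+
+     // One 64-bit lane of _mm512/_mm256/_mm_shrdv_epi64.
+     static inline uint64_t shrdv64(uint64_t a, uint64_t b, uint64_t c) {
+         unsigned s = (unsigned)(c & 63);
+         // Guard s == 0: shifting a 64-bit value by 64 is undefined in C.
+         return s ? (a >> s) | (b << (64 - s)) : a;
+     }
+-->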
+<intrinsic tech="AVX-512" name="_mm512_maskz_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="zmm {z}, zmm, zmm" xed="VPSHRDVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="zmm {k}, zmm, zmm" xed="VPSHRDVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="zmm, zmm, zmm" xed="VPSHRDVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="ymm {z}, ymm, ymm" xed="VPSHRDVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="ymm {k}, ymm, ymm" xed="VPSHRDVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="ymm, ymm, ymm" xed="VPSHRDVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="xmm {z}, xmm, xmm" xed="VPSHRDVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="xmm {k}, xmm, xmm" xed="VPSHRDVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shrdv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; (c[i+31:i] &amp; 31)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVD" form="xmm, xmm, xmm" xed="VPSHRDVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
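+<!-- The same lane model at 32-bit width (a sketch, not Intel data): here the
+     concatenation fits in a uint64_t, so the code can mirror the pseudocode
+     directly.
+
+     #include <stdint.h>
+
+     // One 32-bit lane of shrdv: low 32 bits of ((b:a) >> (c mod 32)).
+     static inline uint32_t shrdv32(uint32_t a, uint32_t b, uint32_t c) {
+         uint64_t concat = ((uint64_t)b << 32) | a;  // (b:a) as 64 bits
+         return (uint32_t)(concat >> (c & 31));
+     }
+-->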
+<intrinsic tech="AVX-512" name="_mm512_maskz_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="__m512i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="zmm {z}, zmm, zmm" xed="VPSHRDVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="__m512i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="zmm {k}, zmm, zmm" xed="VPSHRDVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="__m512i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="zmm, zmm, zmm" xed="VPSHRDVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="__m256i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="ymm {z}, ymm, ymm" xed="VPSHRDVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="__m256i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="ymm {k}, ymm, ymm" xed="VPSHRDVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="__m256i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="ymm, ymm, ymm" xed="VPSHRDVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="__m128i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="xmm {z}, xmm, xmm" xed="VPSHRDVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="__m128i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="xmm {k}, xmm, xmm" xed="VPSHRDVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shrdv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="__m128i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; (c[i+15:i] &amp; 15)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDVW" form="xmm, xmm, xmm" xed="VPSHRDVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
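+<!-- 16-bit variant of the same lane model (illustrative sketch), again
+     mirroring the pseudocode via a wider unsigned intermediate.
+
+     #include <stdint.h>
+
+     // One 16-bit lane of shrdv: low 16 bits of ((b:a) >> (c mod 16)).
+     static inline uint16_t shrdv16(uint16_t a, uint16_t b, uint16_t c) {
+         uint32_t concat = ((uint32_t)b << 16) | a;  // (b:a) as 32 bits
+         return (uint16_t)(concat >> (c & 15));
+     }
+-->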
+<intrinsic tech="AVX-512" name="_mm512_maskz_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="zmm {z}, zmm, zmm, imm8" xed="VPSHRDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="zmm {k}, zmm, zmm, imm8" xed="VPSHRDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="zmm, zmm, zmm, imm8" xed="VPSHRDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="ymm {z}, ymm, ymm, imm8" xed="VPSHRDQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="ymm {k}, ymm, ymm, imm8" xed="VPSHRDQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="ymm, ymm, ymm, imm8" xed="VPSHRDQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="xmm {z}, xmm, xmm, imm8" xed="VPSHRDQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="xmm {k}, xmm, xmm, imm8" xed="VPSHRDQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shrdi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ((b[i+63:i] &lt;&lt; 64)[127:0] | a[i+63:i]) &gt;&gt; imm8[5:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDQ" form="xmm, xmm, xmm, imm8" xed="VPSHRDQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
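+<!-- Usage sketch for the immediate form (assumes a toolchain with the
+     AVX512_VBMI2 and AVX512VL features enabled; illustrative only). The
+     immediate must be a compile-time constant.
+
+     #include <immintrin.h>
+
+     // Funnel shift: each dst lane is the low 64 bits of (b_lane:a_lane) >> 8.
+     __m128i funnel_shift_right8(__m128i a, __m128i b) {
+         return _mm_shrdi_epi64(a, b, 8);
+     }
+-->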
+<intrinsic tech="AVX-512" name="_mm512_maskz_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="zmm {z}, zmm, zmm, imm8" xed="VPSHRDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="zmm {k}, zmm, zmm, imm8" xed="VPSHRDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="zmm, zmm, zmm, imm8" xed="VPSHRDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="ymm {z}, ymm, ymm, imm8" xed="VPSHRDD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="ymm {k}, ymm, ymm, imm8" xed="VPSHRDD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="ymm, ymm, ymm, imm8" xed="VPSHRDD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="xmm {z}, xmm, xmm, imm8" xed="VPSHRDD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="xmm {k}, xmm, xmm, imm8" xed="VPSHRDD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shrdi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ((b[i+31:i] &lt;&lt; 32)[63:0] | a[i+31:i]) &gt;&gt; imm8[4:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDD" form="xmm, xmm, xmm, imm8" xed="VPSHRDD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
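+<!-- One noteworthy special case (a sketch): with both sources equal, the
+     immediate funnel shift reduces to a per-lane rotate right, since
+     ((x:x) >> 7) keeps the bits that wrap around.
+
+     #include <immintrin.h>
+
+     // Each 32-bit lane of x rotated right by 7 bits.
+     __m512i rotr7_epi32(__m512i x) {
+         return _mm512_shrdi_epi32(x, x, 7);
+     }
+-->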
+<intrinsic tech="AVX-512" name="_mm512_maskz_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="zmm {z}, zmm, zmm, imm8" xed="VPSHRDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="zmm {k}, zmm, zmm, imm8" xed="VPSHRDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="zmm, zmm, zmm, imm8" xed="VPSHRDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="ymm {z}, ymm, ymm, imm8" xed="VPSHRDW_YMMu16_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="ymm {k}, ymm, ymm, imm8" xed="VPSHRDW_YMMu16_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="ymm, ymm, ymm, imm8" xed="VPSHRDW_YMMu16_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="xmm {z}, xmm, xmm, imm8" xed="VPSHRDW_XMMu16_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="xmm {k}, xmm, xmm, imm8" xed="VPSHRDW_XMMu16_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shrdi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := ((b[i+15:i] &lt;&lt; 16)[31:0] | a[i+15:i]) &gt;&gt; imm8[3:0]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHRDW" form="xmm, xmm, xmm, imm8" xed="VPSHRDW_XMMu16_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
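+<!-- Scalar model of the 16-bit immediate form (illustrative): note that only
+     imm8[3:0] participates, so the shift count is taken modulo 16.
+
+     #include <stdint.h>
+
+     static inline uint16_t shrdi16(uint16_t a, uint16_t b, int imm8) {
+         uint32_t concat = ((uint32_t)b << 16) | a;  // (b:a) as 32 bits
+         return (uint16_t)(concat >> (imm8 & 15));
+     }
+-->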
+<intrinsic tech="AVX-512" name="_mm512_maskz_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="zmm {z}, zmm, zmm" xed="VPSHLDVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="zmm {k}, zmm, zmm" xed="VPSHLDVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__m512i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="zmm, zmm, zmm" xed="VPSHLDVQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="ymm {z}, ymm, ymm" xed="VPSHLDVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="ymm {k}, ymm, ymm" xed="VPSHLDVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__m256i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="ymm, ymm, ymm" xed="VPSHLDVQ_YMMu64_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="xmm {z}, xmm, xmm" xed="VPSHLDVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="xmm {k}, xmm, xmm" xed="VPSHLDVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shldv_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__m128i" varname="c" etype="UI64"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; (c[i+63:i] &amp; 63)
+ dst[i+63:i] := tmp[127:64]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVQ" form="xmm, xmm, xmm" xed="VPSHLDVQ_XMMu64_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
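+<!-- Illustrative usage, not part of the upstream data: VPSHLDVQ is a variable
+     funnel shift left, so with both data operands equal it becomes a per-lane
+     variable rotate left.  A minimal sketch, assuming avx512vbmi2:
+
+         #include <immintrin.h>
+
+         /* Each 64-bit lane of x is rotated left by the corresponding lane
+            of counts (only the low 6 bits of each count are used). */
+         __m512i rotl64_var(__m512i x, __m512i counts) {
+             return _mm512_shldv_epi64(x, x, counts);
+         }
+-->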
+<intrinsic tech="AVX-512" name="_mm512_maskz_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="zmm {z}, zmm, zmm" xed="VPSHLDVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="zmm {k}, zmm, zmm" xed="VPSHLDVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="zmm, zmm, zmm" xed="VPSHLDVD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="ymm {z}, ymm, ymm" xed="VPSHLDVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="ymm {k}, ymm, ymm" xed="VPSHLDVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__m256i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="ymm, ymm, ymm" xed="VPSHLDVD_YMMu32_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="xmm {z}, xmm, xmm" xed="VPSHLDVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="xmm {k}, xmm, xmm" xed="VPSHLDVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shldv_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="c" etype="UI32"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; (c[i+31:i] &amp; 31)
+ dst[i+31:i] := tmp[63:32]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVD" form="xmm, xmm, xmm" xed="VPSHLDVD_XMMu32_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
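+<!-- Illustrative usage, not part of the upstream data: in the mask_shldv
+     forms the first data operand "a" doubles as the pass-through source, so
+     inactive lanes keep their accumulator value.  A minimal sketch, assuming
+     avx512vbmi2+avx512vl:
+
+         #include <immintrin.h>
+
+         /* 32-bit lanes selected by k get the upper half of ((a:b) << c);
+            lanes with a clear mask bit are copied through from a. */
+         __m128i shldv32_masked(__m128i a, __mmask8 k, __m128i b, __m128i c) {
+             return _mm_mask_shldv_epi32(a, k, b, c);
+         }
+-->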
+<intrinsic tech="AVX-512" name="_mm512_maskz_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="__m512i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="zmm {z}, zmm, zmm" xed="VPSHLDVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="__m512i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="zmm {k}, zmm, zmm" xed="VPSHLDVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="__m512i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="zmm, zmm, zmm" xed="VPSHLDVW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="__m256i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="ymm {z}, ymm, ymm" xed="VPSHLDVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="__m256i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="ymm {k}, ymm, ymm" xed="VPSHLDVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="__m256i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="ymm, ymm, ymm" xed="VPSHLDVW_YMMu16_MASKmskw_YMMu16_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="__m128i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="xmm {z}, xmm, xmm" xed="VPSHLDVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="__m128i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="xmm {k}, xmm, xmm" xed="VPSHLDVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shldv_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="__m128i" varname="c" etype="UI16"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; (c[i+15:i] &amp; 15)
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDVW" form="xmm, xmm, xmm" xed="VPSHLDVW_XMMu16_MASKmskw_XMMu16_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
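+<!-- Illustrative usage, not part of the upstream data: the maskz_shldv forms
+     zero the inactive lanes instead of preserving them.  A minimal sketch,
+     assuming avx512vbmi2:
+
+         #include <immintrin.h>
+
+         /* 16-bit lanes selected by k get the upper half of ((a:b) << c);
+            the remaining lanes of the result are zeroed. */
+         __m512i shldv16_zmask(__mmask32 k, __m512i a, __m512i b, __m512i c) {
+             return _mm512_maskz_shldv_epi16(k, a, b, c);
+         }
+-->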
+<intrinsic tech="AVX-512" name="_mm512_maskz_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="zmm {z}, zmm, zmm, imm8" xed="VPSHLDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="zmm {k}, zmm, zmm, imm8" xed="VPSHLDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="zmm, zmm, zmm, imm8" xed="VPSHLDQ_ZMMu64_MASKmskw_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="ymm {z}, ymm, ymm, imm8" xed="VPSHLDQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="ymm {k}, ymm, ymm, imm8" xed="VPSHLDQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="ymm, ymm, ymm, imm8" xed="VPSHLDQ_YMMu64_MASKmskw_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="xmm {z}, xmm, xmm, imm8" xed="VPSHLDQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF k[j]
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="xmm {k}, xmm, xmm, imm8" xed="VPSHLDQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shldi_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ tmp[127:0] := ((a[i+63:i] &lt;&lt; 64)[127:0] | b[i+63:i]) &lt;&lt; imm8[5:0]
+ dst[i+63:i] := tmp[127:64]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDQ" form="xmm, xmm, xmm, imm8" xed="VPSHLDQ_XMMu64_MASKmskw_XMMu64_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
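+<!-- Illustrative usage, not part of the upstream data: the immediate VPSHLDQ
+     forms give a constant 64-bit rotate left when both data operands are the
+     same register, one instruction per vector.  A minimal sketch, assuming
+     avx512vbmi2:
+
+         #include <immintrin.h>
+
+         /* Each 64-bit lane: (x << 13) | (x >> 51), i.e. rotate left by 13.
+            Only imm8[5:0] is used, so the count must stay below 64. */
+         __m512i rotl64_13(__m512i x) {
+             return _mm512_shldi_epi64(x, x, 13);
+         }
+-->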
+<intrinsic tech="AVX-512" name="_mm512_maskz_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="zmm {z}, zmm, zmm, imm8" xed="VPSHLDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="zmm {k}, zmm, zmm, imm8" xed="VPSHLDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="zmm, zmm, zmm, imm8" xed="VPSHLDD_ZMMu32_MASKmskw_ZMMu32_ZMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="ymm {z}, ymm, ymm, imm8" xed="VPSHLDD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="ymm {k}, ymm, ymm, imm8" xed="VPSHLDD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI32"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="ymm, ymm, ymm, imm8" xed="VPSHLDD_YMMu32_MASKmskw_YMMu32_YMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="xmm {z}, xmm, xmm, imm8" xed="VPSHLDD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF k[j]
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="xmm {k}, xmm, xmm, imm8" xed="VPSHLDD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shldi_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ tmp[63:0] := ((a[i+31:i] &lt;&lt; 32)[63:0] | b[i+31:i]) &lt;&lt; imm8[4:0]
+ dst[i+31:i] := tmp[63:32]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDD" form="xmm, xmm, xmm, imm8" xed="VPSHLDD_XMMu32_MASKmskw_XMMu32_XMMu32_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
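+<!-- Illustrative usage, not part of the upstream data: the mask_shldi forms
+     take an explicit pass-through operand "src", unlike the shldv variants
+     where the accumulator doubles as the source.  A minimal sketch, assuming
+     avx512vbmi2+avx512vl:
+
+         #include <immintrin.h>
+
+         /* 32-bit lanes selected by k get the upper half of ((a:b) << 7);
+            lanes with a clear mask bit are copied from src. */
+         __m256i shldi32_blend(__m256i src, __mmask8 k, __m256i a, __m256i b) {
+             return _mm256_mask_shldi_epi32(src, k, a, b, 7);
+         }
+-->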
+<intrinsic tech="AVX-512" name="_mm512_maskz_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="zmm {z}, zmm, zmm, imm8" xed="VPSHLDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="zmm {k}, zmm, zmm, imm8" xed="VPSHLDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Shift</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <parameter type="__m512i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 31
+ i := j*16
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="zmm, zmm, zmm, imm8" xed="VPSHLDW_ZMMu16_MASKmskw_ZMMu16_ZMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="ymm {z}, ymm, ymm, imm8" xed="VPSHLDW_YMMu16_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="ymm {k}, ymm, ymm, imm8" xed="VPSHLDW_YMMu16_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <parameter type="__m256i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*16
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="ymm, ymm, ymm, imm8" xed="VPSHLDW_YMMu16_MASKmskw_YMMu16_YMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="xmm {z}, xmm, xmm, imm8" xed="VPSHLDW_XMMu16_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="xmm {k}, xmm, xmm, imm8" xed="VPSHLDW_XMMu16_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_shldi_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ tmp[31:0] := ((a[i+15:i] &lt;&lt; 16)[31:0] | b[i+15:i]) &lt;&lt; imm8[3:0]
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPSHLDW" form="xmm, xmm, xmm, imm8" xed="VPSHLDW_XMMu16_MASKmskw_XMMu16_XMMu16_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
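+<!-- Usage sketch (illustrative, not part of the generated data): a minimal C example
+     of the VPSHLDW family above, assuming a CPU with AVX512_VBMI2 and AVX512VL and a
+     compiler invoked with matching target flags (e.g. -mavx512vbmi2 -mavx512vl).
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128i a = _mm_set1_epi16(0x00F0);
+         __m128i b = _mm_set1_epi16(0x000F);
+         /* Per lane: ((a << 16) | b) = 0x00F0000F, shifted left by 4 gives
+            0x0F0000F0; the upper 16 bits, 0x0F00, land in dst. */
+         __m128i r = _mm_shldi_epi16(a, b, 4);
+         uint16_t out[8];
+         _mm_storeu_si128((__m128i *)out, r);
+         printf("0x%04X\n", (unsigned)out[0]); /* prints 0x0F00 */
+         return 0;
+     }
+-->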
+<intrinsic tech="AVX-512" name="_mm512_maskz_expandloadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI16" memwidth="512"/>
+ <description>Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="zmm {z}, m512" xed="VPEXPANDW_ZMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expandloadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI16" memwidth="512"/>
+ <description>Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="zmm {k}, m512" xed="VPEXPANDW_ZMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
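+<!-- Usage sketch (illustrative, not part of the generated data): the expand-load
+     above in C; the buffer name and mask value are illustrative only.
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     __m512i demo(void) {
+         uint16_t buf[3] = { 11, 22, 33 };
+         /* Bits 0, 2 and 5 are set, so the three contiguous words are routed
+            to lanes 0, 2 and 5; every other lane is zeroed. */
+         __mmask32 k = (1u << 0) | (1u << 2) | (1u << 5);
+         return _mm512_maskz_expandloadu_epi16(k, buf);
+     }
+-->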
+<intrinsic tech="AVX-512" name="_mm512_maskz_expand_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[m+15:m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="zmm {z}, zmm" xed="VPEXPANDW_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expand_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[m+15:m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="zmm {k}, zmm" xed="VPEXPANDW_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
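+<!-- Usage sketch (illustrative, not part of the generated data): the register form
+     of VPEXPANDW under the same assumptions; names are illustrative.
+
+     #include <immintrin.h>
+
+     __m512i demo(__m512i src, __m512i packed) {
+         /* Bits 1 and 5 are set: dst lane 1 = packed lane 0, dst lane 5 =
+            packed lane 1, and all remaining lanes are copied from src. */
+         __mmask32 k = (1u << 1) | (1u << 5);
+         return _mm512_mask_expand_epi16(src, k, packed);
+     }
+-->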
+<intrinsic tech="AVX-512" name="_mm256_maskz_expandloadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI16" memwidth="256"/>
+ <description>Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="ymm {z}, m256" xed="VPEXPANDW_YMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expandloadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI16" memwidth="256"/>
+ <description>Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="ymm {k}, m256" xed="VPEXPANDW_YMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expand_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[m+15:m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="ymm {z}, ymm" xed="VPEXPANDW_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expand_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[m+15:m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="ymm {k}, ymm" xed="VPEXPANDW_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expandloadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI16" memwidth="128"/>
+ <description>Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="xmm {z}, m128" xed="VPEXPANDW_XMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expandloadu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI16" memwidth="128"/>
+ <description>Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="xmm {k}, m128" xed="VPEXPANDW_XMMu16_MASKmskw_MEMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expand_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[m+15:m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="xmm {z}, xmm" xed="VPEXPANDW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expand_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[i+15:i] := a[m+15:m]
+ m := m + 16
+ ELSE
+ dst[i+15:i] := src[i+15:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDW" form="xmm {k}, xmm" xed="VPEXPANDW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_maskz_expandloadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI8" memwidth="512"/>
+ <description>Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="zmm {z}, m512" xed="VPEXPANDB_ZMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expandloadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI8" memwidth="512"/>
+ <description>Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="zmm {k}, m512" xed="VPEXPANDB_ZMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expandloadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI8" memwidth="256"/>
+ <description>Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="ymm {z}, m256" xed="VPEXPANDB_YMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expandloadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI8" memwidth="256"/>
+ <description>Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="ymm {k}, m256" xed="VPEXPANDB_YMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expandloadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI8" memwidth="128"/>
+ <description>Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="xmm {z}, m128" xed="VPEXPANDB_XMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expandloadu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Load</category>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="const void*" varname="mem_addr" etype="UI8" memwidth="128"/>
+ <description>Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="xmm {k}, m128" xed="VPEXPANDB_XMMu8_MASKmskw_MEMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
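+<!-- Usage sketch (illustrative, not part of the generated data): byte-granular
+     expand-load, merge form (AVX512_VBMI2 plus AVX512VL assumed).
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     __m128i demo(__m128i src) {
+         uint8_t buf[2] = { 0xAA, 0xBB };
+         /* Lane 3 receives 0xAA and lane 7 receives 0xBB; the other fourteen
+            lanes pass through from src. */
+         __mmask16 k = (__mmask16)((1u << 3) | (1u << 7));
+         return _mm_mask_expandloadu_epi8(src, k, buf);
+     }
+-->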
+<intrinsic tech="AVX-512" name="_mm512_maskz_expand_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[m+7:m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="zmm {z}, zmm" xed="VPEXPANDB_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_expand_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[m+7:m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="zmm {k}, zmm" xed="VPEXPANDB_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_expand_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[m+7:m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="ymm {z}, ymm" xed="VPEXPANDB_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_expand_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[m+7:m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="ymm {k}, ymm" xed="VPEXPANDB_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_expand_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[m+7:m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="xmm {z}, xmm" xed="VPEXPANDB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_expand_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+m := 0
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[i+7:i] := a[m+7:m]
+ m := m + 8
+ ELSE
+ dst[i+7:i] := src[i+7:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPEXPANDB" form="xmm {k}, xmm" xed="VPEXPANDB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
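+<!-- Usage sketch (illustrative, not part of the generated data): register-form
+     byte expand, zeroing variant; a sketch, not a definitive implementation.
+
+     #include <immintrin.h>
+
+     __m128i demo(__m128i packed) {
+         /* dst lane 0 = packed lane 0, dst lane 9 = packed lane 1,
+            everything else is zeroed. */
+         __mmask16 k = (__mmask16)((1u << 0) | (1u << 9));
+         return _mm_maskz_expand_epi8(k, packed);
+     }
+-->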
+<intrinsic tech="AVX-512" name="_mm512_mask_compressstoreu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="512"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 16
+m := base_addr
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ MEM[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSW" form="m512 {k}, zmm" xed="VPCOMPRESSW_MEMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compressstoreu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="256"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 16
+m := base_addr
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ MEM[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSW" form="m256 {k}, ymm" xed="VPCOMPRESSW_MEMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compressstoreu_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI16" memwidth="128"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 16
+m := base_addr
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ MEM[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSW" form="m128 {k}, xmm" xed="VPCOMPRESSW_MEMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
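+<!-- Usage sketch (illustrative, not part of the generated data): the compress-store
+     above in C; note only popcount(k) words are actually written to the buffer.
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     void demo(uint16_t out[8]) {
+         __m128i a = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
+         /* 0xA2 selects lanes 1, 5 and 7, so out receives 11, 15, 17
+            back to back; out[3..7] are left untouched. */
+         _mm_mask_compressstoreu_epi16(out, (__mmask8)0xA2, a);
+     }
+-->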
+<intrinsic tech="AVX-512" name="_mm512_maskz_compress_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 16
+m := 0
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCOMPRESSW" form="zmm {z}, zmm" xed="VPCOMPRESSW_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_compress_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI16"/>
+ <parameter type="__m512i" varname="src" etype="UI16"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 16
+m := 0
+FOR j := 0 to 31
+ i := j*16
+ IF k[j]
+ dst[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCOMPRESSW" form="zmm {k}, zmm" xed="VPCOMPRESSW_ZMMu16_MASKmskw_ZMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_compress_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 16
+m := 0
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCOMPRESSW" form="ymm {z}, ymm" xed="VPCOMPRESSW_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compress_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI16"/>
+ <parameter type="__m256i" varname="src" etype="UI16"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 16
+m := 0
+FOR j := 0 to 15
+ i := j*16
+ IF k[j]
+ dst[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCOMPRESSW" form="ymm {k}, ymm" xed="VPCOMPRESSW_YMMu16_MASKmskw_YMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_compress_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 16
+m := 0
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCOMPRESSW" form="xmm {z}, xmm" xed="VPCOMPRESSW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compress_epi16">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="src" etype="UI16"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 16
+m := 0
+FOR j := 0 to 7
+ i := j*16
+ IF k[j]
+ dst[m+size-1:m] := a[i+15:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCOMPRESSW" form="xmm {k}, xmm" xed="VPCOMPRESSW_XMMu16_MASKmskw_XMMu16_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
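+<!-- Usage sketch (illustrative, not part of the generated data): register-form
+     word compress, zeroing variant, with an illustrative mask.
+
+     #include <immintrin.h>
+
+     __m128i demo(__m128i a) {
+         /* 0x44 selects lanes 2 and 6; they are packed into lanes 0 and 1
+            of the result and lanes 2..7 are zeroed. */
+         return _mm_maskz_compress_epi16((__mmask8)0x44, a);
+     }
+-->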
+<intrinsic tech="AVX-512" name="_mm512_mask_compressstoreu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="512"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 8
+m := base_addr
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ MEM[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSB" form="m512 {k}, zmm" xed="VPCOMPRESSB_MEMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compressstoreu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="256"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 8
+m := base_addr
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ MEM[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSB" form="m256 {k}, ymm" xed="VPCOMPRESSB_MEMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compressstoreu_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Store</category>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI8" memwidth="128"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+ <operation>
+size := 8
+m := base_addr
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ MEM[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPCOMPRESSB" form="m128 {k}, xmm" xed="VPCOMPRESSB_MEMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
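+<!-- Usage sketch (illustrative, not part of the generated data): a common
+     byte-filtering idiom built on the compress-store above. The compare intrinsic
+     additionally assumes AVX512BW with AVX512VL, and __builtin_popcount is a
+     GCC/Clang builtin; all names here are illustrative.
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     /* Writes the nonzero bytes of a, contiguously, into out (>= 16 bytes)
+        and returns how many bytes were written. */
+     int keep_nonzero(__m128i a, uint8_t *out) {
+         __mmask16 k = _mm_cmpneq_epi8_mask(a, _mm_setzero_si128());
+         _mm_mask_compressstoreu_epi8(out, k, a);
+         return __builtin_popcount((unsigned)k);
+     }
+-->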
+<intrinsic tech="AVX-512" name="_mm512_maskz_compress_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 8
+m := 0
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCOMPRESSB" form="zmm {z}, zmm" xed="VPCOMPRESSB_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_compress_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 8
+m := 0
+FOR j := 0 to 63
+ i := j*8
+ IF k[j]
+ dst[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCOMPRESSB" form="zmm {k}, zmm" xed="VPCOMPRESSB_ZMMu8_MASKmskw_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_compress_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 8
+m := 0
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCOMPRESSB" form="ymm {z}, ymm" xed="VPCOMPRESSB_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_compress_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 8
+m := 0
+FOR j := 0 to 31
+ i := j*8
+ IF k[j]
+ dst[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCOMPRESSB" form="ymm {k}, ymm" xed="VPCOMPRESSB_YMMu8_MASKmskw_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_compress_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+ <operation>
+size := 8
+m := 0
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCOMPRESSB" form="xmm {z}, xmm" xed="VPCOMPRESSB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_compress_epi8">
+ <type>Integer</type>
+ <CPUID>AVX512_VBMI2</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+ <operation>
+size := 8
+m := 0
+FOR j := 0 to 15
+ i := j*8
+ IF k[j]
+ dst[m+size-1:m] := a[i+7:i]
+ m := m + size
+ FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPCOMPRESSB" form="xmm {k}, xmm" xed="VPCOMPRESSB_XMMu8_MASKmskw_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
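+<!-- Usage sketch (illustrative, not part of the generated data): merge-form byte
+     compress with an illustrative mask.
+
+     #include <immintrin.h>
+
+     __m128i demo(__m128i src, __m128i a) {
+         /* Lanes 4, 5, 8 and 12 of a are packed into lanes 0..3 of the
+            result; lanes 4..15 pass through from src. */
+         __mmask16 k = (__mmask16)((1u << 4) | (1u << 5) | (1u << 8) | (1u << 12));
+         return _mm_mask_compress_epi8(src, k, a);
+     }
+-->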
+<intrinsic tech="AVX-512" name="_mm512_maskz_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="zmm {z}, zmm, zmm" xed="VPDPWSSDS_ZMMi32_MASKmskw_ZMMi16_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="zmm {k}, zmm, zmm" xed="VPDPWSSDS_ZMMi32_MASKmskw_ZMMi16_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="zmm, zmm, zmm" xed="VPDPWSSDS_ZMMi32_MASKmskw_ZMMi16_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="ymm {z}, ymm, ymm" xed="VPDPWSSDS_YMMi32_MASKmskw_YMMi16_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="ymm {k}, ymm, ymm" xed="VPDPWSSDS_YMMi32_MASKmskw_YMMi16_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="ymm, ymm, ymm" xed="VPDPWSSDS_YMMi32_MASKmskw_YMMi16_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="xmm {z}, xmm, xmm" xed="VPDPWSSDS_XMMi32_MASKmskw_XMMi16_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="xmm {k}, xmm, xmm" xed="VPDPWSSDS_XMMi32_MASKmskw_XMMi16_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_dpwssds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPWSSDS" form="xmm, xmm, xmm" xed="VPDPWSSDS_XMMi32_MASKmskw_XMMi16_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
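+<!-- Usage sketch (illustrative, not part of the generated data): the saturating
+     behaviour of VPDPWSSDS in a runnable C example (AVX512_VNNI plus AVX512VL
+     assumed).
+
+     #include <immintrin.h>
+     #include <stdint.h>
+     #include <stdio.h>
+
+     int main(void) {
+         __m128i acc = _mm_set1_epi32(INT32_MAX - 5);
+         __m128i a = _mm_set1_epi16(0x7FFF);
+         __m128i b = _mm_set1_epi16(0x7FFF);
+         /* Per lane: acc + 0x7FFF*0x7FFF + 0x7FFF*0x7FFF overflows 32 bits,
+            so the signed-saturating form clamps to INT32_MAX. */
+         __m128i r = _mm_dpwssds_epi32(acc, a, b);
+         int32_t out[4];
+         _mm_storeu_si128((__m128i *)out, r);
+         printf("%d\n", out[0]); /* prints 2147483647 */
+         return 0;
+     }
+-->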
+<intrinsic tech="AVX-512" name="_mm512_maskz_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="zmm {z}, zmm, zmm" xed="VPDPWSSD_ZMMi32_MASKmskw_ZMMi16_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="zmm {k}, zmm, zmm" xed="VPDPWSSD_ZMMi32_MASKmskw_ZMMi16_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="SI16"/>
+ <parameter type="__m512i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="zmm, zmm, zmm" xed="VPDPWSSD_ZMMi32_MASKmskw_ZMMi16_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
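+<!-- Usage sketch (illustrative, not part of the generated data): the non-saturating
+     VPDPWSSD as the inner step of an int16 dot product. The function name is
+     illustrative; _mm512_reduce_add_epi32 is a compiler-provided AVX-512 helper.
+
+     #include <immintrin.h>
+     #include <stddef.h>
+     #include <stdint.h>
+
+     /* n must be a multiple of 32 (two int16 elements per dword lane,
+        16 dword lanes per accumulator). Sums wrap on overflow. */
+     int32_t dot_i16(const int16_t *x, const int16_t *y, size_t n) {
+         __m512i acc = _mm512_setzero_si512();
+         for (size_t i = 0; i < n; i += 32) {
+             __m512i a = _mm512_loadu_si512(x + i);
+             __m512i b = _mm512_loadu_si512(y + i);
+             acc = _mm512_dpwssd_epi32(acc, a, b);
+         }
+         return _mm512_reduce_add_epi32(acc); /* horizontal sum of the 16 lanes */
+     }
+-->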
+<intrinsic tech="AVX-512" name="_mm256_maskz_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="ymm {z}, ymm, ymm" xed="VPDPWSSD_YMMi32_MASKmskw_YMMi16_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="ymm {k}, ymm, ymm" xed="VPDPWSSD_YMMi32_MASKmskw_YMMi16_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="SI16"/>
+ <parameter type="__m256i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="ymm, ymm, ymm" xed="VPDPWSSD_YMMi32_MASKmskw_YMMi16_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="xmm {z}, xmm, xmm" xed="VPDPWSSD_XMMi32_MASKmskw_XMMi16_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="xmm {k}, xmm, xmm" xed="VPDPWSSD_XMMi32_MASKmskw_XMMi16_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_dpwssd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j])
+ tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1])
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPWSSD" form="xmm, xmm, xmm" xed="VPDPWSSD_XMMi32_MASKmskw_XMMi16_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
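+<!-- A minimal C usage sketch for the VPDPWSSD family above (hedged: function
+     and variable names are illustrative, not from this data file; assumes a
+     toolchain exposing the AVX512_VNNI intrinsics, e.g. -mavx512vnni):
+  #include <immintrin.h>
+  /* Dot product of int16 arrays; n is assumed to be a multiple of 32. */
+  __m512i dot_i16(const short *x, const short *y, int n) {
+      __m512i acc = _mm512_setzero_si512();
+      for (int i = 0; i < n; i += 32) {
+          __m512i a = _mm512_loadu_si512((const void *)(x + i));
+          __m512i b = _mm512_loadu_si512((const void *)(y + i));
+          /* acc.dword[j] += a.word[2j]*b.word[2j] + a.word[2j+1]*b.word[2j+1] */
+          acc = _mm512_dpwssd_epi32(acc, a, b);
+      }
+      return acc; /* 16 int32 partial sums; reduce e.g. with _mm512_reduce_add_epi32 */
+  }
+-->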
+<intrinsic tech="AVX-512" name="_mm512_maskz_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="zmm {z}, zmm, zmm" xed="VPDPBUSDS_ZMMi32_MASKmskw_ZMMu8_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="zmm {k}, zmm, zmm" xed="VPDPBUSDS_ZMMi32_MASKmskw_ZMMu8_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="zmm, zmm, zmm" xed="VPDPBUSDS_ZMMi32_MASKmskw_ZMMu8_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="ymm {z}, ymm, ymm" xed="VPDPBUSDS_YMMi32_MASKmskw_YMMu8_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="ymm {k}, ymm, ymm" xed="VPDPBUSDS_YMMi32_MASKmskw_YMMu8_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="ymm, ymm, ymm" xed="VPDPBUSDS_YMMi32_MASKmskw_YMMu8_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="xmm {z}, xmm, xmm" xed="VPDPBUSDS_XMMi32_MASKmskw_XMMu8_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="xmm {k}, xmm, xmm" xed="VPDPBUSDS_XMMi32_MASKmskw_XMMu8_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_dpbusds_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPBUSDS" form="xmm, xmm, xmm" xed="VPDPBUSDS_XMMi32_MASKmskw_XMMu8_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
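+<!-- Usage sketch for the saturating VPDPBUSDS form (hedged; names illustrative):
+  #include <immintrin.h>
+  /* u8 x s8 multiply-accumulate into int32 lanes; the accumulator clamps at
+     INT32_MAX/INT32_MIN instead of wrapping, which matters for long
+     unsigned-by-signed reductions. */
+  __m512i madd_u8s8_sat(__m512i acc, __m512i a_u8, __m512i b_s8) {
+      return _mm512_dpbusds_epi32(acc, a_u8, b_s8);
+  }
+-->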
+<intrinsic tech="AVX-512" name="_mm512_maskz_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="zmm {z}, zmm, zmm" xed="VPDPBUSD_ZMMi32_MASKmskw_ZMMu8_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_mask_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="zmm {k}, zmm, zmm" xed="VPDPBUSD_ZMMi32_MASKmskw_ZMMu8_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="zmm, zmm, zmm" xed="VPDPBUSD_ZMMi32_MASKmskw_ZMMu8_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_maskz_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="ymm {z}, ymm, ymm" xed="VPDPBUSD_YMMi32_MASKmskw_YMMu8_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_mask_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="ymm {k}, ymm, ymm" xed="VPDPBUSD_YMMi32_MASKmskw_YMMu8_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="SI32"/>
+ <parameter type="__m256i" varname="src" etype="SI32"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="ymm, ymm, ymm" xed="VPDPBUSD_YMMi32_MASKmskw_YMMu8_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_maskz_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ ELSE
+ dst.dword[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="xmm {z}, xmm, xmm" xed="VPDPBUSD_XMMi32_MASKmskw_XMMu8_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_mask_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 3
+ IF k[j]
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ ELSE
+ dst.dword[j] := src.dword[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="xmm {k}, xmm, xmm" xed="VPDPBUSD_XMMi32_MASKmskw_XMMu8_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_dpbusd_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VNNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="src" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
+ tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1]))
+ tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2]))
+ tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3]))
+ dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VPDPBUSD" form="xmm, xmm, xmm" xed="VPDPBUSD_XMMi32_MASKmskw_XMMu8_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
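+<!-- Usage sketch for the non-saturating VPDPBUSD form (hedged; names illustrative):
+  #include <immintrin.h>
+  /* One instruction covering the older three-step idiom
+     _mm512_maddubs_epi16 + _mm512_madd_epi16 + _mm512_add_epi32, but without
+     the 16-bit saturation of the intermediate pair sums. */
+  __m512i madd_u8s8(__m512i acc, __m512i a_u8, __m512i b_s8) {
+      return _mm512_dpbusd_epi32(acc, a_u8, b_s8);
+  }
+-->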
+<intrinsic tech="AVX-512" name="_mm_2intersect_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VP2INTERSECT</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Mask</category>
+ <return type="void"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__mmask8*" varname="k1" etype="MASK" memwidth="8"/>
+ <parameter type="__mmask8*" varname="k2" etype="MASK" memwidth="8"/>
+	<description>Compute the intersection of packed 32-bit integer vectors "a" and "b", and store an indication of the matches in the two mask registers specified by "k1" and "k2": bit "i" of "k1" is set when element "i" of "a" equals any element of "b", and bit "j" of "k2" is set when element "j" of "b" equals any element of "a".</description>
+ <operation>
+MEM[k1+7:k1] := 0
+MEM[k2+7:k2] := 0
+FOR i := 0 TO 3
+ FOR j := 0 TO 3
+ match := (a.dword[i] == b.dword[j] ? 1 : 0)
+ MEM[k1+7:k1].bit[i] |= match
+ MEM[k2+7:k2].bit[j] |= match
+ ENDFOR
+ENDFOR
+ </operation>
+ <instruction name="VP2INTERSECTD" form="k, xmm, xmm" xed="VP2INTERSECTD_MASKmskw_XMMu32_XMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_2intersect_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VP2INTERSECT</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Mask</category>
+ <return type="void"/>
+ <parameter type="__m256i" varname="a" etype="UI32"/>
+ <parameter type="__m256i" varname="b" etype="UI32"/>
+ <parameter type="__mmask8*" varname="k1" etype="MASK" memwidth="8"/>
+ <parameter type="__mmask8*" varname="k2" etype="MASK" memwidth="8"/>
+	<description>Compute the intersection of packed 32-bit integer vectors "a" and "b", and store an indication of the matches in the two mask registers specified by "k1" and "k2": bit "i" of "k1" is set when element "i" of "a" equals any element of "b", and bit "j" of "k2" is set when element "j" of "b" equals any element of "a".</description>
+ <operation>
+MEM[k1+7:k1] := 0
+MEM[k2+7:k2] := 0
+FOR i := 0 TO 7
+ FOR j := 0 TO 7
+ match := (a.dword[i] == b.dword[j] ? 1 : 0)
+ MEM[k1+7:k1].bit[i] |= match
+ MEM[k2+7:k2].bit[j] |= match
+ ENDFOR
+ENDFOR
+ </operation>
+ <instruction name="VP2INTERSECTD" form="k, ymm, ymm" xed="VP2INTERSECTD_MASKmskw_YMMu32_YMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_2intersect_epi32">
+ <type>Integer</type>
+ <CPUID>AVX512_VP2INTERSECT</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__mmask16*" varname="k1" etype="MASK" memwidth="16"/>
+ <parameter type="__mmask16*" varname="k2" etype="MASK" memwidth="16"/>
+	<description>Compute the intersection of packed 32-bit integer vectors "a" and "b", and store an indication of the matches in the two mask registers specified by "k1" and "k2": bit "i" of "k1" is set when element "i" of "a" equals any element of "b", and bit "j" of "k2" is set when element "j" of "b" equals any element of "a".</description>
+ <operation>
+MEM[k1+15:k1] := 0
+MEM[k2+15:k2] := 0
+FOR i := 0 TO 15
+ FOR j := 0 TO 15
+ match := (a.dword[i] == b.dword[j] ? 1 : 0)
+ MEM[k1+15:k1].bit[i] |= match
+ MEM[k2+15:k2].bit[j] |= match
+ ENDFOR
+ENDFOR
+ </operation>
+ <instruction name="VP2INTERSECTD" form="k, zmm, zmm" xed="VP2INTERSECTD_MASKmskw_ZMMu32_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm_2intersect_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VP2INTERSECT</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Mask</category>
+ <return type="void"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <parameter type="__mmask8*" varname="k1" etype="MASK" memwidth="8"/>
+ <parameter type="__mmask8*" varname="k2" etype="MASK" memwidth="8"/>
+	<description>Compute the intersection of packed 64-bit integer vectors "a" and "b", and store an indication of the matches in the two mask registers specified by "k1" and "k2": bit "i" of "k1" is set when element "i" of "a" equals any element of "b", and bit "j" of "k2" is set when element "j" of "b" equals any element of "a".</description>
+ <operation>
+MEM[k1+7:k1] := 0
+MEM[k2+7:k2] := 0
+FOR i := 0 TO 1
+ FOR j := 0 TO 1
+ match := (a.qword[i] == b.qword[j] ? 1 : 0)
+ MEM[k1+7:k1].bit[i] |= match
+ MEM[k2+7:k2].bit[j] |= match
+ ENDFOR
+ENDFOR
+ </operation>
+ <instruction name="VP2INTERSECTQ" form="k, xmm, xmm" xed="VP2INTERSECTQ_MASKmskw_XMMu64_XMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm256_2intersect_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VP2INTERSECT</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Mask</category>
+ <return type="void"/>
+ <parameter type="__m256i" varname="a" etype="UI64"/>
+ <parameter type="__m256i" varname="b" etype="UI64"/>
+ <parameter type="__mmask8*" varname="k1" etype="MASK" memwidth="8"/>
+ <parameter type="__mmask8*" varname="k2" etype="MASK" memwidth="8"/>
+	<description>Compute the intersection of packed 64-bit integer vectors "a" and "b", and store an indication of the matches in the two mask registers specified by "k1" and "k2": bit "i" of "k1" is set when element "i" of "a" equals any element of "b", and bit "j" of "k2" is set when element "j" of "b" equals any element of "a".</description>
+ <operation>
+MEM[k1+7:k1] := 0
+MEM[k2+7:k2] := 0
+FOR i := 0 TO 3
+ FOR j := 0 TO 3
+ match := (a.qword[i] == b.qword[j] ? 1 : 0)
+ MEM[k1+7:k1].bit[i] |= match
+ MEM[k2+7:k2].bit[j] |= match
+ ENDFOR
+ENDFOR
+ </operation>
+ <instruction name="VP2INTERSECTQ" form="k, ymm, ymm" xed="VP2INTERSECTQ_MASKmskw_YMMu64_YMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" name="_mm512_2intersect_epi64">
+ <type>Integer</type>
+ <CPUID>AVX512_VP2INTERSECT</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Mask</category>
+ <return type="void"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="__m512i" varname="b" etype="UI64"/>
+ <parameter type="__mmask8*" varname="k1" etype="MASK" memwidth="8"/>
+ <parameter type="__mmask8*" varname="k2" etype="MASK" memwidth="8"/>
+	<description>Compute the intersection of packed 64-bit integer vectors "a" and "b", and store an indication of the matches in the two mask registers specified by "k1" and "k2": bit "i" of "k1" is set when element "i" of "a" equals any element of "b", and bit "j" of "k2" is set when element "j" of "b" equals any element of "a".</description>
+ <operation>
+MEM[k1+7:k1] := 0
+MEM[k2+7:k2] := 0
+FOR i := 0 TO 7
+ FOR j := 0 TO 7
+ match := (a.qword[i] == b.qword[j] ? 1 : 0)
+ MEM[k1+7:k1].bit[i] |= match
+ MEM[k2+7:k2].bit[j] |= match
+ ENDFOR
+ENDFOR
+ </operation>
+ <instruction name="VP2INTERSECTQ" form="k, zmm, zmm" xed="VP2INTERSECTQ_MASKmskw_ZMMu64_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
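+<!-- Usage sketch for VP2INTERSECT (hedged; names illustrative; the _epi64
+     variants above behave identically over qword lanes):
+  #include <immintrin.h>
+  void intersect_d(__m512i a, __m512i b, __mmask16 *k1, __mmask16 *k2) {
+      /* bit i of *k1: a.dword[i] occurs somewhere in b;
+         bit j of *k2: b.dword[j] occurs somewhere in a */
+      _mm512_2intersect_epi32(a, b, k1, k2);
+  }
+-->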
+<intrinsic tech="Other" name="_bextr_u32">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="start" etype="UI32"/>
+ <parameter type="unsigned int" varname="len" etype="UI32"/>
+ <description>Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start".</description>
+ <operation>
+tmp[511:0] := a
+dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]])
+ </operation>
+ <instruction name="BEXTR" form="r32, r32, r32" xed="BEXTR_VGPR32d_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bextr2_u32">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="control" etype="UI32"/>
+	<description>Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 7:0 of "control".</description>
+ <operation>
+start := control[7:0]
+len := control[15:8]
+tmp[511:0] := a
+dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]])
+ </operation>
+ <instruction name="BEXTR" form="r32, r32, r32" xed="BEXTR_VGPR32d_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bextr_u64">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="start" etype="UI32"/>
+ <parameter type="unsigned int" varname="len" etype="UI32"/>
+ <description>Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start".</description>
+ <operation>
+tmp[511:0] := a
+dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]])
+ </operation>
+ <instruction name="BEXTR" form="r64, r64, r64" xed="BEXTR_VGPR64q_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bextr2_u64">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="control" etype="UI64"/>
+	<description>Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 7:0 of "control".</description>
+ <operation>
+start := control[7:0]
+len := control[15:8]
+tmp[511:0] := a
+dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]])
+ </operation>
+ <instruction name="BEXTR" form="r64, r64, r64" xed="BEXTR_VGPR64q_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
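+<!-- Usage sketch for the two BEXTR forms (hedged; assumes a compiler that
+     declares both spellings as above; names illustrative):
+  #include <immintrin.h>
+  unsigned get_field(unsigned x) {
+      unsigned a = _bextr_u32(x, 12, 8);            /* start=12, len=8 */
+      unsigned b = _bextr2_u32(x, 12u | (8u << 8)); /* control[7:0]=start, [15:8]=len */
+      return a; /* both forms equal (x >> 12) & 0xFF */
+  }
+-->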
+<intrinsic tech="Other" name="_blsi_u32">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Extract the lowest set bit from unsigned 32-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a".</description>
+ <operation>
+dst := (-a) AND a
+ </operation>
+ <instruction name="BLSI" form="r32, r32" xed="BLSI_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_blsi_u64">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Extract the lowest set bit from unsigned 64-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a".</description>
+ <operation>
+dst := (-a) AND a
+ </operation>
+ <instruction name="BLSI" form="r64, r64" xed="BLSI_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_blsmsk_u32">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 32-bit integer "a".</description>
+ <operation>
+dst := (a - 1) XOR a
+ </operation>
+ <instruction name="BLSMSK" form="r32, r32" xed="BLSMSK_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_blsmsk_u64">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 64-bit integer "a".</description>
+ <operation>
+dst := (a - 1) XOR a
+ </operation>
+ <instruction name="BLSMSK" form="r64, r64" xed="BLSMSK_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_blsr_u32">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a".</description>
+ <operation>
+dst := (a - 1) AND a
+ </operation>
+ <instruction name="BLSR" form="r32, r32" xed="BLSR_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_blsr_u64">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a".</description>
+ <operation>
+dst := (a - 1) AND a
+ </operation>
+ <instruction name="BLSR" form="r64, r64" xed="BLSR_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
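+<!-- Usage sketch for the BLSI/BLSMSK/BLSR trio above (hedged; names illustrative):
+  #include <immintrin.h>
+  void lowest_bit_ops(unsigned a) {      /* e.g. a = 0x58 = 0b01011000     */
+      unsigned lo   = _blsi_u32(a);      /* 0x08: isolate lowest set bit   */
+      unsigned mask = _blsmsk_u32(a);    /* 0x0F: bits up to and incl. it  */
+      unsigned rest = _blsr_u32(a);      /* 0x50: clear lowest set bit     */
+      (void)lo; (void)mask; (void)rest;
+  }
+-->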
+<intrinsic tech="Other" name="_andn_u32">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="b" etype="UI32"/>
+	<description>Compute the bitwise NOT of 32-bit integer "a" and then AND with "b", and store the result in "dst".</description>
+ <operation>
+dst[31:0] := ((NOT a[31:0]) AND b[31:0])
+ </operation>
+ <instruction name="ANDN" form="r32, r32, r32" xed="ANDN_VGPR32d_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_andn_u64">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+	<description>Compute the bitwise NOT of 64-bit integer "a" and then AND with "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := ((NOT a[63:0]) AND b[63:0])
+ </operation>
+ <instruction name="ANDN" form="r64, r64, r64" xed="ANDN_VGPR64q_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
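+<!-- Usage sketch for ANDN (hedged; names illustrative):
+  #include <immintrin.h>
+  /* Clear from "flags" every bit that is set in "drop", in one instruction. */
+  unsigned keep(unsigned drop, unsigned flags) {
+      return _andn_u32(drop, flags); /* (~drop) & flags */
+  }
+-->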
+<intrinsic tech="Other" name="_tzcnt_u32">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst".</description>
+ <operation>
+tmp := 0
+dst := 0
+DO WHILE ((tmp &lt; 32) AND a[tmp] == 0)
+ tmp := tmp + 1
+ dst := dst + 1
+OD
+ </operation>
+ <instruction name="TZCNT" form="r32, r32" xed="TZCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_tzcnt_u64">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst".</description>
+ <operation>
+tmp := 0
+dst := 0
+DO WHILE ((tmp &lt; 64) AND a[tmp] == 0)
+ tmp := tmp + 1
+ dst := dst + 1
+OD
+ </operation>
+ <instruction name="TZCNT" form="r64, r64" xed="TZCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_tzcnt_32">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst".</description>
+ <operation>
+tmp := 0
+dst := 0
+DO WHILE ((tmp &lt; 32) AND a[tmp] == 0)
+ tmp := tmp + 1
+ dst := dst + 1
+OD
+ </operation>
+ <instruction name="TZCNT" form="r32, r32" xed="TZCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_tzcnt_64">
+ <type>Integer</type>
+ <CPUID>BMI1</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst".</description>
+ <operation>
+tmp := 0
+dst := 0
+DO WHILE ((tmp &lt; 64) AND a[tmp] == 0)
+ tmp := tmp + 1
+ dst := dst + 1
+OD
+ </operation>
+ <instruction name="TZCNT" form="r64, r64" xed="TZCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
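+<!-- Usage sketch for TZCNT (hedged; names illustrative). Unlike BSF, the
+     result is well defined for zero input: _tzcnt_u32(0) == 32.
+  #include <immintrin.h>
+  unsigned index_of_lowest_set_bit(unsigned x) {
+      return _tzcnt_u32(x); /* trailing-zero count */
+  }
+-->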
+<intrinsic tech="Other" name="_bzhi_u32">
+ <type>Integer</type>
+ <CPUID>BMI2</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="index" etype="UI32"/>
+ <description>Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index".</description>
+ <operation>
+n := index[7:0]
+dst := a
+IF (n &lt; 32)
+ dst[31:n] := 0
+FI
+ </operation>
+ <instruction name="BZHI" form="r32, r32, r32" xed="BZHI_VGPR32d_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bzhi_u64">
+ <type>Integer</type>
+ <CPUID>BMI2</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned int" varname="index" etype="UI32"/>
+ <description>Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index".</description>
+ <operation>
+n := index[7:0]
+dst := a
+IF (n &lt; 64)
+ dst[63:n] := 0
+FI
+ </operation>
+ <instruction name="BZHI" form="r64, r64, r64" xed="BZHI_VGPR64q_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
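+<!-- Usage sketch for BZHI (hedged; names illustrative):
+  #include <immintrin.h>
+  /* Keep only the low n bits of x; for index[7:0] >= 32, x is unchanged. */
+  unsigned low_bits(unsigned x, unsigned n) {
+      return _bzhi_u32(x, n); /* zeroes x[31:n] */
+  }
+-->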
+<intrinsic tech="Other" name="_pdep_u32">
+ <type>Integer</type>
+ <CPUID>BMI2</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="mask" etype="UI32"/>
+ <description>Deposit contiguous low bits from unsigned 32-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero.</description>
+ <operation>
+tmp := a
+dst := 0
+m := 0
+k := 0
+DO WHILE m &lt; 32
+ IF mask[m] == 1
+ dst[m] := tmp[k]
+ k := k + 1
+ FI
+ m := m + 1
+OD
+ </operation>
+ <instruction name="PDEP" form="r32, r32, r32" xed="PDEP_VGPR32d_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_pdep_u64">
+ <type>Integer</type>
+ <CPUID>BMI2</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="mask" etype="UI64"/>
+ <description>Deposit contiguous low bits from unsigned 64-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero.</description>
+ <operation>
+tmp := a
+dst := 0
+m := 0
+k := 0
+DO WHILE m &lt; 64
+ IF mask[m] == 1
+ dst[m] := tmp[k]
+ k := k + 1
+ FI
+ m := m + 1
+OD
+ </operation>
+ <instruction name="PDEP" form="r64, r64, r64" xed="PDEP_VGPR64q_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
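+<!-- Usage sketch for PDEP (hedged; names illustrative):
+  #include <immintrin.h>
+  unsigned scatter_bits(void) {
+      /* deposit 0b101 into the set positions of 0b11010, giving 0b10010 */
+      return _pdep_u32(0x5u, 0x1Au); /* == 0x12 */
+  }
+-->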
+<intrinsic tech="Other" name="_pext_u32">
+ <type>Integer</type>
+ <CPUID>BMI2</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="mask" etype="UI32"/>
+ <description>Extract bits from unsigned 32-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero.</description>
+ <operation>
+tmp := a
+dst := 0
+m := 0
+k := 0
+DO WHILE m &lt; 32
+ IF mask[m] == 1
+ dst[k] := tmp[m]
+ k := k + 1
+ FI
+ m := m + 1
+OD
+ </operation>
+ <instruction name="PEXT" form="r32, r32, r32" xed="PEXT_VGPR32d_VGPR32d_VGPR32d"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_pext_u64">
+ <type>Integer</type>
+ <CPUID>BMI2</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="mask" etype="UI64"/>
+ <description>Extract bits from unsigned 64-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero.</description>
+ <operation>
+tmp := a
+dst := 0
+m := 0
+k := 0
+DO WHILE m &lt; 64
+ IF mask[m] == 1
+ dst[k] := tmp[m]
+ k := k + 1
+ FI
+ m := m + 1
+OD
+ </operation>
+ <instruction name="PEXT" form="r64, r64, r64" xed="PEXT_VGPR64q_VGPR64q_VGPR64q"/>
+ <header>immintrin.h</header>
+</intrinsic>
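+<!-- Usage sketch for PEXT, the inverse of the PDEP example above (hedged):
+  #include <immintrin.h>
+  unsigned gather_bits(void) {
+      /* collect the bits of 0b10010 at the set positions of 0b11010, giving 0b101 */
+      return _pext_u32(0x12u, 0x1Au); /* == 0x5 */
+  }
+-->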
+<intrinsic tech="Other" name="_mulx_u32">
+ <type>Integer</type>
+ <CPUID>BMI2</CPUID>
+ <category>Arithmetic</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="b" etype="UI32"/>
+ <parameter type="unsigned int*" varname="hi" etype="UI32" memwidth="32"/>
+	<description>Multiply unsigned 32-bit integers "a" and "b", store the low 32 bits of the result in "dst", and store the high 32 bits in "hi". This does not read or write arithmetic flags.</description>
+ <operation>
+dst[31:0] := (a * b)[31:0]
+MEM[hi+31:hi] := (a * b)[63:32]
+ </operation>
+ <instruction name="MULX" form="r32, r32, m32" xed="MULX_VGPR32d_VGPR32d_MEMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mulx_u64">
+ <type>Integer</type>
+ <CPUID>BMI2</CPUID>
+ <category>Arithmetic</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+ <parameter type="unsigned __int64*" varname="hi" etype="UI64" memwidth="64"/>
+	<description>Multiply unsigned 64-bit integers "a" and "b", store the low 64 bits of the result in "dst", and store the high 64 bits in "hi". This does not read or write arithmetic flags.</description>
+ <operation>
+dst[63:0] := (a * b)[63:0]
+MEM[hi+63:hi] := (a * b)[127:64]
+ </operation>
+ <instruction name="MULX" form="r64, r64, m64" xed="MULX_VGPR64q_VGPR64q_MEMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
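+<!--
+A short C sketch of _mulx_u64 as a full 64x64 to 128-bit multiply that leaves the
+flags untouched (a sketch, assuming BMI2 on a 64-bit target; mul_wide is an
+illustrative name).
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/* Returns the low 64 bits of a*b and writes the high 64 bits to *hi. */
+static uint64_t mul_wide(uint64_t a, uint64_t b, uint64_t *hi) {
+    unsigned long long h;
+    unsigned long long lo = _mulx_u64(a, b, &h);
+    *hi = h;
+    return lo;
+}
+-->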
+<intrinsic tech="Other" name="_incsspd">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a".</description>
+ <operation>
+SSP := SSP + a[7:0] * 4
+ </operation>
+ <instruction name="INCSSPD" form="r32" xed="INCSSPD_GPR32u8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_incsspq">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Increment the shadow stack pointer by 8 times the value specified in bits [7:0] of "a".</description>
+ <operation>
+SSP := SSP + a[7:0] * 8
+ </operation>
+ <instruction name="INCSSPQ" form="r64" xed="INCSSPQ_GPR64u8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdsspd_i32">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__int32" varname="dst" etype="UI32"/>
+ <parameter type="void"/>
+ <description>Read the low 32-bits of the current shadow stack pointer, and store the result in "dst".</description>
+ <operation>dst := SSP[31:0]
+ </operation>
+ <instruction name="RDSSPD" form="r32" xed="RDSSPD_GPR32u32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdsspq_i64">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="void"/>
+ <description>Read the current shadow stack pointer, and store the result in "dst".</description>
+ <operation>dst := SSP[63:0]
+ </operation>
+ <instruction name="RDSSPQ" form="r64" xed="RDSSPQ_GPR64u64"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_saveprevssp">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Save the previous shadow stack pointer context.</description>
+ <instruction name="SAVEPREVSSP" xed="SAVEPREVSSP"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rstorssp">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="void *" varname="p"/>
+ <description>Restore the saved shadow stack pointer from the shadow stack restore token previously created on the shadow stack by saveprevssp.</description>
+ <instruction name="RSTORSSP" form="m64" xed="RSTORSSP_MEMu64"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_wrssd">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="__int32" varname="val" etype="UI32"/>
+ <parameter type="void *" varname="p"/>
+ <description>Write 32-bit value in "val" to a shadow stack page in memory specified by "p".</description>
+ <instruction name="WRSSD" form="m32, r32" xed="WRSSD_MEMu32_GPR32u32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_wrssq">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="__int64" varname="val" etype="UI64"/>
+ <parameter type="void *" varname="p"/>
+ <description>Write 64-bit value in "val" to a shadow stack page in memory specified by "p".</description>
+ <instruction name="WRSSQ" form="m64, r64" xed="WRSSQ_MEMu64_GPR64u64"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_wrussd">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="__int32" varname="val" etype="UI32"/>
+ <parameter type="void *" varname="p"/>
+ <description>Write 32-bit value in "val" to a user shadow stack page in memory specified by "p".</description>
+ <instruction name="WRUSSD" form="m32, r32" xed="WRUSSD_MEMu32_GPR32u32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_wrussq">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="__int64" varname="val" etype="UI64"/>
+ <parameter type="void *" varname="p"/>
+ <description>Write 64-bit value in "val" to a user shadow stack page in memory specified by "p".</description>
+ <instruction name="WRUSSQ" form="m64, r64" xed="WRUSSQ_MEMu64_GPR64u64"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_setssbsy">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Mark shadow stack pointed to by IA32_PL0_SSP as busy.</description>
+ <instruction name="SETSSBSY" xed="SETSSBSY"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_clrssbsy">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="void *" varname="p"/>
+ <description>Mark shadow stack pointed to by "p" as not busy.</description>
+ <instruction name="CLRSSBSY" form="m64" xed="CLRSSBSY_MEMu64"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_get_ssp">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__int32" varname="dst" etype="UI32"/>
+ <parameter type="void"/>
+ <description>If CET is enabled, read the low 32-bits of the current shadow stack pointer, and store the result in "dst". Otherwise return 0.</description>
+ <operation>dst := SSP[31:0]
+ </operation>
+ <instruction name="RDSSPD" form="r32" xed="RDSSPD_GPR32u32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_get_ssp">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="void"/>
+ <description>If CET is enabled, read the current shadow stack pointer, and store the result in "dst". Otherwise return 0.</description>
+ <operation>dst := SSP[63:0]
+ </operation>
+ <instruction name="RDSSPQ" form="r64" xed="RDSSPQ_GPR64u64"/>
+ <header>immintrin.h</header>
+</intrinsic>
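+<!--
+A hedged C sketch of _get_ssp: per the description above it yields 0 when CET is not
+enabled, so the probe is safe either way (a sketch, assuming a CET-aware toolchain,
+e.g. gcc with -mshstk).
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    unsigned long long ssp = _get_ssp();   /* RDSSPQ; reads as 0 if CET is disabled */
+    if (ssp == 0)
+        printf("shadow stacks not enabled\n");
+    else
+        printf("shadow stack pointer: %#llx\n", ssp);
+    return 0;
+}
+-->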
+<intrinsic tech="Other" name="_inc_ssp">
+ <CPUID>CET_SS</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a".</description>
+ <operation>
+SSP := SSP + a[7:0] * 4
+ </operation>
+ <instruction name="INCSSPD" form="r32" xed="INCSSPD_GPR32u8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_cldemote">
+ <CPUID>CLDEMOTE</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="void const *" varname="p"/>
+ <description>Hint to hardware that the cache line that contains "p" should be demoted from the cache closest to the processor core to a level more distant from the processor core.</description>
+ <instruction name="CLDEMOTE" form="m8" xed="CLDEMOTE_MEMu8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_clflushopt">
+ <CPUID>CLFLUSHOPT</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void const *" varname="p"/>
+ <description>Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy.</description>
+ <instruction name="CLFLUSHOPT" form="m8" xed="CLFLUSHOPT_MEMmprefetch"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_clwb">
+ <CPUID>CLWB</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void const *" varname="p"/>
+ <description>Write back to memory the cache line that contains "p" from any level of the cache hierarchy in the cache coherence domain.</description>
+ <instruction name="CLWB" form="m8" xed="CLWB_MEMmprefetch"/>
+ <header>immintrin.h</header>
+</intrinsic>
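+<!--
+A small C sketch of _mm_clwb on a store path, the common persistence idiom (a sketch,
+assuming CLWB support and e.g. gcc -mclwb; the pmem-style scenario is illustrative).
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/* Store a value, then push its cache line back toward memory. */
+static void store_and_writeback(uint64_t *p, uint64_t v) {
+    *p = v;
+    _mm_clwb(p);       /* write the dirty line back; it may stay cached */
+    _mm_sfence();      /* order the write-back before subsequent stores */
+}
+-->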
+<intrinsic tech="FMA" name="_mm_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="xmm, xmm, xmm" xed="VFMADD132PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMADD213PD" form="xmm, xmm, xmm" xed="VFMADD213PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMADD231PD" form="xmm, xmm, xmm" xed="VFMADD231PD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADD132PD" form="ymm, ymm, ymm" xed="VFMADD132PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMADD213PD" form="ymm, ymm, ymm" xed="VFMADD213PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMADD231PD" form="ymm, ymm, ymm" xed="VFMADD231PD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
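+<!--
+A brief C sketch of _mm256_fmadd_pd accumulating a dot product four doubles at a time
+(a sketch, assuming FMA and AVX, e.g. gcc -mfma; n is a multiple of 4 and dot4 is an
+illustrative name).
+
+#include <immintrin.h>
+
+static double dot4(const double *a, const double *b, int n) {
+    __m256d acc = _mm256_setzero_pd();
+    for (int i = 0; i < n; i += 4) {
+        __m256d va = _mm256_loadu_pd(a + i);
+        __m256d vb = _mm256_loadu_pd(b + i);
+        acc = _mm256_fmadd_pd(va, vb, acc);   /* acc += va*vb, one rounding per lane */
+    }
+    __m128d lo = _mm256_castpd256_pd128(acc); /* horizontal sum of the 4 lanes */
+    __m128d hi = _mm256_extractf128_pd(acc, 1);
+    lo = _mm_add_pd(lo, hi);
+    lo = _mm_add_pd(lo, _mm_unpackhi_pd(lo, lo));
+    return _mm_cvtsd_f64(lo);
+}
+-->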
+<intrinsic tech="FMA" name="_mm_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="xmm, xmm, xmm" xed="VFMADD132PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMADD213PS" form="xmm, xmm, xmm" xed="VFMADD213PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMADD231PS" form="xmm, xmm, xmm" xed="VFMADD231PS_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADD132PS" form="ymm, ymm, ymm" xed="VFMADD132PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMADD213PS" form="ymm, ymm, ymm" xed="VFMADD213PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMADD231PS" form="ymm, ymm, ymm" xed="VFMADD231PS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fmadd_sd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SD" form="xmm, xmm, xmm" xed="VFMADD132SD_XMMdq_XMMq_XMMq"/>
+ <instruction name="VFMADD213SD" form="xmm, xmm, xmm" xed="VFMADD213SD_XMMdq_XMMq_XMMq"/>
+ <instruction name="VFMADD231SD" form="xmm, xmm, xmm" xed="VFMADD231SD_XMMdq_XMMq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADD132SS" form="xmm, xmm, xmm" xed="VFMADD132SS_XMMdq_XMMd_XMMd"/>
+ <instruction name="VFMADD213SS" form="xmm, xmm, xmm" xed="VFMADD213SS_XMMdq_XMMd_XMMd"/>
+ <instruction name="VFMADD231SS" form="xmm, xmm, xmm" xed="VFMADD231SS_XMMdq_XMMd_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="xmm, xmm, xmm" xed="VFMADDSUB132PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMADDSUB213PD" form="xmm, xmm, xmm" xed="VFMADDSUB213PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMADDSUB231PD" form="xmm, xmm, xmm" xed="VFMADDSUB231PD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fmaddsub_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PD" form="ymm, ymm, ymm" xed="VFMADDSUB132PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMADDSUB213PD" form="ymm, ymm, ymm" xed="VFMADDSUB213PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMADDSUB231PD" form="ymm, ymm, ymm" xed="VFMADDSUB231PD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="xmm, xmm, xmm" xed="VFMADDSUB132PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMADDSUB213PS" form="xmm, xmm, xmm" xed="VFMADDSUB213PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMADDSUB231PS" form="xmm, xmm, xmm" xed="VFMADDSUB231PS_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fmaddsub_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMADDSUB132PS" form="ymm, ymm, ymm" xed="VFMADDSUB132PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMADDSUB213PS" form="ymm, ymm, ymm" xed="VFMADDSUB213PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMADDSUB231PS" form="ymm, ymm, ymm" xed="VFMADDSUB231PS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
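+<!--
+The even-subtract/odd-add pattern above is the standard building block for interleaved
+complex arithmetic; a C sketch of one double-precision complex multiply with
+_mm_fmaddsub_pd (a sketch, assuming FMA and SSE3; lanes hold {re, im}).
+
+#include <immintrin.h>
+
+/* (ar + i*ai) * (br + i*bi), operands and result stored as {re, im}. */
+static __m128d cmul_pd(__m128d a, __m128d b) {
+    __m128d ar = _mm_movedup_pd(a);            /* {ar, ar} */
+    __m128d ai = _mm_unpackhi_pd(a, a);        /* {ai, ai} */
+    __m128d bs = _mm_shuffle_pd(b, b, 1);      /* {bi, br} */
+    /* lane 0: ar*br - ai*bi, lane 1: ar*bi + ai*br */
+    return _mm_fmaddsub_pd(ar, b, _mm_mul_pd(ai, bs));
+}
+-->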
+<intrinsic tech="FMA" name="_mm_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="xmm, xmm, xmm" xed="VFMSUB132PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMSUB213PD" form="xmm, xmm, xmm" xed="VFMSUB213PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMSUB231PD" form="xmm, xmm, xmm" xed="VFMSUB231PD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUB132PD" form="ymm, ymm, ymm" xed="VFMSUB132PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMSUB213PD" form="ymm, ymm, ymm" xed="VFMSUB213PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMSUB231PD" form="ymm, ymm, ymm" xed="VFMSUB231PD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="xmm, xmm, xmm" xed="VFMSUB132PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMSUB213PS" form="xmm, xmm, xmm" xed="VFMSUB213PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMSUB231PS" form="xmm, xmm, xmm" xed="VFMSUB231PS_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUB132PS" form="ymm, ymm, ymm" xed="VFMSUB132PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMSUB213PS" form="ymm, ymm, ymm" xed="VFMSUB213PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMSUB231PS" form="ymm, ymm, ymm" xed="VFMSUB231PS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fmsub_sd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SD" form="xmm, xmm, xmm" xed="VFMSUB132SD_XMMdq_XMMq_XMMq"/>
+ <instruction name="VFMSUB213SD" form="xmm, xmm, xmm" xed="VFMSUB213SD_XMMdq_XMMq_XMMq"/>
+ <instruction name="VFMSUB231SD" form="xmm, xmm, xmm" xed="VFMSUB231SD_XMMdq_XMMq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
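+<!--
+A classic scalar use of fmsub: recovering the exact rounding error of a product (the
+TwoProduct trick), since a*b - fl(a*b) is representable barring overflow or underflow.
+A C sketch with _mm_fmsub_sd (a sketch, assuming FMA; two_product is an illustrative
+name).
+
+#include <immintrin.h>
+
+/* p = fl(a*b); err = a*b - p computed exactly by the fused multiply-subtract. */
+static void two_product(double a, double b, double *p, double *err) {
+    __m128d va = _mm_set_sd(a);
+    __m128d vb = _mm_set_sd(b);
+    __m128d vp = _mm_mul_sd(va, vb);
+    *p   = _mm_cvtsd_f64(vp);
+    *err = _mm_cvtsd_f64(_mm_fmsub_sd(va, vb, vp));
+}
+-->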
+<intrinsic tech="FMA" name="_mm_fmsub_ss">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUB132SS" form="xmm, xmm, xmm" xed="VFMSUB132SS_XMMdq_XMMd_XMMd"/>
+ <instruction name="VFMSUB213SS" form="xmm, xmm, xmm" xed="VFMSUB213SS_XMMdq_XMMd_XMMd"/>
+ <instruction name="VFMSUB231SS" form="xmm, xmm, xmm" xed="VFMSUB231SS_XMMdq_XMMd_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="xmm, xmm, xmm" xed="VFMSUBADD132PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMSUBADD213PD" form="xmm, xmm, xmm" xed="VFMSUBADD213PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMSUBADD231PD" form="xmm, xmm, xmm" xed="VFMSUBADD231PD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fmsubadd_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ ELSE
+ dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PD" form="ymm, ymm, ymm" xed="VFMSUBADD132PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMSUBADD213PD" form="ymm, ymm, ymm" xed="VFMSUBADD213PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMSUBADD231PD" form="ymm, ymm, ymm" xed="VFMSUBADD231PD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="xmm, xmm, xmm" xed="VFMSUBADD132PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMSUBADD213PS" form="xmm, xmm, xmm" xed="VFMSUBADD213PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFMSUBADD231PS" form="xmm, xmm, xmm" xed="VFMSUBADD231PS_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fmsubadd_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFMSUBADD132PS" form="ymm, ymm, ymm" xed="VFMSUBADD132PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMSUBADD213PS" form="ymm, ymm, ymm" xed="VFMSUBADD213PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFMSUBADD231PS" form="ymm, ymm, ymm" xed="VFMSUBADD231PS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="xmm, xmm, xmm" xed="VFNMADD132PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFNMADD213PD" form="xmm, xmm, xmm" xed="VFNMADD213PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFNMADD231PD" form="xmm, xmm, xmm" xed="VFNMADD231PD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fnmadd_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMADD132PD" form="ymm, ymm, ymm" xed="VFNMADD132PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFNMADD213PD" form="ymm, ymm, ymm" xed="VFNMADD213PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFNMADD231PD" form="ymm, ymm, ymm" xed="VFNMADD231PD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="xmm, xmm, xmm" xed="VFNMADD132PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFNMADD213PS" form="xmm, xmm, xmm" xed="VFNMADD213PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFNMADD231PS" form="xmm, xmm, xmm" xed="VFNMADD231PS_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fnmadd_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMADD132PS" form="ymm, ymm, ymm" xed="VFNMADD132PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFNMADD213PS" form="ymm, ymm, ymm" xed="VFNMADD213PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFNMADD231PS" form="ymm, ymm, ymm" xed="VFNMADD231PS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
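+<!--
+fnmadd is the workhorse of Newton-Raphson refinement; a C sketch improving the ~12-bit
+rcpps reciprocal estimate with one step, x1 = x0*(2 - a*x0), where 2 - a*x0 is a single
+fnmadd (a sketch, assuming AVX and FMA).
+
+#include <immintrin.h>
+
+static __m256 recip_nr(__m256 a) {
+    __m256 x0 = _mm256_rcp_ps(a);                          /* rough 1/a estimate */
+    __m256 t  = _mm256_fnmadd_ps(a, x0, _mm256_set1_ps(2.0f));
+    return _mm256_mul_ps(x0, t);                           /* ~23-bit result */
+}
+-->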
+<intrinsic tech="FMA" name="_mm_fnmadd_sd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SD" form="xmm, xmm, xmm" xed="VFNMADD132SD_XMMdq_XMMq_XMMq"/>
+ <instruction name="VFNMADD213SD" form="xmm, xmm, xmm" xed="VFNMADD213SD_XMMdq_XMMq_XMMq"/>
+ <instruction name="VFNMADD231SD" form="xmm, xmm, xmm" xed="VFNMADD231SD_XMMdq_XMMq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fnmadd_ss">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMADD132SS" form="xmm, xmm, xmm" xed="VFNMADD132SS_XMMdq_XMMd_XMMd"/>
+ <instruction name="VFNMADD213SS" form="xmm, xmm, xmm" xed="VFNMADD213SS_XMMdq_XMMd_XMMd"/>
+ <instruction name="VFNMADD231SS" form="xmm, xmm, xmm" xed="VFNMADD231SS_XMMdq_XMMd_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="xmm, xmm, xmm" xed="VFNMSUB132PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFNMSUB213PD" form="xmm, xmm, xmm" xed="VFNMSUB213PD_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFNMSUB231PD" form="xmm, xmm, xmm" xed="VFNMSUB231PD_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fnmsub_pd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256d" varname="dst" etype="FP64"/>
+ <parameter type="__m256d" varname="a" etype="FP64"/>
+ <parameter type="__m256d" varname="b" etype="FP64"/>
+ <parameter type="__m256d" varname="c" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*64
+ dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMSUB132PD" form="ymm, ymm, ymm" xed="VFNMSUB132PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFNMSUB213PD" form="ymm, ymm, ymm" xed="VFNMSUB213PD_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFNMSUB231PD" form="ymm, ymm, ymm" xed="VFNMSUB231PD_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="xmm, xmm, xmm" xed="VFNMSUB132PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFNMSUB213PS" form="xmm, xmm, xmm" xed="VFNMSUB213PS_XMMdq_XMMdq_XMMdq"/>
+ <instruction name="VFNMSUB231PS" form="xmm, xmm, xmm" xed="VFNMSUB231PS_XMMdq_XMMdq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm256_fnmsub_ps">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="__m256" varname="b" etype="FP32"/>
+ <parameter type="__m256" varname="c" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VFNMSUB132PS" form="ymm, ymm, ymm" xed="VFNMSUB132PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFNMSUB213PS" form="ymm, ymm, ymm" xed="VFNMSUB213PS_YMMqq_YMMqq_YMMqq"/>
+ <instruction name="VFNMSUB231PS" form="ymm, ymm, ymm" xed="VFNMSUB231PS_YMMqq_YMMqq_YMMqq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fnmsub_sd">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="c" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SD" form="xmm, xmm, xmm" xed="VFNMSUB132SD_XMMdq_XMMq_XMMq"/>
+ <instruction name="VFNMSUB213SD" form="xmm, xmm, xmm" xed="VFNMSUB213SD_XMMdq_XMMq_XMMq"/>
+ <instruction name="VFNMSUB231SD" form="xmm, xmm, xmm" xed="VFNMSUB231SD_XMMdq_XMMq_XMMq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="FMA" name="_mm_fnmsub_ss">
+ <type>Floating Point</type>
+ <CPUID>FMA</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="c" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VFNMSUB132SS" form="xmm, xmm, xmm" xed="VFNMSUB132SS_XMMdq_XMMd_XMMd"/>
+ <instruction name="VFNMSUB213SS" form="xmm, xmm, xmm" xed="VFNMSUB213SS_XMMdq_XMMd_XMMd"/>
+ <instruction name="VFNMSUB231SS" form="xmm, xmm, xmm" xed="VFNMSUB231SS_XMMdq_XMMd_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_cvtph_ps">
+ <type>Floating Point</type>
+	<CPUID>F16C</CPUID>
+ <category>Convert</category>
+ <return type="__m256" varname="dst" etype="FP32"/>
+ <parameter type="__m128i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*16
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="ymm, xmm" xed="VCVTPH2PS_YMMqq_XMMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_cvtps_ph">
+ <type>Floating Point</type>
+	<CPUID>F16C</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="FP16"/>
+ <parameter type="__m256" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst".
+ [sae_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ l := 32*j
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm, ymm, imm8" xed="VCVTPS2PH_XMMdq_YMMqq_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_cvtph_ps">
+ <type>Floating Point</type>
+	<CPUID>F16C</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128i" varname="a" etype="FP16"/>
+ <description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ m := j*16
+ dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VCVTPH2PS" form="xmm, xmm" xed="VCVTPH2PS_XMMdq_XMMq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_cvtps_ph">
+ <type>Floating Point</type>
+	<CPUID>F16C</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="FP16"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="sae" etype="IMM" hint="TRUE" immtype="_MM_FROUND_SAE"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst".
+ [sae_note]</description>
+ <operation>
+FOR j := 0 to 3
+ i := 16*j
+ l := 32*j
+ dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ENDFOR
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="VCVTPS2PH" form="xmm, xmm, imm8" xed="VCVTPS2PH_XMMq_XMMdq_IMMb"/>
+	<header>immintrin.h</header>
+</intrinsic>
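+<!--
+A C round-trip sketch for the two conversions above (a sketch, assuming F16C, e.g. gcc
+-mf16c; the narrowing step is the only one that rounds).
+
+#include <immintrin.h>
+
+/* Compress 4 floats to half precision and expand them back. */
+static __m128 half_roundtrip(__m128 x) {
+    __m128i h = _mm_cvtps_ph(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    return _mm_cvtph_ps(h);   /* widening FP16 to FP32 is exact */
+}
+-->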
+<intrinsic tech="Other" name="_readfsbase_u32">
+ <type>Integer</type>
+ <CPUID>FSGSBASE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <description>Read the FS segment base register and store the 32-bit result in "dst".</description>
+ <operation>dst[31:0] := FS_Segment_Base_Register
+dst[63:32] := 0
+ </operation>
+ <instruction name="RDFSBASE" form="r32" xed="RDFSBASE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_readfsbase_u64">
+ <type>Integer</type>
+ <CPUID>FSGSBASE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <description>Read the FS segment base register and store the 64-bit result in "dst".</description>
+ <operation>dst[63:0] := FS_Segment_Base_Register
+ </operation>
+ <instruction name="RDFSBASE" form="r64" xed="RDFSBASE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_readgsbase_u32">
+ <type>Integer</type>
+ <CPUID>FSGSBASE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <description>Read the GS segment base register and store the 32-bit result in "dst".</description>
+ <operation>dst[31:0] := GS_Segment_Base_Register
+dst[63:32] := 0
+ </operation>
+ <instruction name="RDGSBASE" form="r32" xed="RDGSBASE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_readgsbase_u64">
+ <type>Integer</type>
+ <CPUID>FSGSBASE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <description>Read the GS segment base register and store the 64-bit result in "dst".</description>
+ <operation>dst[63:0] := GS_Segment_Base_Register
+ </operation>
+ <instruction name="RDGSBASE" form="r64" xed="RDGSBASE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_writefsbase_u32">
+ <type>Integer</type>
+ <CPUID>FSGSBASE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Write the unsigned 32-bit integer "a" to the FS segment base register.</description>
+ <operation>
+FS_Segment_Base_Register[31:0] := a[31:0]
+FS_Segment_Base_Register[63:32] := 0
+ </operation>
+ <instruction name="WRFSBASE" form="r32" xed="WRFSBASE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_writefsbase_u64">
+ <type>Integer</type>
+ <CPUID>FSGSBASE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Write the unsigned 64-bit integer "a" to the FS segment base register.</description>
+ <operation>
+FS_Segment_Base_Register[63:0] := a[63:0]
+ </operation>
+ <instruction name="WRFSBASE" form="r64" xed="WRFSBASE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_writegsbase_u32">
+ <type>Integer</type>
+ <CPUID>FSGSBASE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Write the unsigned 32-bit integer "a" to the GS segment base register.</description>
+ <operation>
+GS_Segment_Base_Register[31:0] := a[31:0]
+GS_Segment_Base_Register[63:32] := 0
+ </operation>
+ <instruction name="WRGSBASE" form="r32" xed="WRGSBASE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_writegsbase_u64">
+ <type>Integer</type>
+ <CPUID>FSGSBASE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Write the unsigned 64-bit integer "a" to the GS segment base register.</description>
+ <operation>
+GS_Segment_Base_Register[63:0] := a[63:0]
+ </operation>
+ <instruction name="WRGSBASE" form="r64" xed="WRGSBASE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
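+<!--
+A heavily hedged C sketch of _readfsbase_u64: RDFSBASE faults (#UD) unless the OS has
+set CR4.FSGSBASE, so this is illustrative only (assuming gcc -mfsgsbase; Linux enables
+user-mode FSGSBASE from kernel 5.9 onward).
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main(void) {
+    unsigned long long fs = _readfsbase_u64();  /* SIGILL if not OS-enabled */
+    printf("FS base: %#llx\n", fs);
+    return 0;
+}
+-->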
+<intrinsic tech="Other" name="_fxrstor">
+ <CPUID>FXSR</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr" memwidth="4096"/>
+ <description>Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary.</description>
+ <operation>state_x87_fpu_mmx_sse := fxrstor(MEM[mem_addr+512*8:mem_addr])
+ </operation>
+ <instruction name="FXRSTOR" form="m512" xed="FXRSTOR_MEMmfpxenv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_fxrstor64">
+ <CPUID>FXSR</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr" memwidth="4096"/>
+ <description>Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary.</description>
+ <operation>state_x87_fpu_mmx_sse := fxrstor64(MEM[mem_addr+512*8:mem_addr])
+ </operation>
+ <instruction name="FXRSTOR64" form="m512" xed="FXRSTOR64_MEMmfpxenv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_fxsave">
+ <CPUID>FXSR</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr" memwidth="4096"/>
+ <description>Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.</description>
+ <operation>MEM[mem_addr+512*8:mem_addr] := fxsave(state_x87_fpu_mmx_sse)
+ </operation>
+ <instruction name="FXSAVE" form="m512" xed="FXSAVE_MEMmfpxenv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_fxsave64">
+ <CPUID>FXSR</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr" memwidth="4096"/>
+ <description>Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.</description>
+ <operation>MEM[mem_addr+512*8:mem_addr] := fxsave64(state_x87_fpu_mmx_sse)
+ </operation>
+ <instruction name="FXSAVE64" form="m512" xed="FXSAVE64_MEMmfpxenv"/>
+ <header>immintrin.h</header>
+</intrinsic>
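+<!--
+A C sketch of the _fxsave/_fxrstor pair; both are unprivileged, and the 512-byte image
+must be 16-byte aligned, which alignas guarantees below (a sketch, assuming FXSR and
+e.g. gcc -mfxsr).
+
+#include <immintrin.h>
+#include <stdalign.h>
+
+/* Snapshot the x87/MMX/XMM/MXCSR state, then restore it. */
+static void save_restore(void) {
+    alignas(16) unsigned char image[512];
+    _fxsave(image);
+    /* ... code that may clobber FP/SSE state ... */
+    _fxrstor(image);
+}
+-->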
+<intrinsic tech="Other" name="_mm512_maskz_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 63
+ IF k[j]
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ ELSE
+ dst.byte[j] := 0
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="zmm {z}, zmm, zmm" xed="VGF2P8MULB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_mask_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="src" etype="UI8"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+	<description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 63
+ IF k[j]
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ ELSE
+ dst.byte[j] := src.byte[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="zmm {k}, zmm, zmm" xed="VGF2P8MULB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI8"/>
+ <parameter type="__m512i" varname="a" etype="UI8"/>
+ <parameter type="__m512i" varname="b" etype="UI8"/>
+ <description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 63
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="zmm, zmm, zmm" xed="VGF2P8MULB_ZMMu8_MASKmskw_ZMMu8_ZMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_maskz_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 31
+ IF k[j]
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ ELSE
+ dst.byte[j] := 0
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="ymm {z}, ymm, ymm" xed="VGF2P8MULB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_mask_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="src" etype="UI8"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 31
+ IF k[j]
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ ELSE
+ dst.byte[j] := src.byte[j]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="ymm {k}, ymm, ymm" xed="VGF2P8MULB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI8"/>
+ <parameter type="__m256i" varname="a" etype="UI8"/>
+ <parameter type="__m256i" varname="b" etype="UI8"/>
+ <description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 31
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="ymm, ymm, ymm" xed="VGF2P8MULB_YMMu8_MASKmskw_YMMu8_YMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_maskz_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 15
+ IF k[j]
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ ELSE
+ dst.byte[j] := 0
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="xmm {z}, xmm, xmm" xed="VGF2P8MULB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_mask_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="src" etype="UI8"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 15
+ IF k[j]
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ ELSE
+ dst.byte[j] := src.byte[j]
+ FI
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="xmm {k}, xmm, xmm" xed="VGF2P8MULB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_gf2p8mul_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1.</description>
+ <operation>
+DEFINE gf2p8mul_byte(src1byte, src2byte) {
+ tword := 0
+ FOR i := 0 to 7
+ IF src2byte.bit[i]
+ tword := tword XOR (src1byte &lt;&lt; i)
+ FI
+ ENDFOR
+ FOR i := 14 downto 8
+ p := 0x11B &lt;&lt; (i-8)
+ IF tword.bit[i]
+ tword := tword XOR p
+ FI
+ ENDFOR
+ RETURN tword.byte[0]
+}
+FOR j := 0 TO 15
+ dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8MULB" form="xmm, xmm, xmm" xed="VGF2P8MULB_XMMu8_MASKmskw_XMMu8_XMMu8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
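+<!-- A short usage sketch for the unmasked 128-bit form above (compile with
+     GFNI enabled; the function name and unaligned loads are illustrative):
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     void gf_mul_block(const uint8_t *a, const uint8_t *b, uint8_t *out) {
+         __m128i va = _mm_loadu_si128((const __m128i *)a);
+         __m128i vb = _mm_loadu_si128((const __m128i *)b);
+         __m128i vr = _mm_gf2p8mul_epi8(va, vb);  /* 16 GF(2^8) products */
+         _mm_storeu_si128((__m128i *)out, vr);
+     }
+-->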
+<intrinsic tech="Other" name="_mm512_maskz_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="x" etype="UI64"/>
+ <parameter type="__m512i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 7
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="zmm {z}, zmm, zmm, imm8" xed="VGF2P8AFFINEQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_mask_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="x" etype="UI64"/>
+ <parameter type="__m512i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 7
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := src.qword[j].byte[i]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="zmm {k}, zmm, zmm, imm8" xed="VGF2P8AFFINEQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="x" etype="UI64"/>
+ <parameter type="__m512i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst".</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 7
+ FOR i := 0 to 7
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="zmm, zmm, zmm, imm8" xed="VGF2P8AFFINEQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
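+<!-- The affine_byte pseudocode above is a matrix-vector product over GF(2):
+     output bit i is the parity of (A.byte[7-i] AND x), XORed with bit i of the
+     immediate. A scalar C sketch of that model, assuming <stdint.h> and that
+     byte 0 of the qword is least significant:
+
+     static uint8_t affine_byte(uint64_t A, uint8_t x, uint8_t b) {
+         uint8_t r = 0;
+         for (int i = 0; i < 8; i = i + 1) {
+             uint8_t p = (uint8_t)(A >> (8 * (7 - i))) & x;  /* A.byte[7-i] AND x */
+             p ^= p >> 4; p ^= p >> 2; p ^= p >> 1;          /* fold to parity bit */
+             r |= (uint8_t)(((p ^ (b >> i)) & 1) << i);
+         }
+         return r;
+     }
+-->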
+<intrinsic tech="Other" name="_mm256_maskz_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="x" etype="UI64"/>
+ <parameter type="__m256i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 3
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="ymm {z}, ymm, ymm, imm8" xed="VGF2P8AFFINEQB_YMMu8_MASKmskw_YMMu8_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_mask_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="x" etype="UI64"/>
+ <parameter type="__m256i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 3
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := src.qword[j].byte[i]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="ymm {k}, ymm, ymm, imm8" xed="VGF2P8AFFINEQB_YMMu8_MASKmskw_YMMu8_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="x" etype="UI64"/>
+ <parameter type="__m256i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst".</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 3
+ FOR i := 0 to 7
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="ymm, ymm, ymm, imm8" xed="VGF2P8AFFINEQB_YMMu8_MASKmskw_YMMu8_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_maskz_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="x" etype="UI64"/>
+ <parameter type="__m128i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 1
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="xmm {z}, xmm, xmm, imm8" xed="VGF2P8AFFINEQB_XMMu8_MASKmskw_XMMu8_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_mask_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="x" etype="UI64"/>
+ <parameter type="__m128i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 1
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := src.qword[j].byte[i]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="xmm {k}, xmm, xmm, imm8" xed="VGF2P8AFFINEQB_XMMu8_MASKmskw_XMMu8_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_gf2p8affine_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="x" etype="UI64"/>
+ <parameter type="__m128i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst".</description>
+ <operation>
+DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 1
+ FOR i := 0 to 7
+ dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b)
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEQB" form="xmm, xmm, xmm, imm8" xed="VGF2P8AFFINEQB_XMMu8_MASKmskw_XMMu8_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
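+<!-- One handy consequence of the affine_byte model: with the 8x8 bit matrix
+     whose qword encoding is 0x8040201008040201 (so A.byte[7-i] = 1 << (7-i))
+     and imm8 = 0, every byte of "x" is bit-reversed. A sketch; the constant is
+     derived from the byte layout above, and the function name is illustrative:
+
+     #include <immintrin.h>
+
+     __m128i reverse_bits_per_byte(__m128i x) {
+         const __m128i A = _mm_set1_epi64x((long long)0x8040201008040201ULL);
+         return _mm_gf2p8affine_epi64_epi8(x, A, 0);
+     }
+-->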
+<intrinsic tech="Other" name="_mm512_maskz_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="x" etype="UI64"/>
+ <parameter type="__m512i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 7
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="zmm {z}, zmm, zmm, imm8" xed="VGF2P8AFFINEINVQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_mask_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask64" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="x" etype="UI64"/>
+ <parameter type="__m512i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 7
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := src.qword[j].byte[i]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="zmm {k}, zmm, zmm, imm8" xed="VGF2P8AFFINEINVQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm512_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512F</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="x" etype="UI64"/>
+ <parameter type="__m512i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst".</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 7
+ FOR i := 0 to 7
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="zmm, zmm, zmm, imm8" xed="VGF2P8AFFINEINVQB_ZMMu8_MASKmskw_ZMMu8_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
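+<!-- The inverse() used by affine_inverse_byte is not expanded in this data. In
+     GF(2^8) under the 0x11B reduction polynomial it can be modeled as x^254
+     (Fermat's little theorem in the field's multiplicative group), with 0
+     mapping to 0. A slow scalar sketch, reusing the gf2p8mul_byte model given
+     with the VGF2P8MULB entries above:
+
+     static uint8_t gf_inverse(uint8_t x) {
+         uint8_t r = 1;
+         for (int i = 0; i < 254; i = i + 1)  /* r = x^254 = x^-1 for x != 0 */
+             r = gf2p8mul_byte(r, x);
+         return r;                            /* gf_inverse(0) == 0 */
+     }
+-->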
+<intrinsic tech="Other" name="_mm256_maskz_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="x" etype="UI64"/>
+ <parameter type="__m256i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 3
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="ymm {z}, ymm, ymm, imm8" xed="VGF2P8AFFINEINVQB_YMMu8_MASKmskw_YMMu8_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_mask_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="src" etype="UI64"/>
+ <parameter type="__mmask32" varname="k" etype="MASK"/>
+ <parameter type="__m256i" varname="x" etype="UI64"/>
+ <parameter type="__m256i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 3
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := src.qword[j].byte[i]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="ymm {k}, ymm, ymm, imm8" xed="VGF2P8AFFINEINVQB_YMMu8_MASKmskw_YMMu8_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m256i" varname="dst" etype="UI64"/>
+ <parameter type="__m256i" varname="x" etype="UI64"/>
+ <parameter type="__m256i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst".</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 3
+ FOR i := 0 to 7
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="ymm, ymm, ymm, imm8" xed="VGF2P8AFFINEINVQB_YMMu8_MASKmskw_YMMu8_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_maskz_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="x" etype="UI64"/>
+ <parameter type="__m128i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 1
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := 0
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="xmm {z}, xmm, xmm, imm8" xed="VGF2P8AFFINEINVQB_XMMu8_MASKmskw_XMMu8_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_mask_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="src" etype="UI64"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m128i" varname="x" etype="UI64"/>
+ <parameter type="__m128i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 1
+ FOR i := 0 to 7
+ IF k[j*8+i]
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ELSE
+ dst.qword[j].byte[i] := src.qword[j].byte[i]
+ FI
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="xmm {k}, xmm, xmm, imm8" xed="VGF2P8AFFINEINVQB_XMMu8_MASKmskw_XMMu8_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_gf2p8affineinv_epi64_epi8">
+ <type>Integer</type>
+ <CPUID>GFNI</CPUID>
+ <CPUID>AVX512VL</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="x" etype="UI64"/>
+ <parameter type="__m128i" varname="A" etype="UI64"/>
+ <parameter type="int" varname="b" etype="IMM" immwidth="8"/>
+ <description>Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst".</description>
+ <operation>DEFINE parity(x) {
+ t := 0
+ FOR i := 0 to 7
+ t := t XOR x.bit[i]
+ ENDFOR
+ RETURN t
+}
+DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) {
+ FOR i := 0 to 7
+ retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i]
+ ENDFOR
+ RETURN retbyte
+}
+FOR j := 0 TO 1
+ FOR i := 0 to 7
+ dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b)
+ ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="VGF2P8AFFINEINVQB" form="xmm, xmm, xmm, imm8" xed="VGF2P8AFFINEINVQB_XMMu8_MASKmskw_XMMu8_XMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
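+<!-- affine_inverse_byte has exactly the structure of the AES S-box (field
+     inversion followed by an affine map). A sketch evaluating the forward
+     S-box on every byte; the matrix constant is derived from the byte layout
+     above (A.byte[7-i] holds the row producing output bit i) applied to the
+     AES affine rows, with imm8 = 0x63, and the function name is illustrative:
+
+     #include <immintrin.h>
+
+     __m128i aes_sbox_bytes(__m128i x) {
+         const __m128i A = _mm_set1_epi64x((long long)0xF1E3C78F1F3E7CF8ULL);
+         return _mm_gf2p8affineinv_epi64_epi8(x, A, 0x63);
+     }
+-->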
+<intrinsic tech="Other" name="_invpcid">
+ <CPUID>INVPCID</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="type" etype="UI32"/>
+ <parameter type="void*" varname="descriptor" memwidth="128"/>
+ <description>Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by "descriptor" based on the invalidation type specified in "type".
+ The PCID "descriptor" is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved.
+ The types supported are:
+ 0) Individual-address invalidation: If "type" is 0, the logical processor invalidates mappings for a single linear address tagged with the PCID specified in "descriptor", except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs.
+ 1) Single-context invalidation: If "type" is 1, the logical processor invalidates all mappings tagged with the PCID specified in "descriptor" except global translations. In some cases, it may invalidate mappings for other PCIDs as well.
+ 2) All-context invalidation: If "type" is 2, the logical processor invalidates all mappings tagged with any PCID.
+ 3) All-context invalidation, retaining global translations: If "type" is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring "descriptor". The instruction may invalidate global translations as well.</description>
+ <operation>
+CASE type[1:0] OF
+0: // individual-address invalidation retaining global translations
+ OP_PCID := MEM[descriptor+11:descriptor]
+ ADDR := MEM[descriptor+127:descriptor+64]
+ BREAK
+1: // single PCID invalidation retaining globals
+ OP_PCID := MEM[descriptor+11:descriptor]
+ // invalidate all mappings tagged with OP_PCID except global translations
+ BREAK
+2: // all PCID invalidation
+ // invalidate all mappings tagged with any PCID
+ BREAK
+3: // all PCID invalidation retaining global translations
+ // invalidate all mappings tagged with any PCID except global translations
+ BREAK
+ESAC
+ </operation>
+ <instruction name="INVPCID" form="r32, m128" xed="INVPCID_GPR32_MEMdq"/>
+ <header>immintrin.h</header>
+</intrinsic>
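+<!-- _invpcid executes a privileged instruction (CPL 0), so it is only usable
+     from kernel-mode code. A hedged sketch of building the 16-byte descriptor
+     described above; the struct name and helper are illustrative:
+
+     #include <immintrin.h>
+     #include <stdint.h>
+
+     struct invpcid_desc {
+         uint64_t pcid;  /* bits 11:0 = PCID, bits 63:12 reserved (zero) */
+         uint64_t addr;  /* bits 127:64 = linear address */
+     };
+
+     static void flush_one_address(uint64_t pcid, uint64_t linear_addr) {
+         struct invpcid_desc d = { pcid & 0xFFF, linear_addr };
+         _invpcid(0, &d);  /* type 0: individual-address invalidation */
+     }
+-->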
+<intrinsic tech="KNC" name="_mm_prefetch">
+ <CPUID>KNCNI</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="char const*" varname="p" etype="UI8"/>
+ <parameter type="int" varname="i" etype="IMM" immwidth="2"/>
+ <description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
+ <instruction name="VPREFETCH0" form="m8"/>
+ <instruction name="VPREFETCH1" form="m8"/>
+ <instruction name="VPREFETCH2" form="m8"/>
+ <instruction name="VPREFETCHNTA" form="m8"/>
+ <instruction name="VPREFETCHE0" form="m8"/>
+ <instruction name="VPREFETCHE1" form="m8"/>
+ <instruction name="VPREFETCHE2" form="m8"/>
+ <instruction name="VPREFETCHENTA" form="m8"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kandn">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := (NOT a[15:0]) AND b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KANDN" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kand">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] AND b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KAND" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kmov">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <description>Copy 16-bit mask "a" to "k".</description>
+ <operation>
+k[15:0] := a[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KMOV" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_knot">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <description>Compute the bitwise NOT of 16-bit mask "a", and store the result in "k".</description>
+ <operation>
+k[15:0] := NOT a[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KNOT" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kor">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] OR b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KOR" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kxnor">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := NOT (a[15:0] XOR b[15:0])
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KXNOR" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kxor">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="a" etype="MASK"/>
+ <parameter type="__mmask16" varname="b" etype="MASK"/>
+ <description>Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+ <operation>
+k[15:0] := a[15:0] XOR b[15:0]
+k[MAX:16] := 0
+ </operation>
+ <instruction name="KXOR" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
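+<!-- The 16-bit mask operations above compose naturally. For example, selecting
+     lanes set in "a" but not in "b" (a sketch; intrinsics with these names
+     also exist under AVX-512F):
+
+     #include <immintrin.h>
+
+     __mmask16 in_a_not_b(__mmask16 a, __mmask16 b) {
+         return _mm512_kandn(b, a);  /* (NOT b) AND a */
+     }
+-->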
+<intrinsic tech="KNC" name="_mm512_cmplt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPLTD" form="k, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_cmplt_epi32_mask">
+ <type>Integer</type>
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Compare</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="SI32"/>
+ <parameter type="__m512i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ ELSE
+ k[j] := 0
+ FI
+ENDFOR
+k[MAX:16] := 0
+ </operation>
+ <instruction name="VPCMPLTD" form="k {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
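+<!-- A sketch combining the compare above with its masked variant: count lanes
+     where both a < b and c < d hold (assumes a GCC/Clang popcount builtin; the
+     function name is illustrative):
+
+     #include <immintrin.h>
+
+     int lanes_lt_both(__m512i a, __m512i b, __m512i c, __m512i d) {
+         __mmask16 k1 = _mm512_cmplt_epi32_mask(a, b);
+         __mmask16 k2 = _mm512_mask_cmplt_epi32_mask(k1, c, d); /* gated by k1 */
+         return __builtin_popcount((unsigned)k2);
+     }
+-->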
+<intrinsic tech="KNC" name="_mm512_extload_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="void const *" varname="mt" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="_MM_BROADCAST32_ENUM" varname="bc" etype="UI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to single-precision (32-bit) floating-point elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ CASE bc OF
+ _MM_BROADCAST32_NONE:
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ n := j*32
+ dst[i+31:i] := addr[n+31:n]
+ _MM_UPCONV_PS_FLOAT16:
+ n := j*16
+ dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n])
+ _MM_UPCONV_PS_UINT8:
+ n := j*8
+ dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n])
+ _MM_UPCONV_PS_SINT8:
+ n := j*8
+ dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n])
+ _MM_UPCONV_PS_UINT16:
+ n := j*16
+ dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n])
+ _MM_UPCONV_PS_SINT16:
+ n := j*16
+ dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n])
+ ESAC
+ _MM_BROADCAST_1X16:
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ n := j*32
+ dst[i+31:i] := addr[31:0]
+ _MM_UPCONV_PS_FLOAT16:
+ n := j*16
+ dst[i+31:i] := Convert_FP16_To_FP32(addr[15:0])
+ _MM_UPCONV_PS_UINT8:
+ n := j*8
+ dst[i+31:i] := Convert_UInt8_To_FP32(addr[7:0])
+ _MM_UPCONV_PS_SINT8:
+ n := j*8
+ dst[i+31:i] := Convert_Int8_To_FP32(addr[7:0])
+ _MM_UPCONV_PS_UINT16:
+ n := j*16
+ dst[i+31:i] := Convert_UInt16_To_FP32(addr[15:0])
+ _MM_UPCONV_PS_SINT16:
+ n := j*16
+ dst[i+31:i] := Convert_Int16_To_FP32(addr[15:0])
+ ESAC
+ _MM_BROADCAST_4X16:
+ mod := j%4
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ n := mod*32
+ dst[i+31:i] := addr[n+31:n]
+ _MM_UPCONV_PS_FLOAT16:
+ n := mod*16
+ dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n])
+ _MM_UPCONV_PS_UINT8:
+ n := mod*8
+ dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n])
+ _MM_UPCONV_PS_SINT8:
+ n := mod*8
+ dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n])
+ _MM_UPCONV_PS_UINT16:
+ n := mod*16
+ dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n])
+ _MM_UPCONV_PS_SINT16:
+ n := mod*16
+ dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n])
+ ESAC
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="zmm, m512" xed="VMOVAPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <instruction name="VBROADCASTF32X4" form="zmm, m512" xed="VBROADCASTF32X4_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <instruction name="VBROADCASTSS" form="zmm, m512" xed="VBROADCASTSS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
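+<!-- The extload family exists only on KNC (first-generation Xeon Phi). A
+     sketch of broadcasting four packed float16 values while upconverting to
+     fp32, per the enums above; _MM_HINT_NONE is assumed to be the toolchain's
+     temporal-load hint macro:
+
+     __m512 load_bcast_4xfp16(void const *mt) {
+         return _mm512_extload_ps(mt, _MM_UPCONV_PS_FLOAT16,
+                                  _MM_BROADCAST_4X16, _MM_HINT_NONE);
+     }
+-->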
+<intrinsic tech="KNC" name="_mm512_mask_extload_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="_MM_BROADCAST32_ENUM" varname="bc" etype="UI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to single-precision (32-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ CASE bc OF
+ _MM_BROADCAST32_NONE:
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ n := j*32
+ dst[i+31:i] := addr[n+31:n]
+ _MM_UPCONV_PS_FLOAT16:
+ n := j*16
+ dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n])
+ _MM_UPCONV_PS_UINT8:
+ n := j*8
+ dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n])
+ _MM_UPCONV_PS_SINT8:
+ n := j*8
+ dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n])
+ _MM_UPCONV_PS_UINT16:
+ n := j*16
+ dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n])
+ _MM_UPCONV_PS_SINT16:
+ n := j*16
+ dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n])
+ ESAC
+ _MM_BROADCAST_1X16:
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ n := j*32
+ dst[i+31:i] := addr[31:0]
+ _MM_UPCONV_PS_FLOAT16:
+ n := j*16
+ dst[i+31:i] := Convert_FP16_To_FP32(addr[15:0])
+ _MM_UPCONV_PS_UINT8:
+ n := j*8
+ dst[i+31:i] := Convert_UInt8_To_FP32(addr[7:0])
+ _MM_UPCONV_PS_SINT8:
+ n := j*8
+ dst[i+31:i] := Convert_Int8_To_FP32(addr[7:0])
+ _MM_UPCONV_PS_UINT16:
+ n := j*16
+ dst[i+31:i] := Convert_UInt16_To_FP32(addr[15:0])
+ _MM_UPCONV_PS_SINT16:
+ n := j*16
+ dst[i+31:i] := Convert_Int16_To_FP32(addr[15:0])
+ ESAC
+ _MM_BROADCAST_4X16:
+ mod := j%4
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ n := mod*32
+ dst[i+31:i] := addr[n+31:n]
+ _MM_UPCONV_PS_FLOAT16:
+ n := mod*16
+ dst[i+31:i] := Convert_FP16_To_FP32(addr[n+15:n])
+ _MM_UPCONV_PS_UINT8:
+ n := mod*8
+ dst[i+31:i] := Convert_UInt8_To_FP32(addr[n+7:n])
+ _MM_UPCONV_PS_SINT8:
+ n := mod*8
+ dst[i+31:i] := Convert_Int8_To_FP32(addr[n+7:n])
+ _MM_UPCONV_PS_UINT16:
+ n := mod*16
+ dst[i+31:i] := Convert_UInt16_To_FP32(addr[n+15:n])
+ _MM_UPCONV_PS_SINT16:
+ n := mod*16
+ dst[i+31:i] := Convert_Int16_To_FP32(addr[n+15:n])
+ ESAC
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPS" form="zmm {k}, m512" xed="VMOVAPS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <instruction name="VBROADCASTF32X4" form="zmm {k}, m512" xed="VBROADCASTF32X4_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <instruction name="VBROADCASTSS" form="zmm {k}, m512" xed="VBROADCASTSS_ZMMf32_MASKmskw_MEMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extload_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="void const *" varname="mt" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="_MM_BROADCAST32_ENUM" varname="bc" etype="UI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 32-bit integer elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ CASE bc OF
+ _MM_BROADCAST32_NONE:
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ n := j*32
+ dst[i+31:i] := addr[n+31:n]
+ _MM_UPCONV_EPI32_UINT8:
+ n := j*8
+ dst[i+31:i] := ZeroExtend32(addr[n+7:n])
+ _MM_UPCONV_EPI32_SINT8:
+ n := j*8
+ dst[i+31:i] := SignExtend32(addr[n+7:n])
+ _MM_UPCONV_EPI32_UINT16:
+ n := j*16
+ dst[i+31:i] := ZeroExtend32(addr[n+15:n])
+ _MM_UPCONV_EPI32_SINT16:
+ n := j*16
+ dst[i+31:i] := SignExtend32(addr[n+15:n])
+ ESAC
+ _MM_BROADCAST_1X16:
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ n := j*32
+ dst[i+31:i] := addr[31:0]
+ _MM_UPCONV_EPI32_UINT8:
+ n := j*8
+ dst[i+31:i] := ZeroExtend32(addr[7:0])
+ _MM_UPCONV_EPI32_SINT8:
+ n := j*8
+ dst[i+31:i] := SignExtend32(addr[7:0])
+ _MM_UPCONV_EPI32_UINT16:
+ n := j*16
+ dst[i+31:i] := ZeroExtend32(addr[15:0])
+ _MM_UPCONV_EPI32_SINT16:
+ n := j*16
+ dst[i+31:i] := SignExtend32(addr[15:0])
+ ESAC
+ _MM_BROADCAST_4X16:
+ mod := j%4
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ n := mod*32
+ dst[i+31:i] := addr[n+31:n]
+ _MM_UPCONV_EPI32_UINT8:
+ n := mod*8
+ dst[i+31:i] := ZeroExtend32(addr[n+7:n])
+ _MM_UPCONV_EPI32_SINT8:
+ n := mod*8
+ dst[i+31:i] := SignExtend32(addr[n+7:n])
+ _MM_UPCONV_EPI32_UINT16:
+ n := mod*16
+ dst[i+31:i] := ZeroExtend32(addr[n+15:n])
+ _MM_UPCONV_EPI32_SINT16:
+ n := mod*16
+ dst[i+31:i] := SignExtend32(addr[n+15:n])
+ ESAC
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="zmm, m512" xed="VMOVDQA32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <instruction name="VBROADCASTI32X4" form="zmm, m512" xed="VBROADCASTI32X4_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <instruction name="VPBROADCASTD" form="zmm, m512" xed="VPBROADCASTD_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extload_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="_MM_BROADCAST32_ENUM" varname="bc" etype="UI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 32-bit integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ CASE bc OF
+ _MM_BROADCAST32_NONE:
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ n := j*32
+ dst[i+31:i] := addr[n+31:n]
+ _MM_UPCONV_EPI32_UINT8:
+ n := j*8
+ dst[i+31:i] := ZeroExtend32(addr[n+7:n])
+ _MM_UPCONV_EPI32_SINT8:
+ n := j*8
+ dst[i+31:i] := SignExtend32(addr[n+7:n])
+ _MM_UPCONV_EPI32_UINT16:
+ n := j*16
+ dst[i+31:i] := ZeroExtend32(addr[n+15:n])
+ _MM_UPCONV_EPI32_SINT16:
+ n := j*16
+ dst[i+31:i] := SignExtend32(addr[n+15:n])
+ ESAC
+ _MM_BROADCAST_1X16:
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ n := j*32
+ dst[i+31:i] := addr[31:0]
+ _MM_UPCONV_EPI32_UINT8:
+ n := j*8
+ dst[i+31:i] := ZeroExtend32(addr[7:0])
+ _MM_UPCONV_EPI32_SINT8:
+ n := j*8
+ dst[i+31:i] := SignExtend32(addr[7:0])
+ _MM_UPCONV_EPI32_UINT16:
+ n := j*16
+ dst[i+31:i] := ZeroExtend32(addr[15:0])
+ _MM_UPCONV_EPI32_SINT16:
+ n := j*16
+ dst[i+31:i] := SignExtend32(addr[15:0])
+ ESAC
+ _MM_BROADCAST_4X16:
+ mod := j%4
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ n := mod*32
+ dst[i+31:i] := addr[n+31:n]
+ _MM_UPCONV_EPI32_UINT8:
+ n := mod*8
+ dst[i+31:i] := ZeroExtend32(addr[n+7:n])
+ _MM_UPCONV_EPI32_SINT8:
+ n := mod*8
+ dst[i+31:i] := SignExtend32(addr[n+7:n])
+ _MM_UPCONV_EPI32_UINT16:
+ n := mod*16
+ dst[i+31:i] := ZeroExtend32(addr[n+15:n])
+ _MM_UPCONV_EPI32_SINT16:
+ n := mod*16
+ dst[i+31:i] := SignExtend32(addr[n+15:n])
+ ESAC
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA32" form="zmm {k}, m512" xed="VMOVDQA32_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <instruction name="VBROADCASTI32X4" form="zmm {k}, m512" xed="VBROADCASTI32X4_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <instruction name="VPBROADCASTD" form="zmm {k}, m512" xed="VPBROADCASTD_ZMMu32_MASKmskw_MEMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extload_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="void const *" varname="mt" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="_MM_BROADCAST64_ENUM" varname="bc" etype="UI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to double-precision (64-bit) floating-point elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ CASE bc OF
+ _MM_BROADCAST64_NONE:
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ n := j*64
+ dst[i+63:i] := addr[n+63:n]
+ ESAC
+ _MM_BROADCAST_1X8:
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ n := j*64
+ dst[i+63:i] := addr[63:0]
+ ESAC
+ _MM_BROADCAST_4X8:
+ mod := j%4
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ n := mod*64
+ dst[i+63:i] := addr[n+63:n]
+ ESAC
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="zmm, m512" xed="VMOVAPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <instruction name="VBROADCASTF64X4" form="zmm, m512" xed="VBROADCASTF64X4_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <instruction name="VBROADCASTSD" form="zmm, m512" xed="VBROADCASTSD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extload_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="_MM_BROADCAST64_ENUM" varname="bc" etype="UI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ CASE bc OF
+ _MM_BROADCAST64_NONE:
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ n := j*64
+ dst[i+63:i] := addr[n+63:n]
+ ESAC
+ _MM_BROADCAST_1X8:
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ n := j*64
+ dst[i+63:i] := addr[63:0]
+ ESAC
+ _MM_BROADCAST_4X8:
+ mod := j%4
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ n := mod*64
+ dst[i+63:i] := addr[n+63:n]
+ ESAC
+ ESAC
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVAPD" form="zmm {k}, m512" xed="VMOVAPD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <instruction name="VBROADCASTF64X4" form="zmm {k}, m512" xed="VBROADCASTF64X4_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <instruction name="VBROADCASTSD" form="zmm {k}, m512" xed="VBROADCASTSD_ZMMf64_MASKmskw_MEMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
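+<!-- Editorial sketch: the masked variant merges loaded lanes with "src".
+     Hedged: assumes a KNC toolchain and the KNC hint constant _MM_HINT_NONE;
+     the signature follows the parameter list above.
+
+     #include <immintrin.h>
+
+     /* Lanes whose bit in k is clear keep their value from src; the rest
+        are loaded (here with no up-conversion and no broadcast). */
+     __m512d load_merge(__m512d src, __mmask8 k, const void *p) {
+         return _mm512_mask_extload_pd(src, k, p, _MM_UPCONV_PD_NONE,
+                                       _MM_BROADCAST64_NONE, _MM_HINT_NONE);
+     }
+-->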
+<intrinsic tech="KNC" name="_mm512_extload_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="void const *" varname="mt" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="_MM_BROADCAST64_ENUM" varname="bc" etype="UI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 64-bit integer elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ CASE bc OF
+ _MM_BROADCAST64_NONE:
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ n := j*64
+ dst[i+63:i] := addr[n+63:n]
+ ESAC
+ _MM_BROADCAST_1X8:
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ n := j*64
+ dst[i+63:i] := addr[63:0]
+ ESAC
+ _MM_BROADCAST_4X8:
+ mod := j%4
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ n := mod*64
+ dst[i+63:i] := addr[n+63:n]
+ ESAC
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="zmm, m512" xed="VMOVDQA64_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <instruction name="VBROADCASTI64X4" form="zmm, m512" xed="VBROADCASTI64X4_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <instruction name="VPBROADCASTQ" form="zmm, m512" xed="VPBROADCASTQ_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extload_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="_MM_BROADCAST64_ENUM" varname="bc" etype="UI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 64-bit integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ CASE bc OF
+ _MM_BROADCAST64_NONE:
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ n := j*64
+ dst[i+63:i] := addr[n+63:n]
+ ESAC
+ _MM_BROADCAST_1X8:
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ n := j*64
+ dst[i+63:i] := addr[63:0]
+ ESAC
+ _MM_BROADCAST_4X8:
+ mod := j%4
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ n := mod*64
+ dst[i+63:i] := addr[n+63:n]
+ ESAC
+ ESAC
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VMOVDQA64" form="zmm {k}, m512" xed="VMOVDQA64_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <instruction name="VBROADCASTI64X4" form="zmm {k}, m512" xed="VBROADCASTI64X4_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <instruction name="VPBROADCASTQ" form="zmm {k}, m512" xed="VPBROADCASTQ_ZMMu64_MASKmskw_MEMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_swizzle_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v" etype="FP32"/>
+ <parameter type="_MM_SWIZZLE_ENUM" varname="s" etype="UI32"/>
+ <description>Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst".</description>
+ <operation>CASE s OF
+_MM_SWIZ_REG_NONE:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+ FOR j := 0 to 7
+ i := j*64
+ dst[i+31:i] := v[i+63:i+32]
+ dst[i+63:i+32] := v[i+31:i]
+ ENDFOR
+_MM_SWIZ_REG_BADC:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+95:i+64]
+ dst[i+63:i+32] := v[i+127:i+96]
+ dst[i+95:i+64] := v[i+31:i]
+ dst[i+127:i+96] := v[i+63:i+32]
+ ENDFOR
+_MM_SWIZ_REG_AAAA:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+31:i]
+ dst[i+63:i+32] := v[i+31:i]
+ dst[i+95:i+64] := v[i+31:i]
+ dst[i+127:i+96] := v[i+31:i]
+ ENDFOR
+_MM_SWIZ_REG_BBBB:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+63:i+32]
+ dst[i+63:i+32] := v[i+63:i+32]
+ dst[i+95:i+64] := v[i+63:i+32]
+ dst[i+127:i+96] := v[i+63:i+32]
+ ENDFOR
+_MM_SWIZ_REG_CCCC:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+95:i+64]
+ dst[i+63:i+32] := v[i+95:i+64]
+ dst[i+95:i+64] := v[i+95:i+64]
+ dst[i+127:i+96] := v[i+95:i+64]
+ ENDFOR
+_MM_SWIZ_REG_DDDD:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+127:i+96]
+ dst[i+63:i+32] := v[i+127:i+96]
+ dst[i+95:i+64] := v[i+127:i+96]
+ dst[i+127:i+96] := v[i+127:i+96]
+ ENDFOR
+_MM_SWIZ_REG_DACB:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+63:i+32]
+ dst[i+63:i+32] := v[i+95:i+64]
+ dst[i+95:i+64] := v[i+31:i]
+ dst[i+127:i+96] := v[i+127:i+96]
+ ENDFOR
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
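+<!-- Editorial sketch: the CDAB pattern swaps adjacent elements within each
+     4-element group, so {a,b,c,d} becomes {b,a,d,c}. Hedged: assumes a KNC
+     toolchain; the enum value is the one documented above.
+
+     #include <immintrin.h>
+
+     /* Swap each pair of neighbouring FP32 lanes. */
+     __m512 swap_pairs(__m512 v) {
+         return _mm512_swizzle_ps(v, _MM_SWIZ_REG_CDAB);
+     }
+-->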
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_swizzle_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="v" etype="FP64"/>
+ <parameter type="_MM_SWIZZLE_ENUM" varname="s" etype="UI64"/>
+ <description>Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst".</description>
+ <operation>CASE s OF
+_MM_SWIZ_REG_NONE:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+63:i] := v[i+127:i+64]
+ dst[i+127:i+64] := v[i+63:i]
+ ENDFOR
+_MM_SWIZ_REG_BADC:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+191:i+128]
+ dst[i+127:i+64] := v[i+255:i+192]
+ dst[i+191:i+128] := v[i+63:i]
+ dst[i+255:i+192] := v[i+127:i+64]
+ ENDFOR
+_MM_SWIZ_REG_AAAA:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+63:i]
+ dst[i+127:i+64] := v[i+63:i]
+ dst[i+191:i+128] := v[i+63:i]
+ dst[i+255:i+192] := v[i+63:i]
+ ENDFOR
+_MM_SWIZ_REG_BBBB:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+127:i+64]
+ dst[i+127:i+64] := v[i+127:i+64]
+ dst[i+191:i+128] := v[i+127:i+64]
+ dst[i+255:i+192] := v[i+127:i+64]
+ ENDFOR
+_MM_SWIZ_REG_CCCC:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+191:i+128]
+ dst[i+127:i+64] := v[i+191:i+128]
+ dst[i+191:i+128] := v[i+191:i+128]
+ dst[i+255:i+192] := v[i+191:i+128]
+ ENDFOR
+_MM_SWIZ_REG_DDDD:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+255:i+192]
+ dst[i+127:i+64] := v[i+255:i+192]
+ dst[i+191:i+128] := v[i+255:i+192]
+ dst[i+255:i+192] := v[i+255:i+192]
+ ENDFOR
+_MM_SWIZ_REG_DACB:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+127:i+64]
+ dst[i+127:i+64] := v[i+191:i+128]
+ dst[i+191:i+128] := v[i+63:i]
+ dst[i+255:i+192] := v[i+255:i+192]
+ ENDFOR
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_swizzle_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v" etype="UI32"/>
+ <parameter type="_MM_SWIZZLE_ENUM" varname="s" etype="UI32"/>
+ <description>Performs a swizzle transformation of each of the four groups of packed 4x 32-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst".</description>
+ <operation>CASE s OF
+_MM_SWIZ_REG_NONE:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+ FOR j := 0 to 7
+ i := j*64
+ dst[i+31:i] := v[i+63:i+32]
+ dst[i+63:i+32] := v[i+31:i]
+ ENDFOR
+_MM_SWIZ_REG_BADC:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+95:i+64]
+ dst[i+63:i+32] := v[i+127:i+96]
+ dst[i+95:i+64] := v[i+31:i]
+ dst[i+127:i+96] := v[i+63:i+32]
+ ENDFOR
+_MM_SWIZ_REG_AAAA:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+31:i]
+ dst[i+63:i+32] := v[i+31:i]
+ dst[i+95:i+64] := v[i+31:i]
+ dst[i+127:i+96] := v[i+31:i]
+ ENDFOR
+_MM_SWIZ_REG_BBBB:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+63:i+32]
+ dst[i+63:i+32] := v[i+63:i+32]
+ dst[i+95:i+64] := v[i+63:i+32]
+ dst[i+127:i+96] := v[i+63:i+32]
+ ENDFOR
+_MM_SWIZ_REG_CCCC:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+95:i+64]
+ dst[i+63:i+32] := v[i+95:i+64]
+ dst[i+95:i+64] := v[i+95:i+64]
+ dst[i+127:i+96] := v[i+95:i+64]
+ ENDFOR
+_MM_SWIZ_REG_DDDD:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+127:i+96]
+ dst[i+63:i+32] := v[i+127:i+96]
+ dst[i+95:i+64] := v[i+127:i+96]
+ dst[i+127:i+96] := v[i+127:i+96]
+ ENDFOR
+_MM_SWIZ_REG_DACB:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+31:i] := v[i+63:i+32]
+ dst[i+63:i+32] := v[i+95:i+64]
+ dst[i+95:i+64] := v[i+31:i]
+ dst[i+127:i+96] := v[i+127:i+96]
+ ENDFOR
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_swizzle_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="v" etype="UI64"/>
+ <parameter type="_MM_SWIZZLE_ENUM" varname="s" etype="UI64"/>
+ <description>Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst".</description>
+ <operation>CASE s OF
+_MM_SWIZ_REG_NONE:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+ FOR j := 0 to 3
+ i := j*128
+ dst[i+63:i] := v[i+127:i+64]
+ dst[i+127:i+64] := v[i+63:i]
+ ENDFOR
+_MM_SWIZ_REG_BADC:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+191:i+128]
+ dst[i+127:i+64] := v[i+255:i+192]
+ dst[i+191:i+128] := v[i+63:i]
+ dst[i+255:i+192] := v[i+127:i+64]
+ ENDFOR
+_MM_SWIZ_REG_AAAA:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+63:i]
+ dst[i+127:i+64] := v[i+63:i]
+ dst[i+191:i+128] := v[i+63:i]
+ dst[i+255:i+192] := v[i+63:i]
+ ENDFOR
+_MM_SWIZ_REG_BBBB:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+127:i+64]
+ dst[i+127:i+64] := v[i+127:i+64]
+ dst[i+191:i+128] := v[i+127:i+64]
+ dst[i+255:i+192] := v[i+127:i+64]
+ ENDFOR
+_MM_SWIZ_REG_CCCC:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+191:i+128]
+ dst[i+127:i+64] := v[i+191:i+128]
+ dst[i+191:i+128] := v[i+191:i+128]
+ dst[i+255:i+192] := v[i+191:i+128]
+ ENDFOR
+_MM_SWIZ_REG_DDDD:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+255:i+192]
+ dst[i+127:i+64] := v[i+255:i+192]
+ dst[i+191:i+128] := v[i+255:i+192]
+ dst[i+255:i+192] := v[i+255:i+192]
+ ENDFOR
+_MM_SWIZ_REG_DACB:
+ FOR j := 0 to 1
+ i := j*256
+ dst[i+63:i] := v[i+127:i+64]
+ dst[i+127:i+64] := v[i+191:i+128]
+ dst[i+191:i+128] := v[i+63:i]
+ dst[i+255:i+192] := v[i+255:i+192]
+ ENDFOR
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_swizzle_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v" etype="FP32"/>
+ <parameter type="_MM_SWIZZLE_ENUM" varname="s" etype="UI32"/>
+ <description>Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>CASE s OF
+_MM_SWIZ_REG_NONE:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+ FOR j := 0 to 7
+ i := j*64
+ IF k[j*2]
+ dst[i+31:i] := v[i+63:i+32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*2+1]
+ dst[i+63:i+32] := v[i+31:i]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_BADC:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+95:i+64]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+127:i+96]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+31:i]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+63:i+32]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_AAAA:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+31:i]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+31:i]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+31:i]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_BBBB:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+63:i+32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+63:i+32]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+63:i+32]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+63:i+32]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_CCCC:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+95:i+64]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+95:i+64]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+95:i+64]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+95:i+64]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_DDDD:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+127:i+96]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+127:i+96]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+127:i+96]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+127:i+96]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_DACB:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+63:i+32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+95:i+64]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+31:i]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+127:i+96]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_swizzle_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v" etype="FP64"/>
+ <parameter type="_MM_SWIZZLE_ENUM" varname="s" etype="UI64"/>
+ <description>Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>CASE s OF
+_MM_SWIZ_REG_NONE:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*2]
+ dst[i+63:i] := v[i+127:i+64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*2+1]
+ dst[i+127:i+64] := v[i+63:i]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_BADC:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+191:i+128]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+255:i+192]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+63:i]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+127:i+64]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_AAAA:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+63:i]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+63:i]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+63:i]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_BBBB:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+127:i+64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+127:i+64]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+127:i+64]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+127:i+64]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_CCCC:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+191:i+128]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+191:i+128]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+191:i+128]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+191:i+128]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_DDDD:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+255:i+192]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+255:i+192]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+255:i+192]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+255:i+192]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_DACB:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+127:i+64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+191:i+128]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+63:i]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+255:i+192]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_swizzle_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v" etype="UI32"/>
+ <parameter type="_MM_SWIZZLE_ENUM" varname="s" etype="UI32"/>
+ <description>Performs a swizzle transformation of each of the four groups of packed 4x32-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>CASE s OF
+_MM_SWIZ_REG_NONE:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+ FOR j := 0 to 7
+ i := j*64
+ IF k[j*2]
+ dst[i+31:i] := v[i+63:i+32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*2+1]
+ dst[i+63:i+32] := v[i+31:i]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_BADC:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+95:i+64]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+127:i+96]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+31:i]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+63:i+32]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_AAAA:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+31:i]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+31:i]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+31:i]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_BBBB:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+63:i+32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+63:i+32]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+63:i+32]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+63:i+32]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_CCCC:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+95:i+64]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+95:i+64]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+95:i+64]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+95:i+64]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_DDDD:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+127:i+96]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+127:i+96]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+127:i+96]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+127:i+96]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_DACB:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*4]
+ dst[i+31:i] := v[i+63:i+32]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ IF k[j*4+1]
+ dst[i+63:i+32] := v[i+95:i+64]
+ ELSE
+ dst[i+63:i+32] := src[i+63:i+32]
+ FI
+ IF k[j*4+2]
+ dst[i+95:i+64] := v[i+31:i]
+ ELSE
+ dst[i+95:i+64] := src[i+95:i+64]
+ FI
+ IF k[j*4+3]
+ dst[i+127:i+96] := v[i+127:i+96]
+ ELSE
+ dst[i+127:i+96] := src[i+127:i+96]
+ FI
+ ENDFOR
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
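+<!-- Editorial sketch: a masked swizzle, broadcasting lane A of every 4-lane
+     group only into mask-selected lanes. Hedged: assumes a KNC toolchain;
+     enum and signature follow the definitions above.
+
+     #include <immintrin.h>
+
+     /* Unselected lanes keep their value from src. */
+     __m512i splat_a_masked(__m512i src, __mmask16 k, __m512i v) {
+         return _mm512_mask_swizzle_epi32(src, k, v, _MM_SWIZ_REG_AAAA);
+     }
+-->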
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_swizzle_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v" etype="UI64"/>
+ <parameter type="_MM_SWIZZLE_ENUM" varname="s" etype="UI64"/>
+ <description>Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>CASE s OF
+_MM_SWIZ_REG_NONE:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+ dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+ FOR j := 0 to 3
+ i := j*128
+ IF k[j*2]
+ dst[i+63:i] := v[i+127:i+64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*2+1]
+ dst[i+127:i+64] := v[i+63:i]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_BADC:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+191:i+128]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+255:i+192]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+63:i]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+127:i+64]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_AAAA:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+63:i]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+63:i]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+63:i]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_BBBB:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+127:i+64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+127:i+64]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+127:i+64]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+127:i+64]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_CCCC:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+191:i+128]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+191:i+128]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+191:i+128]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+191:i+128]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_DDDD:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+255:i+192]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+255:i+192]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+255:i+192]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+255:i+192]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+_MM_SWIZ_REG_DACB:
+ FOR j := 0 to 1
+ i := j*256
+ IF k[j*4]
+ dst[i+63:i] := v[i+127:i+64]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ IF k[j*4+1]
+ dst[i+127:i+64] := v[i+191:i+128]
+ ELSE
+ dst[i+127:i+64] := src[i+127:i+64]
+ FI
+ IF k[j*4+2]
+ dst[i+191:i+128] := v[i+63:i]
+ ELSE
+ dst[i+191:i+128] := src[i+191:i+128]
+ FI
+ IF k[j*4+3]
+ dst[i+255:i+192] := v[i+255:i+192]
+ ELSE
+ dst[i+255:i+192] := src[i+255:i+192]
+ FI
+ ENDFOR
+ESAC
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extstore_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP32"/>
+ <parameter type="__m512" varname="v" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Downconverts packed single-precision (32-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ CASE conv OF
+ _MM_DOWNCONV_PS_NONE:
+ addr[i+31:i] := v[i+31:i]
+ _MM_DOWNCONV_PS_FLOAT16:
+ n := j*16
+ addr[n+15:n] := Convert_FP32_To_FP16(v[i+31:i])
+ _MM_DOWNCONV_PS_UINT8:
+ n := j*8
+ addr[n+7:n] := Convert_FP32_To_UInt8(v[i+31:i])
+ _MM_DOWNCONV_PS_SINT8:
+ n := j*8
+ addr[n+7:n] := Convert_FP32_To_Int8(v[i+31:i])
+ _MM_DOWNCONV_PS_UINT16:
+ n := j*16
+ addr[n+15:n] := Convert_FP32_To_UInt16(v[i+31:i])
+ _MM_DOWNCONV_PS_SINT16:
+ n := j*16
+ addr[n+15:n] := Convert_FP32_To_Int16(v[i+31:i])
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPS" form="m512, zmm" xed="VMOVAPS_MEMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
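+<!-- Editorial sketch: a down-converting store. With _MM_DOWNCONV_PS_FLOAT16
+     the 16 FP32 lanes are written as 16 float16 values (256 bits at "mt").
+     Hedged: assumes a KNC toolchain and the KNC hint constant _MM_HINT_NONE.
+
+     #include <immintrin.h>
+
+     /* Store v as packed float16. */
+     void store_f16(void *mt, __m512 v) {
+         _mm512_extstore_ps(mt, v, _MM_DOWNCONV_PS_FLOAT16, _MM_HINT_NONE);
+     }
+-->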
+<intrinsic tech="KNC" name="_mm512_extstore_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI32"/>
+ <parameter type="__m512i" varname="v" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Downconverts packed 32-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ CASE conv OF
+ _MM_DOWNCONV_EPI32_NONE:
+ addr[i+31:i] := v[i+31:i]
+ _MM_DOWNCONV_EPI32_UINT8:
+ n := j*8
+ addr[n+7:n] := Int32ToUInt8(v[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT8:
+ n := j*8
+ addr[n+7:n] := Int32ToSInt8(v[i+31:i])
+ _MM_DOWNCONV_EPI32_UINT16:
+ n := j*16
+ addr[n+15:n] := Int32ToUInt16(v[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT16:
+ n := j*16
+ addr[n+15:n] := Int32ToSInt16(v[i+31:i])
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA32" form="m512, zmm" xed="VMOVDQA32_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extstore_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP64"/>
+ <parameter type="__m512d" varname="v" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Downconverts packed double-precision (64-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ CASE conv OF
+ _MM_DOWNCONV_PD_NONE:
+ addr[i+63:i] := v[i+63:i]
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPD" form="m512, zmm" xed="VMOVAPD_MEMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extstore_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI64"/>
+ <parameter type="__m512i" varname="v" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Downconverts packed 64-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ CASE conv OF
+ _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i]
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA64" form="m512, zmm" xed="VMOVDQA64_MEMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extstore_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Downconverts packed single-precision (32-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" using writemask "k" (elements are not written to memory when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_PS_NONE:
+ mt[i+31:i] := v[i+31:i]
+ _MM_DOWNCONV_PS_FLOAT16:
+ n := j*16
+ mt[n+15:n] := Convert_FP32_To_FP16(v[i+31:i])
+ _MM_DOWNCONV_PS_UINT8:
+ n := j*8
+ mt[n+7:n] := Convert_FP32_To_UInt8(v[i+31:i])
+ _MM_DOWNCONV_PS_SINT8:
+ n := j*8
+ mt[n+7:n] := Convert_FP32_To_Int8(v[i+31:i])
+ _MM_DOWNCONV_PS_UINT16:
+ n := j*16
+ mt[n+15:n] := Convert_FP32_To_UInt16(v[i+31:i])
+ _MM_DOWNCONV_PS_SINT16:
+ n := j*16
+ mt[n+15:n] := Convert_FP32_To_Int16(v[i+31:i])
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPS" form="m512 {k}, zmm" xed="VMOVAPS_MEMf32_MASKmskw_ZMMf32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extstore_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Downconverts packed double-precision (64-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ CASE conv OF
+ _MM_DOWNCONV_PD_NONE:
+ IF k[j]
+ addr[i+63:i] := v[i+63:i]
+ FI
+ ESAC
+ENDFOR
+ </operation>
+ <instruction name="VMOVAPD" form="m512 {k}, zmm" xed="VMOVAPD_MEMf64_MASKmskw_ZMMf64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extstore_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Downconverts packed 32-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_EPI32_NONE:
+ addr[i+31:i] := v[i+31:i]
+ _MM_DOWNCONV_EPI32_UINT8:
+ n := j*8
+ addr[n+7:n] := Int32ToUInt8(v[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT8:
+ n := j*8
+ addr[n+7:n] := Int32ToSInt8(v[i+31:i])
+ _MM_DOWNCONV_EPI32_UINT16:
+ n := j*16
+ addr[n+15:n] := Int32ToUInt16(v[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT16:
+ n := j*16
+ addr[n+15:n] := Int32ToSInt16(v[i+31:i])
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA32" form="m512 {k}, zmm" xed="VMOVDQA32_MEMu32_MASKmskw_ZMMu32_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extstore_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Downconverts packed 64-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i]
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VMOVDQA64" form="m512 {k}, zmm" xed="VMOVDQA64_MEMu64_MASKmskw_ZMMu64_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_storenr_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="v" etype="FP32"/>
+ <description>Stores packed single-precision (32-bit) floating-point elements from "v" to memory address "mt" with a no-read hint to the processor.</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ addr[i+31:i] := v[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VMOVNRAPS" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_storenr_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="v" etype="FP64"/>
+ <description>Stores packed double-precision (64-bit) floating-point elements from "v" to memory address "mt" with a no-read hint to the processor.</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ addr[i+63:i] := v[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VMOVNRAPD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_storenrngo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="v" etype="FP32"/>
+ <description>Stores packed single-precision (32-bit) floating-point elements from "v" to memory address "mt" with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them).</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 15
+ i := j*32
+ addr[i+31:i] := v[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="VMOVNRNGOAPS" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_storenrngo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="v" etype="FP64"/>
+ <description>Stores packed double-precision (64-bit) floating-point elements from "v" to memory address "mt" with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them).</description>
+ <operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+ i := j*64
+ addr[i+63:i] := v[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VMOVNRNGOAPD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
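+<!-- Editorial sketch: the no-read ("nr") stores overwrite a whole cache line
+     without first reading it, and the "ngo" forms additionally relax store
+     ordering, so some fence is required before another thread may observe
+     the data. Hedged: assumes a KNC toolchain; the required barrier is
+     target-specific and deliberately not named here.
+
+     #include <immintrin.h>
+
+     void stream_line(void *dst, __m512d v) {
+         _mm512_storenrngo_pd(dst, v);
+         /* a suitable memory barrier must execute here before
+            publishing dst to another thread */
+     }
+-->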
+<intrinsic tech="KNC" name="_mm512_adc_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="k2_res" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element addition of packed 32-bit integers in "v2" and "v3" and the corresponding bit in "k2", storing the result of the addition in "dst" and the result of the carry in "k2_res".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADCD" form="zmm, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_adc_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="k2_res" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element addition of packed 32-bit integers in "v2" and "v3" and the corresponding bit in "k2", storing the result of the addition in "dst" and the result of the carry in "k2_res" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
+ ELSE
+ dst[i+31:i] := v2[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADCD" form="zmm {k}, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
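+<!-- Editorial sketch: chaining the carry mask turns the 32-bit lanes into
+     wider accumulators. Hedged: assumes a KNC toolchain and the helper
+     _mm512_int2mask for building the zero carry-in mask.
+
+     #include <immintrin.h>
+
+     /* 16 parallel 64-bit additions held as {lo, hi} 32-bit limb pairs:
+        the carry produced by the low limbs feeds the high-limb add. */
+     void add_u64_limbs(__m512i alo, __m512i ahi, __m512i blo, __m512i bhi,
+                        __m512i *rlo, __m512i *rhi) {
+         __mmask16 carry = _mm512_int2mask(0);  /* carry-in = 0 (helper assumed) */
+         *rlo = _mm512_adc_epi32(alo, carry, blo, &carry);
+         *rhi = _mm512_adc_epi32(ahi, carry, bhi, &carry);
+     }
+-->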
+<intrinsic tech="KNC" name="_mm512_addn_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512d" varname="v3" etype="FP64"/>
+ <description>Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDNPD" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_addn_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512d" varname="v3" etype="FP64"/>
+ <description>Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDNPD" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_addn_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <description>Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDNPS" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
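+<!-- Editorial sketch: addn fuses the negation into the add, saving the
+     separate sign flip that -(a+b) would otherwise need. Hedged: assumes
+     a KNC toolchain.
+
+     #include <immintrin.h>
+
+     __m512 neg_sum(__m512 a, __m512 b) {
+         return _mm512_addn_ps(a, b);  /* per lane: -(a + b) */
+     }
+-->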
+<intrinsic tech="KNC" name="_mm512_mask_addn_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <description>Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDNPS" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_addn_round_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512d" varname="v3" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDNPD" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_addn_round_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512d" varname="v3" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDNPD" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_addn_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDNPS" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_addn_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDNPS" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_subr_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512d" varname="v3" etype="FP64"/>
+ <description>Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBRPD" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_subr_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512d" varname="v3" etype="FP64"/>
+ <description>Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBRPD" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_subr_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <description>Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBRPS" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_subr_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <description>Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBRPS" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_subr_round_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512d" varname="v3" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBRPD" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_subr_round_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512d" varname="v3" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBRPD" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_subr_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBRPS" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_subr_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSUBRPS" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_subr_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <description>Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3" storing the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBRD" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
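+<!-- Usage sketch (illustrative, hypothetical names): the reverse of
+     _mm512_sub_epi32(a, b).
+
+__m512i subr_demo(__m512i a, __m512i b) {
+    return _mm512_subr_epi32(a, b);  // per lane: b[j] - a[j]
+}
+-->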
+<intrinsic tech="KNC" name="_mm512_mask_subr_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <description>Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBRD" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_addsetc_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="k2_res" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the resultant carry in "k2_res" (carry flag) and the addition results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+ k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSETCD" form="zmm, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_addsetc_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k_old" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="k2_res" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the resultant carry in "k2_res" (carry flag) and the addition results in "dst" using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+ ELSE
+ dst[i+31:i] := v2[i+31:i]
+ k2_res[j] := k_old[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSETCD" form="zmm {k}, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
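+<!-- Usage sketch (illustrative): capturing the per-lane carry-out of a
+     32-bit addition.
+
+__m512i add_carry_demo(__m512i a, __m512i b, __mmask16 *carry) {
+    // sum[j] = a[j] + b[j]; bit j of *carry is set when lane j wraps around
+    return _mm512_addsetc_epi32(a, b, carry);
+}
+-->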
+<intrinsic tech="KNC" name="_mm512_addsets_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="sign" etype="MASK" memwidth="16"/>
+ <description>Performs an element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+ sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSETSD" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_addsets_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="sign" etype="MASK" memwidth="16"/>
+ <description>Performs an element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+ sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPADDSETSD" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
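+<!-- Usage sketch (illustrative): the sign flag produced alongside the sum.
+
+__m512i add_sign_demo(__m512i a, __m512i b, __mmask16 *sign) {
+    // per the operation pseudocode above, bit j of *sign is set when both
+    // addends of lane j have their sign bit set
+    return _mm512_addsets_epi32(a, b, sign);
+}
+-->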
+<intrinsic tech="KNC" name="_mm512_addsets_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <parameter type="__mmask16 *" varname="sign" etype="MASK" memwidth="16"/>
+ <description>Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+ sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDSETSPS" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_addsets_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <parameter type="__mmask16 *" varname="sign" etype="MASK" memwidth="16"/>
+ <description>Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+ sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDSETSPS" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_addsets_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <parameter type="__mmask16 *" varname="sign" etype="MASK" memwidth="16"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+ sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDSETSPS" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_addsets_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512" varname="v3" etype="FP32"/>
+ <parameter type="__mmask16 *" varname="sign" etype="MASK" memwidth="16"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+ sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VADDSETSPS" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_subsetb_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="borrow" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element subtraction of packed 32-bit integer elements in "v3" from "v2", storing the results in "dst" and the nth borrow bit in the nth position of "borrow" (borrow flag).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v2[i+31:i] - v3[i+31:i]
+ borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBSETBD" form="zmm, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_subsetb_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k_old" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="borrow" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element subtraction of packed 32-bit integer elements in "v3" from "v2", storing the results in "dst" and the nth borrow bit in the nth position of "borrow" (borrow flag). Results are stored using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := v2[i+31:i] - v3[i+31:i]
+ borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i])
+ ELSE
+ dst[i+31:i] := v2[i+31:i]
+ borrow[j] := k_old[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBSETBD" form="zmm {k}, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
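+<!-- Usage sketch (illustrative): borrow-out of an unsigned 32-bit subtract.
+
+__m512i sub_borrow_demo(__m512i a, __m512i b, __mmask16 *borrow) {
+    // diff[j] = a[j] - b[j]; bit j of *borrow is set when a[j] < b[j]
+    // as unsigned values
+    return _mm512_subsetb_epi32(a, b, borrow);
+}
+-->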
+<intrinsic tech="KNC" name="_mm512_subrsetb_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="borrow" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3", storing the results in "dst" and "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i])
+ v2[i+31:i] := dst[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBRSETBD" form="zmm, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_subrsetb_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k_old" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="borrow" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3", storing the results in "dst" and "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are written using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ diff := v3[i+31:i] - v2[i+31:i]
+ borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i])
+ dst[i+31:i] := diff
+ v2[i+31:i] := diff
+ ELSE
+ dst[i+31:i] := v2[i+31:i]
+ borrow[j] := k_old[j]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSUBRSETBD" form="zmm {k}, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_sbb_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="borrow" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v3" as well as the corresponding bit from "k" from "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k[j]
+ borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k[j])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSBBD" form="zmm, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_sbb_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="borrow" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v3" as well as the corresponding bit from "k2" from "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k2[j]
+ borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k2[j])
+ ELSE
+ dst[i+31:i] := v2[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSBBD" form="zmm {k}, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
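+<!-- Usage sketch (illustrative): chaining subsetb/sbb to subtract per-lane
+     64-bit values split into 32-bit halves across two vectors.
+
+void sub64_demo(__m512i a_lo, __m512i a_hi, __m512i b_lo, __m512i b_hi,
+                __m512i *lo, __m512i *hi) {
+    __mmask16 borrow;
+    *lo = _mm512_subsetb_epi32(a_lo, b_lo, &borrow);      // low halves
+    *hi = _mm512_sbb_epi32(a_hi, borrow, b_hi, &borrow);  // high halves minus
+                                                          // the borrow bit
+}
+-->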
+<intrinsic tech="KNC" name="_mm512_sbbr_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="borrow" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v2" as well as the corresponding bit from "k" from "v3". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k[j]
+ borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k[j])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSBBRD" form="zmm, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_sbbr_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <parameter type="__mmask16 *" varname="borrow" etype="MASK" memwidth="16"/>
+ <description>Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v2" as well as the corresponding bit from "k2" from "v3". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k1[j]
+ dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k2[j]
+ borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k2[j])
+ ELSE
+ dst[i+31:i] := v2[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPSBBRD" form="zmm {k}, k, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_cvt_roundpd_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed single-precision (32-bit) floating-point elements, storing the results in "dst". Results are written to the lower half of "dst", and the upper half locations are set to '0'.
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k := j*32
+ dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i])
+ENDFOR
+dst[511:256] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_cvt_roundpd_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed single-precision (32-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are written to the lower half of "dst", and the upper half locations are set to '0'.
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[511:256] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTPD2PS" form="zmm {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
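+<!-- Usage sketch (illustrative): narrowing eight doubles into the lower half
+     of a float vector; the upper half is zeroed per the description above.
+
+__m512 cvt_pslo_demo(__m512d v) {
+    return _mm512_cvt_roundpd_pslo(v, _MM_FROUND_CUR_DIRECTION);
+}
+-->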
+<intrinsic tech="KNC" name="_mm512_cvtfxpnt_roundpd_epu32lo">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements, storing the results in "dst". Results are written to the lower half of "dst", and the upper half locations are set to '0'.
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k := j*32
+ dst[k+31:k] := Convert_FP64_To_UInt32(v2[i+63:i])
+ENDFOR
+dst[511:256] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTPD2UDQ" form="zmm, zmm, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_cvtfxpnt_roundpd_epu32lo">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are written to the lower half of "dst", and the upper half locations are set to '0'.
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_FP64_To_UInt32(v2[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[511:256] := 0
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTPD2UDQ" form="zmm {k}, zmm, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_cvtfxpnt_round_adjustps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+ <description>Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in "v2" to packed 32-bit integer elements and performs an optional exponent adjust using "expadj", storing the results in "dst".
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v2[i+31:i]
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 32)
+ ESAC
+ dst[i+31:i] := Float32ToInt32(dst[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTPS2DQ" form="zmm, zmm, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
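+<!-- Usage sketch (illustrative): converting floats to 16.16 fixed point by
+     scaling with 2^16 before the float-to-int conversion.
+
+__m512i to_fixed_demo(__m512 v) {
+    return _mm512_cvtfxpnt_round_adjustps_epi32(
+            v, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_16);
+}
+-->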
+<intrinsic tech="KNC" name="_mm512_cvtfxpnt_round_adjustps_epu32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+ <description>Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements and performing an optional exponent adjust using "expadj", storing the results in "dst".
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := v2[i+31:i]
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 32)
+ ESAC
+ dst[i+31:i] := Float32ToUInt32(dst[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTPS2UDQ" form="zmm, zmm, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_cvtfxpnt_round_adjustepu32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+ <description>Performs element-by-element conversion of packed 32-bit unsigned integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst".
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := UInt32ToFloat32(v2[i+31:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 32)
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTUDQ2PS" form="zmm, zmm, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
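+<!-- Usage sketch (illustrative): the unsigned-int-to-float direction with no
+     exponent adjustment.
+
+__m512 uint_to_float_demo(__m512i u) {
+    return _mm512_cvtfxpnt_round_adjustepu32_ps(
+            u, _MM_FROUND_CUR_DIRECTION, _MM_EXPADJ_NONE);
+}
+-->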
+<intrinsic tech="KNC" name="_mm512_mask_cvtfxpnt_round_adjustepu32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+ <description>Performs element-by-element conversion of packed 32-bit unsigned integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := UInt32ToFloat32(v2[i+31:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (1 &lt;&lt; 32)
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTUDQ2PS" form="zmm {k}, zmm, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_exp223_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <description>Approximates the base-2 exponential of the packed 32-bit fixed-point elements in "v2", which are interpreted with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := exp223(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP223PS" form="zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_exp223_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <description>Approximates the base-2 exponential of the packed 32-bit fixed-point elements in "v2", which are interpreted with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := exp223(v2[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VEXP223PS" form="zmm {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
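+<!-- Usage sketch (illustrative): x holds s8.24 fixed-point exponents; the
+     result approximates 2^x per lane.
+
+__m512 exp2_demo(__m512i x) {
+    return _mm512_exp223_ps(x);
+}
+-->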
+<intrinsic tech="KNC" name="_mm512_fixupnan_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512i" varname="v3" etype="UI64"/>
+ <description>Fixes up NaNs from packed double-precision (64-bit) floating-point elements in "v1" and "v2", storing the results in "dst" and storing the quietized NaNs from "v1" in "v3".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i])
+ v3[i+63:i] := QuietizeNaNs(v1[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPNANPD" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_fixupnan_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="__m512i" varname="v3" etype="UI64"/>
+ <description>Fixes up NaNs from packed double-precision (64-bit) floating-point elements in "v1" and "v2", storing the results in "dst" using writemask "k" (only elements whose corresponding mask bit is set are used in the computation). Quietized NaNs from "v1" are stored in "v3".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i])
+ v3[i+63:i] := QuietizeNaNs(v1[i+63:i])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPNANPD" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_fixupnan_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <description>Fixes up NaNs from packed single-precision (32-bit) floating-point elements in "v1" and "v2", storing the results in "dst" and storing the quietized NaNs from "v1" in "v3".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i])
+ v3[i+31:i] := QuietizeNaNs(v1[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPNANPS" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_fixupnan_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v2" etype="FP32"/>
+ <parameter type="__m512i" varname="v3" etype="UI32"/>
+ <description>Fixes up NaNs from packed single-precision (32-bit) floating-point elements in "v1" and "v2", storing the results in "dst" using writemask "k" (only elements whose corresponding mask bit is set are used in the computation). Quietized NaNs from "v1" are stored in "v3".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i])
+ v3[i+31:i] := QuietizeNaNs(v1[i+31:i])
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFIXUPNANPS" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extloadunpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="void const *" varname="mt" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ RETURN MEM[addr + 4*offset]
+ _MM_UPCONV_EPI32_UINT8:
+ RETURN ZeroExtend32(MEM[addr + offset])
+ _MM_UPCONV_EPI32_SINT8:
+ RETURN SignExtend32(MEM[addr + offset])
+ _MM_UPCONV_EPI32_UINT16:
+ RETURN ZeroExtend32(MEM[addr + 2*offset])
+ _MM_UPCONV_EPI32_SINT16:
+ RETURN SignExtend32(MEM[addr + 2*offset])
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ RETURN 4
+ _MM_UPCONV_EPI32_UINT8:
+ RETURN 1
+ _MM_UPCONV_EPI32_SINT8:
+ RETURN 1
+ _MM_UPCONV_EPI32_UINT16:
+ RETURN 2
+ _MM_UPCONV_EPI32_SINT16:
+ RETURN 2
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 15
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*upSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+ FI
+ loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHD" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extloadunpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ RETURN MEM[addr + 4*offset]
+ _MM_UPCONV_EPI32_UINT8:
+ RETURN ZeroExtend32(MEM[addr + offset])
+ _MM_UPCONV_EPI32_SINT8:
+ RETURN SignExtend32(MEM[addr + offset])
+ _MM_UPCONV_EPI32_UINT16:
+ RETURN ZeroExtend32(MEM[addr + 2*offset])
+ _MM_UPCONV_EPI32_SINT16:
+ RETURN SignExtend32(MEM[addr + 2*offset])
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ RETURN 4
+ _MM_UPCONV_EPI32_UINT8:
+ RETURN 1
+ _MM_UPCONV_EPI32_SINT8:
+ RETURN 1
+ _MM_UPCONV_EPI32_UINT16:
+ RETURN 2
+ _MM_UPCONV_EPI32_SINT16:
+ RETURN 2
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 15
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*upSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+ FI
+ loadOffset := loadOffset + 1
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHD" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extloadunpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="void const *" varname="mt" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+ <operation>
+DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ RETURN MEM[addr + 4*offset]
+ _MM_UPCONV_EPI32_UINT8:
+ RETURN ZeroExtend32(MEM[addr + offset])
+ _MM_UPCONV_EPI32_SINT8:
+ RETURN SignExtend32(MEM[addr + offset])
+ _MM_UPCONV_EPI32_UINT16:
+ RETURN ZeroExtend32(MEM[addr + 2*offset])
+ _MM_UPCONV_EPI32_SINT16:
+ RETURN SignExtend32(MEM[addr + 2*offset])
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ RETURN 4
+ _MM_UPCONV_EPI32_UINT8:
+ RETURN 1
+ _MM_UPCONV_EPI32_SINT8:
+ RETURN 1
+ _MM_UPCONV_EPI32_UINT16:
+ RETURN 2
+ _MM_UPCONV_EPI32_SINT16:
+ RETURN 2
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * upSize) % 64 == 0
+ BREAK
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLD" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extloadunpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ RETURN MEM[addr + 4*offset]
+ _MM_UPCONV_EPI32_UINT8:
+ RETURN ZeroExtend32(MEM[addr + offset])
+ _MM_UPCONV_EPI32_SINT8:
+ RETURN SignExtend32(MEM[addr + offset])
+ _MM_UPCONV_EPI32_UINT16:
+ RETURN ZeroExtend32(MEM[addr + 2*offset])
+ _MM_UPCONV_EPI32_SINT16:
+ RETURN SignExtend32(MEM[addr + 2*offset])
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE:
+ RETURN 4
+ _MM_UPCONV_EPI32_UINT8:
+ RETURN 1
+ _MM_UPCONV_EPI32_SINT8:
+ RETURN 1
+ _MM_UPCONV_EPI32_UINT16:
+ RETURN 2
+ _MM_UPCONV_EPI32_SINT16:
+ RETURN 2
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 15
+ IF k[j]
+ i := j*32
+ dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * upSize) % 64 == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLD" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
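+<!-- Usage sketch (illustrative): the canonical KNC lo/hi pairing that
+     assembles one 512-bit vector from a possibly unaligned stream at p.
+     _mm512_setzero_epi32 and _MM_HINT_NONE are assumed from the same KNC
+     header family.
+
+__m512i load_unaligned_demo(void const *p) {
+    __m512i v = _mm512_extloadunpacklo_epi32(
+            _mm512_setzero_epi32(), p,
+            _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    return _mm512_extloadunpackhi_epi32(
+            v, (char const *)p + 64,
+            _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+}
+-->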
+<intrinsic tech="KNC" name="_mm512_extloadunpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="void const *" varname="mt" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ RETURN MEM[addr + 8*offset]
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ RETURN 8
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 7
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*upSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+ FI
+ loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHQ" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extloadunpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ RETURN MEM[addr + 8*offset]
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ RETURN 8
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 7
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*upSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+ FI
+ loadOffset := loadOffset + 1
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHQ" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extloadunpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="void const *" varname="mt" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+ <operation>
+DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ RETURN MEM[addr + 8*offset]
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ RETURN 8
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+ loadOffset := loadOffset + 1
+ IF ((addr + loadOffset*upSize) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLQ" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extloadunpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ RETURN MEM[addr + 8*offset]
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE:
+ RETURN 8
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 7
+ IF k[j]
+ i := j*64
+ dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+ loadOffset := loadOffset + 1
+ IF ((addr + loadOffset*upSize) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLQ" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extloadunpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="void const *" varname="mt" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ RETURN MEM[addr + 4*offset]
+ _MM_UPCONV_PS_FLOAT16:
+ RETURN Convert_FP16_To_FP32(MEM[addr + 2*offset])
+ _MM_UPCONV_PS_UINT8:
+ RETURN Convert_UInt8_To_FP32(MEM[addr + offset])
+ _MM_UPCONV_PS_SINT8:
+ RETURN Convert_Int8_To_FP32(MEM[addr + offset])
+ _MM_UPCONV_PS_UINT16:
+ RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset])
+ _MM_UPCONV_PS_SINT16:
+ RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset])
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ RETURN 4
+ _MM_UPCONV_PS_FLOAT16:
+ RETURN 2
+ _MM_UPCONV_PS_UINT8:
+ RETURN 1
+ _MM_UPCONV_PS_SINT8:
+ RETURN 1
+ _MM_UPCONV_PS_UINT16:
+ RETURN 2
+ _MM_UPCONV_PS_SINT16:
+ RETURN 2
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 15
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*upSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+ FI
+ loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHPS" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extloadunpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ RETURN MEM[addr + 4*offset]
+ _MM_UPCONV_PS_FLOAT16:
+ RETURN Convert_FP16_To_FP32(MEM[addr + 2*offset])
+ _MM_UPCONV_PS_UINT8:
+ RETURN Convert_UInt8_To_FP32(MEM[addr + offset])
+ _MM_UPCONV_PS_SINT8:
+ RETURN Convert_Int8_To_FP32(MEM[addr + offset])
+ _MM_UPCONV_PS_UINT16:
+ RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset])
+ _MM_UPCONV_PS_SINT16:
+ RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset])
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ RETURN 4
+ _MM_UPCONV_PS_FLOAT16:
+ RETURN 2
+ _MM_UPCONV_PS_UINT8:
+ RETURN 1
+ _MM_UPCONV_PS_SINT8:
+ RETURN 1
+ _MM_UPCONV_PS_UINT16:
+ RETURN 2
+ _MM_UPCONV_PS_SINT16:
+ RETURN 2
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 15
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*upSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+ FI
+ loadOffset := loadOffset + 1
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHPS" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extloadunpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="void const *" varname="mt" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ RETURN MEM[addr + 4*offset]
+ _MM_UPCONV_PS_FLOAT16:
+ RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset])
+ _MM_UPCONV_PS_UINT8:
+ RETURN Convert_UInt8_To_FP32(MEM[addr + offset])
+ _MM_UPCONV_PS_SINT8:
+ RETURN Convert_Int8_To_FP32(MEM[addr + offset])
+ _MM_UPCONV_PS_UINT16:
+ RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset])
+ _MM_UPCONV_PS_SINT16:
+ RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset])
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ RETURN 4
+ _MM_UPCONV_PS_FLOAT16:
+ RETURN 2
+ _MM_UPCONV_PS_UINT8:
+ RETURN 1
+ _MM_UPCONV_PS_SINT8:
+ RETURN 1
+ _MM_UPCONV_PS_UINT16:
+ RETURN 2
+ _MM_UPCONV_PS_SINT16:
+ RETURN 2
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * upSize) % 64 == 0
+ BREAK
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLPS" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extloadunpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ RETURN MEM[addr + 4*offset]
+ _MM_UPCONV_PS_FLOAT16:
+ RETURN Convert_FP16_To_FP32(MEM[addr + 4*offset])
+ _MM_UPCONV_PS_UINT8:
+ RETURN Convert_UInt8_To_FP32(MEM[addr + offset])
+ _MM_UPCONV_PS_SINT8:
+ RETURN Convert_Int8_To_FP32(MEM[addr + offset])
+ _MM_UPCONV_PS_UINT16:
+ RETURN Convert_UInt16_To_FP32(MEM[addr + 2*offset])
+ _MM_UPCONV_PS_SINT16:
+ RETURN Convert_Int16_To_FP32(MEM[addr + 2*offset])
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PS_NONE:
+ RETURN 4
+ _MM_UPCONV_PS_FLOAT16:
+ RETURN 2
+ _MM_UPCONV_PS_UINT8:
+ RETURN 1
+ _MM_UPCONV_PS_SINT8:
+ RETURN 1
+ _MM_UPCONV_PS_UINT16:
+ RETURN 2
+ _MM_UPCONV_PS_SINT16:
+ RETURN 2
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 15
+ IF k[j]
+ i := j*32
+ dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * upSize) % 64 == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLPS" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
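+<!-- Usage sketch (an illustration, not part of the Intel data): the lo/hi
+     pair above is the KNC idiom for an unaligned, up-converting load. Issue
+     the "lo" form at the element address and the "hi" form 64 bytes later;
+     _MM_HINT_NONE is assumed to come from the KNC immintrin.h.
+
+     /* given void const *p pointing at an unaligned fp16 stream */
+     __m512 v = _mm512_setzero_ps();
+     v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_FLOAT16, _MM_HINT_NONE);
+     v = _mm512_extloadunpackhi_ps(v, (char const *)p + 64,
+                                   _MM_UPCONV_PS_FLOAT16, _MM_HINT_NONE);
+-->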
+<intrinsic tech="KNC" name="_mm512_extloadunpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="void const *" varname="mt" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ RETURN MEM[addr + 8*offset]
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ RETURN 8
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 7
+ IF foundNext64BytesBoundary == false
+ IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+ FI
+ loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHPD" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extloadunpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ RETURN MEM[addr + 8*offset]
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ RETURN 8
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 7
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+ FI
+ loadOffset := loadOffset + 1
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHPD" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extloadunpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="void const *" varname="mt" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+ <operation>
+DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ RETURN MEM[addr + 8*offset]
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ RETURN 8
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * upSize) % 64 == 0
+ BREAK
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLPD" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extloadunpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE UPCONVERT(addr, offset, convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ RETURN MEM[addr + 8*offset]
+ ESAC
+}
+DEFINE UPCONVERTSIZE(convertTo) {
+ CASE conv OF
+ _MM_UPCONV_PD_NONE:
+ RETURN 8
+ ESAC
+}
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 7
+ IF k[j]
+ i := j*64
+ dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * upSize) % 64 == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLPD" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
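+<!-- Usage sketch (assumed, not Intel data): _MM_UPCONV_PD_NONE is the only
+     defined PD up-conversion, so the ext pd forms reduce to a plain unaligned
+     load of 8 doubles split across a 64-byte boundary.
+
+     __m512d v = _mm512_setzero_pd();
+     v = _mm512_extloadunpacklo_pd(v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+     v = _mm512_extloadunpackhi_pd(v, (char const *)p + 64,
+                                   _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+-->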
+<intrinsic tech="KNC" name="_mm512_extpackstorehi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI32"/>
+ <parameter type="__m512i" varname="v1" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI32_NONE:
+ RETURN element[31:0]
+ _MM_DOWNCONV_EPI32_UINT8:
+ RETURN Truncate8(element[31:0])
+ _MM_DOWNCONV_EPI32_SINT8:
+ RETURN Saturate8(element[31:0])
+ _MM_DOWNCONV_EPI32_UINT16:
+ RETURN Truncate16(element[31:0])
+ _MM_DOWNCONV_EPI32_SINT16:
+ RETURN Saturate16(element[31:0])
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI32_NONE:
+ RETURN 4
+ _MM_DOWNCONV_EPI32_UINT8:
+ RETURN 1
+ _MM_DOWNCONV_EPI32_SINT8:
+ RETURN 1
+ _MM_DOWNCONV_EPI32_UINT16:
+ RETURN 2
+ _MM_DOWNCONV_EPI32_SINT16:
+ RETURN 2
+ ESAC
+}
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 15
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ tmp := DOWNCONVERT(v1[i+31:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 4: MEM[storeAddr] := tmp[31:0]
+ 2: MEM[storeAddr] := tmp[15:0]
+ 1: MEM[storeAddr] := tmp[7:0]
+ ESAC
+ FI
+ storeOffset := storeOffset + 1
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extpackstorehi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v1" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI32_NONE:
+ RETURN element[31:0]
+ _MM_DOWNCONV_EPI32_UINT8:
+ RETURN Truncate8(element[31:0])
+ _MM_DOWNCONV_EPI32_SINT8:
+ RETURN Saturate8(element[31:0])
+ _MM_DOWNCONV_EPI32_UINT16:
+ RETURN Truncate16(element[31:0])
+ _MM_DOWNCONV_EPI32_SINT16:
+ RETURN Saturate16(element[31:0])
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI32_NONE:
+ RETURN 4
+ _MM_DOWNCONV_EPI32_UINT8:
+ RETURN 1
+ _MM_DOWNCONV_EPI32_SINT8:
+ RETURN 1
+ _MM_DOWNCONV_EPI32_UINT16:
+ RETURN 2
+ _MM_DOWNCONV_EPI32_SINT16:
+ RETURN 2
+ ESAC
+}
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 15
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ tmp := DOWNCONVERT(v1[i+31:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 4: MEM[storeAddr] := tmp[31:0]
+ 2: MEM[storeAddr] := tmp[15:0]
+ 1: MEM[storeAddr] := tmp[7:0]
+ ESAC
+ FI
+ storeOffset := storeOffset + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHD" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extpackstorelo_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI32"/>
+ <parameter type="__m512i" varname="v1" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI32_NONE:
+ RETURN element[31:0]
+ _MM_DOWNCONV_EPI32_UINT8:
+ RETURN Truncate8(element[31:0])
+ _MM_DOWNCONV_EPI32_SINT8:
+ RETURN Saturate8(element[31:0])
+ _MM_DOWNCONV_EPI32_UINT16:
+ RETURN Truncate16(element[31:0])
+ _MM_DOWNCONV_EPI32_SINT16:
+ RETURN Saturate16(element[31:0])
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI32_NONE:
+ RETURN 4
+ _MM_DOWNCONV_EPI32_UINT8:
+ RETURN 1
+ _MM_DOWNCONV_EPI32_SINT8:
+ RETURN 1
+ _MM_DOWNCONV_EPI32_UINT16:
+ RETURN 2
+ _MM_DOWNCONV_EPI32_SINT16:
+ RETURN 2
+ ESAC
+}
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ tmp := DOWNCONVERT(v1[i+31:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 4: MEM[storeAddr] := tmp[31:0]
+ 2: MEM[storeAddr] := tmp[15:0]
+ 1: MEM[storeAddr] := tmp[7:0]
+ ESAC
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset * downSize) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extpackstorelo_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v1" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are written to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI32_NONE:
+ RETURN element[31:0]
+ _MM_DOWNCONV_EPI32_UINT8:
+ RETURN Truncate8(element[31:0])
+ _MM_DOWNCONV_EPI32_SINT8:
+ RETURN Saturate8(element[31:0])
+ _MM_DOWNCONV_EPI32_UINT16:
+ RETURN Truncate16(element[31:0])
+ _MM_DOWNCONV_EPI32_SINT16:
+ RETURN Saturate16(element[31:0])
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI32_NONE:
+ RETURN 4
+ _MM_DOWNCONV_EPI32_UINT8:
+ RETURN 1
+ _MM_DOWNCONV_EPI32_SINT8:
+ RETURN 1
+ _MM_DOWNCONV_EPI32_UINT16:
+ RETURN 2
+ _MM_DOWNCONV_EPI32_SINT16:
+ RETURN 2
+ ESAC
+}
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 15
+ IF k[j]
+ i := j*32
+ tmp := DOWNCONVERT(v1[i+31:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 4: MEM[storeAddr] := tmp[31:0]
+ 2: MEM[storeAddr] := tmp[15:0]
+ 1: MEM[storeAddr] := tmp[7:0]
+ ESAC
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset * downSize) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELD" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
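+<!-- Usage sketch (assumed, not Intel data): the packstore lo/hi pair mirrors
+     loadunpack. "lo" writes the part of the down-converted stream below the
+     next 64-byte boundary and "hi", called with mt+64, writes the remainder.
+
+     /* store 16 32-bit ints from v as saturated signed bytes at unaligned q */
+     _mm512_extpackstorelo_epi32(q, v, _MM_DOWNCONV_EPI32_SINT8, _MM_HINT_NONE);
+     _mm512_extpackstorehi_epi32((char *)q + 64, v,
+                                 _MM_DOWNCONV_EPI32_SINT8, _MM_HINT_NONE);
+-->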
+<intrinsic tech="KNC" name="_mm512_extpackstorehi_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI64"/>
+ <parameter type="__m512i" varname="v1" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI64_NONE:
+ RETURN element[63:0]
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI64_NONE:
+ RETURN 8
+ ESAC
+}
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 7
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ tmp := DOWNCONVERT(v1[i+63:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 8: MEM[storeAddr] := tmp[63:0]
+ ESAC
+ FI
+ storeOffset := storeOffset + 1
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHQ" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extpackstorehi_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v1" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI64_NONE:
+ RETURN element[63:0]
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI64_NONE:
+ RETURN 8
+ ESAC
+}
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 7
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ tmp := DOWNCONVERT(v1[i+63:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 8: MEM[storeAddr] := tmp[63:0]
+ ESAC
+ FI
+ storeOffset := storeOffset + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHQ" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extpackstorelo_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI64"/>
+ <parameter type="__m512i" varname="v1" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI64_NONE:
+ RETURN element[63:0]
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI64_NONE:
+ RETURN 8
+ ESAC
+}
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ tmp := DOWNCONVERT(v1[i+63:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 8: MEM[storeAddr] := tmp[63:0]
+ ESAC
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset * downSize) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELQ" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extpackstorelo_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v1" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI64_NONE:
+ RETURN element[63:0]
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_EPI64_NONE:
+ RETURN 8
+ ESAC
+}
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 7
+ IF k[j]
+ i := j*64
+ tmp := DOWNCONVERT(v1[i+63:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 8: MEM[storeAddr] := tmp[63:0]
+ ESAC
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset * downSize) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELQ" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
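+<!-- Usage sketch (assumed, not Intel data): in the masked quadword packstore
+     only elements whose mask bit is set are compressed into the stream, so a
+     0x0F mask writes just the four low quadwords of v.
+
+     _mm512_mask_extpackstorelo_epi64(q, (__mmask8)0x0F, v,
+                                      _MM_DOWNCONV_EPI64_NONE, _MM_HINT_NONE);
+     _mm512_mask_extpackstorehi_epi64((char *)q + 64, (__mmask8)0x0F, v,
+                                      _MM_DOWNCONV_EPI64_NONE, _MM_HINT_NONE);
+-->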
+<intrinsic tech="KNC" name="_mm512_extpackstorehi_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP32"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PS_NONE:
+ RETURN element[31:0]
+ _MM_DOWNCONV_PS_FLOAT16:
+ RETURN Convert_FP32_To_FP16(element[31:0])
+ _MM_DOWNCONV_PS_UINT8:
+ RETURN Truncate8(element[31:0])
+ _MM_DOWNCONV_PS_SINT8:
+ RETURN Saturate8(element[31:0])
+ _MM_DOWNCONV_PS_UINT16:
+ RETURN Truncate16(element[31:0])
+ _MM_DOWNCONV_PS_SINT16:
+ RETURN Saturate16(element[31:0])
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PS_NONE:
+ RETURN 4
+ _MM_DOWNCONV_PS_FLOAT16:
+ RETURN 2
+ _MM_DOWNCONV_PS_UINT8:
+ RETURN 1
+ _MM_DOWNCONV_PS_SINT8:
+ RETURN 1
+ _MM_DOWNCONV_PS_UINT16:
+ RETURN 2
+ _MM_DOWNCONV_PS_SINT16:
+ RETURN 2
+ ESAC
+}
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 15
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ tmp := DOWNCONVERT(v1[i+31:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 4: MEM[storeAddr] := tmp[31:0]
+ 2: MEM[storeAddr] := tmp[15:0]
+ 1: MEM[storeAddr] := tmp[7:0]
+ ESAC
+ FI
+ storeOffset := storeOffset + 1
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHPS" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extpackstorehi_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PS_NONE:
+ RETURN element[31:0]
+ _MM_DOWNCONV_PS_FLOAT16:
+ RETURN Convert_FP32_To_FP16(element[31:0])
+ _MM_DOWNCONV_PS_UINT8:
+ RETURN Truncate8(element[31:0])
+ _MM_DOWNCONV_PS_SINT8:
+ RETURN Saturate8(element[31:0])
+ _MM_DOWNCONV_PS_UINT16:
+ RETURN Truncate16(element[31:0])
+ _MM_DOWNCONV_PS_SINT16:
+ RETURN Saturate16(element[31:0])
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PS_NONE:
+ RETURN 4
+ _MM_DOWNCONV_PS_FLOAT16:
+ RETURN 2
+ _MM_DOWNCONV_PS_UINT8:
+ RETURN 1
+ _MM_DOWNCONV_PS_SINT8:
+ RETURN 1
+ _MM_DOWNCONV_PS_UINT16:
+ RETURN 2
+ _MM_DOWNCONV_PS_SINT16:
+ RETURN 2
+ ESAC
+}
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 15
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ tmp := DOWNCONVERT(v1[i+31:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 4: MEM[storeAddr] := tmp[31:0]
+ 2: MEM[storeAddr] := tmp[15:0]
+ 1: MEM[storeAddr] := tmp[7:0]
+ ESAC
+ FI
+ storeOffset := storeOffset + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHPS" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extpackstorelo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP32"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PS_NONE:
+ RETURN element[31:0]
+ _MM_DOWNCONV_PS_FLOAT16:
+ RETURN Convert_FP32_To_FP16(element[31:0])
+ _MM_DOWNCONV_PS_UINT8:
+ RETURN Truncate8(element[31:0])
+ _MM_DOWNCONV_PS_SINT8:
+ RETURN Saturate8(element[31:0])
+ _MM_DOWNCONV_PS_UINT16:
+ RETURN Truncate16(element[31:0])
+ _MM_DOWNCONV_PS_SINT16:
+ RETURN Saturate16(element[31:0])
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PS_NONE:
+ RETURN 4
+ _MM_DOWNCONV_PS_FLOAT16:
+ RETURN 2
+ _MM_DOWNCONV_PS_UINT8:
+ RETURN 1
+ _MM_DOWNCONV_PS_SINT8:
+ RETURN 1
+ _MM_DOWNCONV_PS_UINT16:
+ RETURN 2
+ _MM_DOWNCONV_PS_SINT16:
+ RETURN 2
+ ESAC
+}
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ tmp := DOWNCONVERT(v1[i+31:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 4: MEM[storeAddr] := tmp[31:0]
+ 2: MEM[storeAddr] := tmp[15:0]
+ 1: MEM[storeAddr] := tmp[7:0]
+ ESAC
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset * downSize) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELPS" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extpackstorelo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PS_NONE:
+ RETURN element[31:0]
+ _MM_DOWNCONV_PS_FLOAT16:
+ RETURN Convert_FP32_To_FP16(element[31:0])
+ _MM_DOWNCONV_PS_UINT8:
+ RETURN Truncate8(element[31:0])
+ _MM_DOWNCONV_PS_SINT8:
+ RETURN Saturate8(element[31:0])
+ _MM_DOWNCONV_PS_UINT16:
+ RETURN Truncate16(element[31:0])
+ _MM_DOWNCONV_PS_SINT16:
+ RETURN Saturate16(element[31:0])
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PS_NONE:
+ RETURN 4
+ _MM_DOWNCONV_PS_FLOAT16:
+ RETURN 2
+ _MM_DOWNCONV_PS_UINT8:
+ RETURN 1
+ _MM_DOWNCONV_PS_SINT8:
+ RETURN 1
+ _MM_DOWNCONV_PS_UINT16:
+ RETURN 2
+ _MM_DOWNCONV_PS_SINT16:
+ RETURN 2
+ ESAC
+}
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 15
+ IF k[j]
+ i := j*32
+ tmp := DOWNCONVERT(v1[i+31:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 4: MEM[storeAddr] := tmp[31:0]
+ 2: MEM[storeAddr] := tmp[15:0]
+ 1: MEM[storeAddr] := tmp[7:0]
+ ESAC
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset * downSize) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELPS" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
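+<!-- Usage sketch (assumed, not Intel data): down-converting 16 floats to fp16
+     emits a 32-byte stream; the lo/hi pair still covers the case where that
+     stream straddles a 64-byte line.
+
+     _mm512_extpackstorelo_ps(q, v, _MM_DOWNCONV_PS_FLOAT16, _MM_HINT_NONE);
+     _mm512_extpackstorehi_ps((char *)q + 64, v,
+                              _MM_DOWNCONV_PS_FLOAT16, _MM_HINT_NONE);
+-->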
+<intrinsic tech="KNC" name="_mm512_extpackstorehi_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP64"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PD_NONE:
+ RETURN element[63:0]
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PD_NONE:
+ RETURN 8
+ ESAC
+}
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 7
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ tmp := DOWNCONVERT(v1[i+63:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 8: MEM[storeAddr] := tmp[63:0]
+ ESAC
+ FI
+ storeOffset := storeOffset + 1
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHPD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extpackstorehi_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PD_NONE:
+ RETURN element[63:0]
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PD_NONE:
+ RETURN 8
+ ESAC
+}
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt-64
+FOR j := 0 to 7
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ tmp := DOWNCONVERT(v1[i+63:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 8: MEM[storeAddr] := tmp[63:0]
+ ESAC
+ FI
+ storeOffset := storeOffset + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHPD" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_extpackstorelo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP64"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PD_NONE:
+ RETURN element[63:0]
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PD_NONE:
+ RETURN 8
+ ESAC
+}
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ tmp := DOWNCONVERT(v1[i+63:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 8: MEM[storeAddr] := tmp[63:0]
+ ESAC
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset * downSize) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELPD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_extpackstorelo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mt" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE"/>
+ <description>Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE DOWNCONVERT(element, convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PD_NONE:
+ RETURN element[63:0]
+ ESAC
+}
+DEFINE DOWNCONVERTSIZE(convertTo) {
+ CASE convertTo OF
+ _MM_DOWNCONV_PD_NONE:
+ RETURN 8
+ ESAC
+}
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr := mt
+FOR j := 0 to 7
+ IF k[j]
+ i := j*64
+ tmp := DOWNCONVERT(v1[i+63:i], conv)
+ storeAddr := addr + storeOffset * downSize
+ CASE downSize OF
+ 8: MEM[storeAddr] := tmp[63:0]
+ ESAC
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset * downSize) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELPD" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
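+<!-- Usage sketch (assumed, not Intel data): with no PD down-conversion
+     defined, the ext pd stores act as plain unaligned packed stores. The
+     streaming hint (assumed to be _MM_HINT_NT in the KNC headers) marks the
+     data as non-temporal.
+
+     _mm512_extpackstorelo_pd(q, v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NT);
+     _mm512_extpackstorehi_pd((char *)q + 64, v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NT);
+-->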
+<intrinsic tech="KNC" name="_mm512_i32loscatter_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Stores 8 packed 64-bit integer elements from "a" in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale)
+ MEM[addr+63:addr] := a[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="m512, zmm" xed="VPSCATTERDQ_MEMu64_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_i32loscatter_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI32"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Stores 8 packed 64-bit integer elements from "a" in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements whose corresponding mask bit is not set are not written to memory).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*32
+ IF k[j]
+ addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale)
+ MEM[addr+63:addr] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPSCATTERDQ" form="m512 {k}, zmm" xed="VPSCATTERDQ_MEMu64_MASKmskw_ZMMu64_AVX512_VL512"/>
+ <header>immintrin.h</header>
+</intrinsic>
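+<!-- Usage sketch (assumed, not Intel data): the "lo" scatter consumes only
+     the low 8 of the 16 32-bit indices in vindex, and scale must be 1, 2, 4
+     or 8.
+
+     /* writes vals[j] to (char *)base + idx[j]*8 for j = 0..7 */
+     _mm512_i32loscatter_epi64(base, idx, vals, 8);
+-->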
+<intrinsic tech="KNC" name="_mm512_loadunpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="void const*" varname="mt" etype="UI32" memwidth="512"/>
+ <description>Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src".</description>
+ <operation>dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr := mt-64
+FOR j := 0 to 15
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*4) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ tmp := MEM[addr + loadOffset*4]
+ dst[i+31:i] := tmp[31:0]
+ FI
+ loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHD" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_loadunpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const *" varname="mt" etype="UI32" memwidth="512"/>
+ <description>Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr := mt-64
+FOR j := 0 to 15
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*4) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+ tmp := MEM[addr + loadOffset*4]
+ dst[i+31:i] := tmp[31:0]
+ FI
+ loadOffset := loadOffset + 1
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHD" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_loadunpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="void const*" varname="mt" etype="UI32" memwidth="512"/>
+ <description>Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src".</description>
+ <operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ tmp := MEM[addr + loadOffset*4]
+ dst[i+31:i] := tmp[31:0]
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * 4) % 64 == 0
+ BREAK
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLD" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_loadunpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mt" etype="UI32" memwidth="512"/>
+ <description>Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ tmp := MEM[addr + loadOffset*4]
+ dst[i+31:i] := tmp[31:0]
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * 4) % 64 == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLD" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
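+<!-- Usage sketch (assumed, not Intel data): the unconverted forms are the
+     classic KNC unaligned-load idiom; in the masked variants, lanes whose bit
+     in "k" is clear keep the corresponding element of "src".
+
+     __m512i v = _mm512_setzero_epi32();
+     v = _mm512_loadunpacklo_epi32(v, p);
+     v = _mm512_loadunpackhi_epi32(v, (int const *)p + 16);  /* p + 64 bytes */
+-->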
+<intrinsic tech="KNC" name="_mm512_loadunpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="void const*" varname="mt" etype="UI64" memwidth="512"/>
+ <description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src".</description>
+ <operation>dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr := mt-64
+FOR j := 0 to 7
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*8) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ tmp := MEM[addr + loadOffset*8]
+ dst[i+63:i] := tmp[63:0]
+ FI
+ loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHQ" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_loadunpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mt" etype="UI64" memwidth="512"/>
+ <description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr := mt-64
+FOR j := 0 to 7
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF ((addr + (loadOffset + 1)*8) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+ tmp := MEM[addr + loadOffset*8]
+ dst[i+63:i] := tmp[63:0]
+ FI
+ loadOffset := loadOffset + 1
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHQ" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_loadunpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="void const*" varname="mt" etype="UI64" memwidth="512"/>
+ <description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src".</description>
+ <operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ tmp := MEM[addr + loadOffset*8]
+ dst[i+63:i] := tmp[63:0]
+ loadOffset := loadOffset + 1
+ IF ((addr + loadOffset*8) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLQ" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_loadunpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mt" etype="UI64" memwidth="512"/>
+ <description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ tmp := MEM[addr + loadOffset*8]
+ dst[i+63:i] := tmp[63:0]
+ loadOffset := loadOffset + 1
+ IF ((addr + loadOffset*8) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLQ" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
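+<!-- Usage sketch (assumed, not Intel data): the quadword pair works the same
+     way with 8 lanes and an 8-bit mask.
+
+     __m512i v = _mm512_loadunpacklo_epi64(src, p);
+     v = _mm512_loadunpackhi_epi64(v, (char const *)p + 64);
+-->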
+<intrinsic tech="KNC" name="_mm512_loadunpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="void const*" varname="mt" etype="FP32" memwidth="512"/>
+ <description>Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src".</description>
+ <operation>dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr := mt-64
+FOR j := 0 to 15
+ IF foundNext64BytesBoundary == false
+ IF (addr + (loadOffset + 1)*4 % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+		dst[i+31:i] := MEM[addr + loadOffset*4]
+ FI
+ loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHPS" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_loadunpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mt" etype="FP32" memwidth="512"/>
+	<description>Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr := mt-64
+FOR j := 0 to 15
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+			IF ((addr + (loadOffset + 1)*4) % 64) == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*32
+			dst[i+31:i] := MEM[addr + loadOffset*4]
+ FI
+ loadOffset := loadOffset + 1
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHPS" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_loadunpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="void const*" varname="mt" etype="FP32" memwidth="512"/>
+	<description>Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src".</description>
+ <operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+	dst[i+31:i] := MEM[addr + loadOffset*4]
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * 4) % 64 == 0
+ BREAK
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLPS" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_loadunpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mt" etype="FP32" memwidth="512"/>
+	<description>Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+		dst[i+31:i] := MEM[addr + loadOffset*4]
+ loadOffset := loadOffset + 1
+ IF (mt + loadOffset * 4) % 64 == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLPS" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_loadunpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="void const*" varname="mt" etype="FP64" memwidth="512"/>
+ <description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src".</description>
+ <operation>dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr := mt-64
+FOR j := 0 to 7
+ IF foundNext64BytesBoundary == false
+ IF (addr + (loadOffset + 1)*8) % 64 == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+		dst[i+63:i] := MEM[addr + loadOffset*8]
+ FI
+ loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHPD" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_loadunpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mt" etype="FP64" memwidth="512"/>
+ <description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr := mt-64
+FOR j := 0 to 7
+ IF k[j]
+ IF foundNext64BytesBoundary == false
+ IF (addr + (loadOffset + 1)*8) % 64 == 0
+ foundNext64BytesBoundary := true
+ FI
+ ELSE
+ i := j*64
+			dst[i+63:i] := MEM[addr + loadOffset*8]
+ FI
+ loadOffset := loadOffset + 1
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKHPD" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_loadunpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="void const*" varname="mt" etype="FP64" memwidth="512"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src".</description>
+ <operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+	dst[i+63:i] := MEM[addr + loadOffset*8]
+ loadOffset := loadOffset + 1
+ IF ((addr + 8*loadOffset) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLPD" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_loadunpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="void const*" varname="mt" etype="FP64" memwidth="512"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+		dst[i+63:i] := MEM[addr + loadOffset*8]
+ loadOffset := loadOffset + 1
+ IF ((addr + 8*loadOffset) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOADUNPACKLPD" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
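+<!-- Illustrative note (not part of the Intel data): loadunpacklo and loadunpackhi are intended
+     to be used as a pair to load one full 512-bit vector from an arbitrarily aligned address.
+     A minimal sketch, assuming the Intel C/C++ compiler targeting KNC with immintrin.h;
+     "p" is a hypothetical float pointer and _mm512_setzero_ps is assumed available:
+
+     __m512 v = _mm512_setzero_ps();
+     v = _mm512_loadunpacklo_ps(v, p);                    /* elements up to the next 64B boundary */
+     v = _mm512_loadunpackhi_ps(v, (char const*)p + 64);  /* remaining elements past that boundary */
+-->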
+<intrinsic tech="KNC" name="_mm512_packstorehi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="UI32" memwidth="512"/>
+ <parameter type="__m512i" varname="v1" etype="UI32"/>
+	<description>Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).</description>
+ <operation>
+storeOffset := 0
+foundNext64BytesBoundary := 0
+addr := mt-64
+FOR j := 0 to 15
+ IF foundNext64BytesBoundary == 0
+ IF ((addr + (storeOffset + 1)*4) % 64) == 0
+ foundNext64BytesBoundary := 1
+ FI
+ ELSE
+ i := j*32
+ MEM[addr + storeOffset*4] := v1[i+31:i]
+ FI
+ storeOffset := storeOffset + 1
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_packstorehi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="UI32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v1" etype="UI32"/>
+	<description>Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+storeOffset := 0
+foundNext64BytesBoundary := 0
+addr := mt-64
+FOR j := 0 to 15
+ IF k[j]
+ IF foundNext64BytesBoundary == 0
+ IF ((addr + (storeOffset + 1)*4) % 64) == 0
+ foundNext64BytesBoundary := 1
+ FI
+ ELSE
+ i := j*32
+ MEM[addr + storeOffset*4] := v1[i+31:i]
+ FI
+ storeOffset := storeOffset + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHD" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_packstorelo_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="UI32" memwidth="512"/>
+ <parameter type="__m512i" varname="v1" etype="UI32"/>
+	<description>Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt").</description>
+ <operation>
+storeOffset := 0
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ MEM[addr + storeOffset*4] := v1[i+31:i]
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset*4) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_packstorelo_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="UI32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v1" etype="UI32"/>
+	<description>Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+storeOffset := 0
+addr := mt
+FOR j := 0 to 15
+ IF k[j]
+ i := j*32
+ MEM[addr + storeOffset*4] := v1[i+31:i]
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset*4) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELD" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_packstorehi_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="UI64" memwidth="512"/>
+ <parameter type="__m512i" varname="v1" etype="UI64"/>
+	<description>Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).</description>
+ <operation>
+storeOffset := 0
+foundNext64BytesBoundary := 0
+addr := mt-64
+FOR j := 0 to 7
+ IF foundNext64BytesBoundary == 0
+ IF ((addr + (storeOffset + 1)*8) % 64) == 0
+ foundNext64BytesBoundary := 1
+ FI
+ ELSE
+ i := j*64
+ MEM[addr + storeOffset*8] := v1[i+63:i]
+ FI
+ storeOffset := storeOffset + 1
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHQ" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_packstorehi_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="UI64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v1" etype="UI64"/>
+	<description>Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+storeOffset := 0
+foundNext64BytesBoundary := 0
+addr := mt-64
+FOR j := 0 to 7
+ IF k[j]
+ IF foundNext64BytesBoundary == 0
+ IF ((addr + (storeOffset + 1)*8) % 64) == 0
+ foundNext64BytesBoundary := 1
+ FI
+ ELSE
+ i := j*64
+ MEM[addr + storeOffset*8] := v1[i+63:i]
+ FI
+ storeOffset := storeOffset + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHQ" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_packstorelo_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="UI64" memwidth="512"/>
+ <parameter type="__m512i" varname="v1" etype="UI64"/>
+	<description>Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt").</description>
+ <operation>
+storeOffset := 0
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ MEM[addr + storeOffset*8] := v1[i+63:i]
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset*8) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELQ" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_packstorelo_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="UI64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="v1" etype="UI64"/>
+	<description>Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+storeOffset := 0
+addr := mt
+FOR j := 0 to 7
+ IF k[j]
+ i := j*64
+ MEM[addr + storeOffset*8] := v1[i+63:i]
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset*8) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELQ" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_packstorehi_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).</description>
+ <operation>
+storeOffset := 0
+foundNext64BytesBoundary := 0
+addr := mt-64
+FOR j := 0 to 15
+ IF foundNext64BytesBoundary == 0
+ IF ((addr + (storeOffset + 1)*4) % 64) == 0
+ foundNext64BytesBoundary := 1
+ FI
+ ELSE
+ i := j*32
+ MEM[addr + storeOffset*4] := v1[i+31:i]
+ FI
+ storeOffset := storeOffset + 1
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHPS" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_packstorehi_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="FP32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+storeOffset := 0
+foundNext64BytesBoundary := 0
+addr := mt-64
+FOR j := 0 to 15
+ IF k[j]
+ IF foundNext64BytesBoundary == 0
+ IF ((addr + (storeOffset + 1)*4) % 64) == 0
+ foundNext64BytesBoundary := 1
+ FI
+ ELSE
+ i := j*32
+ MEM[addr + storeOffset*4] := v1[i+31:i]
+ FI
+ storeOffset := storeOffset + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHPS" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_packstorelo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="FP32" memwidth="512"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt").</description>
+ <operation>
+storeOffset := 0
+addr := mt
+FOR j := 0 to 15
+ i := j*32
+ MEM[addr + storeOffset*4] := v1[i+31:i]
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset*4) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELPS" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_packstorelo_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="FP32" memwidth="512"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="v1" etype="FP32"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+storeOffset := 0
+addr := mt
+FOR j := 0 to 15
+ IF k[j]
+ i := j*32
+ MEM[addr + storeOffset*4] := v1[i+31:i]
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset*4) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELPS" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_packstorehi_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).</description>
+ <operation>
+storeOffset := 0
+foundNext64BytesBoundary := 0
+addr := mt-64
+FOR j := 0 to 7
+ IF foundNext64BytesBoundary == 0
+ IF ((addr + (storeOffset + 1)*8) % 64) == 0
+ foundNext64BytesBoundary := 1
+ FI
+ ELSE
+ i := j*64
+		MEM[addr + storeOffset*8] := v1[i+63:i]
+ FI
+ storeOffset := storeOffset + 1
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHPD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_packstorehi_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="FP64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+storeOffset := 0
+foundNext64BytesBoundary := 0
+addr := mt-64
+FOR j := 0 to 7
+ IF k[j]
+ IF foundNext64BytesBoundary == 0
+ IF ((addr + (storeOffset + 1)*8) % 64) == 0
+ foundNext64BytesBoundary := 1
+ FI
+ ELSE
+ i := j*64
+			MEM[addr + storeOffset*8] := v1[i+63:i]
+ FI
+ storeOffset := storeOffset + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTOREHPD" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_packstorelo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="FP64" memwidth="512"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt").</description>
+ <operation>
+storeOffset := 0
+addr := mt
+FOR j := 0 to 7
+ i := j*64
+ MEM[addr + storeOffset*8] := v1[i+63:i]
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset*8) % 64) == 0
+ BREAK
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELPD" form="m512, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_packstorelo_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mt" etype="FP64" memwidth="512"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v1" etype="FP64"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+ <operation>
+storeOffset := 0
+addr := mt
+FOR j := 0 to 7
+ IF k[j]
+ i := j*64
+ MEM[addr + storeOffset*8] := v1[i+63:i]
+ storeOffset := storeOffset + 1
+ IF ((addr + storeOffset*8) % 64) == 0
+ BREAK
+ FI
+ FI
+ENDFOR
+ </operation>
+ <instruction name="VPACKSTORELPD" form="m512 {k}, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
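+<!-- Illustrative note (not part of the Intel data): packstorelo and packstorehi form the matching
+     unaligned-store pair. A minimal sketch under the same assumptions as the loadunpack note;
+     "p" is a hypothetical float pointer and "v" a hypothetical __m512 value:
+
+     _mm512_packstorelo_ps(p, v);               /* elements below the next 64B boundary */
+     _mm512_packstorehi_ps((char*)p + 64, v);   /* elements at or above that boundary */
+-->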
+<intrinsic tech="KNC" name="_mm_countbits_32">
+ <CPUID>KNCNI</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="r1" etype="UI32"/>
+ <description>Counts the number of set bits in 32-bit unsigned integer "r1", returning the results in "dst".</description>
+ <operation>dst[31:0] := PopCount(r1[31:0])
+ </operation>
+ <instruction name="POPCNT" form="r32, r32" xed="POPCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm_countbits_64">
+ <CPUID>KNCNI</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="r1" etype="UI64"/>
+ <description>Counts the number of set bits in 64-bit unsigned integer "r1", returning the results in "dst".</description>
+ <operation>dst[63:0] := PopCount(r1[63:0])
+ </operation>
+ <instruction name="POPCNT" form="r64, r64" xed="POPCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
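+<!-- Illustrative note (not part of the Intel data): a minimal population-count sketch using the
+     two intrinsics above; both compile to POPCNT:
+
+     unsigned int n32 = _mm_countbits_32(0xF0F0F0F0u);            /* 16 */
+     unsigned __int64 n64 = _mm_countbits_64(0xFFFF00000000ULL);  /* 16 */
+-->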
+<intrinsic tech="KNC" name="_mm512_kmovlhb">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="dst" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Inserts the low byte of mask "k2" into the high byte of "dst", and copies the low byte of "k1" to the low byte of "dst".</description>
+ <operation>
+dst[7:0] := k1[7:0]
+dst[15:8] := k2[7:0]
+ </operation>
+ <instruction name="KMERGE2L1L" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_cvtfxpnt_roundpd_epi32lo">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector "v2" to 32-bit integer elements, storing them in the lower half of "dst". The elements in the upper half of "dst" are set to 0.
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ k := j*32
+ dst[k+31:k] := Convert_FP64_To_Int32(v2[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTPD2DQ" form="zmm, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_cvtfxpnt_roundpd_epi32lo">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512i" varname="dst" etype="SI32"/>
+ <parameter type="__m512i" varname="src" etype="SI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="v2" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector "v2" to 32-bit integer elements, storing them in the lower half of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements in the upper half of "dst" are set to 0.
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ l := j*32
+ IF k[j]
+ dst[l+31:l] := Convert_FP64_To_Int32(v2[i+63:i])
+ ELSE
+ dst[l+31:l] := src[l+31:l]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTPD2DQ" form="zmm {k}, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
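+<!-- Illustrative note (not part of the Intel data): a minimal sketch of the conversion above,
+     assuming _mm512_set1_pd and the standard _MM_FROUND_TO_NEAREST_INT rounding constant from
+     immintrin.h are available:
+
+     __m512d d = _mm512_set1_pd(2.5);
+     __m512i lo = _mm512_cvtfxpnt_roundpd_epi32lo(d, _MM_FROUND_TO_NEAREST_INT);
+     /* the lower 8 lanes hold the rounded 32-bit integers; the upper 8 lanes are zero */
+-->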
+<intrinsic tech="KNC" name="_mm512_cvtfxpnt_round_adjustepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="v2" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+	<description>Performs element-by-element conversion of packed 32-bit integer elements in "v2" to packed single-precision (32-bit) floating-point elements, performing an optional exponent adjustment using "expadj", and storing the results in "dst".
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := Int32ToFloat32(v2[i+31:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * (2 &lt;&lt; 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * (2 &lt;&lt; 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * (2 &lt;&lt; 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * (2 &lt;&lt; 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * (2 &lt;&lt; 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * (2 &lt;&lt; 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * (2 &lt;&lt; 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * (2 &lt;&lt; 32)
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VCVTFXPNTDQ2PS" form="zmm, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_log2ae23_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a" with absolute error of 2^(-23) and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOG2PS" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_log2ae23_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a" with absolute error of 2^(-23) and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0)
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VLOG2PS" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_fmadd_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <description>Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD231D" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_fmadd_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <description>Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD231D" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask3_fmadd_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="__m512i" varname="c" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <description>Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ ELSE
+ dst[i+31:i] := c[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD231D" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
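+<!-- Illustrative note (not part of the Intel data): the writemask variants above differ only in
+     which source supplies the untouched lanes. A minimal sketch, with "a", "b", "c" hypothetical
+     __m512i values:
+
+     __mmask16 k = 0x00FF;                               /* update the low 8 lanes only */
+     __m512i r1 = _mm512_mask_fmadd_epi32(a, k, b, c);   /* masked-off lanes keep "a" */
+     __m512i r2 = _mm512_mask3_fmadd_epi32(a, b, c, k);  /* masked-off lanes keep "c" */
+-->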
+<intrinsic tech="KNC" name="_mm512_fmadd233_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+	<description>Multiply packed 32-bit integer elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ base := (j &amp; ~0x3) * 32
+ scale[31:0] := b[base+63:base+32]
+ bias[31:0] := b[base+31:base]
+ dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD233D" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_fmadd233_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+	<description>Multiply packed 32-bit integer elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ base := (j &amp; ~0x3) * 32
+ scale[31:0] := b[base+63:base+32]
+ bias[31:0] := b[base+31:base]
+ dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMADD233D" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_fmadd233_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ base := (j &amp; ~0x3) * 32
+ scale[31:0] := b[base+63:base+32]
+ bias[31:0] := b[base+31:base]
+ dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD233PS" form="zmm, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_fmadd233_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ base := (j &amp; ~0x3) * 32
+ scale[31:0] := b[base+63:base+32]
+ bias[31:0] := b[base+31:base]
+ dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD233PS" form="zmm {k}, zmm, zmm"/>
+ <header>immintrin.h</header>
+</intrinsic>
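+<!-- Illustrative note (not part of the Intel data): in the fmadd233 forms, each 4-element group
+     of "b" carries a bias in element 0 and a scale in element 1, so every element of the matching
+     group of "a" is transformed as a*scale + bias. A minimal sketch, with "a" and "b" hypothetical
+     __m512i values:
+
+     __m512i r = _mm512_fmadd233_epi32(a, b);
+     /* lane j of r equals a[j]*b[base+1] + b[base], where base = (j/4)*4 */
+-->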
+<intrinsic tech="KNC" name="_mm512_maxabs_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i]))
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMAXABSPS" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_maxabs_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMAXABSPS" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_gmax_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Determines the maximum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMAXPS" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_gmax_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMAXPS" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_gmaxabs_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i]))
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMAXABSPS" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_gmaxabs_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FpMax(ABS(a[i+31:i]), ABS(b[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMAXABSPS" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_gmax_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Determines the maximum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMAXPD" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_gmax_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMAXPD" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_gmin_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Determines the minimum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMINPS" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_gmin_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+	<description>Determines the minimum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMINPS" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_gmin_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+ <description>Determines the minimum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMINPD" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_gmin_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="__m512d" varname="b" etype="FP64"/>
+	<description>Determines the minimum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i])
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VGMINPD" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
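+<!-- Illustrative note (not part of the Intel data): a minimal sketch of the gmax/gmin family,
+     clamping each lane of a vector to a range; "x", "lo", "hi" are hypothetical __m512 values:
+
+     __m512 clamped = _mm512_gmin_ps(_mm512_gmax_ps(x, lo), hi);  /* per-lane clamp to [lo, hi] */
+-->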
+<intrinsic tech="KNC" name="_mm512_mulhi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Performs element-by-element multiplication between packed 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) &gt;&gt; 32
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHD" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_mulhi_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Performs element-by-element multiplication between packed 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) &gt;&gt; 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHD" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mulhi_epu32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Performs element-by-element multiplication between packed unsigned 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) &gt;&gt; 32
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHUD" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_mulhi_epu32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Performs element-by-element multiplication between packed unsigned 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (a[i+31:i] * b[i+31:i]) &gt;&gt; 32
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPMULHUD" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
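+<!-- Illustrative note (not part of the Intel data): mulhi returns the upper half of each 32x32
+     product, so pairing it with a low multiply yields the full 64-bit products. A minimal sketch,
+     assuming _mm512_mullo_epi32 is available alongside these intrinsics; "a", "b" are hypothetical
+     __m512i values:
+
+     __m512i hi = _mm512_mulhi_epu32(a, b);   /* bits 63:32 of each unsigned product */
+     __m512i lo = _mm512_mullo_epi32(a, b);   /* bits 31:0 of each product */
+-->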
+<intrinsic tech="KNC" name="_mm512_permute4f128_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Permutes 128-bit blocks of the packed 32-bit integer vector "a" using constant "imm8". The results are stored in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE control[1:0] OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+FOR j := 0 to 3
+ i := j*128
+ n := j*2
+ dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMF32X4" form="zmm, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_permute4f128_epi32">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Permutes 128-bit blocks of the packed 32-bit integer vector "a" using constant "imm8". The results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE control[1:0] OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp[511:0] := 0
+FOR j := 0 to 3
+ i := j*128
+ n := j*2
+ tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
+ENDFOR
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMF32X4" form="zmm {k}, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
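+<!-- A sketch of the imm8 encoding (hedged): _MM_PERM_ENUM packs four 2-bit source
+     block selectors, one per destination 128-bit block; e.g. _MM_PERM_ABCD maps
+     source block 3 to destination block 0, 2 to 1, 1 to 2, and 0 to 3, reversing
+     the four blocks.
+
+       // v is a hypothetical __m512i value.
+       __m512i rev = _mm512_permute4f128_epi32(v, _MM_PERM_ABCD);
+-->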
+<intrinsic tech="KNC" name="_mm512_rcp23_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of precision, storing the results in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRCP23PS" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_rcp23_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of precision, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRCP23PS" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
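+<!-- Illustrative usage, a sketch with hypothetical values:
+
+       __m512 x = _mm512_set1_ps(8.0f);
+       __m512 r = _mm512_rcp23_ps(x);   // every lane is ~0.125f, accurate to 23 bits
+-->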
+<intrinsic tech="KNC" name="_mm512_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" to integer values in the direction of "rounding", apply the exponent adjustment "expadj", and store the results as packed single-precision floating-point elements in "dst".
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ROUND(a[i+31:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * POW(2.0, 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * POW(2.0, 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * POW(2.0, 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * POW(2.0, 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * POW(2.0, 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * POW(2.0, 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * POW(2.0, 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * POW(2.0, 32)
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VROUNDPS" form="zmm, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Convert</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" to integer values in the direction of "rounding", apply the exponent adjustment "expadj", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ROUND(a[i+31:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * POW(2.0, 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * POW(2.0, 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * POW(2.0, 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * POW(2.0, 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * POW(2.0, 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * POW(2.0, 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * POW(2.0, 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * POW(2.0, 32)
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VROUNDPS" form="zmm {k}, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_roundfxpnt_adjust_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+ <description>Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in "a" in the direction of "rounding", applies the exponent adjustment "expadj", and stores the results in "dst".
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := ROUND(a[i+31:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * POW(2.0, 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * POW(2.0, 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * POW(2.0, 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * POW(2.0, 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * POW(2.0, 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * POW(2.0, 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * POW(2.0, 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * POW(2.0, 32)
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDFXPNTPS" form="zmm, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_roundfxpnt_adjust_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI32"/>
+ <description>Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in "a" in the direction of "rounding", applies the exponent adjustment "expadj", and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := ROUND(a[i+31:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+31:i] := dst[i+31:i] * POW(2.0, 0)
+ _MM_EXPADJ_4: dst[i+31:i] := dst[i+31:i] * POW(2.0, 4)
+ _MM_EXPADJ_5: dst[i+31:i] := dst[i+31:i] * POW(2.0, 5)
+ _MM_EXPADJ_8: dst[i+31:i] := dst[i+31:i] * POW(2.0, 8)
+ _MM_EXPADJ_16: dst[i+31:i] := dst[i+31:i] * POW(2.0, 16)
+ _MM_EXPADJ_24: dst[i+31:i] := dst[i+31:i] * POW(2.0, 24)
+ _MM_EXPADJ_31: dst[i+31:i] := dst[i+31:i] * POW(2.0, 31)
+ _MM_EXPADJ_32: dst[i+31:i] := dst[i+31:i] * POW(2.0, 32)
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDFXPNTPS" form="zmm {k}, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_roundfxpnt_adjust_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI64"/>
+ <description>Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in "a" in the direction of "rounding", applies the exponent adjustment "expadj", and stores the results in "dst".
+ [round_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ dst[i+63:i] := ROUND(a[i+63:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+63:i] := dst[i+63:i] * POW(2.0, 0)
+ _MM_EXPADJ_4: dst[i+63:i] := dst[i+63:i] * POW(2.0, 4)
+ _MM_EXPADJ_5: dst[i+63:i] := dst[i+63:i] * POW(2.0, 5)
+ _MM_EXPADJ_8: dst[i+63:i] := dst[i+63:i] * POW(2.0, 8)
+ _MM_EXPADJ_16: dst[i+63:i] := dst[i+63:i] * POW(2.0, 16)
+ _MM_EXPADJ_24: dst[i+63:i] := dst[i+63:i] * POW(2.0, 24)
+ _MM_EXPADJ_31: dst[i+63:i] := dst[i+63:i] * POW(2.0, 31)
+ _MM_EXPADJ_32: dst[i+63:i] := dst[i+63:i] * POW(2.0, 32)
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDFXPNTPD" form="zmm, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_roundfxpnt_adjust_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <parameter type="_MM_EXP_ADJ_ENUM" varname="expadj" etype="UI64"/>
+ <description>Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in "a" in the direction of "rounding", applies the exponent adjustment "expadj", and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+ [round_note]</description>
+ <operation>FOR j := 0 to 7
+ i := j*64
+ IF k[j]
+ dst[i+63:i] := ROUND(a[i+63:i])
+ CASE expadj OF
+ _MM_EXPADJ_NONE: dst[i+63:i] := dst[i+63:i] * POW(2.0, 0)
+ _MM_EXPADJ_4: dst[i+63:i] := dst[i+63:i] * POW(2.0, 4)
+ _MM_EXPADJ_5: dst[i+63:i] := dst[i+63:i] * POW(2.0, 5)
+ _MM_EXPADJ_8: dst[i+63:i] := dst[i+63:i] * POW(2.0, 8)
+ _MM_EXPADJ_16: dst[i+63:i] := dst[i+63:i] * POW(2.0, 16)
+ _MM_EXPADJ_24: dst[i+63:i] := dst[i+63:i] * POW(2.0, 24)
+ _MM_EXPADJ_31: dst[i+63:i] := dst[i+63:i] * POW(2.0, 31)
+ _MM_EXPADJ_32: dst[i+63:i] := dst[i+63:i] * POW(2.0, 32)
+ ESAC
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRNDFXPNTPD" form="zmm {k}, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
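+<!-- Illustrative usage, a sketch (per the operations above, _MM_EXPADJ_NONE leaves
+     the rounded value unscaled; input values are hypothetical):
+
+       __m512 x = _mm512_set1_ps(1.7f);
+       __m512 r = _mm512_roundfxpnt_adjust_ps(x, _MM_FROUND_TO_NEAREST_INT,
+                                              _MM_EXPADJ_NONE);   // every lane: 2.0f
+-->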
+<intrinsic tech="KNC" name="_mm512_rsqrt23_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of accuracy and stores the result in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := Sqrt(1.0 / a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRSQRT23PS" form="zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_rsqrt23_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of accuracy and stores the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := Sqrt(1.0 / a[i+31:i])
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VRSQRT23PS" form="zmm {k}, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
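+<!-- Illustrative usage, a sketch with hypothetical values:
+
+       __m512 x = _mm512_set1_ps(16.0f);
+       __m512 r = _mm512_rsqrt23_ps(x);   // every lane is ~0.25f (1/sqrt(16))
+-->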
+<intrinsic tech="KNC" name="_mm512_scale_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i]))
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEPS" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_scale_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <description>Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEPS" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_scale_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst". Intermediate elements are rounded using "rounding".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i]))
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEPS" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_scale_round_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512i" varname="b" etype="UI32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are rounded using constant "rounding".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ dst[i+31:i] := a[i+31:i] * POW(2.0, FP32(b[i+31:i]))
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VSCALEPS" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
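+<!-- A sketch: _mm512_scale_ps behaves like a vector ldexpf, scaling each float lane
+     by 2**(corresponding integer lane of "b"); the values below are hypothetical.
+
+       __m512  a = _mm512_set1_ps(3.0f);
+       __m512i e = _mm512_set1_epi32(4);
+       __m512  r = _mm512_scale_ps(a, e);   // every lane: 3.0f * 16 = 48.0f
+-->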
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_reduce_gmin_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst".</description>
+ <operation>min := a[31:0]
+FOR j := 1 to 15
+ i := j*32
+ min := FpMin(min, a[i+31:i])
+ENDFOR
+dst := min
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_reduce_gmin_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst" using writemask "k" (elements are ignored when the corresponding mask bit is not set).</description>
+ <operation>min := a[31:0]
+FOR j := 1 to 15
+ i := j*32
+ IF k[j]
+ min := FpMin(min, a[i+31:i])
+ FI
+ENDFOR
+dst := min
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_reduce_gmin_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst".</description>
+ <operation>min := a[63:0]
+FOR j := 1 to 7
+ i := j*64
+ min := FpMin(min, a[i+63:i])
+ENDFOR
+dst := min
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_reduce_gmin_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).</description>
+ <operation>min := a[63:0]
+FOR j := 1 to 7
+ i := j*64
+ IF k[j]
+ min := FpMin(min, a[i+63:i])
+ FI
+ENDFOR
+dst := min
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_reduce_gmax_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst".</description>
+ <operation>max := a[31:0]
+FOR j := 1 to 15
+ i := j*32
+ max := FpMax(max, a[i+31:i])
+ENDFOR
+dst := max
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_reduce_gmax_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <description>Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).</description>
+ <operation>max := a[31:0]
+FOR j := 1 to 15
+ i := j*32
+ IF k[j]
+ max := FpMax(max, a[i+31:i])
+ FI
+ENDFOR
+dst := max
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_reduce_gmax_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst".</description>
+ <operation>max := a[63:0]
+FOR j := 1 to 7
+ i := j*64
+ max := FpMax(max, a[i+63:i])
+ENDFOR
+dst := max
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_reduce_gmax_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Special Math Functions</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <description>Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).</description>
+ <operation>max := a[63:0]
+FOR j := 1 to 7
+ i := j*64
+ IF k[j]
+ max := FpMax(max, a[i+63:i])
+ FI
+ENDFOR
+dst := max
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
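+<!-- Scalar equivalent of the gmin/gmax reductions above, a sketch assuming FpMin
+     behaves like C's fminf for the inputs at hand:
+
+       #include <math.h>
+       float reduce_gmin_ps(const float v[16]) {
+           float m = v[0];
+           for (int j = 1; j < 16; j++)
+               m = fminf(m, v[j]);   // running minimum across all 16 lanes
+           return m;
+       }
+-->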
+<intrinsic tech="KNC" name="_mm_tzcnti_32">
+ <CPUID>KNCNI</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="SI32"/>
+ <parameter type="unsigned int" varname="x" etype="UI32"/>
+ <description>Count the number of trailing zero bits in unsigned 32-bit integer "x" starting at bit "a", and return that count in "dst".</description>
+ <operation>
+tmp := a
+IF tmp &lt; 0
+ tmp := 0
+FI
+dst := 0
+IF tmp &gt; 31
+ dst := 32
+ELSE
+ DO WHILE ((tmp &lt; 32) AND x[tmp] == 0)
+ tmp := tmp + 1
+ dst := dst + 1
+ OD
+FI
+ </operation>
+ <instruction name="TZCNTI" form="r32, r32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm_tzcnti_64">
+ <CPUID>KNCNI</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="a" etype="SI64"/>
+ <parameter type="unsigned __int64" varname="x" etype="UI64"/>
+ <description>Count the number of trailing zero bits in unsigned 64-bit integer "x" starting at bit "a", and return that count in "dst".</description>
+ <operation>
+tmp := a
+IF tmp &lt; 0
+ tmp := 0
+FI
+dst := 0
+IF tmp &gt; 63
+ dst := 64
+ELSE
+ DO WHILE ((tmp &lt; 64) AND x[tmp] == 0)
+ tmp := tmp + 1
+ dst := dst + 1
+ OD
+FI
+ </operation>
+ <instruction name="TZCNTI" form="r64, r64"/>
+ <header>immintrin.h</header>
+</intrinsic>
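+<!-- Illustrative usage, a sketch with hypothetical values (x = 0xA0 has bits 5 and
+     7 set, all others clear):
+
+       unsigned int x = 0xA0;
+       int n0 = _mm_tzcnti_32(0, x);   // counts zeros from bit 0 upward: returns 5
+       int n6 = _mm_tzcnti_32(6, x);   // starts at bit 6; bit 7 is set: returns 1
+-->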
+<intrinsic tech="KNC" name="_mm_delay_32">
+ <CPUID>KNCNI</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="r1" etype="UI32"/>
+ <description>Stalls a thread without blocking other threads for 32-bit unsigned integer "r1" clock cycles.</description>
+ <operation>BlockThread(r1)
+ </operation>
+ <instruction name="DELAY" form="r32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm_delay_64">
+ <CPUID>KNCNI</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned __int64" varname="r1" etype="UI64"/>
+ <description>Stalls a thread without blocking other threads for 64-bit unsigned integer "r1" clock cycles.</description>
+ <operation>BlockThread(r1)
+ </operation>
+ <instruction name="DELAY" form="r64"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm_spflt_32">
+ <CPUID>KNCNI</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="r1" etype="UI32"/>
+ <description>Set performance monitoring filtering mask to 32-bit unsigned integer "r1".</description>
+ <operation>SetPerfMonMask(r1[31:0])
+ </operation>
+ <instruction name="SPFLT" form="r32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm_spflt_64">
+ <CPUID>KNCNI</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned __int64" varname="r1" etype="UI64"/>
+ <description>Set performance monitoring filtering mask to 64-bit unsigned integer "r1".</description>
+ <operation>SetPerfMonMask(r1[63:0])
+ </operation>
+ <instruction name="SPFLT" form="r64"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm_clevict">
+ <CPUID>KNCNI</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="const void *" varname="ptr"/>
+ <parameter type="int" varname="level" etype="UI32"/>
+ <description>Evicts the cache line containing the address "ptr" from cache level "level" (can be either 0 or 1).</description>
+ <operation>CacheLineEvict(ptr, level)
+ </operation>
+ <instruction name="CLEVICT0" form="m8"/>
+ <instruction name="CLEVICT1" form="m8"/>
+ <header>immintrin.h</header>
+</intrinsic>
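+<!-- Illustrative usage, a sketch ("buf" is hypothetical cached data):
+
+       extern const char buf[64];
+       _mm_delay_32(1000);    // park this hardware thread for ~1000 cycles
+       _mm_clevict(buf, 0);   // evict buf's line from cache level 0
+-->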
+<intrinsic tech="KNC" name="_mm512_kandnr">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="dst" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Performs a bitwise AND operation between NOT of "k2" and "k1", storing the result in "dst".</description>
+ <operation>dst[15:0] := NOT(k2[15:0]) &amp; k1[15:0]
+ </operation>
+ <instruction name="KANDNR" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kswapb">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Moves high byte from "k2" to low byte of "k1", and moves low byte of "k2" to high byte of "k1".</description>
+ <operation>
+tmp[7:0] := k2[15:8]
+k2[15:8] := k1[7:0]
+k1[7:0] := tmp[7:0]
+tmp[7:0] := k2[7:0]
+k2[7:0] := k1[15:8]
+k1[15:8] := tmp[7:0]
+ </operation>
+ <instruction name="KMERGE2L1H" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kortestz">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0.</description>
+ <operation>dst[15:0] := k1[15:0] | k2[15:0]
+IF dst == 0
+ SetZF()
+FI
+ </operation>
+ <instruction name="KORTEST" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kortestc">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's.</description>
+ <operation>dst[15:0] := k1[15:0] | k2[15:0]
+IF PopCount(dst[15:0]) == 16
+ SetCF()
+FI
+ </operation>
+ <instruction name="KORTEST" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask2int">
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <description>Converts bit mask "k1" into an integer value, storing the result in "dst".</description>
+ <operation>
+dst := ZeroExtend32(k1)
+ </operation>
+ <instruction name="KMOV" form="r32, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_int2mask">
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="dst" etype="MASK"/>
+ <parameter type="int" varname="mask" etype="UI16"/>
+ <description>Converts integer "mask" into a bitmask, storing the result in "dst".</description>
+ <operation>
+dst := mask[15:0]
+ </operation>
+ <instruction name="KMOV" form="k, r32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kconcathi_64">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Packs masks "k1" and "k2" into the high 32 bits of "dst". The rest of "dst" is set to 0.</description>
+ <operation>
+dst[63:48] := k1[15:0]
+dst[47:32] := k2[15:0]
+dst[31:0] := 0
+ </operation>
+ <instruction name="KCONCATH" form="r64, k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kconcatlo_64">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Packs masks "k1" and "k2" into the low 32 bits of "dst". The rest of "dst" is set to 0.</description>
+ <operation>
+dst[31:16] := k1[15:0]
+dst[15:0] := k2[15:0]
+dst[63:32] := 0
+ </operation>
+ <instruction name="KCONCATL" form="r64, k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kextract_64">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="dst" etype="MASK"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="b" etype="UI32"/>
+ <description>Extracts the 16-bit field of 64-bit integer "a" selected by "b", storing the result in "dst".</description>
+ <operation>
+CASE b[1:0] OF
+0: dst[15:0] := a[63:48]
+1: dst[15:0] := a[47:32]
+2: dst[15:0] := a[31:16]
+3: dst[15:0] := a[15:0]
+ESAC
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="KEXTRACT" form="k, r64, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
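+<!-- Illustrative round trip, a sketch with hypothetical mask values: kconcatlo
+     packs two 16-bit masks into the low 32 bits of a 64-bit integer, and kextract
+     with selector 3 reads back bits [15:0].
+
+       __mmask16 k1 = _mm512_int2mask(0x00FF);
+       __mmask16 k2 = _mm512_int2mask(0xF00F);
+       __int64 packed = _mm512_kconcatlo_64(k1, k2);    // [31:16]=k1, [15:0]=k2
+       __mmask16 back = _mm512_kextract_64(packed, 3);  // == k2
+-->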
+<intrinsic tech="KNC" name="_mm512_fmadd233_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ base := (j &amp; ~0x3) * 32
+ scale[31:0] := b[base+63:base+32]
+ bias[31:0] := b[base+31:base]
+ dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD233PS" form="zmm, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_fmadd233_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="__m512" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*32
+ IF k[j]
+ base := (j &amp; ~0x3) * 32
+ scale[31:0] := b[base+63:base+32]
+ bias[31:0] := b[base+31:base]
+ dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VFMADD233PS" form="zmm {k}, zmm, m512"/>
+ <header>immintrin.h</header>
+</intrinsic>
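+<!-- A sketch of the fmadd233 semantics: within each 4-element group, every element
+     of "a" is transformed as a*scale + bias, where scale is element 1 and bias is
+     element 0 of the matching group in "b". Scalar equivalent (hypothetical arrays):
+
+       for (int g = 0; g < 4; g++)        // four groups of four lanes
+           for (int t = 0; t < 4; t++)
+               dst[4*g + t] = a[4*g + t] * b[4*g + 1] + b[4*g + 0];
+-->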
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64extgather_epi32lo">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 32-bit memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE: dst[i+31:i] := MEM[addr+31:addr]
+ _MM_UPCONV_EPI32_UINT8: dst[i+31:i] := ZeroExtend32(MEM[addr+7:addr])
+ _MM_UPCONV_EPI32_SINT8: dst[i+31:i] := SignExtend32(MEM[addr+7:addr])
+ _MM_UPCONV_EPI32_UINT16: dst[i+31:i] := ZeroExtend32(MEM[addr+15:addr])
+ _MM_UPCONV_EPI32_SINT16: dst[i+31:i] := SignExtend32(MEM[addr+15:addr])
+ ESAC
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64extgather_epi32lo">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI32"/>
+ <parameter type="_MM_UPCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 32-bit memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_UPCONV_EPI32_NONE: dst[i+31:i] := MEM[addr+31:addr]
+ _MM_UPCONV_EPI32_UINT8: dst[i+31:i] := ZeroExtend32(MEM[addr+7:addr])
+ _MM_UPCONV_EPI32_SINT8: dst[i+31:i] := SignExtend32(MEM[addr+7:addr])
+ _MM_UPCONV_EPI32_UINT16: dst[i+31:i] := ZeroExtend32(MEM[addr+15:addr])
+ _MM_UPCONV_EPI32_SINT16: dst[i+31:i] := SignExtend32(MEM[addr+15:addr])
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64extgather_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 64-bit memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst". "hint" indicates to the processor whether the load is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE: dst[i+63:i] := MEM[addr+63:addr]
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64extgather_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI64"/>
+ <parameter type="__m512i" varname="src" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const*" varname="base_addr" etype="UI64"/>
+ <parameter type="_MM_UPCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_EPI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 64-bit memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the load is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_UPCONV_EPI64_NONE: dst[i+63:i] := MEM[addr+63:addr]
+ ESAC
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64extgather_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in the lower half of "dst". "hint" indicates to the processor whether the load is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_UPCONV_PS_NONE: dst[i+31:i] := MEM[addr+31:addr]
+ _MM_UPCONV_PS_FLOAT16: dst[i+31:i] := Convert_FP16_To_FP32(MEM[addr+15:addr])
+ _MM_UPCONV_PS_UINT8: dst[i+31:i] := Convert_UInt8_To_FP32(MEM[addr+7:addr])
+ _MM_UPCONV_PS_SINT8: dst[i+31:i] := Convert_Int8_To_FP32(MEM[addr+7:addr])
+ _MM_UPCONV_PS_UINT16: dst[i+31:i] := Convert_UInt16_To_FP32(MEM[addr+15:addr])
+ _MM_UPCONV_PS_SINT16: dst[i+31:i] := Convert_Int16_To_FP32(MEM[addr+15:addr])
+ ESAC
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64extgather_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="FP32"/>
+ <parameter type="_MM_UPCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in the lower half of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the load is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_UPCONV_PS_NONE: dst[i+31:i] := MEM[addr+31:addr]
+ _MM_UPCONV_PS_FLOAT16: dst[i+31:i] := Convert_FP16_To_FP32(MEM[addr+15:addr])
+ _MM_UPCONV_PS_UINT8: dst[i+31:i] := Convert_UInt8_To_FP32(MEM[addr+7:addr])
+ _MM_UPCONV_PS_SINT8: dst[i+31:i] := Convert_Int8_To_FP32(MEM[addr+7:addr])
+ _MM_UPCONV_PS_UINT16: dst[i+31:i] := Convert_UInt16_To_FP32(MEM[addr+15:addr])
+ _MM_UPCONV_PS_SINT16: dst[i+31:i] := Convert_Int16_To_FP32(MEM[addr+15:addr])
+ ESAC
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64extgather_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_UPCONV_PD_NONE: dst[i+63:i] := MEM[addr+63:addr]
+ ESAC
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64extgather_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512d" varname="dst" etype="FP64"/>
+ <parameter type="__m512d" varname="src" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="FP64"/>
+ <parameter type="_MM_UPCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_UPCONV_PD"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_UPCONV_PD_NONE: dst[i+63:i] := MEM[addr+63:addr]
+ ESAC
+ ELSE
+ dst[i+63:i] := src[i+63:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
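+<!-- Illustrative usage, a sketch ("table" and the indices are hypothetical;
+     _MM_HINT_NONE requests a normal, temporal load):
+
+       double table[64];
+       __m512i idx = _mm512_set1_epi64(3);   // all eight indices = 3
+       __m512d g = _mm512_i64extgather_pd(idx, table, _MM_UPCONV_PD_NONE,
+                                          sizeof(double), _MM_HINT_NONE);
+-->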
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64extscatter_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed single-precision (32-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i]
+ _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i])
+ _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i])
+ _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i])
+ _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i])
+ _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i])
+ ESAC
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64extscatter_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_DOWNCONV_PS_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PS"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed single-precision (32-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are only written when the corresponding mask bit is set in "k"; otherwise, elements are unchanged in memory. "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_PS_NONE: MEM[addr+31:addr] := a[i+31:i]
+ _MM_DOWNCONV_PS_FLOAT16: MEM[addr+15:addr] := Convert_FP32_To_FP16(a[i+31:i])
+ _MM_DOWNCONV_PS_UINT8: MEM[addr+ 7:addr] := Convert_FP32_To_UInt8(a[i+31:i])
+ _MM_DOWNCONV_PS_SINT8: MEM[addr+ 7:addr] := Convert_FP32_To_Int8(a[i+31:i])
+ _MM_DOWNCONV_PS_UINT16: MEM[addr+15:addr] := Convert_FP32_To_UInt16(a[i+31:i])
+ _MM_DOWNCONV_PS_SINT16: MEM[addr+15:addr] := Convert_FP32_To_Int16(a[i+31:i])
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64extscatter_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i]
+ ESAC
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64extscatter_pd">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512d" varname="a" etype="FP64"/>
+ <parameter type="_MM_DOWNCONV_PD_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_PD"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed double-precision (64-bit) floating-point elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are written to memory using writemask "k" (elements are not stored to memory when the corresponding mask bit is not set; the memory location is left unchanged). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_PD_NONE: MEM[addr+63:addr] := a[i+63:i]
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64extscatter_epi32lo">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts the low 8 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i]
+ _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i])
+ _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+31:i])
+ ESAC
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64extscatter_epi32lo">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="UI32"/>
+ <parameter type="_MM_DOWNCONV_EPI32_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts the low 8 packed 32-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Elements are written to memory using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, the memory location is left unchanged). "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_EPI32_NONE: MEM[addr+31:addr] := a[i+31:i]
+ _MM_DOWNCONV_EPI32_UINT8: MEM[addr+ 7:addr] := Truncate8(a[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT8: MEM[addr+ 7:addr] := Saturate8(a[i+31:i])
+ _MM_DOWNCONV_EPI32_UINT16: MEM[addr+15:addr] := Truncate16(a[i+31:i])
+ _MM_DOWNCONV_EPI32_SINT16: MEM[addr+15:addr] := Saturate16(a[i+31:i])
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64extscatter_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI64"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed 64-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ CASE conv OF
+ _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i]
+ ESAC
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64extscatter_epi64">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI64"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="UI64"/>
+ <parameter type="_MM_DOWNCONV_EPI64_ENUM" varname="conv" etype="IMM" immtype="_MM_DOWNCONV_EPI64"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <parameter type="int" varname="hint" etype="UI32" hint="TRUE" immtype="_MM_HINT_EXT"/>
+ <description>Down-converts 8 packed 64-bit integer elements in "a" using "conv" and stores them in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. "hint" indicates to the processor whether the data is non-temporal.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*64
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+
+ IF k[j]
+ CASE conv OF
+ _MM_DOWNCONV_EPI64_NONE: MEM[addr+63:addr] := a[i+63:i]
+ ESAC
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_permute4f128_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in "a" using constant "imm8". The results are stored in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE control[1:0] OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+FOR j := 0 to 3
+ i := j*128
+ n := j*2
+ dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMF32X4" form="zmm, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_mask_permute4f128_ps">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Swizzle</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="_MM_PERM_ENUM" varname="imm8" etype="IMM" immtype="_MM_PERM"/>
+ <description>Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in "a" using constant "imm8". The results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE control[1:0] OF
+ 0: tmp[127:0] := src[127:0]
+ 1: tmp[127:0] := src[255:128]
+ 2: tmp[127:0] := src[383:256]
+ 3: tmp[127:0] := src[511:384]
+ ESAC
+ RETURN tmp[127:0]
+}
+tmp[511:0] := 0
+FOR j := 0 to 3
+ i := j*128
+ n := j*2
+ tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
+ENDFOR
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+ dst[i+31:i] := tmp[i+31:i]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPERMF32X4" form="zmm {k}, m512, imm8"/>
+ <header>immintrin.h</header>
+</intrinsic>
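+<!-- Editor's note: KNCNI intrinsics ship only in Xeon Phi toolchains, so the
+sketch below models the SELECT4 pseudocode above in portable scalar C instead
+of calling the intrinsic; the v512 type and function name are illustrative.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Each 2-bit field of imm8 picks one of the four 128-bit lanes of a
+   512-bit vector, modelled here as four lanes of four floats. */
+typedef struct { float lane[4][4]; } v512;
+
+static v512 permute4f128(v512 a, unsigned imm8) {
+    v512 dst;
+    for (int j = 0; j < 4; j++) {
+        unsigned sel = (imm8 >> (2 * j)) & 3;  /* control[1:0] per lane */
+        memcpy(dst.lane[j], a.lane[sel], sizeof dst.lane[j]);
+    }
+    return dst;
+}
+
+int main(void) {
+    v512 a;
+    for (int j = 0; j < 4; j++)
+        for (int e = 0; e < 4; e++)
+            a.lane[j][e] = (float)(j * 4 + e);
+    v512 r = permute4f128(a, 0x1B);  /* 0b00011011: reverse the four lanes */
+    printf("%g %g %g %g\n", r.lane[0][0], r.lane[1][0], r.lane[2][0], r.lane[3][0]);
+    return 0;
+}
+-->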
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64gather_epi32lo">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Loads 8 32-bit integer memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64gather_epi32lo">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512i" varname="dst" etype="UI32"/>
+ <parameter type="__m512i" varname="src" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="UI32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Loads 8 32-bit integer memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64gather_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Loads 8 single-precision (32-bit) floating-point memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64gather_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Load</category>
+ <return type="__m512" varname="dst" etype="FP32"/>
+ <parameter type="__m512" varname="src" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="void const *" varname="base_addr" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Loads 8 single-precision (32-bit) floating-point memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ dst[i+31:i] := MEM[addr+31:addr]
+ ELSE
+ dst[i+31:i] := src[i+31:i]
+ FI
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64scatter_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Stores 8 packed single-precision (32-bit) floating-point elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64scatter_pslo">
+ <type>Floating Point</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="FP32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512" varname="a" etype="FP32"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Stores 8 packed single-precision (32-bit) floating-point elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are only written to memory when the corresponding mask bit is set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_i64scatter_epi32lo">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI32"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Stores 8 packed 32-bit integer elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" sequence="TRUE" name="_mm512_mask_i64scatter_epi32lo">
+ <type>Integer</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="base_addr" etype="UI32"/>
+ <parameter type="__mmask8" varname="k" etype="MASK"/>
+ <parameter type="__m512i" varname="vindex" etype="SI64"/>
+ <parameter type="__m512i" varname="a" etype="M512"/>
+ <parameter type="int" varname="scale" etype="IMM" immtype="_MM_INDEX_SCALE"/>
+ <description>Stores 8 packed 32-bit integer elements in "a" in memory locations starting at location "base_addr" at packed 64-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements are only written to memory when the corresponding mask bit is set).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*32
+ m := j*64
+ IF k[j]
+ addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8
+ MEM[addr+31:addr] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
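+<!-- Editor's note: a minimal scalar sketch of the masked scatter above. The
+guide's pseudocode computes bit addresses (hence the "* 8"); this model uses
+ordinary byte addressing. Function and variable names are illustrative.
+
+#include <stdint.h>
+#include <stdio.h>
+
+static void mask_i64scatter_epi32(void *base, uint8_t k,
+                                  const int64_t vindex[8],
+                                  const int32_t a[8], int scale) {
+    for (int j = 0; j < 8; j++) {
+        if (k & (1u << j)) {  /* element written only when k[j] is set */
+            char *addr = (char *)base + vindex[j] * (int64_t)scale;
+            *(int32_t *)addr = a[j];
+        }
+    }
+}
+
+int main(void) {
+    int32_t mem[16] = {0};
+    int64_t idx[8]  = {0, 2, 4, 6, 8, 10, 12, 14};
+    int32_t val[8]  = {1, 2, 3, 4, 5, 6, 7, 8};
+    mask_i64scatter_epi32(mem, 0x0F, idx, val, 4);  /* store the low 4 elements */
+    printf("mem[0]=%d mem[4]=%d mem[8]=%d\n", mem[0], mem[4], mem[8]);
+    return 0;
+}
+-->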
+<intrinsic tech="KNC" name="_mm512_kmerge2l1h">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Move the high element from "k1" to the low element of "k1", and insert the low element of "k2" into the high element of "k1".</description>
+ <operation>
+tmp[7:0] := k1[15:8]
+k1[15:8] := k2[7:0]
+k1[7:0] := tmp[7:0]
+ </operation>
+ <instruction name="KMERGE2L1H" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" name="_mm512_kmerge2l1l">
+ <type>Mask</type>
+ <CPUID>KNCNI</CPUID>
+ <category>Mask</category>
+ <return type="__mmask16" varname="k" etype="MASK"/>
+ <parameter type="__mmask16" varname="k1" etype="MASK"/>
+ <parameter type="__mmask16" varname="k2" etype="MASK"/>
+ <description>Insert the low element of "k2" into the high element of "k1".</description>
+ <operation>
+k1[15:8] := k2[7:0]
+ </operation>
+ <instruction name="KMERGE2L1L" form="k, k"/>
+ <header>immintrin.h</header>
+</intrinsic>
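+<!-- Editor's note: scalar C models of the two mask merges above, assuming the
+usual 16-bit representation of __mmask16; the function names are illustrative.
+
+#include <stdint.h>
+#include <stdio.h>
+
+static uint16_t kmerge2l1h(uint16_t k1, uint16_t k2) {
+    /* high byte of k1 moves to the low byte; low byte of k2 becomes the high byte */
+    return (uint16_t)(((k2 & 0xFF) << 8) | (k1 >> 8));
+}
+
+static uint16_t kmerge2l1l(uint16_t k1, uint16_t k2) {
+    /* low byte of k2 replaces the high byte of k1 */
+    return (uint16_t)(((k2 & 0xFF) << 8) | (k1 & 0xFF));
+}
+
+int main(void) {
+    printf("%04X %04X\n", kmerge2l1h(0xAB12, 0xCD34), kmerge2l1l(0xAB12, 0xCD34));
+    /* prints 34AB 3412 */
+    return 0;
+}
+-->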
+<intrinsic tech="Other" name="_lzcnt_u32">
+ <type>Integer</type>
+ <CPUID>LZCNT</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Count the number of leading zero bits in unsigned 32-bit integer "a", and return that count in "dst".</description>
+ <operation>
+tmp := 31
+dst := 0
+DO WHILE (tmp &gt;= 0 AND a[tmp] == 0)
+ tmp := tmp - 1
+ dst := dst + 1
+OD
+ </operation>
+ <instruction name="LZCNT" form="r32, r32" xed="LZCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
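+<!-- Editor's note: a small usage sketch of _lzcnt_u32 itself. It assumes a
+toolchain with LZCNT enabled (e.g. gcc or clang with -mlzcnt); on CPUs without
+LZCNT the encoding decays to BSR and returns different values, so production
+code should gate on CPUID.
+
+#include <stdio.h>
+#include <immintrin.h>
+
+int main(void) {
+    unsigned int a = 0x00010000u;                      /* bit 16 set */
+    printf("lzcnt(0x%08X) = %u\n", a, _lzcnt_u32(a));  /* 15 leading zeros */
+    printf("lzcnt(0) = %u\n", _lzcnt_u32(0));          /* operand width: 32 */
+    return 0;
+}
+-->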
+<intrinsic tech="Other" name="_lzcnt_u64">
+ <type>Integer</type>
+ <CPUID>LZCNT</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Count the number of leading zero bits in unsigned 64-bit integer "a", and return that count in "dst".</description>
+ <operation>
+tmp := 63
+dst := 0
+DO WHILE (tmp &gt;= 0 AND a[tmp] == 0)
+ tmp := tmp - 1
+ dst := dst + 1
+OD
+ </operation>
+ <instruction name="LZCNT" form="r64, r64" xed="LZCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_from_int64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Copy 64-bit integer "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="MOVQ" form="mm, r64" xed="MOVQ_MMXq_GPR64"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_to_int64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m64" varname="a" etype="FP32"/>
+ <description>Copy 64-bit integer "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="MOVQ" form="r64, mm" xed="MOVQ_GPR64_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_empty">
+ <CPUID>MMX</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures.</description>
+ <instruction name="EMMS" xed="EMMS"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_from_int">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := 0
+ </operation>
+ <instruction name="MOVD" form="mm, r32" xed="MOVD_MMXq_GPR32"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_to_int">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m64" varname="a" etype="FP32"/>
+ <description>Copy the lower 32-bit integer in "a" to "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+ </operation>
+ <instruction name="MOVD" form="r32, mm" xed="MOVD_GPR32_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_packsswb">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="SI8"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := Saturate8(a[15:0])
+dst[15:8] := Saturate8(a[31:16])
+dst[23:16] := Saturate8(a[47:32])
+dst[31:24] := Saturate8(a[63:48])
+dst[39:32] := Saturate8(b[15:0])
+dst[47:40] := Saturate8(b[31:16])
+dst[55:48] := Saturate8(b[47:32])
+dst[63:56] := Saturate8(b[63:48])
+ </operation>
+ <instruction name="PACKSSWB" form="mm, mm" xed="PACKSSWB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
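+<!-- Editor's note: a scalar model of the Saturate8 step used by PACKSSWB
+above; a signed 16-bit value is clamped to [-128, 127] before narrowing.
+
+#include <stdint.h>
+#include <stdio.h>
+
+static int8_t saturate8(int16_t x) {
+    if (x > 127)  return 127;    /* clamp positive overflow */
+    if (x < -128) return -128;   /* clamp negative overflow */
+    return (int8_t)x;
+}
+
+int main(void) {
+    int16_t in[4] = {300, -300, 5, -5};
+    for (int j = 0; j < 4; j++)
+        printf("%d becomes %d\n", in[j], saturate8(in[j]));
+    return 0;
+}
+-->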
+<intrinsic tech="MMX" name="_m_packssdw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[31:0])
+dst[31:16] := Saturate16(a[63:32])
+dst[47:32] := Saturate16(b[31:0])
+dst[63:48] := Saturate16(b[63:32])
+ </operation>
+ <instruction name="PACKSSDW" form="mm, mm" xed="PACKSSDW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_packuswb">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := SaturateU8(a[15:0])
+dst[15:8] := SaturateU8(a[31:16])
+dst[23:16] := SaturateU8(a[47:32])
+dst[31:24] := SaturateU8(a[63:48])
+dst[39:32] := SaturateU8(b[15:0])
+dst[47:40] := SaturateU8(b[31:16])
+dst[55:48] := SaturateU8(b[47:32])
+dst[63:56] := SaturateU8(b[63:48])
+ </operation>
+ <instruction name="PACKUSWB" form="mm, mm" xed="PACKUSWB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_punpckhbw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) {
+ dst[7:0] := src1[39:32]
+ dst[15:8] := src2[39:32]
+ dst[23:16] := src1[47:40]
+ dst[31:24] := src2[47:40]
+ dst[39:32] := src1[55:48]
+ dst[47:40] := src2[55:48]
+ dst[55:48] := src1[63:56]
+ dst[63:56] := src2[63:56]
+ RETURN dst[63:0]
+}
+dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0])
+ </operation>
+ <instruction name="PUNPCKHBW" form="mm, mm" xed="PUNPCKHBW_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_punpckhwd">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) {
+ dst[15:0] := src1[47:32]
+ dst[31:16] := src2[47:32]
+ dst[47:32] := src1[63:48]
+ dst[63:48] := src2[63:48]
+ RETURN dst[63:0]
+}
+dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0])
+ </operation>
+ <instruction name="PUNPCKLBW" form="mm, mm" xed="PUNPCKLBW_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_punpckhdq">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32]
+dst[63:32] := b[63:32]
+ </operation>
+ <instruction name="PUNPCKHDQ" form="mm, mm" xed="PUNPCKHDQ_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_punpcklbw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ RETURN dst[63:0]
+}
+dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0])
+ </operation>
+ <instruction name="PUNPCKLBW" form="mm, mm" xed="PUNPCKLBW_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_punpcklwd">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ RETURN dst[63:0]
+}
+dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0])
+ </operation>
+ <instruction name="PUNPCKLWD" form="mm, mm" xed="PUNPCKLWD_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_punpckldq">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := b[31:0]
+ </operation>
+ <instruction name="PUNPCKLDQ" form="mm, mm" xed="PUNPCKLDQ_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
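+<!-- Editor's note: a scalar model of the low-half byte interleave (PUNPCKLBW)
+above: the low four bytes of a and b are woven together as a0 b0 a1 b1 a2 b2
+a3 b3. The helper name is illustrative.
+
+#include <stdint.h>
+#include <stdio.h>
+
+static uint64_t punpcklbw(uint64_t a, uint64_t b) {
+    uint64_t dst = 0;
+    for (int j = 0; j < 4; j++) {
+        dst |= ((a >> (8 * j)) & 0xFF) << (16 * j);      /* even bytes from a */
+        dst |= ((b >> (8 * j)) & 0xFF) << (16 * j + 8);  /* odd bytes from b  */
+    }
+    return dst;
+}
+
+int main(void) {
+    unsigned long long r = punpcklbw(0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL);
+    printf("%016llX\n", r);  /* 0B030A0209010800 */
+    return 0;
+}
+-->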
+<intrinsic tech="MMX" name="_m_paddb">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDB" form="mm, mm" xed="PADDB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_paddw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDW" form="mm, mm" xed="PADDW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_paddd">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDD" form="mm, mm" xed="PADDD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_paddsb">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDSB" form="mm, mm" xed="PADDSB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_paddsw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDSW" form="mm, mm" xed="PADDSW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_paddusb">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDUSB" form="mm, mm" xed="PADDUSB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_paddusw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDUSW" form="mm, mm" xed="PADDUSW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psubb">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBB" form="mm, mm" xed="PSUBB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psubw">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBW" form="mm, mm" xed="PSUBW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psubd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBD" form="mm, mm" xed="PSUBD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psubsb">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBSB" form="mm, mm" xed="PSUBSB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psubsw">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBSW" form="mm, mm" xed="PSUBSW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psubusb">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBUSB" form="mm, mm" xed="PSUBUSB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psubusw">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBUSW" form="mm, mm" xed="PSUBUSW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pmaddwd">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMADDWD" form="mm, mm" xed="PMADDWD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
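+<!-- Editor's note: a scalar model of PMADDWD above: corresponding signed
+16-bit elements are multiplied to 32-bit products, and each adjacent pair of
+products is summed. This is the classic fixed-point dot-product step.
+
+#include <stdint.h>
+#include <stdio.h>
+
+static void pmaddwd(const int16_t a[4], const int16_t b[4], int32_t dst[2]) {
+    for (int j = 0; j < 2; j++)
+        dst[j] = (int32_t)a[2*j] * b[2*j] + (int32_t)a[2*j+1] * b[2*j+1];
+}
+
+int main(void) {
+    int16_t a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40};
+    int32_t d[2];
+    pmaddwd(a, b, d);
+    printf("%d %d\n", d[0], d[1]);  /* 1*10+2*20 = 50, 3*30+4*40 = 250 */
+    return 0;
+}
+-->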
+<intrinsic tech="MMX" name="_m_pmulhw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+ </operation>
+ <instruction name="PMULHW" form="mm, mm" xed="PMULHW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pmullw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[15:0]
+ENDFOR
+ </operation>
+ <instruction name="PMULLW" form="mm, mm" xed="PMULLW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psllw">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLW" form="mm, mm" xed="PSLLW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psllwi">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLW" form="mm, imm8" xed="PSLLW_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pslld">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLD" form="mm, mm" xed="PSLLD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pslldi">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLD" form="mm, imm8" xed="PSLLD_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psllq">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst".</description>
+ <operation>
+IF count[63:0] &gt; 63
+ dst[63:0] := 0
+ELSE
+ dst[63:0] := ZeroExtend64(a[63:0] &lt;&lt; count[63:0])
+FI
+ </operation>
+ <instruction name="PSLLQ" form="mm, mm" xed="PSLLQ_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psllqi">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst".</description>
+ <operation>
+IF imm8[7:0] &gt; 63
+ dst[63:0] := 0
+ELSE
+ dst[63:0] := ZeroExtend64(a[63:0] &lt;&lt; imm8[7:0])
+FI
+ </operation>
+ <instruction name="PSLLQ" form="mm, imm8" xed="PSLLQ_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
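+<!-- Editor's note: the shift pseudocode above differs from C in one important
+way: a count greater than the element width yields 0, whereas in C a shift by
+at least the operand width is undefined behaviour. A minimal model:
+
+#include <stdint.h>
+#include <stdio.h>
+
+static uint64_t psllq(uint64_t a, uint64_t count) {
+    return (count > 63) ? 0 : (a << count);  /* oversized counts flush to 0 */
+}
+
+int main(void) {
+    printf("%llu\n", (unsigned long long)psllq(1, 3));    /* 8 */
+    printf("%llu\n", (unsigned long long)psllq(1, 200));  /* 0, not undefined */
+    return 0;
+}
+-->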
+<intrinsic tech="MMX" name="_m_psraw">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAW" form="mm, mm" xed="PSRAW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psrawi">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAW" form="mm, imm8" xed="PSRAW_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psrad">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAD" form="mm, mm" xed="PSRAD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psradi">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAD" form="mm, imm8" xed="PSRAD_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psrlw">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLW" form="mm, mm" xed="PSRLW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psrlwi">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLW" form="mm, imm8" xed="PSRLW_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psrld">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLD" form="mm, mm" xed="PSRLD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psrldi">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLD" form="mm, imm8" xed="PSRLD_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psrlq">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst".</description>
+ <operation>
+IF count[63:0] &gt; 63
+ dst[63:0] := 0
+ELSE
+ dst[63:0] := ZeroExtend64(a[63:0] &gt;&gt; count[63:0])
+FI
+ </operation>
+ <instruction name="PSRLQ" form="mm, mm" xed="PSRLQ_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_psrlqi">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst".</description>
+ <operation>
+IF imm8[7:0] &gt; 63
+ dst[63:0] := 0
+ELSE
+ dst[63:0] := ZeroExtend64(a[63:0] &gt;&gt; imm8[7:0])
+FI
+ </operation>
+ <instruction name="PSRLQ" form="mm, imm8" xed="PSRLQ_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pand">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Logical</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] AND b[63:0])
+ </operation>
+ <instruction name="PAND" form="mm, mm" xed="PAND_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pandn">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Logical</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := ((NOT a[63:0]) AND b[63:0])
+ </operation>
+ <instruction name="PANDN" form="mm, mm" xed="PANDN_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
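+<!-- Editor's note: PANDN complements its FIRST operand, i.e. dst = (NOT a) AND b
+rather than a AND (NOT b), which makes it a mask-clear operation: bits set in
+"a" are cleared out of "b". A one-line model:
+
+#include <stdint.h>
+#include <stdio.h>
+
+static uint64_t pandn(uint64_t a, uint64_t b) {
+    return ~a & b;  /* note the operand order */
+}
+
+int main(void) {
+    unsigned long long r = pandn(0x00000000FFFFFFFFULL, 0x0123456789ABCDEFULL);
+    printf("%016llX\n", r);  /* 0123456700000000: low 32 bits cleared */
+    return 0;
+}
+-->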
+<intrinsic tech="MMX" name="_m_por">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Logical</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] OR b[63:0])
+ </operation>
+ <instruction name="POR" form="mm, mm" xed="POR_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pxor">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Logical</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] XOR b[63:0])
+ </operation>
+ <instruction name="PXOR" form="mm, mm" xed="PXOR_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pcmpeqb">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQB" form="mm, mm" xed="PCMPEQB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pcmpeqw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQW" form="mm, mm" xed="PCMPEQW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pcmpeqd">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQD" form="mm, mm" xed="PCMPEQD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pcmpgtb">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTB" form="mm, mm" xed="PCMPGTB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pcmpgtw">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTW" form="mm, mm" xed="PCMPGTW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_m_pcmpgtd">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI64"/>
+ <parameter type="__m64" varname="b" etype="SI64"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTD" form="mm, mm" xed="PCMPGTD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
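+<!-- Editor's note: a scalar model of the element compares above. Each lane of
+the result is all-ones when the predicate holds and all-zeros otherwise, so
+the result can be ANDed with other vectors as a per-element mask.
+
+#include <stdint.h>
+#include <stdio.h>
+
+static void pcmpgtw(const int16_t a[4], const int16_t b[4], uint16_t dst[4]) {
+    for (int j = 0; j < 4; j++)
+        dst[j] = (a[j] > b[j]) ? 0xFFFF : 0;
+}
+
+int main(void) {
+    int16_t a[4] = {5, -1, 7, 0}, b[4] = {3, 2, 7, -9};
+    uint16_t m[4];
+    pcmpgtw(a, b, m);
+    printf("%04X %04X %04X %04X\n", m[0], m[1], m[2], m[3]);  /* FFFF 0000 0000 FFFF */
+    return 0;
+}
+-->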
+<intrinsic tech="MMX" name="_mm_empty">
+ <CPUID>MMX</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures.</description>
+ <instruction name="EMMS" xed="EMMS"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_add_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDB" form="mm, mm" xed="PADDB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_add_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDW" form="mm, mm" xed="PADDW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_add_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDD" form="mm, mm" xed="PADDD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_adds_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI8"/>
+ <parameter type="__m64" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDSB" form="mm, mm" xed="PADDSB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_adds_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDSW" form="mm, mm" xed="PADDSW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_adds_pu8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDUSB" form="mm, mm" xed="PADDUSB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_adds_pu16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDUSW" form="mm, mm" xed="PADDUSW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
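+<!-- Editor's note: a usage sketch calling the MMX intrinsics themselves. It
+assumes an x86-64 gcc/clang build with MMX enabled (e.g. -mmmx), where
+_mm_cvtm64_si64 is available; _mm_empty() is called before returning, as the
+descriptions above require at the end of any MMX sequence.
+
+#include <stdio.h>
+#include <mmintrin.h>
+
+int main(void) {
+    __m64 a = _mm_set_pi8(250, 200, 100, 50, 25, 10, 5, 0);  /* high byte first */
+    __m64 b = _mm_set_pi8( 10,  10,  10, 10, 10, 10, 10, 10);
+    __m64 r = _mm_adds_pu8(a, b);    /* 250+10 clamps to 255, no wraparound */
+    long long bits = _mm_cvtm64_si64(r);
+    _mm_empty();                     /* leave MMX state before any later FP code */
+    printf("%016llX\n", (unsigned long long)bits);  /* FFD26E3C23140F0A */
+    return 0;
+}
+-->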
+<intrinsic tech="MMX" name="_mm_sub_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBB" form="mm, mm" xed="PSUBB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_sub_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBW" form="mm, mm" xed="PSUBW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_sub_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBD" form="mm, mm" xed="PSUBD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_subs_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI8"/>
+ <parameter type="__m64" varname="b" etype="SI8"/>
+ <description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBSB" form="mm, mm" xed="PSUBSB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_subs_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBSW" form="mm, mm" xed="PSUBSW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_subs_pu8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBUSB" form="mm, mm" xed="PSUBUSB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_subs_pu16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBUSW" form="mm, mm" xed="PSUBUSW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_madd_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMADDWD" form="mm, mm" xed="PMADDWD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
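+<!-- Illustrative C sketch (not part of the Intel data): _mm_madd_pi16 multiplies
+     lanes and sums adjacent 32-bit products. Assumes mmintrin.h.
+       __m64 a = _mm_set_pi16(1, 2, 3, 4);      /* lanes e3..e0 */
+       __m64 b = _mm_set_pi16(10, 20, 30, 40);
+       __m64 r = _mm_madd_pi16(a, b);
+       /* low dword = 4*40 + 3*30 = 250; high dword = 2*20 + 1*10 = 50 */
+       _mm_empty();
+-->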
+<intrinsic tech="MMX" name="_mm_mulhi_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+ </operation>
+ <instruction name="PMULHW" form="mm, mm" xed="PMULHW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_mullo_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[15:0]
+ENDFOR
+ </operation>
+ <instruction name="PMULLW" form="mm, mm" xed="PMULLW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_sll_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLW" form="mm, mm" xed="PSLLW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_slli_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLW" form="mm, imm8" xed="PSLLW_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
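+<!-- Illustrative C sketch (not part of the Intel data): per the pseudocode above,
+     a shift count of 16 or more zeroes every 16-bit lane. Assumes mmintrin.h.
+       __m64 v = _mm_set1_pi16(0x0101);
+       __m64 r = _mm_slli_pi16(v, 8);   /* each lane becomes 0x0100 */
+       __m64 z = _mm_slli_pi16(v, 16);  /* count > 15, so each lane becomes 0 */
+       _mm_empty();
+-->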
+<intrinsic tech="MMX" name="_mm_sll_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLD" form="mm, mm" xed="PSLLD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_slli_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLD" form="mm, imm8" xed="PSLLD_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_sll_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst".</description>
+ <operation>
+IF count[63:0] &gt; 63
+ dst[63:0] := 0
+ELSE
+ dst[63:0] := ZeroExtend64(a[63:0] &lt;&lt; count[63:0])
+FI
+ </operation>
+ <instruction name="PSLLQ" form="mm, mm" xed="PSLLQ_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_slli_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst".</description>
+ <operation>
+IF imm8[7:0] &gt; 63
+ dst[63:0] := 0
+ELSE
+ dst[63:0] := ZeroExtend64(a[63:0] &lt;&lt; imm8[7:0])
+FI
+ </operation>
+ <instruction name="PSLLQ" form="mm, imm8" xed="PSLLQ_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_sra_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAW" form="mm, mm" xed="PSRAW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_srai_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAW" form="mm, imm8" xed="PSRAW_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_sra_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAD" form="mm, mm" xed="PSRAD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_srai_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAD" form="mm, imm8" xed="PSRAD_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_srl_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLW" form="mm, mm" xed="PSRLW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_srli_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLW" form="mm, imm8" xed="PSRLW_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_srl_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLD" form="mm, mm" xed="PSRLD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_srli_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLD" form="mm, imm8" xed="PSRLD_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_srl_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="count" etype="UI64"/>
+ <description>Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst".</description>
+ <operation>
+IF count[63:0] &gt; 63
+ dst[63:0] := 0
+ELSE
+ dst[63:0] := ZeroExtend64(a[63:0] &gt;&gt; count[63:0])
+FI
+ </operation>
+ <instruction name="PSRLQ" form="mm, mm" xed="PSRLQ_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_srli_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Shift</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst".</description>
+ <operation>
+IF imm8[7:0] &gt; 63
+ dst[63:0] := 0
+ELSE
+ dst[63:0] := ZeroExtend64(a[63:0] &gt;&gt; imm8[7:0])
+FI
+ </operation>
+ <instruction name="PSRLQ" form="mm, imm8" xed="PSRLQ_MMXq_IMMb"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_and_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Logical</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] AND b[63:0])
+ </operation>
+ <instruction name="PAND" form="mm, mm" xed="PAND_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_andnot_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Logical</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := ((NOT a[63:0]) AND b[63:0])
+ </operation>
+ <instruction name="PANDN" form="mm, mm" xed="PANDN_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_or_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Logical</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] OR b[63:0])
+ </operation>
+ <instruction name="POR" form="mm, mm" xed="POR_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_xor_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Logical</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] XOR b[63:0])
+ </operation>
+ <instruction name="PXOR" form="mm, mm" xed="PXOR_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cmpeq_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQB" form="mm, mm" xed="PCMPEQB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cmpeq_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQW" form="mm, mm" xed="PCMPEQW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cmpeq_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="b" etype="UI32"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQD" form="mm, mm" xed="PCMPEQD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cmpgt_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI8"/>
+ <parameter type="__m64" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTB" form="mm, mm" xed="PCMPGTB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cmpgt_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTW" form="mm, mm" xed="PCMPGTW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cmpgt_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Compare</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTD" form="mm, mm" xed="PCMPGTD_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cvtsi32_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Copy 32-bit integer "a" to the lower element of "dst", and zero the upper element of "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := 0
+ </operation>
+ <instruction name="MOVD" form="mm, r32" xed="MOVD_MMXq_GPR32"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cvtsi64_si32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m64" varname="a" etype="FP32"/>
+ <description>Copy the lower 32-bit integer in "a" to "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+ </operation>
+ <instruction name="MOVD" form="r32, mm" xed="MOVD_GPR32_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cvtm64_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m64" varname="a" etype="FP32"/>
+ <description>Copy 64-bit integer "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="MOVQ" form="r64, mm" xed="MOVQ_GPR64_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_cvtsi64_m64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Copy 64-bit integer "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="MOVQ" form="mm, r64" xed="MOVQ_MMXq_GPR64"/>
+ <header>mmintrin.h</header>
+</intrinsic>
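+<!-- Illustrative C sketch (not part of the Intel data): the two MOVQ conversions
+     round-trip a 64-bit value. __int64 is the MSVC/ICC type used in this data;
+     GCC/Clang accept long long instead.
+       __int64 q0 = 0x0123456789abcdefLL;
+       __m64   m  = _mm_cvtsi64_m64(q0);
+       __int64 q1 = _mm_cvtm64_si64(m);  /* q1 == q0 */
+       _mm_empty();
+-->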
+<intrinsic tech="MMX" name="_mm_setzero_si64">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m64 with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="PXOR" form="mm, mm" xed="PXOR_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_set_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="int" varname="e1" etype="UI32"/>
+ <parameter type="int" varname="e0" etype="UI32"/>
+ <description>Set packed 32-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[31:0] := e0
+dst[63:32] := e1
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_set_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="short" varname="e3" etype="UI16"/>
+ <parameter type="short" varname="e2" etype="UI16"/>
+ <parameter type="short" varname="e1" etype="UI16"/>
+ <parameter type="short" varname="e0" etype="UI16"/>
+ <description>Set packed 16-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[15:0] := e0
+dst[31:16] := e1
+dst[47:32] := e2
+dst[63:48] := e3
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_set_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="char" varname="e7" etype="UI8"/>
+ <parameter type="char" varname="e6" etype="UI8"/>
+ <parameter type="char" varname="e5" etype="UI8"/>
+ <parameter type="char" varname="e4" etype="UI8"/>
+ <parameter type="char" varname="e3" etype="UI8"/>
+ <parameter type="char" varname="e2" etype="UI8"/>
+ <parameter type="char" varname="e1" etype="UI8"/>
+ <parameter type="char" varname="e0" etype="UI8"/>
+ <description>Set packed 8-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[7:0] := e0
+dst[15:8] := e1
+dst[23:16] := e2
+dst[31:24] := e3
+dst[39:32] := e4
+dst[47:40] := e5
+dst[55:48] := e6
+dst[63:56] := e7
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_set1_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_set1_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+ <description>Broadcast 16-bit integer "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := a[15:0]
+ENDFOR
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_set1_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := a[7:0]
+ENDFOR
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_setr_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="int" varname="e1" etype="UI32"/>
+ <parameter type="int" varname="e0" etype="UI32"/>
+ <description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[31:0] := e1
+dst[63:32] := e0
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_setr_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="short" varname="e3" etype="UI16"/>
+ <parameter type="short" varname="e2" etype="UI16"/>
+ <parameter type="short" varname="e1" etype="UI16"/>
+ <parameter type="short" varname="e0" etype="UI16"/>
+ <description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[15:0] := e3
+dst[31:16] := e2
+dst[47:32] := e1
+dst[63:48] := e0
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" sequence="TRUE" name="_mm_setr_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Set</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="char" varname="e7" etype="UI8"/>
+ <parameter type="char" varname="e6" etype="UI8"/>
+ <parameter type="char" varname="e5" etype="UI8"/>
+ <parameter type="char" varname="e4" etype="UI8"/>
+ <parameter type="char" varname="e3" etype="UI8"/>
+ <parameter type="char" varname="e2" etype="UI8"/>
+ <parameter type="char" varname="e1" etype="UI8"/>
+ <parameter type="char" varname="e0" etype="UI8"/>
+ <description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[7:0] := e7
+dst[15:8] := e6
+dst[23:16] := e5
+dst[31:24] := e4
+dst[39:32] := e3
+dst[47:40] := e2
+dst[55:48] := e1
+dst[63:56] := e0
+ </operation>
+ <header>mmintrin.h</header>
+</intrinsic>
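+<!-- Illustrative C sketch (not part of the Intel data): _mm_set_* takes elements
+     from the highest lane down, _mm_setr_* in reverse (memory) order.
+       __m64 x = _mm_set_pi32(1, 2);   /* dst[63:32] = 1, dst[31:0] = 2 */
+       __m64 y = _mm_setr_pi32(1, 2);  /* dst[31:0] = 1, dst[63:32] = 2 */
+       _mm_empty();
+-->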
+<intrinsic tech="MMX" name="_mm_packs_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="SI8"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := Saturate8(a[15:0])
+dst[15:8] := Saturate8(a[31:16])
+dst[23:16] := Saturate8(a[47:32])
+dst[31:24] := Saturate8(a[63:48])
+dst[39:32] := Saturate8(b[15:0])
+dst[47:40] := Saturate8(b[31:16])
+dst[55:48] := Saturate8(b[47:32])
+dst[63:56] := Saturate8(b[63:48])
+ </operation>
+ <instruction name="PACKSSWB" form="mm, mm" xed="PACKSSWB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_packs_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[31:0])
+dst[31:16] := Saturate16(a[63:32])
+dst[47:32] := Saturate16(b[31:0])
+dst[63:48] := Saturate16(b[63:32])
+ </operation>
+ <instruction name="PACKSSDW" form="mm, mm" xed="PACKSSDW_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_packs_pu16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := SaturateU8(a[15:0])
+dst[15:8] := SaturateU8(a[31:16])
+dst[23:16] := SaturateU8(a[47:32])
+dst[31:24] := SaturateU8(a[63:48])
+dst[39:32] := SaturateU8(b[15:0])
+dst[47:40] := SaturateU8(b[31:16])
+dst[55:48] := SaturateU8(b[47:32])
+dst[63:56] := SaturateU8(b[63:48])
+ </operation>
+ <instruction name="PACKUSWB" form="mm, mm" xed="PACKUSWB_MMXq_MMXq"/>
+ <header>mmintrin.h</header>
+</intrinsic>
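+<!-- Illustrative C sketch (not part of the Intel data): packing narrows with
+     saturation, so out-of-range lanes clip rather than truncate. Assumes mmintrin.h.
+       __m64 w = _mm_set1_pi16(300);
+       __m64 s = _mm_packs_pi16(w, w);  /* signed:   300 clips to 127 per byte */
+       __m64 u = _mm_packs_pu16(w, w);  /* unsigned: 300 clips to 255 per byte */
+       _mm_empty();
+-->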
+<intrinsic tech="MMX" name="_mm_unpackhi_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) {
+ dst[7:0] := src1[39:32]
+ dst[15:8] := src2[39:32]
+ dst[23:16] := src1[47:40]
+ dst[31:24] := src2[47:40]
+ dst[39:32] := src1[55:48]
+ dst[47:40] := src2[55:48]
+ dst[55:48] := src1[63:56]
+ dst[63:56] := src2[63:56]
+ RETURN dst[63:0]
+}
+dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0])
+ </operation>
+ <instruction name="PUNPCKHBW" form="mm, mm" xed="PUNPCKHBW_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_unpackhi_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) {
+ dst[15:0] := src1[47:32]
+ dst[31:16] := src2[47:32]
+ dst[47:32] := src1[63:48]
+ dst[63:48] := src2[63:48]
+ RETURN dst[63:0]
+}
+dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0])
+ </operation>
+ <instruction name="PUNPCKLBW" form="mm, mm" xed="PUNPCKLBW_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_unpackhi_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32]
+dst[63:32] := b[63:32]
+ </operation>
+ <instruction name="PUNPCKHDQ" form="mm, mm" xed="PUNPCKHDQ_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_unpacklo_pi8">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ RETURN dst[63:0]
+}
+dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0])
+ </operation>
+ <instruction name="PUNPCKLBW" form="mm, mm" xed="PUNPCKLBW_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_unpacklo_pi16">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ RETURN dst[63:0]
+}
+dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0])
+ </operation>
+ <instruction name="PUNPCKLWD" form="mm, mm" xed="PUNPCKLWD_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="MMX" name="_mm_unpacklo_pi32">
+ <type>Integer</type>
+ <CPUID>MMX</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := b[31:0]
+ </operation>
+ <instruction name="PUNPCKLDQ" form="mm, mm" xed="PUNPCKLDQ_MMXq_MMXd"/>
+ <header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_monitor">
+ <CPUID>MONITOR</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void const*" varname="p"/>
+ <parameter type="unsigned" varname="extensions" etype="UI32"/>
+ <parameter type="unsigned" varname="hints" etype="UI32"/>
+ <description>Arm address monitoring hardware using the address specified in "p". A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in "extensions", and optional hints in "hints".</description>
+ <instruction name="MONITOR" xed="MONITOR"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_mwait">
+ <CPUID>MONITOR</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned" varname="extensions" etype="UI32"/>
+ <parameter type="unsigned" varname="hints" etype="UI32"/>
+ <description>Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR.</description>
+ <instruction name="MWAIT" xed="MWAIT"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_loadbe_i16">
+ <CPUID>MOVBE</CPUID>
+ <category>Load</category>
+ <return type="short" varname="dst" etype="UI16"/>
+ <parameter type="void const *" varname="ptr" etype="UI16" memwidth="16"/>
+ <description>Load 16 bits from memory, perform a byte swap operation, and store the result in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*8
+ dst[i+7:i] := MEM[ptr+15-i:ptr+8-i]
+ENDFOR
+ </operation>
+ <instruction name="MOVBE" form="r16, m16" xed="MOVBE_GPRv_MEMv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_loadbe_i32">
+ <CPUID>MOVBE</CPUID>
+ <category>Load</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="void const *" varname="ptr" etype="UI32" memwidth="32"/>
+ <description>Load 32 bits from memory, perform a byte swap operation, and store the result in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*8
+ dst[i+7:i] := MEM[ptr+31-i:ptr+24-i]
+ENDFOR
+ </operation>
+ <instruction name="MOVBE" form="r32, m32" xed="MOVBE_GPRv_MEMv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_loadbe_i64">
+ <CPUID>MOVBE</CPUID>
+ <category>Load</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="void const *" varname="ptr" etype="UI64" memwidth="64"/>
+ <description>Load 64 bits from memory, perform a byte swap operation, and store the result in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := MEM[ptr+63-i:ptr+56-i]
+ENDFOR
+ </operation>
+ <instruction name="MOVBE" form="r64, m64" xed="MOVBE_GPRv_MEMv"/>
+ <header>immintrin.h</header>
+</intrinsic>
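+<!-- Illustrative C sketch (not part of the Intel data): the _loadbe_* intrinsics
+     read big-endian data from memory. Assumes immintrin.h and a MOVBE-capable
+     target.
+       unsigned char buf[4] = { 0x12, 0x34, 0x56, 0x78 };
+       int v = _loadbe_i32(buf);  /* v == 0x12345678 on little-endian x86 */
+-->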
+<intrinsic tech="Other" name="_storebe_i16">
+ <CPUID>MOVBE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="ptr" etype="UI16" memwidth="16"/>
+ <parameter type="short" varname="data" etype="UI16"/>
+ <description>Perform a byte swap operation of the 16 bits in "data", and store the results to memory.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*8
+ MEM[ptr+i+7:ptr+i] := data[15-i:8-i]
+ENDFOR
+ </operation>
+ <instruction name="MOVBE" form="m16, r16" xed="MOVBE_MEMv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_storebe_i32">
+ <CPUID>MOVBE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="ptr" etype="UI32" memwidth="32"/>
+ <parameter type="int" varname="data" etype="UI32"/>
+ <description>Perform a byte swap operation of the 32 bits in "data", and store the results to memory.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*8
+ MEM[ptr+i+7:ptr+i] := data[31-i:24-i]
+ENDFOR
+ </operation>
+ <instruction name="MOVBE" form="m32, r32" xed="MOVBE_MEMv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_storebe_i64">
+ <CPUID>MOVBE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void *" varname="ptr" etype="UI64" memwidth="64"/>
+ <parameter type="__int64" varname="data" etype="UI64"/>
+ <description>Perform a byte swap operation of the 64 bits in "data", and store the results to memory.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ MEM[ptr+i+7:ptr+i] := data[63-i:56-i]
+ENDFOR
+ </operation>
+ <instruction name="MOVBE" form="m64, r64" xed="MOVBE_MEMv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_movdir64b">
+ <CPUID>MOVDIR64B</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="dst" etype="M512" memwidth="512"/>
+ <parameter type="const void*" varname="src" etype="M512" memwidth="512"/>
+ <description>Move 64-byte (512-bit) value using direct store from source memory address "src" to destination memory address "dst".</description>
+ <operation>
+MEM[dst+511:dst] := MEM[src+511:src]
+ </operation>
+ <instruction name="MOVDIR64B" form="r64, m512" xed="MOVDIR64B_GPRa_MEM"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_directstoreu_u64">
+ <CPUID>MOVDIRI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="dst" etype="UI64" memwidth="64"/>
+ <parameter type="unsigned __int64" varname="val" etype="UI64"/>
+ <description>Store 64-bit integer from "val" into memory using direct store.</description>
+ <operation>
+MEM[dst+63:dst] := val[63:0]
+ </operation>
+ <instruction name="MOVDIRI" form="m64, r64" xed="MOVDIRI_MEMu64_GPR64u64"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_directstoreu_u32">
+ <CPUID>MOVDIRI</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="dst" etype="UI32" memwidth="32"/>
+ <parameter type="unsigned int" varname="val" etype="UI32"/>
+ <description>Store 32-bit integer from "val" into memory using direct store.</description>
+ <operation>
+MEM[dst+31:dst] := val[31:0]
+ </operation>
+ <instruction name="MOVDIRI" form="m32, r32" xed="MOVDIRI_MEMu32_GPR32u32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bnd_set_ptr_bounds">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void *"/>
+ <parameter type="const void *" varname="srcmem"/>
+ <parameter type="size_t" varname="size" etype="UI64"/>
+ <description>Make a pointer with the value of "srcmem" and bounds set to ["srcmem", "srcmem" + "size" - 1], and store the result in "dst".</description>
+ <operation>dst := srcmem
+dst.LB := srcmem
+dst.UB := srcmem + size - 1
+ </operation>
+ <instruction name="BNDMK" form="bnd, m32" xed="BNDMK_BND_AGEN"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_bnd_narrow_ptr_bounds">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void *"/>
+ <parameter type="const void *" varname="q"/>
+ <parameter type="const void *" varname="r"/>
+ <parameter type="size_t" varname="size" etype="UI64"/>
+ <description>Narrow the bounds for pointer "q" to the intersection of the bounds of "r" and the bounds ["q", "q" + "size" - 1], and store the result in "dst".</description>
+ <operation>dst := q
+IF r.LB &gt; (q + size - 1) OR r.UB &lt; q
+ dst.LB := 1
+ dst.UB := 0
+ELSE
+ dst.LB := MAX(r.LB, q)
+ dst.UB := MIN(r.UB, (q + size - 1))
+FI
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_bnd_copy_ptr_bounds">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void *"/>
+ <parameter type="const void *" varname="q"/>
+ <parameter type="const void *" varname="r"/>
+ <description>Make a pointer with the value of "q" and bounds set to the bounds of "r" (e.g. copy the bounds of "r" to pointer "q"), and store the result in "dst".</description>
+ <operation>dst := q
+dst.LB := r.LB
+dst.UB := r.UB
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_bnd_init_ptr_bounds">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void *"/>
+ <parameter type="const void *" varname="q"/>
+ <description>Make a pointer with the value of "q" and open bounds, which allow the pointer to access the entire virtual address space, and store the result in "dst".</description>
+ <operation>dst := q
+dst.LB := 0
+dst.UB := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bnd_store_ptr_bounds">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="const void **" varname="ptr_addr"/>
+ <parameter type="const void *" varname="ptr_val"/>
+ <description>Store the bounds of pointer "ptr_val" in memory at address "ptr_addr".</description>
+ <operation>MEM[ptr_addr].LB := ptr_val.LB
+MEM[ptr_addr].UB := ptr_val.UB
+ </operation>
+ <instruction name="BNDSTX" form="mib, bnd" xed="BNDSTX_MEMbnd64_BND"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bnd_chk_ptr_lbounds">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="const void *" varname="q"/>
+ <description>Checks if "q" is within its lower bound, and throws a #BR if not.</description>
+ <operation>IF q &lt; q.LB
+ #BR
+FI
+ </operation>
+ <instruction name="BNDCL" form="bnd, m64" xed="BNDCL_BND_AGEN"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bnd_chk_ptr_ubounds">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="const void *" varname="q"/>
+ <description>Checks if "q" is within its upper bound, and throws a #BR if not.</description>
+ <operation>IF q &gt; q.UB
+ #BR
+FI
+ </operation>
+ <instruction name="BNDCU" form="bnd, m64" xed="BNDCU_BND_AGEN"/>
+ <instruction name="BNDCN" form="bnd, m64" xed="BNDCN_BND_AGEN"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bnd_chk_ptr_bounds">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="const void *" varname="q"/>
+ <parameter type="size_t" varname="size" etype="UI64"/>
+ <description>Checks if ["q", "q" + "size" - 1] is within the lower and upper bounds of "q" and throws a #BR if not.</description>
+ <operation>IF (q + size - 1) &lt; q.LB OR (q + size - 1) &gt; q.UB
+ #BR
+FI
+ </operation>
+ <instruction name="BNDCU" form="bnd, m32" xed="BNDCU_BND_AGEN"/>
+ <instruction name="BNDCN" form="bnd, m32" xed="BNDCN_BND_AGEN"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_bnd_get_ptr_lbound">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="const void *"/>
+ <parameter type="const void *" varname="q"/>
+ <description>Return the lower bound of "q".</description>
+ <operation>dst := q.LB
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_bnd_get_ptr_ubound">
+ <CPUID>MPX</CPUID>
+ <category>Miscellaneous</category>
+ <return type="const void *"/>
+ <parameter type="const void *" varname="q"/>
+ <description>Return the upper bound of "q".</description>
+ <operation>dst := q.UB
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bit_scan_forward">
+ <type>Integer</type>
+ <category>Bit Manipulation</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Set "dst" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined.</description>
+ <operation>
+tmp := 0
+IF a == 0
+ // dst is undefined
+ELSE
+ DO WHILE ((tmp &lt; 32) AND a[tmp] == 0)
+ tmp := tmp + 1
+ OD
+FI
+dst := tmp
+ </operation>
+ <instruction name="BSF" form="r32, r32" xed="BSF_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bit_scan_reverse">
+ <type>Integer</type>
+ <category>Bit Manipulation</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Set "dst" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined.</description>
+ <operation>
+tmp := 31
+IF a == 0
+ // dst is undefined
+ELSE
+ DO WHILE ((tmp &gt; 0) AND a[tmp] == 0)
+ tmp := tmp - 1
+ OD
+FI
+dst := tmp
+ </operation>
+ <instruction name="BSR" form="r32, r32" xed="BSR_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
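+<!-- Illustrative C sketch (not part of the Intel data): for a == 0 both results
+     are undefined, so guard the call. Assumes immintrin.h.
+       int a  = 0x48;                  /* bits 3 and 6 set */
+       int lo = _bit_scan_forward(a);  /* lo == 3 */
+       int hi = _bit_scan_reverse(a);  /* hi == 6 */
+-->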
+<intrinsic tech="Other" name="_BitScanForward">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned __int32*" varname="index" etype="UI32" memwidth="32"/>
+ <parameter type="unsigned __int32" varname="a" etype="UI32"/>
+ <description>Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1.</description>
+ <operation>
+tmp := 0
+IF a == 0
+ // MEM[index+31:index] is undefined
+ dst := 0
+ELSE
+ DO WHILE ((tmp &lt; 32) AND a[tmp] == 0)
+ tmp := tmp + 1
+ OD
+ MEM[index+31:index] := tmp
+ dst := 1
+FI
+ </operation>
+ <instruction name="BSF" form="r32, r32" xed="BSF_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_BitScanReverse">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned __int32*" varname="index" etype="UI32" memwidth="32"/>
+ <parameter type="unsigned __int32" varname="a" etype="UI32"/>
+ <description>Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1.</description>
+ <operation>
+tmp := 31
+IF a == 0
+ // MEM[index+31:index] is undefined
+ dst := 0
+ELSE
+ DO WHILE ((tmp &gt; 0) AND a[tmp] == 0)
+ tmp := tmp - 1
+ OD
+ MEM[index+31:index] := tmp
+ dst := 1
+FI
+ </operation>
+ <instruction name="BSR" form="r32, r32" xed="BSR_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_BitScanForward64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned __int32*" varname="index" etype="UI32" memwidth="32"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1.</description>
+ <operation>
+tmp := 0
+IF a == 0
+ // MEM[index+31:index] is undefined
+ dst := 0
+ELSE
+ DO WHILE ((tmp &lt; 64) AND a[tmp] == 0)
+ tmp := tmp + 1
+ OD
+ MEM[index+31:index] := tmp
+ dst := 1
+FI
+ </operation>
+ <instruction name="BSF" form="r64, r64" xed="BSF_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_BitScanReverse64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned __int32*" varname="index" etype="UI32" memwidth="32"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1.</description>
+ <operation>
+tmp := 63
+IF a == 0
+ // MEM[index+31:index] is undefined
+ dst := 0
+ELSE
+ DO WHILE ((tmp &gt; 0) AND a[tmp] == 0)
+ tmp := tmp - 1
+ OD
+ MEM[index+31:index] := tmp
+ dst := (tmp == 0) ? 0 : 1
+FI
+ </operation>
+ <instruction name="BSR" form="r64, r64" xed="BSR_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bittest">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__int32*" varname="a" etype="UI32" memwidth="32"/>
+ <parameter type="__int32" varname="b" etype="IMM" immwidth="5"/>
+ <description>Return the bit at index "b" of 32-bit integer "a".</description>
+ <operation>
+addr := a + ZeroExtend64(b)
+dst[0] := MEM[addr]
+ </operation>
+ <instruction name="BT" form="m32, r32" xed="BT_MEMv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bittestandcomplement">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__int32*" varname="a" etype="UI32" memwidth="32"/>
+ <parameter type="__int32" varname="b" etype="IMM" immwidth="5"/>
+ <description>Return the bit at index "b" of 32-bit integer "a", and set that bit to its complement.</description>
+ <operation>
+addr := a + ZeroExtend64(b)
+dst[0] := MEM[addr]
+MEM[addr] := ~dst[0]
+ </operation>
+ <instruction name="BTC" form="m32, r32" xed="BTC_MEMv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bittestandreset">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__int32*" varname="a" etype="UI32" memwidth="32"/>
+ <parameter type="__int32" varname="b" etype="IMM" immwidth="5"/>
+ <description>Return the bit at index "b" of 32-bit integer "a", and set that bit to zero.</description>
+ <operation>
+addr := a + ZeroExtend64(b)
+dst[0] := MEM[addr]
+MEM[addr] := 0
+ </operation>
+ <instruction name="BTR" form="m32, r32" xed="BTR_MEMv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bittestandset">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__int32*" varname="a" etype="UI32" memwidth="32"/>
+ <parameter type="__int32" varname="b" etype="IMM" immwidth="5"/>
+ <description>Return the bit at index "b" of 32-bit integer "a", and set that bit to one.</description>
+ <operation>
+addr := a + ZeroExtend64(b)
+dst[0] := MEM[addr]
+MEM[addr] := 1
+ </operation>
+ <instruction name="BTS" form="m32, r32" xed="BTS_MEMv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bittest64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__int64*" varname="a" etype="UI64" memwidth="32"/>
+ <parameter type="__int64" varname="b" etype="IMM" immwidth="6"/>
+ <description>Return the bit at index "b" of 64-bit integer "a".</description>
+ <operation>
+addr := a + b
+dst[0] := MEM[addr]
+ </operation>
+ <instruction name="BT" form="r64, r64" xed="BT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bittestandcomplement64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__int64*" varname="a" etype="UI64" memwidth="32"/>
+ <parameter type="__int64" varname="b" etype="IMM" immwidth="6"/>
+ <description>Return the bit at index "b" of 64-bit integer "a", and set that bit to its complement.</description>
+ <operation>
+addr := a + b
+dst[0] := MEM[addr]
+MEM[addr] := ~dst[0]
+ </operation>
+ <instruction name="BTC" form="r64, r64" xed="BTC_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bittestandreset64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__int64*" varname="a" etype="UI64" memwidth="32"/>
+ <parameter type="__int64" varname="b" etype="IMM" immwidth="6"/>
+ <description>Return the bit at index "b" of 64-bit integer "a", and set that bit to zero.</description>
+ <operation>
+addr := a + b
+dst[0] := MEM[addr]
+MEM[addr] := 0
+ </operation>
+ <instruction name="BTR" form="r64, r64" xed="BTR_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bittestandset64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Bit Manipulation</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="__int64*" varname="a" etype="UI64" memwidth="32"/>
+ <parameter type="__int64" varname="b" etype="IMM" immwidth="6"/>
+ <description>Return the bit at index "b" of 64-bit integer "a", and set that bit to one.</description>
+ <operation>
+addr := a + b
+dst[0] := MEM[addr]
+MEM[addr] := 1
+ </operation>
+ <instruction name="BTS" form="r64, r64" xed="BTS_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
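+<!-- A sketch of the bit-test family above; the helper below is illustrative and
+     simply exercises each read-modify-write variant on the same bit.
+
+#include <immintrin.h>
+
+void bit_flags_demo(__int64 *flags) {
+    unsigned char was_set;
+    was_set = _bittestandset64(flags, 5);        // read bit 5, then set it
+    was_set = _bittestandreset64(flags, 5);      // read bit 5, then clear it
+    was_set = _bittestandcomplement64(flags, 5); // read bit 5, then flip it
+    was_set = _bittest64(flags, 5);              // read-only test of bit 5
+    (void)was_set;
+}
+-->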
+<intrinsic tech="Other" name="_bswap">
+ <type>Integer</type>
+ <category>Bit Manipulation</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Reverse the byte order of 32-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values.</description>
+ <operation>
+dst[7:0] := a[31:24]
+dst[15:8] := a[23:16]
+dst[23:16] := a[15:8]
+dst[31:24] := a[7:0]
+ </operation>
+ <instruction name="BSWAP" form="r32" xed="BSWAP_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_bswap64">
+ <type>Integer</type>
+ <category>Bit Manipulation</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Reverse the byte order of 64-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values.</description>
+ <operation>
+dst[7:0] := a[63:56]
+dst[15:8] := a[55:48]
+dst[23:16] := a[47:40]
+dst[31:24] := a[39:32]
+dst[39:32] := a[31:24]
+dst[47:40] := a[23:16]
+dst[55:48] := a[15:8]
+dst[63:56] := a[7:0]
+ </operation>
+ <instruction name="BSWAP" form="r64" xed="BSWAP_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
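+<!-- A small endianness-conversion sketch for the two byte-swap intrinsics above;
+     reading big-endian wire fields on a little-endian host is the typical use.
+
+#include <immintrin.h>
+
+unsigned int be32_to_host(int be)          { return (unsigned int)_bswap(be); }
+unsigned __int64 be64_to_host(__int64 be)  { return (unsigned __int64)_bswap64(be); }
+-->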
+<intrinsic tech="Other" name="_castf32_u32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <category>Cast</category>
+ <return type="unsigned __int32" varname="dst" etype="UI32"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+ <description>Cast from type float to type unsigned __int32 without conversion.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_castf64_u64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <category>Cast</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="double" varname="a" etype="FP64"/>
+ <description>Cast from type double to type unsigned __int64 without conversion.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_castu32_f32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <category>Cast</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="unsigned __int32" varname="a" etype="UI32"/>
+ <description>Cast from type unsigned __int32 to type float without conversion.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_castu64_f64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <category>Cast</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Cast from type unsigned __int64 to type double without conversion.
+ This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>immintrin.h</header>
+</intrinsic>
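+<!-- A sketch using the zero-latency bit-cast intrinsics above to inspect the
+     IEEE-754 encoding of a float without performing a numeric conversion.
+
+#include <immintrin.h>
+
+unsigned __int32 float_exponent_bits(float f) {
+    unsigned __int32 bits = _castf32_u32(f);   // reinterpret, do not convert
+    return (bits >> 23) & 0xFF;                // the 8 exponent bits
+}
+-->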
+<intrinsic tech="Other" name="_lrotl">
+ <type>Integer</type>
+ <category>Shift</category>
+ <return type="unsigned long" varname="dst" etype="UI32"/>
+ <parameter type="unsigned long" varname="a" etype="UI32"/>
+ <parameter type="int" varname="shift" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of unsigned long integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
+ <operation>// size := 32 or 64
+dst := a
+count := shift AND (size - 1)
+DO WHILE (count &gt; 0)
+ tmp[0] := dst[size - 1]
+ dst := (dst &lt;&lt; 1) OR tmp[0]
+ count := count - 1
+OD
+ </operation>
+ <instruction name="ROL" form="r64, imm8" xed="ROL_GPRv_IMMb"/>
+ <instruction name="ROL" form="r32, imm8" xed="ROL_GPRv_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_lrotr">
+ <type>Integer</type>
+ <category>Shift</category>
+ <return type="unsigned long" varname="dst" etype="UI32"/>
+ <parameter type="unsigned long" varname="a" etype="UI32"/>
+ <parameter type="int" varname="shift" etype="IMM" immwidth="8"/>
+ <description>Shift the bits of unsigned long integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
+ <operation>// size := 32 or 64
+dst := a
+count := shift AND (size - 1)
+DO WHILE (count &gt; 0)
+ tmp[size - 1] := dst[0]
+ dst := (dst &gt;&gt; 1) OR tmp[size - 1]
+ count := count - 1
+OD
+ </operation>
+ <instruction name="ROR" form="r64, imm8" xed="ROR_GPRv_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_allow_cpu_features">
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned __int64" varname="a" etype="IMM" immwidth="8"/>
+ <description>Treat the processor-specific feature(s) specified in "a" as available. Multiple features may be OR'd together. See the valid feature flags below:</description>
+ <operation>
+_FEATURE_GENERIC_IA32
+_FEATURE_FPU
+_FEATURE_CMOV
+_FEATURE_MMX
+_FEATURE_FXSAVE
+_FEATURE_SSE
+_FEATURE_SSE2
+_FEATURE_SSE3
+_FEATURE_SSSE3
+_FEATURE_SSE4_1
+_FEATURE_SSE4_2
+_FEATURE_MOVBE
+_FEATURE_POPCNT
+_FEATURE_PCLMULQDQ
+_FEATURE_AES
+_FEATURE_F16C
+_FEATURE_AVX
+_FEATURE_RDRND
+_FEATURE_FMA
+_FEATURE_BMI
+_FEATURE_LZCNT
+_FEATURE_HLE
+_FEATURE_RTM
+_FEATURE_AVX2
+_FEATURE_KNCNI
+_FEATURE_AVX512F
+_FEATURE_ADX
+_FEATURE_RDSEED
+_FEATURE_AVX512ER
+_FEATURE_AVX512PF
+_FEATURE_AVX512CD
+_FEATURE_SHA
+_FEATURE_MPX
+_FEATURE_AVX512BW
+_FEATURE_AVX512VL
+_FEATURE_AVX512VBMI
+_FEATURE_AVX512_4FMAPS
+_FEATURE_AVX512_4VNNIW
+_FEATURE_AVX512_VPOPCNTDQ
+_FEATURE_AVX512_BITALG
+_FEATURE_AVX512_VBMI2
+_FEATURE_GFNI
+_FEATURE_VAES
+_FEATURE_VPCLMULQDQ
+_FEATURE_AVX512_VNNI
+_FEATURE_CLWB
+_FEATURE_RDPID
+_FEATURE_IBT
+_FEATURE_SHSTK
+_FEATURE_SGX
+_FEATURE_WBNOINVD
+_FEATURE_PCONFIG
+_FEATURE_AVX512_4VNNIB
+_FEATURE_AVX512_4FMAPH
+_FEATURE_AVX512_BITALG2
+_FEATURE_AVX512_VP2INTERSECT
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_may_i_use_cpu_feature">
+ <category>General Support</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="unsigned __int64" varname="a" etype="IMM" immwidth="8"/>
+	<description>Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return 1 if all of the specified features are available, or 0 otherwise. Multiple features may be OR'd together. This intrinsic does not check the processor vendor. See the valid feature flags below:</description>
+ <operation>
+_FEATURE_GENERIC_IA32
+_FEATURE_FPU
+_FEATURE_CMOV
+_FEATURE_MMX
+_FEATURE_FXSAVE
+_FEATURE_SSE
+_FEATURE_SSE2
+_FEATURE_SSE3
+_FEATURE_SSSE3
+_FEATURE_SSE4_1
+_FEATURE_SSE4_2
+_FEATURE_MOVBE
+_FEATURE_POPCNT
+_FEATURE_PCLMULQDQ
+_FEATURE_AES
+_FEATURE_F16C
+_FEATURE_AVX
+_FEATURE_RDRND
+_FEATURE_FMA
+_FEATURE_BMI
+_FEATURE_LZCNT
+_FEATURE_HLE
+_FEATURE_RTM
+_FEATURE_AVX2
+_FEATURE_KNCNI
+_FEATURE_AVX512F
+_FEATURE_ADX
+_FEATURE_RDSEED
+_FEATURE_AVX512ER
+_FEATURE_AVX512PF
+_FEATURE_AVX512CD
+_FEATURE_SHA
+_FEATURE_MPX
+_FEATURE_AVX512BW
+_FEATURE_AVX512VL
+_FEATURE_AVX512VBMI
+_FEATURE_AVX512_4FMAPS
+_FEATURE_AVX512_4VNNIW
+_FEATURE_AVX512_VPOPCNTDQ
+_FEATURE_AVX512_BITALG
+_FEATURE_AVX512_VBMI2
+_FEATURE_GFNI
+_FEATURE_VAES
+_FEATURE_VPCLMULQDQ
+_FEATURE_AVX512_VNNI
+_FEATURE_CLWB
+_FEATURE_RDPID
+_FEATURE_IBT
+_FEATURE_SHSTK
+_FEATURE_SGX
+_FEATURE_WBNOINVD
+_FEATURE_PCONFIG
+_FEATURE_AVX512_4VNNIB
+_FEATURE_AVX512_4FMAPH
+_FEATURE_AVX512_BITALG2
+_FEATURE_AVX512_VP2INTERSECT
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
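+<!-- _allow_cpu_features and _may_i_use_cpu_feature are Intel-compiler-specific;
+     a hedged dispatch sketch follows, assuming the _FEATURE_AVX2 flag listed
+     above is defined by the compiler's immintrin.h. kernel_avx2 and
+     kernel_generic are placeholder functions.
+
+#include <immintrin.h>
+
+extern void kernel_avx2(void);
+extern void kernel_generic(void);
+
+void run_best_kernel(void) {
+    if (_may_i_use_cpu_feature(_FEATURE_AVX2))
+        kernel_avx2();      // processor reports AVX2 support
+    else
+        kernel_generic();   // portable fallback
+}
+-->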
+<intrinsic tech="Other" name="_rdpmc">
+ <category>General Support</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+	<description>Read the Performance Monitor Counter (PMC) specified by "a", and store up to 64 bits in "dst". The width of performance counters is implementation-specific.</description>
+ <operation>dst[63:0] := ReadPMC(a)
+ </operation>
+ <instruction name="RDPMC" xed="RDPMC"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rotl">
+ <type>Integer</type>
+ <category>Shift</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="int" varname="shift" etype="IMM" immwidth="5"/>
+ <description>Shift the bits of unsigned 32-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
+ <operation>
+dst := a
+count := shift AND 31
+DO WHILE (count &gt; 0)
+ tmp[0] := dst[31]
+ dst := (dst &lt;&lt; 1) OR tmp[0]
+ count := count - 1
+OD
+ </operation>
+ <instruction name="ROL" form="r32, imm8" xed="ROL_GPRv_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rotr">
+ <type>Integer</type>
+ <category>Shift</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="int" varname="shift" etype="IMM" immwidth="5"/>
+ <description>Shift the bits of unsigned 32-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
+ <operation>
+dst := a
+count := shift AND 31
+DO WHILE (count &gt; 0)
+ tmp[31] := dst[0]
+ dst := (dst &gt;&gt; 1) OR tmp
+ count := count - 1
+OD
+ </operation>
+ <instruction name="ROR" form="r32, imm8" xed="ROR_GPRv_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rotwl">
+ <type>Integer</type>
+ <category>Shift</category>
+ <return type="unsigned short" varname="dst" etype="UI16"/>
+ <parameter type="unsigned short" varname="a" etype="UI16"/>
+ <parameter type="int" varname="shift" etype="IMM" immwidth="4"/>
+ <description>Shift the bits of unsigned 16-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
+ <operation>
+dst := a
+count := shift AND 15
+DO WHILE (count &gt; 0)
+ tmp[0] := dst[15]
+ dst := (dst &lt;&lt; 1) OR tmp[0]
+ count := count - 1
+OD
+ </operation>
+ <instruction name="ROL" form="r16, imm8" xed="ROL_GPRv_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rotwr">
+ <type>Integer</type>
+ <category>Shift</category>
+ <return type="unsigned short" varname="dst" etype="UI16"/>
+ <parameter type="unsigned short" varname="a" etype="UI16"/>
+ <parameter type="int" varname="shift" etype="IMM" immwidth="4"/>
+ <description>Shift the bits of unsigned 16-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
+ <operation>
+dst := a
+count := shift AND 15
+DO WHILE (count &gt; 0)
+ tmp[15] := dst[0]
+ dst := (dst &gt;&gt; 1) OR tmp
+ count := count - 1
+OD
+ </operation>
+ <instruction name="ROR" form="r16, imm8" xed="ROR_GPRv_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rotl64">
+ <type>Integer</type>
+ <category>Shift</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="shift" etype="IMM" immwidth="6"/>
+ <description>Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
+ <operation>
+dst := a
+count := shift AND 63
+DO WHILE (count &gt; 0)
+ tmp[0] := dst[63]
+ dst := (dst &lt;&lt; 1) OR tmp[0]
+ count := count - 1
+OD
+ </operation>
+ <instruction name="ROL" form="r64, imm8" xed="ROL_GPRv_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rotr64">
+ <type>Integer</type>
+ <category>Shift</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="int" varname="shift" etype="IMM" immwidth="6"/>
+ <description>Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
+ <operation>
+dst := a
+count := shift AND 63
+DO WHILE (count &gt; 0)
+ tmp[63] := dst[0]
+ dst := (dst &gt;&gt; 1) OR tmp[63]
+ count := count - 1
+OD
+ </operation>
+ <instruction name="ROR" form="r64, imm8" xed="ROR_GPRv_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
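+<!-- A brief sketch of the 64-bit rotate intrinsics above; rotates of this kind
+     are common in hash mixing, and the constants here are illustrative.
+
+#include <immintrin.h>
+
+unsigned __int64 mix64(unsigned __int64 h) {
+    h = _rotl64(h, 13) ^ h;   // rotate left by 13, then fold
+    h = _rotr64(h, 7) + h;    // rotate right by 7, then accumulate
+    return h;
+}
+-->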
+<intrinsic tech="Other" name="_addcarry_u32">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Arithmetic</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned char" varname="c_in" etype="UI8"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="b" etype="UI32"/>
+ <parameter type="unsigned int *" varname="out" etype="UI32" memwidth="32"/>
+	<description>Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), storing the unsigned 32-bit result in "out" and the carry-out in "dst" (carry or overflow flag).</description>
+ <operation>
+tmp[32:0] := a[31:0] + b[31:0] + (c_in &gt; 0 ? 1 : 0)
+MEM[out+31:out] := tmp[31:0]
+dst[0] := tmp[32]
+dst[7:1] := 0
+ </operation>
+ <instruction name="ADC" form="r32, r32" xed="ADC_GPRv_GPRv_11"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_addcarry_u64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Arithmetic</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned char" varname="c_in" etype="UI8"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+ <parameter type="unsigned __int64 *" varname="out" etype="UI64" memwidth="64"/>
+	<description>Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), storing the unsigned 64-bit result in "out" and the carry-out in "dst" (carry or overflow flag).</description>
+ <operation>
+tmp[64:0] := a[63:0] + b[63:0] + (c_in &gt; 0 ? 1 : 0)
+MEM[out+63:out] := tmp[63:0]
+dst[0] := tmp[64]
+dst[7:1] := 0
+ </operation>
+ <instruction name="ADC" form="r64, r64" xed="ADC_GPRv_GPRv_11"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_subborrow_u32">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Arithmetic</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned char" varname="c_in" etype="UI8"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned int" varname="b" etype="UI32"/>
+ <parameter type="unsigned int *" varname="out" etype="UI32" memwidth="32"/>
+ <description>Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 32-bit integer "b", and subtract the result from unsigned 32-bit integer "a". Store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+ <operation>
+tmp[32:0] := a[31:0] - (b[31:0] + (c_in &gt; 0 ? 1 : 0))
+MEM[out+31:out] := tmp[31:0]
+dst[0] := tmp[32]
+dst[7:1] := 0
+ </operation>
+ <instruction name="SBB" form="r32, r32" xed="SBB_GPRv_GPRv_19"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_subborrow_u64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <category>Arithmetic</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned char" varname="c_in" etype="UI8"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="b" etype="UI64"/>
+ <parameter type="unsigned __int64 *" varname="out" etype="UI64" memwidth="64"/>
+ <description>Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 64-bit integer "b", and subtract the result from unsigned 64-bit integer "a". Store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+ <operation>
+tmp[64:0] := a[63:0] - (b[63:0] + (c_in &gt; 0 ? 1 : 0))
+MEM[out+63:out] := tmp[63:0]
+dst[0] := tmp[64]
+dst[7:1] := 0
+ </operation>
+ <instruction name="SBB" form="r64, r64" xed="SBB_GPRv_GPRv_19"/>
+ <header>immintrin.h</header>
+</intrinsic>
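+<!-- A minimal multi-precision sketch chaining the carry intrinsics above to add
+     two 128-bit values held as pairs of 64-bit limbs; the returned flag is the
+     final carry-out.
+
+#include <immintrin.h>
+
+unsigned char add128(unsigned __int64 a_lo, unsigned __int64 a_hi,
+                     unsigned __int64 b_lo, unsigned __int64 b_hi,
+                     unsigned __int64 *r_lo, unsigned __int64 *r_hi) {
+    unsigned char c = _addcarry_u64(0, a_lo, b_lo, r_lo); // low limb, carry out
+    return _addcarry_u64(c, a_hi, b_hi, r_hi);            // high limb + carry in
+}
+-->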
+<intrinsic tech="Other" name="_ptwrite32">
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Insert the 32-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled.</description>
+ <instruction name="PTWRITE" form="r32" xed="PTWRITE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_ptwrite64">
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Insert the 64-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled.</description>
+ <instruction name="PTWRITE" form="r64" xed="PTWRITE_GPRy"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_enclu_u32">
+ <category>Miscellaneous</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="const int" varname="a" etype="UI32"/>
+ <parameter type="size_t*" varname="__data" etype="UI64"/>
+ <description>Invoke the Intel SGX enclave user (non-privilege) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx.</description>
+ <instruction name="ENCLU" xed="ENCLU"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_encls_u32">
+ <category>Miscellaneous</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="const int" varname="a" etype="UI32"/>
+ <parameter type="size_t*" varname="__data" etype="UI64"/>
+ <description>Invoke the Intel SGX enclave system (privileged) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx.</description>
+ <instruction name="ENCLS" xed="ENCLS"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_enclv_u32">
+ <category>Miscellaneous</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="const int" varname="a" etype="UI32"/>
+ <parameter type="size_t*" varname="__data" etype="UI64"/>
+ <description>Invoke the Intel SGX enclave virtualized (VMM) leaf function specified by "a", and return the error code. The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx.</description>
+ <instruction name="ENCLV" xed="ENCLV"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_wbinvd">
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Write back and flush internal caches.
+ Initiate writing-back and flushing of external
+ caches.</description>
+ <instruction name="WBINVD" xed="WBINVD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_cvtsh_ss">
+ <type>Floating Point</type>
+ <category>Convert</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="unsigned short" varname="a" etype="UI16"/>
+ <description>Convert the half-precision (16-bit) floating-point value "a" to a single-precision (32-bit) floating-point value, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP16_To_FP32(a[15:0])
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" sequence="TRUE" name="_cvtss_sh">
+ <type>Floating Point</type>
+ <category>Convert</category>
+ <return type="unsigned short" varname="dst" etype="UI16"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" hint="TRUE" immtype="_MM_FROUND"/>
+ <description>Convert the single-precision (32-bit) floating-point value "a" to a half-precision (16-bit) floating-point value, and store the result in "dst".
+ [round_note]</description>
+ <operation>
+dst[15:0] := Convert_FP32_To_FP16(a[31:0])
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
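+<!-- A round-trip sketch for the scalar FP16 helpers above; the rounding argument
+     is one of the _MM_FROUND values referred to by [round_note].
+
+#include <immintrin.h>
+
+float half_round_trip(float x) {
+    unsigned short h = _cvtss_sh(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    return _cvtsh_ss(h);   // widen back; precision loss is expected
+}
+-->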
+<intrinsic tech="Other" vexEq="TRUE" name="_mm_clmulepi64_si128">
+ <type>Integer</type>
+ <CPUID>PCLMULQDQ</CPUID>
+ <category>Application-Targeted</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst".</description>
+ <operation>
+IF (imm8[0] == 0)
+ TEMP1 := a[63:0]
+ELSE
+ TEMP1 := a[127:64]
+FI
+IF (imm8[4] == 0)
+ TEMP2 := b[63:0]
+ELSE
+ TEMP2 := b[127:64]
+FI
+FOR i := 0 to 63
+	TEMP[i] := (TEMP1[0] AND TEMP2[i])
+ FOR j := 1 to i
+ TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j])
+ ENDFOR
+ dst[i] := TEMP[i]
+ENDFOR
+FOR i := 64 to 127
+ TEMP[i] := 0
+ FOR j := (i - 63) to 63
+ TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j])
+ ENDFOR
+ dst[i] := TEMP[i]
+ENDFOR
+dst[127] := 0
+ </operation>
+ <instruction name="PCLMULQDQ" form="xmm, xmm, imm8" xed="PCLMULQDQ_XMMdq_XMMdq_IMMb"/>
+ <header>wmmintrin.h</header>
+</intrinsic>
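+<!-- A GF(2)[x] multiplication sketch for the carry-less multiply above; imm8
+     selects which 64-bit half of each operand participates, and 0x00 picks the
+     low halves of both.
+
+#include <immintrin.h>
+
+__m128i clmul_low_halves(unsigned __int64 x, unsigned __int64 y) {
+    __m128i a = _mm_set_epi64x(0, (long long)x);
+    __m128i b = _mm_set_epi64x(0, (long long)y);
+    return _mm_clmulepi64_si128(a, b, 0x00);   // 127-bit carry-less product
+}
+-->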
+<intrinsic tech="Other" name="_pconfig_u32">
+ <CPUID>PCONFIG</CPUID>
+ <category>Miscellaneous</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="const int" varname="a" etype="UI32"/>
+ <parameter type="size_t*" varname="__data" etype="UI64"/>
+ <description>Invoke the PCONFIG leaf function specified by "a". The "__data" array contains 3 32-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to rbx, rcx, and rdx. May return the value in eax, depending on the semantics of the specified leaf function.</description>
+ <instruction name="PCONFIG" xed="PCONFIG"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_popcnt_u32">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>POPCNT</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst".</description>
+ <operation>
+dst := 0
+FOR i := 0 to 31
+ IF a[i]
+ dst := dst + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="POPCNT" form="r32, r32" xed="POPCNT_GPRv_GPRv"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_popcnt_u64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>POPCNT</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="a" etype="UI64"/>
+ <description>Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst".</description>
+ <operation>
+dst := 0
+FOR i := 0 to 63
+ IF a[i]
+ dst := dst + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="POPCNT" form="r64, r64" xed="POPCNT_GPRv_GPRv"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_popcnt32">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>POPCNT</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Count the number of bits set to 1 in 32-bit integer "a", and return that count in "dst".</description>
+ <operation>
+dst := 0
+FOR i := 0 to 31
+ IF a[i]
+ dst := dst + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="POPCNT" form="r32, r32" xed="POPCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_popcnt64">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>POPCNT</CPUID>
+ <category>Bit Manipulation</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Count the number of bits set to 1 in 64-bit integer "a", and return that count in "dst".</description>
+ <operation>
+dst := 0
+FOR i := 0 to 63
+ IF a[i]
+ dst := dst + 1
+ FI
+ENDFOR
+ </operation>
+ <instruction name="POPCNT" form="r64, r64" xed="POPCNT_GPRv_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
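+<!-- A short sketch for the POPCNT intrinsics above, counting set bits across a
+     buffer of 64-bit words; requires a POPCNT-capable target.
+
+#include <immintrin.h>
+
+__int64 count_bits(const unsigned __int64 *words, int n) {
+    __int64 total = 0;
+    for (int i = 0; i < n; i++)
+        total += _mm_popcnt_u64(words[i]);
+    return total;
+}
+-->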
+<intrinsic tech="Other" name="_mm_prefetch">
+ <CPUID>PREFETCHWT1</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="char const*" varname="p" etype="UI8"/>
+ <parameter type="int" varname="i" etype="IMM" immwidth="2"/>
+	<description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
+ <instruction name="PREFETCHWT1" form="m8" xed="PREFETCHWT1_MEMu8"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdpid_u32">
+ <CPUID>RDPID</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="void"/>
+ <description>Copy the IA32_TSC_AUX MSR (signature value) into "dst".</description>
+ <operation>dst[31:0] := IA32_TSC_AUX[31:0]
+ </operation>
+ <instruction name="RDPID" form="r32" xed="RDPID_GPR32u32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdrand16_step">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>RDRAND</CPUID>
+ <category>Random</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned short*" varname="val" etype="UI16" memwidth="16"/>
+ <description>Read a hardware generated 16-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+ <operation>IF HW_RND_GEN.ready == 1
+ val[15:0] := HW_RND_GEN.data
+ dst := 1
+ELSE
+ val[15:0] := 0
+ dst := 0
+FI
+ </operation>
+ <instruction name="RDRAND" form="r16" xed="RDRAND_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdrand32_step">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>RDRAND</CPUID>
+ <category>Random</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int*" varname="val" etype="UI32" memwidth="32"/>
+ <description>Read a hardware generated 32-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+ <operation>IF HW_RND_GEN.ready == 1
+ val[31:0] := HW_RND_GEN.data
+ dst := 1
+ELSE
+ val[31:0] := 0
+ dst := 0
+FI
+ </operation>
+ <instruction name="RDRAND" form="r32" xed="RDRAND_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdrand64_step">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>RDRAND</CPUID>
+ <category>Random</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned __int64*" varname="val" etype="UI64" memwidth="64"/>
+ <description>Read a hardware generated 64-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+ <operation>IF HW_RND_GEN.ready == 1
+ val[63:0] := HW_RND_GEN.data
+ dst := 1
+ELSE
+ val[63:0] := 0
+ dst := 0
+FI
+ </operation>
+ <instruction name="RDRAND" form="r64" xed="RDRAND_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
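+<!-- RDRAND can transiently report "not ready", so the usual pattern is a bounded
+     retry loop; the limit of 10 below follows Intel's conventional guidance and
+     is not mandated by the data above.
+
+#include <immintrin.h>
+
+int rand64_retry(unsigned __int64 *out) {
+    for (int i = 0; i < 10; i++) {
+        if (_rdrand64_step(out))
+            return 1;   // hardware produced a value in *out
+    }
+    return 0;           // generator not ready; caller should fall back
+}
+-->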
+<intrinsic tech="Other" name="_rdseed16_step">
+ <type>Flag</type>
+ <CPUID>RDSEED</CPUID>
+ <category>Random</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned short *" varname="val" etype="UI16"/>
+	<description>Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store it in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+ <operation>IF HW_NRND_GEN.ready == 1
+ val[15:0] := HW_NRND_GEN.data
+ dst := 1
+ELSE
+ val[15:0] := 0
+ dst := 0
+FI
+ </operation>
+ <instruction name="RDSEED" form="r16" xed="RDSEED_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdseed32_step">
+ <type>Flag</type>
+ <CPUID>RDSEED</CPUID>
+ <category>Random</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int *" varname="val" etype="UI32"/>
+	<description>Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store it in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+ <operation>IF HW_NRND_GEN.ready == 1
+ val[31:0] := HW_NRND_GEN.data
+ dst := 1
+ELSE
+ val[31:0] := 0
+ dst := 0
+FI
+ </operation>
+ <instruction name="RDSEED" form="r32" xed="RDSEED_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdseed64_step">
+ <type>Flag</type>
+ <CPUID>RDSEED</CPUID>
+ <category>Random</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned __int64 *" varname="val" etype="UI64"/>
+	<description>Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store it in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+ <operation>IF HW_NRND_GEN.ready == 1
+ val[63:0] := HW_NRND_GEN.data
+ dst := 1
+ELSE
+ val[63:0] := 0
+ dst := 0
+FI
+ </operation>
+ <instruction name="RDSEED" form="r64" xed="RDSEED_GPRv"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="__rdtscp">
+ <CPUID>RDTSCP</CPUID>
+ <category>General Support</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned int *" varname="mem_addr" etype="UI32" memwidth="32"/>
+ <description>Copy the current 64-bit value of the processor's time-stamp counter into "dst", and store the IA32_TSC_AUX MSR (signature value) into memory at "mem_addr".</description>
+ <operation>dst[63:0] := TimeStampCounter
+MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0]
+ </operation>
+ <instruction name="RDTSCP" xed="RDTSCP"/>
+ <header>immintrin.h</header>
+</intrinsic>
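+<!-- A timing sketch using __rdtscp above; the aux (IA32_TSC_AUX) outputs let the
+     caller detect a migration to a different core between the two reads.
+
+#include <immintrin.h>
+
+unsigned __int64 cycles_of(void (*fn)(void)) {
+    unsigned int aux0, aux1;
+    unsigned __int64 t0 = __rdtscp(&aux0);
+    fn();
+    unsigned __int64 t1 = __rdtscp(&aux1);
+    return (aux0 == aux1) ? (t1 - t0) : 0;   // 0 signals a core switch
+}
+-->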
+<intrinsic tech="Other" name="_xabort">
+ <CPUID>RTM</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="const unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Force an RTM abort. The EAX register is updated to reflect that an XABORT instruction caused the abort, and the "imm8" parameter is provided in bits [31:24] of EAX.
+ Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction.</description>
+ <operation>IF RTM_ACTIVE == 0
+ // nop
+ELSE
+ // restore architectural register state
+ // discard memory updates performed in transaction
+ // update EAX with status and imm8 value
+ eax[31:24] := imm8[7:0]
+ RTM_NEST_COUNT := 0
+ RTM_ACTIVE := 0
+ IF _64_BIT_MODE
+ RIP := fallbackRIP
+ ELSE
+ EIP := fallbackEIP
+ FI
+FI
+ </operation>
+ <instruction name="XABORT" form="imm8" xed="XABORT_IMMb"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xbegin">
+ <CPUID>RTM</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="k" etype="UI32"/>
+ <parameter type="void"/>
+ <description>Specify the start of an RTM code region.
+ If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution.
+	On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction. Returns a status of ~0 (0xFFFFFFFF) if execution is continuing inside the transaction; all other status codes indicate an abort.</description>
+ <operation>IF RTM_NEST_COUNT &lt; MAX_RTM_NEST_COUNT
+ RTM_NEST_COUNT := RTM_NEST_COUNT + 1
+ IF RTM_NEST_COUNT == 1
+ IF _64_BIT_MODE
+ fallbackRIP := RIP
+ ELSE IF _32_BIT_MODE
+ fallbackEIP := EIP
+ FI
+
+ RTM_ACTIVE := 1
+ // enter RTM execution, record register state, start tracking memory state
+ FI
+ELSE
+ // RTM abort (see _xabort)
+FI
+ </operation>
+ <instruction name="XBEGIN" form="r32" xed="XBEGIN_RELBRz"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xend">
+ <CPUID>RTM</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Specify the end of an RTM code region.
+ If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically.
+ If the commit fails, the logical processor will perform an RTM abort.</description>
+ <operation>IF RTM_ACTIVE == 1
+ RTM_NEST_COUNT := RTM_NEST_COUNT - 1
+ IF RTM_NEST_COUNT == 0
+ // try to commit transaction
+ IF FAIL_TO_COMMIT_TRANSACTION
+ // RTM abort (see _xabort)
+ ELSE
+ RTM_ACTIVE := 0
+ FI
+ FI
+FI
+ </operation>
+ <instruction name="XEND" xed="XEND"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xtest">
+ <CPUID>RTM</CPUID>
+ <category>General Support</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="void"/>
+ <description>Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise.</description>
+ <operation>IF (RTM_ACTIVE == 1 OR HLE_ACTIVE == 1)
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="XTEST" xed="XTEST"/>
+ <header>immintrin.h</header>
+</intrinsic>
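+<!-- The canonical usage pattern for the RTM intrinsics above: attempt a
+     transaction, and on any abort fall back to a conventional locked path
+     (locked_update is a placeholder for that path).
+
+#include <immintrin.h>
+
+extern void locked_update(long *p);
+
+void transactional_increment(long *p) {
+    unsigned int status = _xbegin();
+    if (status == _XBEGIN_STARTED) {   // ~0: we are inside the transaction
+        *p += 1;
+        _xend();                       // attempt to commit
+    } else {
+        locked_update(p);              // aborted: take the slow, locked path
+    }
+}
+-->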
+<intrinsic tech="Other" name="_serialize">
+ <CPUID>SERIALIZE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <description>Serialize instruction execution, ensuring all modifications to flags, registers, and memory by previous instructions are completed before the next instruction is fetched.</description>
+ <instruction name="SERIALIZE" xed="SERIALIZE"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_sha1msg1_epu32">
+ <type>Integer</type>
+ <CPUID>SHA</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst".</description>
+ <operation>
+W0 := a[127:96]
+W1 := a[95:64]
+W2 := a[63:32]
+W3 := a[31:0]
+W4 := b[127:96]
+W5 := b[95:64]
+dst[127:96] := W2 XOR W0
+dst[95:64] := W3 XOR W1
+dst[63:32] := W4 XOR W2
+dst[31:0] := W5 XOR W3
+ </operation>
+ <instruction name="SHA1MSG1" form="xmm, xmm" xed="SHA1MSG1_XMMi32_XMMi32_SHA"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_sha1msg2_epu32">
+ <type>Integer</type>
+ <CPUID>SHA</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in "a" and the previous message values in "b", and store the result in "dst".</description>
+ <operation>
+W13 := b[95:64]
+W14 := b[63:32]
+W15 := b[31:0]
+W16 := (a[127:96] XOR W13) &lt;&lt;&lt; 1
+W17 := (a[95:64] XOR W14) &lt;&lt;&lt; 1
+W18 := (a[63:32] XOR W15) &lt;&lt;&lt; 1
+W19 := (a[31:0] XOR W16) &lt;&lt;&lt; 1
+dst[127:96] := W16
+dst[95:64] := W17
+dst[63:32] := W18
+dst[31:0] := W19
+ </operation>
+ <instruction name="SHA1MSG2" form="xmm, xmm" xed="SHA1MSG2_XMMi32_XMMi32_SHA"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_sha1nexte_epu32">
+ <type>Integer</type>
+ <CPUID>SHA</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable "a", add that value to the scheduled values (unsigned 32-bit integers) in "b", and store the result in "dst".</description>
+ <operation>
+tmp := (a[127:96] &lt;&lt;&lt; 30)
+dst[127:96] := b[127:96] + tmp
+dst[95:64] := b[95:64]
+dst[63:32] := b[63:32]
+dst[31:0] := b[31:0]
+ </operation>
+ <instruction name="SHA1NEXTE" form="xmm, xmm" xed="SHA1NEXTE_XMMi32_XMMi32_SHA"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_sha1rnds4_epu32">
+ <type>Integer</type>
+ <CPUID>SHA</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="const int" varname="func" etype="IMM" immwidth="2"/>
+	<description>Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from "a", and a pre-computed sum of the next 4 round message values (unsigned 32-bit integers) and state variable E from "b", and store the updated SHA1 state (A,B,C,D) in "dst". "func" specifies the logic functions and round constants.</description>
+ <operation>IF (func[1:0] == 0)
+ f := f0()
+ K := K0
+ELSE IF (func[1:0] == 1)
+ f := f1()
+ K := K1
+ELSE IF (func[1:0] == 2)
+ f := f2()
+ K := K2
+ELSE IF (func[1:0] == 3)
+ f := f3()
+ K := K3
+FI
+A := a[127:96]
+B := a[95:64]
+C := a[63:32]
+D := a[31:0]
+W[0] := b[127:96]
+W[1] := b[95:64]
+W[2] := b[63:32]
+W[3] := b[31:0]
+A[1] := f(B, C, D) + (A &lt;&lt;&lt; 5) + W[0] + K
+B[1] := A
+C[1] := B &lt;&lt;&lt; 30
+D[1] := C
+E[1] := D
+FOR i := 1 to 3
+ A[i+1] := f(B[i], C[i], D[i]) + (A[i] &lt;&lt;&lt; 5) + W[i] + E[i] + K
+ B[i+1] := A[i]
+ C[i+1] := B[i] &lt;&lt;&lt; 30
+ D[i+1] := C[i]
+ E[i+1] := D[i]
+ENDFOR
+dst[127:96] := A[4]
+dst[95:64] := B[4]
+dst[63:32] := C[4]
+dst[31:0] := D[4]
+ </operation>
+ <instruction name="SHA1RNDS4" form="xmm, xmm, imm8" xed="SHA1RNDS4_XMMi32_XMMi32_IMM8_SHA"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_sha256msg1_epu32">
+ <type>Integer</type>
+ <CPUID>SHA</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst".</description>
+ <operation>W4 := b[31:0]
+W3 := a[127:96]
+W2 := a[95:64]
+W1 := a[63:32]
+W0 := a[31:0]
+dst[127:96] := W3 + sigma0(W4)
+dst[95:64] := W2 + sigma0(W3)
+dst[63:32] := W1 + sigma0(W2)
+dst[31:0] := W0 + sigma0(W1)
+ </operation>
+ <instruction name="SHA256MSG1" form="xmm, xmm" xed="SHA256MSG1_XMMi32_XMMi32_SHA"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_sha256msg2_epu32">
+ <type>Integer</type>
+ <CPUID>SHA</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+	<description>Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst".</description>
+ <operation>W14 := b[95:64]
+W15 := b[127:96]
+W16 := a[31:0] + sigma1(W14)
+W17 := a[63:32] + sigma1(W15)
+W18 := a[95:64] + sigma1(W16)
+W19 := a[127:96] + sigma1(W17)
+dst[127:96] := W19
+dst[95:64] := W18
+dst[63:32] := W17
+dst[31:0] := W16
+ </operation>
+ <instruction name="SHA256MSG2" form="xmm, xmm" xed="SHA256MSG2_XMMi32_XMMi32_SHA"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm_sha256rnds2_epu32">
+ <type>Integer</type>
+ <CPUID>SHA</CPUID>
+ <category>Cryptography</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <parameter type="__m128i" varname="k" etype="UI32"/>
+ <description>Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from "a", an initial SHA256 state (A,B,E,F) from "b", and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from "k", and store the updated SHA256 state (A,B,E,F) in "dst".</description>
+ <operation>A[0] := b[127:96]
+B[0] := b[95:64]
+C[0] := a[127:96]
+D[0] := a[95:64]
+E[0] := b[63:32]
+F[0] := b[31:0]
+G[0] := a[63:32]
+H[0] := a[31:0]
+W_K[0] := k[31:0]
+W_K[1] := k[63:32]
+FOR i := 0 to 1
+ A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i])
+ B[i+1] := A[i]
+ C[i+1] := B[i]
+ D[i+1] := C[i]
+ E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i]
+ F[i+1] := E[i]
+ G[i+1] := F[i]
+ H[i+1] := G[i]
+ENDFOR
+dst[127:96] := A[2]
+dst[95:64] := B[2]
+dst[63:32] := E[2]
+dst[31:0] := F[2]
+ </operation>
+ <instruction name="SHA256RNDS2" form="xmm, xmm" xed="SHA256RNDS2_XMMi32_XMMi32_SHA"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_MM_TRANSPOSE4_PS">
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="void"/>
+ <parameter type="__m128" varname="row0" etype="FP32"/>
+ <parameter type="__m128" varname="row1" etype="FP32"/>
+ <parameter type="__m128" varname="row2" etype="FP32"/>
+ <parameter type="__m128" varname="row3" etype="FP32"/>
+ <description>Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in "row0", "row1", "row2", and "row3", and store the transposed matrix in these vectors ("row0" now contains column 0, etc.).</description>
+ <operation>
+__m128 tmp3, tmp2, tmp1, tmp0;
+tmp0 := _mm_unpacklo_ps(row0, row1);
+tmp2 := _mm_unpacklo_ps(row2, row3);
+tmp1 := _mm_unpackhi_ps(row0, row1);
+tmp3 := _mm_unpackhi_ps(row2, row3);
+row0 := _mm_movelh_ps(tmp0, tmp2);
+row1 := _mm_movehl_ps(tmp2, tmp0);
+row2 := _mm_movelh_ps(tmp1, tmp3);
+row3 := _mm_movehl_ps(tmp3, tmp1);
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
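+<!-- A usage sketch for the _MM_TRANSPOSE4_PS macro above; the macro rewrites its
+     four row arguments in place, so they are reloaded and stored around it.
+
+#include <xmmintrin.h>
+
+void transpose4x4(float m[16]) {
+    __m128 r0 = _mm_loadu_ps(m + 0);
+    __m128 r1 = _mm_loadu_ps(m + 4);
+    __m128 r2 = _mm_loadu_ps(m + 8);
+    __m128 r3 = _mm_loadu_ps(m + 12);
+    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   // rows become columns, in place
+    _mm_storeu_ps(m + 0, r0);
+    _mm_storeu_ps(m + 4, r1);
+    _mm_storeu_ps(m + 8, r2);
+    _mm_storeu_ps(m + 12, r3);
+}
+-->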
+<intrinsic tech="SSE" name="_mm_getcsr">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="void"/>
+ <description>Get the unsigned 32-bit value of the MXCSR control and status register.</description>
+ <operation>dst[31:0] := MXCSR
+ </operation>
+ <instruction name="STMXCSR" form="m32" xed="STMXCSR_MEMd"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_setcsr">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Set the MXCSR control and status register with the value in unsigned 32-bit integer "a".</description>
+ <operation>
+MXCSR := a[31:0]
+ </operation>
+ <instruction name="LDMXCSR" form="m32" xed="LDMXCSR_MEMd"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_MM_GET_EXCEPTION_STATE">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <description>Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT</description>
+ <operation>dst[31:0] := MXCSR &amp; _MM_EXCEPT_MASK
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_MM_SET_EXCEPTION_STATE">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT</description>
+ <operation>MXCSR := a[31:0] AND ~_MM_EXCEPT_MASK
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_MM_GET_EXCEPTION_MASK">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <description>Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT</description>
+ <operation>dst[31:0] := MXCSR &amp; _MM_MASK_MASK
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_MM_SET_EXCEPTION_MASK">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT</description>
+ <operation>MXCSR := a[31:0] AND ~_MM_MASK_MASK
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_MM_GET_ROUNDING_MODE">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <description>Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO</description>
+ <operation>dst[31:0] := MXCSR &amp; _MM_ROUND_MASK
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_MM_SET_ROUNDING_MODE">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO</description>
+ <operation>MXCSR := a[31:0] AND ~_MM_ROUND_MASK
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_MM_GET_FLUSH_ZERO_MODE">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <description>Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF</description>
+ <operation>dst[31:0] := MXCSR &amp; _MM_FLUSH_MASK
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_MM_SET_FLUSH_ZERO_MODE">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <description>Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF</description>
+ <operation>MXCSR := a[31:0] AND ~_MM_FLUSH_MASK
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
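+<!-- A sketch combining the MXCSR macros above: enable flush-to-zero and
+     round-toward-zero around a callback, then restore the previous state via
+     the raw get/set intrinsics.
+
+#include <xmmintrin.h>
+
+void with_ftz_rtz(void (*fn)(void)) {
+    unsigned int saved = _mm_getcsr();               // snapshot full MXCSR
+    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);      // denormals flush to zero
+    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);    // truncating rounding
+    fn();
+    _mm_setcsr(saved);                               // restore prior state
+}
+-->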
+<intrinsic tech="SSE" name="_mm_prefetch">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="char const*" varname="p" etype="UI8"/>
+ <parameter type="int" varname="i" etype="IMM" immwidth="2"/>
+	<description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
+ <instruction name="PREFETCHNTA" form="m8" xed="PREFETCHNTA_MEMmprefetch"/>
+ <instruction name="PREFETCHT0" form="m8" xed="PREFETCHT0_MEMmprefetch"/>
+ <instruction name="PREFETCHT1" form="m8" xed="PREFETCHT1_MEMmprefetch"/>
+ <instruction name="PREFETCHT2" form="m8" xed="PREFETCHT2_MEMmprefetch"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_sfence">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+	<description>Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes the fence in program order is globally visible before any store instruction that follows the fence in program order.</description>
+ <instruction name="SFENCE" xed="SFENCE"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_max_pi16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXSW" form="mm, mm" xed="PMAXSW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pmaxsw">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXSW" form="mm, mm" xed="PMAXSW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_max_pu8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXUB" form="mm, mm" xed="PMAXUB_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pmaxub">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXUB" form="mm, mm" xed="PMAXUB_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_min_pi16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINSW" form="mm, mm" xed="PMINSW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pminsw">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINSW" form="mm, mm" xed="PMINSW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_min_pu8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINUB" form="mm, mm" xed="PMINUB_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pminub">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINUB" form="mm, mm" xed="PMINUB_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
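+<!-- Usage sketch (editorial, not part of the Intel data): the four MMX min/max
+     intrinsics above compose into a per-lane clamp; clamp_pi16 is an
+     illustrative name.
+#include <xmmintrin.h>
+
+__m64 clamp_pi16(__m64 v, __m64 lo, __m64 hi) {
+    /* per lane: MAX(lo, MIN(v, hi)) on four signed 16-bit elements */
+    return _mm_max_pi16(lo, _mm_min_pi16(v, hi));
+}
+-->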
+<intrinsic tech="SSE" name="_mm_mulhi_pu16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+ </operation>
+ <instruction name="PMULHUW" form="mm, mm" xed="PMULHUW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pmulhuw">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+ </operation>
+ <instruction name="PMULHUW" form="mm, mm" xed="PMULHUW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
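+<!-- Usage sketch (editorial, not part of the Intel data): _mm_mulhi_pu16 keeps only
+     the high halves; paired with the MMX low-half multiply _mm_mullo_pi16 and an
+     unpack, it reconstructs full 32-bit products. widen_lo is an illustrative name.
+#include <mmintrin.h>
+#include <xmmintrin.h>
+
+__m64 widen_lo(__m64 a, __m64 b) {
+    __m64 lo = _mm_mullo_pi16(a, b);   /* low 16 bits of each 32-bit product */
+    __m64 hi = _mm_mulhi_pu16(a, b);   /* high 16 bits of each 32-bit product */
+    return _mm_unpacklo_pi16(lo, hi);  /* full products of the two low lanes */
+}
+-->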
+<intrinsic tech="SSE" name="_mm_avg_pu8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+ </operation>
+ <instruction name="PAVGB" form="mm, mm" xed="PAVGB_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pavgb">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+ </operation>
+ <instruction name="PAVGB" form="mm, mm" xed="PAVGB_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_avg_pu16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+ </operation>
+ <instruction name="PAVGW" form="mm, mm" xed="PAVGW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pavgw">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="__m64" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+ </operation>
+ <instruction name="PAVGW" form="mm, mm" xed="PAVGW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
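+<!-- Usage sketch (editorial, not part of the Intel data): the "+ 1" in the pseudocode
+     above makes PAVGB/PAVGW a round-to-nearest-up average computed without overflow,
+     the usual building block for 50% image blends. blend_rows is an illustrative name.
+#include <xmmintrin.h>
+
+void blend_rows(__m64 *dst, const __m64 *row0, const __m64 *row1, int n8) {
+    for (int j = 0; j < n8; j++)
+        dst[j] = _mm_avg_pu8(row0[j], row1[j]); /* 8 channels averaged per op */
+    _mm_empty();
+}
+-->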
+<intrinsic tech="SSE" name="_mm_sad_pu8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce one unsigned 16-bit integer, and store this sum in the low 16 bits of "dst"; the remaining bits of "dst" are zeroed.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
+dst[63:16] := 0
+ </operation>
+ <instruction name="PSADBW" form="mm, mm" xed="PSADBW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_psadbw">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce one unsigned 16-bit integer, and store this sum in the low 16 bits of "dst"; the remaining bits of "dst" are zeroed.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
+dst[63:16] := 0
+ </operation>
+ <instruction name="PSADBW" form="mm, mm" xed="PSADBW_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
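+<!-- Usage sketch (editorial, not part of the Intel data): PSADBW collapses eight
+     absolute byte differences into one 16-bit sum, the inner step of
+     motion-estimation cost metrics. sad8 is an illustrative name.
+#include <mmintrin.h>
+#include <xmmintrin.h>
+
+unsigned sad8(__m64 a, __m64 b) {
+    __m64 s = _mm_sad_pu8(a, b);                /* sum of |a[i]-b[i]| in bits 15:0 */
+    unsigned r = (unsigned)_mm_cvtsi64_si32(s); /* low 32 bits; the rest are zero */
+    _mm_empty();
+    return r;
+}
+-->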
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cvtsi32_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="b" etype="SI32"/>
+ <description>Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CVTSI2SS" form="xmm, r32" xed="CVTSI2SS_XMMss_GPR32d"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cvt_si2ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="b" etype="SI32"/>
+ <description>Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CVTSI2SS" form="xmm, r32" xed="CVTSI2SS_XMMss_GPR32d"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvtsi64_ss">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <description>Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="CVTSI2SS" form="xmm, r64" xed="CVTSI2SS_XMMss_GPR64q"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvtpi32_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+ <description>Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+ </operation>
+ <instruction name="CVTPI2PS" form="xmm, mm" xed="CVTPI2PS_XMMq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvt_pi2ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+ </operation>
+ <instruction name="CVTPI2PS" form="xmm, mm" xed="CVTPI2PS_XMMq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_cvtpi16_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ m := j*32
+ dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+ENDFOR
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_cvtpu16_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <description>Convert packed unsigned 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ m := j*32
+ dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
+ENDFOR
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_cvtpi8_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI8"/>
+ <description>Convert the lower packed signed 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*8
+ m := j*32
+ dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
+ENDFOR
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_cvtpu8_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <description>Convert the lower packed unsigned 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*8
+ m := j*32
+ dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
+ENDFOR
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_cvtpi32x2_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", then convert the packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_Int32_To_FP32(a[31:0])
+dst[63:32] := Convert_Int32_To_FP32(a[63:32])
+dst[95:64] := Convert_Int32_To_FP32(b[31:0])
+dst[127:96] := Convert_Int32_To_FP32(b[63:32])
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
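+<!-- Usage sketch (editorial, not part of the Intel data): entries marked
+     sequence="TRUE" have no single instruction; _mm_cvtpi32x2_ps, for example, is
+     typically emitted as two CVTPI2PS plus a shuffle. four_ints_to_ps is an
+     illustrative name.
+#include <xmmintrin.h>
+
+__m128 four_ints_to_ps(__m64 lo, __m64 hi) {
+    __m128 v = _mm_cvtpi32x2_ps(lo, hi); /* lanes 0-1 from lo, lanes 2-3 from hi */
+    _mm_empty();                         /* exit MMX state before FP-only code */
+    return v;
+}
+-->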
+<intrinsic tech="SSE" name="_mm_stream_pi">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m64*" varname="mem_addr" etype="FP32" memwidth="64"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <description>Store 64-bits of integer data from "a" into memory using a non-temporal memory hint.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+ </operation>
+ <instruction name="MOVNTQ" form="m64, mm" xed="MOVNTQ_MEMq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_maskmove_si64">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="mask" etype="UI8"/>
+ <parameter type="char*" varname="mem_addr" etype="UI8" memwidth="64"/>
+ <description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ IF mask[i+7]
+ MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="MASKMOVQ" form="mm, mm" xed="MASKMOVQ_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_maskmovq">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="mask" etype="UI8"/>
+ <parameter type="char*" varname="mem_addr" etype="UI8" memwidth="64"/>
+ <description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ IF mask[i+7]
+ MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="MASKMOVQ" form="mm, mm" xed="MASKMOVQ_MMXq_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
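+<!-- Usage sketch (editorial, not part of the Intel data): the mask for MASKMOVQ is
+     usually the output of an MMX compare, so a predicate drives a selective,
+     non-temporal byte store. store_if_gt is an illustrative name.
+#include <mmintrin.h>
+#include <xmmintrin.h>
+
+void store_if_gt(__m64 a, __m64 b, char *dst) {
+    __m64 mask = _mm_cmpgt_pi8(a, b);  /* 0xFF where a > b (signed), else 0x00 */
+    _mm_maskmove_si64(a, mask, dst);   /* write only the selected bytes */
+    _mm_empty();
+}
+-->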
+<intrinsic tech="SSE" name="_mm_extract_pi16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="int" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
+ <operation>
+dst[15:0] := (a[63:0] &gt;&gt; (imm8[1:0] * 16))[15:0]
+dst[31:16] := 0
+ </operation>
+ <instruction name="PEXTRW" form="r32, mm, imm8" xed="PEXTRW_GPR32_MMXq_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pextrw">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="int" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
+ <operation>
+dst[15:0] := (a[63:0] &gt;&gt; (imm8[1:0] * 16))[15:0]
+dst[31:16] := 0
+ </operation>
+ <instruction name="PEXTRW" form="r32, mm, imm8" xed="PEXTRW_GPR32_MMXq_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_insert_pi16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="i" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[63:0] := a[63:0]
+sel := imm8[1:0]*16
+dst[sel+15:sel] := i[15:0]
+ </operation>
+ <instruction name="PINSRW" form="mm, r32, imm8" xed="PINSRW_MMXq_GPR32_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pinsrw">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="i" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[63:0] := a[63:0]
+sel := imm8[1:0]*16
+dst[sel+15:sel] := i[15:0]
+ </operation>
+ <instruction name="PINSRW" form="mm, r32, imm8" xed="PINSRW_MMXq_GPR32_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
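+<!-- Usage sketch (editorial, not part of the Intel data): imm8 selects the word lane
+     and must be a compile-time constant, since it is encoded in the instruction.
+     swap_w0_w3 is an illustrative name.
+#include <xmmintrin.h>
+
+__m64 swap_w0_w3(__m64 v) {
+    int w0 = _mm_extract_pi16(v, 0);
+    int w3 = _mm_extract_pi16(v, 3);
+    v = _mm_insert_pi16(v, w3, 0);
+    v = _mm_insert_pi16(v, w0, 3);
+    return v;
+}
+-->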
+<intrinsic tech="SSE" name="_mm_movemask_pi8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Miscellaneous</category>
+ <return type="int" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[j] := a[i+7]
+ENDFOR
+dst[MAX:8] := 0
+ </operation>
+ <instruction name="PMOVMSKB" form="r32, mm" xed="PMOVMSKB_GPR32_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pmovmskb">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Miscellaneous</category>
+ <return type="int" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[j] := a[i+7]
+ENDFOR
+dst[MAX:8] := 0
+ </operation>
+ <instruction name="PMOVMSKB" form="r32, mm" xed="PMOVMSKB_GPR32_MMXq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_shuffle_pi16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[15:0] := src[15:0]
+ 1: tmp[15:0] := src[31:16]
+ 2: tmp[15:0] := src[47:32]
+ 3: tmp[15:0] := src[63:48]
+ ESAC
+ RETURN tmp[15:0]
+}
+dst[15:0] := SELECT4(a[63:0], imm8[1:0])
+dst[31:16] := SELECT4(a[63:0], imm8[3:2])
+dst[47:32] := SELECT4(a[63:0], imm8[5:4])
+dst[63:48] := SELECT4(a[63:0], imm8[7:6])
+ </operation>
+ <instruction name="PSHUFW" form="mm, mm, imm8" xed="PSHUFW_MMXq_MMXq_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_m_pshufw">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[15:0] := src[15:0]
+ 1: tmp[15:0] := src[31:16]
+ 2: tmp[15:0] := src[47:32]
+ 3: tmp[15:0] := src[63:48]
+ ESAC
+ RETURN tmp[15:0]
+}
+dst[15:0] := SELECT4(a[63:0], imm8[1:0])
+dst[31:16] := SELECT4(a[63:0], imm8[3:2])
+dst[47:32] := SELECT4(a[63:0], imm8[5:4])
+dst[63:48] := SELECT4(a[63:0], imm8[7:6])
+ </operation>
+ <instruction name="PSHUFW" form="mm, mm, imm8" xed="PSHUFW_MMXq_MMXq_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
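+<!-- Usage sketch (editorial, not part of the Intel data): imm8 packs four 2-bit
+     source selectors, lowest destination word first, so 0x1B (binary 00 01 10 11)
+     reverses the four words. reverse_words is an illustrative name.
+#include <xmmintrin.h>
+
+__m64 reverse_words(__m64 v) {
+    return _mm_shuffle_pi16(v, 0x1B); /* dst words = src words 3,2,1,0 */
+}
+-->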
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_add_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] + b[31:0]
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="ADDSS" form="xmm, xmm" xed="ADDSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_add_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="ADDPS" form="xmm, xmm" xed="ADDPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_sub_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] - b[31:0]
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="SUBSS" form="xmm, xmm" xed="SUBSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_sub_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="SUBPS" form="xmm, xmm" xed="SUBPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_mul_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] * b[31:0]
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="MULSS" form="xmm, xmm" xed="MULSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_mul_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="MULPS" form="xmm, xmm" xed="MULPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
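+<!-- Usage sketch (editorial, not part of the Intel data): the _ss forms touch only
+     lane 0 and pass lanes 1-3 of "a" through, matching scalar float code; the _ps
+     forms are the four-wide versions, as in this axpy step. axpy4 is an
+     illustrative name.
+#include <xmmintrin.h>
+
+void axpy4(float *y, const float *x, float a) {
+    __m128 va = _mm_set1_ps(a);                 /* broadcast a to all 4 lanes */
+    __m128 vx = _mm_loadu_ps(x);                /* unaligned 4-float loads */
+    __m128 vy = _mm_loadu_ps(y);
+    _mm_storeu_ps(y, _mm_add_ps(_mm_mul_ps(va, vx), vy)); /* y = a*x + y */
+}
+-->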
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_div_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] / b[31:0]
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="DIVSS" form="xmm, xmm" xed="DIVSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_div_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="DIVPS" form="xmm, xmm" xed="DIVPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_sqrt_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := SQRT(a[31:0])
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="SQRTSS" form="xmm, xmm" xed="SQRTSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="SQRTPS" form="xmm, xmm" xed="SQRTPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_rcp_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+ <operation>
+dst[31:0] := (1.0 / a[31:0])
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="RCPSS" form="xmm, xmm" xed="RCPSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_rcp_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (1.0 / a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="RCPPS" form="xmm, xmm" xed="RCPPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_rsqrt_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+ <operation>
+dst[31:0] := (1.0 / SQRT(a[31:0]))
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="RSQRTSS" form="xmm, xmm" xed="RSQRTSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_rsqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
+ENDFOR
+ </operation>
+ <instruction name="RSQRTPS" form="xmm, xmm" xed="RSQRTPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
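+<!-- Usage sketch (editorial, not part of the Intel data): RSQRTPS is only accurate
+     to about 12 bits (relative error below 1.5*2^-12, per the description above);
+     one Newton-Raphson step, x1 = 0.5*x0*(3 - a*x0*x0), recovers nearly full
+     single precision. rsqrt_nr is an illustrative name.
+#include <xmmintrin.h>
+
+__m128 rsqrt_nr(__m128 a) {
+    __m128 x    = _mm_rsqrt_ps(a);                      /* ~12-bit estimate */
+    __m128 axx  = _mm_mul_ps(a, _mm_mul_ps(x, x));      /* a*x*x */
+    __m128 half = _mm_set1_ps(0.5f), three = _mm_set1_ps(3.0f);
+    return _mm_mul_ps(_mm_mul_ps(half, x), _mm_sub_ps(three, axx));
+}
+-->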
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_min_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := MIN(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="MINSS" form="xmm, xmm" xed="MINSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_min_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="MINPS" form="xmm, xmm" xed="MINPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_max_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := MAX(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="MAXSS" form="xmm, xmm" xed="MAXSS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_max_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="MAXPS" form="xmm, xmm" xed="MAXPS_XMMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_and_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="ANDPS" form="xmm, xmm" xed="ANDPS_XMMxud_XMMxud"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_andnot_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="ANDNPS" form="xmm, xmm" xed="ANDNPS_XMMxud_XMMxud"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_or_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="ORPS" form="xmm, xmm" xed="ORPS_XMMxud_XMMxud"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_xor_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Logical</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="XORPS" form="xmm, xmm" xed="XORPS_XMMxud_XMMxud"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
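+<!-- Usage sketch (editorial, not part of the Intel data): the logical intrinsics
+     operate on the raw bit patterns of the floats, which is how sign-bit tricks
+     are built. abs_ps and neg_ps are illustrative names.
+#include <xmmintrin.h>
+
+__m128 abs_ps(__m128 v) {
+    return _mm_andnot_ps(_mm_set1_ps(-0.0f), v); /* clear each lane's sign bit */
+}
+
+__m128 neg_ps(__m128 v) {
+    return _mm_xor_ps(v, _mm_set1_ps(-0.0f));    /* flip each lane's sign bit */
+}
+-->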
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpeq_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpeq_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmplt_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := ( a[31:0] &lt; b[31:0] ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmplt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &lt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmple_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := ( a[31:0] &lt;= b[31:0] ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmple_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpgt_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := ( a[31:0] &gt; b[31:0] ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpgt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpge_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := ( a[31:0] &gt;= b[31:0] ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpge_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpneq_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpneq_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpnlt_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := (!( a[31:0] &lt; b[31:0] )) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpnlt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (!( a[i+31:i] &lt; b[i+31:i] )) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpnle_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := (!( a[31:0] &lt;= b[31:0] )) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpnle_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (!( a[i+31:i] &lt;= b[i+31:i] )) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpngt_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := (!( a[31:0] &gt; b[31:0] )) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpngt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (!( a[i+31:i] &gt; b[i+31:i] )) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpnge_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := (!( a[31:0] &gt;= b[31:0] )) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpnge_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := (!( a[i+31:i] &gt;= b[i+31:i] )) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpord_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpord_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpunord_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="CMPSS" form="xmm, xmm, imm8" xed="CMPSS_XMMss_XMMss_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cmpunord_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPS" form="xmm, xmm, imm8" xed="CMPPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
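+<!-- Editor's note: the ordered/unordered compares above return all-ones or
+     all-zeros lane masks rather than booleans. A minimal sketch (assuming a
+     C compiler targeting SSE; the helper name nan_to_zero is hypothetical)
+     of using _mm_cmpord_ps with _mm_and_ps to squash NaN lanes to 0.0f:
+
+       #include <xmmintrin.h>
+
+       static __m128 nan_to_zero(__m128 v) {
+           __m128 ordered = _mm_cmpord_ps(v, v);  /* lane = 0xFFFFFFFF iff that lane of v is not NaN */
+           return _mm_and_ps(v, ordered);         /* NaN lanes become 0.0f, others pass through */
+       }
+-->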
+<intrinsic tech="SSE" name="_mm_comieq_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[31:0] == b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISS" form="xmm, xmm" xed="COMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_comilt_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[31:0] &lt; b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISS" form="xmm, xmm" xed="COMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_comile_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[31:0] &lt;= b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISS" form="xmm, xmm" xed="COMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_comigt_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[31:0] &gt; b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISS" form="xmm, xmm" xed="COMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_comige_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[31:0] &gt;= b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISS" form="xmm, xmm" xed="COMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_comineq_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[31:0] != b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISS" form="xmm, xmm" xed="COMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_ucomieq_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[31:0] == b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISS" form="xmm, xmm" xed="UCOMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_ucomilt_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[31:0] &lt; b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISS" form="xmm, xmm" xed="UCOMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_ucomile_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[31:0] &lt;= b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISS" form="xmm, xmm" xed="UCOMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_ucomigt_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[31:0] &gt; b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISS" form="xmm, xmm" xed="UCOMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_ucomige_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[31:0] &gt;= b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISS" form="xmm, xmm" xed="UCOMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_ucomineq_ss">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[31:0] != b[31:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISS" form="xmm, xmm" xed="UCOMISS_XMMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
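+<!-- Editor's note: COMISS/UCOMISS return plain 0/1 ints, unlike the CMPSS
+     mask compares above. The only difference between the two families is
+     that the _mm_comi* forms signal an invalid-operation exception for QNaN
+     inputs while the _mm_ucomi* forms stay quiet. A small sketch (assuming
+     SSE):
+
+       #include <stdio.h>
+       #include <xmmintrin.h>
+
+       int main(void) {
+           __m128 a = _mm_set_ss(1.0f);
+           __m128 b = _mm_set_ss(2.0f);
+           printf("%d\n", _mm_comilt_ss(a, b));  /* prints 1: 1.0 < 2.0 */
+           printf("%d\n", _mm_ucomige_ss(a, b)); /* prints 0, and stays quiet on QNaN inputs */
+           return 0;
+       }
+-->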
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cvtss_si32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+ </operation>
+ <instruction name="CVTSS2SI" form="r32, xmm" xed="CVTSS2SI_GPR32d_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cvt_ss2si">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+ </operation>
+ <instruction name="CVTSS2SI" form="r32, xmm" xed="CVTSS2SI_GPR32d_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvtss_si64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+ </operation>
+ <instruction name="CVTSS2SI" form="r64, xmm" xed="CVTSS2SI_GPR64q_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvtss_f32">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="float" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Copy the lower single-precision (32-bit) floating-point element of "a" to "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+ </operation>
+ <instruction name="MOVSS" form="m32, xmm" xed="MOVSS_MEMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvtps_pi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTPS2PI" form="mm, xmm" xed="CVTPS2PI_MMXq_XMMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvt_ps2pi">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTPS2PI" form="mm, xmm" xed="CVTPS2PI_MMXq_XMMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cvttss_si32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+ </operation>
+ <instruction name="CVTTSS2SI" form="r32, xmm" xed="CVTTSS2SI_GPR32d_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_cvtt_ss2si">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+ </operation>
+ <instruction name="CVTTSS2SI" form="r32, xmm" xed="CVTTSS2SI_GPR32d_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
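+<!-- Editor's note: the cvt* conversions round using MXCSR's current mode
+     (round to nearest even by default) while the cvtt* forms always truncate
+     toward zero. A sketch of the difference (assuming SSE and the default
+     rounding mode; demo is a hypothetical name):
+
+       #include <xmmintrin.h>
+
+       static void demo(void) {
+           __m128 v = _mm_set_ss(1.7f);
+           int rounded   = _mm_cvtss_si32(v);   /* 2 under the default round-to-nearest mode */
+           int truncated = _mm_cvttss_si32(v);  /* always 1: the conversion chops toward zero */
+           (void)rounded; (void)truncated;
+       }
+-->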
+<intrinsic tech="SSE" name="_mm_cvttss_si64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+ </operation>
+ <instruction name="CVTTSS2SI" form="r64, xmm" xed="CVTTSS2SI_GPR64q_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvttps_pi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTTPS2PI" form="mm, xmm" xed="CVTTPS2PI_MMXq_XMMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_cvtt_ps2pi">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTTPS2PI" form="mm, xmm" xed="CVTTPS2PI_MMXq_XMMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_cvtps_pi16">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.</description>
+ <operation>
+FOR j := 0 to 3
+ i := 16*j
+ k := 32*j
+ IF a[k+31:k] &gt;= FP32(0x7FFF) &amp;&amp; a[k+31:k] &lt;= FP32(0x7FFFFFFF)
+ dst[i+15:i] := 0x7FFF
+ ELSE
+ dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
+ FI
+ENDFOR
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_cvtps_pi8">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="SI8"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 8-bit integers, and store the results in the lower 4 elements of "dst". Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.</description>
+ <operation>
+FOR j := 0 to 3
+ i := 8*j
+ k := 32*j
+ IF a[k+31:k] &gt;= FP32(0x7F) &amp;&amp; a[k+31:k] &lt;= FP32(0x7FFFFFFF)
+ dst[i+7:i] := 0x7F
+ ELSE
+ dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
+ FI
+ENDFOR
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_set_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Set</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+ <description>Copy single-precision (32-bit) floating-point element "a" to the lower element of "dst", and zero the upper 3 elements.</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[127:32] := 0
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_set1_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Set</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+ <description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_set_ps1">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Set</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="a" etype="FP32"/>
+ <description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_set_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Set</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="e3" etype="FP32"/>
+ <parameter type="float" varname="e2" etype="FP32"/>
+ <parameter type="float" varname="e1" etype="FP32"/>
+ <parameter type="float" varname="e0" etype="FP32"/>
+ <description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values.</description>
+ <operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_setr_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Set</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float" varname="e3" etype="FP32"/>
+ <parameter type="float" varname="e2" etype="FP32"/>
+ <parameter type="float" varname="e1" etype="FP32"/>
+ <parameter type="float" varname="e0" etype="FP32"/>
+ <description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[31:0] := e3
+dst[63:32] := e2
+dst[95:64] := e1
+dst[127:96] := e0
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
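+<!-- Editor's note: _mm_set_ps lists its arguments from the highest lane down,
+     so e0 lands in bits 31:0, while _mm_setr_ps takes them in memory order.
+     A sketch (assuming SSE; demo is a hypothetical name) showing the two
+     calls below produce the same vector:
+
+       #include <xmmintrin.h>
+
+       static void demo(void) {
+           __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* lanes 1,2,3,4 from low to high */
+           __m128 y = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); /* identical layout, arguments in memory order */
+           (void)x; (void)y;
+       }
+-->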
+<intrinsic tech="SSE" name="_mm_setzero_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Set</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m128 with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="XORPS" form="xmm, xmm" xed="XORPS_XMMxud_XMMxud"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_loadh_pi">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m64 const*" varname="mem_addr" etype="FP32" memwidth="64"/>
+ <description>Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := a[63:32]
+dst[95:64] := MEM[mem_addr+31:mem_addr]
+dst[127:96] := MEM[mem_addr+63:mem_addr+32]
+ </operation>
+ <instruction name="MOVHPS" form="xmm, m64" xed="MOVHPS_XMMq_MEMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_loadl_pi">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m64 const*" varname="mem_addr" etype="FP32" memwidth="64"/>
+ <description>Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[63:32] := MEM[mem_addr+63:mem_addr+32]
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+ </operation>
+ <instruction name="MOVLPS" form="xmm, m64" xed="MOVLPS_XMMq_MEMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_load_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <description>Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[127:32] := 0
+ </operation>
+ <instruction name="MOVSS" form="xmm, m32" xed="MOVSS_XMMdq_MEMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_load1_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <description>Load a single-precision (32-bit) floating-point element from memory into all elements of "dst".</description>
+ <operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[63:32] := MEM[mem_addr+31:mem_addr]
+dst[95:64] := MEM[mem_addr+31:mem_addr]
+dst[127:96] := MEM[mem_addr+31:mem_addr]
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_load_ps1">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <description>Load a single-precision (32-bit) floating-point element from memory into all elements of "dst".</description>
+ <operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[63:32] := MEM[mem_addr+31:mem_addr]
+dst[95:64] := MEM[mem_addr+31:mem_addr]
+dst[127:96] := MEM[mem_addr+31:mem_addr]
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_load_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+ </operation>
+ <instruction name="MOVAPS" form="xmm, m128" xed="MOVAPS_XMMps_MEMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_loadu_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+ </operation>
+ <instruction name="MOVUPS" form="xmm, m128" xed="MOVUPS_XMMps_MEMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_loadr_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="float const*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <description>Load 4 single-precision (32-bit) floating-point elements from memory into "dst" in reverse order. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[31:0] := MEM[mem_addr+127:mem_addr+96]
+dst[63:32] := MEM[mem_addr+95:mem_addr+64]
+dst[95:64] := MEM[mem_addr+63:mem_addr+32]
+dst[127:96] := MEM[mem_addr+31:mem_addr]
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
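+<!-- Editor's note: _mm_load_ps (MOVAPS) faults unless mem_addr is 16-byte
+     aligned, while _mm_loadu_ps (MOVUPS) accepts any address. A sketch
+     (assuming a C11 compiler for the _Alignas keyword; demo is a
+     hypothetical name):
+
+       #include <xmmintrin.h>
+
+       static void demo(const float *p) {
+           _Alignas(16) float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+           __m128 a = _mm_load_ps(buf);  /* safe: buf is 16-byte aligned */
+           __m128 u = _mm_loadu_ps(p);   /* safe for any p pointing at 4 floats */
+           (void)a; (void)u;
+       }
+-->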
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_stream_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVNTPS" form="m128, xmm" xed="MOVNTPS_MEMdq_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
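+<!-- Editor's note: MOVNTPS bypasses the cache, so streamed data is not
+     ordered with later stores until a fence executes. A sketch (assuming
+     SSE and a 16-byte-aligned destination of n_vecs * 4 floats;
+     fill_streaming is a hypothetical name):
+
+       #include <xmmintrin.h>
+
+       static void fill_streaming(float *dst, float value, int n_vecs) {
+           __m128 v = _mm_set1_ps(value);
+           for (int i = 0; i < n_vecs; i++)
+               _mm_stream_ps(dst + 4 * i, v);  /* non-temporal: avoids polluting the cache */
+           _mm_sfence();  /* make the streamed stores visible before any later store */
+       }
+-->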
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_storeh_pi">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m64*" varname="mem_addr" etype="FP32" memwidth="64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[95:64]
+MEM[mem_addr+63:mem_addr+32] := a[127:96]
+ </operation>
+ <instruction name="MOVHPS" form="m64, xmm" xed="MOVHPS_MEMq_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_storel_pi">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m64*" varname="mem_addr" etype="FP32" memwidth="64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+MEM[mem_addr+63:mem_addr+32] := a[63:32]
+ </operation>
+ <instruction name="MOVLPS" form="m64, xmm" xed="MOVLPS_MEMq_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_store_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+ </operation>
+ <instruction name="MOVSS" form="m32, xmm" xed="MOVSS_MEMss_XMMss"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_store1_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+MEM[mem_addr+63:mem_addr+32] := a[31:0]
+MEM[mem_addr+95:mem_addr+64] := a[31:0]
+MEM[mem_addr+127:mem_addr+96] := a[31:0]
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_store_ps1">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="mem_addr" etype="FP32" memwidth="32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+MEM[mem_addr+63:mem_addr+32] := a[31:0]
+MEM[mem_addr+95:mem_addr+64] := a[31:0]
+MEM[mem_addr+127:mem_addr+96] := a[31:0]
+ </operation>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_store_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVAPS" form="m128, xmm" xed="MOVAPS_MEMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_storeu_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVUPS" form="m128, xmm" xed="MOVUPS_MEMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_storer_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="float*" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Store 4 single-precision (32-bit) floating-point elements from "a" into memory in reverse order.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[127:96]
+MEM[mem_addr+63:mem_addr+32] := a[95:64]
+MEM[mem_addr+95:mem_addr+64] := a[63:32]
+MEM[mem_addr+127:mem_addr+96] := a[31:0]
+ </operation>
+ <instruction name="MOVUPS" form="m128, xmm" xed="MOVUPS_MEMps_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_move_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := b[31:0]
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="MOVSS" form="xmm, xmm" xed="MOVSS_XMMss_XMMss_0F10"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_shuffle_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="unsigned int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+ </operation>
+ <instruction name="SHUFPS" form="xmm, xmm, imm8" xed="SHUFPS_XMMps_XMMps_IMMb"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
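+<!-- Editor's note: imm8 is usually built with the _MM_SHUFFLE(z,y,x,w) macro
+     from xmmintrin.h, where w and x select the two low lanes of dst from "a"
+     and y and z select the two high lanes from "b". A sketch (assuming SSE;
+     broadcast_lane0 is a hypothetical name):
+
+       #include <xmmintrin.h>
+
+       static __m128 broadcast_lane0(__m128 a) {
+           /* all four selectors read lane 0 of a */
+           return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
+       }
+-->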
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_unpackhi_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="UNPCKHPS" form="xmm, xmm" xed="UNPCKHPS_XMMps_XMMdq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_unpacklo_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="UNPCKLPS" form="xmm, xmm" xed="UNPCKLPS_XMMps_XMMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_movehl_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst".</description>
+ <operation>
+dst[31:0] := b[95:64]
+dst[63:32] := b[127:96]
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+ </operation>
+ <instruction name="MOVHLPS" form="xmm, xmm" xed="MOVHLPS_XMMq_XMMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_movelh_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := a[63:32]
+dst[95:64] := b[31:0]
+dst[127:96] := b[63:32]
+ </operation>
+ <instruction name="MOVLHPS" form="xmm, xmm" xed="MOVLHPS_XMMq_XMMq"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
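+<!-- Editor's note: MOVHLPS/MOVLHPS are the classic building blocks for
+     horizontal reductions in SSE1-only code. A sketch of a horizontal sum
+     (assuming SSE; _mm_add_ps and _mm_add_ss are the packed and scalar adds
+     defined earlier in this file, and hsum_ps is a hypothetical name):
+
+       #include <xmmintrin.h>
+
+       static float hsum_ps(__m128 v) {
+           __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); /* swap lanes within each pair */
+           __m128 sums = _mm_add_ps(v, shuf);  /* lanes now hold 0+1, 1+0, 2+3, 3+2 */
+           shuf = _mm_movehl_ps(shuf, sums);   /* bring the 2+3 partial down to lane 0 */
+           sums = _mm_add_ss(sums, shuf);      /* lane 0 = (0+1)+(2+3) */
+           return _mm_cvtss_f32(sums);
+       }
+-->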
+<intrinsic tech="SSE" vexEq="TRUE" name="_mm_movemask_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Miscellaneous</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF a[i+31]
+ dst[j] := 1
+ ELSE
+ dst[j] := 0
+ FI
+ENDFOR
+dst[MAX:4] := 0
+ </operation>
+ <instruction name="MOVMSKPS" form="r32, xmm" xed="MOVMSKPS_GPR32_XMMps"/>
+ <header>xmmintrin.h</header>
+</intrinsic>
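+<!-- Editor's note: movemask collapses a compare mask into four GPR bits,
+     which makes it the usual way to branch on vector conditions. A sketch
+     (assuming SSE; _mm_cmplt_ps is the packed less-than compare from this
+     file's SSE compare section, and all_less is a hypothetical name):
+
+       #include <xmmintrin.h>
+
+       static int all_less(__m128 a, __m128 b) {
+           int mask = _mm_movemask_ps(_mm_cmplt_ps(a, b));
+           return mask == 0xF;  /* every lane of a was below its lane of b */
+       }
+-->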
+<intrinsic tech="SSE" name="_mm_malloc">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void*"/>
+ <parameter type="size_t" varname="size" etype="UI64"/>
+ <parameter type="size_t" varname="align" etype="UI64"/>
+ <description>Allocate "size" bytes of memory, aligned to the alignment specified in "align", and return a pointer to the allocated memory. "_mm_free" should be used to free memory that is allocated with "_mm_malloc".</description>
+ <header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_free">
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <description>Free aligned memory that was allocated with "_mm_malloc".</description>
+ <header>xmmintrin.h</header>
+</intrinsic>
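+<!-- Editor's note: _mm_malloc pairs only with _mm_free, never with free().
+     A sketch (assuming SSE; demo is a hypothetical name) allocating a
+     16-byte-aligned buffer suitable for the aligned load/store intrinsics
+     above:
+
+       #include <xmmintrin.h>
+
+       static void demo(void) {
+           float *buf = (float *)_mm_malloc(64 * sizeof(float), 16);
+           if (buf) {
+               _mm_store_ps(buf, _mm_setzero_ps());  /* aligned store is now legal */
+               _mm_free(buf);                        /* must not be passed to free() */
+           }
+       }
+-->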
+<intrinsic tech="SSE" name="_mm_undefined_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>General Support</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m128 with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_acos_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ACOS(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_acos_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ACOS(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_acosh_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ACOSH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_acosh_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ACOSH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_asin_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ASIN(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_asin_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ASIN(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_asinh_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ASINH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_asinh_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ASINH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_atan_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ATAN(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_atan_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ATAN(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_atan2_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_atan2_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_atanh_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ATANH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_atanh_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ATANH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cbrt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := CubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cbrt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := CubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cdfnorm_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := CDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cdfnorm_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := CDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cdfnorminv_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := InverseCDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cdfnorminv_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := InverseCDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cexp_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]".</description>
+ <operation>
+DEFINE CEXP(a[31:0], b[31:0]) {
+ result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0])
+ result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0])
+ RETURN result
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_clog_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]".</description>
+ <operation>
+DEFINE CLOG(a[31:0], b[31:0]) {
+ result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0)))
+ result[63:32] := ATAN2(b, a)
+ RETURN result
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
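+<!-- Editor's note: the SVML complex intrinsics above (and _mm_csqrt_ps below)
+     pack two complex numbers per vector as {re0, im0, re1, im1}. These are
+     compiler-provided sequences rather than single instructions, and are
+     typically only available with Intel's compilers. A layout sketch
+     (assuming an SVML-capable toolchain; demo is a hypothetical name):
+
+       #include <immintrin.h>
+
+       static __m128 demo(void) {
+           /* two complex inputs: 1+2i in the low lanes, 3+4i in the high lanes */
+           __m128 z = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+           return _mm_clog_ps(z);  /* each lane pair holds ln|z| and arg(z) for one input */
+       }
+-->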
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cos_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cos_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cosd_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := COSD(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cosd_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := COSD(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cosh_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := COSH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_cosh_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := COSH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
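+<!--
+  Usage sketch for the cosine entries above (illustrative; assumes a compiler
+  that declares Intel's SVML intrinsics, such as ICX; GCC and Clang do not
+  ship these sequences by default). The _pd/_ps pairs differ only in lane
+  count, and the "d"-suffixed variants take degrees instead of radians.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      /* _mm_set_ps lists lanes from high to low, so lane 0 holds 0.0f. */
+      __m128 rad = _mm_set_ps(3.14159265f, 1.57079633f, 0.78539816f, 0.0f);
+      __m128 deg = _mm_set_ps(180.0f, 90.0f, 45.0f, 0.0f);
+      float r[4], d[4];
+      _mm_storeu_ps(r, _mm_cos_ps(rad));
+      _mm_storeu_ps(d, _mm_cosd_ps(deg));
+      for (int j = 0; j < 4; j++)            /* the two rows should agree */
+          printf("cos: %f  cosd: %f\n", r[j], d[j]);
+      return 0;
+  }
+-->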
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_csqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]".</description>
+ <operation>
+DEFINE CSQRT(a[31:0], b[31:0]) {
+ sign[31:0] := (b &lt; 0.0) ? -FP32(1.0) : FP32(1.0)
+ result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0)
+ result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0)
+ RETURN result
+}
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
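+<!--
+  Usage sketch for the packed-complex helpers above (illustrative; assumes an
+  SVML-providing compiler such as Intel ICX). Each __m128 carries two complex
+  numbers, real part in the even lane and imaginary part in the odd lane, per
+  the "complex = vec.fp32[0] + i * vec.fp32[1]" convention.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      /* Two operands: (-1 + 0i) and (3 + 4i). */
+      float in[4] = { -1.0f, 0.0f, 3.0f, 4.0f };
+      float out[4];
+      _mm_storeu_ps(out, _mm_csqrt_ps(_mm_loadu_ps(in)));
+      /* Expected: (0 + 1i) and (2 + 1i), since (2 + 1i)^2 = 3 + 4i. */
+      printf("sqrt(-1+0i) = %g%+gi\n", out[0], out[1]);
+      printf("sqrt(3+4i)  = %g%+gi\n", out[2], out[3]);
+      return 0;
+  }
+-->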
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_div_epi8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 8*j
+ IF b[i+7:i] == 0
+ #DE
+ FI
+ dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_div_epi16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ IF b[i+15:i] == 0
+ #DE
+ FI
+ dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_div_epi32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF b[i+31:i] == 0
+ #DE
+ FI
+ dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_div_epi64">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ IF b[i+63:i] == 0
+ #DE
+ FI
+ dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_div_epu8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := 8*j
+ IF b[i+7:i] == 0
+ #DE
+ FI
+ dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_div_epu16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := 16*j
+ IF b[i+15:i] == 0
+ #DE
+ FI
+ dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_div_epu32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ IF b[i+31:i] == 0
+ #DE
+ FI
+ dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_div_epu64">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ IF b[i+63:i] == 0
+ #DE
+ FI
+ dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
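+<!--
+  Usage sketch for the packed integer divisions above (illustrative; assumes
+  an SVML-providing compiler). These are emulated sequences rather than single
+  instructions, and a zero lane in "b" raises #DE exactly like scalar
+  division, so callers must keep every divisor lane nonzero.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128i a = _mm_setr_epi32(7, -7, 100, 1);
+      __m128i b = _mm_setr_epi32(2,  2,  10, 3);
+      int q[4];
+      _mm_storeu_si128((__m128i *)q, _mm_div_epi32(a, b));
+      printf("%d %d %d %d\n", q[0], q[1], q[2], q[3]);  /* 3 -3 10 0 */
+      return 0;
+  }
+-->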
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_erf_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ERF(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_erf_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ERF(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_erfc_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := 1.0 - ERF(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_erfc_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := 1.0 - ERF(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_erfcinv_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := InverseERFC(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_erfcinv_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := InverseERFC(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_erfinv_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := InverseERF(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_erfinv_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := InverseERF(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
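+<!--
+  Usage sketch for the error-function entries above (illustrative; assumes an
+  SVML-providing compiler): erf and erfc should sum to 1 in every lane, which
+  makes a handy sanity check.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128 x = _mm_set_ps(2.0f, 1.0f, 0.5f, 0.0f);
+      float e[4], c[4];
+      _mm_storeu_ps(e, _mm_erf_ps(x));
+      _mm_storeu_ps(c, _mm_erfc_ps(x));
+      for (int j = 0; j < 4; j++)
+          printf("erf+erfc = %f\n", e[j] + c[j]);   /* ~1.0 in every lane */
+      return 0;
+  }
+-->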
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_exp_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := POW(e, a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_exp_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := POW(FP32(e), a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_exp10_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := POW(10.0, a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_exp10_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := POW(FP32(10.0), a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_exp2_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := POW(2.0, a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_exp2_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := POW(FP32(2.0), a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_expm1_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := POW(e, a[i+63:i]) - 1.0
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_expm1_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
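+<!--
+  Usage sketch contrasting exp and expm1 (illustrative; assumes an
+  SVML-providing compiler): for tiny x the subtraction in exp(x) - 1 cancels
+  to 0 in float, while the dedicated expm1 sequence keeps the leading term x.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128 x = _mm_set1_ps(1e-8f);
+      float naive[4], fused[4];
+      _mm_storeu_ps(naive, _mm_sub_ps(_mm_exp_ps(x), _mm_set1_ps(1.0f)));
+      _mm_storeu_ps(fused, _mm_expm1_ps(x));
+      /* prints 0 vs ~1e-08 */
+      printf("exp(x)-1 = %g, expm1(x) = %g\n", naive[0], fused[0]);
+      return 0;
+  }
+-->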
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_hypot_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0))
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_hypot_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0))
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
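+<!--
+  Usage sketch for _mm_hypot_ps (illustrative; assumes an SVML-providing
+  compiler): four Pythagorean triples, one per lane.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128 a = _mm_set_ps(5.0f, 8.0f, 6.0f, 3.0f);
+      __m128 b = _mm_set_ps(12.0f, 15.0f, 8.0f, 4.0f);
+      float h[4];
+      _mm_storeu_ps(h, _mm_hypot_ps(a, b));
+      printf("%g %g %g %g\n", h[0], h[1], h[2], h[3]);  /* 5 10 17 13 */
+      return 0;
+  }
+-->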
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_idiv_epi32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_idivrem_epi32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i *" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed signed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr".</description>
+ <operation>FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
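+<!--
+  Usage sketch for _mm_idivrem_epi32 (illustrative; assumes an SVML-providing
+  compiler): the quotients come back as the return value and the remainders
+  are written through the first argument, so one call replaces a separate
+  division and remainder pair.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128i a = _mm_setr_epi32(7, 9, -7, 22);
+      __m128i b = _mm_setr_epi32(3, 4,  3,  5);
+      __m128i rem;
+      __m128i quo = _mm_idivrem_epi32(&rem, a, b);
+      int q[4], r[4];
+      _mm_storeu_si128((__m128i *)q, quo);
+      _mm_storeu_si128((__m128i *)r, rem);
+      for (int j = 0; j < 4; j++)
+          printf("%d rem %d\n", q[j], r[j]);  /* 2 r 1, 2 r 1, -2 r -1, 4 r 2 */
+      return 0;
+  }
+-->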
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_invcbrt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := InvCubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_invcbrt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := InvCubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_invsqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := InvSQRT(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_invsqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := InvSQRT(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_irem_epi32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_log_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_log_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_log10_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_log10_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_log1p_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := LOG(1.0 + a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_log1p_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := LOG(1.0 + a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_log2_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_log2_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0)
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_logb_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_logb_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
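+<!--
+  Usage sketch for the logarithm entries above (illustrative; assumes an
+  SVML-providing compiler): on exact powers of two, log2 returns the exponent
+  and logb returns the same value as a floating-point integer.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128 x = _mm_set_ps(1024.0f, 8.0f, 2.0f, 1.0f);
+      float l2[4], lb[4];
+      _mm_storeu_ps(l2, _mm_log2_ps(x));
+      _mm_storeu_ps(lb, _mm_logb_ps(x));
+      for (int j = 0; j < 4; j++)
+          printf("log2 = %g, logb = %g\n", l2[j], lb[j]);  /* 0 1 3 10 */
+      return 0;
+  }
+-->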
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_pow_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := POW(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_pow_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := POW(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_rem_epi8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := 8*j
+ dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_rem_epi16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 16*j
+ dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_rem_epi32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_rem_epi64">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := 64*j
+ dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_rem_epu8">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 15
+ i := 8*j
+ dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_rem_epu16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 7
+ i := 16*j
+ dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_rem_epu32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_rem_epu64">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := 64*j
+ dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_sin_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SIN(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_sin_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := SIN(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_sincos_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d *" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SIN(a[i+63:i])
+ MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_sincos_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128 *" varname="mem_addr" etype="FP32" memwidth="128"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := SIN(a[i+31:i])
+ MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
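+<!--
+  Usage sketch for _mm_sincos_ps (illustrative; assumes an SVML-providing
+  compiler): the sine is the return value and the cosine lands in memory, so
+  one sequence replaces separate _mm_sin_ps and _mm_cos_ps calls when both
+  results are needed.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128 x = _mm_set_ps(3.14159265f, 1.57079633f, 0.78539816f, 0.0f);
+      __m128 c;
+      __m128 s = _mm_sincos_ps(&c, x);
+      float sv[4], cv[4];
+      _mm_storeu_ps(sv, s);
+      _mm_storeu_ps(cv, c);
+      for (int j = 0; j < 4; j++)
+          printf("sin = %f, cos = %f\n", sv[j], cv[j]);
+      return 0;
+  }
+-->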
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_sind_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SIND(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_sind_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := SIND(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_sinh_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SINH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_sinh_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := SINH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_svml_ceil_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_svml_ceil_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_svml_floor_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_svml_floor_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_svml_round_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ROUND(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_svml_round_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ROUND(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
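+<!--
+  Usage sketch for the svml_ceil/svml_floor/svml_round entries above
+  (illustrative; assumes an SVML-providing compiler). As the descriptions
+  note, these may compile down to a single roundps/vroundps instruction.
+
+  #include <immintrin.h>
+  #include <stdio.h>
+
+  int main(void) {
+      __m128 x = _mm_set_ps(-1.5f, 2.5f, -0.3f, 1.7f);
+      float xv[4], up[4], dn[4], nr[4];
+      _mm_storeu_ps(xv, x);
+      _mm_storeu_ps(up, _mm_svml_ceil_ps(x));
+      _mm_storeu_ps(dn, _mm_svml_floor_ps(x));
+      _mm_storeu_ps(nr, _mm_svml_round_ps(x));
+      for (int j = 0; j < 4; j++)
+          printf("x = %+.1f  ceil = %+g  floor = %+g  round = %+g\n",
+                 xv[j], up[j], dn[j], nr[j]);
+      return 0;
+  }
+-->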
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_svml_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_svml_sqrt_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_tan_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := TAN(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_tan_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := TAN(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_tand_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := TAND(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_tand_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := TAND(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_tanh_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := TANH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_tanh_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Trigonometry</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := TANH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_trunc_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+ <operation>FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := TRUNCATE(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_trunc_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+ <operation>FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := TRUNCATE(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_udiv_epi32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_udivrem_epi32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i *" varname="mem_addr" etype="UI32" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr".</description>
+ <operation>FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" sequence="TRUE" name="_mm_urem_epi32">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+ <operation>FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_storeu_si16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI16" memwidth="16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Store 16-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+15:mem_addr] := a[15:0]
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_loadu_si64">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI64" memwidth="64"/>
+ <description>Load unaligned 64-bit integer from memory into the first element of "dst".</description>
+ <operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="MOVQ" form="xmm, m64" xed="MOVQ_XMMdq_MEMq_0F6E"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" name="_mm_storeu_si64">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI64" memwidth="64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Store 64-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+ </operation>
+ <instruction name="MOVQ" form="m64, xmm" xed="MOVQ_MEMq_XMMq_0F7E"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE" sequence="TRUE" name="_mm_loadu_si16">
+ <type>Integer</type>
+ <CPUID>SSE</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI16" memwidth="16"/>
+ <description>Load unaligned 16-bit integer from memory into the first element of "dst".</description>
+ <operation>
+dst[15:0] := MEM[mem_addr+15:mem_addr]
+dst[MAX:16] := 0
+ </operation>
+ <header>immintrin.h</header>
+</intrinsic>
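+<!--
+  Usage sketch for the narrow unaligned moves above (illustrative): they
+  shuttle a single scalar between memory and lane 0 with no alignment
+  requirement, which is handy for loop tails.
+
+  #include <immintrin.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  int main(void) {
+      uint16_t src = 40000, out16 = 0;
+      __m128i v = _mm_loadu_si16(&src);           /* dst[15:0] = src, rest 0 */
+      v = _mm_add_epi16(v, _mm_set1_epi16(100));
+      _mm_storeu_si16(&out16, v);                 /* only a[15:0] is written */
+      printf("%u\n", (unsigned)out16);            /* 40100 */
+      return 0;
+  }
+-->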
+<intrinsic tech="SSE2" name="_mm_undefined_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>General Support</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m128d with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_undefined_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>General Support</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m128i with undefined elements.</description>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_loadu_si32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="void const*" varname="mem_addr" etype="UI32" memwidth="32"/>
+ <description>Load unaligned 32-bit integer from memory into the first element of "dst".</description>
+ <operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[MAX:32] := 0
+ </operation>
+ <instruction name="MOVD" form="xmm, m32" xed="MOVD_XMMdq_MEMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_storeu_si32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="void*" varname="mem_addr" etype="UI32" memwidth="32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+ </operation>
+ <instruction name="MOVD" form="m32, xmm" xed="MOVD_MEMd_XMMd"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_pause">
+ <CPUID>SSE2</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops.</description>
+ <instruction name="PAUSE" xed="PAUSE"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_clflush">
+ <CPUID>SSE2</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void const*" varname="p"/>
+ <description>Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy.</description>
+ <instruction name="CLFLUSH" form="m8" xed="CLFLUSH_MEMmprefetch"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_lfence">
+ <CPUID>SSE2</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally visible before any load instruction which follows the fence in program order.</description>
+ <instruction name="LFENCE" xed="LFENCE"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_mfence">
+ <CPUID>SSE2</CPUID>
+ <category>General Support</category>
+ <return type="void"/>
+ <parameter type="void"/>
+ <description>Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order.</description>
+ <instruction name="MFENCE" xed="MFENCE"/>
+ <header>emmintrin.h</header>
+</intrinsic>
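+<!-- Editor's note (not part of the Intel data): one common shape for the spin-wait
+     hint above; a sketch only, real code would also use atomics for the flag.
+#include <emmintrin.h>
+
+void spin_until_set(volatile int *flag) {
+    while (*flag == 0)
+        _mm_pause();  /* tells the core this is a spin-wait loop */
+    _mm_lfence();     /* keep later loads ordered after the flag read */
+}
+-->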
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_add_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDB" form="xmm, xmm" xed="PADDB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_add_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDW" form="xmm, xmm" xed="PADDW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_add_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDD" form="xmm, xmm" xed="PADDD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_add_si64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Add 64-bit integers "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := a[63:0] + b[63:0]
+ </operation>
+ <instruction name="PADDQ" form="mm, mm" xed="PADDQ_MMXq_MMXq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_add_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Add packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="PADDQ" form="xmm, xmm" xed="PADDQ_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
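+<!-- Editor's note (not part of the Intel data): the packed adds above wrap modulo
+     the lane width; a tiny sketch making the wraparound visible.
+#include <emmintrin.h>
+
+__m128i wrap_demo(void) {
+    __m128i a = _mm_set1_epi8((char)200);
+    __m128i b = _mm_set1_epi8(100);
+    return _mm_add_epi8(a, b);  /* every byte lane holds 44, i.e. 300 mod 256 */
+}
+-->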
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_adds_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDSB" form="xmm, xmm" xed="PADDSB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_adds_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDSW" form="xmm, xmm" xed="PADDSW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_adds_epu8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDUSB" form="xmm, xmm" xed="PADDUSB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_adds_epu16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+ </operation>
+ <instruction name="PADDUSW" form="xmm, xmm" xed="PADDUSW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
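+<!-- Editor's note (not part of the Intel data): in contrast with _mm_add_epi8 above,
+     the saturating forms clamp instead of wrapping; sketch assumes SSE2.
+#include <emmintrin.h>
+
+__m128i saturate_demo(void) {
+    __m128i a = _mm_set1_epi8((char)250);
+    __m128i b = _mm_set1_epi8(10);
+    return _mm_adds_epu8(a, b);  /* every byte lane clamps to 255, not 4 */
+}
+-->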
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_avg_epu8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+ </operation>
+ <instruction name="PAVGB" form="xmm, xmm" xed="PAVGB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_avg_epu16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Probability/Statistics</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+ </operation>
+ <instruction name="PAVGW" form="xmm, xmm" xed="PAVGW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_madd_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMADDWD" form="xmm, xmm" xed="PMADDWD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
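+<!-- Editor's note (not part of the Intel data): PMADDWD is the classic building
+     block for int16 dot products; a sketch reducing its four partial sums using
+     only intrinsics defined in this file.
+#include <emmintrin.h>
+
+int dot8_i16(__m128i a, __m128i b) {
+    __m128i s = _mm_madd_epi16(a, b);           /* 4 x int32 pairwise sums */
+    s = _mm_add_epi32(s, _mm_srli_si128(s, 8)); /* fold high half into low */
+    s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
+    return _mm_cvtsi128_si32(s);                /* total of all 8 products */
+}
+-->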
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_max_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXSW" form="xmm, xmm" xed="PMAXSW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_max_epu8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXUB" form="xmm, xmm" xed="PMAXUB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_min_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINSW" form="xmm, xmm" xed="PMINSW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_min_epu8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINUB" form="xmm, xmm" xed="PMINUB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
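+<!-- Editor's note (not part of the Intel data): min/max compose into a branchless
+     clamp; a sketch for unsigned bytes using the two intrinsics above.
+#include <emmintrin.h>
+
+__m128i clamp_u8(__m128i v, __m128i lo, __m128i hi) {
+    return _mm_max_epu8(lo, _mm_min_epu8(v, hi));  /* per-lane clamp to [lo, hi] */
+}
+-->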
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_mulhi_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+ </operation>
+ <instruction name="PMULHW" form="xmm, xmm" xed="PMULHW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_mulhi_epu16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ tmp[31:0] := a[i+15:i] * b[i+15:i]
+ dst[i+15:i] := tmp[31:16]
+ENDFOR
+ </operation>
+ <instruction name="PMULHUW" form="xmm, xmm" xed="PMULHUW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_mullo_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])
+ dst[i+15:i] := tmp[15:0]
+ENDFOR
+ </operation>
+ <instruction name="PMULLW" form="xmm, xmm" xed="PMULLW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
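+<!-- Editor's note (not part of the Intel data): mulhi/mullo together recover the
+     full 32-bit products; _mm_unpacklo_epi16/_mm_unpackhi_epi16 are standard SSE2
+     interleaves assumed here, defined elsewhere in this file.
+#include <emmintrin.h>
+
+void mul_widen_i16(__m128i a, __m128i b, __m128i *lo4, __m128i *hi4) {
+    __m128i lo = _mm_mullo_epi16(a, b);
+    __m128i hi = _mm_mulhi_epi16(a, b);
+    *lo4 = _mm_unpacklo_epi16(lo, hi);  /* 32-bit products of lanes 0..3 */
+    *hi4 = _mm_unpackhi_epi16(lo, hi);  /* 32-bit products of lanes 4..7 */
+}
+-->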
+<intrinsic tech="SSE2" name="_mm_mul_su32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI32"/>
+ <parameter type="__m64" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from "a" and "b", and store the unsigned 64-bit result in "dst".</description>
+ <operation>
+dst[63:0] := a[31:0] * b[31:0]
+ </operation>
+ <instruction name="PMULUDQ" form="mm, mm" xed="PMULUDQ_MMXq_MMXq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_mul_epu32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="PMULUDQ" form="xmm, xmm" xed="PMULUDQ_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sad_epu8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+FOR j := 0 to 1
+ i := j*64
+ dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \
+ tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
+ dst[i+63:i+16] := 0
+ENDFOR
+ </operation>
+ <instruction name="PSADBW" form="xmm, xmm" xed="PSADBW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
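+<!-- Editor's note (not part of the Intel data): PSADBW is widely used for motion
+     estimation and byte checksums; a sketch reducing both 16-bit sums.
+#include <emmintrin.h>
+
+unsigned sad16(__m128i a, __m128i b) {
+    __m128i s = _mm_sad_epu8(a, b);  /* sums land in bits 15:0 and 79:64 */
+    return (unsigned)_mm_cvtsi128_si32(s)
+         + (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(s, 8));
+}
+-->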
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sub_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBB" form="xmm, xmm" xed="PSUBB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sub_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBW" form="xmm, xmm" xed="PSUBW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sub_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBD" form="xmm, xmm" xed="PSUBD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sub_si64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <parameter type="__m64" varname="b" etype="UI64"/>
+ <description>Subtract 64-bit integer "b" from 64-bit integer "a", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := a[63:0] - b[63:0]
+ </operation>
+ <instruction name="PSUBQ" form="mm, mm" xed="PSUBQ_MMXq_MMXq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sub_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="PSUBQ" form="xmm, xmm" xed="PSUBQ_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_subs_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBSB" form="xmm, xmm" xed="PSUBSB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_subs_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBSW" form="xmm, xmm" xed="PSUBSW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_subs_epu8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBUSB" form="xmm, xmm" xed="PSUBUSB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_subs_epu16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PSUBUSW" form="xmm, xmm" xed="PSUBUSW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
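+<!-- Editor's note (not part of the Intel data): unsigned saturating subtraction
+     zeroes the negative side, giving a classic absolute-difference idiom.
+#include <emmintrin.h>
+
+__m128i absdiff_u8(__m128i a, __m128i b) {
+    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+-->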
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_slli_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+ </operation>
+ <instruction name="PSLLDQ" form="xmm, imm8" xed="PSLLDQ_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_bslli_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+ </operation>
+ <instruction name="PSLLDQ" form="xmm, imm8" xed="PSLLDQ_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_bsrli_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+ </operation>
+ <instruction name="PSRLDQ" form="xmm, imm8" xed="PSRLDQ_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
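+<!-- Editor's note (not part of the Intel data): the byte shifts take an immediate,
+     so the count must be a compile-time constant; combining both directions gives
+     a whole-register byte rotate, sketched here for a fixed count of 4.
+#include <emmintrin.h>
+
+__m128i ror_bytes_4(__m128i v) {
+    return _mm_or_si128(_mm_bsrli_si128(v, 4), _mm_bslli_si128(v, 12));
+}
+-->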
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_slli_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLW" form="xmm, imm8" xed="PSLLW_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sll_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLW" form="xmm, xmm" xed="PSLLW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_slli_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLD" form="xmm, imm8" xed="PSLLD_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sll_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLD" form="xmm, xmm" xed="PSLLD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_slli_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLQ" form="xmm, imm8" xed="PSLLQ_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sll_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &lt;&lt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSLLQ" form="xmm, xmm" xed="PSLLQ_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srai_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAW" form="xmm, imm8" xed="PSRAW_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sra_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+ ELSE
+ dst[i+15:i] := SignExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAW" form="xmm, xmm" xed="PSRAW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srai_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAD" form="xmm, imm8" xed="PSRAD_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sra_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+ ELSE
+ dst[i+31:i] := SignExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRAD" form="xmm, xmm" xed="PSRAD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srli_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+ tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+ </operation>
+ <instruction name="PSRLDQ" form="xmm, imm8" xed="PSRLDQ_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srli_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF imm8[7:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLW" form="xmm, imm8" xed="PSRLW_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srl_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="count" etype="UI16"/>
+ <description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF count[63:0] &gt; 15
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := ZeroExtend16(a[i+15:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLW" form="xmm, xmm" xed="PSRLW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srli_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF imm8[7:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLD" form="xmm, imm8" xed="PSRLD_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srl_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="count" etype="UI32"/>
+ <description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF count[63:0] &gt; 31
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := ZeroExtend32(a[i+31:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLD" form="xmm, xmm" xed="PSRLD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srli_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF imm8[7:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; imm8[7:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLQ" form="xmm, imm8" xed="PSRLQ_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_srl_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Shift</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="count" etype="UI64"/>
+ <description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF count[63:0] &gt; 63
+ dst[i+63:i] := 0
+ ELSE
+ dst[i+63:i] := ZeroExtend64(a[i+63:i] &gt;&gt; count[63:0])
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSRLQ" form="xmm, xmm" xed="PSRLQ_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_and_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[127:0] := (a[127:0] AND b[127:0])
+ </operation>
+ <instruction name="PAND" form="xmm, xmm" xed="PAND_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_andnot_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <description>Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+ <operation>
+dst[127:0] := ((NOT a[127:0]) AND b[127:0])
+ </operation>
+ <instruction name="PANDN" form="xmm, xmm" xed="PANDN_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_or_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <description>Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[127:0] := (a[127:0] OR b[127:0])
+ </operation>
+ <instruction name="POR" form="xmm, xmm" xed="POR_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_xor_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Logical</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <description>Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+ <operation>
+dst[127:0] := (a[127:0] XOR b[127:0])
+ </operation>
+ <instruction name="PXOR" form="xmm, xmm" xed="PXOR_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
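+<!-- Editor's note (not part of the Intel data): AND/ANDNOT/OR compose into the
+     branchless bitwise-select idiom that the compare intrinsics below feed.
+#include <emmintrin.h>
+
+__m128i select128(__m128i mask, __m128i a, __m128i b) {
+    /* per bit: mask ? a : b */
+    return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
+}
+-->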
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpeq_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQB" form="xmm, xmm" xed="PCMPEQB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpeq_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQW" form="xmm, xmm" xed="PCMPEQW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpeq_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQD" form="xmm, xmm" xed="PCMPEQD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpgt_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTB" form="xmm, xmm" xed="PCMPGTB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpgt_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTW" form="xmm, xmm" xed="PCMPGTW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpgt_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTD" form="xmm, xmm" xed="PCMPGTD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
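+<!-- Editor's note (not part of the Intel data): the all-ones/all-zeros masks from
+     these compares pair with the select idiom above; e.g. SSE2 has no _mm_max_epi32,
+     but it can be sketched as:
+#include <emmintrin.h>
+
+__m128i max_epi32_sse2(__m128i a, __m128i b) {
+    __m128i gt = _mm_cmpgt_epi32(a, b);  /* 0xFFFFFFFF where a > b, else 0 */
+    return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
+}
+-->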
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmplt_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := ( a[i+7:i] &lt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTB" form="xmm, xmm" xed="PCMPGTB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmplt_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := ( a[i+15:i] &lt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTW" form="xmm, xmm" xed="PCMPGTW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmplt_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ( a[i+31:i] &lt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTD" form="xmm, xmm" xed="PCMPGTD_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtepi32_pd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTDQ2PD" form="xmm, xmm" xed="CVTDQ2PD_XMMpd_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi32_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="b" etype="SI32"/>
+ <description>Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := Convert_Int32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="CVTSI2SD" form="xmm, r32" xed="CVTSI2SD_XMMsd_GPR32d"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi64_sd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <description>Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="CVTSI2SD" form="xmm, r64" xed="CVTSI2SD_XMMsd_GPR64q"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi64x_sd">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__int64" varname="b" etype="SI64"/>
+ <description>Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="CVTSI2SD" form="xmm, r64" xed="CVTSI2SD_XMMsd_GPR64q"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtepi32_ps">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTDQ2PS" form="xmm, xmm" xed="CVTDQ2PS_XMMps_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
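+<!-- Editor's note (not part of the Intel data): CVTDQ2PS rounds per MXCSR; a common
+     use is fixed-point to float conversion, sketched here for Q16.16 lanes
+     (_mm_mul_ps/_mm_set1_ps are SSE intrinsics assumed from xmmintrin.h).
+#include <emmintrin.h>
+
+__m128 q16_16_to_float(__m128i v) {
+    return _mm_mul_ps(_mm_cvtepi32_ps(v), _mm_set1_ps(1.0f / 65536.0f));
+}
+-->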
+<intrinsic tech="SSE2" name="_mm_cvtpi32_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ m := j*64
+ dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTPI2PD" form="xmm, mm" xed="CVTPI2PD_XMMpd_MMXq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi32_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Copy 32-bit integer "a" to the lower element of "dst", and zero the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[127:32] := 0
+ </operation>
+ <instruction name="MOVD" form="xmm, r32" xed="MOVD_XMMdq_GPR32"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi64_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+ </operation>
+ <instruction name="MOVQ" form="xmm, r64" xed="MOVQ_XMMdq_GPR64"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi64x_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+ </operation>
+ <instruction name="MOVQ" form="xmm, r64" xed="MOVQ_XMMdq_GPR64"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi128_si32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Copy the lower 32-bit integer in "a" to "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+ </operation>
+ <instruction name="MOVD" form="r32, xmm" xed="MOVD_GPR32_XMMd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi128_si64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Copy the lower 64-bit integer in "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="MOVQ" form="r64, xmm" xed="MOVQ_GPR64_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsi128_si64x">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Copy the lower 64-bit integer in "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="MOVQ" form="r64, xmm" xed="MOVQ_GPR64_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m64" varname="e1" etype="UI64"/>
+ <parameter type="__m64" varname="e0" etype="UI64"/>
+ <description>Set packed 64-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[63:0] := e0
+dst[127:64] := e1
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set_epi64x">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="e1" etype="UI64"/>
+ <parameter type="__int64" varname="e0" etype="UI64"/>
+ <description>Set packed 64-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[63:0] := e0
+dst[127:64] := e1
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="e3" etype="UI32"/>
+ <parameter type="int" varname="e2" etype="UI32"/>
+ <parameter type="int" varname="e1" etype="UI32"/>
+ <parameter type="int" varname="e0" etype="UI32"/>
+ <description>Set packed 32-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="short" varname="e7" etype="UI16"/>
+ <parameter type="short" varname="e6" etype="UI16"/>
+ <parameter type="short" varname="e5" etype="UI16"/>
+ <parameter type="short" varname="e4" etype="UI16"/>
+ <parameter type="short" varname="e3" etype="UI16"/>
+ <parameter type="short" varname="e2" etype="UI16"/>
+ <parameter type="short" varname="e1" etype="UI16"/>
+ <parameter type="short" varname="e0" etype="UI16"/>
+ <description>Set packed 16-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[15:0] := e0
+dst[31:16] := e1
+dst[47:32] := e2
+dst[63:48] := e3
+dst[79:64] := e4
+dst[95:80] := e5
+dst[111:96] := e6
+dst[127:112] := e7
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="char" varname="e15" etype="UI8"/>
+ <parameter type="char" varname="e14" etype="UI8"/>
+ <parameter type="char" varname="e13" etype="UI8"/>
+ <parameter type="char" varname="e12" etype="UI8"/>
+ <parameter type="char" varname="e11" etype="UI8"/>
+ <parameter type="char" varname="e10" etype="UI8"/>
+ <parameter type="char" varname="e9" etype="UI8"/>
+ <parameter type="char" varname="e8" etype="UI8"/>
+ <parameter type="char" varname="e7" etype="UI8"/>
+ <parameter type="char" varname="e6" etype="UI8"/>
+ <parameter type="char" varname="e5" etype="UI8"/>
+ <parameter type="char" varname="e4" etype="UI8"/>
+ <parameter type="char" varname="e3" etype="UI8"/>
+ <parameter type="char" varname="e2" etype="UI8"/>
+ <parameter type="char" varname="e1" etype="UI8"/>
+ <parameter type="char" varname="e0" etype="UI8"/>
+ <description>Set packed 8-bit integers in "dst" with the supplied values.</description>
+ <operation>
+dst[7:0] := e0
+dst[15:8] := e1
+dst[23:16] := e2
+dst[31:24] := e3
+dst[39:32] := e4
+dst[47:40] := e5
+dst[55:48] := e6
+dst[63:56] := e7
+dst[71:64] := e8
+dst[79:72] := e9
+dst[87:80] := e10
+dst[95:88] := e11
+dst[103:96] := e12
+dst[111:104] := e13
+dst[119:112] := e14
+dst[127:120] := e15
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set1_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set1_epi64x">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastq".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set1_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := a[31:0]
+ENDFOR
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set1_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="short" varname="a" etype="UI16"/>
+ <description>Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastw".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := a[15:0]
+ENDFOR
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set1_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="char" varname="a" etype="UI8"/>
+ <description>Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := a[7:0]
+ENDFOR
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
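+<!-- Usage sketch (illustrative C): the set1 family broadcasts one scalar,
+     the usual way to materialize constants for lanewise arithmetic.
+       __m128i ones  = _mm_set1_epi8(1);   /* sixteen 0x01 bytes */
+       __m128i allon = _mm_set1_epi32(-1); /* every bit set in all four lanes */
+-->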
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_setr_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m64" varname="e1" etype="UI64"/>
+ <parameter type="__m64" varname="e0" etype="UI64"/>
+ <description>Set packed 64-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[63:0] := e1
+dst[127:64] := e0
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_setr_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="int" varname="e3" etype="UI32"/>
+ <parameter type="int" varname="e2" etype="UI32"/>
+ <parameter type="int" varname="e1" etype="UI32"/>
+ <parameter type="int" varname="e0" etype="UI32"/>
+ <description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[31:0] := e3
+dst[63:32] := e2
+dst[95:64] := e1
+dst[127:96] := e0
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
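+<!-- The "setr" variants take their arguments in memory (low-to-high) order,
+     so _mm_setr_epi32(a, b, c, d) is equivalent to _mm_set_epi32(d, c, b, a).
+     Illustrative C:
+       __m128i x = _mm_setr_epi32(0, 1, 2, 3); /* == _mm_set_epi32(3, 2, 1, 0) */
+-->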
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_setr_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="short" varname="e7" etype="UI16"/>
+ <parameter type="short" varname="e6" etype="UI16"/>
+ <parameter type="short" varname="e5" etype="UI16"/>
+ <parameter type="short" varname="e4" etype="UI16"/>
+ <parameter type="short" varname="e3" etype="UI16"/>
+ <parameter type="short" varname="e2" etype="UI16"/>
+ <parameter type="short" varname="e1" etype="UI16"/>
+ <parameter type="short" varname="e0" etype="UI16"/>
+ <description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[15:0] := e7
+dst[31:16] := e6
+dst[47:32] := e5
+dst[63:48] := e4
+dst[79:64] := e3
+dst[95:80] := e2
+dst[111:96] := e1
+dst[127:112] := e0
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_setr_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="char" varname="e15" etype="UI8"/>
+ <parameter type="char" varname="e14" etype="UI8"/>
+ <parameter type="char" varname="e13" etype="UI8"/>
+ <parameter type="char" varname="e12" etype="UI8"/>
+ <parameter type="char" varname="e11" etype="UI8"/>
+ <parameter type="char" varname="e10" etype="UI8"/>
+ <parameter type="char" varname="e9" etype="UI8"/>
+ <parameter type="char" varname="e8" etype="UI8"/>
+ <parameter type="char" varname="e7" etype="UI8"/>
+ <parameter type="char" varname="e6" etype="UI8"/>
+ <parameter type="char" varname="e5" etype="UI8"/>
+ <parameter type="char" varname="e4" etype="UI8"/>
+ <parameter type="char" varname="e3" etype="UI8"/>
+ <parameter type="char" varname="e2" etype="UI8"/>
+ <parameter type="char" varname="e1" etype="UI8"/>
+ <parameter type="char" varname="e0" etype="UI8"/>
+ <description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[7:0] := e15
+dst[15:8] := e14
+dst[23:16] := e13
+dst[31:24] := e12
+dst[39:32] := e11
+dst[47:40] := e10
+dst[55:48] := e9
+dst[63:56] := e8
+dst[71:64] := e7
+dst[79:72] := e6
+dst[87:80] := e5
+dst[95:88] := e4
+dst[103:96] := e3
+dst[111:104] := e2
+dst[119:112] := e1
+dst[127:120] := e0
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_setzero_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <description>Return vector of type __m128i with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="PXOR" form="xmm, xmm" xed="PXOR_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_loadl_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i const*" varname="mem_addr" etype="UI64" memwidth="64"/>
+ <description>Load 64-bit integer from memory into the first element of "dst".</description>
+ <operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[MAX:64] := 0
+ </operation>
+ <instruction name="MOVQ" form="xmm, m64" xed="MOVQ_XMMdq_MEMq_0F7E"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_load_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i const*" varname="mem_addr" etype="M128" memwidth="128"/>
+ <description>Load 128-bits of integer data from memory into "dst".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+ </operation>
+ <instruction name="MOVDQA" form="xmm, m128" xed="MOVDQA_XMMdq_MEMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_loadu_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i const*" varname="mem_addr" etype="M128" memwidth="128"/>
+ <description>Load 128-bits of integer data from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+ </operation>
+ <instruction name="MOVDQU" form="xmm, m128" xed="MOVDQU_XMMdq_MEMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
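+<!-- Usage sketch (illustrative C; "buf" is hypothetical): the aligned and
+     unaligned loads differ only in the alignment contract.
+       int buf[4] = {1, 2, 3, 4};                          /* no alignment guarantee */
+       __m128i u = _mm_loadu_si128((const __m128i *)buf);  /* always valid */
+       /* _mm_load_si128 on the same pointer may fault unless buf is
+          16-byte aligned (e.g. declared with C11 _Alignas(16)) */
+-->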
+<intrinsic tech="SSE2" name="_mm_maskmoveu_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="mask" etype="UI8"/>
+ <parameter type="char*" varname="mem_addr" etype="UI8" memwidth="128"/>
+ <description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF mask[i+7]
+ MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="MASKMOVDQU" form="xmm, xmm" xed="MASKMOVDQU_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
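+<!-- Usage sketch (illustrative C): only bytes whose mask lane has its top
+     bit set are written; here the even-indexed bytes of "out" are stored.
+       __m128i data = _mm_set1_epi8(0x7F);
+       __m128i mask = _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0,
+                                    -1, 0, -1, 0, -1, 0, -1, 0);
+       char out[16] = {0};
+       _mm_maskmoveu_si128(data, mask, out);
+-->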
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_store_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m128i*" varname="mem_addr" etype="M128" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <description>Store 128-bits of integer data from "a" into memory.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVDQA" form="m128, xmm" xed="MOVDQA_MEMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_storeu_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m128i*" varname="mem_addr" etype="M128" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <description>Store 128-bits of integer data from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVDQU" form="m128, xmm" xed="MOVDQU_MEMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_storel_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m128i*" varname="mem_addr" etype="UI64" memwidth="64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Store 64-bit integer from the first element of "a" into memory.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+ </operation>
+ <instruction name="MOVQ" form="m64, xmm" xed="MOVQ_MEMq_XMMq_0F7E"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_stream_si128">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__m128i*" varname="mem_addr" etype="M128" memwidth="128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <description>Store 128-bits of integer data from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVNTDQ" form="m128, xmm" xed="MOVNTDQ_MEMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_stream_si32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="int*" varname="mem_addr" etype="UI32" memwidth="32"/>
+ <parameter type="int" varname="a" etype="UI32"/>
+ <description>Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated.</description>
+ <operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+ </operation>
+ <instruction name="MOVNTI" form="m32, r32" xed="MOVNTI_MEMd_GPR32"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_stream_si64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="__int64*" varname="mem_addr" etype="UI64" memwidth="64"/>
+ <parameter type="__int64" varname="a" etype="UI64"/>
+ <description>Store 64-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+ </operation>
+ <instruction name="MOVNTI" form="m64, r64" xed="MOVNTI_MEMq_GPR64"/>
+ <header>emmintrin.h</header>
+</intrinsic>
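+<!-- Usage note (illustrative C; "counter" is a hypothetical int): non-temporal
+     stores are weakly ordered, so publish them with a store fence
+     (_mm_sfence, from SSE) before another thread reads the data.
+       _mm_stream_si32(&counter, 42);  /* bypasses the cache hierarchy */
+       _mm_sfence();
+-->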
+<intrinsic tech="SSE2" name="_mm_movepi64_pi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Copy the lower 64-bit integer in "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="MOVDQ2Q" form="mm, xmm" xed="MOVDQ2Q_MMXq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_movpi64_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m64" varname="a" etype="UI64"/>
+ <description>Copy the 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+ </operation>
+ <instruction name="MOVQ2DQ" form="xmm, mm" xed="MOVQ2DQ_XMMdq_MMXq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_move_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Move</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element.</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+ </operation>
+ <instruction name="MOVQ" form="xmm, xmm" xed="MOVQ_XMMdq_XMMq_0F7E"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_packs_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="SI8"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := Saturate8(a[15:0])
+dst[15:8] := Saturate8(a[31:16])
+dst[23:16] := Saturate8(a[47:32])
+dst[31:24] := Saturate8(a[63:48])
+dst[39:32] := Saturate8(a[79:64])
+dst[47:40] := Saturate8(a[95:80])
+dst[55:48] := Saturate8(a[111:96])
+dst[63:56] := Saturate8(a[127:112])
+dst[71:64] := Saturate8(b[15:0])
+dst[79:72] := Saturate8(b[31:16])
+dst[87:80] := Saturate8(b[47:32])
+dst[95:88] := Saturate8(b[63:48])
+dst[103:96] := Saturate8(b[79:64])
+dst[111:104] := Saturate8(b[95:80])
+dst[119:112] := Saturate8(b[111:96])
+dst[127:120] := Saturate8(b[127:112])
+ </operation>
+ <instruction name="PACKSSWB" form="xmm, xmm" xed="PACKSSWB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_packs_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[31:0])
+dst[31:16] := Saturate16(a[63:32])
+dst[47:32] := Saturate16(a[95:64])
+dst[63:48] := Saturate16(a[127:96])
+dst[79:64] := Saturate16(b[31:0])
+dst[95:80] := Saturate16(b[63:32])
+dst[111:96] := Saturate16(b[95:64])
+dst[127:112] := Saturate16(b[127:96])
+ </operation>
+ <instruction name="PACKSSDW" form="xmm, xmm" xed="PACKSSDW_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_packus_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+ <operation>
+dst[7:0] := SaturateU8(a[15:0])
+dst[15:8] := SaturateU8(a[31:16])
+dst[23:16] := SaturateU8(a[47:32])
+dst[31:24] := SaturateU8(a[63:48])
+dst[39:32] := SaturateU8(a[79:64])
+dst[47:40] := SaturateU8(a[95:80])
+dst[55:48] := SaturateU8(a[111:96])
+dst[63:56] := SaturateU8(a[127:112])
+dst[71:64] := SaturateU8(b[15:0])
+dst[79:72] := SaturateU8(b[31:16])
+dst[87:80] := SaturateU8(b[47:32])
+dst[95:88] := SaturateU8(b[63:48])
+dst[103:96] := SaturateU8(b[79:64])
+dst[111:104] := SaturateU8(b[95:80])
+dst[119:112] := SaturateU8(b[111:96])
+dst[127:120] := SaturateU8(b[127:112])
+ </operation>
+ <instruction name="PACKUSWB" form="xmm, xmm" xed="PACKUSWB_XMMdq_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_extract_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="int" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="3"/>
+ <description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
+ <operation>
+dst[15:0] := (a[127:0] &gt;&gt; (imm8[2:0] * 16))[15:0]
+dst[31:16] := 0
+ </operation>
+ <instruction name="PEXTRW" form="r32, xmm, imm8" xed="PEXTRW_GPR32_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_insert_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="i" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="3"/>
+ <description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[127:0] := a[127:0]
+sel := imm8[2:0]*16
+dst[sel+15:sel] := i[15:0]
+ </operation>
+ <instruction name="PINSRW" form="xmm, r32, imm8" xed="PINSRW_XMMdq_GPR32_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
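+<!-- Usage sketch (illustrative C): "imm8" must be a compile-time constant
+     lane index for both the extract and the insert forms.
+       __m128i v = _mm_set1_epi16(0);
+       v = _mm_insert_epi16(v, 0x1234, 3); /* write lane 3 */
+       int lane = _mm_extract_epi16(v, 3); /* reads back 0x1234 */
+-->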
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_movemask_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="int" varname="dst" etype="MASK"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[j] := a[i+7]
+ENDFOR
+dst[MAX:16] := 0
+ </operation>
+ <instruction name="PMOVMSKB" form="r32, xmm" xed="PMOVMSKB_GPR32_XMMdq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
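+<!-- Usage sketch (illustrative C; "p" is a hypothetical input pointer): the
+     classic byte-search idiom pairs a lanewise compare with movemask.
+       __m128i chunk = _mm_loadu_si128((const __m128i *)p);
+       int hits = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, _mm_set1_epi8('\n')));
+       /* if hits != 0, the index of the first newline is the position of
+          its lowest set bit (e.g. __builtin_ctz(hits) on GCC/Clang) */
+-->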
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_shuffle_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+DEFINE SELECT4(src, control) {
+ CASE(control[1:0]) OF
+ 0: tmp[31:0] := src[31:0]
+ 1: tmp[31:0] := src[63:32]
+ 2: tmp[31:0] := src[95:64]
+ 3: tmp[31:0] := src[127:96]
+ ESAC
+ RETURN tmp[31:0]
+}
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+ </operation>
+ <instruction name="PSHUFD" form="xmm, xmm, imm8" xed="PSHUFD_XMMdq_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
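+<!-- Usage sketch (illustrative C; "v" is a hypothetical __m128i): the
+     _MM_SHUFFLE(z, y, x, w) helper from xmmintrin.h packs the four 2-bit
+     selectors, highest lane first.
+       __m128i splat0 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)); /* broadcast lane 0 */
+       __m128i rev    = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); /* reverse lanes */
+-->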
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_shufflehi_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+ </operation>
+ <instruction name="PSHUFHW" form="xmm, xmm, imm8" xed="PSHUFHW_XMMdq_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_shufflelo_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst".</description>
+ <operation>
+dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="PSHUFLW" form="xmm, xmm, imm8" xed="PSHUFLW_XMMdq_XMMdq_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpackhi_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[71:64]
+ dst[15:8] := src2[71:64]
+ dst[23:16] := src1[79:72]
+ dst[31:24] := src2[79:72]
+ dst[39:32] := src1[87:80]
+ dst[47:40] := src2[87:80]
+ dst[55:48] := src1[95:88]
+ dst[63:56] := src2[95:88]
+ dst[71:64] := src1[103:96]
+ dst[79:72] := src2[103:96]
+ dst[87:80] := src1[111:104]
+ dst[95:88] := src2[111:104]
+ dst[103:96] := src1[119:112]
+ dst[111:104] := src2[119:112]
+ dst[119:112] := src1[127:120]
+ dst[127:120] := src2[127:120]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+ </operation>
+ <instruction name="PUNPCKHBW" form="xmm, xmm" xed="PUNPCKHBW_XMMdq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpackhi_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[79:64]
+ dst[31:16] := src2[79:64]
+ dst[47:32] := src1[95:80]
+ dst[63:48] := src2[95:80]
+ dst[79:64] := src1[111:96]
+ dst[95:80] := src2[111:96]
+ dst[111:96] := src1[127:112]
+ dst[127:112] := src2[127:112]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="PUNPCKHWD" form="xmm, xmm" xed="PUNPCKHWD_XMMdq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpackhi_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[95:64]
+ dst[63:32] := src2[95:64]
+ dst[95:64] := src1[127:96]
+ dst[127:96] := src2[127:96]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="PUNPCKHDQ" form="xmm, xmm" xed="PUNPCKHDQ_XMMdq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpackhi_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="PUNPCKHQDQ" form="xmm, xmm" xed="PUNPCKHQDQ_XMMdq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpacklo_epi8">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
+ dst[7:0] := src1[7:0]
+ dst[15:8] := src2[7:0]
+ dst[23:16] := src1[15:8]
+ dst[31:24] := src2[15:8]
+ dst[39:32] := src1[23:16]
+ dst[47:40] := src2[23:16]
+ dst[55:48] := src1[31:24]
+ dst[63:56] := src2[31:24]
+ dst[71:64] := src1[39:32]
+ dst[79:72] := src2[39:32]
+ dst[87:80] := src1[47:40]
+ dst[95:88] := src2[47:40]
+ dst[103:96] := src1[55:48]
+ dst[111:104] := src2[55:48]
+ dst[119:112] := src1[63:56]
+ dst[127:120] := src2[63:56]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+ </operation>
+ <instruction name="PUNPCKLBW" form="xmm, xmm" xed="PUNPCKLBW_XMMdq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpacklo_epi16">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) {
+ dst[15:0] := src1[15:0]
+ dst[31:16] := src2[15:0]
+ dst[47:32] := src1[31:16]
+ dst[63:48] := src2[31:16]
+ dst[79:64] := src1[47:32]
+ dst[95:80] := src2[47:32]
+ dst[111:96] := src1[63:48]
+ dst[127:112] := src2[63:48]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="PUNPCKLWD" form="xmm, xmm" xed="PUNPCKLWD_XMMdq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpacklo_epi32">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) {
+ dst[31:0] := src1[31:0]
+ dst[63:32] := src2[31:0]
+ dst[95:64] := src1[63:32]
+ dst[127:96] := src2[63:32]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="PUNPCKLDQ" form="xmm, xmm" xed="PUNPCKLDQ_XMMdq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpacklo_epi64">
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="PUNPCKLQDQ" form="xmm, xmm" xed="PUNPCKLQDQ_XMMdq_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
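+<!-- Usage sketch (illustrative C; "bytes" is a hypothetical __m128i of UI8):
+     interleaving with zero is the standard SSE2 way to widen unsigned lanes.
+       __m128i zero = _mm_setzero_si128();
+       __m128i lo16 = _mm_unpacklo_epi8(bytes, zero); /* zero-extend bytes 0..7  */
+       __m128i hi16 = _mm_unpackhi_epi8(bytes, zero); /* zero-extend bytes 8..15 */
+-->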
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_add_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := a[63:0] + b[63:0]
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="ADDSD" form="xmm, xmm" xed="ADDSD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_add_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="ADDPD" form="xmm, xmm" xed="ADDPD_XMMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
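+<!-- Usage sketch (illustrative C): the _sd forms touch only lane 0 and pass
+     the upper lane of "a" through; the _pd forms operate on both lanes.
+       __m128d a = _mm_set_pd(10.0, 1.0);  /* lane 0 = 1.0, lane 1 = 10.0 */
+       __m128d b = _mm_set_pd(20.0, 2.0);
+       __m128d s = _mm_add_sd(a, b);       /* {3.0, 10.0} */
+       __m128d p = _mm_add_pd(a, b);       /* {3.0, 30.0} */
+-->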
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_div_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := a[63:0] / b[63:0]
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="DIVSD" form="xmm, xmm" xed="DIVSD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_div_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="DIVPD" form="xmm, xmm" xed="DIVPD_XMMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_max_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := MAX(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="MAXSD" form="xmm, xmm" xed="MAXSD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_max_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="MAXPD" form="xmm, xmm" xed="MAXPD_XMMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_min_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := MIN(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="MINSD" form="xmm, xmm" xed="MINSD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_min_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="MINPD" form="xmm, xmm" xed="MINPD_XMMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_mul_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := a[63:0] * b[63:0]
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="MULSD" form="xmm, xmm" xed="MULSD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_mul_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="MULPD" form="xmm, xmm" xed="MULPD_XMMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sqrt_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := SQRT(b[63:0])
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="SQRTSD" form="xmm, xmm" xed="SQRTSD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sqrt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Elementary Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="SQRTPD" form="xmm, xmm" xed="SQRTPD_XMMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sub_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := a[63:0] - b[63:0]
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="SUBSD" form="xmm, xmm" xed="SUBSD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_sub_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="SUBPD" form="xmm, xmm" xed="SUBPD_XMMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_and_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="ANDPD" form="xmm, xmm" xed="ANDPD_XMMxuq_XMMxuq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_andnot_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="ANDNPD" form="xmm, xmm" xed="ANDNPD_XMMxuq_XMMxuq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_or_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="ORPD" form="xmm, xmm" xed="ORPD_XMMxuq_XMMxuq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_xor_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Logical</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+ </operation>
+ <instruction name="XORPD" form="xmm, xmm" xed="XORPD_XMMxuq_XMMxuq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpeq_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmplt_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] &lt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmple_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] &lt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpgt_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] &gt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpge_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] &gt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpord_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpunord_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpneq_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpnlt_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (!(a[63:0] &lt; b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpnle_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (!(a[63:0] &lt;= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpngt_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (!(a[63:0] &gt; b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpnge_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := (!(a[63:0] &gt;= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="CMPSD" form="xmm, xmm, imm8" xed="CMPSD_XMM_XMMsd_XMMsd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpeq_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
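+<!-- Usage sketch (illustrative C; "a" and "b" are hypothetical __m128d
+     values): these compares yield all-ones/all-zero lane masks, which
+     compose with the logical ops above into a branchless select.
+       __m128d m   = _mm_cmplt_pd(a, b);  /* lanewise a < b */
+       __m128d min = _mm_or_pd(_mm_and_pd(m, a), _mm_andnot_pd(m, b));
+-->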
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmplt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] &lt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmple_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] &lt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpgt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] &gt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpge_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] &gt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpord_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpunord_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpneq_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpnlt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (!(a[i+63:i] &lt; b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpnle_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (!(a[i+63:i] &lt;= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpngt_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (!(a[i+63:i] &gt; b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cmpnge_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := (!(a[i+63:i] &gt;= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="CMPPD" form="xmm, xmm, imm8" xed="CMPPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
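+<!-- Illustrative C sketch: the negated predicates above are not redundant
+with the ordered ones, because they differ on NaN. !(a >= b) is true when a
+is NaN, while (a < b) is false, as this hypothetical check demonstrates.
+#include <emmintrin.h>
+#include <math.h>
+
+static int nan_demo(void) {
+    __m128d x = _mm_set1_pd(NAN), y = _mm_set1_pd(1.0);
+    int lt  = _mm_movemask_pd(_mm_cmplt_pd(x, y));   /* 0x0: ordered, false */
+    int nge = _mm_movemask_pd(_mm_cmpnge_pd(x, y));  /* 0x3: unordered, true */
+    return lt == 0x0 && nge == 0x3;
+}
+-->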
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_comieq_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[63:0] == b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISD" form="xmm, xmm" xed="COMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_comilt_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[63:0] &lt; b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISD" form="xmm, xmm" xed="COMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_comile_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[63:0] &lt;= b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISD" form="xmm, xmm" xed="COMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_comigt_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[63:0] &gt; b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISD" form="xmm, xmm" xed="COMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_comige_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[63:0] &gt;= b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISD" form="xmm, xmm" xed="COMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_comineq_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1).</description>
+ <operation>
+RETURN ( a[63:0] != b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="COMISD" form="xmm, xmm" xed="COMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
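+<!-- Illustrative C sketch: unlike the CMPSD family, the COMISD intrinsics
+return a plain int, so they slot directly into scalar control flow.
+#include <emmintrin.h>
+
+/* Returns the smaller of the low lanes, ignoring the upper lanes. */
+static double min_low(__m128d a, __m128d b) {
+    return _mm_comilt_sd(a, b) ? _mm_cvtsd_f64(a) : _mm_cvtsd_f64(b);
+}
+-->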
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_ucomieq_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[63:0] == b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISD" form="xmm, xmm" xed="UCOMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_ucomilt_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[63:0] &lt; b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISD" form="xmm, xmm" xed="UCOMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_ucomile_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[63:0] &lt;= b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISD" form="xmm, xmm" xed="UCOMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_ucomigt_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[63:0] &gt; b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISD" form="xmm, xmm" xed="UCOMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_ucomige_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[63:0] &gt;= b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISD" form="xmm, xmm" xed="UCOMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_ucomineq_sd">
+ <type>Floating Point</type>
+ <type>Flag</type>
+ <CPUID>SSE2</CPUID>
+ <category>Compare</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+ <operation>
+RETURN ( a[63:0] != b[63:0] ) ? 1 : 0
+ </operation>
+ <instruction name="UCOMISD" form="xmm, xmm" xed="UCOMISD_XMMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
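+<!-- Illustrative C sketch: the UCOMISD variants are the quiet form of the
+comparisons above, so they are the safer choice when QNaN operands may occur
+and floating-point exceptions are unmasked.
+#include <emmintrin.h>
+
+static int low_equal_quiet(__m128d a, __m128d b) {
+    return _mm_ucomieq_sd(a, b);  /* no invalid-operation signal on QNaN */
+}
+-->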
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtpd_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+ENDFOR
+dst[127:64] := 0
+ </operation>
+ <instruction name="CVTPD2PS" form="xmm, xmm" xed="CVTPD2PS_XMMps_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtps_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+ENDFOR
+ </operation>
+ <instruction name="CVTPS2PD" form="xmm, xmm" xed="CVTPS2PD_XMMpd_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+ </operation>
+ <instruction name="CVTPD2DQ" form="xmm, xmm" xed="CVTPD2DQ_XMMdq_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsd_si32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+ </operation>
+ <instruction name="CVTSD2SI" form="r32, xmm" xed="CVTSD2SI_GPR32d_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsd_si64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+ </operation>
+ <instruction name="CVTSD2SI" form="r64, xmm" xed="CVTSD2SI_GPR64q_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsd_si64x">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+ </operation>
+ <instruction name="CVTSD2SI" form="r64, xmm" xed="CVTSD2SI_GPR64q_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
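+<!-- Illustrative C sketch: CVTSD2SI rounds according to the current MXCSR
+rounding mode, which defaults to round-to-nearest-even, so halfway cases go
+to the even integer. The r64 form assumes 64-bit mode.
+#include <emmintrin.h>
+
+static long long round_to_ll(double x) {
+    return _mm_cvtsd_si64(_mm_set_sd(x));  /* 2.5 -> 2, 3.5 -> 4 */
+}
+-->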
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsd_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="CVTSD2SS" form="xmm, xmm" xed="CVTSD2SS_XMMss_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtsd_f64">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="double" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Copy the lower double-precision (64-bit) floating-point element of "a" to "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+ </operation>
+ <instruction name="MOVSD" form="m64, xmm" xed="MOVSD_XMM_MEMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtss_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+ </operation>
+ <instruction name="CVTSS2SD" form="xmm, xmm" xed="CVTSS2SD_XMMsd_XMMss"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvttpd_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+ </operation>
+ <instruction name="CVTTPD2DQ" form="xmm, xmm" xed="CVTTPD2DQ_XMMdq_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvttsd_si32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+ </operation>
+ <instruction name="CVTTSD2SI" form="r32, xmm" xed="CVTTSD2SI_GPR32d_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvttsd_si64">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+ </operation>
+ <instruction name="CVTTSD2SI" form="r64, xmm" xed="CVTTSD2SI_GPR64q_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvttsd_si64x">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+ <operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+ </operation>
+ <instruction name="CVTTSD2SI" form="r64, xmm" xed="CVTTSD2SI_GPR64q_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
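+<!-- Illustrative C sketch: the CVTTSD2SI forms truncate toward zero
+regardless of MXCSR, which is exactly the semantics of a C cast, so
+compilers lower (long long)x to this instruction.
+#include <emmintrin.h>
+
+static long long trunc_to_ll(double x) {
+    return _mm_cvttsd_si64(_mm_set_sd(x));  /* 2.9 -> 2, -2.9 -> -2 */
+}
+-->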
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvtps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTPS2DQ" form="xmm, xmm" xed="CVTPS2DQ_XMMdq_XMMps"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_cvttps_epi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="CVTTPS2DQ" form="xmm, xmm" xed="CVTTPS2DQ_XMMdq_XMMps"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_cvtpd_pi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+ </operation>
+ <instruction name="CVTPD2PI" form="mm, xmm" xed="CVTPD2PI_MMXq_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_cvttpd_pi32">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Convert</category>
+ <return type="__m64" varname="dst" etype="FP32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 32*j
+ k := 64*j
+ dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+ </operation>
+ <instruction name="CVTTPD2PI" form="mm, xmm" xed="CVTTPD2PI_MMXq_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="a" etype="FP64"/>
+ <description>Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element.</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set1_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="a" etype="FP64"/>
+ <description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set_pd1">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="a" etype="FP64"/>
+ <description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := a[63:0]
+ENDFOR
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_set_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="e1" etype="FP64"/>
+ <parameter type="double" varname="e0" etype="FP64"/>
+ <description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.</description>
+ <operation>
+dst[63:0] := e0
+dst[127:64] := e1
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_setr_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double" varname="e1" etype="FP64"/>
+ <parameter type="double" varname="e0" etype="FP64"/>
+ <description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+ <operation>
+dst[63:0] := e1
+dst[127:64] := e0
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
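+<!-- Illustrative C sketch: _mm_set_pd takes its arguments high lane first,
+while _mm_setr_pd takes them low lane first, so the two calls below build
+the same vector (lane 0 = 1.0, lane 1 = 2.0).
+#include <emmintrin.h>
+
+static void same_vector(void) {
+    __m128d v1 = _mm_set_pd(2.0, 1.0);   /* e1 = high, e0 = low */
+    __m128d v2 = _mm_setr_pd(1.0, 2.0);  /* first argument lands in lane 0 */
+    (void)v1; (void)v2;
+}
+-->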
+<intrinsic tech="SSE2" name="_mm_setzero_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Set</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="void"/>
+ <description>Return vector of type __m128d with all elements set to zero.</description>
+ <operation>
+dst[MAX:0] := 0
+ </operation>
+ <instruction name="XORPD" form="xmm, xmm" xed="XORPD_XMMxuq_XMMxuq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_load_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+ </operation>
+ <instruction name="MOVAPD" form="xmm, m128" xed="MOVAPD_XMMpd_MEMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_load1_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".</description>
+ <operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+ </operation>
+ <instruction name="MOVAPD" form="xmm, m128" xed="MOVAPD_XMMpd_MEMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_load_pd1">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".</description>
+ <operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+ </operation>
+ <instruction name="MOVAPD" form="xmm, m128" xed="MOVAPD_XMMpd_MEMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_loadr_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load 2 double-precision (64-bit) floating-point elements from memory into "dst" in reverse order. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[63:0] := MEM[mem_addr+127:mem_addr+64]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+ </operation>
+ <instruction name="MOVAPD" form="xmm, m128" xed="MOVAPD_XMMpd_MEMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_loadu_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <description>Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+ </operation>
+ <instruction name="MOVUPD" form="xmm, m128" xed="MOVUPD_XMMpd_MEMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
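+<!-- Illustrative C sketch: MOVAPD faults on misaligned addresses while
+MOVUPD does not, so a dispatch like this hypothetical helper is a common
+way to honor the 16-byte requirement stated above.
+#include <emmintrin.h>
+#include <stdint.h>
+
+static __m128d load_any(const double *p) {
+    return ((uintptr_t)p & 15) == 0 ? _mm_load_pd(p) : _mm_loadu_pd(p);
+}
+-->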
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_load_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := 0
+ </operation>
+ <instruction name="MOVSD" form="xmm, m64" xed="MOVSD_XMM_XMMdq_MEMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_loadh_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+ </operation>
+ <instruction name="MOVHPD" form="xmm, m64" xed="MOVHPD_XMMsd_MEMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_loadl_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="MOVLPD" form="xmm, m64" xed="MOVLPD_XMMsd_MEMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_stream_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVNTPD" form="m128, xmm" xed="MOVNTPD_MEMdq_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
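+<!-- Illustrative C sketch: non-temporal stores bypass the cache, which pays
+off for large write-only buffers. They are weakly ordered, so an _mm_sfence
+(an SSE intrinsic) is needed before the data is read back. Assumes dst is
+16-byte aligned and n is even.
+#include <emmintrin.h>
+#include <stddef.h>
+
+static void fill_nt(double *dst, double v, size_t n) {
+    __m128d vv = _mm_set1_pd(v);
+    for (size_t i = 0; i + 2 <= n; i += 2)
+        _mm_stream_pd(dst + i, vv);
+    _mm_sfence();  /* order the streaming stores before later loads */
+}
+-->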
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_store_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+ </operation>
+ <instruction name="MOVSD" form="m64, xmm" xed="MOVSD_XMM_MEMsd_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_store1_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+MEM[mem_addr+127:mem_addr+64] := a[63:0]
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_store_pd1">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+MEM[mem_addr+127:mem_addr+64] := a[63:0]
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_store_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVAPD" form="m128, xmm" xed="MOVAPD_MEMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_storeu_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+ "mem_addr" does not need to be aligned on any particular boundary.</description>
+ <operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+ </operation>
+ <instruction name="MOVUPD" form="m128, xmm" xed="MOVUPD_MEMpd_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" sequence="TRUE" name="_mm_storer_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="128"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store 2 double-precision (64-bit) floating-point elements from "a" into memory in reverse order.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[127:64]
+MEM[mem_addr+127:mem_addr+64] := a[63:0]
+ </operation>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_storeh_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store the upper double-precision (64-bit) floating-point element from "a" into memory.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[127:64]
+ </operation>
+ <instruction name="MOVHPD" form="m64, xmm" xed="MOVHPD_MEMq_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_storel_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Store</category>
+ <return type="void"/>
+ <parameter type="double*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Store the lower double-precision (64-bit) floating-point element from "a" into memory.</description>
+ <operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+ </operation>
+ <instruction name="MOVLPD" form="m64, xmm" xed="MOVLPD_MEMq_XMMsd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpackhi_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[127:64]
+ dst[127:64] := src2[127:64]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="UNPCKHPD" form="xmm, xmm" xed="UNPCKHPD_XMMpd_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_unpacklo_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst".</description>
+ <operation>
+DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+ dst[63:0] := src1[63:0]
+ dst[127:64] := src2[63:0]
+ RETURN dst[127:0]
+}
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+ </operation>
+ <instruction name="UNPCKLPD" form="xmm, xmm" xed="UNPCKLPD_XMMpd_XMMq"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_movemask_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Miscellaneous</category>
+ <return type="int" varname="dst" etype="MASK"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF a[i+63]
+ dst[j] := 1
+ ELSE
+ dst[j] := 0
+ FI
+ENDFOR
+dst[MAX:2] := 0
+ </operation>
+ <instruction name="MOVMSKPD" form="r32, xmm" xed="MOVMSKPD_GPR32_XMMpd"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_shuffle_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst".</description>
+ <operation>
+dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+ </operation>
+ <instruction name="SHUFPD" form="xmm, xmm, imm8" xed="SHUFPD_XMMpd_XMMpd_IMMb"/>
+ <header>emmintrin.h</header>
+</intrinsic>
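+<!-- Illustrative C sketch: because both sources may be the same register,
+_mm_shuffle_pd doubles as a lane swap. imm8 bit 0 selects the lane of "a"
+for dst[63:0]; bit 1 selects the lane of "b" for dst[127:64].
+#include <emmintrin.h>
+
+static __m128d swap_lanes(__m128d a) {
+    return _mm_shuffle_pd(a, a, 0x1);  /* 0b01: high lane down, low lane up */
+}
+-->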
+<intrinsic tech="SSE2" vexEq="TRUE" name="_mm_move_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Move</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := b[63:0]
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="MOVSD" form="xmm, xmm" xed="MOVSD_XMM_XMMsd_XMMsd_0F10"/>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_castpd_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Cast</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_castpd_si128">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Cast</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_castps_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Cast</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_castps_si128">
+ <type>Floating Point</type>
+ <type>Integer</type>
+ <CPUID>SSE2</CPUID>
+ <category>Cast</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_castsi128_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Cast</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <description>Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE2" name="_mm_castsi128_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE2</CPUID>
+ <category>Cast</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+ <header>emmintrin.h</header>
+</intrinsic>
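+<!-- Illustrative C sketch: the casts reinterpret bits without generating
+code, which lets integer masks act on floating-point lanes. Clearing the
+sign bit of each lane yields the absolute value (_mm_set1_epi64x and
+_mm_and_pd are SSE2 intrinsics not shown in this excerpt).
+#include <emmintrin.h>
+
+static __m128d abs_pd(__m128d x) {
+    const __m128i sign_off = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
+    return _mm_and_pd(x, _mm_castsi128_pd(sign_off));
+}
+-->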
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_addsub_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Alternately add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF ((j &amp; 1) == 0)
+ dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="ADDSUBPS" form="xmm, xmm" xed="ADDSUBPS_XMMps_XMMps"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_addsub_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Alternately add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF ((j &amp; 1) == 0)
+ dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="ADDSUBPD" form="xmm, xmm" xed="ADDSUBPD_XMMpd_XMMpd"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_hadd_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+ <operation>
+dst[63:0] := a[127:64] + a[63:0]
+dst[127:64] := b[127:64] + b[63:0]
+ </operation>
+ <instruction name="HADDPD" form="xmm, xmm" xed="HADDPD_XMMpd_XMMpd"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
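+<!-- Illustrative C sketch: feeding the same vector to both operands turns
+HADDPD into a two-lane horizontal sum, with the total landing in lane 0.
+#include <pmmintrin.h>
+
+static double hsum_pd(__m128d v) {
+    return _mm_cvtsd_f64(_mm_hadd_pd(v, v));  /* v[high] + v[low] */
+}
+-->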
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_hadd_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := a[127:96] + a[95:64]
+dst[95:64] := b[63:32] + b[31:0]
+dst[127:96] := b[127:96] + b[95:64]
+ </operation>
+ <instruction name="HADDPS" form="xmm, xmm" xed="HADDPS_XMMps_XMMps"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_hsub_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+ <operation>
+dst[63:0] := a[63:0] - a[127:64]
+dst[127:64] := b[63:0] - b[127:64]
+ </operation>
+ <instruction name="HSUBPD" form="xmm, xmm" xed="HSUBPD_XMMpd_XMMpd"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_hsub_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := a[95:64] - a[127:96]
+dst[95:64] := b[31:0] - b[63:32]
+dst[127:96] := b[95:64] - b[127:96]
+ </operation>
+ <instruction name="HSUBPS" form="xmm, xmm" xed="HSUBPS_XMMps_XMMps"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_lddqu_si128">
+ <type>Integer</type>
+ <CPUID>SSE3</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i const*" varname="mem_addr" etype="M128" memwidth="128"/>
+ <description>Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+ </operation>
+ <instruction name="LDDQU" form="xmm, m128" xed="LDDQU_XMMpd_MEMdq"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_movedup_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Move</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst".</description>
+ <operation>
+dst[63:0] := a[63:0]
+dst[127:64] := a[63:0]
+ </operation>
+ <instruction name="MOVDDUP" form="xmm, xmm" xed="MOVDDUP_XMMdq_XMMq"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_loaddup_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Load</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="double const*" varname="mem_addr" etype="FP64" memwidth="64"/>
+ <description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".</description>
+ <operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+ </operation>
+ <instruction name="MOVDDUP" form="xmm, m64" xed="MOVDDUP_XMMdq_MEMq"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_movehdup_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32]
+dst[63:32] := a[63:32]
+dst[95:64] := a[127:96]
+dst[127:96] := a[127:96]
+ </operation>
+ <instruction name="MOVSHDUP" form="xmm, xmm" xed="MOVSHDUP_XMMps_XMMps"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE3" vexEq="TRUE" name="_mm_moveldup_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE3</CPUID>
+ <category>Move</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0]
+dst[63:32] := a[31:0]
+dst[95:64] := a[95:64]
+dst[127:96] := a[95:64]
+ </operation>
+ <instruction name="MOVSLDUP" form="xmm, xmm" xed="MOVSLDUP_XMMps_XMMps"/>
+ <header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_blend_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF imm8[j]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="BLENDPD" form="xmm, xmm, imm8" xed="BLENDPD_XMMdq_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_blend_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF imm8[j]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="BLENDPS" form="xmm, xmm, imm8" xed="BLENDPS_XMMdq_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_blendv_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="__m128d" varname="mask" etype="FP64"/>
+ <description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ IF mask[i+63]
+ dst[i+63:i] := b[i+63:i]
+ ELSE
+ dst[i+63:i] := a[i+63:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="BLENDVPD" form="xmm, xmm" xed="BLENDVPD_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_blendv_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="__m128" varname="mask" etype="FP32"/>
+ <description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF mask[i+31]
+ dst[i+31:i] := b[i+31:i]
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="BLENDVPS" form="xmm, xmm" xed="BLENDVPS_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
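+<!--
+Because BLENDVPS keys off each lane's sign bit, a comparison mask drives it
+directly. A hedged C sketch; the helper name is illustrative:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  /* Branchless per-lane select: where x < 0.0f, take y instead. */
+  static __m128 replace_negatives(__m128 x, __m128 y)
+  {
+      __m128 neg = _mm_cmplt_ps(x, _mm_setzero_ps()); /* all-ones where x < 0 */
+      return _mm_blendv_ps(x, y, neg);
+  }
+-->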
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_blendv_epi8">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="__m128i" varname="mask" etype="UI8"/>
+ <description>Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF mask[i+7]
+ dst[i+7:i] := b[i+7:i]
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PBLENDVB" form="xmm, xmm" xed="PBLENDVB_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_blend_epi16">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF imm8[j]
+ dst[i+15:i] := b[i+15:i]
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PBLENDW" form="xmm, xmm, imm8" xed="PBLENDW_XMMdq_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_dp_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+  <description>Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the two products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
+ <operation>
+DEFINE DP(a[127:0], b[127:0], imm8[7:0]) {
+ FOR j := 0 to 1
+ i := j*64
+ IF imm8[(4+j)%8]
+ temp[i+63:i] := a[i+63:i] * b[i+63:i]
+ ELSE
+ temp[i+63:i] := 0.0
+ FI
+ ENDFOR
+
+ sum[63:0] := temp[127:64] + temp[63:0]
+
+ FOR j := 0 to 1
+ i := j*64
+ IF imm8[j%8]
+ tmpdst[i+63:i] := sum[63:0]
+ ELSE
+ tmpdst[i+63:i] := 0.0
+ FI
+ ENDFOR
+ RETURN tmpdst[127:0]
+}
+dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
+ </operation>
+ <instruction name="DPPD" form="xmm, xmm, imm8" xed="DPPD_XMMdq_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_dp_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
+ <operation>
+DEFINE DP(a[127:0], b[127:0], imm8[7:0]) {
+ FOR j := 0 to 3
+ i := j*32
+ IF imm8[(4+j)%8]
+ temp[i+31:i] := a[i+31:i] * b[i+31:i]
+ ELSE
+ temp[i+31:i] := 0
+ FI
+ ENDFOR
+
+ sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])
+
+ FOR j := 0 to 3
+ i := j*32
+ IF imm8[j%8]
+ tmpdst[i+31:i] := sum[31:0]
+ ELSE
+ tmpdst[i+31:i] := 0
+ FI
+ ENDFOR
+ RETURN tmpdst[127:0]
+}
+dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
+ </operation>
+ <instruction name="DPPS" form="xmm, xmm, imm8" xed="DPPS_XMMdq_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
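+<!--
+A minimal C sketch of the single-precision dot product above: the high
+nibble of imm8 picks which products participate, the low nibble picks the
+output lanes. The helper name is illustrative:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  /* 4-element dot product; 0xF1 = multiply all lanes, sum into lane 0. */
+  static float dot4(__m128 a, __m128 b)
+  {
+      return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
+  }
+-->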
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_extract_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+dst[31:0] := (a[127:0] &gt;&gt; (imm8[1:0] * 32))[31:0]
+ </operation>
+ <instruction name="EXTRACTPS" form="r32, xmm, imm8" xed="EXTRACTPS_GPR32d_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_extract_epi8">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="int" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
+ <operation>
+dst[7:0] := (a[127:0] &gt;&gt; (imm8[3:0] * 8))[7:0]
+dst[31:8] := 0
+ </operation>
+ <instruction name="PEXTRB" form="r32, xmm, imm8" xed="PEXTRB_GPR32d_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_extract_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+dst[31:0] := (a[127:0] &gt;&gt; (imm8[1:0] * 32))[31:0]
+ </operation>
+ <instruction name="PEXTRD" form="r32, xmm, imm8" xed="PEXTRD_GPR32d_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_extract_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst".</description>
+ <operation>
+dst[63:0] := (a[127:0] &gt;&gt; (imm8[0] * 64))[63:0]
+ </operation>
+ <instruction name="PEXTRQ" form="r64, xmm, imm8" xed="PEXTRQ_GPR64q_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_insert_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set).</description>
+ <operation>
+tmp2[127:0] := a[127:0]
+CASE (imm8[7:6]) OF
+0: tmp1[31:0] := b[31:0]
+1: tmp1[31:0] := b[63:32]
+2: tmp1[31:0] := b[95:64]
+3: tmp1[31:0] := b[127:96]
+ESAC
+CASE (imm8[5:4]) OF
+0: tmp2[31:0] := tmp1[31:0]
+1: tmp2[63:32] := tmp1[31:0]
+2: tmp2[95:64] := tmp1[31:0]
+3: tmp2[127:96] := tmp1[31:0]
+ESAC
+FOR j := 0 to 3
+ i := j*32
+ IF imm8[j%8]
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := tmp2[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="INSERTPS" form="xmm, xmm, imm8" xed="INSERTPS_XMMps_XMMps_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_insert_epi8">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="int" varname="i" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="4"/>
+ <description>Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[127:0] := a[127:0]
+sel := imm8[3:0]*8
+dst[sel+7:sel] := i[7:0]
+ </operation>
+ <instruction name="PINSRB" form="xmm, r32, imm8" xed="PINSRB_XMMdq_GPR32d_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_insert_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="int" varname="i" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="2"/>
+ <description>Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[127:0] := a[127:0]
+sel := imm8[1:0]*32
+dst[sel+31:sel] := i[31:0]
+ </operation>
+ <instruction name="PINSRD" form="xmm, r32, imm8" xed="PINSRD_XMMdq_GPR32d_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_insert_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__int64" varname="i" etype="UI64"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="1"/>
+ <description>Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8".</description>
+ <operation>
+dst[127:0] := a[127:0]
+sel := imm8[0]*64
+dst[sel+63:sel] := i[63:0]
+ </operation>
+ <instruction name="PINSRQ" form="xmm, r64, imm8" xed="PINSRQ_XMMdq_GPR64q_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
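+<!--
+A minimal C sketch combining the extract/insert pair above; both imm8
+arguments must be compile-time constants. The helper name is illustrative:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  /* Read lane 2 of a vector of 32-bit ints, increment it, write it back. */
+  static __m128i bump_lane2(__m128i v)
+  {
+      int x = _mm_extract_epi32(v, 2);
+      return _mm_insert_epi32(v, x + 1, 2);
+  }
+-->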
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_max_epi8">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXSB" form="xmm, xmm" xed="PMAXSB_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_max_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXSD" form="xmm, xmm" xed="PMAXSD_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_max_epu32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXUD" form="xmm, xmm" xed="PMAXUD_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_max_epu16">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMAXUW" form="xmm, xmm" xed="PMAXUW_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_min_epi8">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINSB" form="xmm, xmm" xed="PMINSB_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_min_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINSD" form="xmm, xmm" xed="PMINSD_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_min_epu32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINUD" form="xmm, xmm" xed="PMINUD_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_min_epu16">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <parameter type="__m128i" varname="b" etype="UI16"/>
+ <description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PMINUW" form="xmm, xmm" xed="PMINUW_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
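+<!--
+The min/max pairs above compose into branchless clamps. A hedged C sketch;
+the helper name is illustrative:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  /* Clamp packed signed 32-bit integers into [lo, hi]. */
+  static __m128i clamp_epi32(__m128i v, __m128i lo, __m128i hi)
+  {
+      return _mm_min_epi32(_mm_max_epi32(v, lo), hi);
+  }
+-->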
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_packus_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst".</description>
+ <operation>
+dst[15:0] := SaturateU16(a[31:0])
+dst[31:16] := SaturateU16(a[63:32])
+dst[47:32] := SaturateU16(a[95:64])
+dst[63:48] := SaturateU16(a[127:96])
+dst[79:64] := SaturateU16(b[31:0])
+dst[95:80] := SaturateU16(b[63:32])
+dst[111:96] := SaturateU16(b[95:64])
+dst[127:112] := SaturateU16(b[127:96])
+ </operation>
+ <instruction name="PACKUSDW" form="xmm, xmm" xed="PACKUSDW_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cmpeq_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI64"/>
+ <parameter type="__m128i" varname="b" etype="UI64"/>
+ <description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPEQQ" form="xmm, xmm" xed="PCMPEQQ_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepi8_epi16">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ l := j*16
+ dst[l+15:l] := SignExtend16(a[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMOVSXBW" form="xmm, xmm" xed="PMOVSXBW_XMMdq_XMMq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepi8_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 8*j
+ dst[i+31:i] := SignExtend32(a[k+7:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVSXBD" form="xmm, xmm" xed="PMOVSXBD_XMMdq_XMMd"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepi8_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+  <description>Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 8*j
+ dst[i+63:i] := SignExtend64(a[k+7:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVSXBQ" form="xmm, xmm" xed="PMOVSXBQ_XMMdq_XMMw"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepi16_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 16*j
+ dst[i+31:i] := SignExtend32(a[k+15:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVSXWD" form="xmm, xmm" xed="PMOVSXWD_XMMdq_XMMq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepi16_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 16*j
+ dst[i+63:i] := SignExtend64(a[k+15:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVSXWQ" form="xmm, xmm" xed="PMOVSXWQ_XMMdq_XMMd"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepi32_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := SignExtend64(a[k+31:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVSXDQ" form="xmm, xmm" xed="PMOVSXDQ_XMMdq_XMMq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepu8_epi16">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ l := j*16
+ dst[l+15:l] := ZeroExtend16(a[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PMOVZXBW" form="xmm, xmm" xed="PMOVZXBW_XMMdq_XMMq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepu8_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 8*j
+ dst[i+31:i] := ZeroExtend32(a[k+7:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVZXBD" form="xmm, xmm" xed="PMOVZXBD_XMMdq_XMMd"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepu8_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+  <description>Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 8*j
+ dst[i+63:i] := ZeroExtend64(a[k+7:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVZXBQ" form="xmm, xmm" xed="PMOVZXBQ_XMMdq_XMMw"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepu16_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := 32*j
+ k := 16*j
+ dst[i+31:i] := ZeroExtend32(a[k+15:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVZXWD" form="xmm, xmm" xed="PMOVZXWD_XMMdq_XMMq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepu16_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 16*j
+ dst[i+63:i] := ZeroExtend64(a[k+15:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVZXWQ" form="xmm, xmm" xed="PMOVZXWQ_XMMdq_XMMd"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_cvtepu32_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Convert</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := 64*j
+ k := 32*j
+ dst[i+63:i] := ZeroExtend64(a[k+31:k])
+ENDFOR
+ </operation>
+ <instruction name="PMOVZXDQ" form="xmm, xmm" xed="PMOVZXDQ_XMMdq_XMMq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
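+<!--
+The sign/zero-extending converts above are the usual way to promote narrow
+pixel or sample data before arithmetic that would overflow in 8 bits. A
+hedged C sketch; the helper name is illustrative:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+  #include <stdint.h>
+  #include <string.h>
+
+  /* Promote four 8-bit pixels to four 32-bit lanes. */
+  static __m128i load4_pixels(const uint8_t *p)
+  {
+      int32_t bits;
+      memcpy(&bits, p, 4);  /* safe unaligned 4-byte load */
+      return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(bits));
+  }
+-->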
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_mul_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI64"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="PMULDQ" form="xmm, xmm" xed="PMULDQ_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_mullo_epi32">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="UI32"/>
+ <description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ tmp[63:0] := a[i+31:i] * b[i+31:i]
+ dst[i+31:i] := tmp[31:0]
+ENDFOR
+ </operation>
+ <instruction name="PMULLD" form="xmm, xmm" xed="PMULLD_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
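+<!--
+The two 32-bit multiplies above answer different questions: PMULLD keeps the
+low 32 bits of every lane's product, while PMULDQ returns full 64-bit
+products of the even lanes only. A hedged C sketch with illustrative names:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  static __m128i mul_low32(__m128i a, __m128i b)     { return _mm_mullo_epi32(a, b); }
+  static __m128i mul_even_wide(__m128i a, __m128i b) { return _mm_mul_epi32(a, b);   }
+-->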
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_testz_si128">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+ <operation>
+IF ((a[127:0] AND b[127:0]) == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+IF (((NOT a[127:0]) AND b[127:0]) == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+RETURN ZF
+ </operation>
+ <instruction name="PTEST" form="xmm, xmm" xed="PTEST_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_testc_si128">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="k" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+ <operation>
+IF ((a[127:0] AND b[127:0]) == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+IF (((NOT a[127:0]) AND b[127:0]) == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+RETURN CF
+ </operation>
+ <instruction name="PTEST" form="xmm, xmm" xed="PTEST_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_testnzc_si128">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+ <operation>
+IF ((a[127:0] AND b[127:0]) == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+IF (((NOT a[127:0]) AND b[127:0]) == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="PTEST" form="xmm, xmm" xed="PTEST_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_test_all_zeros">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="mask" etype="M128"/>
+ <description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and return 1 if the result is zero, otherwise return 0.</description>
+ <operation>
+IF ((a[127:0] AND mask[127:0]) == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+dst := ZF
+ </operation>
+ <instruction name="PTEST" form="xmm, xmm" xed="PTEST_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_test_mix_ones_zeros">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="mask" etype="M128"/>
+ <description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "mask", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+ <operation>
+IF ((a[127:0] AND mask[127:0]) == 0)
+ ZF := 1
+ELSE
+ ZF := 0
+FI
+IF (((NOT a[127:0]) AND mask[127:0]) == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+ dst := 1
+ELSE
+ dst := 0
+FI
+ </operation>
+ <instruction name="PTEST" form="xmm, xmm" xed="PTEST_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" sequence="TRUE" name="_mm_test_all_ones">
+ <type>Integer</type>
+ <type>Flag</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Logical</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <description>Compute the bitwise NOT of "a" and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0.</description>
+ <operation>
+FOR j := 0 to 127
+ tmp[j] := 1
+ENDFOR
+IF (((NOT a[127:0]) AND tmp[127:0]) == 0)
+ CF := 1
+ELSE
+ CF := 0
+FI
+dst := CF
+ </operation>
+ <instruction name="PCMPEQD" form="xmm, xmm" xed="PCMPEQD_XMMdq_XMMdq"/>
+ <instruction name="PTEST" form="xmm, xmm" xed="PTEST_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
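+<!--
+Common PTEST idioms built from the flag tests above; each returns 0 or 1.
+A hedged C sketch with illustrative helper names:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  static int any_bit_set(__m128i v) { return !_mm_testz_si128(v, v); }
+  static int all_zero(__m128i v)    { return _mm_test_all_zeros(v, v); }
+  static int all_ones(__m128i v)    { return _mm_test_all_ones(v); }
+-->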
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_round_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" immtype="_MM_FROUND"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ROUND(a[i+63:i], rounding)
+ENDFOR
+ </operation>
+ <instruction name="ROUNDPD" form="xmm, xmm, imm8" xed="ROUNDPD_XMMpd_XMMpd_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_floor_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="ROUNDPD" form="xmm, xmm, imm8" xed="ROUNDPD_XMMpd_XMMpd_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_ceil_pd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+ </operation>
+ <instruction name="ROUNDPD" form="xmm, xmm, imm8" xed="ROUNDPD_XMMpd_XMMpd_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_round_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" immtype="_MM_FROUND"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst".
+ [round_note]</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ROUND(a[i+31:i], rounding)
+ENDFOR
+ </operation>
+ <instruction name="ROUNDPS" form="xmm, xmm, imm8" xed="ROUNDPS_XMMps_XMMps_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_floor_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="ROUNDPS" form="xmm, xmm, imm8" xed="ROUNDPS_XMMps_XMMps_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_ceil_ps">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="ROUNDPS" form="xmm, xmm, imm8" xed="ROUNDPS_XMMps_XMMps_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
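+<!--
+The packed rounding entries above are all ROUNDPS/ROUNDPD with different
+immediates; _mm_round_* takes the mode explicitly. A hedged C sketch:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  /* Round to nearest-even without touching MXCSR. */
+  static __m128 round_nearest(__m128 a)
+  {
+      return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  }
+-->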
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_round_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <parameter type="int" varname="rounding" etype="IMM" immtype="_MM_FROUND"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+ [round_note]</description>
+ <operation>
+dst[63:0] := ROUND(b[63:0], rounding)
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="ROUNDSD" form="xmm, xmm, imm8" xed="ROUNDSD_XMMq_XMMq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_floor_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := FLOOR(b[63:0])
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="ROUNDSD" form="xmm, xmm, imm8" xed="ROUNDSD_XMMq_XMMq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_ceil_sd">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128d" varname="dst" etype="FP64"/>
+ <parameter type="__m128d" varname="a" etype="FP64"/>
+ <parameter type="__m128d" varname="b" etype="FP64"/>
+ <description>Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+ <operation>
+dst[63:0] := CEIL(b[63:0])
+dst[127:64] := a[127:64]
+ </operation>
+ <instruction name="ROUNDSD" form="xmm, xmm, imm8" xed="ROUNDSD_XMMq_XMMq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_round_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <parameter type="int" varname="rounding" etype="IMM" immtype="_MM_FROUND"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+ [round_note]</description>
+ <operation>
+dst[31:0] := ROUND(b[31:0], rounding)
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="ROUNDSS" form="xmm, xmm, imm8" xed="ROUNDSS_XMMd_XMMd_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_floor_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := FLOOR(b[31:0])
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="ROUNDSS" form="xmm, xmm, imm8" xed="ROUNDSS_XMMd_XMMd_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_ceil_ss">
+ <type>Floating Point</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128" varname="dst" etype="FP32"/>
+ <parameter type="__m128" varname="a" etype="FP32"/>
+ <parameter type="__m128" varname="b" etype="FP32"/>
+ <description>Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+ <operation>
+dst[31:0] := CEIL(b[31:0])
+dst[127:32] := a[127:32]
+ </operation>
+ <instruction name="ROUNDSS" form="xmm, xmm, imm8" xed="ROUNDSS_XMMd_XMMd_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_minpos_epu16">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="UI16"/>
+ <description>Horizontally compute the minimum amongst the packed unsigned 16-bit integers in "a", store the minimum and index in "dst", and zero the remaining bits in "dst".</description>
+ <operation>
+index[2:0] := 0
+min[15:0] := a[15:0]
+FOR j := 0 to 7
+ i := j*16
+ IF a[i+15:i] &lt; min[15:0]
+ index[2:0] := j
+ min[15:0] := a[i+15:i]
+ FI
+ENDFOR
+dst[15:0] := min[15:0]
+dst[18:16] := index[2:0]
+dst[127:19] := 0
+ </operation>
+ <instruction name="PHMINPOSUW" form="xmm, xmm" xed="PHMINPOSUW_XMMdq_XMMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
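+<!--
+A minimal C sketch of the horizontal minimum above; PHMINPOSUW also reports
+the winning lane's index in bits 18:16. The helper name is illustrative:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  static unsigned min_u16(__m128i v, unsigned *index_out)
+  {
+      unsigned packed = (unsigned)_mm_cvtsi128_si32(_mm_minpos_epu16(v));
+      *index_out = (packed >> 16) & 0x7;
+      return packed & 0xFFFF;
+  }
+-->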
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_mpsadbw_epu8">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Arithmetic</category>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8".</description>
+ <operation>
+DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
+ a_offset := imm8[2]*32
+ b_offset := imm8[1:0]*32
+ FOR j := 0 to 7
+ i := j*8
+ k := a_offset+i
+ l := b_offset
+ tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \
+ ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24]))
+ ENDFOR
+ RETURN tmp[127:0]
+}
+dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
+ </operation>
+ <instruction name="MPSADBW" form="xmm, xmm, imm8" xed="MPSADBW_XMMdq_XMMdq_IMMb"/>
+ <header>smmintrin.h</header>
+</intrinsic>
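+<!--
+A hedged C sketch of one MPSADBW step: eight overlapping 4-byte windows of
+"a" are each SAD-compared against one 4-byte group of "b". The helper name
+is illustrative:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+
+  /* imm8 = 0: windows start at byte 0 of a, quadruplet 0 of b. */
+  static __m128i sad_8x4(__m128i a, __m128i b)
+  {
+      return _mm_mpsadbw_epu8(a, b, 0x0);
+  }
+-->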
+<intrinsic tech="SSE4.1" vexEq="TRUE" name="_mm_stream_load_si128">
+ <type>Integer</type>
+ <CPUID>SSE4.1</CPUID>
+ <category>Load</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i *" varname="mem_addr" etype="M128" memwidth="128"/>
+ <description>Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint.
+ "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+ <operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+ </operation>
+ <instruction name="MOVNTDQA" form="xmm, m128" xed="MOVNTDQA_XMMdq_MEMdq"/>
+ <header>smmintrin.h</header>
+</intrinsic>
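+<!--
+A hedged C sketch of a non-temporal copy loop pairing the load above with
+MOVNTDQ stores. Both pointers must be 16-byte aligned, and the load hint
+mainly pays off on write-combining (e.g. video) memory. Helper name is
+illustrative:
+
+  #include <smmintrin.h>  /* SSE4.1 */
+  #include <stddef.h>
+
+  static void stream_copy(__m128i *dst, const __m128i *src, size_t n)
+  {
+      for (size_t i = 0; i < n; i++)
+          _mm_stream_si128(&dst[i], _mm_stream_load_si128((__m128i *)&src[i]));
+      _mm_sfence();  /* order the non-temporal stores before later stores */
+  }
+-->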
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpistrm">
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated mask in "dst".
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF a[m+size-1:m] == 0
+ aInvalid := 1
+ FI
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	n := i*size
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+ IF bInvalid // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+IF imm8[6] // byte / word mask
+ FOR i := 0 to UpperBound
+ j := i*size
+ IF IntRes2[i]
+ dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF)
+ ELSE
+ dst[j+size-1:j] := 0
+ FI
+ ENDFOR
+ELSE // bit mask
+ dst[UpperBound:0] := IntRes2[UpperBound:0]
+ dst[127:UpperBound+1] := 0
+FI
+ </operation>
+ <instruction name="PCMPISTRM" form="xmm, xmm, imm8" xed="PCMPISTRM_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpistri">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated index in "dst".
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF a[m+size-1:m] == 0
+ aInvalid := 1
+ FI
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	n := i*size
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+ IF bInvalid // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+IF imm8[6] // most significant bit
+ tmp := UpperBound
+ dst := tmp
+	DO WHILE ((tmp &gt;= 0) AND IntRes2[tmp] == 0)
+ tmp := tmp - 1
+ dst := tmp
+ OD
+ELSE // least significant bit
+ tmp := 0
+ dst := tmp
+	DO WHILE ((tmp &lt;= UpperBound) AND IntRes2[tmp] == 0)
+ tmp := tmp + 1
+ dst := tmp
+ OD
+FI
+ </operation>
+ <instruction name="PCMPISTRI" form="xmm, xmm, imm8" xed="PCMPISTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
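+<!--
+A hedged C sketch of the index-returning compare above in "equal any" mode:
+find the first byte of a 16-byte block that appears in a 16-byte set;
+returns 16 when nothing matches. The helper name is illustrative:
+
+  #include <nmmintrin.h>  /* SSE4.2 */
+
+  static int find_any(__m128i set, __m128i block)
+  {
+      return _mm_cmpistri(set, block,
+                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
+                          _SIDD_LEAST_SIGNIFICANT);
+  }
+-->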
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpistrz">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128" hint="TRUE"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+  <description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and return 1 if any character in "b" was null, and 0 otherwise.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+bInvalid := 0
+FOR j := 0 to UpperBound
+ n := j*size
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+ENDFOR
+dst := bInvalid
+ </operation>
+ <instruction name="PCMPISTRI" form="xmm, xmm, imm8" xed="PCMPISTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpistrc">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+  <description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and return 1 if the resulting mask was non-zero, and 0 otherwise.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF a[m+size-1:m] == 0
+ aInvalid := 1
+ FI
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	n := i*size
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+ IF bInvalid // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+dst := (IntRes2 != 0)
+ </operation>
+ <instruction name="PCMPISTRI" form="xmm, xmm, imm8" xed="PCMPISTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpistrs">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128" hint="TRUE"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and return 1 if any character in "a" was null, and 0 otherwise.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+aInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ IF a[m+size-1:m] == 0
+ aInvalid := 1
+ FI
+ENDFOR
+dst := aInvalid
+ </operation>
+ <instruction name="PCMPISTRI" form="xmm, xmm, imm8" xed="PCMPISTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpistro">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and return bit 0 of the resulting bit mask.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF a[m+size-1:m] == 0
+ aInvalid := 1
+ FI
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	n := i*size
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+ IF bInvalid // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+dst := IntRes2[0]
+ </operation>
+ <instruction name="PCMPISTRI" form="xmm, xmm, imm8" xed="PCMPISTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpistra">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and return 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF a[m+size-1:m] == 0
+ aInvalid := 1
+ FI
+ IF b[n+size-1:n] == 0
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	n := i*size
+	IF b[n+size-1:n] == 0
+		bInvalid := 1
+	FI
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF bInvalid // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+// output
+dst := (IntRes2 == 0) AND (bInvalid == 0)
+ </operation>
+ <instruction name="PCMPISTRI" form="xmm, xmm, imm8" xed="PCMPISTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
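+<!-- Illustrative C sketch (not part of the intrinsics data): how the
+     implicit-length flag intrinsics above combine in a scan loop. The
+     function name sse42_strlen is an assumption for illustration, and the
+     string is assumed readable in whole 16-byte chunks (padded buffers),
+     since each load may read past the terminator.
+
+#include <nmmintrin.h>
+#include <stddef.h>
+
+#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT)
+
+static size_t sse42_strlen(const char *s)
+{
+    /* An all-zero needle has implicit length 0, so with "equal each"
+       aggregation the first set bit of IntRes2 lands on the chunk's NUL. */
+    const __m128i zero = _mm_setzero_si128();
+    size_t off = 0;
+    for (;;) {
+        __m128i chunk = _mm_loadu_si128((const __m128i *)(s + off));
+        if (_mm_cmpistrz(zero, chunk, MODE))   /* ZF: this chunk holds NUL */
+            return off + (size_t)_mm_cmpistri(zero, chunk, MODE);
+        off += 16;
+    }
+}
+-->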
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpestrm">
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="__m128i" varname="dst" etype="M128"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="la" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="int" varname="lb" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated mask in "dst".
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF i == la
+ aInvalid := 1
+ FI
+ IF j == lb
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+FOR i := 0 to UpperBound
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF i &gt;= lb // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+IF imm8[6] // byte / word mask
+ FOR i := 0 to UpperBound
+ j := i*size
+ IF IntRes2[i]
+ dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF)
+ ELSE
+ dst[j+size-1:j] := 0
+ FI
+ ENDFOR
+ELSE // bit mask
+ dst[UpperBound:0] := IntRes2[UpperBound:0]
+ dst[127:UpperBound+1] := 0
+FI
+ </operation>
+ <instruction name="PCMPESTRM" form="xmm, xmm, imm8" xed="PCMPESTRM_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
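+<!-- Illustrative C sketch (not part of the intrinsics data): the "ranges"
+     aggregation with an explicit-length unit-mask output. digit_mask is an
+     assumed name, and p must be readable as a full 16-byte chunk.
+
+#include <nmmintrin.h>
+
+/* Byte mask (0xFF per matching lane, via _SIDD_UNIT_MASK) of the ASCII
+   digits among the first len bytes of p; needle pairs are (low, high). */
+static __m128i digit_mask(const char *p, int len)
+{
+    const __m128i range = _mm_setr_epi8('0', '9', 0, 0, 0, 0, 0, 0,
+                                        0, 0, 0, 0, 0, 0, 0, 0);
+    __m128i chunk = _mm_loadu_si128((const __m128i *)p);
+    return _mm_cmpestrm(range, 2, chunk, len,
+                        _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);
+}
+-->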
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpestri">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="la" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="int" varname="lb" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+ <description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated index in "dst".
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF i == la
+ aInvalid := 1
+ FI
+ IF j == lb
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+FOR i := 0 to UpperBound
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF i &gt;= lb // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+IF imm8[6] // most significant bit
+ tmp := UpperBound
+ dst := tmp
+	DO WHILE ((tmp &gt;= 0) AND IntRes2[tmp] == 0)
+ tmp := tmp - 1
+ dst := tmp
+ OD
+ELSE // least significant bit
+ tmp := 0
+ dst := tmp
+	DO WHILE ((tmp &lt;= UpperBound) AND IntRes2[tmp] == 0)
+ tmp := tmp + 1
+ dst := tmp
+ OD
+FI
+ </operation>
+ <instruction name="PCMPESTRI" form="xmm, xmm, imm8" xed="PCMPESTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
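+<!-- Illustrative C sketch (not part of the intrinsics data): the
+     explicit-length index form. find_any is an assumed name; both buffers
+     are assumed readable as full 16-byte chunks.
+
+#include <nmmintrin.h>
+
+/* Index of the first of n "needle" bytes within the len valid bytes of p;
+   with _SIDD_LEAST_SIGNIFICANT the intrinsic returns 16 when nothing
+   matches (the scan of IntRes2 runs past UpperBound). */
+static int find_any(const char *p, int len, const char *needle, int n)
+{
+    __m128i set   = _mm_loadu_si128((const __m128i *)needle);
+    __m128i chunk = _mm_loadu_si128((const __m128i *)p);
+    return _mm_cmpestri(set, n, chunk, len,
+                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
+                        _SIDD_LEAST_SIGNIFICANT);
+}
+-->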
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpestrz">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128" hint="TRUE"/>
+ <parameter type="int" varname="la" etype="UI32" hint="TRUE"/>
+ <parameter type="__m128i" varname="b" etype="M128" hint="TRUE"/>
+ <parameter type="int" varname="lb" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and return 1 if any character in "b" was null, and 0 otherwise.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+dst := (lb &lt;= UpperBound)
+ </operation>
+ <instruction name="PCMPESTRI" form="xmm, xmm, imm8" xed="PCMPESTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpestrc">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="la" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="int" varname="lb" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and return 1 if the resulting mask was non-zero, and 0 otherwise.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF i == la
+ aInvalid := 1
+ FI
+ IF j == lb
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+FOR i := 0 to UpperBound
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF i &gt;= lb // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+dst := (IntRes2 != 0)
+ </operation>
+ <instruction name="PCMPESTRI" form="xmm, xmm, imm8" xed="PCMPESTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpestrs">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128" hint="TRUE"/>
+ <parameter type="int" varname="la" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="M128" hint="TRUE"/>
+ <parameter type="int" varname="lb" etype="UI32" hint="TRUE"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and return 1 if any character in "a" was null, and 0 otherwise.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+dst := (la &lt;= UpperBound)
+ </operation>
+ <instruction name="PCMPESTRI" form="xmm, xmm, imm8" xed="PCMPESTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpestro">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="la" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="int" varname="lb" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and return bit 0 of the resulting bit mask.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF i == la
+ aInvalid := 1
+ FI
+ IF j == lb
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+FOR i := 0 to UpperBound
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF i &gt;= lb // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+dst := IntRes2[0]
+ </operation>
+ <instruction name="PCMPESTRI" form="xmm, xmm, imm8" xed="PCMPESTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpestra">
+ <type>Flag</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>String Compare</category>
+ <return type="int" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="M128"/>
+ <parameter type="int" varname="la" etype="UI32"/>
+ <parameter type="__m128i" varname="b" etype="M128"/>
+ <parameter type="int" varname="lb" etype="UI32"/>
+ <parameter type="const int" varname="imm8" etype="IMM" immwidth="8"/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and return 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise.
+ [strcmp_note]</description>
+ <operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+BoolRes := 0
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+ m := i*size
+ FOR j := 0 to UpperBound
+ n := j*size
+ BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0
+
+ // invalidate characters after EOS
+ IF i == la
+ aInvalid := 1
+ FI
+ IF j == lb
+ bInvalid := 1
+ FI
+
+ // override comparisons for invalid characters
+ CASE (imm8[3:2]) OF
+ 0: // equal any
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 1: // ranges
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ FI
+ 2: // equal each
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ 3: // equal ordered
+ IF (!aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 0
+ ELSE IF (aInvalid &amp;&amp; !bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ ELSE IF (aInvalid &amp;&amp; bInvalid)
+ BoolRes.word[i].bit[j] := 1
+ FI
+ ESAC
+ ENDFOR
+ENDFOR
+// aggregate results
+CASE (imm8[3:2]) OF
+0: // equal any
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j]
+ ENDFOR
+ ENDFOR
+1: // ranges
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ FOR j := 0 to UpperBound
+ IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1])
+ j += 2
+ ENDFOR
+ ENDFOR
+2: // equal each
+ IntRes1 := 0
+ FOR i := 0 to UpperBound
+ IntRes1[i] := BoolRes.word[i].bit[i]
+ ENDFOR
+3: // equal ordered
+ IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+ FOR i := 0 to UpperBound
+ k := i
+ FOR j := 0 to UpperBound-i
+ IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j]
+ k := k+1
+ ENDFOR
+ ENDFOR
+ESAC
+// optionally negate results
+FOR i := 0 to UpperBound
+ IF imm8[4]
+ IF imm8[5] // only negate valid
+ IF i &gt;= lb // invalid, don't negate
+ IntRes2[i] := IntRes1[i]
+ ELSE // valid, negate
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // negate all
+ IntRes2[i] := -1 XOR IntRes1[i]
+ FI
+ ELSE // don't negate
+ IntRes2[i] := IntRes1[i]
+ FI
+ENDFOR
+// output
+dst := (IntRes2 == 0) AND (lb &gt; UpperBound)
+ </operation>
+ <instruction name="PCMPESTRI" form="xmm, xmm, imm8" xed="PCMPESTRI_XMMdq_XMMdq_IMMb"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" vexEq="TRUE" name="_mm_cmpgt_epi64">
+ <type>Integer</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>Compare</category>
+ <return type="__m128i" varname="dst" etype="UI64"/>
+ <parameter type="__m128i" varname="a" etype="SI64"/>
+ <parameter type="__m128i" varname="b" etype="SI64"/>
+ <description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*64
+ dst[i+63:i] := ( a[i+63:i] &gt; b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+ </operation>
+ <instruction name="PCMPGTQ" form="xmm, xmm" xed="PCMPGTQ_XMMdq_XMMdq"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
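+<!-- Illustrative C demonstration (not part of the intrinsics data) of the
+     all-ones/zero lane masks PCMPGTQ produces; the values are arbitrary.
+
+#include <nmmintrin.h>
+
+/* _mm_set_epi64x lists the high lane first: a = {high=5, low=-1},
+   b = {high=3, low=0}. 5 > 3, so the high lane is all ones; -1 < 0,
+   so the low lane is zero. The result is directly usable as a blend mask. */
+static __m128i gt64_demo(void)
+{
+    __m128i a = _mm_set_epi64x(5, -1);
+    __m128i b = _mm_set_epi64x(3, 0);
+    return _mm_cmpgt_epi64(a, b);
+}
+-->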
+<intrinsic tech="SSE4.2" name="_mm_crc32_u8">
+ <type>Integer</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>Cryptography</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="crc" etype="UI32"/>
+ <parameter type="unsigned char" varname="v" etype="UI8"/>
+ <description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst".</description>
+ <operation>tmp1[7:0] := v[0:7] // bit reflection
+tmp2[31:0] := crc[0:31] // bit reflection
+tmp3[39:0] := tmp1[7:0] &lt;&lt; 32
+tmp4[39:0] := tmp2[31:0] &lt;&lt; 8
+tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0]
+tmp6[31:0] := MOD2(tmp5[39:0], 0x11EDC6F41) // remainder from polynomial division modulus 2
+dst[31:0] := tmp6[0:31] // bit reflection
+ </operation>
+ <instruction name="CRC32" form="r32, r8" xed="CRC32_GPRyy_GPR8b"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" name="_mm_crc32_u16">
+ <type>Integer</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>Cryptography</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="crc" etype="UI32"/>
+ <parameter type="unsigned short" varname="v" etype="UI16"/>
+ <description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst".</description>
+ <operation>tmp1[15:0] := v[0:15] // bit reflection
+tmp2[31:0] := crc[0:31] // bit reflection
+tmp3[47:0] := tmp1[15:0] &lt;&lt; 32
+tmp4[47:0] := tmp2[31:0] &lt;&lt; 16
+tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0]
+tmp6[31:0] := MOD2(tmp5[47:0], 0x11EDC6F41) // remainder from polynomial division modulus 2
+dst[31:0] := tmp6[0:31] // bit reflection
+ </operation>
+ <instruction name="CRC32" form="r32, r16" xed="CRC32_GPRyy_GPRv"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" name="_mm_crc32_u32">
+ <type>Integer</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>Cryptography</category>
+ <return type="unsigned int" varname="dst" etype="UI32"/>
+ <parameter type="unsigned int" varname="crc" etype="UI32"/>
+ <parameter type="unsigned int" varname="v" etype="UI32"/>
+ <description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst".</description>
+ <operation>tmp1[31:0] := v[0:31] // bit reflection
+tmp2[31:0] := crc[0:31] // bit reflection
+tmp3[63:0] := tmp1[31:0] &lt;&lt; 32
+tmp4[63:0] := tmp2[31:0] &lt;&lt; 32
+tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0]
+tmp6[31:0] := MOD2(tmp5[63:0], 0x11EDC6F41) // remainder from polynomial division modulus 2
+dst[31:0] := tmp6[0:31] // bit reflection
+ </operation>
+ <instruction name="CRC32" form="r32, r32" xed="CRC32_GPRyy_GPRv"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSE4.2" name="_mm_crc32_u64">
+ <type>Integer</type>
+ <CPUID>SSE4.2</CPUID>
+ <category>Cryptography</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="crc" etype="UI64"/>
+ <parameter type="unsigned __int64" varname="v" etype="UI64"/>
+ <description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst".</description>
+ <operation>tmp1[63:0] := v[0:63] // bit reflection
+tmp2[31:0] := crc[0:31] // bit reflection
+tmp3[95:0] := tmp1[63:0] &lt;&lt; 32
+tmp4[95:0] := tmp2[31:0] &lt;&lt; 64
+tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0]
+tmp6[31:0] := MOD2(tmp5[95:0], 0x11EDC6F41) // remainder from polynomial division modulus 2
+dst[31:0] := tmp6[0:31] // bit reflection
+ </operation>
+ <instruction name="CRC32" form="r64, r64" xed="CRC32_GPRyy_GPRv"/>
+ <header>nmmintrin.h</header>
+</intrinsic>
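+<!-- Illustrative C sketch (not part of the intrinsics data) chaining the
+     CRC32 intrinsics above over a buffer. x86-64 only, since _mm_crc32_u64
+     needs 64-bit GPRs; crc32c is an assumed name. The pre/post inversion
+     follows the usual CRC-32C (Castagnoli) convention, e.g. iSCSI.
+
+#include <nmmintrin.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+static uint32_t crc32c(const void *data, size_t n)
+{
+    const uint8_t *p = (const uint8_t *)data;
+    uint64_t crc = 0xFFFFFFFFu;
+    while (n >= 8) {                       /* 8 bytes per CRC32 step */
+        uint64_t chunk;
+        memcpy(&chunk, p, 8);              /* safe unaligned load */
+        crc = _mm_crc32_u64(crc, chunk);
+        p += 8;
+        n -= 8;
+    }
+    while (n > 0) {                        /* byte tail */
+        crc = _mm_crc32_u8((uint32_t)crc, *p);
+        p += 1;
+        n -= 1;
+    }
+    return (uint32_t)crc ^ 0xFFFFFFFFu;
+}
+-->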
+<intrinsic tech="SSSE3" name="_mm_abs_pi8">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ dst[i+7:i] := ABS(Int(a[i+7:i]))
+ENDFOR
+ </operation>
+ <instruction name="PABSB" form="mm, mm" xed="PABSB_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_abs_epi8">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ dst[i+7:i] := ABS(a[i+7:i])
+ENDFOR
+ </operation>
+ <instruction name="PABSB" form="xmm, xmm" xed="PABSB_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_abs_pi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := ABS(Int(a[i+15:i]))
+ENDFOR
+ </operation>
+ <instruction name="PABSW" form="mm, mm" xed="PABSW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_abs_epi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := ABS(a[i+15:i])
+ENDFOR
+ </operation>
+ <instruction name="PABSW" form="xmm, xmm" xed="PABSW_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_abs_pi32">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m64" varname="dst" etype="UI32"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ dst[i+31:i] := ABS(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="PABSD" form="mm, mm" xed="PABSD_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_abs_epi32">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Special Math Functions</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ dst[i+31:i] := ABS(a[i+31:i])
+ENDFOR
+ </operation>
+ <instruction name="PABSD" form="xmm, xmm" xed="PABSD_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
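+<!-- Illustrative C example (not part of the intrinsics data) for the PABS
+     family, highlighting the corner case the unsigned result type hints at.
+
+#include <tmmintrin.h>
+
+/* ABS(-128) does not fit a signed 8-bit lane: PABSB returns it unchanged
+   as 0x80, i.e. 128 when read as unsigned. There is no saturating form. */
+static __m128i abs_demo(void)
+{
+    __m128i a = _mm_setr_epi8(-1, -128, 7, 0, 0, 0, 0, 0,
+                              0, 0, 0, 0, 0, 0, 0, 0);
+    return _mm_abs_epi8(a);   /* lanes 0..2 become 1, 128, 7 */
+}
+-->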
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_shuffle_epi8">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Swizzle</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[3:0] := b[i+3:i]
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSHUFB" form="xmm, xmm" xed="PSHUFB_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_shuffle_pi8">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Swizzle</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ IF b[i+7] == 1
+ dst[i+7:i] := 0
+ ELSE
+ index[2:0] := b[i+2:i]
+ dst[i+7:i] := a[index*8+7:index*8]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSHUFB" form="mm, mm" xed="PSHUFB_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
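+<!-- Illustrative C sketch (not part of the intrinsics data) of the common
+     PSHUFB use: a constant control vector as a per-byte permutation table.
+
+#include <tmmintrin.h>
+
+/* Reverse the 16 bytes of v: control byte j selects source index 15-j;
+   a control byte with its high bit set would zero that lane instead. */
+static __m128i reverse_bytes(__m128i v)
+{
+    const __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
+                                      7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm_shuffle_epi8(v, rev);
+}
+-->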
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_alignr_epi8">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="5"/>
+ <description>Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst".</description>
+ <operation>
+tmp[255:0] := ((a[127:0] &lt;&lt; 128)[255:0] OR b[127:0]) &gt;&gt; (imm8*8)
+dst[127:0] := tmp[127:0]
+ </operation>
+ <instruction name="PALIGNR" form="xmm, xmm, imm8" xed="PALIGNR_XMMdq_XMMdq_IMMb"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_alignr_pi8">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Miscellaneous</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="UI8"/>
+ <parameter type="int" varname="imm8" etype="IMM" immwidth="4"/>
+	<description>Concatenate 8-byte blocks in "a" and "b" into a 16-byte temporary result, shift the result right by "imm8" bytes, and store the low 8 bytes in "dst".</description>
+ <operation>
+tmp[127:0] := ((a[63:0] &lt;&lt; 64)[127:0] OR b[63:0]) &gt;&gt; (imm8*8)
+dst[63:0] := tmp[63:0]
+ </operation>
+ <instruction name="PALIGNR" form="mm, mm, imm8" xed="PALIGNR_MMXq_MMXq_IMMb"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
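+<!-- Illustrative C sketch (not part of the intrinsics data) of PALIGNR as a
+     sliding window across two consecutive loads, its most common use in
+     unaligned streaming kernels; window_at_5 is an assumed name.
+
+#include <tmmintrin.h>
+
+/* Bytes 5..20 of the 32-byte concatenation hi:lo ("lo" supplies the low
+   half, per the operation above), i.e. a 16-byte window at offset 5. */
+static __m128i window_at_5(__m128i lo, __m128i hi)
+{
+    return _mm_alignr_epi8(hi, lo, 5);
+}
+-->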
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_hadd_epi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := a[31:16] + a[15:0]
+dst[31:16] := a[63:48] + a[47:32]
+dst[47:32] := a[95:80] + a[79:64]
+dst[63:48] := a[127:112] + a[111:96]
+dst[79:64] := b[31:16] + b[15:0]
+dst[95:80] := b[63:48] + b[47:32]
+dst[111:96] := b[95:80] + b[79:64]
+dst[127:112] := b[127:112] + b[111:96]
+ </operation>
+ <instruction name="PHADDW" form="xmm, xmm" xed="PHADDW_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_hadds_epi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[31:16] + a[15:0])
+dst[31:16] := Saturate16(a[63:48] + a[47:32])
+dst[47:32] := Saturate16(a[95:80] + a[79:64])
+dst[63:48] := Saturate16(a[127:112] + a[111:96])
+dst[79:64] := Saturate16(b[31:16] + b[15:0])
+dst[95:80] := Saturate16(b[63:48] + b[47:32])
+dst[111:96] := Saturate16(b[95:80] + b[79:64])
+dst[127:112] := Saturate16(b[127:112] + b[111:96])
+ </operation>
+ <instruction name="PHADDSW" form="xmm, xmm" xed="PHADDSW_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_hadd_epi32">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := a[127:96] + a[95:64]
+dst[95:64] := b[63:32] + b[31:0]
+dst[127:96] := b[127:96] + b[95:64]
+ </operation>
+ <instruction name="PHADDD" form="xmm, xmm" xed="PHADDD_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
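+<!-- Illustrative C sketch (not part of the intrinsics data) of the usual
+     PHADDD reduction idiom: each horizontal add halves the number of
+     distinct partial sums.
+
+#include <tmmintrin.h>
+
+static int hsum_epi32(__m128i v)          /* sum of the four i32 lanes */
+{
+    v = _mm_hadd_epi32(v, v);             /* {a+b, c+d, a+b, c+d} */
+    v = _mm_hadd_epi32(v, v);             /* every lane now a+b+c+d */
+    return _mm_cvtsi128_si32(v);
+}
+-->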
+<intrinsic tech="SSSE3" name="_mm_hadd_pi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := a[31:16] + a[15:0]
+dst[31:16] := a[63:48] + a[47:32]
+dst[47:32] := b[31:16] + b[15:0]
+dst[63:48] := b[63:48] + b[47:32]
+ </operation>
+ <instruction name="PHADDW" form="mm, mm" xed="PHADDW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_hadd_pi32">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="SI32"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+ <description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+ <operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := b[63:32] + b[31:0]
+ </operation>
+ <instruction name="PHADDW" form="mm, mm" xed="PHADDW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_hadds_pi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[31:16] + a[15:0])
+dst[31:16] := Saturate16(a[63:48] + a[47:32])
+dst[47:32] := Saturate16(b[31:16] + b[15:0])
+dst[63:48] := Saturate16(b[63:48] + b[47:32])
+ </operation>
+ <instruction name="PHADDSW" form="mm, mm" xed="PHADDSW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_hsub_epi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := a[15:0] - a[31:16]
+dst[31:16] := a[47:32] - a[63:48]
+dst[47:32] := a[79:64] - a[95:80]
+dst[63:48] := a[111:96] - a[127:112]
+dst[79:64] := b[15:0] - b[31:16]
+dst[95:80] := b[47:32] - b[63:48]
+dst[111:96] := b[79:64] - b[95:80]
+dst[127:112] := b[111:96] - b[127:112]
+ </operation>
+ <instruction name="PHSUBW" form="xmm, xmm" xed="PHSUBW_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_hsubs_epi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[15:0] - a[31:16])
+dst[31:16] := Saturate16(a[47:32] - a[63:48])
+dst[47:32] := Saturate16(a[79:64] - a[95:80])
+dst[63:48] := Saturate16(a[111:96] - a[127:112])
+dst[79:64] := Saturate16(b[15:0] - b[31:16])
+dst[95:80] := Saturate16(b[47:32] - b[63:48])
+dst[111:96] := Saturate16(b[79:64] - b[95:80])
+dst[127:112] := Saturate16(b[111:96] - b[127:112])
+ </operation>
+ <instruction name="PHSUBSW" form="xmm, xmm" xed="PHSUBSW_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_hsub_epi32">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+ <description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := a[95:64] - a[127:96]
+dst[95:64] := b[31:0] - b[63:32]
+dst[127:96] := b[95:64] - b[127:96]
+ </operation>
+ <instruction name="PHSUBD" form="xmm, xmm" xed="PHSUBD_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_hsub_pi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := a[15:0] - a[31:16]
+dst[31:16] := a[47:32] - a[63:48]
+dst[47:32] := b[15:0] - b[31:16]
+dst[63:48] := b[47:32] - b[63:48]
+ </operation>
+ <instruction name="PHSUBW" form="mm, mm" xed="PHSUBW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_hsub_pi32">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="SI32"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+ <description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+ <operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := b[31:0] - b[63:32]
+ </operation>
+ <instruction name="PHSUBD" form="mm, mm" xed="PHSUBD_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_hsubs_pi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+ <operation>
+dst[15:0] := Saturate16(a[15:0] - a[31:16])
+dst[31:16] := Saturate16(a[47:32] - a[63:48])
+dst[47:32] := Saturate16(b[15:0] - b[31:16])
+dst[63:48] := Saturate16(b[47:32] - b[63:48])
+ </operation>
+ <instruction name="PHSUBSW" form="mm, mm" xed="PHSUBSW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_maddubs_epi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="SI16"/>
+ <parameter type="__m128i" varname="a" etype="UI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+ <description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ENDFOR
+ </operation>
+ <instruction name="PMADDUBSW" form="xmm, xmm" xed="PMADDUBSW_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
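+<!-- Illustrative C sketch (not part of the intrinsics data) of the standard
+     u8 x s8 dot-product step built on PMADDUBSW; dot_step is an assumed name.
+
+#include <tmmintrin.h>
+
+/* PMADDUBSW treats its first operand as unsigned and its second as signed,
+   producing saturating pairwise i16 sums; PMADDWD against ones then widens
+   adjacent pairs into four i32 partial sums. */
+static __m128i dot_step(__m128i pix_u8, __m128i wts_s8)
+{
+    __m128i s16 = _mm_maddubs_epi16(pix_u8, wts_s8);
+    return _mm_madd_epi16(s16, _mm_set1_epi16(1));
+}
+-->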
+<intrinsic tech="SSSE3" name="_mm_maddubs_pi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="SI16"/>
+ <parameter type="__m64" varname="a" etype="UI8"/>
+ <parameter type="__m64" varname="b" etype="SI8"/>
+ <description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ENDFOR
+ </operation>
+ <instruction name="PMADDUBSW" form="mm, mm" xed="PMADDUBSW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_mulhrs_epi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst".</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ENDFOR
+ </operation>
+ <instruction name="PMULHRSW" form="xmm, xmm" xed="PMULHRSW_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
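+<!-- Illustrative C wrapper (not part of the intrinsics data) making the
+     [16:1] bit-slice above concrete: PMULHRSW is a rounding Q15 multiply.
+
+#include <tmmintrin.h>
+
+/* Per lane: (a*b + 0x4000) >> 15, i.e. a Q15 product rounded to nearest,
+   equivalent to the truncate, add 1, take bits [16:1] steps in the data. */
+static __m128i q15_mul(__m128i a, __m128i b)
+{
+    return _mm_mulhrs_epi16(a, b);
+}
+-->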
+<intrinsic tech="SSSE3" name="_mm_mulhrs_pi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+ <description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst".</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) &gt;&gt; 14) + 1
+ dst[i+15:i] := tmp[16:1]
+ENDFOR
+ </operation>
+ <instruction name="PMULHRSW" form="mm, mm" xed="PMULHRSW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_sign_epi8">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI8"/>
+ <parameter type="__m128i" varname="a" etype="SI8"/>
+ <parameter type="__m128i" varname="b" etype="SI8"/>
+	<description>Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 15
+ i := j*8
+ IF b[i+7:i] &lt; 0
+ dst[i+7:i] := -(a[i+7:i])
+ ELSE IF b[i+7:i] == 0
+ dst[i+7:i] := 0
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSIGNB" form="xmm, xmm" xed="PSIGNB_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_sign_epi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI16"/>
+ <parameter type="__m128i" varname="a" etype="SI16"/>
+ <parameter type="__m128i" varname="b" etype="SI16"/>
+	<description>Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*16
+ IF b[i+15:i] &lt; 0
+ dst[i+15:i] := -(a[i+15:i])
+ ELSE IF b[i+15:i] == 0
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSIGNW" form="xmm, xmm" xed="PSIGNW_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" vexEq="TRUE" name="_mm_sign_epi32">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m128i" varname="dst" etype="UI32"/>
+ <parameter type="__m128i" varname="a" etype="SI32"/>
+ <parameter type="__m128i" varname="b" etype="SI32"/>
+	<description>Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*32
+ IF b[i+31:i] &lt; 0
+ dst[i+31:i] := -(a[i+31:i])
+ ELSE IF b[i+31:i] == 0
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSIGND" form="xmm, xmm" xed="PSIGND_XMMdq_XMMdq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
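+<!-- Illustrative C demonstration (not part of the intrinsics data) of
+     PSIGN's three-way behavior; the values are arbitrary.
+
+#include <tmmintrin.h>
+
+static __m128i sign_demo(void)
+{
+    __m128i a = _mm_setr_epi16(10, 20, 30, 40, 0, 0, 0, 0);
+    __m128i s = _mm_setr_epi16(-1,  0,  5, -7, 0, 0, 0, 0);
+    /* lanes 0..3 become -10 (negated), 0 (zeroed), 30 (kept), -40 */
+    return _mm_sign_epi16(a, s);
+}
+-->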
+<intrinsic tech="SSSE3" name="_mm_sign_pi8">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="UI8"/>
+ <parameter type="__m64" varname="a" etype="SI8"/>
+ <parameter type="__m64" varname="b" etype="SI8"/>
+	<description>Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 7
+ i := j*8
+ IF b[i+7:i] &lt; 0
+ dst[i+7:i] := -(a[i+7:i])
+ ELSE IF b[i+7:i] == 0
+ dst[i+7:i] := 0
+ ELSE
+ dst[i+7:i] := a[i+7:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSIGNB" form="mm, mm" xed="PSIGNB_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_sign_pi16">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="UI16"/>
+ <parameter type="__m64" varname="a" etype="SI16"/>
+ <parameter type="__m64" varname="b" etype="SI16"/>
+	<description>Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 3
+ i := j*16
+ IF b[i+15:i] &lt; 0
+ dst[i+15:i] := -(a[i+15:i])
+ ELSE IF b[i+15:i] == 0
+ dst[i+15:i] := 0
+ ELSE
+ dst[i+15:i] := a[i+15:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSIGNW" form="mm, mm" xed="PSIGNW_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="SSSE3" name="_mm_sign_pi32">
+ <type>Integer</type>
+ <CPUID>SSSE3</CPUID>
+ <category>Arithmetic</category>
+ <return type="__m64" varname="dst" etype="UI32"/>
+ <parameter type="__m64" varname="a" etype="SI32"/>
+ <parameter type="__m64" varname="b" etype="SI32"/>
+	<description>Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+ <operation>
+FOR j := 0 to 1
+ i := j*32
+ IF b[i+31:i] &lt; 0
+ dst[i+31:i] := -(a[i+31:i])
+ ELSE IF b[i+31:i] == 0
+ dst[i+31:i] := 0
+ ELSE
+ dst[i+31:i] := a[i+31:i]
+ FI
+ENDFOR
+ </operation>
+ <instruction name="PSIGND" form="mm, mm" xed="PSIGND_MMXq_MMXq"/>
+ <header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_rdtsc">
+ <CPUID>TSC</CPUID>
+ <category>General Support</category>
+ <return type="__int64" varname="dst" etype="UI64"/>
+ <parameter type="void"/>
+ <description>Copy the current 64-bit value of the processor's time-stamp counter into "dst".</description>
+ <operation>dst[63:0] := TimeStampCounter
+ </operation>
+ <instruction name="RDTSC" xed="RDTSC"/>
+ <header>immintrin.h</header>
+</intrinsic>
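+<!-- Illustrative sketch, not part of the Intel data: Rust exposes this intrinsic
+     as core::arch::x86_64::_rdtsc (returning u64), e.g. for a rough cycle count:
+       let start = unsafe { core::arch::x86_64::_rdtsc() };
+       do_work(); // hypothetical workload
+       let cycles = unsafe { core::arch::x86_64::_rdtsc() } - start;
+-->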
+<intrinsic tech="Other" name="_xsusldtrk">
+ <CPUID>TSXLDTRK</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+	<description>Mark the start of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a transactional region, subsequent loads are not added to the read set of the transaction. If this is used inside a suspend load address tracking region, it will cause a transaction abort. If this is used outside of a transactional region, it behaves like a NOP.</description>
+ <instruction name="XSUSLDTRK" xed="XSUSLDTRK"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xresldtrk">
+ <CPUID>TSXLDTRK</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+	<description>Mark the end of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a suspend load address tracking region, it will end the suspend region, and all following load addresses will be added to the transaction read set. If this is used inside an active transaction but not in a suspend region, it will cause a transaction abort. If this is used outside of a transactional region, it behaves like a NOP.</description>
+ <instruction name="XRESLDTRK" xed="XRESLDTRK"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_aesenclast_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>VAES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="__m256i" varname="RoundKey" etype="M128"/>
+	<description>Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*128
+ a[i+127:i] := ShiftRows(a[i+127:i])
+ a[i+127:i] := SubBytes(a[i+127:i])
+ dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VAESENCLAST" form="ymm, ymm" xed="VAESENCLAST_YMMu128_YMMu128_YMMu128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_aesenc_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>VAES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="__m256i" varname="RoundKey" etype="M128"/>
+	<description>Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*128
+ a[i+127:i] := ShiftRows(a[i+127:i])
+ a[i+127:i] := SubBytes(a[i+127:i])
+ a[i+127:i] := MixColumns(a[i+127:i])
+ dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VAESENC" form="ymm, ymm" xed="VAESENC_YMMu128_YMMu128_YMMu128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_aesdeclast_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>VAES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="__m256i" varname="RoundKey" etype="M128"/>
+ <description>Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*128
+ a[i+127:i] := InvShiftRows(a[i+127:i])
+ a[i+127:i] := InvSubBytes(a[i+127:i])
+ dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VAESDECLAST" form="ymm, ymm" xed="VAESDECLAST_YMMu128_YMMu128_YMMu128"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_mm256_aesdec_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>VAES</CPUID>
+ <category>Cryptography</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="a" etype="M128"/>
+ <parameter type="__m256i" varname="RoundKey" etype="M128"/>
+ <description>Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst".</description>
+ <operation>FOR j := 0 to 1
+ i := j*128
+ a[i+127:i] := InvShiftRows(a[i+127:i])
+ a[i+127:i] := InvSubBytes(a[i+127:i])
+ a[i+127:i] := InvMixColumns(a[i+127:i])
+ dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i]
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VAESDEC" form="ymm, ymm" xed="VAESDEC_YMMu128_YMMu128_YMMu128"/>
+ <header>immintrin.h</header>
+</intrinsic>
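+<!-- Note, not part of the Intel data: the four VAES intrinsics above each apply one
+     AES round step ((Inv)ShiftRows and (Inv)SubBytes, optionally (Inv)MixColumns,
+     plus the round-key XOR) independently to both 128-bit lanes, so a single
+     256-bit instruction advances two AES blocks at once. -->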
+<intrinsic tech="Other" name="_mm512_clmulepi64_epi128">
+ <type>Integer</type>
+ <CPUID>VPCLMULQDQ</CPUID>
+ <category>Application-Targeted</category>
+ <return type="__m512i" varname="dst" etype="M128"/>
+ <parameter type="__m512i" varname="b" etype="M128"/>
+ <parameter type="__m512i" varname="c" etype="M128"/>
+ <parameter type="const int" varname="Imm8" etype="IMM" immwidth="8"/>
+	<description>Carry-less multiplication of one quadword of 'b' by one quadword of 'c', storing the 128-bit result in 'dst'. The immediate 'Imm8' is used to determine which quadwords of 'b' and 'c' should be used.</description>
+ <operation>
+DEFINE PCLMUL128(X,Y) {
+ FOR i := 0 to 63
+ TMP[i] := X[ 0 ] and Y[ i ]
+ FOR j := 1 to i
+ TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ])
+ ENDFOR
+ DEST[ i ] := TMP[ i ]
+ ENDFOR
+ FOR i := 64 to 126
+ TMP[i] := 0
+ FOR j := i - 63 to 63
+ TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ])
+ ENDFOR
+ DEST[ i ] := TMP[ i ]
+ ENDFOR
+ DEST[127] := 0
+ RETURN DEST // 128b vector
+}
+FOR i := 0 to 3
+ IF Imm8[0] == 0
+ TEMP1 := b.m128[i].qword[0]
+ ELSE
+ TEMP1 := b.m128[i].qword[1]
+ FI
+ IF Imm8[4] == 0
+ TEMP2 := c.m128[i].qword[0]
+ ELSE
+ TEMP2 := c.m128[i].qword[1]
+ FI
+ dst.m128[i] := PCLMUL128(TEMP1, TEMP2)
+ENDFOR
+dst[MAX:512] := 0
+ </operation>
+ <instruction name="VPCLMULQDQ" form="zmm, zmm, zmm, imm8" xed="VPCLMULQDQ_ZMMu128_ZMMu64_ZMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
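+<!-- Worked example, not part of the Intel data: PCLMUL128 above is polynomial
+     multiplication over GF(2). With X = 0b11 (x + 1) and Y = 0b101 (x^2 + 1) it
+     yields (x + 1)(x^2 + 1) = x^3 + x^2 + x + 1 = 0b1111; XOR takes the place of
+     the carries of ordinary multiplication. -->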
+<intrinsic tech="Other" name="_mm256_clmulepi64_epi128">
+ <type>Integer</type>
+ <CPUID>AVX512VL</CPUID>
+ <CPUID>VPCLMULQDQ</CPUID>
+ <category>Application-Targeted</category>
+ <return type="__m256i" varname="dst" etype="M128"/>
+ <parameter type="__m256i" varname="b" etype="M128"/>
+ <parameter type="__m256i" varname="c" etype="M128"/>
+ <parameter type="const int" varname="Imm8" etype="IMM" immwidth="8"/>
+	<description>Carry-less multiplication of one quadword of 'b' by one quadword of 'c', storing the 128-bit result in 'dst'. The immediate 'Imm8' is used to determine which quadwords of 'b' and 'c' should be used.</description>
+ <operation>
+DEFINE PCLMUL128(X,Y) {
+ FOR i := 0 to 63
+ TMP[i] := X[ 0 ] and Y[ i ]
+ FOR j := 1 to i
+ TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ])
+ ENDFOR
+ DEST[ i ] := TMP[ i ]
+ ENDFOR
+ FOR i := 64 to 126
+ TMP[i] := 0
+ FOR j := i - 63 to 63
+ TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ])
+ ENDFOR
+ DEST[ i ] := TMP[ i ]
+ ENDFOR
+ DEST[127] := 0
+ RETURN DEST // 128b vector
+}
+FOR i := 0 to 1
+ IF Imm8[0] == 0
+ TEMP1 := b.m128[i].qword[0]
+ ELSE
+ TEMP1 := b.m128[i].qword[1]
+ FI
+ IF Imm8[4] == 0
+ TEMP2 := c.m128[i].qword[0]
+ ELSE
+ TEMP2 := c.m128[i].qword[1]
+ FI
+ dst.m128[i] := PCLMUL128(TEMP1, TEMP2)
+ENDFOR
+dst[MAX:256] := 0
+ </operation>
+ <instruction name="VPCLMULQDQ" form="ymm, ymm, ymm, imm8" xed="VPCLMULQDQ_YMMu128_YMMu64_YMMu64_IMM8_AVX512"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_tpause">
+ <type>Flag</type>
+ <CPUID>WAITPKG</CPUID>
+ <category>Miscellaneous</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned int" varname="ctrl" etype="UI32"/>
+ <parameter type="unsigned __int64" varname="counter" etype="UI64"/>
+ <description>Directs the processor to enter an implementation-dependent optimized state until the TSC reaches or exceeds the value specified in "counter". Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF).</description>
+ <instruction name="TPAUSE" form="r32" xed="TPAUSE_GPR32u32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_umwait">
+ <type>Flag</type>
+ <CPUID>WAITPKG</CPUID>
+ <category>Miscellaneous</category>
+ <return type="unsigned char" varname="dst" etype="UI8"/>
+ <parameter type="unsigned int" varname="ctrl" etype="UI32"/>
+ <parameter type="unsigned __int64" varname="counter" etype="UI64"/>
+ <description>Directs the processor to enter an implementation-dependent optimized state while monitoring a range of addresses. The instruction wakes up when the TSC reaches or exceeds the value specified in "counter" (if the monitoring hardware did not trigger beforehand). Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF).</description>
+ <instruction name="UMWAIT" form="r32" xed="UMWAIT_GPR32"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_umonitor">
+ <CPUID>WAITPKG</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="void*" varname="a"/>
+	<description>Sets up a linear address range to be monitored by hardware and activates the monitor. The address range should be a writeback memory caching type. The address is contained in "a".</description>
+ <instruction name="UMONITOR" form="r16/r32/r64" xed="UMONITOR_GPRa"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_wbnoinvd">
+ <CPUID>WBNOINVD</CPUID>
+ <category>Miscellaneous</category>
+ <return type="void"/>
+ <parameter type="void"/>
+	<description>Write back and do not flush internal caches. Initiate writing back of external caches without flushing.</description>
+ <instruction name="WBNOINVD" xed="WBNOINVD"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xgetbv">
+ <CPUID>XSAVE</CPUID>
+ <category>OS-Targeted</category>
+ <return type="unsigned __int64" varname="dst" etype="UI64"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+	<description>Copy up to 64 bits from the value of the extended control register (XCR) specified by "a" into "dst". Currently only XFEATURE_ENABLED_MASK XCR is supported.</description>
+ <operation>dst[63:0] := XCR[a]
+ </operation>
+ <instruction name="XGETBV" xed="XGETBV"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xrstor">
+ <CPUID>XSAVE</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="rs_mask" etype="UI64"/>
+ <description>Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>st_mask := mem_addr.HEADER.XSTATE_BV[62:0]
+FOR i := 0 to 62
+ IF (rs_mask[i] AND XCR0[i])
+ IF st_mask[i]
+ CASE (i) OF
+ 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU]
+ 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE]
+ DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
+ ESAC
+ ELSE
+ // ProcessorExtendedState := Processor Supplied Values
+ CASE (i) OF
+ 1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
+ ESAC
+ FI
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XRSTOR" form="m8" xed="XRSTOR_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xrstor64">
+ <CPUID>XSAVE</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="rs_mask" etype="UI64"/>
+ <description>Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>st_mask := mem_addr.HEADER.XSTATE_BV[62:0]
+FOR i := 0 to 62
+ IF (rs_mask[i] AND XCR0[i])
+ IF st_mask[i]
+ CASE (i) OF
+ 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU]
+ 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE]
+ DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
+ ESAC
+ ELSE
+ // ProcessorExtendedState := Processor Supplied Values
+ CASE (i) OF
+ 1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
+ ESAC
+ FI
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XRSTOR64" form="m8" xed="XRSTOR64_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xsave">
+ <CPUID>XSAVE</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="save_mask" etype="UI64"/>
+ <description>Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>mask[62:0] := save_mask[62:0] AND XCR0[62:0]
+FOR i := 0 to 62
+ IF mask[i]
+ CASE (i) OF
+ 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU]
+ 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
+ DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+ ESAC
+ mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XSAVE" form="m8" xed="XSAVE_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
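+<!-- Illustrative sketch, not part of the Intel data: Rust exposes this as
+     core::arch::x86_64::_xsave(mem_addr: *mut u8, save_mask: u64). Assuming
+     "xsave" support and a sufficiently large buffer (the real size should be
+     queried via CPUID; 4096 here is a hypothetical placeholder):
+       #[repr(align(64))]
+       struct XsaveArea([u8; 4096]);
+       let mut area = XsaveArea([0; 4096]);
+       unsafe { _xsave(area.0.as_mut_ptr(), 0b11) }; // bits 0 and 1: x87 + SSE state
+-->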
+<intrinsic tech="Other" name="_xsave64">
+ <CPUID>XSAVE</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="save_mask" etype="UI64"/>
+ <description>Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>mask[62:0] := save_mask[62:0] AND XCR0[62:0]
+FOR i := 0 to 62
+ IF mask[i]
+ CASE (i) OF
+ 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU]
+ 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
+ DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+ ESAC
+ mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XSAVE64" form="m8" xed="XSAVE64_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xsaveopt">
+ <CPUID>XSAVE</CPUID>
+ <CPUID>XSAVEOPT</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="save_mask" etype="UI64"/>
+ <description>Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE instruction.</description>
+ <operation>mask[62:0] := save_mask[62:0] AND XCR0[62:0]
+FOR i := 0 to 62
+ IF mask[i]
+ CASE (i) OF
+ 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU]
+ 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
+ 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM]
+ DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+ ESAC
+ mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XSAVEOPT" form="m8" xed="XSAVEOPT_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xsaveopt64">
+ <CPUID>XSAVE</CPUID>
+ <CPUID>XSAVEOPT</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="save_mask" etype="UI64"/>
+ <description>Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE64 instruction.</description>
+ <operation>mask[62:0] := save_mask[62:0] AND XCR0[62:0]
+FOR i := 0 to 62
+ IF mask[i]
+ CASE (i) OF
+ 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU]
+ 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
+ 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM]
+ DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+ ESAC
+ mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XSAVEOPT64" form="m8" xed="XSAVEOPT64_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xsetbv">
+ <CPUID>XSAVE</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="unsigned int" varname="a" etype="UI32"/>
+ <parameter type="unsigned __int64" varname="val" etype="UI64"/>
+	<description>Copy 64 bits from "val" to the extended control register (XCR) specified by "a". Currently only XFEATURE_ENABLED_MASK XCR is supported.</description>
+ <operation>
+XCR[a] := val[63:0]
+ </operation>
+ <instruction name="XSETBV" xed="XSETBV"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xsavec">
+ <CPUID>XSAVE</CPUID>
+ <CPUID>XSAVEC</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="save_mask" etype="UI64"/>
+ <description>Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>mask[62:0] := save_mask[62:0] AND XCR0[62:0]
+FOR i := 0 to 62
+ IF mask[i]
+ CASE (i) OF
+ 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU]
+ 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
+ DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+ ESAC
+ mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XSAVEC" form="m8" xed="XSAVEC_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xsaves">
+ <CPUID>XSAVE</CPUID>
+ <CPUID>XSS</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="save_mask" etype="UI64"/>
+ <description>Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>mask[62:0] := save_mask[62:0] AND XCR0[62:0]
+FOR i := 0 to 62
+ IF mask[i]
+ CASE (i) OF
+ 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU]
+ 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
+ DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+ ESAC
+ mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XSAVES" form="m8" xed="XSAVES_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xsavec64">
+ <CPUID>XSAVE</CPUID>
+ <CPUID>XSAVEC</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="save_mask" etype="UI64"/>
+ <description>Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>mask[62:0] := save_mask[62:0] AND XCR0[62:0]
+FOR i := 0 to 62
+ IF mask[i]
+ CASE (i) OF
+ 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU]
+ 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
+ DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+ ESAC
+ mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XSAVEC64" form="m8" xed="XSAVEC64_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xsaves64">
+ <CPUID>XSAVE</CPUID>
+ <CPUID>XSS</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="save_mask" etype="UI64"/>
+ <description>Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>mask[62:0] := save_mask[62:0] AND XCR0[62:0]
+FOR i := 0 to 62
+ IF mask[i]
+ CASE (i) OF
+ 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU]
+ 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
+ DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+ ESAC
+ mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+	<instruction name="XSAVES64" form="m8" xed="XSAVES64_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xrstors">
+ <CPUID>XSAVE</CPUID>
+ <CPUID>XSS</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="const void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="rs_mask" etype="UI64"/>
+ <description>Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>st_mask := mem_addr.HEADER.XSTATE_BV[62:0]
+FOR i := 0 to 62
+ IF (rs_mask[i] AND XCR0[i])
+ IF st_mask[i]
+ CASE (i) OF
+ 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU]
+ 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE]
+ DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
+ ESAC
+ ELSE
+ // ProcessorExtendedState := Processor Supplied Values
+ CASE (i) OF
+ 1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
+ ESAC
+ FI
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XRSTORS" form="m8" xed="XRSTORS_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" name="_xrstors64">
+ <CPUID>XSAVE</CPUID>
+ <CPUID>XSS</CPUID>
+ <category>OS-Targeted</category>
+ <return type="void"/>
+ <parameter type="const void *" varname="mem_addr"/>
+ <parameter type="unsigned __int64" varname="rs_mask" etype="UI64"/>
+ <description>Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary.</description>
+ <operation>st_mask := mem_addr.HEADER.XSTATE_BV[62:0]
+FOR i := 0 to 62
+ IF (rs_mask[i] AND XCR0[i])
+ IF st_mask[i]
+ CASE (i) OF
+ 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU]
+ 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE]
+ DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
+ ESAC
+ ELSE
+ // ProcessorExtendedState := Processor Supplied Values
+ CASE (i) OF
+ 1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
+ ESAC
+ FI
+ FI
+ i := i + 1
+ENDFOR
+ </operation>
+ <instruction name="XRSTORS64" form="m8" xed="XRSTORS64_MEMmxsave"/>
+ <header>immintrin.h</header>
+</intrinsic>
+</intrinsics_list> \ No newline at end of file
diff --git a/library/stdarch/examples/Cargo.toml b/library/stdarch/examples/Cargo.toml
new file mode 100644
index 000000000..e2590ed9f
--- /dev/null
+++ b/library/stdarch/examples/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "stdarch_examples"
+version = "0.0.0"
+authors = [
+ "Alex Crichton <alex@alexcrichton.com>",
+ "Andrew Gallant <jamslam@gmail.com>",
+ "Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>",
+]
+description = "Examples of the stdarch crate."
+edition = "2018"
+default-run = "hex"
+
+[dependencies]
+core_arch = { path = "../crates/core_arch" }
+std_detect = { path = "../crates/std_detect" }
+quickcheck = "0.9"
+rand = "0.7"
+
+[[bin]]
+name = "hex"
+path = "hex.rs"
+
+[[bin]]
+name = "connect5"
+path = "connect5.rs"
+
+[[example]]
+name = "wasm"
+crate-type = ["cdylib"]
+path = "wasm.rs"
diff --git a/library/stdarch/examples/connect5.rs b/library/stdarch/examples/connect5.rs
new file mode 100644
index 000000000..1b3325785
--- /dev/null
+++ b/library/stdarch/examples/connect5.rs
@@ -0,0 +1,1272 @@
+//! <b>Outer-Open Gomoku</b> is a board game which is an enhanced version of connect5 (Gomoku).\
+//! It is a two-player game played on a 15x15 Go board.\
+//! Two players take turns placing a move on an empty intersection of the board.\
+//! The winner is the first player to form an unbroken chain of five moves horizontally, vertically, or diagonally.\
+//! Unlike Gomoku, the first move is required to be placed in the two outer rows or columns of the board.\
+//! This program provides an AI that plays using Minimax search with alpha-beta pruning and
+//! pattern-based evaluation.\
+//! The avx512 intrinsics can do 32 pattern matches at one time.\
+//! The avx512 code is tested against the non-avx512 code to verify its correctness.\
+//!
+//! On an Intel i7-7800x using a single thread with the AVX-512 clock fixed at 4.0GHz, the avx512 code is about 9x faster.\
+//! The average time for each move is around 14.00s <span>&#177;</span> 1.31s with avx512 and
+//! 129.02s <span>&#177;</span> 4.96s without.\
+//! On an Intel Tiger Lake i7-1165G7, the avx512 version takes around 11.11s <span>&#177;</span> 1.31s per move.
+//!
+//! <b>Pattern Matching</b>\
+//! Use 512 bits to represent the board state. Location 0 is the top left.\
+//! 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 <b>15</b>\
+//! 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 <b>31</b>\
+//! ...\
+//! Pattern "OOOOO" is matched through "0 1 2 3 4", "1 2 3 4 5", ...\
+//! Using avx512, "0 1 2 3 4", "16 17 18 19 20", ... can be matched simultaneously.
+//!
+//! You can test out this program via:
+//!
+//!     cargo +nightly run --release --bin connect5
+//!
+//! You should see a game self-playing. At the end of the game, it shows the average time for
+//! each move.
+
+#![feature(stdsimd, avx512_target_feature)]
+#![feature(stmt_expr_attributes)]
+
+use rand::seq::SliceRandom;
+use rand::thread_rng;
+
+use std::cmp;
+use std::time::Instant;
+
+#[cfg(target_arch = "x86")]
+use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected};
+#[cfg(target_arch = "x86_64")]
+use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected};
+
+// types
+
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum Color {
+ Black = 0,
+ White = 1,
+ Empty = 2,
+ Border = 3,
+}
+
+type Square = i32;
+type Move = i32;
+type Side = Color;
+
+// constants
+
+const FILE_SIZE: i32 = 15;
+const RANK_SIZE: i32 = 15;
+const SQUARE_SIZE: i32 = (FILE_SIZE + 1) * (FILE_SIZE + 4) + 16 + 4;
+
+const EVAL_INF: i32 = FILE_SIZE * RANK_SIZE * 100;
+const MOVE_NONE: Move = -1;
+const SCORE_NONE: i32 = -EVAL_INF - 1;
+
+/// DIRECTION 0: left to right\
+/// DIRECTION 1: top to bottom\
+/// DIRECTION 2: top left to bottom right\
+/// DIRECTION 3: top right to bottom left
+#[rustfmt::skip]
+const DIRECTION: [[i32; 5]; 4] = [ [1, 2, 3, 4, 5],
+ [1 * (FILE_SIZE + 1), 2 * (FILE_SIZE + 1), 3 * (FILE_SIZE + 1), 4 * (FILE_SIZE + 1), 5 * (FILE_SIZE + 1)],
+ [1 * (FILE_SIZE + 2), 2 * (FILE_SIZE + 2), 3 * (FILE_SIZE + 2), 4 * (FILE_SIZE + 2), 5 * (FILE_SIZE + 2)],
+ [1 * (FILE_SIZE + 0), 2 * (FILE_SIZE + 0), 3 * (FILE_SIZE + 0), 4 * (FILE_SIZE + 0), 5 * (FILE_SIZE + 0)]];
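+// For example, one step "down" (direction 1) is FILE_SIZE + 1 = 16 squares,
+// because each rank carries one trailing Border square (positions 15, 31, ...).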
+
+/// A table to encode each location to a value in bits 31-0 of the bitboard for the 4 directions
+#[rustfmt::skip]
+const MAPMOVEVALUE: [[i32; 239]; 4] = [ [// Direction 0
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0,
+ 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17],
+ [// Direction 1
+ 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0,
+ 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0,
+ 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0,
+ 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0,
+ 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 0,
+ 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 0,
+ 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 0,
+ 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 0,
+ 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 0,
+ 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 0,
+ 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 0,
+ 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 0,
+ 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 0,
+ 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 0,
+ 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17],
+ [// Direction 2
+ 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 0, 0, 0, 0, 0,
+ 1<<15, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 0, 0, 0, 0,
+ 1<<15, 1<<14, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 0, 0, 0,
+ 1<<15, 1<<14, 1<<13, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 0, 0,
+ 1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 0,
+ 1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 0,
+ 1<<9, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 0,
+ 1<<8, 1<<8, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 0,
+ 1<<7, 1<<7, 1<<7, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 0,
+ 1<<6, 1<<6, 1<<6, 1<<6, 1<<11, 1<<10, 1<<9, 1<<8, 1<<7, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 0,
+ 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 1<<10, 1<<9, 1<<8, 1<<7, 1<<6, 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 0,
+ 0, 1<<4, 1<<4, 1<<4, 1<<4, 1<<4, 1<<9, 1<<8, 1<<7, 1<<6, 1<<5, 1<<4, 1<<4, 1<<4, 1<<4, 0,
+ 0, 0, 1<<3, 1<<3, 1<<3, 1<<3, 1<<3, 1<<8, 1<<7, 1<<6, 1<<5, 1<<4, 1<<3, 1<<3, 1<<3, 0,
+ 0, 0, 0, 1<<2, 1<<2, 1<<2, 1<<2, 1<<2, 1<<7, 1<<6, 1<<5, 1<<4, 1<<3, 1<<2, 1<<2, 0,
+ 0, 0, 0, 0, 1<<1, 1<<1, 1<<1, 1<<1, 1<<1, 1<<6, 1<<5, 1<<4, 1<<3, 1<<2, 1<<1],
+ [// Direction 3
+ 0, 0, 0, 0, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 0,
+ 0, 0, 0, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<15, 0,
+ 0, 0, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<14, 1<<15, 0,
+ 0, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<13, 1<<14, 1<<15, 0,
+ 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15, 0,
+ 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15, 0,
+ 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<9, 0,
+ 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<8, 1<<8, 0,
+ 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<7, 1<<7, 1<<7, 0,
+ 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<7, 1<<8, 1<<9, 1<<10, 1<<11, 1<<6, 1<<6, 1<<6, 1<<6, 0,
+ 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 1<<6, 1<<7, 1<<8, 1<<9, 1<<10, 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 0,
+ 1<<4, 1<<4, 1<<4, 1<<4, 1<<5, 1<<6, 1<<7, 1<<8, 1<<9, 1<<4, 1<<4, 1<<4, 1<<4, 1<<4, 0, 0,
+ 1<<3, 1<<3, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1<<8, 1<<3, 1<<3, 1<<3, 1<<3, 1<<3, 0, 0, 0,
+ 1<<2, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1<<2, 1<<2, 1<<2, 1<<2, 1<<2, 0, 0, 0, 0,
+ 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<1, 1<<1, 1<<1, 1<<1, 1<<1, 0, 0, 0, 0]
+ ];
+
+/// A table to encode each location to an index in the bitboard for the 4 directions
+#[rustfmt::skip]
+const MAPMOVEIDX: [[i32; 239]; 4] = [ [// Direction 0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 0,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 0,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14],
+ [// Direction 1
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
+ [// Direction 2
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0,
+ 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0,
+ 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0,
+ 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0,
+ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 0,
+ 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 0,
+ 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 0,
+ 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 0,
+ 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 0,
+ 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 0,
+ 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 0,
+ 0, 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 0,
+ 0, 0, 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10],
+ [// Direction 3
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0,
+ 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0,
+ 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0,
+ 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 0,
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 0,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 0,
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 0,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0,
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0,
+ 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0,
+ 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0, 0,
+ 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0, 0]
+ ];
+
+// structures
+
+/// Use a one-dimensional array to store the board state. The location 0 is top left.\
+/// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 <b>15</b>\
+/// 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 <b>31</b>\
+/// ... \
+/// Positions 15, 31, ... are Borders.\
+/// Position 0 is file 0, rank 0.\
+/// Position 17 is file 1, rank 1.\
+///
+/// Use a three-dimensional array to store the bitboard.\
+/// The first dimension is the color: Black, White and Empty.\
+/// The second and third dimensions form 2 x 512 bits. Directions 0 and 2 use the first 512 bits; directions 1 and
+/// 3 use the second 512 bits.\
+/// Each 512-bit half is a 32-bit x 16 array. Directions 0 and 1 are stored at bits 31-16 and directions 2 and 3 at bits 15-0.
+
+pub struct Pos {
+ // position
+ state: [Color; SQUARE_SIZE as usize],
+ p_turn: Side,
+ bitboard: [[[i32; 16]; 2]; 3],
+}
+
+impl Pos {
+ pub fn init(&mut self) {
+ // starting position
+ // Set up the Border
+ for i in 0..SQUARE_SIZE as usize {
+ self.state[i] = Color::Border;
+ }
+
+ // In the beginning, all is Empty
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq: Square = square_make(fl, rk);
+ self.state[sq as usize] = Color::Empty;
+ }
+ }
+
+ // first move is Black
+ self.p_turn = Color::Black;
+
+ let black = Color::Black as usize;
+ let white = Color::White as usize;
+ let empty = Color::Empty as usize;
+
+ // set up the corresponding bitboard
+ for i in 0..2 {
+ for j in 0..16 {
+ self.bitboard[black][i][j] = 0;
+ self.bitboard[white][i][j] = 0;
+ self.bitboard[empty][i][j] = 0;
+ }
+ }
+
+ for i in 0..2 {
+            // use bits 31-16 to store directions 0 and 1
+ #[rustfmt::skip]
+ for j in 0..FILE_SIZE as usize {
+ self.bitboard[empty][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17);
+ }
+ }
+
+        // use bits 15-0 to store directions 2 and 3. There are 21 diagonals for each one, so we combine row 1 with row 16, row 2 with row 17, row 3 with row 18, row 4 with row 19, and row 5 with row 20
+ #[rustfmt::skip]
+ for i in 0..2 {
+ self.bitboard[empty][i][0] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11); //row 0
+ self.bitboard[empty][i][1] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)/*row1*/|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row16
+ self.bitboard[empty][i][2] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)/*row2*/|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row17
+ self.bitboard[empty][i][3] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)/*row3*/|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row18
+ self.bitboard[empty][i][4] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)/*row4*/|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row19
+ self.bitboard[empty][i][5] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)/*row5*/|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row20
+ self.bitboard[empty][i][6] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5);//row6
+ self.bitboard[empty][i][7] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4);//row7
+ self.bitboard[empty][i][8] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3);//row8
+ self.bitboard[empty][i][9] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2);//row9
+ self.bitboard[empty][i][10] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row10
+ self.bitboard[empty][i][11] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2);//row11
+ self.bitboard[empty][i][12] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3);//row12
+ self.bitboard[empty][i][13] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4);//row13
+ self.bitboard[empty][i][14] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5);//row14
+ self.bitboard[empty][i][15] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6);//row15
+ }
+ }
+
+ pub fn do_move(&mut self, mv: Move) {
+ let atk: Side = self.p_turn;
+ let def: Side = side_opp(atk);
+
+ let mv = mv as usize;
+ let black = Color::Black as usize;
+ let white = Color::White as usize;
+ let empty = Color::Empty as usize;
+
+ match self.p_turn {
+ Color::Black => {
+                self.state[mv] = Color::Black;
+ // update black move and remove empty move in bitboard
+ self.bitboard[black][0][MAPMOVEIDX[0][mv] as usize] |= MAPMOVEVALUE[0][mv];
+ self.bitboard[empty][0][MAPMOVEIDX[0][mv] as usize] ^= MAPMOVEVALUE[0][mv];
+ self.bitboard[black][1][MAPMOVEIDX[1][mv] as usize] |= MAPMOVEVALUE[1][mv];
+ self.bitboard[empty][1][MAPMOVEIDX[1][mv] as usize] ^= MAPMOVEVALUE[1][mv];
+ self.bitboard[black][0][MAPMOVEIDX[2][mv] as usize] |= MAPMOVEVALUE[2][mv];
+ self.bitboard[empty][0][MAPMOVEIDX[2][mv] as usize] ^= MAPMOVEVALUE[2][mv];
+ self.bitboard[black][1][MAPMOVEIDX[3][mv] as usize] |= MAPMOVEVALUE[3][mv];
+ self.bitboard[empty][1][MAPMOVEIDX[3][mv] as usize] ^= MAPMOVEVALUE[3][mv];
+ }
+ Color::White => {
+                self.state[mv] = Color::White;
+ // update white move and remove empty move in bitboard
+ self.bitboard[white][0][MAPMOVEIDX[0][mv] as usize] |= MAPMOVEVALUE[0][mv];
+ self.bitboard[empty][0][MAPMOVEIDX[0][mv] as usize] ^= MAPMOVEVALUE[0][mv];
+ self.bitboard[white][1][MAPMOVEIDX[1][mv] as usize] |= MAPMOVEVALUE[1][mv];
+ self.bitboard[empty][1][MAPMOVEIDX[1][mv] as usize] ^= MAPMOVEVALUE[1][mv];
+ self.bitboard[white][0][MAPMOVEIDX[2][mv] as usize] |= MAPMOVEVALUE[2][mv];
+ self.bitboard[empty][0][MAPMOVEIDX[2][mv] as usize] ^= MAPMOVEVALUE[2][mv];
+ self.bitboard[white][1][MAPMOVEIDX[3][mv] as usize] |= MAPMOVEVALUE[3][mv];
+ self.bitboard[empty][1][MAPMOVEIDX[3][mv] as usize] ^= MAPMOVEVALUE[3][mv];
+ }
+            _ => panic!(),
+ }
+
+ self.p_turn = def;
+ }
+
+ fn turn(&self) -> Side {
+ self.p_turn
+ }
+
+    pub fn can_play(&self, from: Square) -> bool {
+        self.state[from as usize] == Color::Empty
+    }
+}
+
+pub struct List {
+ // legal move list
+ p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize],
+ p_size: i32,
+}
+
+/// Use List to store legal moves.
+impl List {
+ pub fn clear(&mut self) {
+ self.p_size = 0;
+ }
+
+ pub fn add(&mut self, mv: Move) {
+ self.p_move[self.p_size as usize] = mv;
+ self.p_size += 1;
+ }
+
+ pub fn size(&self) -> i32 {
+ self.p_size
+ }
+
+ pub fn shuffle(&mut self) {
+ let mut rng = thread_rng();
+ let num = self.p_size;
+ let mut new_move: Vec<Move> = vec![];
+
+ for x in 0..(num as usize) {
+ new_move.push(self.p_move[x]);
+ }
+
+ new_move.shuffle(&mut rng);
+
+ for x in 0..(self.p_size as usize) {
+ self.p_move[x] = new_move[x];
+ }
+ }
+}
+
+// functions
+
+fn square_make(fl: i32, rk: i32) -> Square {
+ rk * (FILE_SIZE + 1) + fl
+}
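+
+// A small sanity sketch (not part of the original program) checking the index
+// math documented above: file 0, rank 0 is square 0, and file 1, rank 1 is
+// square 1 * (FILE_SIZE + 1) + 1 = 17.
+#[cfg(test)]
+mod square_make_sketch {
+    use super::*;
+
+    #[test]
+    fn doc_examples_hold() {
+        assert_eq!(square_make(0, 0), 0);
+        assert_eq!(square_make(1, 1), 17);
+        assert_eq!(square_make(0, 1), 16); // first square of rank 1
+    }
+}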
+
+fn side_opp(sd: Side) -> Side {
+ match sd {
+ Side::White => Side::Black,
+ Side::Black => Side::White,
+ _ => panic!(""),
+ }
+}
+
+fn pos_is_winner(pos: &Pos) -> bool {
+ let current_side = side_opp(pos.p_turn);
+ check_pattern5(&pos, current_side)
+}
+
+fn pos_is_draw(pos: &Pos) -> bool {
+    let mut found: bool = true;
+
+    'outer: for rk in 0..RANK_SIZE {
+        for fl in 0..FILE_SIZE {
+            let sq: Square = square_make(fl, rk);
+            if pos.can_play(sq) {
+                found = false;
+                break 'outer;
+            }
+        }
+    }
+
+    found && !pos_is_winner(pos)
+}
+
+#[target_feature(enable = "avx512f,avx512bw")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn pos_is_draw_avx512(pos: &Pos) -> bool {
+    let empty = Color::Empty as usize;
+
+    let board0org = _mm512_loadu_epi32(&pos.bitboard[empty][0][0]);
+
+    let answer = _mm512_set1_epi32(0);
+
+    // if all Empty bits are 0, the whole board is filled.
+    let temp_mask = _mm512_mask_cmpneq_epi32_mask(0b11111111_11111111, answer, board0org);
+
+    _popcnt32(temp_mask as i32) == 0 && !pos_is_winner_avx512(pos)
+}
+
+fn pos_is_end(pos: &Pos) -> bool {
+    pos_is_winner(pos) || pos_is_draw(pos)
+}
+
+fn pos_disp(pos: &Pos) {
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq: Square = square_make(fl, rk);
+
+ match pos.state[sq as usize] {
+ Color::Black => print!("# "),
+ Color::White => print!("O "),
+ Color::Empty => print!("- "),
+ Color::Border => print!("| "),
+ }
+ }
+
+        println!();
+ }
+
+ match pos.turn() {
+ Color::Black => println!("black to play"),
+ Color::White => println!("white to play"),
+ _ => panic!(),
+ }
+}
+
+fn gen_moves(list: &mut List, pos: &Pos) {
+ list.clear();
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq: Square = square_make(fl, rk);
+ if pos.can_play(sq) {
+ list.add(sq);
+ }
+ }
+ }
+}
+
+/// AI: use Minimax search with alpha-beta pruning
+fn search(pos: &Pos, alpha: i32, beta: i32, depth: i32, _ply: i32) -> i32 {
+ assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF);
+ // leaf?
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx512bw") {
+ unsafe {
+ if pos_is_winner_avx512(&pos) {
+ return -EVAL_INF + _ply;
+ }
+
+ if pos_is_draw_avx512(&pos) {
+ return 0;
+ }
+ }
+ } else {
+ if pos_is_winner(&pos) {
+ return -EVAL_INF + _ply;
+ }
+
+ if pos_is_draw(&pos) {
+ return 0;
+ }
+ }
+ }
+
+ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+ {
+ if pos_is_winner(&pos) {
+ return -EVAL_INF + _ply;
+ }
+
+ if pos_is_draw(&pos) {
+ return 0;
+ }
+ }
+
+ if depth == 0 {
+ return eval(&pos, _ply);
+ }
+
+ let p_move_new: [Move; (FILE_SIZE * RANK_SIZE) as usize] =
+ [0; (FILE_SIZE * RANK_SIZE) as usize];
+
+ let mut list = List {
+ p_move: p_move_new,
+ p_size: 0,
+ };
+
+ let mut bm: Move = MOVE_NONE;
+ let mut bs: i32 = SCORE_NONE;
+
+ gen_moves(&mut list, &pos);
+
+ // move loop
+
+ if _ply == 0 {
+ list.shuffle();
+ }
+
+ for i in 0..list.size() {
+ if bs < beta {
+ let mv: Move = list.p_move[i as usize];
+
+ let mut new_pos = Pos {
+ state: pos.state,
+ p_turn: pos.p_turn,
+ bitboard: pos.bitboard,
+ };
+
+ new_pos.do_move(mv);
+
+ let sc: i32 = -search(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, _ply + 1);
+
+ if sc > bs {
+ bm = mv;
+ bs = sc;
+ }
+ }
+ }
+
+ assert!(bm != MOVE_NONE);
+ assert!(bs >= -EVAL_INF && bs <= EVAL_INF);
+
+ if _ply == 0 {
+ bm
+ } else {
+ bs
+    } // best move at the root node, best score elsewhere
+}
+
+/// Evaluation function: give different scores to different patterns after a fixed depth.
+fn eval(pos: &Pos, _ply: i32) -> i32 {
+ let atk: Side = pos.turn();
+ let def: Side = side_opp(atk);
+
+    // check if opp has a live4 which will win on the next move
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx512bw") {
+ unsafe {
+ if check_patternlive4_avx512(&pos, def) {
+ return -4096;
+ }
+ }
+ } else {
+ if check_patternlive4(&pos, def) {
+ return -4096;
+ }
+ }
+ }
+
+ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+ {
+ if check_patternlive4(&pos, def) {
+ return -4096;
+ }
+ }
+
+    // check if self has a live4 which will win on the next move
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx512bw") {
+ unsafe {
+ if check_patternlive4_avx512(&pos, atk) {
+ return 2560;
+ }
+ }
+ } else {
+ if check_patternlive4(&pos, atk) {
+ return 2560;
+ }
+ }
+ }
+
+ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+ {
+ if check_patternlive4(&pos, atk) {
+ return 2560;
+ }
+ }
+
+    // check if self has a dead4 which will win on the next move
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx512bw") {
+ unsafe {
+ if check_patterndead4_avx512(&pos, atk) > 0 {
+ return 2560;
+ }
+ }
+ } else {
+ if check_patterndead4(&pos, atk) > 0 {
+ return 2560;
+ }
+ }
+ }
+
+ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+ {
+ if check_patterndead4(&pos, atk) > 0 {
+ return 2560;
+ }
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx512bw") {
+ unsafe {
+ let n_c4: i32 = check_patterndead4_avx512(&pos, def);
+ let n_c3: i32 = check_patternlive3_avx512(&pos, def);
+
+                // check if opp has 2 dead4 which will win on the next move
+ if n_c4 > 1 {
+ return -2048;
+ }
+
+                // check if opp has a dead4 and a live3 which will win in the next two moves
+ if n_c4 == 1 && n_c3 > 0 {
+ return -2048;
+ }
+
+ if check_patternlive3_avx512(&pos, atk) > 1 {
+ return 2560;
+ }
+
+                // check if opp has 2 live3 which will win in the next two moves
+ if n_c3 > 1 {
+ return -2048;
+ }
+ }
+ } else {
+ let n_c4: i32 = check_patterndead4(&pos, def);
+ let n_c3: i32 = check_patternlive3(&pos, def);
+
+            // check if opp has 2 dead4 which will win on the next move
+ if n_c4 > 1 {
+ return -2048;
+ }
+
+            // check if opp has a dead4 and a live3 which will win in the next two moves
+ if n_c4 == 1 && n_c3 > 0 {
+ return -2048;
+ }
+
+            // check if self has 2 live3 which will win in the next two moves
+ if check_patternlive3(&pos, atk) > 1 {
+ return 2560;
+ }
+
+            // check if opp has 2 live3 which will win in the next two moves
+ if n_c3 > 1 {
+ return -2048;
+ }
+ }
+ }
+
+ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+ {
+ let n_c4: i32 = check_patterndead4(&pos, def);
+ let n_c3: i32 = check_patternlive3(&pos, def);
+
+        // check if opp has 2 dead4 which will win on the next move
+ if n_c4 > 1 {
+ return -2048;
+ }
+
+        // check if opp has a dead4 and a live3 which will win in the next two moves
+ if n_c4 == 1 && n_c3 > 0 {
+ return -2048;
+ }
+
+        // check if self has 2 live3 which will win in the next two moves
+ if check_patternlive3(&pos, atk) > 1 {
+ return 2560;
+ }
+
+        // check if opp has 2 live3 which will win in the next two moves
+ if n_c3 > 1 {
+ return -2048;
+ }
+ }
+
+ 0
+}
+
+/// Check <b>OOOOO</b>
+fn check_pattern5(pos: &Pos, sd: Side) -> bool {
+ let mut n: i32 = 0;
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq: Square = square_make(fl, rk);
+
+ for pat in 0..4 {
+ let idx0 = sq;
+ let idx1 = sq + DIRECTION[pat][0];
+ let idx2 = sq + DIRECTION[pat][1];
+ let idx3 = sq + DIRECTION[pat][2];
+ let idx4 = sq + DIRECTION[pat][3];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ #[rustfmt::skip]
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ }
+ }
+ }
+
+    n > 0
+}
+
+/// Check <b>-OOOO-</b>
+fn check_patternlive4(pos: &Pos, sd: Side) -> bool {
+ let mut n: i32 = 0;
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq: Square = square_make(fl, rk);
+
+ for pat in 0..4 {
+ let idx0 = sq;
+ let idx1 = sq + DIRECTION[pat][0];
+ let idx2 = sq + DIRECTION[pat][1];
+ let idx3 = sq + DIRECTION[pat][2];
+ let idx4 = sq + DIRECTION[pat][3];
+ let idx5 = sq + DIRECTION[pat][4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ #[rustfmt::skip]
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
+ }
+ }
+ }
+
+    n > 0
+}
+
+/// Check <b>OOOO-, OOO-O, OO-OO, O-OOO, -OOOO</b>
+fn check_patterndead4(pos: &Pos, sd: Side) -> i32 {
+ let mut n: i32 = 0;
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq: Square = square_make(fl, rk);
+
+ for dir in 0..4 {
+ let idx0 = sq;
+ let idx1 = sq + DIRECTION[dir][0];
+ let idx2 = sq + DIRECTION[dir][1];
+ let idx3 = sq + DIRECTION[dir][2];
+ let idx4 = sq + DIRECTION[dir][3];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ #[rustfmt::skip]
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ #[rustfmt::skip]
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
+ #[rustfmt::skip]
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
+ #[rustfmt::skip]
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ #[rustfmt::skip]
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ }
+ }
+ }
+
+ n
+}
+
+/// Check <b>-OOO-, -OO-O-, -O-OO-</b>
+fn check_patternlive3(pos: &Pos, sd: Side) -> i32 {
+ let mut n: i32 = 0;
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq: Square = square_make(fl, rk);
+
+ for dir in 0..4 {
+ let idx0 = sq;
+ let idx1 = sq + DIRECTION[dir][0];
+ let idx2 = sq + DIRECTION[dir][1];
+ let idx3 = sq + DIRECTION[dir][2];
+ let idx4 = sq + DIRECTION[dir][3];
+ let idx5 = sq + DIRECTION[dir][4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ #[rustfmt::skip]
+                if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ #[rustfmt::skip]
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
+ #[rustfmt::skip]
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
+ }
+ }
+ }
+
+ n
+}
+
+#[target_feature(enable = "avx512f,avx512bw")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn pos_is_winner_avx512(pos: &Pos) -> bool {
+ let current_side = side_opp(pos.p_turn);
+ let coloridx = current_side as usize;
+
+ let board0org: [__m512i; 2] = [
+ _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]),
+ _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]),
+ ]; // load states from bitboard
+
+ #[rustfmt::skip]
+    let answer = _mm512_set1_epi16((1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)); // an unbroken chain of five stones
+
+    // Use a mask to filter out the lanes that hold no data to process.
+ // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
+ // 1 x x x x _ _ _ _ _ _ _ _ _ _ _ 0 x o x o x 0 0 0 0 0 0 0 0 0 0 0
+ // 2 x _ _ _ _ o _ x o _ _ _ _ _ _ 0 x o _ _ _ _ _| x x o o o x x _ _
+ // . ...
+ // . ...
+ // . ...
+ // 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 x o x o o o o o o o 0 0 0 0 0 0
+ //
+    // answer_mask[0]: 01_11..............: the "0" cells are in row 16, columns 1-16;
+    // there is no data there to match (x = black, o = white, _ = empty, 0 = no data).
+ //
+ //
+ // Then, shift one space left.
+ // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
+ // 1 x x x _ _ _ _ _ _ _ _ _ _ _ 0 x o x o x 0 0 0 0 0 0 0 0 0 0 0 0
+ // . ...
+ // . ...
+ // . ...
+ // 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 x o x o o o o o o o 0 0 0 0 0 0 0
+    // answer_mask[1]: ................_10: the "0" cells are in row 1, columns 17-32;
+    // there is not enough data left to match (we see o x o x but want o o o o o).
+
+    // answer_mask[2]: two regions are mixed in one lane (columns 17-23 and 24-32);
+    // the mask keeps only windows that stay inside a single region. For example,
+    // columns 23,24,25,26,27 straddle the boundary and are not a pattern, while
+    // columns 24,25,26,27,28 are. That is why some mask bits are set to 0 from
+    // answer_mask[2] to answer_mask[10].
+
+ #[rustfmt::skip]
+ let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10,
+ 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10,
+ 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10,
+ 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10,
+ 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10,
+ 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10];
+ let mut count_match: i32 = 0;
+
+ for dir in 0..2 {
+ // direction 0 and 1
+ let mut board0 = board0org[dir];
+ let boardf = _mm512_and_si512(answer, board0);
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+
+ for i in 1..11 {
+            // OOOOOOOOOOO----: the last four "-" positions cannot start an unbroken chain of five.
+ board0 = _mm512_slli_epi32(board0, 1); // shift one space left
+ let boardf = _mm512_and_si512(answer, board0); // focus on the pattern
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); // see if it matches the pattern
+ count_match += _popcnt32(temp_mask as i32);
+ }
+ }
+
+    count_match > 0
+}
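+
+// A hedged, test-only scalar sketch of the matching trick above: AND a
+// 16-bit row fragment with the five-bit template, compare the result
+// against the template, then shift left one bit to re-test the next
+// alignment. This is the answer/answer_mask/_mm512_slli_epi32 loop
+// applied to a single lane. The sample lanes below are hypothetical.
+#[test]
+fn five_in_a_row_lane_sketch() {
+    fn lane_has_five(mut lane: u16) -> bool {
+        let answer: u16 = 0b1111_1000_0000_0000; // bits 15..11, as in `answer`
+        for _ in 0..11 {
+            if lane & answer == answer {
+                return true; // five consecutive stones found
+            }
+            lane <<= 1; // shift one space left
+        }
+        false
+    }
+    assert!(lane_has_five(0b0000_0111_1100_0000)); // bits 10..6 form a five-run
+    assert!(!lane_has_five(0b0000_0111_0110_0000)); // a broken run never matches
+}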
+
+#[target_feature(enable = "avx512f,avx512bw")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn check_patternlive4_avx512(pos: &Pos, sd: Side) -> bool {
+ let coloridx = sd as usize;
+ let emptyidx = Color::Empty as usize;
+
+ #[rustfmt::skip]
+ let answer_color = _mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12)|(1<<11) );
+ #[rustfmt::skip]
+ let answer_empty = _mm512_set1_epi16( (1<<15)| (1<<10) );
+ #[rustfmt::skip]
+ let answer = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10) );
+
+ #[rustfmt::skip]
+ let answer_mask: [__mmask32; 10] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10,
+ 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_10_10,
+ 0b00_10_11_11_11_11_11_11_11_10_10_10_10_10_11_10,
+ 0b00_10_10_11_11_11_11_11_10_10_10_10_10_11_11_10,
+ 0b00_10_10_10_11_11_11_10_10_10_10_10_11_11_11_10,
+ 0b00_10_10_10_10_11_10_10_10_10_10_11_11_11_11_10];
+ let board0org: [__m512i; 2] = [
+ _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]),
+ _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]),
+ ];
+ let board1org: [__m512i; 2] = [
+ _mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]),
+ _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0]),
+ ];
+
+ let mut count_match: i32 = 0;
+
+ for dir in 0..2 {
+ let mut board0 = board0org[dir];
+ let mut board1 = board1org[dir];
+
+ let boardf1 = _mm512_and_si512(answer_color, board0);
+ let boardf2 = _mm512_and_si512(answer_empty, board1);
+ let boardf = _mm512_or_si512(boardf1, boardf2);
+
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+
+ for i in 1..10 {
+ board0 = _mm512_slli_epi32(board0, 1);
+ board1 = _mm512_slli_epi32(board1, 1);
+
+ let boardf1 = _mm512_and_si512(answer_color, board0);
+ let boardf2 = _mm512_and_si512(answer_empty, board1);
+ let boardf = _mm512_or_si512(boardf1, boardf2);
+
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+ }
+ }
+
+    count_match > 0
+}
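+
+// A hedged, test-only scalar sketch of the live-four match above: stone
+// bits come from the side's lane, the two flanking gaps come from the
+// Empty lane, and the OR of the two filtered lanes must rebuild the full
+// six-bit -OOOO- template before the masked compare accepts it. The
+// lanes below are hypothetical, not a real position.
+#[test]
+fn live4_lane_sketch() {
+    let answer_color: u16 = 0b0111_1000_0000_0000; // bits 14..11: OOOO
+    let answer_empty: u16 = 0b1000_0100_0000_0000; // bits 15 and 10: both "-"
+    let answer = answer_color | answer_empty;
+    let stones: u16 = 0b0111_1000_0000_0000; // four stones in place
+    let empty: u16 = 0b1000_0100_0000_0000; // both flanking squares empty
+    let boardf = (stones & answer_color) | (empty & answer_empty);
+    assert_eq!(boardf, answer); // the combined lane reproduces -OOOO-
+}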
+
+#[target_feature(enable = "avx512f,avx512bw")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn check_patterndead4_avx512(pos: &Pos, sd: Side) -> i32 {
+ let coloridx = sd as usize;
+ let emptyidx = Color::Empty as usize;
+
+ #[rustfmt::skip]
+ let answer_color: [__m512i; 5] = [_mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12)|(1<<11) ),
+ _mm512_set1_epi16( (1<<15)| (1<<13)|(1<<12)|(1<<11) ),
+ _mm512_set1_epi16( (1<<15)|(1<<14) |(1<<12)|(1<<11) ),
+ _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13) |(1<<11) ),
+ _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12) )];
+ #[rustfmt::skip]
+    let answer_empty: [__m512i; 5] = [_mm512_set1_epi16( 1<<15 ),
+ _mm512_set1_epi16( 1<<14 ),
+ _mm512_set1_epi16( 1<<13 ),
+ _mm512_set1_epi16( 1<<12 ),
+ _mm512_set1_epi16( 1<<11)];
+ #[rustfmt::skip]
+ let answer = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11));
+
+ #[rustfmt::skip]
+ let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10,
+ 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10,
+ 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10,
+ 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10,
+ 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10,
+ 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10];
+ let board0org: [__m512i; 2] = [
+ _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]),
+ _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]),
+ ];
+ let board1org: [__m512i; 2] = [
+ _mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]),
+ _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0]),
+ ];
+
+ let mut count_match: i32 = 0;
+
+ for pattern in 0..5 {
+ for dir in 0..2 {
+ let mut board0 = board0org[dir];
+ let mut board1 = board1org[dir];
+
+ let boardf1 = _mm512_and_si512(answer_color[pattern], board0);
+ let boardf2 = _mm512_and_si512(answer_empty[pattern], board1);
+ let boardf = _mm512_or_si512(boardf1, boardf2);
+
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+
+ for i in 1..11 {
+ board0 = _mm512_slli_epi32(board0, 1);
+ board1 = _mm512_slli_epi32(board1, 1);
+
+ let boardf1 = _mm512_and_si512(answer_color[pattern], board0);
+ let boardf2 = _mm512_and_si512(answer_empty[pattern], board1);
+ let boardf = _mm512_or_si512(boardf1, boardf2);
+
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+ }
+ }
+ }
+
+ count_match
+}
+
+#[target_feature(enable = "avx512f,avx512bw")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn check_patternlive3_avx512(pos: &Pos, sd: Side) -> i32 {
+ let coloridx = sd as usize;
+ let emptyidx = Color::Empty as usize;
+
+ #[rustfmt::skip]
+ let board0org: [__m512i; 2] = [_mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0])];
+ #[rustfmt::skip]
+ let board1org: [__m512i; 2] = [_mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]), _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0])];
+
+ #[rustfmt::skip]
+ let answer_color: [__m512i; 1] = [_mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12) )];
+ #[rustfmt::skip]
+ let answer_empty: [__m512i; 1] = [_mm512_set1_epi16( (1<<15)| (1<<11) )];
+ #[rustfmt::skip]
+ let answer: __m512i = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11) );
+
+ let mut count_match: i32 = 0;
+
+ #[rustfmt::skip]
+ let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10,
+ 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10,
+ 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10,
+ 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10,
+ 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10,
+ 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10];
+ for pattern in 0..1 {
+ for dir in 0..2 {
+ let mut board0 = board0org[dir];
+ let mut board1 = board1org[dir];
+
+ let boardf1 = _mm512_and_si512(answer_color[pattern], board0);
+ let boardf2 = _mm512_and_si512(answer_empty[pattern], board1);
+ let boardf = _mm512_or_si512(boardf1, boardf2);
+
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+
+ for i in 1..11 {
+ board0 = _mm512_slli_epi32(board0, 1);
+ board1 = _mm512_slli_epi32(board1, 1);
+
+ let boardf1 = _mm512_and_si512(answer_color[pattern], board0);
+ let boardf2 = _mm512_and_si512(answer_empty[pattern], board1);
+ let boardf = _mm512_or_si512(boardf1, boardf2);
+
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+ }
+ }
+ }
+
+ #[rustfmt::skip]
+ let answer_color: [__m512i; 2] = [_mm512_set1_epi16( (1<<14)| (1<<12)|(1<<11) ),
+ _mm512_set1_epi16( (1<<14)|(1<<13) |(1<<11) )];
+ #[rustfmt::skip]
+ let answer_empty: [__m512i; 2] = [_mm512_set1_epi16( (1<<15)| (1<<13)| (1<<10) ),
+ _mm512_set1_epi16( (1<<15)| (1<<12)| (1<<10) )];
+ #[rustfmt::skip]
+ let answer: __m512i = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10) );
+
+ #[rustfmt::skip]
+ let answer_mask: [__mmask32; 10] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10,
+ 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10,
+ 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_10_10,
+ 0b00_10_11_11_11_11_11_11_11_10_10_10_10_10_11_10,
+ 0b00_10_10_11_11_11_11_11_10_10_10_10_10_11_11_10,
+ 0b00_10_10_10_11_11_11_10_10_10_10_10_11_11_11_10,
+ 0b00_10_10_10_10_11_10_10_10_10_10_11_11_11_11_10];
+ for pattern in 0..2 {
+ for dir in 0..2 {
+ let mut board0 = board0org[dir];
+ let mut board1 = board1org[dir];
+
+ let boardf1 = _mm512_and_si512(answer_color[pattern], board0);
+ let boardf2 = _mm512_and_si512(answer_empty[pattern], board1);
+ let boardf = _mm512_or_si512(boardf1, boardf2);
+
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+
+ for i in 1..10 {
+ board0 = _mm512_slli_epi32(board0, 1);
+ board1 = _mm512_slli_epi32(board1, 1);
+
+ let boardf1 = _mm512_and_si512(answer_color[pattern], board0);
+ let boardf2 = _mm512_and_si512(answer_empty[pattern], board1);
+ let boardf = _mm512_or_si512(boardf1, boardf2);
+
+ let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf);
+ count_match += _popcnt32(temp_mask as i32);
+ }
+ }
+ }
+
+ count_match
+}
+
+fn main() {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx512bw") {
+ println!("\n\nThe program is running with avx512f and avx512bw intrinsics\n\n");
+ } else {
+ println!("\n\nThe program is running with NO intrinsics.\n\n");
+ }
+ }
+
+ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+ {
+ println!("\n\nThe program is running with NO intrinsics.\n\n");
+ }
+
+ loop {
+ let start = Instant::now();
+
+ println!("Hello, this is Connect5 (Outer-Open Gomoku)!");
+ println!("Self-playing with search depth = 4");
+
+ let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize];
+ let test_bitboard: [[[i32; 16]; 2]; 3] = [[[0; 16]; 2]; 3];
+
+ let mut test1 = Pos {
+ state: test_state,
+ p_turn: Color::Black,
+ bitboard: test_bitboard,
+ };
+
+ test1.init();
+
+ let mut count: i32 = 0;
+
+ for i in 0..(FILE_SIZE * RANK_SIZE) {
+            let mut next_move: Move = square_make(1, 7); // set the first move to (1, 7)
+
+            if i > 0 {
+                next_move = search(&test1, -EVAL_INF, EVAL_INF, 4, 0); // search depth = 4
+            }
+
+ test1.do_move(next_move);
+ pos_disp(&test1);
+
+ if pos_is_end(&test1) {
+ println!("Game over!!!!!! at Move {}", i);
+ count = i + 1;
+ break;
+ }
+ }
+
+ let duration = start.elapsed();
+
+ println!(
+ "Average time for each move is: {:?}",
+ duration / count as u32
+ );
+ }
+}
diff --git a/library/stdarch/examples/hex.rs b/library/stdarch/examples/hex.rs
new file mode 100644
index 000000000..812836d66
--- /dev/null
+++ b/library/stdarch/examples/hex.rs
@@ -0,0 +1,401 @@
+//! An example showing runtime dispatch to an architecture-optimized
+//! implementation.
+//!
+//! This program hex-encodes a slice into a predetermined destination
+//! using various instruction sets, selecting the most optimized
+//! implementation available at runtime rather than requiring a separate
+//! build per target.
+//!
+//! You can test out this program via:
+//!
+//!     echo test | cargo +nightly run --release --example hex
+//!
+//! and you should see `746573740a` get printed out.
+
+#![feature(stdsimd, wasm_target_feature)]
+#![cfg_attr(test, feature(test))]
+#![allow(
+ clippy::unwrap_used,
+ clippy::print_stdout,
+ clippy::shadow_reuse,
+ clippy::cast_possible_wrap,
+ clippy::cast_ptr_alignment,
+ clippy::cast_sign_loss,
+ clippy::missing_docs_in_private_items
+)]
+
+use std::{
+ io::{self, Read},
+ str,
+};
+
+#[cfg(target_arch = "x86")]
+use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected};
+#[cfg(target_arch = "x86_64")]
+use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected};
+
+fn main() {
+ let mut input = Vec::new();
+ io::stdin().read_to_end(&mut input).unwrap();
+ let mut dst = vec![0; 2 * input.len()];
+ let s = hex_encode(&input, &mut dst).unwrap();
+ println!("{}", s);
+}
+
+fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+ let len = src.len().checked_mul(2).unwrap();
+ if dst.len() < len {
+ return Err(len);
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx2") {
+ return unsafe { hex_encode_avx2(src, dst) };
+ }
+ if is_x86_feature_detected!("sse4.1") {
+ return unsafe { hex_encode_sse41(src, dst) };
+ }
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ if true {
+ return unsafe { hex_encode_simd128(src, dst) };
+ }
+ }
+
+ hex_encode_fallback(src, dst)
+}
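+
+// A hedged usage sketch: the caller only supplies a destination at least
+// twice the source length; which SIMD path actually runs is decided
+// above at runtime.
+#[test]
+fn hex_encode_usage_sketch() {
+    let mut dst = [0u8; 4];
+    assert_eq!(hex_encode(b"\x01\xff", &mut dst), Ok("01ff"));
+
+    // An undersized destination is rejected with the required length.
+    let mut small = [0u8; 1];
+    assert_eq!(hex_encode(b"\x01", &mut small), Err(2));
+}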
+
+#[target_feature(enable = "avx2")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+ let ascii_zero = _mm256_set1_epi8(b'0' as i8);
+ let nines = _mm256_set1_epi8(9);
+ let ascii_a = _mm256_set1_epi8((b'a' - 9 - 1) as i8);
+ let and4bits = _mm256_set1_epi8(0xf);
+
+ let mut i = 0_isize;
+ while src.len() >= 32 {
+ let invec = _mm256_loadu_si256(src.as_ptr() as *const _);
+
+ let masked1 = _mm256_and_si256(invec, and4bits);
+ let masked2 = _mm256_and_si256(_mm256_srli_epi64(invec, 4), and4bits);
+
+        // produce 0xff for the elements > 9, or 0x00 otherwise
+ let cmpmask1 = _mm256_cmpgt_epi8(masked1, nines);
+ let cmpmask2 = _mm256_cmpgt_epi8(masked2, nines);
+
+ // add '0' or the offset depending on the masks
+ let masked1 = _mm256_add_epi8(masked1, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1));
+ let masked2 = _mm256_add_epi8(masked2, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2));
+
+ // interleave masked1 and masked2 bytes
+ let res1 = _mm256_unpacklo_epi8(masked2, masked1);
+ let res2 = _mm256_unpackhi_epi8(masked2, masked1);
+
+ // Store everything into the right destination now
+ let base = dst.as_mut_ptr().offset(i * 2);
+ let base1 = base.offset(0) as *mut _;
+ let base2 = base.offset(16) as *mut _;
+ let base3 = base.offset(32) as *mut _;
+ let base4 = base.offset(48) as *mut _;
+ _mm256_storeu2_m128i(base3, base1, res1);
+ _mm256_storeu2_m128i(base4, base2, res2);
+ src = &src[32..];
+ i += 32;
+ }
+
+ let i = i as usize;
+ let _ = hex_encode_sse41(src, &mut dst[i * 2..]);
+
+ Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
+}
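+
+// A hedged, test-only one-lane sketch of the branchless nibble-to-ASCII
+// select the SIMD paths perform: comparing against 9 picks between the
+// b'0' base and the (b'a' - 10) base, mirroring _mm256_blendv_epi8
+// driven by _mm256_cmpgt_epi8.
+#[test]
+fn nibble_select_sketch() {
+    fn hex1(n: u8) -> u8 {
+        let base = if n > 9 { b'a' - 9 - 1 } else { b'0' };
+        n + base
+    }
+    assert_eq!(hex1(0x0), b'0');
+    assert_eq!(hex1(0x9), b'9');
+    assert_eq!(hex1(0xa), b'a');
+    assert_eq!(hex1(0xf), b'f');
+}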
+
+// copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
+#[target_feature(enable = "sse4.1")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+ let ascii_zero = _mm_set1_epi8(b'0' as i8);
+ let nines = _mm_set1_epi8(9);
+ let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
+ let and4bits = _mm_set1_epi8(0xf);
+
+ let mut i = 0_isize;
+ while src.len() >= 16 {
+ let invec = _mm_loadu_si128(src.as_ptr() as *const _);
+
+ let masked1 = _mm_and_si128(invec, and4bits);
+ let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
+
+        // produce 0xff for the elements > 9, or 0x00 otherwise
+ let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
+ let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
+
+ // add '0' or the offset depending on the masks
+ let masked1 = _mm_add_epi8(masked1, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1));
+ let masked2 = _mm_add_epi8(masked2, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2));
+
+ // interleave masked1 and masked2 bytes
+ let res1 = _mm_unpacklo_epi8(masked2, masked1);
+ let res2 = _mm_unpackhi_epi8(masked2, masked1);
+
+ _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
+ _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
+ src = &src[16..];
+ i += 16;
+ }
+
+ let i = i as usize;
+ let _ = hex_encode_fallback(src, &mut dst[i * 2..]);
+
+ Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
+}
+
+#[cfg(target_arch = "wasm32")]
+#[target_feature(enable = "simd128")]
+unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+ use core_arch::arch::wasm32::*;
+
+ let ascii_zero = u8x16_splat(b'0');
+ let nines = u8x16_splat(9);
+ let ascii_a = u8x16_splat(b'a' - 9 - 1);
+ let and4bits = u8x16_splat(0xf);
+
+ let mut i = 0_isize;
+ while src.len() >= 16 {
+ let invec = v128_load(src.as_ptr() as *const _);
+
+ let masked1 = v128_and(invec, and4bits);
+ let masked2 = v128_and(u8x16_shr(invec, 4), and4bits);
+
+        // produce 0xff for the elements > 9, or 0x00 otherwise
+ let cmpmask1 = u8x16_gt(masked1, nines);
+ let cmpmask2 = u8x16_gt(masked2, nines);
+
+ // add '0' or the offset depending on the masks
+ let masked1 = u8x16_add(masked1, v128_bitselect(ascii_a, ascii_zero, cmpmask1));
+ let masked2 = u8x16_add(masked2, v128_bitselect(ascii_a, ascii_zero, cmpmask2));
+
+ // Next we need to shuffle around masked{1,2} to get back to the
+ // original source text order. The first element (res1) we'll store uses
+ // all the low bytes from the 2 masks and the second element (res2) uses
+ // all the upper bytes.
+ let res1 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(
+ masked2, masked1,
+ );
+ let res2 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
+ masked2, masked1,
+ );
+
+ v128_store(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
+ v128_store(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
+ src = &src[16..];
+ i += 16;
+ }
+
+ let i = i as usize;
+ let _ = hex_encode_fallback(src, &mut dst[i * 2..]);
+
+ Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
+}
+
+fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+ fn hex(byte: u8) -> u8 {
+ static TABLE: &[u8] = b"0123456789abcdef";
+ TABLE[byte as usize]
+ }
+
+ for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
+ slots[0] = hex((*byte >> 4) & 0xf);
+ slots[1] = hex(*byte & 0xf);
+ }
+
+ unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) }
+}
+
+// Run these with `cargo +nightly test --example hex -p stdarch`
+#[cfg(test)]
+mod tests {
+ use std::iter;
+
+ use super::*;
+
+ fn test(input: &[u8], output: &str) {
+ let tmp = || vec![0; input.len() * 2];
+
+ assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output);
+ assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output);
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ unsafe {
+ if self::is_x86_feature_detected!("avx2") {
+ assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output);
+ }
+ if self::is_x86_feature_detected!("sse4.1") {
+ assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output);
+ }
+ }
+ }
+
+ #[test]
+ fn empty() {
+ test(b"", "");
+ }
+
+ #[test]
+ fn big() {
+ test(
+ &[0; 1024],
+ &iter::repeat('0').take(2048).collect::<String>(),
+ );
+ }
+
+ #[test]
+ fn odd() {
+ test(
+ &[0; 313],
+ &iter::repeat('0').take(313 * 2).collect::<String>(),
+ );
+ }
+
+ #[test]
+ fn avx_works() {
+ let mut input = [0; 33];
+ input[4] = 3;
+ input[16] = 3;
+ input[17] = 0x30;
+ input[21] = 1;
+ input[31] = 0x24;
+ test(
+ &input,
+ "\
+ 0000000003000000\
+ 0000000000000000\
+ 0330000000010000\
+ 0000000000000024\
+ 00\
+ ",
+ );
+ }
+
+ quickcheck::quickcheck! {
+ fn encode_equals_fallback(input: Vec<u8>) -> bool {
+ let mut space1 = vec![0; input.len() * 2];
+ let mut space2 = vec![0; input.len() * 2];
+ let a = hex_encode(&input, &mut space1).unwrap();
+ let b = hex_encode_fallback(&input, &mut space2).unwrap();
+ a == b
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn avx_equals_fallback(input: Vec<u8>) -> bool {
+ if !self::is_x86_feature_detected!("avx2") {
+ return true
+ }
+ let mut space1 = vec![0; input.len() * 2];
+ let mut space2 = vec![0; input.len() * 2];
+ let a = unsafe { hex_encode_avx2(&input, &mut space1).unwrap() };
+ let b = hex_encode_fallback(&input, &mut space2).unwrap();
+ a == b
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn sse41_equals_fallback(input: Vec<u8>) -> bool {
+ if !self::is_x86_feature_detected!("avx2") {
+ return true
+ }
+ let mut space1 = vec![0; input.len() * 2];
+ let mut space2 = vec![0; input.len() * 2];
+ let a = unsafe { hex_encode_sse41(&input, &mut space1).unwrap() };
+ let b = hex_encode_fallback(&input, &mut space2).unwrap();
+ a == b
+ }
+ }
+}
+
+// Run these with `cargo +nightly bench --example hex -p stdarch`
+#[cfg(test)]
+mod benches {
+ extern crate rand;
+ extern crate test;
+
+ use self::rand::Rng;
+
+ use super::*;
+
+ const SMALL_LEN: usize = 117;
+    const LARGE_LEN: usize = 1024 * 1024;
+
+ fn doit(
+ b: &mut test::Bencher,
+ len: usize,
+ f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>,
+ ) {
+ let mut rng = rand::thread_rng();
+ let input = std::iter::repeat(())
+ .map(|()| rng.gen::<u8>())
+ .take(len)
+ .collect::<Vec<_>>();
+ let mut dst = vec![0; input.len() * 2];
+ b.bytes = len as u64;
+ b.iter(|| unsafe {
+ f(&input, &mut dst).unwrap();
+ dst[0]
+ });
+ }
+
+ #[bench]
+ fn small_default(b: &mut test::Bencher) {
+ doit(b, SMALL_LEN, hex_encode);
+ }
+
+ #[bench]
+ fn small_fallback(b: &mut test::Bencher) {
+ doit(b, SMALL_LEN, hex_encode_fallback);
+ }
+
+ #[bench]
+ fn large_default(b: &mut test::Bencher) {
+ doit(b, LARGE_LEN, hex_encode);
+ }
+
+ #[bench]
+ fn large_fallback(b: &mut test::Bencher) {
+ doit(b, LARGE_LEN, hex_encode_fallback);
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ mod x86 {
+ use super::*;
+
+ #[bench]
+ fn small_avx2(b: &mut test::Bencher) {
+ if self::is_x86_feature_detected!("avx2") {
+ doit(b, SMALL_LEN, hex_encode_avx2);
+ }
+ }
+
+ #[bench]
+ fn small_sse41(b: &mut test::Bencher) {
+ if self::is_x86_feature_detected!("sse4.1") {
+ doit(b, SMALL_LEN, hex_encode_sse41);
+ }
+ }
+
+ #[bench]
+ fn large_avx2(b: &mut test::Bencher) {
+ if self::is_x86_feature_detected!("avx2") {
+ doit(b, LARGE_LEN, hex_encode_avx2);
+ }
+ }
+
+ #[bench]
+ fn large_sse41(b: &mut test::Bencher) {
+ if self::is_x86_feature_detected!("sse4.1") {
+ doit(b, LARGE_LEN, hex_encode_sse41);
+ }
+ }
+ }
+}
diff --git a/library/stdarch/examples/wasm.rs b/library/stdarch/examples/wasm.rs
new file mode 100644
index 000000000..6b92ae9b8
--- /dev/null
+++ b/library/stdarch/examples/wasm.rs
@@ -0,0 +1,45 @@
+//! A simple slab allocator for pages in wasm
+
+#![feature(stdsimd)]
+#![cfg(target_arch = "wasm32")]
+
+use std::ptr;
+
+use core_arch::arch::wasm32::*;
+
+static mut HEAD: *mut *mut u8 = 0 as _;
+
+#[no_mangle]
+pub unsafe extern "C" fn page_alloc() -> *mut u8 {
+ if !HEAD.is_null() {
+ let next = *HEAD;
+ let ret = HEAD;
+ HEAD = next as *mut _;
+ return ret as *mut u8;
+ }
+
+ let ret = memory_grow(0, 1);
+
+ // if we failed to allocate a page then return null
+ if ret == usize::MAX {
+ return ptr::null_mut();
+ }
+
+ ((ret as u32) * page_size()) as *mut u8
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn page_free(page: *mut u8) {
+ let page = page as *mut *mut u8;
+ *page = HEAD as *mut u8;
+ HEAD = page;
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn memory_used() -> usize {
+ (page_size() * (memory_size(0) as u32)) as usize
+}
+
+fn page_size() -> u32 {
+ 64 * 1024
+}
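+
+// A hedged demo entry point (hypothetical, not part of the allocator's
+// interface): pages cycle through a LIFO free list, so an immediate
+// free/alloc pair hands back the same page.
+#[no_mangle]
+pub unsafe extern "C" fn page_roundtrip_demo() -> bool {
+    let page = page_alloc();
+    if page.is_null() {
+        return false; // out of memory; nothing to demonstrate
+    }
+    page_free(page);
+    page_alloc() == page
+}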
diff --git a/library/stdarch/rustfmt.toml b/library/stdarch/rustfmt.toml
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/library/stdarch/rustfmt.toml
diff --git a/library/stdarch/triagebot.toml b/library/stdarch/triagebot.toml
new file mode 100644
index 000000000..fa0824ac5
--- /dev/null
+++ b/library/stdarch/triagebot.toml
@@ -0,0 +1 @@
+[assign]
diff --git a/library/stdarch/vendor.yml b/library/stdarch/vendor.yml
new file mode 100644
index 000000000..ac5b408ac
--- /dev/null
+++ b/library/stdarch/vendor.yml
@@ -0,0 +1,2 @@
+- crates/stdarch-verify/arm-intrinsics.html
+- crates/stdarch-verify/x86-intel.xml